diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..7f50b1820
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,90 @@
+---
+Language:        Cpp
+# BasedOnStyle:  Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:   
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit:     100
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: true
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IncludeCategories: 
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IndentCaseLabels: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+ReflowComments:  true
+SortIncludes:    true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Auto
+TabWidth:        8
+UseTab:          Never
+...
+
diff --git a/.gitignore b/.gitignore
index b27354ed0..87cffc498 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,5 +8,15 @@ read_input.c
 *~
 Makefile
 config.log
+tmlqcd_config.h
+config.status
 *.mod.c
 *.opari.inc
+build/*
+tags
+*.a
+hmc_tm
+invert
+offline_measurement
+lib/
+benchmark
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 000000000..fec60b8ed
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,24 @@
+########################################################################
+#
+# Copyright (C) 2017 Martin Ueding <dev@martin-ueding.de>
+#
+# This file is part of tmLQCD.
+#
+# tmLQCD is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# 
+# tmLQCD is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+########################################################################
+
+language: c
+script: ./travis-ci.sh
+dist: trusty
+sudo: true
diff --git a/AUTHORS b/AUTHORS
index 53ede4ecb..f14bfccbd 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,14 +1,17 @@
+Simone Bacchio
 Remi Baron
 Benoit Blossier
 Florian Burger
 Thomas Chiarappa
 Nils Christian
 Albert Deuzeman
+Jacob Finkenrath
 Elena Garcia Ramos
 Jenifer Gonzalez Lopez
 Gilbert Grosdidier
 Karl Jansen
 Bartosz Kostrzewa
+Peter Labus
 Joseph Nagel
 Andreas Nube
 David Palao
@@ -19,6 +22,7 @@ Dru Renner
 Francesco Sanfilippo
 Luigi Scorzato
 Andrea Shindler
+Martin Ueding
 Carsten Urbach
 Jan Volkholz
 Urs Wenger
diff --git a/DDalphaAMG_interface.c b/DDalphaAMG_interface.c
new file mode 100644
index 000000000..bc655806e
--- /dev/null
+++ b/DDalphaAMG_interface.c
@@ -0,0 +1,1399 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2016 Simone Bacchio, Jacob Finkenrath
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Interface for DDalphaAMG
+ *
+ *******************************************************************************/
+
+#include "DDalphaAMG_interface.h"
+
+#ifndef DDalphaAMG
+
+/* Definitions of the multigrid parameters for builds in which the DDalphaAMG
+ * macro is not defined. The same names are defined with their working
+ * defaults in the #else branch of this file. */
+int mg_setup_iter;
+int mg_coarse_setup_iter;
+int mg_update_setup_iter;
+int mg_update_gauge;
+int mg_omp_num_threads;
+int mg_Nvec;
+int mg_lvl;
+int mg_blk[4];
+int mg_mixed_prec;
+int mg_setup_mu_set;
+int mg_no_shifts = 0;
+double mg_mms_mass = 0;
+double mg_setup_mu;
+double mg_cmu_factor;
+double mg_dtau_update;
+double mg_rho_update;
+
+/* Fallback for builds without DDalphaAMG: reports the invalid call on stdout
+ * and terminates the program with exit status 1. */
+void MG_init(void) {
+    fputs("ERROR: MG_init called but DDalphaAMG library not included.\n", stdout);
+    exit(1);
+}
+
+/* Fallback for builds without DDalphaAMG: `step` is ignored, the invalid call
+ * is reported on stdout and the program terminates with exit status 1. */
+void MG_update_gauge(double step) {
+    fputs("ERROR: MG_update_gauge called but DDalphaAMG library not included.\n", stdout);
+    exit(1);
+}
+
+/* Fallback for builds without DDalphaAMG: both arguments are ignored, the
+ * invalid call is reported on stdout and the program terminates with exit
+ * status 1. */
+void MG_update_mu(double mu_tmLQCD, double odd_tmLQCD) {
+    fputs("ERROR: MG_update_mu called but DDalphaAMG library not included.\n", stdout);
+    exit(1);
+}
+
+/* Fallback for builds without DDalphaAMG: reports the invalid call on stdout
+ * and terminates the program with exit status 1. */
+void MG_reset(void) {
+    fputs("ERROR: MG_reset called but DDalphaAMG library not included.\n", stdout);
+    exit(1);
+}
+
+/* Fallback for builds without DDalphaAMG: reports the invalid call on stdout
+ * and terminates the program with exit status 1. */
+void MG_finalize(void) {
+    fputs("ERROR: MG_finalize called but DDalphaAMG library not included.\n", stdout);
+    exit(1);
+}
+
+/* Fallback for builds without DDalphaAMG: all arguments are ignored, the
+ * invalid call is reported on stdout and the program terminates with exit
+ * status 1, so the function never returns a value. */
+int MG_solver(spinor * const phi_new, spinor * const phi_old, const double precision,
+              const int max_iter, const int rel_prec, const int N, su3 **gf,
+              matrix_mult f) {
+    fputs("ERROR: MG_solver called but DDalphaAMG library not included.\n", stdout);
+    exit(1);
+}
+
+/* Fallback for builds without DDalphaAMG: all arguments are ignored, the
+ * invalid call is reported on stdout and the program terminates with exit
+ * status 1, so the function never returns a value. */
+int MG_solver_eo(spinor * const Even_new, spinor * const Odd_new, spinor * const Even,
+                 spinor * const Odd, const double precision, const int max_iter,
+                 const int rel_prec, const int N, su3 **gf, matrix_mult_full f_full) {
+    fputs("ERROR: MG_solver_eo called but DDalphaAMG library not included.\n", stdout);
+    exit(1);
+}
+
+/* Fallback for builds without DDalphaAMG: all arguments are ignored, the
+ * invalid call is reported on stdout and the program terminates with exit
+ * status 1, so the function never returns a value. */
+int MG_solver_nd(spinor * const up_new, spinor * const dn_new, spinor * const up_old,
+                 spinor * const dn_old, const double precision, const int max_iter,
+                 const int rel_prec, const int N, su3 **gf, matrix_mult_nd f) {
+    fputs("ERROR: MG_solver_nd called but DDalphaAMG library not included.\n", stdout);
+    exit(1);
+}
+
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "boundary.h"
+#include "linalg/convert_eo_to_lexic.h"
+#include "solver/solver.h"
+#include "solver/solver_field.h"
+#include "gettime.h"
+#include "read_input.h"
+#include "DDalphaAMG.h"
+#include "linalg_eo.h"
+#include "phmc.h"
+#include "operator/D_psi.h"
+#include "operator/tm_operators.h"
+#include "operator/tm_operators_nd.h"
+#include "operator/clovertm_operators.h"
+#include "operator/Hopping_Matrix.h"
+
+// Enable the variant for the shifted operator in the ND sector.
+// The variant is used when an initial guess is given for the squared operator.
+// It is faster, and tests show that it is also safe (see Appendix A of arxiv:1801.##### by S. Bacchio et al.)
+#define VARIANT_FOR_SHIFTED
+
+/* State shared with the DDalphaAMG library and the tunable multigrid
+ * parameters with their default values. */
+DDalphaAMG_init mg_init;
+DDalphaAMG_parameters mg_params;
+DDalphaAMG_status mg_status; // filled by the DDalphaAMG_* calls; .success, .time, .coarse_time and the iteration counts are read in this file
+int mg_do_setup=1; // set to one to run (or rerun) the setup in the next MG_pre_solve, which resets it to zero
+int mg_update_gauge=1; // set to zero if the gauge field is up to date, set to one if it has to be updated
+int mg_update_setup=0; // number of additional setup iterations to run in the next MG_pre_solve
+int mg_initialized=0; // set to one by MG_pre_solve after MG_init has been called
+int mg_setup_iter=5;
+int mg_coarse_setup_iter=3;
+int mg_update_setup_iter=1; // value assigned to mg_update_setup when MG_pre_solve decides that a setup update is due
+int mg_omp_num_threads=0;
+int mg_Nvec=24;
+int mg_lvl=3;
+int mg_blk[4] = {0, 0, 0, 0};
+int mg_mixed_prec=0;
+int mg_setup_mu_set = 0; // flag that enables the use of mg_setup_mu in the setup phase
+int mg_no_shifts = 0; // number of shifts to invert with MG. MMS-CG is used for the others at larger mass.
+double mg_mms_mass = 0.1; // mass shift value for switching from MMS-CG to MG. MMS-CG is used for masses larger than this value.
+double mg_setup_mu = 0.; // twisted mass passed to MG_update_mu during the setup when mg_setup_mu_set is non-zero
+double mg_cmu_factor = 1.0;
+double mg_dtau_update = 0.0; // threshold on |mg_tau - gauge_tau| for a setup update; 0.0 means update whenever the gauge field changed (see MG_pre_solve)
+double mg_rho_update = -1.0; // negative: ignored; otherwise a setup update is triggered when it equals g_mu3 (see MG_pre_solve)
+double mg_tau = 0.0; // value of gauge_tau recorded at the last setup or setup update
+double gauge_tau = 0.0; // NOTE(review): presumably the trajectory time of the current gauge field -- confirm where it is advanced
+
+/*
+ * Wrapper around MPI_Cart_rank that exchanges entries 1 and 3 of `coords`
+ * before the lookup; entries 0 and 2 are passed through unchanged.
+ * Exactly four coordinates are read. Returns the MPI_Cart_rank status.
+ */
+static int Cart_rank(MPI_Comm comm, const int coords[], int *rank) 
+{
+  const int permuted[4] = { coords[0], coords[3], coords[2], coords[1] };
+
+  return MPI_Cart_rank(comm, permuted, rank);
+}
+
+/*
+ * Wrapper around MPI_Cart_coords that exchanges entries 1 and 3 of the
+ * returned coordinates. Entries 1 and 3 are accessed regardless of `maxdims`,
+ * so `coords` must hold at least four elements.
+ * Returns the MPI_Cart_coords status.
+ */
+static int Cart_coords(MPI_Comm comm, int rank, int maxdims, int coords[]) 
+{
+  const int status = MPI_Cart_coords(comm, rank, maxdims, coords);
+  const int first = coords[1];
+
+  coords[1] = coords[3];
+  coords[3] = first;
+
+  return status;
+}
+
+/*
+ * Offset of a gauge link inside an array with 72 entries per site and
+ * 18 entries per direction.
+ *
+ * The site index is taken from g_ipt[t][x][y][z] (note that the parameter
+ * order is t, z, y, x). Even values of mu are used as they are, mu == 1 is
+ * mapped to 3 and every other odd value to 1.
+ */
+static int conf_index_fct(int t, int z, int y, int x, int mu) 
+{
+  int dir = mu;
+
+  if (mu%2 != 0)
+    dir = (mu==1) ? 3 : 1;
+
+  return (g_ipt[t][x][y][z])*72 + 18*dir; // 72 = 9*2*4, 18 = 9*2
+}
+
+/*
+ * Offset of a site inside an array with 24 entries per site. The site index
+ * is taken from g_ipt[t][x][y][z] (note that the parameter order is
+ * t, z, y, x).
+ */
+static int vector_index_fct(int t, int z, int y, int x )
+{
+   return 24*(g_ipt[t][x][y][z]);
+}
+
+/*
+ * Checks a multigrid solution against the tmLQCD operator f.
+ *
+ * Computes f(phi_new) - phi_old in a temporary field and forms the relative
+ * residual ||f(phi_new) - phi_old|| / ||phi_old||, with diff and square_norm
+ * called with length N.
+ *
+ * Returns 0 when the residual is larger than twice `precision` and 1
+ * otherwise. A residual strictly between `precision` and twice `precision`
+ * is accepted with a warning. Messages are printed by process 0 only.
+ */
+static inline int MG_check(spinor * const phi_new, spinor * const phi_old, const int N, const double precision, matrix_mult f) 
+{
+  const double acc_factor = 2; // residuals up to acc_factor*precision are tolerated
+  spinor ** work = NULL;
+
+  // work[0] = f(phi_new) - phi_old
+  init_solver_field(&work, VOLUMEPLUSRAND,1);
+  f( work[0], phi_new);
+  diff( work[0], work[0], phi_old, N);
+  const double res_norm = sqrt(square_norm(work[0], N, 1));
+  const double src_norm = sqrt(square_norm(phi_old, N, 1));
+  finalize_solver(work, 1);
+
+  const double residual = res_norm/src_norm;
+  const double limit = acc_factor*precision;
+
+  if( residual > limit ) {
+    if(g_proc_id == 0) {
+      printf("ERROR: something bad happened... MG converged giving the wrong solution!! Trying to restart... \n");
+      printf("ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e \n", res_norm,src_norm,res_norm/src_norm,precision);
+    }
+    return 0;
+  }
+
+  if( residual > precision && residual < limit && g_proc_id == 0 )
+    printf("WARNING: solution accepted even if the residual wasn't complitely acceptable (%e > %e) \n", residual, precision);
+
+  if (g_debug_level > 0 && g_proc_id == 0)
+    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n", res_norm,src_norm,res_norm/src_norm);
+
+  return 1;
+}
+
+/*
+ * Two-flavour counterpart of MG_check.
+ *
+ * Applies f to (up_new, dn_new), subtracts (up_old, dn_old) and forms the
+ * relative residual from the combined norms of both components, with diff
+ * and square_norm called with length N.
+ *
+ * The tolerated residual is acc_factor*precision with acc_factor = 4. When
+ * VARIANT_FOR_SHIFTED is defined, f is one of the shifted squared operators
+ * and g_shift != 0, acc_factor is replaced by
+ * 1/sqrt(phmc_cheb_evmin/phmc_cheb_evmax + g_shift).
+ *
+ * Returns 0 when the residual is larger than the tolerated value and 1
+ * otherwise. A residual strictly between `precision` and the tolerated value
+ * is accepted with a warning. Messages are printed by process 0 only.
+ */
+static inline int MG_check_nd( spinor * const up_new, spinor * const dn_new, spinor * const up_old, spinor * const dn_old,
+                               const int N, const double precision, matrix_mult_nd f) 
+{
+  spinor ** work = NULL;
+  double acc_factor = 4;
+#ifdef VARIANT_FOR_SHIFTED
+  const int shifted_squared = ( f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+                                f == Qsw_pm_ndpsi_shift );  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+  if( shifted_squared && g_shift != 0 )
+    acc_factor = 1/sqrt(phmc_cheb_evmin/phmc_cheb_evmax + g_shift);
+#endif
+
+  // (work[0], work[1]) = f(up_new, dn_new) - (up_old, dn_old)
+  init_solver_field(&work, VOLUMEPLUSRAND,2);
+  f( work[0], work[1], up_new, dn_new);
+  diff( work[0], work[0], up_old, N);
+  diff( work[1], work[1], dn_old, N);
+  const double res_norm = sqrt(square_norm(work[0], N, 1)+square_norm(work[1], N, 1));
+  const double src_norm = sqrt(square_norm(up_old, N, 1)+square_norm(dn_old, N, 1));
+  finalize_solver(work, 2);
+
+  const double residual = res_norm/src_norm;
+  const double limit = acc_factor*precision;
+
+  if( residual > limit ) {
+    if(g_proc_id == 0) {
+      printf("ERROR: something bad happened... MG converged giving the wrong solution!! Trying to restart... \n");
+      printf("ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e \n", res_norm,src_norm,res_norm/src_norm,precision);
+    }
+    return 0;
+  }
+
+  if( residual > precision && residual < limit && g_proc_id == 0 )
+    printf("WARNING: solution accepted even if the residual wasn't complitely acceptable (%e > %e). Max acc. factor %f.\n", residual, precision, acc_factor);
+
+  if (g_debug_level > 0 && g_proc_id == 0)
+    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n", res_norm,src_norm,res_norm/src_norm);
+
+  return 1;
+}
+
+/*
+ * Checks the solutions of a multi-shift two-flavour solve.
+ *
+ * For every shift i in [0, no_shifts) the global g_shift is set to
+ * shifts[i]*shifts[i], f is applied to (up_new[i], dn_new[i]) and the
+ * relative residual with respect to the common source (up_old, dn_old) is
+ * compared with precision[i]. diff and square_norm are called with length N.
+ *
+ * Returns 0 as soon as one residual is larger than twice its precision and
+ * 1 when all shifts pass. A residual strictly between precision[i] and twice
+ * precision[i] is accepted with a warning. Messages are printed by process 0
+ * only.
+ *
+ * Side effect: g_shift is left at the value of the last shift that was
+ * checked; it is not restored.
+ */
+static inline int MG_mms_check_nd( spinor **const up_new, spinor **const dn_new, 
+                                   spinor * const up_old, spinor * const dn_old,
+                                   const double * shifts, const int no_shifts, 
+                                   const int N, double * precision, matrix_mult_nd f) 
+{
+  const double acc_factor = 2; // residuals up to acc_factor*precision[i] are tolerated
+  spinor ** check_vect = NULL;
+  int accepted = 1;
+
+  init_solver_field(&check_vect, VOLUMEPLUSRAND,2);
+
+  // The source is the same for every shift, so its norm is computed once
+  // instead of once per shift.
+  const double src_norm = sqrt(square_norm(up_old, N, 1)+square_norm(dn_old, N, 1));
+
+  for( int i = 0; i < no_shifts; i++ ) {
+
+    // NOTE(review): f presumably reads the shift from the global g_shift -- confirm
+    g_shift = shifts[i]*shifts[i]; 
+
+    // (check_vect[0], check_vect[1]) = f(up_new[i], dn_new[i]) - (up_old, dn_old)
+    f( check_vect[0], check_vect[1], up_new[i], dn_new[i]);
+    diff( check_vect[0], check_vect[0], up_old, N);
+    diff( check_vect[1], check_vect[1], dn_old, N);
+    const double res_norm = sqrt(square_norm(check_vect[0], N, 1)+square_norm(check_vect[1], N, 1));
+
+    const double residual = res_norm/src_norm;
+    const double limit = acc_factor*precision[i];
+
+    if( residual > limit ) {
+      if(g_proc_id == 0) {
+        printf("ERROR: something bad happened... MG converged giving the wrong solution!! Trying to restart... \n");
+        printf("ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e \n", res_norm,src_norm,res_norm/src_norm,precision[i]);
+      }
+      accepted = 0;
+      break;
+    }
+
+    if( residual > precision[i] && residual < limit && g_proc_id == 0 )
+      printf("WARNING: solution accepted even if the residual wasn't complitely acceptable (%e > %e) \n", residual, precision[i]);
+
+    if (g_debug_level > 0 && g_proc_id == 0)
+      printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n", res_norm,src_norm,res_norm/src_norm);
+
+  }
+
+  // single release point for the temporary fields
+  finalize_solver(check_vect, 2);
+
+  return accepted;
+}
+
+
+/*
+ * Brings the DDalphaAMG state up to date before a solve.
+ *
+ * In order:
+ *  1. decides whether a setup update is due and, if so, sets mg_update_setup
+ *     to mg_update_setup_iter;
+ *  2. calls MG_init on first use;
+ *  3. passes the gauge field gf to the library when mg_update_gauge == 1;
+ *  4. runs the full setup when mg_do_setup == 1;
+ *  5. runs mg_update_setup additional setup iterations when that is > 0.
+ *
+ * Steps 4 and 5 use mg_setup_mu when mg_setup_mu_set is non-zero and g_mu
+ * otherwise, and record gauge_tau in mg_tau.
+ *
+ * Returns mg_status.success as left by the most recent DDalphaAMG call. If
+ * none of steps 3-5 ran, this is the value from an earlier call.
+ */
+static int MG_pre_solve( su3 **gf )
+{
+  
+  double dtau = fabs(mg_tau-gauge_tau);
+  // A setup update is requested when no full setup is pending, no update is
+  // already scheduled (mg_update_setup < mg_update_setup_iter) and at least
+  // one of the following holds:
+  //  mg_dtau_update > 0 and 0 < mg_dtau_update < dtau + 1e-6 : regular condition;
+  //                           dtau is an absolute value, so this also covers
+  //                           tau decreasing during a reversibility check
+  //  mg_dtau_update == 0.0 and the gauge field changed : update at every
+  //                           change of configuration, also outside the HMC
+  //  mg_rho_update >= 0.0 and mg_rho_update == g_mu3 : update for this rho only
+  //                           (a negative mg_rho_update disables this test)
+  if ( mg_do_setup == 0 && mg_update_setup < mg_update_setup_iter && 
+       ( (mg_dtau_update > 0.0 && dtau > 0.0 && mg_dtau_update < dtau+1e-6) ||
+         (mg_dtau_update == 0.0 && mg_update_gauge == 1)         ||
+         (mg_rho_update >= 0.0 && mg_rho_update == g_mu3) )) 
+    mg_update_setup = mg_update_setup_iter;
+  
+  if(g_debug_level > 0 && g_proc_id == 0)
+    printf("Tau has been increased since last MG setup update of %e\n", dtau);
+  
+  if (mg_initialized==0) {
+    MG_init();
+    mg_initialized = 1;
+    if (g_proc_id == 0)
+      printf("DDalphaAMG initialized\n");
+    MPI_Barrier(MPI_COMM_WORLD);
+  }
+  
+  if (mg_update_gauge==1) {
+    DDalphaAMG_set_configuration( (double*) &(gf[0][0]), &mg_status );
+    mg_update_gauge = 0;
+    if (mg_status.success && g_proc_id == 0) 
+      printf("DDalphaAMG cnfg set, plaquette %e\n", mg_status.info);
+    else if ( g_proc_id == 0)
+      printf("ERROR: configuration updating did not run correctly\n");
+  }
+  
+  if (mg_do_setup==1) {
+    if( mg_setup_mu_set ) {
+      if (g_proc_id == 0)
+        printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
+      MG_update_mu(mg_setup_mu, 0); 
+    } else
+      MG_update_mu(g_mu, 0);
+    if (g_proc_id == 0)
+      printf("DDalphaAMG running setup\n");
+    DDalphaAMG_setup(&mg_status);
+    mg_do_setup = 0;
+    mg_tau = gauge_tau;
+    if (mg_status.success && g_proc_id == 0)
+      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n",
+             mg_status.time, 100.*(mg_status.coarse_time/mg_status.time));
+    else if ( g_proc_id == 0)
+      printf("ERROR: setup procedure did not run correctly\n");
+  }
+  
+  if (mg_update_setup>0) {
+    if( mg_setup_mu_set ) {
+      if (g_proc_id == 0)
+        printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
+      MG_update_mu(mg_setup_mu, 0); 
+    } else
+      MG_update_mu(g_mu, 0);
+    if (g_proc_id == 0)
+      printf("DDalphaAMG updating setup\n");
+    DDalphaAMG_update_setup(mg_update_setup, &mg_status);
+    mg_update_setup = 0;
+    mg_tau = gauge_tau;
+    if (mg_status.success && g_proc_id == 0)
+      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n",
+             mg_status.time, 100.*(mg_status.coarse_time/mg_status.time));
+    else if ( g_proc_id == 0)
+      printf("ERROR: setup updating did not run correctly\n");
+  }
+  
+  return mg_status.success;
+}
+
+/*
+ * Solves f * phi_new = phi_old with DDalphaAMG for a one-flavour operator f.
+ *
+ * N must be VOLUME or VOLUME/2; any other value prints an error and returns 0.
+ * For N == VOLUME/2 the source is copied into a temporary field with
+ * convert_odd_to_lexic, the solve runs on temporary fields and the result is
+ * copied back with convert_lexic_to_odd.
+ *
+ * The operator f selects
+ *  - the twisted-mass parameters passed to MG_update_mu,
+ *  - whether the source is multiplied by gamma5 before the solve
+ *    (the "Gamma5" operators),
+ *  - whether the squared solver is called (the "pm" operators), in which case
+ *    the rescaling factor is squared as well.
+ * An operator that is not in any list is solved like the plain "plus mu"
+ * operator after a warning.
+ *
+ * For N == VOLUME and a "Gamma5" operator, phi_old is modified in place
+ * during the solve and restored afterwards.
+ *
+ * Returns mg_status.success of the DDalphaAMG solve.
+ */
+static int MG_solve(spinor * const phi_new, spinor * const phi_old, const double precision,
+                    const int N, matrix_mult f)
+{
+  
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} -> rescale by 1/(4+m)
+  double mg_scale=0.5/g_kappa;
+  double *rhs = (double*) phi_old; 
+  double *sol = (double*) phi_new;
+  spinor ** solver_field = NULL;
+  
+  if( N != VOLUME && N != VOLUME/2 ) {
+    if( g_proc_id == 0 )
+      printf("ERROR: N = %d in MG_solve. Expected N == VOLUME (%d) or VOLUME/2 (%d)\n", N, VOLUME, VOLUME/2);
+    return 0;
+  }
+
+  if (N==VOLUME/2) {
+    // work on temporary fields; rhs receives the source from convert_odd_to_lexic
+    init_solver_field(&solver_field, VOLUMEPLUSRAND,2);
+    rhs = (double*) solver_field[0];
+    sol = (double*) solver_field[1];
+    convert_odd_to_lexic( (spinor*) rhs, phi_old);
+  }
+
+  // Classification of the requested operator
+  const int schur_op =                // operators expected with N == VOLUME/2
+            f == Msw_psi ||           //          Schur complement with mu=0 on odd sites
+            f == Qsw_psi ||           // Gamma5 - Schur complement with mu=0 on odd sites
+            f == Mtm_plus_psi ||      //          Schur complement with plus mu
+            f == Msw_plus_psi ||      //          Schur complement with plus mu
+            f == Qtm_plus_psi ||      // Gamma5 - Schur complement with plus mu
+            f == Qsw_plus_psi ||      // Gamma5 - Schur complement with plus mu
+            f == Mtm_minus_psi ||     //          Schur complement with minus mu
+            f == Msw_minus_psi ||     //          Schur complement with minus mu
+            f == Qtm_minus_psi ||     // Gamma5 - Schur complement with minus mu
+            f == Qsw_minus_psi ||     // Gamma5 - Schur complement with minus mu
+            f == Qtm_pm_psi ||        //          Schur complement squared
+            f == Qsw_pm_psi;          //          Schur complement squared
+  const int full_op =                 // operators expected with N == VOLUME
+            f == D_psi ||             //          Full operator    with plus mu
+            f == Q_plus_psi ||        // Gamma5 - Full operator    with plus mu
+            f == Q_minus_psi ||       // Gamma5 - Full operator    with minus mu
+            f == Q_pm_psi ||          //          Full operator    squared
+            f == Qsw_full_plus_psi || // Gamma5 - Full operator    with plus mu
+            f == Qsw_full_minus_psi|| // Gamma5 - Full operator    with minus mu
+            f == Qsw_full_pm_psi ||   //          Full operator    squared
+            f == Msw_full_minus_psi;  //          Full operator    with minus mu
+  const int zero_odd_mu_op =          // mu = 0 on the odd sites
+            f == Msw_psi ||
+            f == Qsw_psi;
+  const int minus_mu_op =
+            f == Mtm_minus_psi ||
+            f == Msw_minus_psi ||
+            f == Qtm_minus_psi ||
+            f == Qsw_minus_psi ||
+            f == Qsw_full_minus_psi||
+            f == Msw_full_minus_psi||
+            f == Q_minus_psi;
+  const int gamma5_op =               // single solve on the gamma5-multiplied source
+            f == Qtm_plus_psi ||
+            f == Qsw_plus_psi ||
+            f == Qtm_minus_psi ||
+            f == Qsw_minus_psi ||
+            f == Qsw_psi ||
+            f == Q_plus_psi ||
+            f == Q_minus_psi ||
+            f == Qsw_full_plus_psi ||
+            f == Qsw_full_minus_psi;
+  const int squared_odd_op =
+            f == Qtm_pm_psi ||
+            f == Qsw_pm_psi;
+  // Qsw_full_pm_psi is a squared operator like Q_pm_psi. It used to be
+  // handled as a gamma5 operator, i.e. with a single solve and an unsquared
+  // rescaling factor.
+  const int squared_full_op =
+            f == Q_pm_psi ||
+            f == Qsw_full_pm_psi;
+
+  // Checking if the operator is in the list and compatible with N
+  if ( schur_op ) {
+    if( N != VOLUME/2 && g_proc_id == 0 )
+      printf("WARNING: expected N == VOLUME/2 for the required operator in MG_solve. Continuing with N == VOLUME\n");
+  }
+  else if ( full_op ) {
+    if( N != VOLUME && g_proc_id == 0 )
+      printf("WARNING: expected N == VOLUME for the required operator in MG_solve. Continuing with N == VOLUME/2\n");
+  }
+  else if( g_proc_id == 0 )
+    printf("WARNING: required operator unknown for MG_solve. Using standard operator: %s.\n",
+           N==VOLUME?"D_psi":"Msw_plus_psi");
+
+  // Setting mu; every "plus mu", squared or unknown operator uses (g_mu, g_mu3)
+  if ( zero_odd_mu_op )
+    MG_update_mu(g_mu, -g_mu);
+  else if ( minus_mu_op )
+    MG_update_mu(-g_mu, -g_mu3);
+  else
+    MG_update_mu(g_mu, g_mu3); 
+
+  // Solving
+  if ( gamma5_op ) {
+    mul_gamma5((spinor *) rhs, VOLUME);
+    DDalphaAMG_solve( sol, rhs, precision, &mg_status );
+    if( N == VOLUME ) // rhs is phi_old itself: undo the gamma5; for VOLUME/2 it is just a local vector
+      mul_gamma5((spinor *) rhs, VOLUME);
+  }
+  else if ( squared_odd_op ) {
+    mg_scale *= mg_scale;
+    DDalphaAMG_solve_squared_odd( sol, rhs, precision, &mg_status );
+  }
+  else if ( squared_full_op ) {
+    mg_scale *= mg_scale;
+    DDalphaAMG_solve_squared( sol, rhs, precision, &mg_status );
+  }
+  else // plain operators and unknown operators
+    DDalphaAMG_solve( sol, rhs, precision, &mg_status );
+  
+  if (N==VOLUME/2) {
+    convert_lexic_to_odd(phi_new, (spinor*) sol);
+    finalize_solver(solver_field, 2);
+  }
+  
+  mul_r(phi_new ,mg_scale, phi_new, N);
+
+  if (g_proc_id == 0) {
+    printf("Solving time %.2f sec (%.1f %% on coarse grid)\n", mg_status.time,
+           100.*(mg_status.coarse_time/mg_status.time));
+    printf("Total iterations on fine grid %d\n", mg_status.iter_count);
+    printf("Total iterations on coarse grids %d\n", mg_status.coarse_iter_count);
+    if (!mg_status.success) 
+      printf("ERROR: the solver did not converge!\n");
+  }
+  
+  return mg_status.success;
+}
+
+static int MG_solve_nd( spinor * up_new, spinor * dn_new, spinor * const up_old, spinor * const dn_old,
+                        const double precision, const int N, matrix_mult_nd f)
+{
+  
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} -> rescale by 1/4+m
+  // moreover in the nd case, the tmLQCD is multiplied by phmc_invmaxev
+  double mg_scale=0.5/g_kappa/phmc_invmaxev;
+  double sqnorm;
+  int init_guess = 0;
+  spinor *old1 = up_old; 
+  spinor *old2 = dn_old; 
+  spinor *new1 = up_new, *new1tmp;
+  spinor *new2 = dn_new, *new2tmp;
+  spinor ** solver_field = NULL, ** oe_solver_field = NULL;
+  int no_solver_field = 0;
+
+  if( N != VOLUME && N != VOLUME/2 ) {
+    if( g_proc_id == 0 )
+      printf("ERROR: N = %d in MG_solve. Expettected N == VOLUME (%d) or VOLUME/2 (%d)\n", N, VOLUME, VOLUME/2);
+    return 0;
+  }
+
+  if (N==VOLUME/2) no_solver_field += 4;
+
+  // Checking if initial guess is given
+  sqnorm = square_norm(up_new, N, 1);
+  sqnorm += square_norm(dn_new, N, 1);
+  if ( sqnorm > 1e-14 ) init_guess = 1;
+
+  // In case of initial guess and squared operator, we do the inversion in two step and we need two more vectors
+  if ( init_guess && (
+            f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qsw_pm_ndpsi_shift ))  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    no_solver_field += 2;
+
+  // Allocating and assigning fields
+  if(no_solver_field>0)
+    init_solver_field(&solver_field, VOLUMEPLUSRAND,no_solver_field);
+
+  int assign_solver_field = 0;
+  if (N==VOLUME/2) {
+    old1 = solver_field[assign_solver_field++];
+    old2 = solver_field[assign_solver_field++];
+    new1 = solver_field[assign_solver_field++];
+    new2 = solver_field[assign_solver_field++];
+    convert_odd_to_lexic(old1, up_old);
+    convert_odd_to_lexic(old2, dn_old);
+    set_even_to_zero(old1);
+    set_even_to_zero(old2);
+  }
+
+  if ( init_guess && (
+            f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qsw_pm_ndpsi_shift )) {// (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    new1tmp = solver_field[assign_solver_field++];
+    new2tmp = solver_field[assign_solver_field++];
+  }
+
+  // Reconstructing initial guess in case of oe
+  if ( init_guess && N==VOLUME/2 ) {
+    init_solver_field(&oe_solver_field, VOLUMEPLUSRAND, 4);
+    spinor* tmp11 = oe_solver_field[0];
+    spinor* tmp21 = oe_solver_field[1];
+    spinor* tmp12 = oe_solver_field[2];
+    spinor* tmp22 = oe_solver_field[3];
+
+    if (g_debug_level > 2) {
+      double differ[2];
+      f( tmp11, tmp12, up_new, dn_new);
+      diff( tmp11, tmp11, up_old, N);
+      diff( tmp12, tmp12, dn_old, N);
+      differ[0] = sqrt(square_norm(tmp11, N, 1)+square_norm(tmp12, N, 1));
+      differ[1] = sqrt(square_norm(up_old, N, 1)+square_norm(dn_old, N, 1));
+  
+      if(g_proc_id == 0)
+        printf("MG TEST: using initial guess. Relative residual = %e  \n", differ[0]/differ[1]);
+    }
+
+    /* Reconstruct the even sites                */
+    if (    f == Qtm_pm_ndpsi       ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qsw_pm_ndpsi       ||  // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+            f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift ) {// Gamma5 Dh tau1 - Schur complement with minus shift
+#ifdef VARIANT_FOR_SHIFTED
+      if((  f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift )   // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+         && g_shift != 0 ) {
+        if( f == Qtm_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+          Qtm_tau1_ndpsi_add_Ishift(tmp12, tmp11, up_new, dn_new); // tau1 exchange tmp11 <-> tmp12  
+        } else {                        // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+          Qsw_tau1_ndpsi_add_Ishift(tmp12, tmp11, up_new, dn_new); // tau1 exchange tmp11 <-> tmp12
+        }
+        // tau1 exchange new1tmp <-> new2tmp
+        convert_odd_to_lexic( new2, tmp11);
+        convert_odd_to_lexic( new1, tmp12);
+        Hopping_Matrix(EO, tmp21, tmp11);
+        Hopping_Matrix(EO, tmp22, tmp12);
+        Msw_ee_inv_ndpsi(tmp11, tmp12, tmp21, tmp22);
+        convert_even_to_lexic(new2, tmp11);
+        convert_even_to_lexic(new1, tmp12);
+      } else
+#endif
+      {
+        // tau1 exchange tmp11 <-> tmp12
+        Hopping_Matrix(EO, tmp12, up_new);
+        Hopping_Matrix(EO, tmp11, dn_new);
+
+        Msw_ee_inv_ndpsi(tmp21, tmp22, tmp11, tmp12);
+
+        /* Assigning with plus sign for the even
+         * since in Hopping_Matrix the minus is missing
+         */
+        // tau1 exchange tmp22 <-> tmp21
+        convert_eo_to_lexic(new1, tmp22, up_new);
+        convert_eo_to_lexic(new2, tmp21, dn_new);
+      }
+    } else {
+      Hopping_Matrix(EO, tmp11, up_new);
+      Hopping_Matrix(EO, tmp12, dn_new);
+
+      Msw_ee_inv_ndpsi(tmp21, tmp22, tmp11, tmp12);
+
+      /* Assigning with plus sign for the even
+       * since in Hopping_Matrix the minus is missing
+       */
+      convert_eo_to_lexic(new1, tmp21, up_new);
+      convert_eo_to_lexic(new2, tmp22, dn_new);
+    }
+  
+    // if squared obtaining initial guess for Gamma5 Dh
+    if (    f == Qtm_pm_ndpsi       ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+      Qtm_dagger_ndpsi(tmp11, tmp12, up_new, dn_new); // tau1 Gamma5 Dh tau1
+    }
+    else if(f == Qsw_pm_ndpsi       ) { // (Gamma5 Dh tau1)^2 - Schur complement squared
+      Qsw_dagger_ndpsi(tmp11, tmp12, up_new, dn_new); // tau1 Gamma5 Dh tau1
+    }
+    else if(f == Qtm_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+      Qtm_tau1_ndpsi_sub_Ishift(tmp12, tmp11, up_new, dn_new); // tau1 exchange tmp11 <-> tmp12  
+    }
+    else if(f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+      Qsw_tau1_ndpsi_sub_Ishift(tmp12, tmp11, up_new, dn_new); // tau1 exchange tmp11 <-> tmp12
+    }
+
+    if (    f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qsw_pm_ndpsi_shift ){  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+
+      // tau1 exchange new1tmp <-> new2tmp
+      convert_odd_to_lexic( new2tmp, tmp11);
+      convert_odd_to_lexic( new1tmp, tmp12);
+      Hopping_Matrix(EO, tmp21, tmp11);
+      Hopping_Matrix(EO, tmp22, tmp12);
+      Msw_ee_inv_ndpsi(tmp11, tmp12, tmp21, tmp22);
+      convert_even_to_lexic(new2tmp, tmp11);
+      convert_even_to_lexic(new1tmp, tmp12);
+    } 
+    finalize_solver(oe_solver_field, 4);
+  } 
+
+  // Checking if the operator is in the list and compatible with N
+  if (      f == Qtm_ndpsi ||           //  Gamma5 Dh    - Schur complement with csw = 0
+            f == Qsw_ndpsi ||           //  Gamma5 Dh    - Schur complement
+            f == Qtm_dagger_ndpsi ||    //  Gamma5 Dh    - Schur complement with mu = -mubar and csw = 0
+            f == Qsw_dagger_ndpsi ||    //  Gamma5 Dh    - Schur complement with mu = -mubar
+            f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with minus shift
+            f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    if( N != VOLUME/2 && g_proc_id == 0 )
+      printf("WARNING: expected N == VOLUME/2 for the required operator in MG_solve. Continuing with N == VOLUME\n");
+  }
+  else if ( f == D_ndpsi ) {            //  Dh
+    if( N != VOLUME && g_proc_id == 0 )
+      printf("WARNING: expected N == VOLUME for the required operator in MG_solve. Continuing with N == VOLUME/2\n");
+  }
+  else if( g_proc_id == 0 )
+    printf("WARNING: required operator unknown for MG_solve. Using standard operator: %s.\n",
+           N==VOLUME?"":"Qsw_ndpsi");
+
+  // Setting mu and eps
+  if (      f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift )   // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, sqrt(g_shift) );
+  else if ( f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qsw_tau1_ndpsi_add_Ishift )  // Gamma5 Dh tau1 - Schur complement with plus shift
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, sqrt(g_shift) );
+  else if ( f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift )  // Gamma5 Dh tau1 - Schur complement with minus shift
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, -sqrt(g_shift) );
+  else if ( f == Qtm_dagger_ndpsi ||    //  Gamma5 Dh    - Schur complement with mu = -mubar csw = 0
+            f == Qsw_dagger_ndpsi )     //  Gamma5 Dh    - Schur complement with mu = -mubar
+    MG_update_mubar_epsbar( -g_mubar, g_epsbar, 0 );
+  else if ( f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == D_ndpsi )              //  Dh
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, 0 );
+  else
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, 0 );
+  
+  //Solving
+  if (      f == Qtm_ndpsi ||           //  Gamma5 Dh    - Schur complement with csw = 0
+            f == Qsw_ndpsi ||           //  Gamma5 Dh    - Schur complement
+            f == Qtm_dagger_ndpsi ||    //  Gamma5 Dh    - Schur complement with mu = -mubar csw = 0
+            f == Qsw_dagger_ndpsi ) {   //  Gamma5 Dh    - Schur complement with mu = -mubar
+    mul_gamma5(old1, VOLUME);
+    mul_gamma5(old2, VOLUME);
+    if (init_guess) {
+      // Removing normalization from initial guess
+      mul_r(new1, 1/mg_scale, new1, VOLUME);
+      mul_r(new2, 1/mg_scale, new2, VOLUME);
+      DDalphaAMG_solve_doublet_with_guess( (double*) new1, (double*) old1, (double*) new2, (double*) old2,
+                                           precision, &mg_status );
+    } else {
+      DDalphaAMG_solve_doublet( (double*) new1, (double*) old1, (double*) new2, (double*) old2, 
+                                precision, &mg_status );
+    }
+    if( N == VOLUME ) { // in case of VOLUME/2 old is a just local vector
+      mul_gamma5(old1, VOLUME);
+      mul_gamma5(old2, VOLUME);
+    }
+  }
+  else if ( f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift ) {// Gamma5 Dh tau1 - Schur complement with minus shift
+    mul_gamma5(old1, VOLUME);
+    mul_gamma5(old2, VOLUME);
+    // tau1 exchange new1 <-> new2
+    if (init_guess) {
+      // Removing normalization from initial guess
+      mul_r(new1, 1/mg_scale, new1, VOLUME);
+      mul_r(new2, 1/mg_scale, new2, VOLUME);
+      DDalphaAMG_solve_doublet_with_guess( (double*) new2, (double*) old1, (double*) new1, (double*) old2, 
+                                           precision, &mg_status );
+    } else {
+      DDalphaAMG_solve_doublet( (double*) new2, (double*) old1, (double*) new1, (double*) old2, 
+                                precision, &mg_status );
+    }
+    if( N == VOLUME ) { // in case of VOLUME/2 old is a just local vector
+      mul_gamma5(old1, VOLUME);
+      mul_gamma5(old2, VOLUME);
+    }
+  }            
+  else if ( f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
+    // tmLQCD:          gamma5 Dh tau1 gamma5 Dh tau1
+    if (init_guess) {
+      mul_gamma5(old1, VOLUME);
+      mul_gamma5(old2, VOLUME);
+      // Removing normalization from initial guess
+      mul_r(new1tmp, 1/mg_scale, new1tmp, VOLUME);
+      mul_r(new2tmp, 1/mg_scale, new2tmp, VOLUME);
+      DDalphaAMG_solve_doublet_with_guess( (double*) new2tmp, (double*) old1, (double*) new1tmp, (double*) old2,
+                                           precision/2, &mg_status );
+#ifdef VARIANT_FOR_SHIFTED
+      if((  f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift )   // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+         && g_shift != 0 ) {
+        // Removing normalization from initial guess
+        mul_r(new1, 1/mg_scale, new1, VOLUME);
+        mul_r(new2, 1/mg_scale, new2, VOLUME);
+        MG_update_mubar_epsbar( g_mubar, g_epsbar, -sqrt(g_shift) );
+        DDalphaAMG_solve_doublet_with_guess( (double*) new2, (double*) old1, (double*) new1, (double*) old2,
+                                             precision/2, &mg_status );
+        assign_mul_add_mul(new1, -_Complex_I/2./sqrt(g_shift), new1tmp, _Complex_I/2./sqrt(g_shift), VOLUME);
+        assign_mul_add_mul(new2, -_Complex_I/2./sqrt(g_shift), new2tmp, _Complex_I/2./sqrt(g_shift), VOLUME);
+      } else 
+#endif
+      {
+        mul_gamma5(new1tmp, VOLUME);
+        mul_gamma5(new2tmp, VOLUME);
+        set_even_to_zero(new1tmp);
+        set_even_to_zero(new2tmp);
+        // Removing normalization from initial guess
+        mg_scale *= mg_scale;
+        mul_r(new1, 1/mg_scale, new1, VOLUME);
+        mul_r(new2, 1/mg_scale, new2, VOLUME);
+        if (      f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+                  f == Qsw_pm_ndpsi_shift )   // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+          MG_update_mubar_epsbar( g_mubar, g_epsbar, -sqrt(g_shift) );
+        DDalphaAMG_solve_doublet_with_guess( (double*) new2, (double*) new1tmp, (double*) new1, (double*) new2tmp,
+                                             precision/2, &mg_status );      
+      }
+      if( N == VOLUME ) { // in case of VOLUME/2 old is a just local vector
+        mul_gamma5(old1, VOLUME);
+        mul_gamma5(old2, VOLUME);
+      }
+    } else {
+      mg_scale *= mg_scale;
+      DDalphaAMG_solve_doublet_squared_odd( (double*) new2, (double*) old2, (double*) new1, (double*) old1,
+                                            precision, &mg_status );
+    }
+  }
+  else if ( f == D_ndpsi ) {            //  Dh
+    if (init_guess) {
+      // Removing normalization from initial guess
+      mul_r(new1, 1/mg_scale, new1, VOLUME);
+      mul_r(new2, 1/mg_scale, new2, VOLUME);
+      DDalphaAMG_solve_doublet_with_guess( (double*) new1, (double*) old1, (double*) new2, (double*) old2,
+                                           precision, &mg_status );
+    } else {
+      DDalphaAMG_solve_doublet( (double*) new1, (double*) old1, (double*) new2, (double*) old2,
+                                precision, &mg_status );
+    }
+  } else {
+    if (init_guess) {
+      // Removing normalization from initial guess
+      mul_r(new1, 1/mg_scale, new1, VOLUME);
+      mul_r(new2, 1/mg_scale, new2, VOLUME);
+      DDalphaAMG_solve_doublet_with_guess( (double*) new1, (double*) old1, (double*) new2, (double*) old2,
+                                           precision, &mg_status );
+    } else {
+      DDalphaAMG_solve_doublet( (double*) new1, (double*) old1, (double*) new2, (double*) old2,
+                                precision, &mg_status );
+    }
+  }
+  if (N==VOLUME/2) {
+    convert_lexic_to_odd(up_new, new1);
+    convert_lexic_to_odd(dn_new, new2);
+  }
+  if (no_solver_field>0)
+    finalize_solver(solver_field, no_solver_field);
+  mul_r(up_new ,mg_scale, up_new, N);
+  mul_r(dn_new ,mg_scale, dn_new, N);
+  
+  if (g_proc_id == 0) {
+    printf("Solving time %.2f sec (%.1f %% on coarse grid)\n", mg_status.time,
+           100.*(mg_status.coarse_time/mg_status.time));
+    printf("Total iterations on fine grid %d\n", mg_status.iter_count);
+    printf("Total iterations on coarse grids %d\n", mg_status.coarse_iter_count);
+    if (!mg_status.success) 
+      printf("ERROR: the solver did not converge!\n");
+  }
+  
+  return mg_status.success;
+}
+
+/* Multi-shift solve of the non-degenerate (up/down) doublet through DDalphaAMG.
+ * up_new/dn_new receive one solution per shift (no_shifts entries of N sites each),
+ * up_old/dn_old are the right-hand sides, precision is handed to the library as is.
+ * Only N == VOLUME/2 is accepted; f selects the sign of the shifts and the library routine.
+ * Returns mg_status.success after the solve, or 0 when N is rejected. */
+static int MG_mms_solve_nd( spinor **const up_new, spinor **const dn_new,
+                            spinor * const up_old, spinor * const dn_old,
+                            const double * shifts, const int no_shifts,
+                            double * precision, const int N, matrix_mult_nd f)
+{
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} -> rescale by 1/4+m
+  // moreover in the nd case, the tmLQCD is multiplied by phmc_invmaxev
+  double mg_scale=0.5/g_kappa/phmc_invmaxev;
+  double *old1 = (double*) up_old;
+  double *old2 = (double*) dn_old;
+  double **new1, **new2, *mg_odd_shifts, *mg_even_shifts;
+  spinor ** solver_field = NULL;
+
+  if( N != VOLUME/2 ) { // no full VOLUME functions implemented at the moment
+    if( g_proc_id == 0 )
+      printf("ERROR: N = %d in MG_mms_solve_nd. Expected N == VOLUME/2 (%d)\n", N, VOLUME/2);
+    return 0;
+  }
+
+  // Checking if the operator is in the list and compatible with N (before allocating: nothing to leak)
+  if (      f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with minus shift
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    if( N != VOLUME/2 ) {
+      if( g_proc_id == 0 )
+        printf("ERROR: expected N == VOLUME/2 for the required operator in MG_mms_solve_nd.\n");
+      return 0;
+    }
+  }  else if( g_proc_id == 0 )
+    printf("WARNING: required operator unknown for MG_mms_solve_nd. Using standard operator: %s.\n",
+           N==VOLUME?"":"Qsw_pm_ndpsi_shift");
+
+  new1 = (double**) malloc(no_shifts*sizeof(double*));
+  new2 = (double**) malloc(no_shifts*sizeof(double*));
+  // calloc: for an unknown operator the shift arrays are never filled and must not be read uninitialised
+  mg_odd_shifts  = (double*) calloc(no_shifts, sizeof(double));
+  mg_even_shifts = (double*) calloc(no_shifts, sizeof(double));
+
+  if( N==VOLUME/2 ) {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND,2+2*no_shifts);
+    old1 = (double*) solver_field[0];
+    old2 = (double*) solver_field[1];
+    convert_odd_to_lexic( (spinor*) old1, up_old);
+    convert_odd_to_lexic( (spinor*) old2, dn_old);
+    for( int i = 0; i < no_shifts; i++ ) {
+      new1[i] = (double*) solver_field[2+2*i];
+      new2[i] = (double*) solver_field[3+2*i];
+    }
+  } else {
+    for( int i = 0; i < no_shifts; i++ ) {
+      new1[i] = (double*) up_new[i];
+      new2[i] = (double*) dn_new[i];
+    }
+  }
+
+  // Setting mubar, epsbar and shifts
+  if (      f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, shifts[0] );
+    for( int i = 0; i < no_shifts; i++ ) {
+      mg_odd_shifts[i]  = shifts[i]*mg_scale;
+      mg_even_shifts[i] = 0;
+    }
+  }
+  else if ( f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift ) {// Gamma5 Dh tau1 - Schur complement with minus shift
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, -shifts[0] );
+    for( int i = 0; i < no_shifts; i++ ) {
+      mg_odd_shifts[i]  = -shifts[i]*mg_scale;
+      mg_even_shifts[i] = 0;
+    }
+  }
+
+  //Solving
+  if (      f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift ) {// Gamma5 Dh tau1 - Schur complement with minus shift
+    mul_gamma5((spinor *const) old1, VOLUME);
+    mul_gamma5((spinor *const) old2, VOLUME);
+    // tau1 exchange new1 <-> new2
+    DDalphaAMG_solve_ms_doublet( new2, old1, new1, old2, mg_even_shifts, mg_odd_shifts, no_shifts,
+                                 precision, &mg_status );
+    if( N == VOLUME ) { // in case of VOLUME/2 old is a just local vector
+      mul_gamma5((spinor *const) old1, VOLUME);
+      mul_gamma5((spinor *const) old2, VOLUME);
+    }
+  }
+  else if ( f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    mg_scale *= mg_scale;
+    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
+    // tmLQCD:          gamma5 Dh tau1 gamma5 Dh tau1
+    DDalphaAMG_solve_ms_doublet_squared_odd( new2, old2, new1, old1, mg_even_shifts, mg_odd_shifts, no_shifts,
+                                             precision, &mg_status );
+  }
+  else // NOTE(review): unknown operator is solved with all-zero shifts and without updating mubar/epsbar - confirm
+    DDalphaAMG_solve_ms_doublet( new1, old1, new2, old2, mg_even_shifts, mg_odd_shifts, no_shifts,
+                                 precision, &mg_status );
+
+  if (N==VOLUME/2) {
+    for( int i = 0; i < no_shifts; i++ ) {
+      convert_lexic_to_odd(up_new[i], (spinor*) new1[i]);
+      convert_lexic_to_odd(dn_new[i], (spinor*) new2[i]);
+    }
+    finalize_solver(solver_field, 2+2*no_shifts);
+  }
+  for( int i = 0; i < no_shifts; i++ ) {
+    mul_r(up_new[i], mg_scale, up_new[i], N);
+    mul_r(dn_new[i], mg_scale, dn_new[i], N);
+  }
+
+  if (g_proc_id == 0) {
+    printf("Solving time %.2f sec (%.1f %% on coarse grid)\n", mg_status.time,
+           100.*(mg_status.coarse_time/mg_status.time));
+    printf("Total iterations on fine grid %d\n", mg_status.iter_count);
+    printf("Total iterations on coarse grids %d\n", mg_status.coarse_iter_count);
+    if (!mg_status.success)
+      printf("ERROR: the solver did not converge!\n");
+  }
+  free(new1);
+  free(new2);
+  free(mg_odd_shifts);
+  free(mg_even_shifts);
+  return mg_status.success;
+}
+
+/* Initialises the DDalphaAMG library from the tmLQCD global settings.
+ * Step 1: fill mg_init (communicator, lattice and process geometry, block sizes,
+ *         bc and theta, number of levels and threads, kappa, mu, csw, seeds)
+ *         and hand it to DDalphaAMG_initialize.
+ * Step 2: if mg_status.success differs from mg_lvl afterwards, warn on rank 0 and
+ *         overwrite mg_lvl with that value.
+ * Step 3: fill mg_params (index callbacks, print level, mu factor of the last level,
+ *         basis vectors, setup iterations, mixed precision) and push it with
+ *         DDalphaAMG_update_parameters.
+ */
+void MG_init()
+{
+  // communicator and rank/coordinate callbacks
+  mg_init.comm_cart=g_cart_grid;
+  mg_init.Cart_rank=Cart_rank;
+  mg_init.Cart_coords=Cart_coords;
+
+  // process grid and global lattice extents; index 0..3 is filled from the t, z, y, x quantities
+  mg_init.procs[0]=g_nproc_t;
+  mg_init.procs[1]=g_nproc_z;
+  mg_init.procs[2]=g_nproc_y;
+  mg_init.procs[3]=g_nproc_x;
+  mg_init.global_lattice[0]=T*g_nproc_t;
+  mg_init.global_lattice[1]=LZ*g_nproc_z;
+  mg_init.global_lattice[2]=LY*g_nproc_y;
+  mg_init.global_lattice[3]=LX*g_nproc_x;
+
+  // Block sizes left at 0 get a default: 4, 2, 3 or 1, the first that divides L/g_nproc_x.
+  // NOTE(review): L/g_nproc_x is used for all four directions - confirm this is intended.
+  for(int dir = 0; dir < 4; dir++) {
+    if(mg_blk[dir] != 0)
+      continue;
+    if((L/g_nproc_x)%4 == 0)
+      mg_blk[dir] = 4;
+    else if((L/g_nproc_x)%2 == 0)
+      mg_blk[dir] = 2;
+    else if((L/g_nproc_x)%3 == 0)
+      mg_blk[dir] = 3;
+    else
+      mg_blk[dir] = 1;
+  }
+  for(int dir = 0; dir < 4; dir++)
+    mg_init.block_lattice[dir] = mg_blk[dir];
+
+  // bc is 0 when X0..X3 are all zero and 2 otherwise
+  mg_init.bc = (X0==0 && X1==0 && X2==0 && X3==0) ? 0 : 2;
+  // mind the ordering: theta[1], theta[2], theta[3] take X3, X2, X1
+  mg_init.theta[0] = X0;
+  mg_init.theta[1] = X3;
+  mg_init.theta[2] = X2;
+  mg_init.theta[3] = X1;
+
+  mg_init.number_of_levels=mg_lvl;
+#ifdef TM_USE_OMP
+  // a non-positive mg_omp_num_threads means: take omp_num_threads
+  mg_init.number_openmp_threads = (mg_omp_num_threads<=0) ? omp_num_threads : mg_omp_num_threads;
+#else
+  mg_init.number_openmp_threads=1;
+#endif
+  mg_init.kappa=g_kappa;
+  mg_init.mu=0.5*g_mu/g_kappa;
+  // a negative g_c_sw is passed on as csw = 0
+  mg_init.csw = (g_c_sw<0.00) ? 0.0 : g_c_sw;
+
+  // one seed per process (random_seed + 1000*index), only if reproduce_randomnumber_flag is set
+  mg_init.rnd_seeds = NULL;
+  if (reproduce_randomnumber_flag) {
+    mg_init.rnd_seeds = (unsigned int *) malloc(g_nproc*sizeof(unsigned int));
+    for (int proc=0; proc<g_nproc; proc++)
+      mg_init.rnd_seeds[proc] = random_seed + proc*1000;
+  }
+
+  DDalphaAMG_initialize( &mg_init, &mg_params, &mg_status);
+
+  if (reproduce_randomnumber_flag)
+    free(mg_init.rnd_seeds);
+
+  if (mg_status.success!=mg_lvl) {
+    if (g_proc_id == 0) {
+      printf("MG WARNING: %d level initialized instead of %d\n",mg_status.success,mg_lvl);
+      printf("MG WARNING: parameter: mg_lvl is changed to %d\n\n",mg_status.success);
+    }
+    mg_lvl=mg_status.success;
+  }
+
+  mg_params.conf_index_fct=conf_index_fct;
+  mg_params.vector_index_fct=vector_index_fct;
+
+  // print level handed to DDalphaAMG: 1 when g_debug_level > 0, otherwise 0
+  mg_params.print = (g_debug_level > 0) ? 1 : 0;
+
+  mg_params.mu_factor[mg_lvl-1]=mg_cmu_factor; // input param mg_cmu_factor
+  mg_params.mg_basis_vectors[0]=mg_Nvec;
+  // entries 1 .. mg_lvl-2 become max(28, value of the previous entry)
+  for (int lvl=1; lvl < mg_lvl-1; lvl++)
+    mg_params.mg_basis_vectors[lvl]=fmax(28,mg_params.mg_basis_vectors[lvl-1]);
+
+  mg_params.setup_iterations[0]=mg_setup_iter;
+  mg_params.setup_iterations[1]=mg_coarse_setup_iter;
+
+  // with mixed_precision = 2 the library adapt the solving precision according to the vector components
+  mg_params.mixed_precision = mg_mixed_prec ? 2 : 1;
+
+  DDalphaAMG_update_parameters(&mg_params, &mg_status);
+}
+
+/* Adds step to the running gauge_tau and sets the mg_update_gauge flag to 1. */
+void MG_update_gauge(double step) {
+  mg_update_gauge = 1;
+  gauge_tau = gauge_tau + step;
+}
+
+/* Puts mu and mu_odd_shift (both rescaled by 0.5/g_kappa) into the DDalphaAMG parameters
+ * and zeroes mu_even_shift and all epsbar fields. The library is only updated when mu, one
+ * of the mu shifts or smoother_iterations (expected: 4) differs from the values it reports. */
+void MG_update_mu(double mu_tmLQCD, double shift_tmLQCD)
+{
+  const double mu    = 0.5 * mu_tmLQCD   /g_kappa;
+  const double shift = 0.5 * shift_tmLQCD/g_kappa;
+
+  DDalphaAMG_get_parameters(&mg_params);
+
+  if (mu == mg_params.mu && shift == mg_params.mu_odd_shift &&
+      mg_params.mu_even_shift == 0.0 && mg_params.smoother_iterations == 4)
+    return; // the library already holds these values
+
+  //Taking advantage of this function for updating printing in HMC
+  mg_params.print = (g_debug_level > 0) ? 1 : 0;
+  mg_params.mu = mu;
+  mg_params.mu_odd_shift = shift;
+  mg_params.mu_even_shift = 0.0;
+  mg_params.epsbar = 0.0;
+  mg_params.epsbar_ig5_odd_shift = 0.0;
+  mg_params.epsbar_ig5_even_shift = 0.0;
+  mg_params.mu_factor[mg_lvl-1] = mg_cmu_factor;
+  mg_params.smoother_iterations = 4;
+  DDalphaAMG_update_parameters(&mg_params, &mg_status);
+}
+
+/* Puts mubar (as mu), epsbar and epsbar_ig5_odd_shift into the DDalphaAMG parameters.
+ * mubar and epsbar are rescaled by 0.5/g_kappa, the shift additionally by 1/phmc_invmaxev.
+ * Nothing is sent when the library already reports these values with smoother_iterations == 2. */
+void MG_update_mubar_epsbar(double mubar_tmLQCD, double epsbar_tmLQCD, double shift_tmLQCD)
+{
+  const double mubar  = 0.5 * mubar_tmLQCD /g_kappa;
+  const double epsbar = 0.5 * epsbar_tmLQCD/g_kappa;
+  const double shift  = 0.5 * shift_tmLQCD/g_kappa/phmc_invmaxev;
+
+  DDalphaAMG_get_parameters(&mg_params);
+
+  if ( mubar == mg_params.mu && mg_params.mu_odd_shift == 0.0 && mg_params.mu_even_shift == 0.0 &&
+       epsbar == mg_params.epsbar && shift == mg_params.epsbar_ig5_odd_shift &&
+       mg_params.epsbar_ig5_even_shift == 0.0 && mg_params.smoother_iterations == 2 )
+    return; // the library already holds these values
+
+  //Taking advantage of this function for updating printing in HMC
+  mg_params.print = (g_debug_level > 0) ? 1 : 0;
+  mg_params.mu = mubar;
+  mg_params.mu_odd_shift = 0.0;
+  mg_params.mu_even_shift = 0.0;
+  mg_params.epsbar = epsbar;
+  mg_params.epsbar_ig5_odd_shift = shift;
+  mg_params.epsbar_ig5_even_shift = 0.0;
+  mg_params.mu_factor[mg_lvl-1] = 1.0; // fixed to 1 here, mg_cmu_factor is not applied
+  mg_params.smoother_iterations = 2;
+  DDalphaAMG_update_parameters(&mg_params, &mg_status);
+}
+
+/* Calls DDalphaAMG_free() when mg_do_setup is 0, then restores the bookkeeping variables:
+ * mg_do_setup and mg_update_gauge become 1, mg_update_setup and both tau counters become 0. */
+void MG_reset() {
+  const int must_free = (mg_do_setup == 0);
+  if(must_free)
+    DDalphaAMG_free();
+
+  gauge_tau = mg_tau = 0.0;
+  mg_update_setup = 0;
+  mg_do_setup = mg_update_gauge = 1;
+}
+
+void MG_finalize() // wrapper without arguments or return value around the library's finalize call
+{
+  DDalphaAMG_finalize(); // the only action performed here
+}
+
+
+/* Runs MG_solve for operator f and returns mg_status.iter_count of the last inversion.
+ * The tolerance handed on is sqrt(precision), with precision first divided by
+ * square_norm(phi_old) when rel_prec is 0. max_iter is not used in this function.
+ * One retry follows after MG_reset(); a second failure finalizes MPI and calls exit(1). */
+int MG_solver(spinor * const phi_new, spinor * const phi_old,
+              const double precision, const int max_iter,const int rel_prec,
+              const int N, su3 **gf, matrix_mult f)
+{
+  int success = 0;
+  double mg_prec;
+  if (rel_prec)
+    mg_prec = sqrt(precision);
+  else
+    mg_prec = sqrt(precision/square_norm(phi_old, N, 1));
+
+  // trial 0 runs on the current setup, trial 1 only after a failure and an MG_reset()
+  for (int trial = 0; trial < 2 && !success; trial++) {
+    if (trial > 0)
+      MG_reset();
+    MG_pre_solve(gf);
+    success = MG_solve( phi_new, phi_old, mg_prec, N, f );
+    if (success && g_debug_level > 2)
+      success = MG_check( phi_new, phi_old, N, mg_prec, f );
+  }
+  if (!success) {
+    if( g_proc_id == 0 )
+      printf("ERROR: solver didn't converge after two trials!! Aborting... \n");
+    //TODO: handle abort
+    DDalphaAMG_finalize();
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Finalize();
+    exit(1);
+  }
+  // mg_status should have been used last time for the inversion.
+  return mg_status.iter_count;
+}
+
+/* Even/odd wrapper around MG_solver: Even/Odd are merged into one full-volume field, the
+ * solve runs on VOLUME sites and the result is split into Even_new/Odd_new.
+ * f_full is translated to a full-volume operator; an unrecognised one falls back to D_psi
+ * with a warning on rank 0. N is not used. Returns the value returned by MG_solver. */
+int MG_solver_eo(spinor * const Even_new, spinor * const Odd_new,
+                 spinor * const Even, spinor * const Odd,
+                 const double precision, const int max_iter, const int rel_prec,
+                 const int N, su3 **gf, matrix_mult_full f_full)
+{
+  int iter_count;
+  spinor ** solver_field = NULL;
+  matrix_mult f = &D_psi; // kept for M_full, Msw_full and unrecognised operators
+
+  init_solver_field(&solver_field, VOLUMEPLUSRAND, 2);
+  convert_eo_to_lexic(solver_field[0], Even, Odd);
+
+  // the tests are nested so that they apply in the order M_full, Q_full, Msw_full, Qsw_full
+  if (f_full != M_full) {
+    if (f_full == Q_full)
+      f=&Q_plus_psi;
+    else if (f_full != Msw_full) {
+      if (f_full == Qsw_full)
+        f=&Qsw_full_plus_psi;
+      else if( g_proc_id == 0 )
+        printf("WARNING: required operator unknown for MG_solver_eo. Using standard operator.\n");
+    }
+  }
+
+  iter_count = MG_solver( solver_field[1], solver_field[0], precision, max_iter, rel_prec, VOLUME, gf, f );
+
+  convert_lexic_to_eo(Even_new, Odd_new, solver_field[1]);
+  finalize_solver(solver_field, 2);
+  return iter_count;
+}
+
+/* Runs MG_solve_nd for operator f and returns mg_status.iter_count of the last inversion.
+ * Tolerance: sqrt(precision), with precision first divided by the summed square norms of
+ * up_old and dn_old when rel_prec is 0. max_iter is not used in this function.
+ * Failures lead to a retry after MG_reset(); if that fails too, MPI is finalized and exit(1) called. */
+int MG_solver_nd(spinor * const up_new, spinor * const dn_new,
+                 spinor * const up_old, spinor * const dn_old,
+                 const double precision, const int max_iter, const int rel_prec,
+                 const int N, su3 **gf, matrix_mult_nd f)
+{
+  int success = 0;
+  double mg_prec;
+  if (rel_prec)
+    mg_prec = sqrt(precision);
+  else
+    mg_prec = sqrt(precision/(square_norm(up_old, N, 1)+square_norm(dn_old, N, 1)));
+
+  MG_pre_solve(gf);
+  success = MG_solve_nd( up_new, dn_new, up_old, dn_old, mg_prec, N, f );
+
+  // with g_debug_level > 2 a solution rejected by MG_check_nd is recomputed once on the same setup
+  if(success && g_debug_level > 2 && !MG_check_nd( up_new, dn_new, up_old, dn_old, N, mg_prec, f )) {
+    success = MG_solve_nd( up_new, dn_new, up_old, dn_old, mg_prec, N, f);
+    if(success)
+      success = MG_check_nd( up_new, dn_new, up_old, dn_old, N, mg_prec, f );
+  }
+
+  if(!success) {
+    MG_reset();
+    MG_pre_solve(gf);
+    success = MG_solve_nd( up_new, dn_new, up_old, dn_old, mg_prec, N, f);
+    if(success && g_debug_level > 2)
+      success = MG_check_nd( up_new, dn_new, up_old, dn_old, N, mg_prec, f );
+  }
+  if(!success) {
+    if( g_proc_id == 0 )
+      printf("ERROR: solver didn't converge after two trials!! Aborting... \n");
+    //TODO: handle abort
+    DDalphaAMG_finalize();
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Finalize();
+    exit(1);
+  }
+  // mg_status should have been used last time for the inversion.
+  return mg_status.iter_count;
+}
+
+/* Even/odd front end of MG_solver_nd for the non-degenerate doublet.
+ *
+ * Even_up/Odd_up and Even_dn/Odd_dn are merged into two full-volume right-hand sides, the
+ * solve runs on VOLUME sites with D_ndpsi (every f_full maps to it; an unrecognised f_full
+ * only adds a warning on rank 0), and the two solutions are split into the *_new_* fields.
+ * The parameter N is not used. Returns the value returned by MG_solver_nd.
+ */
+int MG_solver_nd_eo(spinor * const Even_new_up, spinor * const Odd_new_up,
+                    spinor * const Even_new_dn, spinor * const Odd_new_dn,
+                    spinor * const Even_up, spinor * const Odd_up,
+                    spinor * const Even_dn, spinor * const Odd_dn,
+                    const double precision, const int max_iter, const int rel_prec,
+                    const int N, su3 **gf, matrix_mult_full_nd f_full)
+{
+  int iter_count;
+  spinor ** solver_field = NULL;
+  matrix_mult_nd f = &D_ndpsi;
+
+  init_solver_field(&solver_field, VOLUMEPLUSRAND, 4);
+  convert_eo_to_lexic(solver_field[0], Even_up, Odd_up);
+  convert_eo_to_lexic(solver_field[1], Even_dn, Odd_dn);
+
+  // the message names this function, so the log points at the right call site
+  if (f_full != M_full_ndpsi && f_full != Msw_full_ndpsi && g_proc_id == 0)
+    printf("WARNING: required operator unknown for MG_solver_nd_eo. Using standard operator.\n");
+
+  iter_count = MG_solver_nd( solver_field[2], solver_field[3], solver_field[0], solver_field[1], precision, max_iter,
+                             rel_prec, VOLUME, gf, f );
+
+  convert_lexic_to_eo(Even_new_up, Odd_new_up, solver_field[2]);
+  convert_lexic_to_eo(Even_new_dn, Odd_new_dn, solver_field[3]);
+  finalize_solver(solver_field, 4);
+
+  return iter_count;
+}
+
+/* Multi-shift front end for the non-degenerate doublet.
+ * Converts the no_shifts entries of precision into the tolerances handed to
+ * MG_mms_solve_nd, runs that solve and, for g_debug_level > 2, MG_mms_check_nd.
+ * A failure leads to one retry after MG_reset(); if that fails too, DDalphaAMG and MPI
+ * are finalized and exit(1) is called. max_iter is not used in this function.
+ * Returns mg_status.iter_count of the last inversion. */
+int MG_mms_solver_nd(spinor **const up_new, spinor **const dn_new,
+                     spinor * const up_old, spinor * const dn_old,
+                     const double * shifts, const int no_shifts,
+                     const double * precision, const int max_iter, const int rel_prec,
+                     const int N, su3 **gf, matrix_mult_nd f)
+{
+  int success=0;
+  double mg_prec[no_shifts];
+  double nrhs = 1.0;
+
+  // relative precision: divide by 1 (value unchanged);
+  // absolute precision: divide by the summed square norms of the right-hand sides
+  if(!rel_prec)
+    nrhs = square_norm(up_old, N, 1)+square_norm(dn_old, N, 1);
+  for(int s=0; s<no_shifts; s++)
+    mg_prec[s] = sqrt(precision[s]/nrhs);
+
+  // trial 0 runs on the current setup, trial 1 only after a failure and an MG_reset()
+  for(int trial=0; trial<2 && !success; trial++) {
+    if(trial > 0)
+      MG_reset();
+    MG_pre_solve(gf);
+    success = MG_mms_solve_nd( up_new, dn_new, up_old, dn_old, shifts, no_shifts, mg_prec, N, f );
+    if(success && g_debug_level > 2)
+      success = MG_mms_check_nd( up_new, dn_new, up_old, dn_old, shifts, no_shifts, N, mg_prec, f );
+  }
+
+  if(!success) {
+    if( g_proc_id == 0 )
+      printf("ERROR: solver didn't converge after two trials!! Aborting... \n");
+    //TODO: handle abort
+    DDalphaAMG_finalize();
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Finalize();
+    exit(1);
+  }
+
+  // mg_status should have been used last time for the inversion.
+  return mg_status.iter_count;
+}
+
+#endif
diff --git a/DDalphaAMG_interface.h b/DDalphaAMG_interface.h
new file mode 100644
index 000000000..0fa8e75ca
--- /dev/null
+++ b/DDalphaAMG_interface.h
@@ -0,0 +1,82 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2016 Simone Bacchio, Jacob Finkenrath
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Interface for DDalphaAMG
+ *
+ *******************************************************************************/
+
+#ifndef DDalphaAMG_INTERFACE_H_
+#define DDalphaAMG_INTERFACE_H_
+#include "global.h"
+#include "su3.h"
+#include"solver/matrix_mult_typedef.h"
+#include"solver/matrix_mult_typedef_nd.h"
+/* Globals declared with extern; their definitions live outside this header. */
+extern int mg_setup_iter;
+extern int mg_coarse_setup_iter;
+extern int mg_update_setup_iter;
+extern int mg_update_gauge;
+extern int mg_omp_num_threads;
+extern int mg_Nvec;
+extern int mg_lvl;
+extern int mg_blk[4];
+extern int mg_mixed_prec;
+extern int mg_setup_mu_set;
+extern int mg_no_shifts;
+extern double mg_mms_mass;
+extern double mg_setup_mu;
+extern double mg_cmu_factor;
+extern double mg_dtau_update;
+extern double mg_rho_update;
+/* Functions without spinor arguments: MG_init, the MG_update_* family, MG_reset, MG_finalize. */
+void MG_init(void);
+void MG_update_gauge(double step);
+void MG_update_mu(double mu_tmLQCD, double odd_tmLQCD);
+void MG_update_mubar_epsbar(double mubar_tmLQCD, double epsbar_tmLQCD, double shift_tmLQCD);
+void MG_reset(void);
+void MG_finalize(void);
+/* Single-field solver: phi_new / phi_old (presumably solution / source - confirm in the .c file). */
+int MG_solver(spinor * const phi_new, spinor * const phi_old,
+	      const double precision, const int max_iter,const int rel_prec,
+	      const int N, su3 **gf, matrix_mult f);
+/* Same scalar parameters, with each field passed as separate Even and Odd parts. */
+int MG_solver_eo(spinor * const Even_new, spinor * const Odd_new,
+		 spinor * const Even, spinor * const Odd,
+		 const double precision, const int max_iter, const int rel_prec,
+		 const int N, su3 **gf, matrix_mult_full f_full);
+/* Two-field (up/dn) variant; takes a matrix_mult_nd operator instead of matrix_mult. */
+int MG_solver_nd(spinor * const up_new, spinor * const dn_new,
+		 spinor * const up_old, spinor * const dn_old,
+		 const double precision, const int max_iter, const int rel_prec,
+		 const int N, su3 **gf, matrix_mult_nd f);
+/* up/dn variant with each field passed as separate Even and Odd parts. */
+int MG_solver_nd_eo(spinor * const Even_new_up, spinor * const Odd_new_up, 
+                    spinor * const Even_new_dn, spinor * const Odd_new_dn,
+                    spinor * const Even_up, spinor * const Odd_up,
+                    spinor * const Even_dn, spinor * const Odd_dn,
+                    const double precision, const int max_iter, const int rel_prec,
+                    const int N, su3 **gf, matrix_mult_full_nd f_full);
+/* Multi-shift up/dn variant: precision is indexed per shift (precision[i], i < no_shifts). */
+int MG_mms_solver_nd(spinor **const up_new, spinor **const dn_new,
+                     spinor * const up_old, spinor * const dn_old,
+                     const double * shifts, const int no_shifts,
+                     const double * precision, const int max_iter, const int rel_prec,
+                     const int N, su3 **gf, matrix_mult_nd f);
+
+#endif /* DDalphaAMG_INTERFACE_H_ */
diff --git a/DirectPut.c b/DirectPut.c
index 967846508..e6e7f54c4 100644
--- a/DirectPut.c
+++ b/DirectPut.c
@@ -18,16 +18,16 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-#  include<config.h>
+#  include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <sys/types.h>
 #include <stdint.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 #  include <omp.h>
 #endif
 #include "global.h"
diff --git a/GPU/ASYNC.cuh b/GPU/ASYNC.cuh
index 423c4a2a2..cce70e774 100644
--- a/GPU/ASYNC.cuh
+++ b/GPU/ASYNC.cuh
@@ -78,7 +78,7 @@ __global__ void dev_Hopping_Matrix_ASYNC (const dev_su3_2v * gf,
             
             #ifdef TEMPORALGAUGE
               // gf == ID for t != T-1 => just read the spinor
-              #ifdef MPI
+              #ifdef TM_USE_MPI
                 if ( ((gfindex_site[pos]) < (dev_T-1)*spatialvol) || (dev_rank < dev_nproc-1) ) {
                 //if ((gfindex_site[pos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE
               #else
@@ -143,7 +143,7 @@ __global__ void dev_Hopping_Matrix_ASYNC (const dev_su3_2v * gf,
             //color
             #ifdef TEMPORALGAUGE
               // gf == ID for t != T-1 => just read the spinor
-              #ifdef MPI
+              #ifdef TM_USE_MPI
                 if ( ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) || (dev_rank > 0) ) {
                 //if ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE
               #else
@@ -1205,7 +1205,7 @@ __global__ void dev_Hopping_Matrix_half_ASYNC (const dev_su3_2v_half * gf,
             
             #ifdef TEMPORALGAUGE
               // gf == ID for t != T-1 => just read the spinor
-              #ifdef MPI
+              #ifdef TM_USE_MPI
                 if ( ((gfindex_site[pos]) < (dev_T-1)*spatialvol) || (dev_rank < dev_nproc-1) ) {
                 //if ((gfindex_site[pos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE
               #else
@@ -1282,7 +1282,7 @@ __global__ void dev_Hopping_Matrix_half_ASYNC (const dev_su3_2v_half * gf,
             //color
             #ifdef TEMPORALGAUGE
               // gf == ID for t != T-1 => just read the spinor
-              #ifdef MPI
+              #ifdef TM_USE_MPI
                 if ( ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) || (dev_rank > 0) ) {
                 //if ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE
               #else
diff --git a/GPU/DEBUG/MATRIX_DEBUG.cuh b/GPU/DEBUG/MATRIX_DEBUG.cuh
index 33bbea249..aa4340df5 100644
--- a/GPU/DEBUG/MATRIX_DEBUG.cuh
+++ b/GPU/DEBUG/MATRIX_DEBUG.cuh
@@ -6,7 +6,7 @@
 
 extern "C" {
 //#ifdef HAVE_CONFIG_H
-//# include<config.h>
+//# include<tmlqcd_config.h>
 //#endif
 //#include <stdlib.h>
 //#include <stdio.h>
diff --git a/GPU/DEBUG/MATRIX_MPI_DEBUG.cuh b/GPU/DEBUG/MATRIX_MPI_DEBUG.cuh
index 9668aece0..f862d2af8 100644
--- a/GPU/DEBUG/MATRIX_MPI_DEBUG.cuh
+++ b/GPU/DEBUG/MATRIX_MPI_DEBUG.cuh
@@ -6,7 +6,7 @@
 
 extern "C" {
   //#ifdef HAVE_CONFIG_H
-  //# include<config.h>
+  //# include<tmlqcd_config.h>
   //#endif
   //#include <stdlib.h>
   //#include <stdio.h>
diff --git a/GPU/Hopping_Matrix.cuh b/GPU/Hopping_Matrix.cuh
index 2ed19dd57..5b85729b3 100644
--- a/GPU/Hopping_Matrix.cuh
+++ b/GPU/Hopping_Matrix.cuh
@@ -497,7 +497,7 @@ __global__ void dev_Hopping_Matrix(const dev_su3_2vM(RealT) * gf, const dev_spin
             
             #ifdef TEMPORALGAUGE
               // gf == ID for t != T-1 => just read the spinor
-              #ifdef MPI
+              #ifdef TM_USE_MPI
                 if ( ((gfindex_site[pos]) < (dev_T-1)*spatialvol) || (dev_rank < dev_nproc-1) ) {
                 //if ((gfindex_site[pos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE
               #else
@@ -562,7 +562,7 @@ __global__ void dev_Hopping_Matrix(const dev_su3_2vM(RealT) * gf, const dev_spin
             //color
             #ifdef TEMPORALGAUGE
               // gf == ID for t != T-1 => just read the spinor
-              #ifdef MPI
+              #ifdef TM_USE_MPI
                 if ( ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) || (dev_rank > 0) ) {
                 //if ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE
               #else
@@ -977,7 +977,7 @@ __global__ void dev_Hopping_Matrix_half(const dev_su3_2v_half * gf, const dev_sp
             
             #ifdef TEMPORALGAUGE
               // gf == ID for t != T-1 => just read the spinor
-              #ifdef MPI
+              #ifdef TM_USE_MPI
                 if ( ((gfindex_site[pos]) < (dev_T-1)*spatialvol) || (dev_rank < dev_nproc-1) ) {
                 //if ((gfindex_site[pos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE
               #else
@@ -1054,7 +1054,7 @@ __global__ void dev_Hopping_Matrix_half(const dev_su3_2v_half * gf, const dev_sp
             //color
             #ifdef TEMPORALGAUGE
               // gf == ID for t != T-1 => just read the spinor
-              #ifdef MPI
+              #ifdef TM_USE_MPI
                 if ( ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) || (dev_rank > 0) ) {
                 //if ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE
               #else
diff --git a/GPU/MACROS.cuh b/GPU/MACROS.cuh
index a1cf010eb..4124ceb96 100644
--- a/GPU/MACROS.cuh
+++ b/GPU/MACROS.cuh
@@ -70,7 +70,7 @@
 
 
 
-#ifndef MPI	//  non-MPI  ////////////////////////////////////////////////////////////
+#ifndef TM_USE_MPI	//  non-MPI  ////////////////////////////////////////////////////////////
 
 
 
diff --git a/GPU/gauge_reconstruction.cuh b/GPU/gauge_reconstruction.cuh
index fca0f0d24..2cb9bc68a 100644
--- a/GPU/gauge_reconstruction.cuh
+++ b/GPU/gauge_reconstruction.cuh
@@ -963,7 +963,7 @@ __global__ void dev_check_gauge_reconstruction_8(typename dev_su3_2vT<RealT>::ty
 template<class RealT>
 void su3to2vf4(su3** gf, typename dev_su3_2vT<RealT>::type* h2d_gf){
   int i,j;
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     for (i = 0; i < VOLUME; i++) {
   #else
     for (i = 0; i < (VOLUME+RAND); i++) {
@@ -996,7 +996,7 @@ void su3to2vf4(su3** gf, typename dev_su3_2vT<RealT>::type* h2d_gf){
 template<class RealT>
 void su3to8(su3** gf, typename dev_su3_8T<RealT>::type* h2d_gf){
   int i,j;
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     for (i = 0; i < VOLUME; i++) {
   #else
     for (i = 0; i < (VOLUME+RAND); i++) {
diff --git a/GPU/half.cuh b/GPU/half.cuh
index 1261f240d..79b7f901b 100644
--- a/GPU/half.cuh
+++ b/GPU/half.cuh
@@ -433,7 +433,7 @@ extern "C" int bind_halfspinor_texture(dev_spinor_half* sh, float* shnorm){
   size_t size, sizenorm;
   int gridsize;
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     if(even_odd_flag){
       size = sizeof(short4)*6*(VOLUME+RAND)/2;
       sizenorm = sizeof(float)*(VOLUME+RAND)/2;
@@ -577,7 +577,7 @@ return(0);
 extern "C" int bind_texture_gf_half(dev_su3_2v_half * gf){
  //printf("Binding texture to gaugefield\n");
  
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     #ifdef GF_8
      size_t size = sizeof(short4)*2*(VOLUME+RAND)*4;
     #else
@@ -1054,7 +1054,7 @@ float dotprod_half(dev_spinor_half* x, float* x_norm, dev_spinor_half* y, float*
    for(i=0; i<blas_half_redblocks; i++){
      finalsum += blas_half_sredfield[i];
    }
-   #ifdef MPI
+   #ifdef TM_USE_MPI
      MPI_Allreduce(&finalsum, &result, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
      finalsum=result;
    #endif
@@ -1089,7 +1089,7 @@ float squarenorm_half(dev_spinor_half* x, float * xnorm){
    for(i=0; i<blas_half_redblocks; i++){
      finalsum += blas_half_sredfield[i];
    }
-   #ifdef MPI
+   #ifdef TM_USE_MPI
      MPI_Allreduce(&finalsum, &result, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
      finalsum=result;
    #endif
diff --git a/GPU/half_solvers.cuh b/GPU/half_solvers.cuh
index 33b740053..194e70573 100644
--- a/GPU/half_solvers.cuh
+++ b/GPU/half_solvers.cuh
@@ -202,7 +202,7 @@ extern "C" int dev_cg_eo_half(
 
   //use full volume here as we need the complete gauge field!!!
   int Vol;
-   #ifndef MPI
+   #ifndef TM_USE_MPI
      Vol = VOLUME;
    #else
      Vol = VOLUME+RAND;
@@ -285,7 +285,7 @@ extern "C" int dev_cg_eo_half(
  for(i=0;i<maxit;i++){ //MAIN LOOP
   
   // Q_{-}Q{+}
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     dev_Qtm_pm_psi_half(spin2, spin2_norm, spin3, spin3_norm, griddim3, blockdim3, griddim4, blockdim4);
   #else
     dev_Qtm_pm_psi_half_mpi(spin2, spin2_norm, spin3, spin3_norm, griddim3, blockdim3, griddim4, blockdim4);
@@ -367,7 +367,7 @@ extern "C" int dev_cg_eo_half(
     // DO NOT USE tm_dirac_dagger_kappa here, otherwise spin2 will be overwritten!!!
       
     // Q_{-}Q{+}
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       dev_Qtm_pm_psi_half(spin1, spin1_norm, spin3, spin3_norm, griddim3, blockdim3, griddim4, blockdim4);
     #else
       dev_Qtm_pm_psi_half_mpi(spin1, spin1_norm, spin3, spin3_norm, griddim3, blockdim3, griddim4, blockdim4);
diff --git a/GPU/mixed_solve.cu b/GPU/mixed_solve.cu
index 5a33e4e37..652d147c4 100644
--- a/GPU/mixed_solve.cu
+++ b/GPU/mixed_solve.cu
@@ -74,7 +74,7 @@ extern "C" {
 #include "../su3spinor.h"
 #include "../solver/solver_field.h"
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   #include "../xchange.h"
 #endif 
 
@@ -83,15 +83,13 @@ extern "C" {
 
 
 #ifdef HAVE_CONFIG_H
-  #include<config.h>
+  #include<tmlqcd_config.h>
 #endif
 
 
-#ifdef MPI
-  #undef MPI
+#ifdef TM_USE_MPI
   #undef REAL
     #include <mpi.h>
-  #define MPI
   #define REAL float
 #endif
 
@@ -268,7 +266,7 @@ __device__  int  dev_LX,dev_LY,dev_LZ,dev_T,dev_VOLUME;
 
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
 
 // from mixed_solve_eo_nd.cuh
@@ -346,7 +344,7 @@ EXTERN int g_nb_z_up, g_nb_z_dn;
 // mixed solver, even/odd, non-degenerate two flavour
 #include "mixed_solve_eo_nd.cuh"
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 // optimization of the communication
   #include "ASYNC.cuh"
 #endif 
@@ -424,7 +422,7 @@ void dev_Qtm_pm_psi(dev_spinorM(RealT)* spinin, dev_spinorM(RealT)* spinout, int
   //spinout == odd
   
   //Q_{-}
-  		#ifdef MPI
+  		#ifdef TM_USE_MPI
   		  xchange_field_wrapper(spinin, 0);
   		#endif
   #ifdef USETEXTURE
@@ -440,7 +438,7 @@ void dev_Qtm_pm_psi(dev_spinorM(RealT)* spinin, dev_spinorM(RealT)* spinout, int
   #endif
   dev_mul_one_pm_imu_inv<RealT> <<<gridsize2, blocksize2>>>(mixedsolveParameter.dev_spin_eo1,mixedsolveParameter.dev_spin_eo2, -1.);
   
-  		#ifdef MPI
+  		#ifdef TM_USE_MPI
   		  xchange_field_wrapper(mixedsolveParameter.dev_spin_eo2, 1);
   		#endif
   #ifdef USETEXTURE
@@ -458,7 +456,7 @@ void dev_Qtm_pm_psi(dev_spinorM(RealT)* spinin, dev_spinorM(RealT)* spinout, int
   
   
   //Q_{+}
-  		#ifdef MPI
+  		#ifdef TM_USE_MPI
   		  xchange_field_wrapper(mixedsolveParameter.dev_spin_eo2, 0);
   		#endif
   #ifdef USETEXTURE
@@ -474,7 +472,7 @@ void dev_Qtm_pm_psi(dev_spinorM(RealT)* spinin, dev_spinorM(RealT)* spinout, int
   #endif
   dev_mul_one_pm_imu_inv<RealT> <<<gridsize2, blocksize2>>>(mixedsolveParameter.dev_spin_eo1,spinout, +1.);
   
-  		#ifdef MPI
+  		#ifdef TM_USE_MPI
   		  xchange_field_wrapper(spinout, 1);
   		#endif
   #ifdef USETEXTURE
@@ -494,7 +492,7 @@ void dev_Qtm_pm_psi(dev_spinorM(RealT)* spinin, dev_spinorM(RealT)* spinout, int
 
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 // aequivalent to Qtm_pm_psi in tm_operators.c
 // using HOPPING_ASYNC for mpi
 template<class RealT>
@@ -663,7 +661,7 @@ extern "C" void dev_Qtm_pm_psi_half(dev_spinor_half* spinin, float* spinin_norm,
 }
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
 // aequivalent to Qtm_pm_psi in tm_operators.c for half precision
 extern "C" void dev_Qtm_pm_psi_half_mpi(dev_spinor_half* spinin, float* spinin_norm, dev_spinor_half* spinout, float* spinout_norm, int gridsize, int blocksize, int gridsize2, int blocksize2){
@@ -828,7 +826,7 @@ extern "C" int find_devices() {
 
   cudaGetDeviceCount(&deviceCount);
     
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     if (g_cart_id == 0) {
   #endif
     
@@ -888,7 +886,7 @@ extern "C" int find_devices() {
     #endif
     }
     
-    #ifdef MPI 
+    #ifdef TM_USE_MPI 
       }
     #endif
     
@@ -1303,7 +1301,7 @@ int dev_cg_eo(
   he_cg_init<<< 1, 1 >>> (grid, (REAL) g_kappa, (REAL)(g_mu/(2.0*g_kappa)), h0,h1,h2,h3);
   // BEWARE in dev_tm_dirac_kappa we need the true mu (not 2 kappa mu!)
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     he_cg_init_nd_additional_mpi<<<1,1>>>(VOLUMEPLUSRAND, RAND, g_cart_id, g_nproc);
     // debug	// check dev_VOLUMEPLUSRAND and dev_RAND on device
   	if (g_proc_id == 0) {
@@ -1369,7 +1367,7 @@ int dev_cg_eo(
  
 
  //relative precision -> get initial residue
- #ifndef MPI
+ #ifndef TM_USE_MPI
    sourcesquarenorm = cublasDot (24*VOLUME/2, (const RealT*)spinin, 1, (const RealT*)spinin, 1);
  #else
    sourcesquarenorm = cublasDot_wrapper (24*VOLUME/2, (RealT*)spinin, 1, (RealT*)spinin, 1);
@@ -1391,7 +1389,7 @@ int dev_cg_eo(
  for(i=0;i<maxit;i++){ //MAIN LOOP
   
   // Q_{-}Q{+}
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     dev_Qtm_pm_psi    <RealT>(spin2, spin3, griddim3, blockdim3, griddim4, blockdim4, mixedsolveParameter);
   #else
     dev_Qtm_pm_psi_mpi<RealT>(spin2, spin3, griddim3, blockdim3, griddim4, blockdim4, mixedsolveParameter);
@@ -1404,7 +1402,7 @@ int dev_cg_eo(
   
   
  //alpha
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     host_dotprod = cublasDot (24*VOLUME/2, (const RealT*) spin2, 1, (const RealT*) spin3, 1);
   #else
     host_dotprod = cublasDot_wrapper (24*VOLUME/2, (RealT*) spin2, 1, (RealT*) spin3, 1);
@@ -1425,7 +1423,7 @@ int dev_cg_eo(
   }
 
   //Abbruch?
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     host_dotprod = cublasDot (24*VOLUME/2, (const RealT*) spin0, 1,(const RealT*) spin0, 1);
   #else
     host_dotprod = cublasDot_wrapper (24*VOLUME/2, (RealT*) spin0, 1,(RealT*) spin0, 1);
@@ -1458,7 +1456,7 @@ int dev_cg_eo(
     // DO NOT USE tm_dirac_dagger_kappa here, otherwise spin2 will be overwritten!!!
 
     // Q_{-}Q{+}
-    #ifndef MPI
+    #ifndef TM_USE_MPI
         dev_Qtm_pm_psi    <RealT>(spin1, spin3, griddim3, blockdim3, griddim4, blockdim4, mixedsolveParameter);
     #else
         dev_Qtm_pm_psi_mpi<RealT>(spin1, spin3, griddim3, blockdim3, griddim4, blockdim4, mixedsolveParameter);
@@ -1725,7 +1723,7 @@ void convert2double_spin (typename dev_spinorT<RealT>::type* spin, spinor* h2d)
 
   int i, Vol;
   
-  //#ifndef MPI
+  //#ifndef TM_USE_MPI
     if (even_odd_flag) {
       Vol = VOLUME/2;
     }
@@ -1782,7 +1780,7 @@ void convert2REAL4_spin(spinor* spin, typename dev_spinorT<RealT>::type* h2d){
 
   int i, Vol;
  
-  //#ifndef MPI
+  //#ifndef TM_USE_MPI
     if (even_odd_flag) {
       Vol = VOLUME/2;
     }
@@ -2004,7 +2002,7 @@ MixedsolveParameter<RealT>* init_mixedsolve_eo(su3** gf){
     }
     
     // try to set active device to device_num given in input file (or mpi rank)
-    #ifndef MPI
+    #ifndef TM_USE_MPI
     // only if device_num is not the default (-1)
      if(device_num > -1){ 
     	if(device_num < ndev){
@@ -2062,7 +2060,7 @@ MixedsolveParameter<RealT>* init_mixedsolve_eo(su3** gf){
   }
   
   // output
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     if (g_cart_id == 0) {
   #endif
   
@@ -2078,11 +2076,11 @@ MixedsolveParameter<RealT>* init_mixedsolve_eo(su3** gf){
   	  printf("Using GF 12 reconstruction.\n");
   	#endif
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     }
   #endif
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
   	#ifdef GF_8
   	  /* allocate 8 floats for gf = 2*4*VOLUME float4's*/
   	  size_t dev_gfsize = 2*4*VOLUME * sizeof(dev_su3_8M(RealT));
@@ -2106,7 +2104,7 @@ MixedsolveParameter<RealT>* init_mixedsolve_eo(su3** gf){
     exit(200);
   }   // Allocate array on device
   else {
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       printf("Allocated memory for gauge field on device.\n");
     #else
       if (g_cart_id == 0) printf("Allocated memory for gauge field on devices.\n");
@@ -2125,7 +2123,7 @@ MixedsolveParameter<RealT>* init_mixedsolve_eo(su3** gf){
   
   
   #ifdef HALF
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       #ifdef GF_8
         /* allocate 8 floats for gf = 2*4*VOLUME float4's*/
         printf("Using half precision GF 8 reconstruction\n");
@@ -2169,7 +2167,7 @@ MixedsolveParameter<RealT>* init_mixedsolve_eo(su3** gf){
   cudaMalloc((void **) &dev_nn_eo, nnsize/2);
   cudaMalloc((void **) &dev_nn_oe, nnsize/2);
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     size_t idxsize = VOLUME/2*sizeof(int);
   #else
     size_t idxsize = (VOLUME+RAND)/2*sizeof(int);
@@ -2179,7 +2177,7 @@ MixedsolveParameter<RealT>* init_mixedsolve_eo(su3** gf){
   cudaMalloc((void **) &dev_eoidx_even, idxsize);
   cudaMalloc((void **) &dev_eoidx_odd, idxsize);
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     initnn();
     initnn_eo();
     //shownn_eo();
@@ -2212,7 +2210,7 @@ MixedsolveParameter<RealT>* init_mixedsolve_eo(su3** gf){
   	  printf("Could not allocate memory for mixedsolveParameter.h2d_spin. Aborting...\n");
   	  exit(200);
   	} // Allocate float conversion spinor on host
-  	#ifdef MPI
+  	#ifdef TM_USE_MPI
   	  size_t dev_spinsize_ext =  6*(VOLUME+RAND)/2*sizeof(dev_spinorM(RealT));
   	#endif
   #else
@@ -2226,14 +2224,14 @@ MixedsolveParameter<RealT>* init_mixedsolve_eo(su3** gf){
   	  printf("Could not allocate memory for mixedsolveParameter.h2d_spin_norm. Aborting...\n");
   	  exit(200);
   	} // Allocate float conversion norm on host 
-  	#ifdef MPI
+  	#ifdef TM_USE_MPI
   	  size_t dev_spinsize_ext =  6*(VOLUME+RAND)/2*sizeof(dev_spinor_half);
   	  size_t dev_normsize_ext =  (VOLUME+RAND)/2*sizeof(float);
   	#endif
   #endif
   
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
   	cudaMalloc((void **) &mixedsolveParameter.dev_spin1, dev_spinsize);   // Allocate array spin1 on device
   	cudaMalloc((void **) &mixedsolveParameter.dev_spin2, dev_spinsize);   // Allocate array spin2 on device
   	cudaMalloc((void **) &mixedsolveParameter.dev_spin3, dev_spinsize);   // Allocate array spin3 on device
@@ -2312,7 +2310,7 @@ MixedsolveParameter<RealT>* init_mixedsolve_eo(su3** gf){
   }
   
 
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     /*  for async communication */
     // page-locked memory
    #ifndef HALF 
@@ -2466,7 +2464,7 @@ void finalize_mixedsolve(MixedsolveParameter<RealT>* mixedsolveParameterP){
     
   #endif
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   cudaFreeHost(RAND1);
   cudaFreeHost(RAND3);
   
@@ -2679,7 +2677,7 @@ void benchmark(spinor * const Q,MixedsolveParameter<RealT>& mixedsolveParameter)
   cudaMemcpy(mixedsolveParameter.dev_spinin, mixedsolveParameter.h2d_spin, dev_spinsize, cudaMemcpyHostToDevice);
   printf("%s\n", cudaGetErrorString(cudaGetLastError()));
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     assert((start = clock())!=-1);
   #else
     start = MPI_Wtime();
@@ -2735,7 +2733,7 @@ void benchmark(spinor * const Q,MixedsolveParameter<RealT>& mixedsolveParameter)
   printf("Applying H 1000 times\n");
   for(i=0; i<1000; i++){
   
-      #ifdef MPI
+      #ifdef TM_USE_MPI
            xchange_field_wrapper(mixedsolveParameter.dev_spinin, 0);
       #endif
       #ifdef USETEXTURE
@@ -2750,7 +2748,7 @@ void benchmark(spinor * const Q,MixedsolveParameter<RealT>& mixedsolveParameter)
       unbind_texture_spin(1);
     #endif
 
-    #ifdef MPI
+    #ifdef TM_USE_MPI
         xchange_field_wrapper(mixedsolveParameter.dev_spin_eo1, 0);
     #endif
        bind_texture_spin(mixedsolveParameter.dev_spin_eo1,1);
@@ -2769,7 +2767,7 @@ void benchmark(spinor * const Q,MixedsolveParameter<RealT>& mixedsolveParameter)
   
   
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     assert((stop = clock())!=-1);
     timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC;
     // x2 because 2x Hopping per iteration
@@ -2796,7 +2794,7 @@ void benchmark(spinor * const Q,MixedsolveParameter<RealT>& mixedsolveParameter)
 
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 template<class RealT>
 void benchmark2(spinor * const Q,MixedsolveParameter<RealT>& mixedsolveParameter){
   
@@ -2809,7 +2807,7 @@ void benchmark2(spinor * const Q,MixedsolveParameter<RealT>& mixedsolveParameter
   cudaMemcpy(mixedsolveParameter.dev_spinin, mixedsolveParameter.h2d_spin, dev_spinsize, cudaMemcpyHostToDevice);
   printf("%s\n", cudaGetErrorString(cudaGetLastError()));
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     assert((start = clock())!=-1);
   #else
     start = MPI_Wtime();
@@ -2888,7 +2886,7 @@ void benchmark2(spinor * const Q,MixedsolveParameter<RealT>& mixedsolveParameter
   
   
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     assert((stop = clock())!=-1);
     timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC;
     // x8 because 8x Hopping per iteration
@@ -2967,7 +2965,7 @@ int mixed_solve_eoT (spinor * const P, spinor * const Q, const int max_iter,
   #ifndef HALF
   // small benchmark
     assign(solver_field[0],Q,N);
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       benchmark(solver_field[0]);
     #else
       benchmark2(solver_field[0]); 
diff --git a/GPU/mixed_solve_eo_nd.cuh b/GPU/mixed_solve_eo_nd.cuh
index fdeac4b5f..0e2afeda8 100644
--- a/GPU/mixed_solve_eo_nd.cuh
+++ b/GPU/mixed_solve_eo_nd.cuh
@@ -63,7 +63,7 @@
 
 
 #ifdef HAVE_CONFIG_H
-  #include<config.h>
+  #include<tmlqcd_config.h>
 #endif
 
 extern "C" {
@@ -74,11 +74,9 @@ extern "C" {
 }
 #include "../global.h"
 
-#ifdef MPI
-  #undef MPI
+#ifdef TM_USE_MPI
   #undef REAL
     #include <mpi.h>
-  #define MPI
   #define REAL float
 #endif
 
@@ -93,7 +91,7 @@ size_t dev_gfsize;
 size_t dev_spinsize_int;		// making the structure transparent:							
 int N_sites_int;			// _int: internal sites
 int N_floats_int;			// _ext: internal sites + additional boundaries
-#ifdef MPI
+#ifdef TM_USE_MPI
   size_t dev_spinsize_ext;
   int N_sites_ext;
   int N_floats_ext;
@@ -134,7 +132,7 @@ dev_spinor * dev_spin_eo3_dn;
 __device__ float mubar, epsbar;
 
 
-#ifdef MPI					// collecting variables for the MPI implementation
+#ifdef TM_USE_MPI					// collecting variables for the MPI implementation
   						// put to mixed_solve.cu
   /*
   __device__ int dev_RAND;			// not used, maybe later ...
@@ -237,7 +235,7 @@ __global__ void he_cg_init_nd_additional (float param_mubar, float param_epsbar)
 
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
 // puts the additional variables VOLUMEPLUSRAND and RAND on the device
 __global__ void he_cg_init_nd_additional_mpi (int param_VOLUMEPLUSRAND, int param_RAND, int rank, int nproc) {
@@ -262,7 +260,7 @@ __global__ void he_cg_init_nd_additional_mpi (int param_VOLUMEPLUSRAND, int para
 /////////////////////////////////////////////
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
 // builds an array  iseven[global position]  to check wether is even or odd
 
@@ -525,7 +523,7 @@ void init_idxgauge_mpi() {		// works!
 
 void set_global_sizes() {
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
   	#ifdef GF_8
   	  // allocate 8 floats for gf = 2*4*VOLUME float4's			// dev_su3_8 = float4
   	  dev_gfsize = 4*VOLUME * 2*sizeof(dev_su3_8);				// allocates for each lattice site and for 4 directions  2*float4 = 8 floats  = 8 real parameters
@@ -544,7 +542,7 @@ void set_global_sizes() {
   dev_spinsize_int   =  6*VOLUME/2*sizeof(dev_spinor);				// 24 floats per lattice site
   N_sites_int        =    VOLUME/2;
   N_floats_int       = 24*VOLUME/2;
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     dev_spinsize_ext =  6*(VOLUME+RAND)/2*sizeof(dev_spinor);
     N_sites_ext      =    (VOLUME+RAND)/2;
     N_floats_ext     = 24*(VOLUME+RAND)/2;
@@ -579,7 +577,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   //////////////////////
   
   /*
-  #ifndef MPI
+  #ifndef TM_USE_MPI
   	#ifdef GF_8
   	  // allocate 8 floats for gf = 2*4*VOLUME float4's			// dev_su3_8 = float4
   	  dev_gfsize = 4*VOLUME * 2*sizeof(dev_su3_8);				// allocates for each lattice site and for 4 directions  2*float4 = 8 floats  = 8 real parameters
@@ -598,7 +596,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   dev_spinsize_int   =  6*VOLUME/2*sizeof(dev_spinor);				// 24 floats per lattice site
   N_sites_int        =    VOLUME/2;
   N_floats_int       = 24*VOLUME/2;
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     dev_spinsize_ext =  6*(VOLUME+RAND)/2*sizeof(dev_spinor);
     N_sites_ext      =    (VOLUME+RAND)/2;
     N_floats_ext     = 24*(VOLUME+RAND)/2;
@@ -635,7 +633,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
     exit(300);
   }
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
       // only if device_num is not the default (-1)
       if(device_num > -1){ 
     	// try to set active device to device_num given in input file
@@ -691,7 +689,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   
   
   // output
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     if (g_cart_id == 0) {
   #endif
   
@@ -707,7 +705,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   	  printf("Using GF 12 reconstruction.\n");
   	#endif
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     }
   #endif
   
@@ -719,7 +717,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   /////////////////
   
   /*									// put to global
-  #ifndef MPI
+  #ifndef TM_USE_MPI
   	#ifdef GF_8
   	  // allocate 8 floats for gf = 2*4*VOLUME float4's		// dev_su3_8 = float4
   	  dev_gfsize = 4*VOLUME * 2*sizeof(dev_su3_8);			// allocates for each lattice site and for 4 directions  2*float4 = 8 floats  = 8 real parameters
@@ -743,7 +741,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
     exit(200);
   }
   else {
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       printf("Allocated memory for gauge field on device.\n");
     #else
       if (g_cart_id == 0) printf("Allocated memory for gauge gauge field on devices.\n");
@@ -764,7 +762,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   
   		// debug	// CUDA
   		#ifdef CUDA_DEBUG
-  		  #ifndef MPI
+  		  #ifndef TM_USE_MPI
   		    CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Copying MixedsolveParameter<RealT>::getGlobalP()->dev_gf to device failed.", "Copied MixedsolveParameter<RealT>::getGlobalP()->dev_gf to device.");
   		  #else
   		    CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Copying MixedsolveParameter<RealT>::getGlobalP()->dev_gf to device failed.", "Copied MixedsolveParameter<RealT>::getGlobalP()->dev_gf to devices.");
@@ -787,7 +785,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   cudaMalloc((void **) &dev_nn_oe, nnsize/2);			// half the memory on device
   
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     idxsize = VOLUME/2*sizeof(int);				// size of memory necessary for VOLUME/2 integers
   #else
     idxsize = (VOLUME+RAND)/2*sizeof(int);
@@ -798,7 +796,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   cudaMalloc((void **) &dev_eoidx_odd, idxsize);		// allocate on device
   
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     initnn();							// initialize nearest-neighbour table for gpu
     initnn_eo();						// initialize nearest-neighbour table for gpu with even-odd enabled
   #else
@@ -824,7 +822,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   
   		// debug	// CUDA
   		#ifdef CUDA_DEBUG
-  		  #ifndef MPI
+  		  #ifndef TM_USE_MPI
   		    CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of grid stuff failed.", "Allocated grid stuff on device.");
   		  #else
   		    CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of grid stuff failed.", "Allocated grid stuff on devices.");
@@ -839,7 +837,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   /////////////							// now we have to consider 2 flavors: up, dn
   
   /*
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     dev_spinsize = 6*VOLUME/2*sizeof(dev_spinor);		// remember: dev_spinor = float4
   #else
     dev_spinsize = (VOLUME+RAND)/2 * 6*sizeof(dev_spinor);	// NOTICE: this refers to the memory requirements for the device, host needs twice the memory !!
@@ -847,7 +845,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   */
   
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
   
     cudaMalloc((void **) &dev_spin1_up, dev_spinsize_int);   	// allocates device memory for the fields spinor fields used in dev_cg_eo_nd(...)
     cudaMalloc((void **) &dev_spin1_dn, dev_spinsize_int);	// pointing to device
@@ -888,7 +886,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   #endif
   
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
   		// debug	// host code
   		if ( (void *) (h2d_spin_up = (dev_spinor *) malloc(dev_spinsize_int) ) == NULL) {
   		  printf("Could not allocate memory for h2d_spin_up. Aborting...\n");
@@ -913,7 +911,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   #endif
   
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
   
     cudaMalloc((void **) &dev_spin_eo1_up, dev_spinsize_int);		// used for matrix_multiplication32(...)
     cudaMalloc((void **) &dev_spin_eo1_dn, dev_spinsize_int);
@@ -939,7 +937,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   
   		// debug	// CUDA
   		#ifdef CUDA_DEBUG
-  		  #ifndef MPI
+  		  #ifndef TM_USE_MPI
   		    CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of spinor fields failed.", "Allocated spinor fields on device.");
   		  #else
   		    CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of spinor fields failed.", "Allocated spinor fields on devices.");
@@ -950,7 +948,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   
   
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
   
   	#ifdef HOPPING_DEBUG													// Hopping_Matrix() is applied upon these spinor fields
   		// debug	// host code
@@ -1029,7 +1027,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   
   		// debug	// CUDA
   		#ifdef CUDA_DEBUG
-  		  #ifndef MPI
+  		  #ifndef TM_USE_MPI
   		    CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation output stuff failed.", "Allocated output stuff on device.");
   		  #else
   		    CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation output stuff failed.", "Allocated output stuff on devices.");
@@ -1056,7 +1054,7 @@ void init_mixedsolve_eo_nd (su3** gf) {	// gf is the full gauge field
   
   		// debug	// CUDA
   		#ifdef CUDA_DEBUG
-  		  #ifndef MPI
+  		  #ifndef TM_USE_MPI
   		    CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of grid[] specifications failed.", "Allocated grid[] specifications on device.");
   		  #else
   		    CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of grid[] specifications failed.", "Allocated grid[] specifications on devices.");
@@ -1115,7 +1113,7 @@ void finalize_mixedsolve_eo_nd(void) {
   free(h2d_spin_up);
   free(h2d_spin_dn);
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
   	#ifndef ALTERNATE_FIELD_XCHANGE
   	  free(spinor_xchange);
   	#else
@@ -1154,7 +1152,7 @@ void finalize_mixedsolve_eo_nd(void) {
   free(MixedsolveParameter<RealT>::getGlobalP()->h2d_gf);
   
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
   	#ifdef ALTERNATE_HOPPING_MATRIX
   	  free_gpu_indexfields();
   	#endif
@@ -1212,7 +1210,7 @@ void finalize_mixedsolve_eo_nd(void) {
 // MPI //
 /////////
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
 // convert spinor to double
 
@@ -1364,7 +1362,7 @@ void to_host (spinor * host, dev_spinor * device, dev_spinor * auxiliary, int si
 // boundary exchange //
 ///////////////////////
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
 // all three versions do work:
 
@@ -1454,7 +1452,7 @@ void xchange_field_wrapper (dev_spinor * dev_spin, int ieo) {
 // hopping matrix //
 ////////////////////
 
-#ifdef MPI	// implemented for checking the MPI implementation of the hopping matrix
+#ifdef TM_USE_MPI	// implemented for checking the MPI implementation of the hopping matrix
   #ifdef HOPPING_DEBUG
 
   // applies the hopping matrix on host for debugging purposes
@@ -1484,7 +1482,7 @@ void xchange_field_wrapper (dev_spinor * dev_spin, int ieo) {
 // linear algebra //
 ////////////////////
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
 // have to rebuilt some linear algebra functions which contain global communication
 // can be done as wrappers to appropriate CUBLAS routines
@@ -1866,7 +1864,7 @@ void matrix_multiplication32 (dev_spinor * spinout_up, dev_spinor * spinout_dn,
 
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
 ///////////////////////////
 // MATRIX MULTIPLICATION //
@@ -2344,7 +2342,7 @@ extern "C" void benchmark_eo_nd (spinor * Q_up, spinor * Q_dn, int N) {
   
   
   // timing
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     double timeElapsed;
   #else
     double singleTimeElapsed;
@@ -2363,7 +2361,7 @@ extern "C" void benchmark_eo_nd (spinor * Q_up, spinor * Q_dn, int N) {
   // double effectiveFlopsPerApp = 23984.0;	// hopping = 1488
   double effectiveFlopsPerApp = 21296.0;	// per lattice site
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     /*
     double realDeviceFlops;
     double realFlops;
@@ -2388,7 +2386,7 @@ extern "C" void benchmark_eo_nd (spinor * Q_up, spinor * Q_dn, int N) {
   // size of a spinor
   /*
   size_t dev_spinsize_int = 6*VOLUME/2 * sizeof(dev_spinor);
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     size_t dev_spinsize_ext = 6*(VOLUME+RAND)/2 * sizeof(dev_spinor);
   #endif
   */
@@ -2409,7 +2407,7 @@ extern "C" void benchmark_eo_nd (spinor * Q_up, spinor * Q_dn, int N) {
   dev_spinor * C_up;
   dev_spinor * C_dn;
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     cudaMalloc((void **) &A_up, dev_spinsize_int);
     cudaMalloc((void **) &A_dn, dev_spinsize_int);
     cudaMalloc((void **) &B_up, dev_spinsize_int);
@@ -2521,7 +2519,7 @@ extern "C" void benchmark_eo_nd (spinor * Q_up, spinor * Q_dn, int N) {
   
   
   		//debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
   		  printf("\nStarting a little BENCHMARK. benchmark_eo_nd().\n");
   		#else
   		  if (g_proc_id == 0) printf("\nStarting a little BENCHMARK. benchmark_eo_nd_mpi().\n");
@@ -2562,7 +2560,7 @@ extern "C" void benchmark_eo_nd (spinor * Q_up, spinor * Q_dn, int N) {
   
   
   		// debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
   		  printf("Applying the eo-preconditioned matrix %i times.\n", N);
   		#else
   		  if (g_proc_id == 0) printf("Applying the eo-preconditioned matrix %i times.\n", N);
@@ -2574,7 +2572,7 @@ extern "C" void benchmark_eo_nd (spinor * Q_up, spinor * Q_dn, int N) {
   
   
   // timer
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     startBenchmark = double(clock()) / double(CLOCKS_PER_SEC);
   #else
     startBenchmark = MPI_Wtime();
@@ -2586,7 +2584,7 @@ extern "C" void benchmark_eo_nd (spinor * Q_up, spinor * Q_dn, int N) {
   for (i = 0; i < N; i++) {
   
   
-    #ifndef MPI
+    #ifndef TM_USE_MPI
     	matrix_multiplication32(A_up, A_dn,					// A = (matrix)*B
     	                        B_up, B_dn,
     	                        griddim2, blockdim2,
@@ -2635,14 +2633,14 @@ extern "C" void benchmark_eo_nd (spinor * Q_up, spinor * Q_dn, int N) {
   
   
   // timer
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     stopBenchmark = double(clock()) / double(CLOCKS_PER_SEC);
   #else
     stopBenchmark = MPI_Wtime();
   #endif
   
   
-  #ifndef MPI
+  #ifndef TM_USE_MPI
   
   	timeElapsed = stopBenchmark - startBenchmark;
   	/*
@@ -2840,7 +2838,7 @@ int cg_eo_nd (dev_su3_2v * gf,
   size_t dev_spinsize_int   =  6*VOLUME/2*sizeof(dev_spinor);
   int N_sites_int           =    VOLUME/2;
   int N_floats_int          = 24*VOLUME/2;// (single precision) CUBLAS functions get the number of floats as input
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     size_t dev_spinsize_ext =  6*(VOLUME+RAND)/2*sizeof(dev_spinor);
     int N_sites_ext         =    (VOLUME+RAND)/2;
     int N_floats_ext        = 24*(VOLUME+RAND)/2;
@@ -3034,7 +3032,7 @@ int cg_eo_nd (dev_su3_2v * gf,
   
   
   // rr = (r_up)^2 + (r_dn)^2
-  #ifndef MPI
+  #ifndef TM_USE_MPI
     rr_up = cublasDot(N_floats_int, (float *) r_up, 1, (float *) r_up, 1);
     rr_dn = cublasDot(N_floats_int, (float *) r_dn, 1, (float *) r_dn, 1);
   #else
@@ -3058,7 +3056,7 @@ int cg_eo_nd (dev_su3_2v * gf,
   
   
   		// debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
     		  printf("\nEntering inner loop.\n");
     		#else
     		  if (g_cart_id == 0) printf("\nEntering inner loop.\n");
@@ -3071,7 +3069,7 @@ int cg_eo_nd (dev_su3_2v * gf,
 		#endif
   
   		// debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
   		  printf("Initial inner residue: %.6e\n", r0r0);
   		#else
   		  if (g_cart_id == 0) printf("Initial inner residue: %.6e\n", r0r0);
@@ -3086,7 +3084,7 @@ int cg_eo_nd (dev_su3_2v * gf,
     #ifndef MATRIX_DEBUG
     
       // A*d(k)
-      #ifndef MPI
+      #ifndef TM_USE_MPI
       		matrix_multiplication32(Ad_up, Ad_dn,										// normally:  matrix_multiplication32()
       		                         d_up,  d_dn,										// debugging: matrix_debug1(), matrix_multiplication_test()
       		                        griddim2, blockdim2,
@@ -3128,7 +3126,7 @@ int cg_eo_nd (dev_su3_2v * gf,
     		to_host(dn_field[3], d_dn, h2d_spin_dn, dev_spinsize_int);
     		
     		// matrix multiplication
-    		#ifndef MPI
+    		#ifndef TM_USE_MPI
     		  printf("This is Q_Qdagger_ND(). ");
     		#else
     		  if (g_proc_id == 0) printf("This is Q_Qdagger_ND(). ");
@@ -3151,7 +3149,7 @@ int cg_eo_nd (dev_su3_2v * gf,
     
     
     // alpha = r(k)*r(k) / d(k)*A*d(k)
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       dAd_up = cublasDot(N_floats_int, (float *) d_up, 1, (float *) Ad_up, 1);
       dAd_dn = cublasDot(N_floats_int, (float *) d_dn, 1, (float *) Ad_dn, 1);
     #else
@@ -3187,7 +3185,7 @@ int cg_eo_nd (dev_su3_2v * gf,
     else {				// recalculate residue r(k+1) = b - A*x(k+1)
     					//	"feedback"
       		// debug
-      		#ifndef MPI
+      		#ifndef TM_USE_MPI
       		  printf("Recalculating the inner residue.\n");
       		#else
       		  if (g_proc_id == 0) printf("Recalculating the inner residue.\n");
@@ -3198,7 +3196,7 @@ int cg_eo_nd (dev_su3_2v * gf,
       
       #ifndef MATRIX_DEBUG
       
-      	#ifndef MPI
+      	#ifndef TM_USE_MPI
         	matrix_multiplication32(Ax_up, Ax_dn,
         	                         x_up,  x_dn,
         	                        griddim2, blockdim2,
@@ -3232,7 +3230,7 @@ int cg_eo_nd (dev_su3_2v * gf,
     		to_host(dn_field[3], x_dn, h2d_spin_dn, dev_spinsize_int);
     		
     		// matrix multiplication
-    		#ifndef MPI
+    		#ifndef TM_USE_MPI
     		  printf("This is Q_Qdagger_ND(). ");
     		#else
     		  if (g_proc_id == 0) printf("This is Q_Qdagger_ND(). ");
@@ -3269,7 +3267,7 @@ int cg_eo_nd (dev_su3_2v * gf,
     
     
     // r(k+1)*r(k+1)
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       rr_up  = cublasDot(N_floats_int, (float *) r_up, 1, (float *) r_up, 1);
       rr_dn  = cublasDot(N_floats_int, (float *) r_dn, 1, (float *) r_dn, 1);
     #else
@@ -3285,7 +3283,7 @@ int cg_eo_nd (dev_su3_2v * gf,
     
     
     		// debug
-    		#ifndef MPI
+    		#ifndef TM_USE_MPI
     		  printf("inner iteration j = %i: rr = %.6e\n", j, rr);
     		#else
     		  if (g_proc_id == 0) printf("inner iteration j = %i: rr = %.6e\n", j, rr);
@@ -3301,7 +3299,7 @@ int cg_eo_nd (dev_su3_2v * gf,
     // aborting ?? // check wether precision is reached ...
     if ( (check_abs)&&(rr <= eps_abs) || (check_rel)&&(rr <= eps_rel*r0r0) ) {
     
-      #ifdef MPI
+      #ifdef TM_USE_MPI
         if (g_cart_id == 0) {
       #endif
       
@@ -3320,7 +3318,7 @@ int cg_eo_nd (dev_su3_2v * gf,
       		//debug
       		printf("Final inner residue: %.6e\n", rr);
       
-      #ifdef MPI
+      #ifdef TM_USE_MPI
         }
       #endif
       
@@ -3374,7 +3372,7 @@ int cg_eo_nd (dev_su3_2v * gf,
   
   
   		// debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
   		  printf("Finished inner loop beacuse of maximal number of inner iterations.\n");
   		  printf("Final inner residue: %.6e\n", rr);
   		#else
@@ -3435,7 +3433,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   
   
   		// debug
-  		#ifdef MPI
+  		#ifdef TM_USE_MPI
   		  if (g_proc_id == 0) {
   		#endif
   		
@@ -3455,7 +3453,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   		printf("\tabsolute precision: %.8e\n", innersolver_precision_abs);
   		printf("\trelative precision: %.8e\n", innersolver_precision_rel);
   
-  		#ifdef MPI
+  		#ifdef TM_USE_MPI
   		  }
   		#endif
   
@@ -3488,7 +3486,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
     // double hoppingflops = 1488.0;
     double hoppingflops = 1608.0;
     double matrixflops  = 2  *  (  2 * ( (2*hoppingflops+12+3) + (2*hoppingflops+3) + (12+2) + 12 )  );
-    #ifdef MPI
+    #ifdef TM_USE_MPI
       double allflops;				// flops added for all processes
     #endif
   #endif
@@ -3502,7 +3500,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   clock_t totalouterclocks = 0;
   
   #ifdef ALGORITHM_BENCHMARK
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       clock_t starteffective;
       clock_t stopeffective;
     #else
@@ -3532,7 +3530,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   size_t dev_spinsize_int   =  6*VOLUME/2*sizeof(dev_spinor);		// 24 floats per spinor per even lattice site
   int N_sites_int           =    VOLUME/2;				// Carsten's functions get the number of lattice points as input
   int N_floats_int          = 24*VOLUME/2;
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     size_t dev_spinsize_ext =  6*(VOLUME+RAND)/2*sizeof(dev_spinor);
     int N_sites_ext         =    (VOLUME+RAND)/2;
     int N_floats_ext        = 24*(VOLUME+RAND)/2;
@@ -3552,7 +3550,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   
   
   		//debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
   		  printf("init_mixedsolve_eo_nd():\n");
   		#else
   		  if (g_cart_id == 0) printf("init_mixedsolve_eo_nd_mpi():\n");
@@ -3566,7 +3564,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   								//	puts the nn- and eoidx-fields on device memory
 
   		//debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
   		  printf("mixedsolve_eo_nd():\n");
   		#else
   		  if (g_cart_id == 0) printf("mixedsolve_eo_nd_mpi():\n");
@@ -3629,11 +3627,11 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   		// debug	// check stuff on device
   		#ifdef STUFF_DEBUG
   		
-  			#ifdef MPI
+  			#ifdef TM_USE_MPI
   			  if (g_proc_id == 0) {
   			#endif
   			
-  			#ifdef MPI
+  			#ifdef TM_USE_MPI
   			  printf("\tOn host:\n");
   			  printf("\tVOLUME = %i\n", VOLUME);							// checking VOLUME and RAND in the parallel case 
   			  printf("\tRAND   = %i\n", RAND);
@@ -3663,7 +3661,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   			printf("\tkappa = %f\n", host_check_kappa);
   			// printf("\ttwokappamu = %f\n", host_check_twokappamu);
   			
-  			#ifdef MPI
+  			#ifdef TM_USE_MPI
   			  }
   			#endif
   		
@@ -3680,7 +3678,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   		// debug	// check mubar and epsbar on host and device
   		#ifdef STUFF_DEBUG
   		
-  			#ifdef MPI
+  			#ifdef TM_USE_MPI
   			  if (g_proc_id == 0) {
   			#endif
   			
@@ -3695,14 +3693,14 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   			printf("\tmubar = %f\n", host_check_mubar);
   			printf("\tepsbar = %f\n", host_check_epsbar);
   			
-  			#ifdef MPI
+  			#ifdef TM_USE_MPI
   			  }
   			#endif
   		
   		#endif
   
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
   
   	he_cg_init_nd_additional_mpi<<<1,1>>>(VOLUMEPLUSRAND, RAND, g_cart_id, g_nproc);
   	
@@ -3774,7 +3772,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
     Ax_dn = Ad_dn;
     
   		// debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
   		  printf("Now using the fields g_chi_up/dn_spinor_field[DUM_SOLVER{ , +1, +2}] in the mixedsolve_eo_nd().\n");
   		#else
   		  if (g_cart_id == 0) printf("Now using the fields g_chi_up/dn_spinor_field[DUM_SOLVER{ , +1, +2}] in the mixedsolve_eo_nd().\n");
@@ -3791,7 +3789,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   		Ax_up = Ad_up;
   		Ax_dn = Ad_dn;
   				// debug
-  				#ifndef MPI
+  				#ifndef TM_USE_MPI
   				  printf("Now allocating new host space for the fields in mixedsolve_eo_nd().\n");
   				#else
   				  if (g_cart_id == 0) printf("Now allocating new host space for the fields in mixedsolve_eo_nd().\n");
@@ -3810,7 +3808,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   startouter = clock();
   
   #ifdef ALGORITHM_BENCHMARK
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       starteffective = ((double)clock()) / ((double)(CLOCKS_PER_SEC));
     #else
       starteffective = MPI_Wtime();
@@ -3822,7 +3820,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   if (!initial_guess) {		// r(0) = b = Q	// for x(0) = 0
     assign(r_up, Q_up, N_sites_int);
     assign(r_dn, Q_dn, N_sites_int);
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       printf("x(0) = 0\n");
     #else
       if (g_cart_id == 0) printf("x(0) = 0\n");
@@ -3830,7 +3828,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   }
   else {			// r(0) = b - A*x(0) = Q - A*P
     bb = square_norm(P_up, N_sites_int, 1) + square_norm(P_dn, N_sites_int, 1);
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       printf("bb = %.10e\n", bb);
     #else
       if (g_cart_id == 0) printf("bb = %.10e\n", bb);
@@ -3838,7 +3836,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
     if (bb == 0) {
       assign(r_up, Q_up, N_sites_int);
       assign(r_dn, Q_dn, N_sites_int);
-      #ifndef MPI
+      #ifndef TM_USE_MPI
         printf("x(0) = 0\n");
       #else
         if (g_cart_id == 0) printf("x(0) = 0\n");
@@ -3848,7 +3846,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       Q_Qdagger_ND(Ax_up, Ax_dn, P_up, P_dn);
       diff(r_up, Q_up, Ax_up, N_sites_int);
       diff(r_dn, Q_dn, Ax_dn, N_sites_int);
-      #ifndef MPI
+      #ifndef TM_USE_MPI
         printf("x(0) != 0\n");
       #else
         if (g_cart_id == 0) printf("x(0) != 0\n");
@@ -3867,7 +3865,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   rr_old = rr; // for the first iteration
   
   		// debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
   		  printf("Initial outer residue: %.10e\n", rr_old);
   		#else
   		  if (g_cart_id == 0) printf("Initial outer residue: %.10e\n", rr_old);
@@ -3886,7 +3884,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   ////////////////
   
   		// debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
     		  printf("\nEntering outer loop.");
     		#else
     		  if (g_cart_id == 0) printf("\nEntering outer loop.");
@@ -3898,7 +3896,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
     i++;
   
     		// debug
-    		#ifndef MPI
+    		#ifndef TM_USE_MPI
     		  printf("\nouter iteration i = %i\n", i);
     		#else
     		  if (g_cart_id == 0) printf("\nouter iteration i = %i\n", i);
@@ -3930,7 +3928,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
     startinner = clock();
     
     		// debug
-    		#ifndef MPI
+    		#ifndef TM_USE_MPI
     		  printf("cg_eo_nd():\n");
     		#else
     		  if (g_cart_id == 0) printf("cg_eo_nd():\n");
@@ -3954,7 +3952,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
     totalinnerclocks = totalinnerclocks + innerclocks;
     
     		// debug
-    		#ifndef MPI
+    		#ifndef TM_USE_MPI
     		  printf("Inner solver done in: %.4e sec\n", double(innerclocks) / double(CLOCKS_PER_SEC));
     		#else
     		  if (g_cart_id == 0) printf("Inner solver done in: %.4e sec\n", double(innerclocks) / double(CLOCKS_PER_SEC));
@@ -3976,7 +3974,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
     
     
     				// debug
-    				#ifndef MPI
+    				#ifndef TM_USE_MPI
     				  printf("cg_her_nd():\n");
     				#else
     				  if (g_cart_id == 0) printf("cg_her_nd():\n");
@@ -3989,7 +3987,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
     		outercount = outercount + innercount;
     		
     				// debug
-    				#ifndef MPI
+    				#ifndef TM_USE_MPI
     				  printf("cg_her_nd() on host was used for debugging purposes.\n");
     				#else
     				  if (g_cart_id == 0) printf("cg_her_nd() on host was used for debugging purposes.\n");
@@ -4000,7 +3998,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
     
     
     		// debug
-    		#ifndef MPI
+    		#ifndef TM_USE_MPI
     		  printf("mixedsolve_eo_nd():\n");
     		#else
     		  if (g_cart_id == 0) printf("mixedsolve_eo_nd():\n");
@@ -4019,7 +4017,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       // A*x(k+1)
       Q_Qdagger_ND(Ax_up, Ax_dn, x_up, x_dn);
       		// debug
-      		#ifndef MPI
+      		#ifndef TM_USE_MPI
       		  printf("The matrix was applied on CPU in double precision. r = b - Ax\n");
       		#else
       		  if (g_cart_id == 0) printf("The matrix was applied on CPU in double precision. r = b - Ax\n");
@@ -4031,7 +4029,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       // A*d(k+1)
       Q_Qdagger_ND(Ad_up, Ad_dn, d_up, d_dn);
     		// debug
-    		#ifndef MPI
+    		#ifndef TM_USE_MPI
     		  printf("The matrix was applied on CPU in double precision. r = r - Ad\n");
     		#else
     		  if (g_cart_id == 0) printf("The matrix was applied on CPU in double precision. r = r - Ad\n");
@@ -4050,7 +4048,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
     rr    = rr_up + rr_dn;
     
     		// debug
-    		#ifndef MPI
+    		#ifndef TM_USE_MPI
     		  printf("Outer residue in the outer iteration i = %i after %i total inner iterations : %.10e\n", i, outercount, rr);
     		#else
     		  if (g_cart_id == 0) printf("Outer residue in the outer iteration i = %i after %i total inner iterations : %.10e\n", i, outercount, rr);
@@ -4073,7 +4071,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       totalouterclocks = stopouter-startouter - totalinnerclocks;
       
       #ifdef ALGORITHM_BENCHMARK
-        #ifndef MPI
+        #ifndef TM_USE_MPI
           stopeffective = ((double)clock()) / ((double)(CLOCKS_PER_SEC));
         #else
           stopeffective = MPI_Wtime();
@@ -4082,7 +4080,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       
       
       		// debug
-      		#ifdef MPI
+      		#ifdef TM_USE_MPI
       		  if (g_cart_id == 0) {
       		#endif
       		printf("\nEO inversion done in mixed precision.\n");
@@ -4092,7 +4090,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       		printf("Total number of outer iterations: %i\n", i+1);
       		printf("Squared residue: %.10e\n", rr); 
       		printf("Outer solver done in: %.4e sec\n", double(stopouter-startouter) / double(CLOCKS_PER_SEC));
-      		#ifdef MPI
+      		#ifdef TM_USE_MPI
       		  }
       		#endif
       		
@@ -4102,7 +4100,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       		  // effectiveflops  =  #(inner iterations)*(matrixflops+linalgflops)*VOLUME/2  +  #(outer iterations)*(matrixflops+linalgflops)*VOLUME/2
       		  // outer loop: linalg  =  flops for calculating  r(k+1) and x(k+1)
       		  // inner loop: linalg  =  flops for calculating  alpha, x(k+1), r(k+1), beta, d(k+1)
-      		  #ifndef MPI
+      		  #ifndef TM_USE_MPI
       		  	effectiveflops = outercount*(matrixflops + 2*2*2*24 + 2*2*24 + 2*2*24 + 2*2*2*24 + 2*2*24)*VOLUME/2   +   i*(matrixflops + 2*24 + 2*24)*VOLUME/2;
       		  	printf("effective BENCHMARK:\n");
       		  	printf("\ttotal mixed solver time:   %.4e sec\n", double(stopeffective-starteffective));
@@ -4148,7 +4146,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       */
       
       		// debug
-      		#ifndef MPI
+      		#ifndef TM_USE_MPI
       		  printf("finalize_mixedsolve_eo_nd():\n");
       		#else
       		  if (g_cart_id == 0) printf("finalize_mixedsolve_eo_nd():\n");
@@ -4157,7 +4155,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       finalize_mixedsolve_eo_nd();
       
       		// debug
-      		#ifndef MPI
+      		#ifndef TM_USE_MPI
       		  printf("\n");
       		#else
       		  if (g_cart_id == 0) printf("\n");
@@ -4183,7 +4181,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   totalouterclocks = stopouter-startouter - totalinnerclocks;
   
   #ifdef ALGORITHM_BENCHMARK
-    #ifndef MPI
+    #ifndef TM_USE_MPI
       stopeffective = ((double)clock()) / ((double)(CLOCKS_PER_SEC));
     #else
       stopeffective = MPI_Wtime();
@@ -4192,7 +4190,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   
   
   		// debug
-  		#ifdef MPI
+  		#ifdef TM_USE_MPI
   		  if (g_cart_id == 0) {
   		#endif
   		printf("\nEO inversion done in mixed precision.\n");
@@ -4201,7 +4199,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       		printf("Total number of outer iterations: %i\n", i+1);
       		printf("Squared residue: %.10e\n", rr); 
       		printf("Outer solver done in: %.4e sec\n", double(stopouter-startouter)/CLOCKS_PER_SEC);
-      		#ifdef MPI
+      		#ifdef TM_USE_MPI
       		  }
       		#endif
       		
@@ -4211,7 +4209,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       		  // effectiveflops  =  #(inner iterations)*(matrixflops+linalgflops)*VOLUME/2  +  #(outer iterations)*(matrixflops+linalgflops)*VOLUME/2
       		  // outer loop: linalg  =  flops for calculating  r(k+1) and x(k+1)
       		  // inner loop: linalg  =  flops for calculating  alpha, x(k+1), r(k+1), beta, d(k+1)
-      		  #ifndef MPI
+      		  #ifndef TM_USE_MPI
       		  	effectiveflops = outercount*(matrixflops + 2*2*2*24 + 2*2*24 + 2*2*24 + 2*2*2*24 + 2*2*24)*VOLUME/2   +   i*(matrixflops + 2*24 + 2*24)*VOLUME/2;
       		  	printf("effective BENCHMARK:\n");
       		  	printf("\ttotal mixed solver time:   %.4e sec\n", double(stopeffective-starteffective));
@@ -4257,7 +4255,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
       */
   
   		// debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
   		  printf("finalize_mixedsolve_eo_nd():\n");  
   		#else
   		  if (g_cart_id == 0) printf("finalize_mixedsolve_eo_nd():\n");
@@ -4266,7 +4264,7 @@ extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
   finalize_mixedsolve_eo_nd();
   
   		// debug
-  		#ifndef MPI
+  		#ifndef TM_USE_MPI
   		  printf("\n");
   		#else
   		  if (g_cart_id == 0) printf("\n");
diff --git a/GPU/textures.cuh b/GPU/textures.cuh
index 262b21731..41c037b0f 100644
--- a/GPU/textures.cuh
+++ b/GPU/textures.cuh
@@ -28,7 +28,7 @@
 
 
 #ifdef HAVE_CONFIG_H
-  #include<config.h>
+  #include<tmlqcd_config.h>
 #endif
  
  /* texture for nearest neighbours*/
@@ -58,7 +58,7 @@ extern "C" int bind_texture_spin(dev_spinor* s, int i){
   
   size_t size;
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     if(even_odd_flag){
       size = sizeof(float4)*6*(VOLUME+RAND)/2;
     }
@@ -138,7 +138,7 @@ return(1);
 extern "C" int bind_texture_gf(dev_su3_2v * gf){
  //printf("Binding texture to gaugefield\n");
  
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     #ifdef GF_8
      size_t size = sizeof(float4)*2*(VOLUME+RAND)*4;
     #else
@@ -177,7 +177,7 @@ extern "C" int bind_texture_nn(int* nn){
  //printf("Binding texture to nn field\n");
   size_t size;
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     if(even_odd_flag){
       size = sizeof(int)*8*(VOLUME+RAND)/2;
     }
diff --git a/LapH_ev.c b/LapH_ev.c
index b15829b92..02744dcd1 100644
--- a/LapH_ev.c
+++ b/LapH_ev.c
@@ -24,9 +24,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include "config.h"
+# include "tmlqcd_config.h"
 #else
-#error "no config.h"
+#error "no tmlqcd_config.h"
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -36,7 +36,7 @@
 #if (defined BGL && !defined BGP)
 #  include <rts.h>
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -57,7 +57,7 @@ int main(int argc,char *argv[])
   int tslice,j,k;
   char conf_filename[50];
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Init(&argc, &argv);
 #endif
   
@@ -100,7 +100,7 @@ int main(int argc,char *argv[])
     printf("# the code was compiled for persistent MPI calls (halfspinor only)\n");
 #  endif
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  ifdef _NON_BLOCKING
     printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 #  endif
@@ -114,7 +114,7 @@ int main(int argc,char *argv[])
   printf(" Error: WITHLAPH not defined");
   exit(0);
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 #ifndef _INDEX_INDEP_GEOM
   printf(" Error: _INDEX_INDEP_GEOM not defined");
   exit(0);
@@ -167,7 +167,7 @@ int main(int argc,char *argv[])
     fflush(stdout);
   }
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   /*For parallelization: exchange the gaugefield */
   xchange_gauge(g_gauge_field);
 #endif
@@ -175,7 +175,7 @@ int main(int argc,char *argv[])
   /* Init Jacobi field */
   init_jacobi_field(SPACEVOLUME+SPACERAND,3);
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   {
      /* for debugging in parallel set i_gdb = 0 */
     volatile int i_gdb = 8;
@@ -203,7 +203,7 @@ int main(int argc,char *argv[])
     eigenvalues_Jacobi(&no_eigenvalues,5000, eigenvalue_precision,0,tslice,nstore);
   }
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Finalize();
 #endif
   return(0);
diff --git a/Makefile.global b/Makefile.global
index 5369f4d99..c31c72a44 100644
--- a/Makefile.global
+++ b/Makefile.global
@@ -3,7 +3,9 @@
 
 # refresh Makefile and other stuff
 
-PROGRAMS_WITH_GIT_HASH := hmc_tm invert
+
+
+PROGRAMS_WITH_GIT_HASH := hmc_tm invert offline_measurement test_Dslash
 
 .SUFFIXES:
 
@@ -14,8 +16,8 @@ Makefile: ${top_srcdir}/Makefile.global $(srcdir)/Makefile.in $(abs_top_builddir
 $(abs_top_builddir)/config.status: $(top_srcdir)/configure
 	( cd ${abs_top_builddir} && $(SHELL) ./config.status --recheck ) 
 
-$(abs_top_builddir)/config.h: $(top_srcdir)/config.h.in $(abs_top_builddir)/config.status $(top_srcdir)/configure
-	( cd ${abs_top_builddir} && $(SHELL) ./config.status --header=config.h )
+$(abs_top_builddir)/include/tmlqcd_config.h: $(top_srcdir)/include/tmlqcd_config.h.in $(abs_top_builddir)/config.status $(top_srcdir)/configure
+	( cd ${abs_top_builddir} && $(SHELL) ./config.status --header=include/tmlqcd_config.h )
 
 # rebuild configure if configure.in changes but ignore errors
 # on many machines some of the macros fail to be recognized
@@ -30,7 +32,9 @@ $(top_srcdir)/configure: $(top_srcdir)/configure.in
 # we filter the list of all objects and treat these separately
 $(addsuffix .d, $(filter-out ${PROGRAMS_WITH_GIT_HASH},${ALLOBJ})): %.d: ${srcdir}/%.c Makefile
 	@ $(CCDEP) ${DEPFLAGS} ${DEFS} ${INCLUDES} $< > $@
-
+$(addsuffix .d, $(filter-out ${PROGRAMS_WITH_GIT_HASH},${CXXMODULES})): %.d: ${srcdir}/%.cpp Makefile
+	@ $(CXXDEP) ${CXXDEPFLAGS} ${DEFS} ${INCLUDES} $< > $@
+	
 # dirty hack to prevent make from entering an infinite loop because a phony target is given as a real
 # dependency (make will build invert.d and hmc_tm.d indefinitely)
 # when git_hash.h does not exist (as checked using wildcard) it is given as a dependency of invert.d and hmc_tm.d
diff --git a/Makefile.in b/Makefile.in
index 20d755a9f..c8f1f9f9d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -13,10 +13,14 @@ subdir = .
 AR = @AR@
 RANLIB = @RANLIB@
 CC = @CC@
+CXX = @CXX@
 CCDEP = @CCDEP@
+CXXDEP = @CXXDEP@
 CFLAGS = @CFLAGS@
+CXXFLAGS = @CXXFLAGS@
 LDFLAGS = @LDFLAGS@
 DEPFLAGS = @DEPFLAGS@
+CXXDEPFLAGS = @CXXDEPFLAGS@
 CPPFLAGS = @CPPFLAGS@
 CCLD = @CCLD@
 LEX = @LEX@
@@ -35,26 +39,30 @@ INCLUDES = @INCLUDES@
 LINK = $(CCLD) -o $@ ${LDFLAGS}
 
 COMPILE = ${CC} ${DEFS} ${INCLUDES} -o $@ ${CFLAGS}
+CXXCOMPILE = ${CXX} ${DEFS} ${INCLUDES} -o $@ ${CXXFLAGS} ${LDFLAGS}
 
 SMODULES = 
 
 MODULES = read_input gamma measure_gauge_action start \
-	measure_oriented_plaquettes \
-	expo get_staples update_backward_gauge \
+	expo matrix_utils get_staples update_backward_gauge \
 	measure_rectangles get_rectangle_staples  \
 	test/check_geometry test/check_xchange \
 	test/overlaptests \
 	invert_eo invert_doublet_eo update_gauge \
-	polyakov_loop getopt sighandler reweighting_factor \
+	getopt sighandler reweighting_factor \
 	source_generation boundary update_tm ranlxd  \
 	mpi_init deriv_Sb deriv_Sb_D_psi ranlxs \
-	geometry_eo invert_overlap \
+	geometry_eo invert_overlap aligned_malloc \
 	prepare_source chebyshev_polynomial_nd Ptilde_nd  \
 	reweighting_factor_nd rnd_gauge_trafo \
-	online_measurement update_momenta integrator  phmc \
-	little_D block operator measurements pion_norm \
+        update_momenta update_momenta_fg integrator  phmc \
+	little_D block operator \
 	temporalgauge spinor_fft X_psi P_M_eta \
-	jacobi fatal_error invert_clover_eo gettime @SPI_FILES@
+	jacobi fatal_error invert_clover_eo gettime \
+	tm_debug_printf \
+        @SPI_FILES@ @QUDA_INTERFACE@ @DDalphaAMG_INTERFACE@
+
+CXXMODULES = @QPHIX_INTERFACE@
 
 ## the GPU modules (all .cu files in $GPUDIR)
 GPUSOURCES := $(wildcard $(srcdir)/$(GPUDIR)/*.cu)
@@ -65,8 +73,9 @@ GPUOBJECTS := $(patsubst $(srcdir)/$(GPUDIR)/%.cu, $(GPUDIR)/%.o, $(GPUSOURCES))
 
 NOOPTMOD = test/check_xchange test/check_geometry
 
-PROGRAMS = hmc_tm benchmark invert gen_sources test_DslashBSM  test_DslashBSM2\
-	check_locallity test_lemon hopping_test LapH_ev contractions_BSM prop_io_test
+PROGRAMS = hmc_tm benchmark invert gen_sources test_DslashBSM  test_DslashBSM2 test_DslashBSM3 \
+	check_locallity test_lemon hopping_test LapH_ev contractions_BSM \
+	offline_measurement @QPHIX_PROGRAMS@
 
 ALLOBJ = ${MODULES} ${PROGRAMS} ${SMODULES}
 SUBDIRS = ${USESUBDIRS}
@@ -74,7 +83,7 @@ SUBDIRS = ${USESUBDIRS}
 # delete the default suffix rules
 .SUFFIXES:
 
-all: Makefile dep $(SUBDIRS) hmc_tm invert benchmark test_DslashBSM test_DslashBSM2 contractions_BSM prop_io_test
+all: Makefile dep $(SUBDIRS) hmc_tm invert benchmark offline_measurement test_DslashBSM test_DslashBSM2 test_DslashBSM3 contractions_BSM @QPHIX_PROGRAMS@
 
 $(SUBDIRS):
 	$(MAKE) --directory=$@
@@ -85,12 +94,13 @@ ${top_srcdir}/git_hash.h:
 	@ ( cd @srcdir@ && sh GIT-VERSION-GEN )
 
 -include $(addsuffix .d,$(ALLOBJ))
+-include $(addsuffix .d,$(CXXMODULES))
 
 include ${top_srcdir}/Makefile.global
 
 ${top_srcdir}/read_input.c: ${top_srcdir}/read_input.l
 ifneq (,$(findstring lex,${LEX}))
-	${LEX} -Ptmlqcd -i -t ${top_srcdir}/read_input.l > ${top_srcdir}/read_input.c
+	${LEX} -Ca -Ptmlqcd -i -t ${top_srcdir}/read_input.l > ${top_srcdir}/read_input.c
 else
 	$(error Unable to find (f)lex, read_input.c not built. Please install (f)lex!)
 endif
@@ -101,26 +111,30 @@ libhmc.a: ${addsuffix .o, ${MODULES} ${SMODULES}} Makefile
 	@$(RANLIB) libhmc.a
 	@cp libhmc.a ${top_builddir}/lib/libhmc.a
 
-$(addsuffix .o,$(filter-out ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h
+$(addsuffix .o,$(filter-out ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config.h
 	${COMPILE} ${OPTARGS} -c $<
 
 #here we don't need optimisation
-$(addsuffix .o,$(filter ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h
+$(addsuffix .o,$(filter ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config.h
 	${COMPILE} -c $<
 
-${addsuffix .o, ${SMODULES}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h
+${addsuffix .o, ${SMODULES}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config.h
 	${COMPILE} ${SOPTARGS} -c $<
 
-${addsuffix .o, ${PROGRAMS}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h ${top_srcdir}/git_hash.h
+# C++ modules
+$(addsuffix .o,${CXXMODULES}): %.o: ${srcdir}/%.cpp %.d Makefile $(abs_top_builddir)/include/tmlqcd_config.h
+	${CXXCOMPILE} -c $<
+	
+${addsuffix .o, ${PROGRAMS}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config.h ${top_srcdir}/git_hash.h
 	${COMPILE} ${OPTARGS} -c $<
 
-${PROGRAMS}: %: %.o libhmc.a $(SUBDIRS)
-	 ${LINK} $@.o $(GPUOBJECTS) $(GPUOBJECTS_C) $(LIBS) 
+${PROGRAMS}: %: %.o libhmc.a $(SUBDIRS) $(addsuffix .o,${CXXMODULES})
+	 ${LINK} $@.o $(addsuffix .o,${CXXMODULES}) $(GPUOBJECTS) $(GPUOBJECTS_C) $(LIBS) ${LDFLAGS}
 
 # The rules for unit tests are kept in a separate file for tidyness
 include ${top_srcdir}/Makefile.tests
 
-dep: $(addsuffix .d,$(ALLOBJ))
+dep: $(addsuffix .d,$(ALLOBJ)) $(addsuffix .d,$(CXXMODULES))
 	@ echo "...dependency files built"
 
 install: Makefile
@@ -147,8 +161,8 @@ clean: clean-recursive Makefile
 	rm -f benchmark hmc_tm invert test_DslashBSM *.o *.d test/*.o test/*.d tests/*.o tests/*.d
 
 distclean: distclean-recursive Makefile
-	rm -f benchmark hmc_tm invert test_DslashBSM *.o *.d *~ Makefile config.log config.status fixed_volume.h
-	rm -f config.h
+	rm -f benchmark hmc_tm invert test_DslashBSM test_DslashBSM3 *.o *.d *~ Makefile config.log config.status fixed_volume.h
+	rm -f include/tmlqcd_config.h
 
 .PHONY: all ${SUBDIRS} ${top_srcdir}/git_hash.h clean compile-clean distclean dep install \
 	all-recursive all-debug-recursive all-profile-recursive \
diff --git a/P_M_eta.c b/P_M_eta.c
index 6e44bbbdc..ecfa8ad4e 100644
--- a/P_M_eta.c
+++ b/P_M_eta.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
diff --git a/Ptilde_nd.c b/Ptilde_nd.c
index cc0e5b9a4..a8137a49c 100644
--- a/Ptilde_nd.c
+++ b/Ptilde_nd.c
@@ -19,12 +19,12 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -279,7 +279,7 @@ void degree_of_Ptilde(int * _degree, double ** coefs,
       fprintf(stderr, "Error: n_cheby=%d > phmc_max_ptilde_degree=%d in ptilde\n",
               degree, phmc_max_ptilde_degree);
       fprintf(stderr, "Increase n_chebymax\n");
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Finalize();
 #endif
       exit(-5);
diff --git a/X_psi.c b/X_psi.c
index f35bfdf51..2efd1249c 100644
--- a/X_psi.c
+++ b/X_psi.c
@@ -19,12 +19,12 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <math.h>
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include "global.h"
 #include "su3.h"
diff --git a/aligned_malloc.c b/aligned_malloc.c
new file mode 100644
index 000000000..1fdb03b87
--- /dev/null
+++ b/aligned_malloc.c
@@ -0,0 +1,119 @@
+/***********************************************************************                                                             
+ * Copyright (C) 2015 Bartosz Kostrzewa
+ *               2016 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+
+#if HAVE_CONFIG_H
+#include <tmlqcd_config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include "aligned_malloc.h"
+#include "su3.h"
+#include "su3adj.h"
+
+#include "fatal_error.h"
+/* Allocate size bytes aligned via the ALIGN_BASE mask; NULL on failure. Free with aligned_free(). */
+void *aligned_malloc(size_t const size) {
+  /* slack for alignment plus one slot that remembers the pointer malloc() returned */
+  size_t const overhead = (size_t)ALIGN_BASE + sizeof(void*);
+  void *mem;
+  void **ptr;
+  /* reject sizes whose padded request would wrap around size_t */
+  if(size > SIZE_MAX - overhead) return NULL;
+  if((mem = malloc(size + overhead)) == NULL) return NULL;
+  /* step past the bookkeeping slot, then clear the low bits selected by ALIGN_BASE */
+  ptr = (void**)(((uintptr_t)mem + overhead) & ~(uintptr_t)(ALIGN_BASE));
+  ptr[-1] = mem;
+  return ptr;
+}
+/* Same as aligned_malloc(), with the size usable bytes cleared to zero; returns NULL on failure. */
+void *aligned_malloc_zero(size_t const size) {
+  void *ptr;
+
+  /* reject sizes whose padded request would wrap around size_t inside aligned_malloc() */
+  if(size > SIZE_MAX - (size_t)ALIGN_BASE - sizeof(void*)) {
+    return NULL;
+  }
+  /* delegate so the alignment and bookkeeping logic exists in one place only */
+  ptr = aligned_malloc(size);
+  if(ptr != NULL) {
+    memset(ptr, 0, size);
+  }
+  return ptr;
+}
+
+/* Release a block obtained from aligned_malloc()/aligned_malloc_zero(); NULL is ignored like free(). */
+void aligned_free(void *ptr) {
+  if(ptr != NULL) free(((void**)ptr)[-1]);
+}
+/* Allocate an aligned table of V su3* rows over one aligned slab of 4*V+1 su3; row i starts at mem+4*i. */
+aligned_su3_field_t aligned_su3_field_alloc(const unsigned int V) {
+  aligned_su3_field_t f_struct;
+  /* widen before multiplying: 4*V evaluated in (32-bit) unsigned int would wrap for large V */
+  size_t const n_rows = V;
+  su3** field = (su3**) aligned_malloc(n_rows*sizeof(su3*));
+  su3* mem = (su3*)aligned_malloc((4*n_rows+1)*sizeof(su3));
+
+  if( (void*)field == (void*)NULL || (void*)mem == (void*)NULL ) {
+    fatal_error("Memory allocation error!","aligned_su3_field_alloc");
+  }
+
+  /* unsigned index of the same width as n_rows; nothing is written when V == 0 */
+  for(size_t i = 0; i < n_rows; ++i) {
+    field[i] = mem + 4*i;
+  }
+
+  f_struct.field = field;
+  f_struct.mem = mem;
+  return(f_struct);
+}
+/* su3adj counterpart of aligned_su3_field_alloc(): V row pointers into a slab of 4*V+1 su3adj. */
+aligned_su3adj_field_t aligned_su3adj_field_alloc(const unsigned int V) {
+  aligned_su3adj_field_t f_struct;
+  /* widen before multiplying: 4*V evaluated in (32-bit) unsigned int would wrap for large V */
+  size_t const n_rows = V;
+  su3adj** field = (su3adj**) aligned_malloc(n_rows*sizeof(su3adj*));
+  su3adj* mem = (su3adj*)aligned_malloc((4*n_rows+1)*sizeof(su3adj));
+
+  if( (void*)field == (void*)NULL || (void*)mem == (void*)NULL ) {
+    /* pass this function's own name so the failure is attributed correctly */
+    fatal_error("Memory allocation error!","aligned_su3adj_field_alloc");
+  }
+
+  for(size_t i = 0; i < n_rows; ++i) {
+    field[i] = mem + 4*i;
+  }
+  f_struct.field = field;
+  f_struct.mem = mem;
+  return(f_struct);
+}
+/* Release the row-pointer table and the su3 slab referenced by *f_struct (the struct itself is untouched). */
+void aligned_su3_field_free(const aligned_su3_field_t* f_struct) {
+  aligned_free(f_struct->field);
+  aligned_free(f_struct->mem);
+}
+/* Release the row-pointer table and the su3adj slab referenced by *f_struct (the struct itself is untouched). */
+void aligned_su3adj_field_free(const aligned_su3adj_field_t* f_struct) {
+  aligned_free(f_struct->field);
+  aligned_free(f_struct->mem);
+}
+
diff --git a/aligned_malloc.h b/aligned_malloc.h
new file mode 100644
index 000000000..acb905c99
--- /dev/null
+++ b/aligned_malloc.h
@@ -0,0 +1,46 @@
+/***********************************************************************                                                                                     
+ * Copyright (C) 2015 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+
+#ifndef _ALIGNED_MALLOC_H
+#define _ALIGNED_MALLOC_H
+
+#include "su3.h"
+#include "su3adj.h"
+/* Result of aligned_su3_field_alloc(): row-pointer table (field) and the su3 slab (mem) it points into. */
+typedef struct {
+  su3** field;
+  su3* mem;
+} aligned_su3_field_t;
+/* Result of aligned_su3adj_field_alloc(): row-pointer table (field) and the su3adj slab (mem). */
+typedef struct {
+  su3adj** field;
+  su3adj* mem;
+} aligned_su3adj_field_t;
+/* raw aligned allocation; pointers must be released with aligned_free(), not free() */
+void *aligned_malloc(size_t const size);
+void aligned_free(void *ptr);
+void *aligned_malloc_zero(size_t const size);
+/* field allocators: V row pointers into one slab of 4*V+1 elements */
+aligned_su3_field_t aligned_su3_field_alloc(const unsigned int V);
+aligned_su3adj_field_t aligned_su3adj_field_alloc(const unsigned int V); 
+/* release both allocations referenced by the struct */
+void aligned_su3_field_free(const aligned_su3_field_t* f_struct);
+void aligned_su3adj_field_free(const aligned_su3adj_field_t* f_struct);
+
+#endif
diff --git a/benchmark.c b/benchmark.c
index 837b31b38..f9a796625 100644
--- a/benchmark.c
+++ b/benchmark.c
@@ -24,7 +24,7 @@
 *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -34,14 +34,14 @@
 #if (defined BGL && !defined BGP)
 #  include <rts.h>
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 # ifdef HAVE_LIBLEMON
 #  include <io/params.h>
 #  include <io/gauge.h>
 # endif
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 # include "init/init_openmp.h"
 #endif
@@ -93,15 +93,14 @@ int main(int argc,char *argv[])
   static double t1,t2,dt,sdt,dts,qdt,sqdt;
   double antioptaway=0.0;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   static double dt2;
   
   DUM_DERI = 6;
-  DUM_SOLVER = DUM_DERI+2;
-  DUM_MATRIX = DUM_SOLVER+6;
+  DUM_MATRIX = DUM_DERI+8;
   NO_OF_SPINORFIELDS = DUM_MATRIX+2;
 
-#  ifdef OMP
+#  ifdef TM_USE_OMP
   int mpi_thread_provided;
   MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
 #  else
@@ -121,7 +120,7 @@ int main(int argc,char *argv[])
     exit(-1);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   init_openmp();
 #endif
 
@@ -163,7 +162,7 @@ int main(int argc,char *argv[])
     printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
 #  endif
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  ifdef _NON_BLOCKING
     printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 #  endif
@@ -241,57 +240,80 @@ int main(int argc,char *argv[])
     fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n");
     exit(1);
   }
-#if (defined MPI && !(defined _USE_SHMEM))
+#if (defined TM_USE_MPI && !(defined _USE_SHMEM))
   check_xchange(); 
 #endif
 
   start_ranlux(1, 123456);
   random_gauge_field(reproduce_randomnumber_flag, g_gauge_field);
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   /*For parallelization: exchange the gaugefield */
   xchange_gauge(g_gauge_field);
 #endif
 
   if(even_odd_flag) {
+    sdt=0.; sqdt=0.0;
     /*initialize the pseudo-fermion fields*/
-    j_max=2048;
-    sdt=0.;
     for (k = 0; k < k_max; k++) {
       random_spinor_field_eo(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS);
     }
     
-    while(sdt < 30.) {
-#ifdef MPI
-      MPI_Barrier(MPI_COMM_WORLD);
-#endif
-      t1 = gettime();
-      antioptaway=0.0;
-      for (j=0;j<j_max;j++) {
-        for (k=0;k<k_max;k++) {
-          Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]);
-          Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
-          antioptaway+=creal(g_spinor_field[2*k_max][0].s0.c0);
-        }
+    j_max=512;
+    antioptaway=0.0;
+    /* compute approximately how many applications we need to do to get a reliable measurement */
+#ifdef TM_USE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+    t1 = gettime();
+    for (j=0;j<j_max;j++) {
+      for (k=0;k<k_max;k++) {
+        Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]);
+        Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
+        antioptaway+=creal(g_spinor_field[2*k_max][0].s0.c0);
       }
-      t2 = gettime();
-      dt = t2-t1;
-#ifdef MPI
-      MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    }
+    dt = gettime()-t1;
+    // division by g_nproc: the MPI_SUM reduction below then yields the average over processes
+    j = (int)(ceil(j_max*31.0/dt/g_nproc));
+#ifdef TM_USE_MPI
+    MPI_Allreduce(&j,&j_max, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 #else
-      sdt = dt;
+    j_max = j;
 #endif
-      qdt=dt*dt;
-#ifdef MPI
-      MPI_Allreduce (&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-#else
-      sqdt = qdt;
+
+
+
+    /* perform the actual benchmark */
+#ifdef TM_USE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
 #endif
-      sdt=sdt/((double)g_nproc);
-      sqdt=sqrt(sqdt/g_nproc-sdt*sdt);
-      j_max*=2;
+    t1 = gettime();
+    antioptaway=0.0;
+    for (j=0;j<j_max;j++) {
+      for (k=0;k<k_max;k++) {
+        Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]);
+        Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
+        antioptaway+=creal(g_spinor_field[2*k_max][0].s0.c0);
+      }
     }
-    j_max=j_max/2;
+    dt = gettime()-t1;
+#ifdef TM_USE_MPI
+    MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+    sdt = dt;
+#endif
+    
+    qdt=dt*dt;
+#ifdef TM_USE_MPI
+    MPI_Allreduce (&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+    sqdt = qdt;
+#endif
+
+    sdt=sdt/((double)g_nproc);
+    sqdt=sqrt(sqdt/g_nproc-sdt*sdt);
+     
     dts=dt;
     sdt=1.0e6f*sdt/((double)(k_max*j_max*(VOLUME)));
     sqdt=1.0e6f*sqdt/((double)(k_max*j_max*(VOLUME)));
@@ -299,15 +321,18 @@ int main(int argc,char *argv[])
     if(g_proc_id==0) {
       printf("# The following result is just to make sure that the calculation is not optimized away: %e\n", antioptaway);
       printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n", sdt, sqdt, j_max);
-      printf("# Communication switched on:\n# (%d Mflops [%d bit arithmetic])\n", (int)(1608.0f/sdt),(int)sizeof(spinor)/3);
-#ifdef OMP
-      printf("# Mflops per OpenMP thread ~ %d\n",(int)(1608.0f/(omp_num_threads*sdt)));
+#ifdef TM_USE_MPI
+      printf("# Communication switched on: \n");
+#endif
+      printf("\n%12d Mflops(total) %8d Mflops(process)", (int)(g_nproc*1608.0f/sdt),(int)(1608.0f/sdt));
+#ifdef TM_USE_OMP
+      printf(" %8d Mflops(thread)",(int)(1608.0f/(omp_num_threads*sdt)));
 #endif
-      printf("\n");
+      printf(" [ %d bit arithmetic ]\n\n",(int)(sizeof(spinor)/3)); 
       fflush(stdout);
     }
     
-#ifdef MPI
+#ifdef TM_USE_MPI
     /* isolated computation */
     t1 = gettime();
     antioptaway=0.0;
@@ -329,11 +354,11 @@ int main(int argc,char *argv[])
     dt=1.0e6f*dt/((double)(k_max*j_max*(VOLUME)));
     if(g_proc_id==0) {
       printf("# The following result is printed just to make sure that the calculation is not optimized away: %e\n",antioptaway);
-      printf("# Communication switched off: \n# (%d Mflops [%d bit arithmetic])\n", (int)(1608.0f/dt),(int)sizeof(spinor)/3);
-#ifdef OMP
-      printf("# Mflops per OpenMP thread ~ %d\n",(int)(1608.0f/(omp_num_threads*dt)));
+      printf("# Communication switched off: \n\n%12d Mflops(total) %8d Mflops(process)", (int)(g_nproc*1608.0f/dt),(int)(1608.0f/dt));
+#ifdef TM_USE_OMP
+      printf(" %8d Mflops(thread)",(int)(1608.0f/(omp_num_threads*dt)));
 #endif
-      printf("\n"); 
+      printf(" [ %d bit arithmetic ]\n\n",(int)(sizeof(spinor)/3)); 
       fflush(stdout);
     }
     sdt=sdt/((double)k_max);
@@ -353,56 +378,75 @@ int main(int argc,char *argv[])
   else {
     /* the non even/odd case now */
     /*initialize the pseudo-fermion fields*/
-    j_max=1;
+    j_max=128;
     sdt=0.;
     for (k=0;k<k_max;k++) {
       random_spinor_field_lexic(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS);
     }
     
-    while(sdt < 3.) {
-#ifdef MPI
-      MPI_Barrier(MPI_COMM_WORLD);
-#endif
-      t1 = gettime();
-      for (j=0;j<j_max;j++) {
-        for (k=0;k<k_max;k++) {
-          D_psi(g_spinor_field[k+k_max], g_spinor_field[k]);
-          antioptaway+=creal(g_spinor_field[k+k_max][0].s0.c0);
-        }
+    /* estimate a reasonable number of applications to get a reliable measurement */
+#ifdef TM_USE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+    t1 = gettime();
+    for (j=0;j<j_max;j++) {
+      for (k=0;k<k_max;k++) {
+        D_psi(g_spinor_field[k+k_max], g_spinor_field[k]);
+        antioptaway+=creal(g_spinor_field[k+k_max][0].s0.c0);
       }
-      t2 = gettime();
-      dt=t2-t1;
-#ifdef MPI
-      MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    }
+    t2 = gettime();
+    dt=t2-t1;
+    // division by g_nproc because we will average over processes using  MPI_SUM
+    j = (int)(ceil(j_max*31.0/dt/g_nproc));
+#ifdef TM_USE_MPI
+    MPI_Allreduce(&j,&j_max, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 #else
-      sdt = dt;
+    j_max = j;
 #endif
-      qdt=dt*dt;
-#ifdef MPI
-      MPI_Allreduce (&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-#else
-      sqdt = qdt;
+
+    /* perform the actual measurement */
+#ifdef TM_USE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
 #endif
-      sdt=sdt/((double)g_nproc);
-      sqdt=sqrt(sqdt/g_nproc-sdt*sdt);
-      j_max*=2;
+    t1 = gettime();
+    for (j=0;j<j_max;j++) {
+      for (k=0;k<k_max;k++) {
+        D_psi(g_spinor_field[k+k_max], g_spinor_field[k]);
+        antioptaway+=creal(g_spinor_field[k+k_max][0].s0.c0);
+      }
     }
-    j_max=j_max/2;
+    t2 = gettime();
+    dt=t2-t1;
+#ifdef TM_USE_MPI
+    MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+    sdt = dt;
+#endif
+    qdt=dt*dt;
+#ifdef TM_USE_MPI
+    MPI_Allreduce (&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+    sqdt = qdt;
+#endif
+    sdt=sdt/((double)g_nproc);
+    sqdt=sqrt(sqdt/g_nproc-sdt*sdt);
     dts=dt;
     sdt=1.0e6f*sdt/((double)(k_max*j_max*(VOLUME)));
     sqdt=1.0e6f*sqdt/((double)(k_max*j_max*(VOLUME)));
 
     if(g_proc_id==0) {
       printf("# The following result is just to make sure that the calculation is not optimized away: %e\n", antioptaway);
-      printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n", sdt, sqdt, j_max);
-      printf("\n# (%d Mflops [%d bit arithmetic])\n", (int)(1680.0f/sdt),(int)sizeof(spinor)/3);
-#ifdef OMP
-      printf("# Mflops per OpenMP thread ~ %d\n",(int)(1680.0f/(omp_num_threads*sdt)));
+      printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n\n", sdt, sqdt, j_max);
+      printf(" %12d Mflops(total) %8d Mflops(process)", (int)(1680.0f*g_nproc/sdt),(int)(1680.0f/sdt));
+#ifdef TM_USE_OMP
+      printf(" %8d Mflops(thread)",(int)(1680.0f/(omp_num_threads*sdt)));
 #endif
-      printf("\n"); 
+      printf(" [ %d bit arithmetic ]\n\n",(int)(sizeof(spinor)/3)); 
       fflush(stdout);
     }
   }
+
 #ifdef HAVE_LIBLEMON
   if(g_proc_id==0) {
     printf("# Performing parallel IO test ...\n");
@@ -416,14 +460,14 @@ int main(int argc,char *argv[])
 #endif
 
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   free_omp_accumulators();
 #endif
   free_gauge_field();
   free_geometry_indices();
   free_spinor_field();
   free_moment_field();
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
 #endif
diff --git a/bgq.h b/bgq.h
index e539c3955..552e69761 100644
--- a/bgq.h
+++ b/bgq.h
@@ -16,6 +16,11 @@
   r1 = vec_ld(32L, (double*) &(phi).c0);		\
   r2 = vec_ld(64L, (double*) &(phi).c0);
 
+// float* loads of r0, r1, r2 via vec_ld at byte offsets 0, 16 and 32 from &(phi).c0
+#define _vec_load_halfspinor_32(r0, r1, r2, phi)	\
+  r0 = vec_ld(0L, (float*) &(phi).c0);			\
+  r1 = vec_ld(16L, (float*) &(phi).c0);		\
+  r2 = vec_ld(32L, (float*) &(phi).c0);
 
 #define _vec_store_spinor(phi, r0, r1, r2, r3, r4, r5) \
   vec_st(r0, 0L, (double*) &(phi).c0);		       \
@@ -40,6 +45,11 @@
   r0 = vec_ld(0L, (double*) &(phi).c0);		\
   r1 = vec_ld2(0L, (double*) &(phi).c2); 
 
+// float* loads: r0 via vec_ld from &(phi).c0, r1 via vec_ld2 from &(phi).c2
+#define _vec_load_32(r0, r1, phi)			\
+  r0 = vec_ld(0L, (float*) &(phi).c0);		\
+  r1 = vec_ld2(0L, (float*) &(phi).c2); 
+  
 // works also with 16 byte alignement of phi
 #define _vec_load16(r0, r1, phi, tmp)		\
   r0 = vec_ld2(0L, (double*) &(phi).c0);	\
@@ -49,6 +59,15 @@
   tmp = vec_gpci(02301);			\
   r1 = vec_perm(r1, r0, tmp);
 
+// float* variant of _vec_load16; tmp is overwritten. NOTE(review): alignment requirement of phi not stated - confirm
+#define _vec_load16_32(r0, r1, phi, tmp)		\
+  r0 = vec_ld2(0L, (float*) &(phi).c0);	\
+  r1 = vec_ld(0L, (float*) &(phi).c1);		\
+  tmp = vec_gpci(00145);			\
+  r0 = vec_perm(r0, r1, tmp);			\
+  tmp = vec_gpci(02301);			\
+  r1 = vec_perm(r1, r0, tmp);  
+  
 // alternative
 #define _vec_load16c(r0, r1, phi, tmp)		\
   r0 = vec_ld2(0L, (double*) &(phi).c0);	\
@@ -61,20 +80,43 @@
 #define _vec_store(phi, r0, r1)			\
   vec_st((r0), 0L, (double*) &(phi).c0);	\
   vec_st2((r1), 0L, (double*) &(phi).c2);
-
+  
+  
+// float* variant of _vec_store; requires 16 byte alignment of phi
+#define _vec_store_32(phi, r0, r1)			\
+  vec_st((r0), 0L, (float*) &(phi).c0);			\
+  vec_st2((r1), 0L, (float*) &(phi).c2);
+  
+  
 // requires 16 (and must not be 32) byte alignment of phi
 #define _vec_store16(phi, r0, r1, tmp)		\
   vec_st2((r0), 0L, (double*) &(phi).c0);	\
   tmp = vec_gpci(02345);			\
   r0 = vec_perm(r0, r1, tmp);			\
   vec_st((r0), 0L, (double *) &(phi).c1);
-
+  
+  
+// float* variant of _vec_store16 (overwrites r0 and tmp); requires 8 (and must not be 16) byte alignment of phi
+#define _vec_store16_32(phi, r0, r1, tmp)		\
+  vec_st2((r0), 0L, (float*) &(phi).c0);	\
+  tmp = vec_gpci(02345);			\
+  r0 = vec_perm(r0, r1, tmp);			\
+  vec_st((r0), 0L, (float *) &(phi).c1);
+  
+  
 // requires 32 byte alignment of phi
 #define _vec_store_halfspinor(phi, r0, r1, r2)	\
   vec_st((r0), 0L, (double*) &(phi).c0);	\
   vec_st((r1), 32L, (double*) &(phi).c0);	\
   vec_st((r2), 64L, (double*) &(phi).c0);
 
+// float* variant of _vec_store_halfspinor (byte offsets 0, 16, 32); requires 16 byte alignment of phi
+#define _vec_store_halfspinor_32(phi, r0, r1, r2)	\
+  vec_st((r0), 0L, (float*) &(phi).c0);	\
+  vec_st((r1), 16L, (float*) &(phi).c0);	\
+  vec_st((r2), 32L, (float*) &(phi).c0);
+  
+  
 #define _vec_add(rs0, rs1, r0, r1, s0, s1) \
   rs0 = vec_add(r0, s0);		   \
   rs1 = vec_add(r1, s1);
diff --git a/bgq2.h b/bgq2.h
index cd5b028b2..49c01bfca 100644
--- a/bgq2.h
+++ b/bgq2.h
@@ -334,6 +334,139 @@
   r10= vec_xxnpmadd(U7, r5, r10);	\
   r11= vec_xxnpmadd(U1, r5, r11); 
 
+// same as _vec_su3_multiply_double2, but loads the gauge field u through float* (32 bit)
+// r0,r1,r2 -> r6,r7,r8 and r3,r4,r5 -> r9,r10,r11; overwrites U0-U4, U6, U7
+#define _vec_su3_multiply_double2_32(u)		\
+  U0 = vec_ld2(0, (float*) &(u)->c00);	\
+  U3 = vec_ld2(0, (float*) &(u)->c01);	\
+  U6 = vec_ld2(0, (float*) &(u)->c02);	\
+  U1 = vec_ld2(0, (float*) &(u)->c10);	\
+  U4 = vec_ld2(0, (float*) &(u)->c11);	\
+  U7 = vec_ld2(0, (float*) &(u)->c12);	\
+  U2 = vec_ld2(0, (float*) &(u)->c20);	\
+  r6 = vec_xmul(r0, U0);			\
+  r7 = vec_xmul(r0, U1);			\
+  r8 = vec_xmul(r0, U2);			\
+  r9 = vec_xmul(r3, U0);			\
+  r10= vec_xmul(r3, U1);			\
+  r11= vec_xmul(r3, U2);			\
+						\
+  r6 = vec_xxnpmadd(U0, r0, r6);	\
+  r7 = vec_xxnpmadd(U1, r0, r7);	\
+  r8 = vec_xxnpmadd(U2, r0, r8);	\
+  r9 = vec_xxnpmadd(U0, r3, r9);	\
+  r10= vec_xxnpmadd(U1, r3, r10);	\
+  r11= vec_xxnpmadd(U2, r3, r11);	\
+  U0 = vec_ld2(0, (float*) &(u)->c21);	\
+  						\
+  r6 = vec_xmadd(r1, U3, r6);		\
+  r7 = vec_xmadd(r1, U4, r7);		\
+  r8 = vec_xmadd(r1, U0, r8);		\
+  r9 = vec_xmadd(r4, U3, r9);		\
+  r10= vec_xmadd(r4, U4, r10);		\
+  r11= vec_xmadd(r4, U0, r11);		\
+       					\
+  r6 = vec_xxnpmadd(U3, r1, r6);	\
+  r7 = vec_xxnpmadd(U4, r1, r7);	\
+  r8 = vec_xxnpmadd(U0, r1, r8);	\
+  r9 = vec_xxnpmadd(U3, r4, r9);	\
+  r10= vec_xxnpmadd(U4, r4, r10);	\
+  r11= vec_xxnpmadd(U0, r4, r11);	\
+  U1 = vec_ld2(0, (float*) &(u)->c22);	\
+       					\
+  r6 = vec_xmadd(r2, U6, r6);		\
+  r7 = vec_xmadd(r2, U7, r7);		\
+  r8 = vec_xmadd(r2, U1, r8);		\
+  r9 = vec_xmadd(r5, U6, r9);		\
+  r10= vec_xmadd(r5, U7, r10);		\
+  r11= vec_xmadd(r5, U1, r11);		\
+       					\
+  r6 = vec_xxnpmadd(U6, r2, r6);	\
+  r7 = vec_xxnpmadd(U7, r2, r7);	\
+  r8 = vec_xxnpmadd(U1, r2, r8);	\
+  r9 = vec_xxnpmadd(U6, r5, r9);	\
+  r10= vec_xxnpmadd(U7, r5, r10);	\
+  r11= vec_xxnpmadd(U1, r5, r11); 
+
+
+  
+// one-vector form of _vec_su3_multiply_double2: r0,r1,r2 in, r6,r7,r8 out; overwrites U0-U4, U6, U7
+#define _vec_su3_multiply(u)		\
+  U0 = vec_ld2(0, (double*) &(u)->c00);	\
+  U3 = vec_ld2(0, (double*) &(u)->c01);	\
+  U6 = vec_ld2(0, (double*) &(u)->c02);	\
+  U1 = vec_ld2(0, (double*) &(u)->c10);	\
+  U4 = vec_ld2(0, (double*) &(u)->c11);	\
+  U7 = vec_ld2(0, (double*) &(u)->c12);	\
+  U2 = vec_ld2(0, (double*) &(u)->c20);	\
+  r6 = vec_xmul(r0, U0);			\
+  r7 = vec_xmul(r0, U1);			\
+  r8 = vec_xmul(r0, U2);			\
+						\
+  r6 = vec_xxnpmadd(U0, r0, r6);	\
+  r7 = vec_xxnpmadd(U1, r0, r7);	\
+  r8 = vec_xxnpmadd(U2, r0, r8);	\
+  U0 = vec_ld2(0, (double*) &(u)->c21);	\
+  					\
+  r6 = vec_xmadd(r1, U3, r6);		\
+  r7 = vec_xmadd(r1, U4, r7);		\
+  r8 = vec_xmadd(r1, U0, r8);		\
+       					\
+  r6 = vec_xxnpmadd(U3, r1, r6);	\
+  r7 = vec_xxnpmadd(U4, r1, r7);	\
+  r8 = vec_xxnpmadd(U0, r1, r8);	\
+  U1 = vec_ld2(0, (double*) &(u)->c22);	\
+       					\
+  r6 = vec_xmadd(r2, U6, r6);		\
+  r7 = vec_xmadd(r2, U7, r7);		\
+  r8 = vec_xmadd(r2, U1, r8);		\
+       					\
+  r6 = vec_xxnpmadd(U6, r2, r6);	\
+  r7 = vec_xxnpmadd(U7, r2, r7);	\
+  r8 = vec_xxnpmadd(U1, r2, r8);
+
+
+// one-vector form of _vec_su3_inverse_multiply_double2: r0,r1,r2 in, r6,r7,r8 out; overwrites U0-U4, U6
+#define _vec_su3_inverse_multiply(u)    \
+  U0 = vec_ld2(0, (double*) &(u)->c00);		\
+  U1 = vec_ld2(0, (double*) &(u)->c01);		\
+  U2 = vec_ld2(0, (double*) &(u)->c02);		\
+  						\
+  r6 = vec_xmul(U0, r0);                        \
+  r7 = vec_xmul(U1, r0);                        \
+  r8 = vec_xmul(U2, r0);                        \
+  						\
+  r6 = vec_xxcpnmadd(r0, U0, r6);		\
+  r7 = vec_xxcpnmadd(r0, U1, r7);		\
+  r8 = vec_xxcpnmadd(r0, U2, r8);		\
+  						\
+  U3 = vec_ld2(0, (double*) &(u)->c10);		\
+  U4 = vec_ld2(0, (double*) &(u)->c11);		\
+  U6 = vec_ld2(0, (double*) &(u)->c12);		\
+  						\
+  r6 = vec_xmadd(U3, r1, r6);			\
+  r7 = vec_xmadd(U4, r1, r7);			\
+  r8 = vec_xmadd(U6, r1, r8);			\
+  						\
+  r6 = vec_xxcpnmadd(r1, U3, r6);		\
+  r7 = vec_xxcpnmadd(r1, U4, r7);		\
+  r8 = vec_xxcpnmadd(r1, U6, r8);		\
+  						\
+  U0 = vec_ld2(0, (double*) &(u)->c20);		\
+  U1 = vec_ld2(0, (double*) &(u)->c21);		\
+  U2 = vec_ld2(0, (double*) &(u)->c22);		\
+  						\
+  r6 = vec_xmadd(U0, r2, r6);			\
+  r7 = vec_xmadd(U1, r2, r7);			\
+  r8 = vec_xmadd(U2, r2, r8);			\
+  						\
+  r6 = vec_xxcpnmadd(r2, U0, r6);		\
+  r7 = vec_xxcpnmadd(r2, U1, r7);		\
+  r8 = vec_xxcpnmadd(r2, U2, r8);
+  
+  
+  
+  
 // expects the spinor to act on in
 // r0, r1 -> s0
 // r2, r3 -> s1
@@ -376,6 +509,49 @@
   r5 = vec_xxnpmadd(U7, r7, r5);		\
   r6 = vec_xxnpmadd(U1, r7, r6);
 
+  
+// loads u through float* (32 bit); reads r0..r3, accumulates into r4,r5,r6; overwrites r7,r8,r9 and U0-U4, U6, U7
+#define _vec_su3_multiply_double2c_32(u)	\
+  r8 = vec_gpci(00145);				\
+  r9 = vec_gpci(02367);				\
+  U0 = vec_ld2(0, (float*) &(u)->c00);		\
+  U3 = vec_ld2(0, (float*) &(u)->c01);		\
+  U6 = vec_ld2(0, (float*) &(u)->c02);		\
+  U1 = vec_ld2(0, (float*) &(u)->c10);		\
+  r7 = vec_perm(r0, r2, r8);			\
+  U4 = vec_ld2(0, (float*) &(u)->c11);		\
+  U7 = vec_ld2(0, (float*) &(u)->c12);		\
+  U2 = vec_ld2(0, (float*) &(u)->c20);		\
+  r4 = vec_xmul(r7, U0);			\
+  r5 = vec_xmul(r7, U1);			\
+  r6 = vec_xmul(r7, U2);			\
+						\
+  r4 = vec_xxnpmadd(U0, r7, r4);		\
+  r5 = vec_xxnpmadd(U1, r7, r5);		\
+  r6 = vec_xxnpmadd(U2, r7, r6);		\
+  r7 = vec_perm(r0, r2, r9);			\
+  U0 = vec_ld2(0, (float*) &(u)->c21);		\
+						\
+  r4 = vec_xmadd(r7, U3, r4);			\
+  r5 = vec_xmadd(r7, U4, r5);			\
+  r6 = vec_xmadd(r7, U0, r6);			\
+  						\
+  r4 = vec_xxnpmadd(U3, r7, r4);		\
+  r5 = vec_xxnpmadd(U4, r7, r5);		\
+  r6 = vec_xxnpmadd(U0, r7, r6);		\
+  r7 = vec_perm(r1, r3, r8);			\
+  U1 = vec_ld2(0, (float*) &(u)->c22);		\
+						\
+  r4 = vec_xmadd(r7, U6, r4);			\
+  r5 = vec_xmadd(r7, U7, r5);			\
+  r6 = vec_xmadd(r7, U1, r6);			\
+  						\
+  r4 = vec_xxnpmadd(U6, r7, r4);		\
+  r5 = vec_xxnpmadd(U7, r7, r5);		\
+  r6 = vec_xxnpmadd(U1, r7, r6);
+  
+  
+  
 #define _vec_su3_multiply_double2ct(u)		\
   r8 = vec_gpci(00167);				\
   U0 = vec_ld2(0, (double*) &(u)->c00);		\
@@ -478,6 +654,64 @@
   r11= vec_xxcpnmadd(r5, U2, r11);
 
 
+// same as _vec_su3_inverse_multiply_double2, but loads u through float* (32 bit); r0..r5 in, r6..r11 out
+#define _vec_su3_inverse_multiply_double2_32(u)    \
+  U0 = vec_ld2(0, (float*) &(u)->c00);		\
+  U1 = vec_ld2(0, (float*) &(u)->c01);		\
+  U2 = vec_ld2(0, (float*) &(u)->c02);		\
+  						\
+  r6 = vec_xmul(U0, r0);                        \
+  r7 = vec_xmul(U1, r0);                        \
+  r8 = vec_xmul(U2, r0);                        \
+  r9 = vec_xmul(U0, r3);                        \
+  r10= vec_xmul(U1, r3);                        \
+  r11= vec_xmul(U2, r3);                        \
+  						\
+  r6 = vec_xxcpnmadd(r0, U0, r6);		\
+  r7 = vec_xxcpnmadd(r0, U1, r7);		\
+  r8 = vec_xxcpnmadd(r0, U2, r8);		\
+  r9 = vec_xxcpnmadd(r3, U0, r9);		\
+  r10= vec_xxcpnmadd(r3, U1, r10);		\
+  r11= vec_xxcpnmadd(r3, U2, r11);		\
+  						\
+  U3 = vec_ld2(0, (float*) &(u)->c10);		\
+  U4 = vec_ld2(0, (float*) &(u)->c11);		\
+  U6 = vec_ld2(0, (float*) &(u)->c12);		\
+  						\
+  r6 = vec_xmadd(U3, r1, r6);			\
+  r7 = vec_xmadd(U4, r1, r7);			\
+  r8 = vec_xmadd(U6, r1, r8);			\
+  r9 = vec_xmadd(U3, r4, r9);			\
+  r10= vec_xmadd(U4, r4, r10);			\
+  r11= vec_xmadd(U6, r4, r11);			\
+  						\
+  r6 = vec_xxcpnmadd(r1, U3, r6);		\
+  r7 = vec_xxcpnmadd(r1, U4, r7);		\
+  r8 = vec_xxcpnmadd(r1, U6, r8);		\
+  r9 = vec_xxcpnmadd(r4, U3, r9);		\
+  r10= vec_xxcpnmadd(r4, U4, r10);		\
+  r11= vec_xxcpnmadd(r4, U6, r11);		\
+  						\
+  U0 = vec_ld2(0, (float*) &(u)->c20);		\
+  U1 = vec_ld2(0, (float*) &(u)->c21);		\
+  U2 = vec_ld2(0, (float*) &(u)->c22);		\
+  						\
+  r6 = vec_xmadd(U0, r2, r6);			\
+  r7 = vec_xmadd(U1, r2, r7);			\
+  r8 = vec_xmadd(U2, r2, r8);			\
+  r9 = vec_xmadd(U0, r5, r9);			\
+  r10= vec_xmadd(U1, r5, r10);			\
+  r11= vec_xmadd(U2, r5, r11);			\
+  						\
+  r6 = vec_xxcpnmadd(r2, U0, r6);		\
+  r7 = vec_xxcpnmadd(r2, U1, r7);		\
+  r8 = vec_xxcpnmadd(r2, U2, r8);		\
+  r9 = vec_xxcpnmadd(r5, U0, r9);		\
+  r10= vec_xxcpnmadd(r5, U1, r10);		\
+  r11= vec_xxcpnmadd(r5, U2, r11);
+
+
+
 #define _vec_su3_inverse_multiply_double2c(u)	\
   U0 = vec_ld2(0, (double*) &(u)->c00);		\
   r8 = vec_gpci(00145);				\
@@ -520,6 +754,52 @@
   r5 = vec_xxcpnmadd(r7, U1, r5);		\
   r6 = vec_xxcpnmadd(r7, U2, r6);
 
+/* float-load variant of _vec_su3_inverse_multiply_double2c (name suggests multiplication by the inverse of u - TODO confirm): entries of (u) are loaded via (float*); reads r0-r3, accumulates into r4-r6, overwrites r7-r9 and U0-U4, U6; all must exist in the caller's scope */
+#define _vec_su3_inverse_multiply_double2c_32(u)	\
+  U0 = vec_ld2(0, (float*) &(u)->c00);		\
+  r8 = vec_gpci(00145);				\
+  r9 = vec_gpci(02367);				\
+  U1 = vec_ld2(0, (float*) &(u)->c01);		\
+  r7 = vec_perm(r0, r2, r8);			\
+  U2 = vec_ld2(0, (float*) &(u)->c02);		\
+						\
+  r4 = vec_xmul(U0, r7);			\
+  r5 = vec_xmul(U1, r7);			\
+  r6 = vec_xmul(U2, r7);			\
+						\
+  r4 = vec_xxcpnmadd(r7, U0, r4);		\
+  r5 = vec_xxcpnmadd(r7, U1, r5);		\
+  r6 = vec_xxcpnmadd(r7, U2, r6);		\
+						\
+  r7 = vec_perm(r0, r2, r9);			\
+  U3 = vec_ld2(0, (float*) &(u)->c10);		\
+  U4 = vec_ld2(0, (float*) &(u)->c11);		\
+  U6 = vec_ld2(0, (float*) &(u)->c12);		\
+  						\
+  r4 = vec_xmadd(U3, r7, r4);			\
+  r5 = vec_xmadd(U4, r7, r5);			\
+  r6 = vec_xmadd(U6, r7, r6);			\
+  						\
+  r4 = vec_xxcpnmadd(r7, U3, r4);		\
+  r5 = vec_xxcpnmadd(r7, U4, r5);		\
+  r6 = vec_xxcpnmadd(r7, U6, r6);		\
+						\
+  r7 = vec_perm(r1, r3, r8);			\
+  U0 = vec_ld2(0, (float*) &(u)->c20);		\
+  U1 = vec_ld2(0, (float*) &(u)->c21);		\
+  U2 = vec_ld2(0, (float*) &(u)->c22);		\
+  						\
+  r4 = vec_xmadd(U0, r7, r4);			\
+  r5 = vec_xmadd(U1, r7, r5);			\
+  r6 = vec_xmadd(U2, r7, r6);			\
+  						\
+  r4 = vec_xxcpnmadd(r7, U0, r4);		\
+  r5 = vec_xxcpnmadd(r7, U1, r5);		\
+  r6 = vec_xxcpnmadd(r7, U2, r6);
+  
+  
+  
+  
 #define _vec_su3_inverse_multiply_double2ct(u)	\
   U0 = vec_ld2(0, (double*) &(u)->c00);		\
   r8 = vec_gpci(00167);				\
diff --git a/block.c b/block.c
index f6644ac3f..ee4dd581a 100644
--- a/block.c
+++ b/block.c
@@ -20,17 +20,23 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <errno.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <complex.h>
 #include "global.h"
+#include "aligned_malloc.h"
+#include "boundary.h"
+#include "gettime.h"
+#include "read_input.h"
 #include "operator/D_psi.h"
 #include "linalg_eo.h"
 #include "start.h"
+#include "gamma.h"
 #include "xchange/xchange.h"
 #include "block.h"
 #include "solver/lu_solve.h"
@@ -48,7 +54,7 @@ int * bipt;
 _Complex double * little_A = NULL;
 _Complex float * little_A32 = NULL;
 _Complex double * little_A_eo = NULL;
-_Complex float * little_A32_eo = NULL;
+_Complex float * little_A_eo_32 = NULL;
 int * block_idx;
 int * block_evenidx;
 int * block_oddidx;
@@ -72,27 +78,28 @@ static void (*boundary_D[8])(spinor * const r, spinor * const s, su3 *u) =
 block * block_list = NULL;
 static spinor * basis = NULL;
 static su3 * u = NULL;
+static su3_32 * u_32 = NULL;
 const int spinpad = 1;
 static int block_init = 0;
 
 int dT, dX, dY, dZ; /* Block dimension */
   
 
-int index_a(int t, int x, int y, int z){
+inline int index_a(int t, int x, int y, int z){
   /* Provides the absolute lexicographic index of (t, x, y, z)
      Useful to walk over the blocks, maybe could be just g_ipt[t][x][y][z]
      Claude Tadonki (claude.tadonki@u-psud.fr)
   */
   return ((t*LX + x)*LY + y)*(LZ) + z;
 }
-int index_b(int t, int x, int y, int z){
+inline int index_b(int t, int x, int y, int z){
   /* Provides the block lexicographic index of (t, x, y, z)
      Useful to walk inside a block
      Claude Tadonki (claude.tadonki@u-psud.fr)
   */
   return ((t*dX + x)*dY + y)*(dZ) + z;
 }
-int block_index(int t, int x, int y, int z){
+inline int block_index(int t, int x, int y, int z){
   /* Provides the lexicographic index of the block (t, x, y, z)
      Useful to walk over the blocks
      Claude Tadonki (claude.tadonki@u-psud.fr)
@@ -102,7 +109,9 @@ int block_index(int t, int x, int y, int z){
 
 int init_blocks(const int nt, const int nx, const int ny, const int nz) {
   int i,j;
+  double atime, etime;
   /* Initialization of block-global variables for blocks */
+  atime = gettime();
   nb_blocks = 1; 
   nblks_t = nt;
   nblks_x = nx;
@@ -114,40 +123,46 @@ int init_blocks(const int nt, const int nx, const int ny, const int nz) {
   nblks_dir[2] = nblks_y;
   nblks_dir[3] = nblks_z;
   nb_blocks = nblks_t*nblks_x*nblks_y*nblks_z;
+  //if(nblks_t%2 == 1 || nblks_x%2 == 1 || nblks_y%2 == 1 || nblks_z%2 == 1 ) {
+  //    fprintf(stderr, "no of blocks in all directions must be even! Aborting...\n");
+  if(nblks_z%2 == 1) { /* an odd MPI-local block count in z is rejected as a fatal configuration error */
+    fprintf(stderr, "no of MPI local blocks in z direction must be even! Aborting...\n");
+    exit(EXIT_FAILURE); /* non-zero status: exit(0) would report success to the caller/scheduler */
+  }
   dT = T/nblks_t; 
   dX = LX/nblks_x; 
   dY = LY/nblks_y; 
   dZ = LZ/nblks_z;
   if(g_proc_id == 0 && g_debug_level > 0) {
-    printf("# Number of deflation blocks = %d\n  n_block_t = %d\n  n_block_x = %d\n  n_block_y = %d\n  n_block_z = %d\n",
-	   nb_blocks, nblks_t, nblks_x, nblks_y, nblks_z);
-    /*     printf("# Number of iteration with the polynomial preconditioner = %d \n", dfl_field_iter); */
-    /*     printf("# Number of iteration in the polynomial preconditioner   = %d \n", dfl_poly_iter); */
+    printf("# Number of deflation blocks per MPI process = %d\n  n_block_t = %d\n  n_block_x = %d\n  n_block_y = %d\n  n_block_z = %d\n",
+           nb_blocks, nblks_t, nblks_x, nblks_y, nblks_z);
+    printf("# Block size: %d x %d x %d x %d\n", dT, dX, dY, dZ);
   }
   
   free_blocks();
   block_init = 1;
   block_list = calloc(nb_blocks, sizeof(block));
-  if((void*)(basis = (spinor*)calloc((nb_blocks + 1) * g_N_s * (VOLUME/nb_blocks + spinpad) + 1, sizeof(spinor))) == NULL) {
+  if((void*)(basis = (spinor*)aligned_malloc_zero((nb_blocks + 1) * g_N_s * (VOLUME/nb_blocks + spinpad)*sizeof(spinor))) == NULL) {
+    CALLOC_ERROR_CRASH;
+  }
+  if((void*)(u = (su3*)aligned_malloc_zero(8 * VOLUME * sizeof(su3))) == 0) {
     CALLOC_ERROR_CRASH;
   }
-  if((void*)(u = (su3*)calloc(1+8*VOLUME, sizeof(su3))) == NULL) {
+  if((void*)(u_32 = (su3_32*)aligned_malloc_zero(8 * VOLUME * sizeof(su3_32))) == 0) {
     CALLOC_ERROR_CRASH;
   }
   for(i = 0; i < nb_blocks; i++) {
     block_list[i].basis = (spinor**)calloc(g_N_s, sizeof(spinor*));
   }
   
-#if ( defined SSE || defined SSE2 || defined SSE3)
-  block_list[0].basis[0] = (spinor*)(((unsigned long int)(basis)+ALIGN_BASE)&~ALIGN_BASE);
-  block_list[0].u = (su3*)(((unsigned long int)(u)+ALIGN_BASE)&~ALIGN_BASE);
-#else
   block_list[0].basis[0] = basis;
   block_list[0].u = u;
-#endif
+  block_list[0].u_32 = u_32;
+
   for(j = 1; j < nb_blocks; j++) { 
     block_list[j].basis[0] = block_list[j-1].basis[0] + g_N_s*((VOLUME/nb_blocks) + spinpad) ;
     block_list[j].u = block_list[j-1].u + 8*(VOLUME/nb_blocks);
+    block_list[j].u_32 = block_list[j-1].u_32 + 8*(VOLUME/nb_blocks);
   }
   for(j = 0; j < nb_blocks; j++) {
     for(i = 1 ; i < g_N_s ; i ++ ) {
@@ -183,78 +198,64 @@ int init_blocks(const int nt, const int nx, const int ny, const int nz) {
     block_list[i].ns = g_N_s;
     block_list[i].spinpad = spinpad;
 
-    /* The following has not yet been adapted for */
-    /* new block geometry right? (C.U.)           */
-    for (j = 0 ; j < 6; ++j) {
-#ifdef MPI
-      block_list[i].mpilocal_neighbour[j] = (g_nb_list[j] == g_cart_id) ? i : -1;
-#else
-      block_list[i].mpilocal_neighbour[j] = i;
-#endif
-    }
-#ifdef MPI
-    block_list[i].mpilocal_neighbour[6] = (i == 0 ? 1 : (g_nb_list[j] == g_cart_id) ? 0 : -1);
-    block_list[i].mpilocal_neighbour[7] = (i == 1 ? 0 : (g_nb_list[j] == g_cart_id) ? 1 : -1);
-#else
-    block_list[i].mpilocal_neighbour[6] = (i == 0 ? 1 : 0);
-    block_list[i].mpilocal_neighbour[7] = (i == 0 ? 1 : 0);
-#endif
-    if(g_debug_level > 4 && g_proc_id == 0) {
-      for(j = 0; j < 8; j++) {
-	printf("block %d mpilocal_neighbour[%d] = %d\n", i, j, block_list[i].mpilocal_neighbour[j]);
-      }
-    }
-    /* till here... (C.U.)                        */
-
-    /* block coordinate on the mpilocal processor */
+    // block coordinate on the mpilocal processor
     block_list[i].mpilocal_coordinate[0] = (i / (nblks_x * nblks_y * nblks_z));
     block_list[i].mpilocal_coordinate[1] = (i / (nblks_y * nblks_z)) % nblks_x;
     block_list[i].mpilocal_coordinate[2] = (i / (nblks_z)) % nblks_y;
     block_list[i].mpilocal_coordinate[3] = i % nblks_z;
 
-    /* global block coordinate                    */
+    // global block coordinate
     for(j = 0; j < 4; j++) {
       block_list[i].coordinate[j] = nblks_dir[j] * g_proc_coords[j] + block_list[i].mpilocal_coordinate[j];
     }
-    /* even/odd id of block coordinate            */
+    // even/odd id of block coordinate
+    // using the global coordinates here should allow for
+    // odd (MPI) local no of blocks 
+    // -> of course the global number of blocks in each direction must be even.
+    // and at least the local no of blocks in z-direction must be even.
     block_list[i].evenodd = (block_list[i].coordinate[0] + block_list[i].coordinate[1] + 
-			     block_list[i].coordinate[2] + block_list[i].coordinate[3]) % 2;
+                             block_list[i].coordinate[2] + block_list[i].coordinate[3]) % 2;
+
+    block_list[i].evenodd_id = block_list[i].id / 2;
+    if(block_list[i].evenodd) block_list[i].evenodd_id += nb_blocks/2;
 
     /* block_list[i].evenodd = i % 2; */
-    if(g_proc_id == 0 && g_debug_level > 1) {
-      printf("%d %d (%d %d %d %d)\n", i, block_list[i].evenodd, block_list[i].coordinate[0], block_list[i].coordinate[1], block_list[i].coordinate[2], block_list[i].coordinate[3]);
+    if(g_proc_id == 0 && g_debug_level > 4) {
+      printf("# Block id %d even odd id %d coordinate (%d %d %d %d)\n", 
+             i, block_list[i].evenodd, block_list[i].coordinate[0], block_list[i].coordinate[1], 
+             block_list[i].coordinate[2], block_list[i].coordinate[3]);
     }
-    if ((void*)(block_idx = calloc(8 * (VOLUME/nb_blocks), sizeof(int))) == NULL)
-      CALLOC_ERROR_CRASH;
-
-    if ((void*)(block_evenidx = calloc(8 * (VOLUME/nb_blocks/2), sizeof(int))) == NULL)
-      CALLOC_ERROR_CRASH;
-
-    if ((void*)(block_oddidx = calloc(8 * (VOLUME/nb_blocks/2), sizeof(int))) == NULL)
-      CALLOC_ERROR_CRASH;
-
-    for (j = 0; j < g_N_s; j++) { /* write a zero element at the end of every spinor */
+    for (j = 0; j < g_N_s; j++) { 
+      // write a zero element at the end of every spinor
+      // this we need for boundary points, which we treat like this
       _spinor_null(block_list[i].basis[j][VOLUME/nb_blocks]);
     }
 
-    if ((void*)(block_list[i].little_dirac_operator = calloc(9 * g_N_s * g_N_s, sizeof(_Complex double))) == NULL)
+    if ((void*)(block_list[i].little_dirac_operator =       aligned_malloc_zero(9 * g_N_s * g_N_s * sizeof(_Complex double))) == NULL)
       CALLOC_ERROR_CRASH;
-    if ((void*)(block_list[i].little_dirac_operator32 = calloc(9 * g_N_s * g_N_s, sizeof(_Complex float))) == NULL)
+    if ((void*)(block_list[i].little_dirac_operator_32 =    aligned_malloc_zero(9 * g_N_s * g_N_s * sizeof(_Complex float))) == NULL)
       CALLOC_ERROR_CRASH;
-    if ((void*)(block_list[i].little_dirac_operator_eo = calloc(9*g_N_s * g_N_s, sizeof(_Complex double))) == NULL)
+    if ((void*)(block_list[i].little_dirac_operator_eo =    aligned_malloc_zero(9 * g_N_s * g_N_s * sizeof(_Complex double))) == NULL)
+      CALLOC_ERROR_CRASH;
+    if ((void*)(block_list[i].little_dirac_operator_eo_32 = aligned_malloc_zero(9 * g_N_s * g_N_s * sizeof(_Complex float))) == NULL)
       CALLOC_ERROR_CRASH;
-    for (j = 0; j < 9 * g_N_s * g_N_s; ++j) {
-      block_list[i].little_dirac_operator[j] = 0.0;
-      block_list[i].little_dirac_operator32[j] = 0.0;
-      block_list[i].little_dirac_operator_eo[j] = 0.0;
-    }
   }
- 
- 
-   
+  if ((void*)(block_idx = calloc(8 * (VOLUME/nb_blocks), sizeof(int))) == NULL)
+    CALLOC_ERROR_CRASH;
+  
+  if ((void*)(block_evenidx = calloc(8 * (VOLUME/nb_blocks/2), sizeof(int))) == NULL)
+    CALLOC_ERROR_CRASH;
+  
+  if ((void*)(block_oddidx = calloc(8 * (VOLUME/nb_blocks/2), sizeof(int))) == NULL)
+    CALLOC_ERROR_CRASH;
+  
   init_blocks_geometry();
   init_blocks_gaugefield();
-
+  etime = gettime();
+  if(g_proc_id == 0 && g_debug_level > 0) {
+    printf("# time for block initialisation %e s\n", etime - atime);
+    fflush(stdout);
+  }
   return 0;
 }
 
@@ -263,22 +264,28 @@ int free_blocks() {
   if(block_init == 1) {
     for(i = 0; i < nb_blocks; ++i) {
       free(block_list[i].basis);
-      free(block_list[i].little_dirac_operator);
-      free(block_list[i].little_dirac_operator32);
-      free(block_list[i].little_dirac_operator_eo);
+      aligned_free(block_list[i].little_dirac_operator);
+      aligned_free(block_list[i].little_dirac_operator_32);
+      aligned_free(block_list[i].little_dirac_operator_eo);
+      aligned_free(block_list[i].little_dirac_operator_eo_32);
     }
     free(block_ipt);
     free(bipt__);
     free(bipt_);
     free(bipt);
     free(index_block_eo);
-    free(u);
-    free(basis);
+    free(block_idx);
+    free(block_evenidx);
+    free(block_oddidx);
+    aligned_free(u);
+    aligned_free(u_32);
+    aligned_free(basis);
     free(block_list);
     block_init = 0;
   }
   return 0;
 }
+
 int init_blocks_gaugefield() {
   /* 
      Copies the existing gauge field on the processor into the separate blocks in a form
@@ -287,34 +294,75 @@ int init_blocks_gaugefield() {
      memory. 
   */
 
-  int i, x, y, z, t, ix, ix_new = 0;
-  int bx, by, bz, bt;
+  int i, ix, ix_new = 0;
+
+  for(int t = 0; t < dT;  t++) {
+    for(int x = 0; x < dX; x++) {
+      for(int y = 0; y < dY; y++) {
+        for(int z = 0; z < dZ; z++) {
+          i = 0;
+          for(int bt = 0; bt < nblks_t; bt ++) {
+            for(int bx = 0; bx < nblks_x; bx ++) {
+              for(int by = 0; by < nblks_y; by ++) {
+                for(int bz = 0; bz < nblks_z; bz ++) {
+                  ix = g_ipt[t + bt*dT][x + bx*dX][y + by*dY][z + bz*dZ];
+                  memcpy(block_list[i].u + ix_new,     &g_gauge_field[ ix           ][0], sizeof(su3));
+                  memcpy(block_list[i].u + ix_new + 1, &g_gauge_field[ g_idn[ix][0] ][0], sizeof(su3));
+                  memcpy(block_list[i].u + ix_new + 2, &g_gauge_field[ ix           ][1], sizeof(su3));
+                  memcpy(block_list[i].u + ix_new + 3, &g_gauge_field[ g_idn[ix][1] ][1], sizeof(su3));
+                  memcpy(block_list[i].u + ix_new + 4, &g_gauge_field[ ix           ][2], sizeof(su3));
+                  memcpy(block_list[i].u + ix_new + 5, &g_gauge_field[ g_idn[ix][2] ][2], sizeof(su3));
+                  memcpy(block_list[i].u + ix_new + 6, &g_gauge_field[ ix           ][3], sizeof(su3));
+                  memcpy(block_list[i].u + ix_new + 7, &g_gauge_field[ g_idn[ix][3] ][3], sizeof(su3));
+                  i++;
+                }
+              }
+            }
+          }
+          ix_new += 8;
+        }
+      }
+    }
+  }
+  blk_gauge_eo = 0;
+  return(0);
+}
 
-  for (t = 0; t < dT;  t++) {
-    for (x = 0; x < dX; x++) {
-      for (y = 0; y < dY; y++) {
-	for (z = 0; z < dZ; z++) {
-	  i = 0;
-	  for(bt = 0; bt < nblks_t; bt ++) {
-	    for(bx = 0; bx < nblks_x; bx ++) {
-	      for(by = 0; by < nblks_y; by ++) {
-		for(bz = 0; bz < nblks_z; bz ++) {
-		  ix = g_ipt[t + bt*dT][x + bx*dX][y + by*dY][z + bz*dZ];
-		  memcpy(block_list[i].u + ix_new,     &g_gauge_field[ ix           ][0], sizeof(su3));
-		  memcpy(block_list[i].u + ix_new + 1, &g_gauge_field[ g_idn[ix][0] ][0], sizeof(su3));
-		  memcpy(block_list[i].u + ix_new + 2, &g_gauge_field[ ix           ][1], sizeof(su3));
-		  memcpy(block_list[i].u + ix_new + 3, &g_gauge_field[ g_idn[ix][1] ][1], sizeof(su3));
-		  memcpy(block_list[i].u + ix_new + 4, &g_gauge_field[ ix           ][2], sizeof(su3));
-		  memcpy(block_list[i].u + ix_new + 5, &g_gauge_field[ g_idn[ix][2] ][2], sizeof(su3));
-		  memcpy(block_list[i].u + ix_new + 6, &g_gauge_field[ ix           ][3], sizeof(su3));
-		  memcpy(block_list[i].u + ix_new + 7, &g_gauge_field[ g_idn[ix][3] ][3], sizeof(su3));
-		  i++;
-		}
-	      }
-	    }
-	  }
-	  ix_new += 8;
-	}
+int init_blocks_gaugefield_32() {
+  /* 
+     Copies the existing gauge field on the processor into the separate blocks in a form
+     that is readable by the block Dirac operator. Specifically, in consecutive memory
+     now +t,-t,+x,-x,+y,-y,+z,-z gauge links are stored. This requires double the storage in
+     memory. 
+  */
+
+  int i, ix, ix_new = 0;
+
+  for(int t = 0; t < dT;  t++) {
+    for(int x = 0; x < dX; x++) {
+      for(int y = 0; y < dY; y++) {
+        for(int z = 0; z < dZ; z++) {
+          i = 0;
+          for(int bt = 0; bt < nblks_t; bt ++) {
+            for(int bx = 0; bx < nblks_x; bx ++) {
+              for(int by = 0; by < nblks_y; by ++) {
+                for(int bz = 0; bz < nblks_z; bz ++) {
+                  ix = g_ipt[t + bt*dT][x + bx*dX][y + by*dY][z + bz*dZ];
+                  memcpy(block_list[i].u_32 + ix_new,     &g_gauge_field_32[ ix           ][0], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ix_new + 1, &g_gauge_field_32[ g_idn[ix][0] ][0], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ix_new + 2, &g_gauge_field_32[ ix           ][1], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ix_new + 3, &g_gauge_field_32[ g_idn[ix][1] ][1], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ix_new + 4, &g_gauge_field_32[ ix           ][2], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ix_new + 5, &g_gauge_field_32[ g_idn[ix][2] ][2], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ix_new + 6, &g_gauge_field_32[ ix           ][3], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ix_new + 7, &g_gauge_field_32[ g_idn[ix][3] ][3], sizeof(su3_32));
+                  i++;
+                }
+              }
+            }
+          }
+          ix_new += 8;
+        }
       }
     }
   }
@@ -330,43 +378,89 @@ int init_blocks_eo_gaugefield() {
      memory. 
   */
 
-  int i, x, y, z, t, ix, ix_even = 0, ix_odd = (dT*dX*dY*dZ*8)/2, ixeo;
-  int bx, by, bz, bt, even=0;
+  int i, ix, ix_even = 0, ix_odd = (dT*dX*dY*dZ*8)/2, ixeo;
 
-  for (t = 0; t < dT;  t++) {
-    for (x = 0; x < dX; x++) {
-      for (y = 0; y < dY; y++) {
-	for (z = 0; z < dZ; z++) {
-	  if((t+x+y+z)%2 == 0) {
-	    even = 1;
-	    ixeo = ix_even;
-	  }
-	  else {
-	    even = 0;
-	    ixeo = ix_odd;
-	  }
-	  i = 0;
-	  for(bt = 0; bt < nblks_t; bt ++) {
-	    for(bx = 0; bx < nblks_x; bx ++) {
-	      for(by = 0; by < nblks_y; by ++) {
-		for(bz = 0; bz < nblks_z; bz ++) {
-		  ix = g_ipt[t + bt*dT][x + bx*dX][y + by*dY][z + bz*dZ];
-		  memcpy(block_list[i].u + ixeo,     &g_gauge_field[ ix           ][0], sizeof(su3));
-		  memcpy(block_list[i].u + ixeo + 1, &g_gauge_field[ g_idn[ix][0] ][0], sizeof(su3));
-		  memcpy(block_list[i].u + ixeo + 2, &g_gauge_field[ ix           ][1], sizeof(su3));
-		  memcpy(block_list[i].u + ixeo + 3, &g_gauge_field[ g_idn[ix][1] ][1], sizeof(su3));
-		  memcpy(block_list[i].u + ixeo + 4, &g_gauge_field[ ix           ][2], sizeof(su3));
-		  memcpy(block_list[i].u + ixeo + 5, &g_gauge_field[ g_idn[ix][2] ][2], sizeof(su3));
-		  memcpy(block_list[i].u + ixeo + 6, &g_gauge_field[ ix           ][3], sizeof(su3));
-		  memcpy(block_list[i].u + ixeo + 7, &g_gauge_field[ g_idn[ix][3] ][3], sizeof(su3));
-		  i++;
-		}
-	      }
-	    }
-	  }
-	  if(even) ix_even += 8;
-	  else ix_odd += 8;
-	}
+  for (int t = 0; t < dT;  t++) {
+    for (int x = 0; x < dX; x++) {
+      for (int y = 0; y < dY; y++) {
+        for (int z = 0; z < dZ; z++) {
+          if((t+x+y+z)%2 == 0) {
+            ixeo = ix_even;
+            ix_even += 8;
+          }
+          else {
+            ixeo = ix_odd;
+            ix_odd += 8;
+          }
+          i = 0;
+          for(int bt = 0; bt < nblks_t; bt ++) {
+            for(int bx = 0; bx < nblks_x; bx ++) {
+              for(int by = 0; by < nblks_y; by ++) {
+                for(int bz = 0; bz < nblks_z; bz ++) {
+                  ix = g_ipt[t + bt*dT][x + bx*dX][y + by*dY][z + bz*dZ];
+                  memcpy(block_list[i].u + ixeo,     &g_gauge_field[ ix           ][0], sizeof(su3));
+                  memcpy(block_list[i].u + ixeo + 1, &g_gauge_field[ g_idn[ix][0] ][0], sizeof(su3));
+                  memcpy(block_list[i].u + ixeo + 2, &g_gauge_field[ ix           ][1], sizeof(su3));
+                  memcpy(block_list[i].u + ixeo + 3, &g_gauge_field[ g_idn[ix][1] ][1], sizeof(su3));
+                  memcpy(block_list[i].u + ixeo + 4, &g_gauge_field[ ix           ][2], sizeof(su3));
+                  memcpy(block_list[i].u + ixeo + 5, &g_gauge_field[ g_idn[ix][2] ][2], sizeof(su3));
+                  memcpy(block_list[i].u + ixeo + 6, &g_gauge_field[ ix           ][3], sizeof(su3));
+                  memcpy(block_list[i].u + ixeo + 7, &g_gauge_field[ g_idn[ix][3] ][3], sizeof(su3));
+                  i++;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  blk_gauge_eo = 1;
+  return(0);
+}
+
+int init_blocks_eo_gaugefield_32() {
+  /* 
+     Copies the existing gauge field on the processor into the separate blocks in a form
+     that is readable by the block Hopping matrix. Specifically, in consecutive memory
+     now +t,-t,+x,-x,+y,-y,+z,-z gauge links are stored. This requires double the storage in
+     memory. 
+  */
+
+  int i, ix, ix_even = 0, ix_odd = (dT*dX*dY*dZ*8)/2, ixeo;
+
+  for(int t = 0; t < dT;  t++) {
+    for(int x = 0; x < dX; x++) {
+      for(int y = 0; y < dY; y++) {
+        for(int z = 0; z < dZ; z++) {
+          if((t+x+y+z)%2 == 0) {
+            ixeo = ix_even;
+            ix_even += 8;
+          }
+          else {
+            ixeo = ix_odd;
+            ix_odd += 8;
+          }
+          i = 0;
+          for(int bt = 0; bt < nblks_t; bt ++) {
+            for(int bx = 0; bx < nblks_x; bx ++) {
+              for(int by = 0; by < nblks_y; by ++) {
+                for(int bz = 0; bz < nblks_z; bz ++) {
+                  ix = g_ipt[t + bt*dT][x + bx*dX][y + by*dY][z + bz*dZ];
+                  memcpy(block_list[i].u_32 + ixeo,     &g_gauge_field_32[ ix           ][0], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ixeo + 1, &g_gauge_field_32[ g_idn[ix][0] ][0], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ixeo + 2, &g_gauge_field_32[ ix           ][1], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ixeo + 3, &g_gauge_field_32[ g_idn[ix][1] ][1], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ixeo + 4, &g_gauge_field_32[ ix           ][2], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ixeo + 5, &g_gauge_field_32[ g_idn[ix][2] ][2], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ixeo + 6, &g_gauge_field_32[ ix           ][3], sizeof(su3_32));
+                  memcpy(block_list[i].u_32 + ixeo + 7, &g_gauge_field_32[ g_idn[ix][3] ][3], sizeof(su3_32));
+                  i++;
+                }
+              }
+            }
+          }
+        }
       }
     }
   }
@@ -498,7 +592,7 @@ int check_blocks_geometry(block * blk) {
     }
   }
 
-  if(g_proc_id == 0 && g_debug_level > 1) {
+  if(g_proc_id == 0 && g_debug_level > 4) {
     printf("# block geometry checked successfully for block %d !\n", blk->id);
   }
   for(i = 0; i < blk->volume; i++) {
@@ -538,87 +632,87 @@ int check_blocks_geometry(block * blk) {
     for(x = 0; x < LX/nblks_x; x++) {
       for(y = 0; y < LY/nblks_y; y++) {
         for(z = 0; z < LZ/nblks_z; z++) {
-	  if((x + y + z + t)%2 == 0) {
-	    i = block_ipt[t][x][y][z]/2;
-	    if(t != T/nblks_t-1) {
-	      if(*ipt != block_ipt[t+1][x][y][z]/2 && g_proc_id == 0)
-		printf("Shit +t! (%d %d %d %d): %d != %d at %d\n",
-		       t, x, y, z, *ipt, block_ipt[t+1][x][y][z]/2, i);
-	    }
-	    else if(*ipt != VOLUME/nb_blocks/2)
-	      printf("Shit +t! (%d %d %d %d): %d != %d at %d\n",
+          if((x + y + z + t)%2 == 0) {
+            i = block_ipt[t][x][y][z]/2;
+            if(t != T/nblks_t-1) {
+              if(*ipt != block_ipt[t+1][x][y][z]/2 && g_proc_id == 0)
+                printf("Shit +t! (%d %d %d %d): %d != %d at %d\n",
+                       t, x, y, z, *ipt, block_ipt[t+1][x][y][z]/2, i);
+            }
+            else if(*ipt != VOLUME/nb_blocks/2)
+              printf("Shit +t! (%d %d %d %d): %d != %d at %d\n",
                    t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
-	    ipt++;
-	    if(t != 0) {
-	      if(*ipt != block_ipt[t-1][x][y][z]/2 && g_proc_id == 0)
-		printf("Shit -t! (%d %d %d %d): %d != %d at %d\n",
-		       t, x, y, z, *ipt, block_ipt[t+1][x][y][z]/2, i);
-	    }
-	    else if(*ipt != VOLUME/nb_blocks/2)
-	      printf("Shit -t! (%d %d %d %d): %d != %d at %d\n",
-		     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
-	    ipt++;
-	    if(x != LX/nblks_x-1) {
-	      if(*ipt != block_ipt[t][x+1][y][z]/2 && g_proc_id == 0)
-		printf("Shit +x! (%d %d %d %d): %d != %d at %d\n",
-		       t, x, y, z, *ipt, block_ipt[t][x+1][y][z]/2, i);
-	    }
-	    else if(*ipt != VOLUME/nb_blocks/2)
-	      printf("Shit +x! (%d %d %d %d): %d != %d at %d\n",
-		     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
-	    ipt++;
-	    if(x != 0) {
-	      if(*ipt != block_ipt[t][x-1][y][z]/2 && g_proc_id == 0)
-		printf("Shit -x! (%d %d %d %d): %d != %d at %d\n",
-		       t, x, y, z, *ipt, block_ipt[t][x-1][y][z]/2, i);
-	    }
-	    else if(*ipt != VOLUME/nb_blocks/2)
-	      printf("Shit -x! (%d %d %d %d): %d != %d at %d\n",
-		     t, x, y, z, *ipt, VOLUME/nb_blocks, i);
-	    ipt++;
-	    if(y != LY/nblks_y-1) {
-	      if(*ipt != block_ipt[t][x][y+1][z]/2 && g_proc_id == 0)
-		printf("Shit +y! (%d %d %d %d): %d != %d at %d\n",
-		       t, x, y, z, *ipt, block_ipt[t][x][y+1][z]/2, i);
-	    }
-	    else if(*ipt != VOLUME/nb_blocks/2)
-	      printf("Shit +y! (%d %d %d %d): %d != %d at %d\n",
-		     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
-	    ipt++;
-	    if(y != 0) {
-	      if(*ipt != block_ipt[t][x][y-1][z]/2 && g_proc_id == 0)
-		printf("Shit -y! (%d %d %d %d): %d != %d at %d\n",
-		       t, x, y, z, *ipt, block_ipt[t][x][y-1][z]/2, i);
-	    }
-	    else if(*ipt != VOLUME/nb_blocks/2)
-	      printf("Shit -y! (%d %d %d %d): %d != %d at %d\n",
-		     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
-	    ipt++;
-	    if(z != LZ/nblks_z-1) {
-	      if(*ipt != block_ipt[t][x][y][z+1]/2 && g_proc_id == 0)
-		printf("Shit +z! (%d %d %d %d): %d != %d at %d\n",
-		       t, x, y, z, *ipt, block_ipt[t][x][y][z+1]/2, i);
-	    }
-	    else if(*ipt != VOLUME/nb_blocks/2)
-	      printf("Shit +z! (%d %d %d %d): %d != %d at %d\n",
-		     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
-	    ipt++;
-	    if(z != 0) {
-	      if(*ipt != block_ipt[t][x][y][z-1]/2 && g_proc_id == 0)
-		printf("Shit -z! (%d %d %d %d): %d != %d at %d\n",
-		       t, x, y, z, *ipt, block_ipt[t][x][y][z-1]/2, i);
-	    }
-	    else if(*ipt != VOLUME/nb_blocks/2)
-	      printf("Shit -z! (%d %d %d %d): %d != %d at %d\n",
-		     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
-	    ipt++;
-	  }
+            ipt++;
+            if(t != 0) { /* interior in -t: stored index must equal that of the t-1 neighbour */
+              if(*ipt != block_ipt[t-1][x][y][z]/2 && g_proc_id == 0)
+                printf("Shit -t! (%d %d %d %d): %d != %d at %d\n",
+                       t, x, y, z, *ipt, block_ipt[t-1][x][y][z]/2, i); /* print the value actually compared (was t+1) */
+            }
+            else if(*ipt != VOLUME/nb_blocks/2)
+              printf("Shit -t! (%d %d %d %d): %d != %d at %d\n",
+                     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
+            ipt++;
+            if(x != LX/nblks_x-1) {
+              if(*ipt != block_ipt[t][x+1][y][z]/2 && g_proc_id == 0)
+                printf("Shit +x! (%d %d %d %d): %d != %d at %d\n",
+                       t, x, y, z, *ipt, block_ipt[t][x+1][y][z]/2, i);
+            }
+            else if(*ipt != VOLUME/nb_blocks/2)
+              printf("Shit +x! (%d %d %d %d): %d != %d at %d\n",
+                     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
+            ipt++;
+            if(x != 0) { /* interior in -x: stored index must equal that of the x-1 neighbour */
+              if(*ipt != block_ipt[t][x-1][y][z]/2 && g_proc_id == 0)
+                printf("Shit -x! (%d %d %d %d): %d != %d at %d\n",
+                       t, x, y, z, *ipt, block_ipt[t][x-1][y][z]/2, i);
+            }
+            else if(*ipt != VOLUME/nb_blocks/2) /* block boundary: expect the even/odd boundary index */
+              printf("Shit -x! (%d %d %d %d): %d != %d at %d\n",
+                     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i); /* print the value actually compared (was VOLUME/nb_blocks) */
+            ipt++;
+            if(y != LY/nblks_y-1) {
+              if(*ipt != block_ipt[t][x][y+1][z]/2 && g_proc_id == 0)
+                printf("Shit +y! (%d %d %d %d): %d != %d at %d\n",
+                       t, x, y, z, *ipt, block_ipt[t][x][y+1][z]/2, i);
+            }
+            else if(*ipt != VOLUME/nb_blocks/2)
+              printf("Shit +y! (%d %d %d %d): %d != %d at %d\n",
+                     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
+            ipt++;
+            if(y != 0) {
+              if(*ipt != block_ipt[t][x][y-1][z]/2 && g_proc_id == 0)
+                printf("Shit -y! (%d %d %d %d): %d != %d at %d\n",
+                       t, x, y, z, *ipt, block_ipt[t][x][y-1][z]/2, i);
+            }
+            else if(*ipt != VOLUME/nb_blocks/2)
+              printf("Shit -y! (%d %d %d %d): %d != %d at %d\n",
+                     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
+            ipt++;
+            if(z != LZ/nblks_z-1) {
+              if(*ipt != block_ipt[t][x][y][z+1]/2 && g_proc_id == 0)
+                printf("Shit +z! (%d %d %d %d): %d != %d at %d\n",
+                       t, x, y, z, *ipt, block_ipt[t][x][y][z+1]/2, i);
+            }
+            else if(*ipt != VOLUME/nb_blocks/2)
+              printf("Shit +z! (%d %d %d %d): %d != %d at %d\n",
+                     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
+            ipt++;
+            if(z != 0) {
+              if(*ipt != block_ipt[t][x][y][z-1]/2 && g_proc_id == 0)
+                printf("Shit -z! (%d %d %d %d): %d != %d at %d\n",
+                       t, x, y, z, *ipt, block_ipt[t][x][y][z-1]/2, i);
+            }
+            else if(*ipt != VOLUME/nb_blocks/2)
+              printf("Shit -z! (%d %d %d %d): %d != %d at %d\n",
+                     t, x, y, z, *ipt, VOLUME/nb_blocks/2, i);
+            ipt++;
+          }
         }
       }
     }
   }
 
-  if(g_proc_id == 0 && g_debug_level > 1) {
+  if(g_proc_id == 0 && g_debug_level > 4) {
     printf("# block eo geometry checked successfully for block %d !\n", blk->id);
   }
 
@@ -627,25 +721,25 @@ int check_blocks_geometry(block * blk) {
 }
 
 int init_blocks_geometry() {
-  int i, ix, x, y, z, t, eo, i_even, i_odd;
+  int i_even, i_odd;
   int zstride = 1;
   int ystride = dZ;
   int xstride = dY * dZ;
   int tstride = dX * dY * dZ;
   int boundidx = VOLUME/nb_blocks;
-  for (ix = 0; ix < VOLUME/nb_blocks; ++ix) {
-    block_idx[8 * ix + 0] = ix           >= VOLUME/nb_blocks - tstride ? boundidx : ix + tstride;/* +t */
-    block_idx[8 * ix + 1] = ix           <  tstride                    ? boundidx : ix - tstride;/* -t */
-    block_idx[8 * ix + 2] = (ix % tstride >= dZ * dY * (dX - 1)		? boundidx : ix + xstride);/* +x */
-    block_idx[8 * ix + 3] = ix % tstride <  dZ * dY			? boundidx : ix - xstride;/* -x */
-    block_idx[8 * ix + 4] = (ix % xstride >= dZ * (dY - 1)		? boundidx : ix + ystride);/* +y */
-    block_idx[8 * ix + 5] = ix % xstride <  dZ				? boundidx : ix - ystride;/* -y */
-    block_idx[8 * ix + 6] = ix % ystride == dZ - 1			? boundidx : ix + zstride;/* +z */
-    block_idx[8 * ix + 7] = ix % ystride == 0				? boundidx : ix - zstride;/* -z */
+  for (int ix = 0; ix < VOLUME/nb_blocks; ++ix) {
+    block_idx[8 * ix + 0] = ix           >= VOLUME/nb_blocks - tstride  ? boundidx : ix + tstride;/* +t */
+    block_idx[8 * ix + 1] = ix           <  tstride                     ? boundidx : ix - tstride;/* -t */
+    block_idx[8 * ix + 2] = ix % tstride >= dZ * dY * (dX - 1)          ? boundidx : ix + xstride;/* +x */
+    block_idx[8 * ix + 3] = ix % tstride <  dZ * dY                     ? boundidx : ix - xstride;/* -x */
+    block_idx[8 * ix + 4] = ix % xstride >= dZ * (dY - 1)               ? boundidx : ix + ystride;/* +y */
+    block_idx[8 * ix + 5] = ix % xstride <  dZ                          ? boundidx : ix - ystride;/* -y */
+    block_idx[8 * ix + 6] = ix % ystride == dZ - 1                      ? boundidx : ix + zstride;/* +z */
+    block_idx[8 * ix + 7] = ix % ystride == 0                           ? boundidx : ix - zstride;/* -z */
     /* Assume that all directions have even extension */
     /* even and odd versions should be equal          */
-    eo = ((ix%dZ)+(ix/ystride)%dY+(ix/(xstride))%dX
-	  +ix/(tstride))%2;
+    int eo = ((ix%dZ)+(ix/ystride)%dY+(ix/(xstride))%dX
+          +ix/(tstride))%2;
     if(eo == 0) {
       block_evenidx[8*(ix/2) + 0] = block_idx[8 * ix + 0] / 2;
       block_evenidx[8*(ix/2) + 1] = block_idx[8 * ix + 1] / 2;
@@ -667,16 +761,16 @@ int init_blocks_geometry() {
       block_oddidx[8*(ix/2) + 7] = block_idx[8 * ix + 7] / 2;
     }
   }
-  for(i = 0; i < nb_blocks; i++) {
+  for(int i = 0; i < nb_blocks; i++) {
     block_list[i].idx = block_idx;
     block_list[i].evenidx = block_evenidx;
     block_list[i].oddidx = block_oddidx;
   }
-  ix = 0;
-  for(t = 0; t < dT; t++) {
-    for(x = 0; x < dX; x++) {
-      for(y = 0; y < dY; y++) {
-        for(z = 0; z < dZ; z++) {
+  int ix = 0;
+  for(int t = 0; t < dT; t++) {
+    for(int x = 0; x < dX; x++) {
+      for(int y = 0; y < dY; y++) {
+        for(int z = 0; z < dZ; z++) {
           block_ipt[t][x][y][z] = ix;
           ix++;
         }
@@ -686,24 +780,25 @@ int init_blocks_geometry() {
 
   i_even = 0;
   i_odd = 0;
-  for (t=0;t<nblks_t;t++) {
-    for (x=0;x<nblks_x;x++) {
-      for (y=0;y<nblks_y;y++) {
-	for (z=0;z<nblks_z;z++) {
-	  if ((t+x+y+z)%2==0) {
-	    index_block_eo[block_index(t,x,y,z)]=i_even;
-	    i_even++;
-	  }
-	  if ((t+x+y+z)%2==1) {
-	    index_block_eo[block_index(t,x,y,z)]=i_odd;
-	    i_odd++;
-	  }
-	}
+  for (int t = 0; t < nblks_t; t++) {
+    for (int x = 0; x < nblks_x; x++) {
+      for (int y = 0; y < nblks_y; y++) {
+        for (int z = 0; z < nblks_z; z++) {
+          ix = block_index(t,x,y,z);
+          if (block_list[ix].evenodd == 0) {
+            index_block_eo[ix] = i_even;
+            i_even++;
+          }
+          else {
+            index_block_eo[ix] = i_odd;
+            i_odd++;
+          }
+        }
       }
     }
   }
 
-  for(ix = 0; ix < nb_blocks; ix++) {
+  for(int ix = 0; ix < nb_blocks; ix++) {
     zstride = check_blocks_geometry(&block_list[ix]);
   }
 
@@ -775,124 +870,43 @@ void block_contract_basis(int const idx, int const vecnum, int const dir, spinor
   }
 }
 
-void alt_block_compute_little_D() {
-  int i, j, k, l;
-  spinor *_rec, *rec, *_app, *app, *zero;
-  spinor *psi, **psi_blocks;
 
-  _rec = calloc(VOLUMEPLUSRAND+1, sizeof(spinor));
-#if ( defined SSE || defined SSE2 || defined SSE3)
-  rec = (spinor*)(((unsigned long int)(_rec)+ALIGN_BASE)&~ALIGN_BASE);
-#else
-  rec = _rec;
-#endif  
-  _app = calloc(VOLUMEPLUSRAND+1, sizeof(spinor));
+
+/* checked CU */
+void compute_little_D_diagonal(const int mul_g5) {
+  int i,j, blk;
+  spinor * tmp, * _tmp;
+  _Complex double * M;
+  _tmp = calloc( block_list[0].volume + block_list[0].spinpad + 1, sizeof(spinor));
 #if ( defined SSE || defined SSE2 || defined SSE3)
-  app = (spinor*)(((unsigned long int)(_app)+ALIGN_BASE)&~ALIGN_BASE);
+  tmp = (spinor*)(((unsigned long int)(_tmp)+ALIGN_BASE)&~ALIGN_BASE);
 #else
-  app = _app;
+  tmp = _tmp;
 #endif  
-  zero = calloc(VOLUMEPLUSRAND, sizeof(spinor));
-  psi = calloc(VOLUME+nb_blocks, sizeof(spinor));
-  psi_blocks = (spinor**)calloc(nb_blocks, sizeof(spinor*));
-  for(i=0;i<nb_blocks;i++) psi_blocks[i] = psi + i*(VOLUME/nb_blocks + 1);
-
-  for (j = 0; j < VOLUMEPLUSRAND; ++j){
-    _spinor_null(zero[j]);
-  }
-
-  for (k = 0; k < g_nproc; ++k) {
-    for (i = 0; i < g_N_s; ++i) {
-      for(l=0;l<nb_blocks;l++) {
-	/* Lower Z block */
-	for (j = 0; j < VOLUMEPLUSRAND; ++j){
-	  _spinor_null(rec[j]);
-	}
-	if (g_cart_id == k) {
-	  reconstruct_global_field_GEN_ID(rec, block_list, i, nb_blocks);
-	}
-	D_psi(app, rec);
-	split_global_field_GEN(psi_blocks, app, nb_blocks);
-	if (g_cart_id == k) {
-	  block_contract_basis(0, i, NONE, psi);
-	  block_contract_basis(1, i, Z_DN, psi);
-	}
-#ifdef MPI
-	else if (k == g_nb_t_up) {
-	  block_contract_basis(0, i, T_UP, psi);
-	}
-	else if (k == g_nb_t_dn) {
-	  block_contract_basis(0, i, T_DN, psi);
-	}
-	else if (k == g_nb_x_up) {
-	  block_contract_basis(0, i, X_UP, psi);
-	}
-	else if (k == g_nb_x_dn) {
-	  block_contract_basis(0, i, X_DN, psi);
-	}
-	else if (k == g_nb_y_up) {
-	  block_contract_basis(0, i, Y_UP, psi);
-	}
-	else if (k == g_nb_y_dn) {
-	  block_contract_basis(0, i, Y_DN, psi);
-	}
-	else if (k == g_nb_z_up) {
-	  block_contract_basis(1, i, Z_UP, psi);
-	}
-#endif
+
+  for(blk = 0; blk < nb_blocks; blk++) {
+    M = block_list[blk].little_dirac_operator;
+    for(i = 0; i < g_N_s; i++) {
+      Block_D_psi(&block_list[blk], tmp, block_list[blk].basis[i]);
+      if(mul_g5) gamma5(tmp, tmp, block_list[blk].volume);
+      for(j = 0; j < g_N_s; j++) {
+        M[i * g_N_s + j]  = scalar_prod(block_list[blk].basis[j], tmp, block_list[blk].volume, 0);
+        block_list[blk].little_dirac_operator_32[i*g_N_s + j] = (_Complex float)M[i * g_N_s + j];
       }
-      /* Upper Z block */
-      /*      for (j = 0; j < VOLUMEPLUSRAND; ++j){
-	      _spinor_null(rec[j]);
-	      }
-
-	      if (g_cart_id == k){
-	      reconstruct_global_field(rec, zero, block_list[nb_blocks-1].basis[i]);
-	      }
-
-	      D_psi(app, rec);
-
-	      split_global_field(psi_blocks, app);
-	      if (g_cart_id == k){
-	      block_contract_basis(0, i, Z_UP, psi);
-	      block_contract_basis(1, i, NONE, psi);
-	      }
-	      #ifdef MPI
-	      else if (k == g_nb_t_up){
-	      block_contract_basis(1, i, T_UP, psi);
-	      }
-	      else if (k == g_nb_t_dn){
-	      block_contract_basis(1, i, T_DN, psi);
-	      }
-	      else if (k == g_nb_x_up){
-	      block_contract_basis(1, i, X_UP, psi);
-	      }
-	      else if (k == g_nb_x_dn){
-	      block_contract_basis(1, i, X_DN, psi);
-	      }
-	      else if (k == g_nb_y_up){
-	      block_contract_basis(1, i, Y_UP, psi);
-	      }
-	      else if (k == g_nb_y_dn){
-	      block_contract_basis(1, i, Y_DN, psi);
-	      }
-	      else if (k == g_nb_z_dn){
-	      block_contract_basis(0, i, Z_DN, psi);
-	      }
-
-	      MPI_Barrier(MPI_COMM_WORLD);
-	      #endif */
     }
   }
 
-  if(g_debug_level > -1) {
-    if (g_N_s <= 5 && g_cart_id == 0){
+  if(g_debug_level > 2) {
+    if (g_N_s <= 5 && !g_cart_id){
       printf("\n\n  *** CHECKING LITTLE D ***\n");
       printf("\n  ** node 0, lower block **\n");
       for (i = 0*g_N_s; i < 9 * g_N_s; ++i){
         printf(" [ ");
         for (j = 0; j < g_N_s; ++j){
-          printf("%s%1.3e %s %1.3e i", creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? "  " : "- ", creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[0].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[0].little_dirac_operator[i * g_N_s + j]), cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[0].little_dirac_operator[i * g_N_s + j]));
+          printf("%s%1.3e %s %1.3e i", creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? "  " : "- ", 
+                 creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[0].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[0].little_dirac_operator[i * g_N_s + j]), 
+                 cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", 
+                 cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[0].little_dirac_operator[i * g_N_s + j]));
           if (j != g_N_s - 1){
             printf(",\t");
           }
@@ -901,13 +915,16 @@ void alt_block_compute_little_D() {
         if ((i % g_N_s) == (g_N_s - 1))
           printf("\n");
       }
-
+      
       printf("\n\n  *** CHECKING LITTLE D ***\n");
       printf("\n  ** node 0, upper block **\n");
       for (i = 0*g_N_s; i < 9 * g_N_s; ++i){
         printf(" [ ");
         for (j = 0; j < g_N_s; ++j){
-          printf("%s%1.3e %s %1.3e i", creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? "  " : "- ", creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[1].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[1].little_dirac_operator[i * g_N_s + j]), cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[1].little_dirac_operator[i * g_N_s + j]));
+          printf("%s%1.3e %s %1.3e i", creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? "  " : "- ", 
+                 creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[1].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[1].little_dirac_operator[i * g_N_s + j]), 
+                 cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", 
+                 cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[1].little_dirac_operator[i * g_N_s + j]));
           if (j != g_N_s - 1){
             printf(",\t");
           }
@@ -915,258 +932,259 @@ void alt_block_compute_little_D() {
         printf(" ]\n");
         if ((i % g_N_s) == (g_N_s - 1))
           printf("\n");
+        
       }
     }
   }
 
-  free(_rec);
-  free(_app);
-  free(zero);
-  free(psi);
-}
-
 
-/* checked CU */
-void compute_little_D_diagonal() {
-  int i,j, blk;
-  spinor * tmp, * _tmp;
-  _Complex double * M;
-  _tmp = calloc( block_list[0].volume + block_list[0].spinpad + 1, sizeof(spinor));
-#if ( defined SSE || defined SSE2 || defined SSE3)
-  tmp = (spinor*)(((unsigned long int)(_tmp)+ALIGN_BASE)&~ALIGN_BASE);
-#else
-  tmp = _tmp;
-#endif  
-
-  for(blk = 0; blk < nb_blocks; blk++) {
-    M = block_list[blk].little_dirac_operator;
-    for(i = 0; i < g_N_s; i++) {
-      Block_D_psi(&block_list[blk], tmp, block_list[blk].basis[i]);
-      for(j = 0; j < g_N_s; j++) {
-	M[i * g_N_s + j]  = scalar_prod(block_list[blk].basis[j], tmp, block_list[blk].volume, 0);
-	block_list[blk].little_dirac_operator32[i*g_N_s + j] = M[i * g_N_s + j];
-      }
-    }
-  }
   free(_tmp);
   return;
 }
 
 
-/* what happens if this routine is called in a one dimensional parallelisation? */
-/* or even serially ?                                                           */
-/* checked CU */
-void compute_little_D() {
+/* checked CU in 4d parallel case */
+void compute_little_D(const int mul_g5) {
   /* 
      This is the little dirac routine rewritten according to multidimensional blocking
      Adaptation by Claude Tadonki (claude.tadonki@u-psud.fr)
      Date: May 2010
   */
-  spinor *scratch, * temp, *_scratch;
-  spinor *r, *s;
-  su3 * u;
-  int x, y, z=0, t, ix, iy=0, i, j, pm, mu=0, blk;
+  spinor *scratch, *temp, *_scratch;
+  int mu=0;
+  double atime, etime;
+  // the block volume; applied to every block below, so read it from the first block
+  int bvol = block_list[0].volume;
   int t_start, t_end, x_start, x_end, y_start, y_end, z_start, z_end;
-  _Complex double c, *M;
-  int count=0;
-  int bx, by, bz, bt, block_id = 0, block_id_e, block_id_o,is_up = 0, ib;
+  //_Complex double c;
+  //int count=0;
+  int is_up = 0;
   int dT, dX, dY, dZ;
-  dT = T/nblks_t; dX = LX/nblks_x; dY = LY/nblks_y; dZ = LZ/nblks_z;
+  double musave = g_mu;        // saved here, written back to g_mu before returning
+  double kappasave = g_kappa;  // saved here, written back to g_kappa before returning
+  if(kappa_dfl > 0) {
+    g_kappa = kappa_dfl;       // assign the same parameter the guard tests
+  }
+  if(mu_dfl > -10) {
+    g_mu = mu_dfl;
+    if(g_mu*musave < 0) g_mu *= -1.;  // flip so g_mu has the same sign as the saved value
+  }
+  boundary(g_kappa);           // called again with the restored kappa at the end
 
-  if(g_proc_id == 0) printf("||-----------------------\n||compute_little_D\n||-----------------------\n");
+  dT = T/nblks_t; 
+  dX = LX/nblks_x; 
+  dY = LY/nblks_y; 
+  dZ = LZ/nblks_z;
 
+  if(g_proc_id == 0 && g_debug_level > 1) {
+    printf("# compute_little_D called with mul_g5 = %d\n", mul_g5);
+    printf("# compute_little_D parameters mu= %.12f, kappa= %.12f\n", g_mu/2./g_kappa, g_kappa);
+  }
 
   /* for a full spinor field we need VOLUMEPLUSRAND                 */
   /* because we use the same geometry as for the                    */
   /* gauge field                                                    */
   /* It is VOLUME + 2*LZ*(LY*LX + T*LY + T*LX) + 4*LZ*(LY + T + LX) */
-  _scratch = calloc(2*VOLUMEPLUSRAND+1, sizeof(spinor));
-#if ( defined SSE || defined SSE2 || defined SSE3)
+  if( (_scratch = calloc(2*VOLUMEPLUSRAND+1, sizeof(spinor))) == NULL) {
+    fprintf(stderr, "not enough memory for scratch in compute_little_D! Aborting...\n");
+    exit(-1);
+  }
   scratch = (spinor*)(((unsigned long int)(_scratch)+ALIGN_BASE)&~ALIGN_BASE);
-#else
-  scratch = _scratch;
-#endif
+  // not needed?
   temp = scratch + VOLUMEPLUSRAND;
-  // NEED TO BE REWRITTEN
-  block_id_e = 0;
-  block_id_o = 0;
-  for(blk = 0; blk < nb_blocks; blk++) {
+  // NEEDs TO BE REWRITTEN
+  atime = gettime();
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+    spinor * bscratch;
+    _Complex double * M;
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int blk = 0; blk < nb_blocks; blk++) {
+    bscratch = scratch + blk*bvol;
     M = block_list[blk].little_dirac_operator;
-    for(i = 0; i < g_N_s; i++) {
-      Block_D_psi(&block_list[blk], scratch, block_list[blk].basis[i]);
-      for(j = 0; j < g_N_s; j++) {
-	M[i * g_N_s + j]  = scalar_prod(block_list[blk].basis[j], scratch, block_list[blk].volume, 0);
-	
-	if (block_list[blk].evenodd==0) {
-	  block_list[block_id_e].little_dirac_operator_eo[i * g_N_s + j] = M[i * g_N_s + j];
-	}
-	if (block_list[blk].evenodd==1) {
-	  block_list[(nb_blocks/2)+block_id_o].little_dirac_operator_eo[i * g_N_s + j] = M[i * g_N_s + j];
-	}
+    for(int i = 0; i < g_N_s; i++) {
+      Block_D_psi(&block_list[blk], bscratch, block_list[blk].basis[i]);
+      if(mul_g5) gamma5(bscratch, bscratch, bvol);
+      for(int j = 0; j < g_N_s; j++) {
+        M[i * g_N_s + j]  = scalar_prod_ts(block_list[blk].basis[j], bscratch, bvol, 0);
+        block_list[blk].little_dirac_operator_32[i * g_N_s + j] = (_Complex float)M[i * g_N_s + j];
+        
+        block_list[block_list[blk].evenodd_id].little_dirac_operator_eo[i * g_N_s + j] = M[i * g_N_s + j];
       }
     }
-    if (block_list[blk].evenodd==0) block_id_e++;
-    if (block_list[blk].evenodd==1) block_id_o++;
   }
-  
   /* computation of little_Dhat^{-1}_ee */
-  
-  for(blk = 0; blk < nb_blocks/2; blk++) {
-    LUInvert(g_N_s,block_list[blk].little_dirac_operator_eo,g_N_s);
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int blk = 0; blk < nb_blocks/2; blk++) {
+    LUInvert(g_N_s, block_list[blk].little_dirac_operator_eo, g_N_s);
   }
-  for (i = 0; i < g_N_s; i++) {
-    if(i==0) count = 0;
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+  for (int i = 0; i < g_N_s; i++) {
     reconstruct_global_field_GEN_ID(scratch, block_list, i , nb_blocks);
     
-#ifdef MPI
+#ifdef TM_USE_MPI
     xchange_lexicfield(scratch);
 #endif
     
     /* the initialisation causes troubles on a single processor */
     if(g_nproc == -1) zero_spinor_field(scratch, VOLUME);
     /* +-t +-x +-y +-z */
-    for(pm = 0; pm < 8; pm++) {
+
+    for(int pm = 0; pm < 8; pm++) {
       /* We set up the generic bounds */
       t_start = 0; t_end = dT;
       x_start = 0; x_end = dX;
       y_start = 0; y_end = dY;
       z_start = 0; z_end = dZ;
-      switch(pm){ 
-      case 0: t_start = dT - 1; t_end = t_start + 1; mu = 0; is_up = 1; break; /* Boundary in direction +t */
-      case 1: t_start = 0;      t_end = t_start + 1; mu = 0; is_up = 0; break; /* Boundary in direction -t */
-      case 2: x_start = dX - 1; x_end = x_start + 1; mu = 1; is_up = 1; break; /* Boundary in direction +x */
-      case 3: x_start = 0;      x_end = x_start + 1; mu = 1; is_up = 0; break; /* Boundary in direction -x */
-      case 4: y_start = dY - 1; y_end = y_start + 1; mu = 2; is_up = 1; break; /* Boundary in direction +y */
-      case 5: y_start = 0;      y_end = y_start + 1; mu = 2; is_up = 0; break; /* Boundary in direction -y */
-      case 6: z_start = dZ - 1; z_end = z_start + 1; mu = 3; is_up = 1; break; /* Boundary in direction +z */
-      case 7: z_start = 0;      z_end = z_start + 1; mu = 3; is_up = 0; break; /* Boundary in direction -z */
+      switch(pm) { 
+      case 0: t_start = dT - 1; t_end = t_start + 1; mu = 0; is_up = 1; break; // Boundary in dir +t
+      case 1: t_start = 0;      t_end = t_start + 1; mu = 0; is_up = 0; break; // Boundary in dir -t
+      case 2: x_start = dX - 1; x_end = x_start + 1; mu = 1; is_up = 1; break; // Boundary in dir +x
+      case 3: x_start = 0;      x_end = x_start + 1; mu = 1; is_up = 0; break; // Boundary in dir -x
+      case 4: y_start = dY - 1; y_end = y_start + 1; mu = 2; is_up = 1; break; // Boundary in dir +y
+      case 5: y_start = 0;      y_end = y_start + 1; mu = 2; is_up = 0; break; // Boundary in dir -y
+      case 6: z_start = dZ - 1; z_end = z_start + 1; mu = 3; is_up = 1; break; // Boundary in dir +z
+      case 7: z_start = 0;      z_end = z_start + 1; mu = 3; is_up = 0; break; // Boundary in dir -z
       default: ;
       }
       /* Dirac operator on the boundaries */
-      r = temp;
-      for(bt = 0; bt < nblks_t; bt++) {
-        for(bx = 0; bx < nblks_x; bx++) {
-	  for(by = 0; by < nblks_y; by++) {
-	    for(bz = 0; bz < nblks_z; bz++) {
-	      for(t = t_start; t < t_end; t++) {
-		for(x = x_start; x < x_end; x++) {
-		  for(y = y_start; y < y_end; y++) {
-		    for(z = z_start; z < z_end; z++) {
-		      /* We treat the case when we need to cross between blocks                             */
-		      /* We are in block (bt, bx, by, bz) and compute direction pm                          */
-		      /* We check inner block statement by ( b_ > 0 )&&( b_ < nblks_ - 1 )                  */
-		      /* Other cases are threated in a standard way using the boundary of the scracth array */
-		      ib = -1; /* ib is the index of the selected block if any */
-		      if((pm==0)&&(bt<nblks_t-1)&&(t==t_end-1)){ //direction +t
-			iy = index_b(0, x, y, z); /* lowest edge of upper block needed */
-			ib = block_index(bt+1, bx, by, bz);
-		      }
-		      else if((pm==1)&&(bt>0)&&(t==0)){ //direction -t
-			iy = index_b(dT - 1, x, y, z); /* highest edge of lower block needed */
-			ib = block_index(bt-1, bx, by, bz);
-		      }
-		      else if((pm==2)&&(bx<nblks_x-1)&&(x==x_end-1)){ //direction +x
-			iy = index_b(t, 0, y, z); /* lowest edge of upper block needed */
-			ib = block_index(bt, bx+1, by, bz);
-		      }
-		      else if((pm==3)&&(bx>0)&&(x==0)){ //direction -x
-			iy = index_b(t, dX - 1, y, z); /* highest edge of lower block needed */
-			ib = block_index(bt, bx-1, by, bz);
-		      }
-		      else if((pm==4)&&(by<nblks_y-1)&&(y==y_end-1)){ //direction +y
-			iy = index_b(t, x, 0, z); /* lowest edge of upper block needed */
-			ib = block_index(bt, bx, by+1, bz);
-		      }
-		      else if((pm==5)&&(by>0)&&(y==0)){ //direction -y
-			iy = index_b(t, x, dY - 1, z); /* highest edge of lower block needed */
-			ib = block_index(bt, bx, by-1, bz);
-		      }
-		      else if((pm==6)&&(bz<nblks_z-1)&&(z==z_end-1)){ //direction +z
-			iy = index_b(t, x, y, 0); /* lowest edge of upper block needed */
-			ib = block_index(bt, bx, by, bz+1);
-		      }
-		      else if((pm==7)&&(bz>0)&&(z==0)){ //direction -z
-			iy = index_b(t, x, y, dZ - 1); /* highest edge of lower block needed */
-			ib = block_index(bt, bx, by, bz-1);
-		      }
-		      ix = index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z);// GAFFE ICI
-		      if(is_up == 1) {
-			s = &scratch[ g_iup[ ix ][mu] ]; 
-			u = &g_gauge_field[ ix ][mu];
-		      }
-		      else {
-			s = &scratch[ g_idn[ ix ][mu] ];
-			u = &g_gauge_field[ g_idn[ix][mu] ][mu];
-		      }
-		      if(ib >= 0) s = &block_list[ib].basis[ i ][ iy ] ; 
-		      boundary_D[pm](r, s, u);
-		      r++;
-		    }
-		  }
-		}
-	      }
-	    }
-	  }
-	}
-      }
-      
-      /* Now all the scalar products */
-      for(j = 0; j < g_N_s; j++) {
-	iy = i * g_N_s + j  + (pm + 1) * g_N_s * g_N_s;
-	block_id = 0;
-	block_id_e=0;
-	block_id_o=0;
-	r = temp;
-	for(bt = 0; bt < nblks_t; bt++) {
-	  for(bx = 0; bx < nblks_x; bx++) {
-            for(by = 0; by < nblks_y; by++) {
-	      for(bz = 0; bz < nblks_z; bz++){
-		block_list[block_id].little_dirac_operator[ iy ] = 0.0;
-		if (block_list[block_id].evenodd==0) {block_list[block_id_e].little_dirac_operator_eo[ iy ] = 0.0;}
- 		if (block_list[block_id].evenodd==1) {block_list[block_id_o+nb_blocks/2].little_dirac_operator_eo[ iy ] = 0.0;}
-		/* We need to contract g_N_s times with the same set of fields */
-		for(t = t_start; t < t_end; t++) {
-		  for(x = x_start; x < x_end; x++) {
-		    for(y = y_start; y < y_end; y++) {
-		      for(z = z_start; z < z_end; z++) {
-			ix = index_b(t, x, y, z); // TO BE INLINED
-			s = &block_list[block_id].basis[j][ ix ];
-			c = scalar_prod(s, r, 1, 0);// TO BE INLINED
-			block_list[block_id].little_dirac_operator[ iy ] += c;
-		if (block_list[block_id].evenodd==0) {
-		block_list[block_id_e].little_dirac_operator_eo[ iy ] += c;
-		}
-		if (block_list[block_id].evenodd==1) {
-		block_list[block_id_o+nb_blocks/2].little_dirac_operator_eo[ iy ] += c;
-		}
-			r++;
-		      }
-			   
-		    }
-		  }
-		}
-		if (block_list[block_id].evenodd==0) block_id_e++;
-		if (block_list[block_id].evenodd==1) block_id_o++;	
-		block_id++;
-	      }
-	    }
-	  }
-	}
-      }
+#ifdef TM_USE_OMP
+#pragma omp parallel 
+      {
+#endif
+        spinor r;               // result of boundary_D for one site
+        spinor * s = NULL;
+        su3 * u;
+        int ib, iy, ix, iz;
+        int bx, by, bz, bt;     // block coordinates decoded from block_id
+        int block_id_eo;        // block_list[block_id].evenodd_id
+        _Complex double c;
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+        for(int block_id = 0; block_id < nb_blocks; block_id++) {
+          block_id_eo =  block_list[block_id].evenodd_id;
+
+          for(int j = 0; j < g_N_s; j++) {
+            iz = i * g_N_s + j  + (pm + 1) * g_N_s * g_N_s;
+            block_list[block_id].little_dirac_operator[ iz ] = 0.0;
+            block_list[block_id_eo].little_dirac_operator_eo[ iz ] = 0.0;
+          }
+
+          s = NULL;
+          iy = 0; ix = 0;
+          bz = block_id % nblks_z;
+          by = ((block_id - bz) % (nblks_y*nblks_z)) / nblks_z;
+          bx = ((block_id - bz - by*nblks_z) % (nblks_x*nblks_y*nblks_z)) / ( nblks_z*nblks_y);
+          bt = (block_id - bz - by*nblks_z - bx*nblks_z*nblks_y) / ( nblks_z*nblks_y*nblks_x);
+          for(int t = t_start; t < t_end; t++) {
+            for(int x = x_start; x < x_end; x++) {
+              for(int y = y_start; y < y_end; y++) {
+                for(int z = z_start; z < z_end; z++) {
+                  // Handle the case where the neighbour in direction pm lies in another local block.
+                  // We are in block (bt, bx, by, bz) and compute direction pm.
+                  // A neighbouring block exists if ( b_ > 0 ) resp. ( b_ < nblks_ - 1 ).
+                  // Otherwise the neighbour is taken from the scratch array via g_iup/g_idn.
+                  ib = -1; // ib is the index of the selected block if any
+                  if((pm==0)&&(bt<nblks_t-1)&&(t==t_end-1)){ //direction +t
+                    iy = index_b(0, x, y, z); /* lowest edge of upper block needed */
+                    ib = block_index(bt+1, bx, by, bz);
+                  }
+                  else if((pm==1)&&(bt>0)&&(t==0)){ //direction -t
+                    iy = index_b(dT - 1, x, y, z); /* highest edge of lower block needed */
+                    ib = block_index(bt-1, bx, by, bz);
+                  }
+                  else if((pm==2)&&(bx<nblks_x-1)&&(x==x_end-1)){ //direction +x
+                    iy = index_b(t, 0, y, z); /* lowest edge of upper block needed */
+                    ib = block_index(bt, bx+1, by, bz);
+                  }
+                  else if((pm==3)&&(bx>0)&&(x==0)){ //direction -x
+                    iy = index_b(t, dX - 1, y, z); /* highest edge of lower block needed */
+                    ib = block_index(bt, bx-1, by, bz);
+                  }
+                  else if((pm==4)&&(by<nblks_y-1)&&(y==y_end-1)){ //direction +y
+                    iy = index_b(t, x, 0, z); /* lowest edge of upper block needed */
+                    ib = block_index(bt, bx, by+1, bz);
+                  }
+                  else if((pm==5)&&(by>0)&&(y==0)){ //direction -y
+                    iy = index_b(t, x, dY - 1, z); /* highest edge of lower block needed */
+                    ib = block_index(bt, bx, by-1, bz);
+                  }
+                  else if((pm==6)&&(bz<nblks_z-1)&&(z==z_end-1)){ //direction +z
+                    iy = index_b(t, x, y, 0); /* lowest edge of upper block needed */
+                    ib = block_index(bt, bx, by, bz+1);
+                  }
+                  else if((pm==7)&&(bz>0)&&(z==0)){ //direction -z
+                    iy = index_b(t, x, y, dZ - 1); /* highest edge of lower block needed */
+                    ib = block_index(bt, bx, by, bz-1);
+                  }
+                  ix = index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z);// CAREFUL HERE: block offset + in-block coordinates
+                  if(is_up == 1) {
+                    s = &scratch[ g_iup[ ix ][mu] ]; 
+                    u = &g_gauge_field[ ix ][mu];
+                  }
+                  else {
+                    s = &scratch[ g_idn[ ix ][mu] ];
+                    u = &g_gauge_field[ g_idn[ix][mu] ][mu];
+                  }
+                  if(ib >= 0) s = &block_list[ib].basis[ i ][ iy ] ; 
+                  boundary_D[pm](&r, s, u);
+                  if(mul_g5) gamma5(&r, &r, 1);
+                  // now all the scalar products
+                  for(int j = 0; j < g_N_s; j++) {
+                    iz = i * g_N_s + j  + (pm + 1) * g_N_s * g_N_s;
+                    ix = index_b(t, x, y, z); 
+                    s = &block_list[block_id].basis[j][ ix ];
+                    c = scalar_prod_ts(s, &r, 1, 0);
+                    block_list[block_id].little_dirac_operator[ iz ] += c;
+                    block_list[block_id_eo].little_dirac_operator_eo[ iz ] += c;
+                  }
+                }
+              }
+            }
+          }
+        }
+#ifdef TM_USE_OMP
+      } // OMP closing brace
+#endif
     }
   }
-  for(i = 0; i < nb_blocks; i++)
-    for(j = 0; j < 9 * g_N_s * g_N_s; j++)
-      block_list[i].little_dirac_operator32[j] = (_Complex float)block_list[i].little_dirac_operator[ iy ];
+#ifdef TM_USE_OMP
+#pragma omp parallel for
+#endif
+  for(int ij = 0; ij < nb_blocks*9*g_N_s*g_N_s; ij++) {
+    int i = ij / (9*g_N_s*g_N_s);
+    int j = ij % (9*g_N_s*g_N_s);
+    block_list[i].little_dirac_operator_32[ j ] 
+      = (_Complex float)block_list[i].little_dirac_operator[ j ];
+    block_list[i].little_dirac_operator_eo_32[ j ] 
+      = (_Complex float)block_list[i].little_dirac_operator_eo[ j ];
+  }
+  etime = gettime();
+  if(g_debug_level > 2 && g_proc_id == 0) {
+    printf("# time for compute_little_D: %e\n", etime-atime);
+  }
 
-  if(g_debug_level > 3) {
+  if(g_debug_level > 2) {
     if (g_N_s <= 5 && !g_cart_id){
       printf("\n\n  *** CHECKING LITTLE D ***\n");
       printf("\n  ** node 0, lower block **\n");
-      for (i = 0*g_N_s; i < 9 * g_N_s; ++i){
+      for (int i = 0*g_N_s; i < 9 * g_N_s; ++i){
         printf(" [ ");
-        for (j = 0; j < g_N_s; ++j){
-          printf("%s%1.3e %s %1.3e i", creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? "  " : "- ", creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[0].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[0].little_dirac_operator[i * g_N_s + j]), cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[0].little_dirac_operator[i * g_N_s + j]));
+        for (int j = 0; j < g_N_s; ++j){
+          printf("%s%1.3e %s %1.3e i", 
+                 creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? "  " : "- ", 
+                 creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[0].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[0].little_dirac_operator[i * g_N_s + j]), 
+                 cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", 
+                 cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[0].little_dirac_operator[i * g_N_s + j]));
           if (j != g_N_s - 1){
             printf(",\t");
           }
@@ -1178,10 +1196,14 @@ void compute_little_D() {
       
       printf("\n\n  *** CHECKING LITTLE D ***\n");
       printf("\n  ** node 0, upper block **\n");
-      for (i = 0*g_N_s; i < 9 * g_N_s; ++i){
+      for (int i = 0*g_N_s; i < 9 * g_N_s; ++i){
         printf(" [ ");
-        for (j = 0; j < g_N_s; ++j){
-          printf("%s%1.3e %s %1.3e i", creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? "  " : "- ", creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[1].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[1].little_dirac_operator[i * g_N_s + j]), cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[1].little_dirac_operator[i * g_N_s + j]));
+        for (int j = 0; j < g_N_s; ++j){
+          printf("%s%1.3e %s %1.3e i", 
+                 creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? "  " : "- ", 
+                 creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[1].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[1].little_dirac_operator[i * g_N_s + j]), 
+                 cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", 
+                 cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[1].little_dirac_operator[i * g_N_s + j]));
           if (j != g_N_s - 1){
             printf(",\t");
           }
@@ -1189,10 +1211,12 @@ void compute_little_D() {
         printf(" ]\n");
         if ((i % g_N_s) == (g_N_s - 1))
           printf("\n");
-	
       }
     }
   }
+  g_mu = musave;
+  g_kappa = kappasave;
+  boundary(g_kappa);
   
   free(_scratch);
   return;
@@ -1206,21 +1230,21 @@ int split_global_field_GEN(spinor ** const psi, spinor * const field, const int
   for (t = 0; t < dT; t++) {
     for (x = 0; x < dX; x++) {
       for (y = 0; y < dY; y++) {
-	for (z = 0; z < dZ; z++) {
-	  block_id = 0;
-	  for(bt = 0; bt < nblks_t; bt++) {
-	    for(bx = 0; bx < nblks_x; bx++) {
-	      for(by = 0; by < nblks_y; by++) {
-		for(bz = 0; bz < nblks_z; bz++) {
-		  _spinor_assign(*(psi[block_id] + ctr_t), 
-				 *(field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z)));
-		  block_id++;
-		}
-	      }
-	    }
-	  }
-	  ctr_t++;
-	}
+        for (z = 0; z < dZ; z++) {
+          block_id = 0;
+          for(bt = 0; bt < nblks_t; bt++) {
+            for(bx = 0; bx < nblks_x; bx++) {
+              for(by = 0; by < nblks_y; by++) {
+                for(bz = 0; bz < nblks_z; bz++) {
+                  _spinor_assign(*(psi[block_id] + ctr_t), 
+                                 *(field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z)));
+                  block_id++;
+                }
+              }
+            }
+          }
+          ctr_t++;
+        }
       }
     }
   }
@@ -1239,21 +1263,21 @@ int split_global_field_GEN_ID(block * const block_list, const int id, spinor * c
   for (t = 0; t < dT; t++) {
     for (x = 0; x < dX; x++) {
       for (y = 0; y < dY; y++) {
-	for (z = 0; z < dZ; z++) {
-	  block_id = 0;
-	  for(bt = 0; bt < nblks_t; bt++) {
-	    for(bx = 0; bx < nblks_x; bx++) {
-	      for(by = 0; by < nblks_y; by++) {
-		for(bz = 0; bz < nblks_z; bz++) {
-		  _spinor_assign(*(block_list[block_id].basis[id] + ctr_t), 
-				 *(field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z)));
-		    block_id++;
-		}
-	      }
-	    }
-	  }
-	  ctr_t++;
-	}
+        for (z = 0; z < dZ; z++) {
+          block_id = 0;
+          for(bt = 0; bt < nblks_t; bt++) {
+            for(bx = 0; bx < nblks_x; bx++) {
+              for(by = 0; by < nblks_y; by++) {
+                for(bz = 0; bz < nblks_z; bz++) {
+                  _spinor_assign(*(block_list[block_id].basis[id] + ctr_t), 
+                                 *(field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z)));
+                    block_id++;
+                }
+              }
+            }
+          }
+          ctr_t++;
+        }
       }
     }
   }
@@ -1268,7 +1292,7 @@ int split_global_field_GEN_ID(block * const block_list, const int id, spinor * c
 /* copies the part of globalfields corresponding to block blk */
 /* to the block field blockfield                              */
 void copy_global_to_block(spinor * const blockfield, spinor * const globalfield, const int blk) {
-  int i,it,ix,iy,iz;
+  int it,ix,iy,iz;
   int ibt,ibx,iby,ibz;
   int itb,ixb,iyb,izb;
   int ixcurrent;
@@ -1279,7 +1303,8 @@ void copy_global_to_block(spinor * const blockfield, spinor * const globalfield,
   ibt = blk / (nblks_x * nblks_y*nblks_z);
 
   ixcurrent=0;
-  for (i = 0; i < VOLUME; i++) {
+  // FIXME: it would be better to loop only over the block volume here!
+  for (int i = 0; i < VOLUME; i++) {
 
     /* global coordinates */
     iz = i%LZ;
@@ -1313,19 +1338,60 @@ void copy_global_to_block_eo(spinor * const beven, spinor * const bodd, spinor *
     for(x = 0; x < block_list[blk].BLX; x++) {
       ix = x +  block_list[blk].mpilocal_coordinate[1]*block_list[blk].BLX;
       for(y = 0; y < block_list[blk].BLY; y++) {
-	iy = y +  block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY;
-	for(z = 0; z < block_list[blk].BLZ; z++) {
-	  iz = z +  block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ;
-	  i = g_ipt[it][ix][iy][iz];
-	  if((t+x+y+z)%2 == 0) {
-	    memcpy(beven + even, globalfield + i, sizeof(spinor));
-	    even++;
-	  }
-	  else {
-	    memcpy(bodd + odd, globalfield + i, sizeof(spinor));
-	    odd++;
-	  }
-	}
+        iy = y +  block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY;
+        for(z = 0; z < block_list[blk].BLZ; z++) {
+          iz = z +  block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ;
+          i = g_ipt[it][ix][iy][iz];
+          if((t+x+y+z)%2 == 0) {
+            memcpy(beven + even, globalfield + i, sizeof(spinor));
+            even++;
+          }
+          else {
+            memcpy(bodd + odd, globalfield + i, sizeof(spinor));
+            odd++;
+          }
+        }
+      }
+    }
+  }
+  return;
+}
+
+void copy_global_to_block_eo_32(spinor32 * const beven, spinor32 * const bodd, 
+                                spinor * const globalfield, const int blk) {
+  int i,it,ix,iy,iz;
+  int even = 0, odd = 0;
+  _Complex float * to = NULL;
+  _Complex double * from = NULL;
+  spinor32 * tmp = NULL;
+  
+  for(int t = 0; t < block_list[blk].BT; t++) {
+    it = t + block_list[blk].mpilocal_coordinate[0]*block_list[blk].BT;
+    for(int x = 0; x < block_list[blk].BLX; x++) {
+      ix = x +  block_list[blk].mpilocal_coordinate[1]*block_list[blk].BLX;
+      for(int y = 0; y < block_list[blk].BLY; y++) {
+        iy = y +  block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY;
+        for(int z = 0; z < block_list[blk].BLZ; z++) {
+          iz = z +  block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ;
+          i = g_ipt[it][ix][iy][iz];
+
+          if((t+x+y+z)%2 == 0) {
+            tmp = beven + even;
+            even++;
+          }
+          else {
+            tmp = bodd + odd;
+            odd++;
+          }
+
+          to = (_Complex float*) tmp;
+          from = (_Complex double*) (globalfield + i);
+          for(int k = 0; k < 12; k++) {
+            (*to) = (_Complex float) (*from);
+            to++;
+            from++;
+          }
+        }
       }
     }
   }
@@ -1343,19 +1409,19 @@ void copy_block_eo_to_global(spinor * const globalfield, spinor * const beven, s
     for(x = 0; x < block_list[blk].BLX; x++) {
       ix = x +  block_list[blk].mpilocal_coordinate[1]*block_list[blk].BLX;
       for(y = 0; y < block_list[blk].BLY; y++) {
-	iy = y +  block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY;
-	for(z = 0; z < block_list[blk].BLZ; z++) {
-	  iz = z +  block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ;
-	  i = g_ipt[it][ix][iy][iz];
-	  if((t+x+y+z)%2 == 0) {
-	    memcpy(globalfield + i, beven + even, sizeof(spinor));
-	    even++;
-	  }
-	  else {
-	    memcpy(globalfield + i, bodd + odd, sizeof(spinor));
-	    odd++;
-	  }
-	}
+        iy = y +  block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY;
+        for(z = 0; z < block_list[blk].BLZ; z++) {
+          iz = z +  block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ;
+          i = g_ipt[it][ix][iy][iz];
+          if((t+x+y+z)%2 == 0) {
+            memcpy(globalfield + i, beven + even, sizeof(spinor));
+            even++;
+          }
+          else {
+            memcpy(globalfield + i, bodd + odd, sizeof(spinor));
+            odd++;
+          }
+        }
       }
     }
   }
@@ -1366,7 +1432,7 @@ void copy_block_eo_to_global(spinor * const globalfield, spinor * const beven, s
 /* reconstructs the parts of globalfield corresponding to block blk */
 /* from block field blockfield                                      */
 void copy_block_to_global(spinor * const globalfield, spinor * const blockfield, const int blk) {
-  int i,it,ix,iy,iz;
+  int it,ix,iy,iz;
   int ibt,ibx,iby,ibz;
   int itb,ixb,iyb,izb;
   int ixcurrent;
@@ -1377,7 +1443,8 @@ void copy_block_to_global(spinor * const globalfield, spinor * const blockfield,
   ibt = blk / (nblks_x * nblks_y*nblks_z);
 
   ixcurrent=0;
-  for (i = 0; i < VOLUME; i++) {
+  // FIXME: it would be better to loop only over the block volume here!
+  for (int i = 0; i < VOLUME; i++) {
 
     /* global coordinates */
     iz = i%LZ;
@@ -1405,28 +1472,22 @@ void copy_block_to_global(spinor * const globalfield, spinor * const blockfield,
 
 /* Reconstructs a global field from the little basis of nb_blocks blocks */
 void reconstruct_global_field_GEN(spinor * const rec_field, spinor ** const psi, const int nb_blocks) {
-  int ctr_t=0;
-  int x, y, z, t;
-  int bx, by, bz, bt, block_id;
-  for (t = 0; t < dT; t++) {
-    for (x = 0; x < dX; x++) {
-      for (y = 0; y < dY; y++) {
-	for (z = 0; z < dZ; z++) {
-	  block_id = 0;
-	  for(bt = 0; bt < nblks_t; bt++) {
-	    for(bx = 0; bx < nblks_x; bx++) {
-	      for(by = 0; by < nblks_y; by++) {
-		for(bz = 0; bz < nblks_z; bz++) {
-		  _spinor_assign(*(rec_field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z)), 
-				 *(psi[block_id] + ctr_t));
-		  block_id++;
-		}
-	      }
-	    }
-	  }
-	  ctr_t++;
-	}
-      }
+
+#ifdef TM_USE_OMP
+#pragma omp parallel for
+#endif
+  for(int ix = 0; ix < dT*dX*dY*dZ; ix++) {
+    int z = ix % dZ;
+    int y = ((ix - z) % (dY*dZ)) / dZ;
+    int x = (ix - z - y*dZ) % (dX*dY*dZ) / (dZ*dY);
+    int t = (ix - z - y*dZ - x*dZ*dY) / (dZ*dY*dX);
+    for(int block_id = 0; block_id < nb_blocks; block_id++) {
+      int bz = block_id % nblks_z;
+      int by = ((block_id - bz) % (nblks_y*nblks_z)) / nblks_z;
+      int bx = ((block_id - bz - by*nblks_z) % (nblks_x*nblks_y*nblks_z)) / ( nblks_z*nblks_y);
+      int bt = (block_id - bz - by*nblks_z - bx*nblks_z*nblks_y) / ( nblks_z*nblks_y*nblks_x);
+      _spinor_assign(*(rec_field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z)), 
+                     *(psi[block_id] + ix));
     }
   }
   return;
@@ -1434,28 +1495,21 @@ void reconstruct_global_field_GEN(spinor * const rec_field, spinor ** const psi,
 
 /* Reconstructs a global field from the little basis of nb_blocks blocks taken from block_list[*].basis[id] */
 void reconstruct_global_field_GEN_ID(spinor * const rec_field, block * const block_list, const int id, const int nb_blocks) {
-  int ctr_t=0;
-  int x, y, z, t;
-  int bx, by, bz, bt, block_id;
-  for (t = 0; t < dT; t++) {
-    for (x = 0; x < dX; x++) {
-      for (y = 0; y < dY; y++) {
-	for (z = 0; z < dZ; z++) {
-	  block_id = 0;
-	  for(bt = 0; bt < nblks_t; bt++) {
-	    for(bx = 0; bx < nblks_x; bx++) {
-	      for(by = 0; by < nblks_y; by++) {
-		for(bz = 0; bz < nblks_z; bz++) {
-		  _spinor_assign(*(rec_field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z)), 
-				 *(block_list[block_id].basis[id] + ctr_t));
-		  block_id++;
-		}
-	      }
-	    }
-	  }
-	  ctr_t++;
-	}
-      }
+#ifdef TM_USE_OMP
+#pragma omp parallel for
+#endif
+  for(int ix = 0; ix < dT*dX*dY*dZ; ix++) {
+    int z = ix % dZ;
+    int y = ((ix - z) % (dY*dZ)) / dZ;
+    int x = (ix - z - y*dZ) % (dX*dY*dZ) / (dZ*dY);
+    int t = (ix - z - y*dZ - x*dZ*dY) / (dZ*dY*dX);
+    for(int block_id = 0; block_id < nb_blocks; block_id++) {
+      int bz = block_id % nblks_z;
+      int by = ((block_id - bz) % (nblks_y*nblks_z)) / nblks_z;
+      int bx = ((block_id - bz - by*nblks_z) % (nblks_x*nblks_y*nblks_z)) / ( nblks_z*nblks_y);
+      int bt = (block_id - bz - by*nblks_z - bx*nblks_z*nblks_y) / ( nblks_z*nblks_y*nblks_x);
+      _spinor_assign(*(rec_field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z)), 
+                     *(block_list[block_id].basis[id] + ix));
     }
   }
   return;
@@ -1471,25 +1525,66 @@ void add_eo_block_to_global(spinor * const globalfield, spinor * const beven, sp
     for(x = 0; x < block_list[blk].BLX; x++) {
       ix = x +  block_list[blk].mpilocal_coordinate[1]*block_list[blk].BLX;
       for(y = 0; y < block_list[blk].BLY; y++) {
-	iy = y +  block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY;
-	for(z = 0; z < block_list[blk].BLZ; z++) {
-	  iz = z +  block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ;
-	  i = g_ipt[it][ix][iy][iz];
-	  if((t+x+y+z)%2 == 0) {
-	    add(globalfield + i, globalfield + i, beven + even, 1);
-	    even++;
-	  }
-	  else {
-	    add(globalfield + i, globalfield + i, bodd + odd, 1);
-	    odd++;
-	  }
-	}
+        iy = y +  block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY;
+        for(z = 0; z < block_list[blk].BLZ; z++) {
+          iz = z +  block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ;
+          i = g_ipt[it][ix][iy][iz];
+          if((t+x+y+z)%2 == 0) {
+            add(globalfield + i, globalfield + i, beven + even, 1);
+            even++;
+          }
+          else {
+            add(globalfield + i, globalfield + i, bodd + odd, 1);
+            odd++;
+          }
+        }
+      }
+    }
+  }
+  return;
+}
+
+void add_eo_block_32_to_global(spinor * const globalfield, 
+                               spinor32 * const beven, spinor32 * const bodd, const int blk) {
+  int i,it,ix,iy,iz;
+  int even = 0, odd = 0; /* next unread entry of beven / bodd */
+  spinor32 * tmp = NULL;
+  _Complex double * to = NULL;
+  _Complex float * from = NULL;
+  /* add the 32 bit even/odd fields beven, bodd of block blk onto globalfield, site by site */
+  for(int t = 0; t < block_list[blk].BT; t++) {
+    it = t + block_list[blk].mpilocal_coordinate[0]*block_list[blk].BT; /* block-local t shifted by the block's offset */
+    for(int x = 0; x < block_list[blk].BLX; x++) {
+      ix = x +  block_list[blk].mpilocal_coordinate[1]*block_list[blk].BLX;
+      for(int y = 0; y < block_list[blk].BLY; y++) {
+        iy = y +  block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY;
+        for(int z = 0; z < block_list[blk].BLZ; z++) {
+          iz = z +  block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ;
+          i = g_ipt[it][ix][iy][iz]; /* position of this site in globalfield */
+          if((t+x+y+z)%2 == 0) { /* parity of the block-local coordinates picks the source field */
+            tmp = beven + even;
+            even++;
+          }
+          else {
+            tmp = bodd + odd;
+            odd++;
+          }
+          to = (_Complex double*) (globalfield + i);
+          from = (_Complex float*) tmp;
+          for(int k = 0; k < 12; k++) { /* 12 complex entries per site; assumes that is the whole spinor/spinor32 - TODO confirm */
+            (*to) += (_Complex double) (*from); /* widen to double, then accumulate */
+            to++;
+            from++;
+          }
+
+        }
       }
     }
   }
   return;
 }
 
+
 void add_block_to_global(spinor * const globalfield, spinor * const blockfield, const int blk) {
   int i;
   spinor * r, * s;
@@ -1539,17 +1634,17 @@ void block_convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * cons
   for(x = 0; x < dX; x++) {
     for(y = 0; y < dY; y++) {
       for(z = 0; z < dZ; z++) {
-	for(t = 0; t < dT; t++) {
-	  ix = block_ipt[t][x][y][z];
-	  i = ix / 2;
-	  if((x + y + z + t)%2 == 0) {
-	    p = s;
-	  }
-	  else {
-	    p = r;
-	  }
-	  memcpy((P+ix), (p+i), sizeof(spinor));
-	}
+        for(t = 0; t < dT; t++) {
+          ix = block_ipt[t][x][y][z];
+          i = ix / 2;
+          if((x + y + z + t)%2 == 0) {
+            p = s;
+          }
+          else {
+            p = r;
+          }
+          memcpy((P+ix), (p+i), sizeof(spinor));
+        }
       }
     }
   }
@@ -1568,17 +1663,17 @@ void block_convert_lexic_to_eo(spinor * const s, spinor * const r, spinor * cons
   for(x = 0; x < dX; x++) {
     for(y = 0; y < dY; y++) {
       for(z = 0; z < dZ; z++) {
-	for(t = 0; t < dT; t++) {
-	  ix = block_ipt[t][x][y][z];
-	  i = ix / 2;
-	  if((x + y + z + t)%2 == 0) {
-	    p = s;
-	  }
-	  else {
-	    p = r;
-	  }
-	  memcpy((p+i), (P+ix), sizeof(spinor));
-	}
+        for(t = 0; t < dT; t++) {
+          ix = block_ipt[t][x][y][z];
+          i = ix / 2;
+          if((x + y + z + t)%2 == 0) {
+            p = s;
+          }
+          else {
+            p = r;
+          }
+          memcpy((p+i), (P+ix), sizeof(spinor));
+        }
       }
     }
   }
diff --git a/block.h b/block.h
index 913420fe1..ec1d597bf 100644
--- a/block.h
+++ b/block.h
@@ -26,10 +26,10 @@
 #include "su3.h"
 #include "su3spinor.h"
 
-_Complex double * little_A;
-_Complex float * little_A32;
-_Complex double * little_A_eo;
-_Complex float * little_A32_eo;
+extern _Complex double * little_A;
+extern _Complex float * little_A32;
+extern _Complex double * little_A_eo;
+extern _Complex float * little_A_eo_32;
 
 
 typedef struct {
@@ -46,15 +46,18 @@ typedef struct {
   int *oddidx;                 /* provides the next neighbours for spinors on the block even/odd case */
   spinor **basis;               /* generated orthonormal basis for little D [Ns x local_volume] */
   su3 * u;                      /* block local gauge field, for use in D */
+  su3_32 * u_32;                /* 32 bit block local gauge field, for use in D */
   int spinpad;                  /* number of elements needed to store the boundaries of the spinor */
   int evenodd;                  /* block even or odd (0 or 1) */
+  int evenodd_id;               /* sequence of even and odd blocks */
 
   /* storage will be g_Ns x (9 * g_Ns)                 */
   /* build_little_diraclocal g_Ns x g_Ns block first (the diagonal part) */
   /* then +t, -t, +x, -x, +y, -y, +z, -z               */
   _Complex double    *little_dirac_operator;  /* full dense representation of the little D */
-  _Complex float  *little_dirac_operator32;
+  _Complex float  *little_dirac_operator_32;
   _Complex double    *little_dirac_operator_eo;  /* full dense representation of the little D in e/o order */
+  _Complex float    *little_dirac_operator_eo_32; 
 } block;
 
 int init_blocks(const int nt, const int nx, const int ny, const int nz);
@@ -62,13 +65,24 @@ int free_blocks();
 
 int init_blocks_gaugefield();
 int init_blocks_eo_gaugefield();
+int init_blocks_gaugefield_32();
+int init_blocks_eo_gaugefield_32();
 
 void copy_global_to_block(spinor * const blockfield, spinor * const globalfield, const int blk);
 void copy_block_to_global(spinor * const globalfield, spinor * const blockfield, const int blk);
-void copy_global_to_block_eo(spinor * const beven, spinor * const bodd, spinor * const globalfield, const int blk);
-void copy_block_eo_to_global(spinor * const globalfield, spinor * const beven, spinor * const bodd, const int blk);
+void copy_global_to_block_eo(spinor * const beven, spinor * const bodd, 
+			     spinor * const globalfield, const int blk);
+void copy_global_to_block_eo_32(spinor32 * const beven, spinor32 * const bodd, 
+				spinor * const globalfield, const int blk);
+
+void copy_block_eo_to_global(spinor * const globalfield, 
+			     spinor * const beven, spinor * const bodd, const int blk);
+
 void add_block_to_global(spinor * const globalfield, spinor * const blockfield, const int blk);
-void add_eo_block_to_global(spinor * const globalfield, spinor * const beven, spinor * const bodd, const int blk);
+void add_eo_block_to_global(spinor * const globalfield, 
+			    spinor * const beven, spinor * const bodd, const int blk);
+void add_eo_block_32_to_global(spinor * const globalfield, 
+			       spinor32 * const beven, spinor32 * const bodd, const int blk);
 
 void block_convert_lexic_to_eo(spinor * const s, spinor * const r, spinor * const P);
 void block_convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * const r);
@@ -76,8 +90,8 @@ void block_convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * cons
 void block_orthonormalize(block *parent);
 void block_orthonormalize_free(block *parent);
 
-void compute_little_D();
-void compute_little_D_diagonal();
+void compute_little_D(const int mul_g5);
+void compute_little_D_diagonal(const int mul_g5);
 void alt_block_compute_little_D();
 
 extern int dfl_field_iter;
diff --git a/boundary.c b/boundary.c
index 8835e0ddf..62931f8b9 100644
--- a/boundary.c
+++ b/boundary.c
@@ -23,7 +23,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <math.h>
@@ -34,7 +34,9 @@
   
 _Complex double ALIGN ka0, ka1, ka2, ka3;
 _Complex double ALIGN phase_0, phase_1, phase_2, phase_3;
+#if defined TM_USE_BSM
 _Complex double ALIGN phase_00, phase_11, phase_22, phase_33;
+#endif
 const double PI_ = 3.14159265358979;
 double X0, X1, X2, X3;
 
@@ -53,8 +55,10 @@ void boundary(const double kappa)
   phase_1 = -ka1;
   phase_2 = -ka2;
   phase_3 = -ka3;
+#if defined TM_USE_BSM
   phase_00 = phase_0*phase_0;
   phase_11 = phase_1*phase_1;
   phase_22 = phase_2*phase_2;
   phase_33 = phase_3*phase_3;
+#endif
 }
diff --git a/boundary.h b/boundary.h
index 45ab63743..142f16afa 100644
--- a/boundary.h
+++ b/boundary.h
@@ -23,7 +23,10 @@
 #include "su3.h"
 
 extern _Complex double ka0, ka1, ka2, ka3;
+extern _Complex double phase_0, phase_1, phase_2, phase_3;
+#ifdef TM_USE_BSM
 extern _Complex double phase_0, phase_1, phase_2, phase_3, phase_00, phase_11, phase_22, phase_33;
+#endif
 void boundary(const double kappa);
 
 #endif
diff --git a/buffers/Makefile.in b/buffers/Makefile.in
index 93a31f5e8..ffdbe2ee8 100644
--- a/buffers/Makefile.in
+++ b/buffers/Makefile.in
@@ -66,7 +66,7 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) -c $<
 
 
diff --git a/buffers/gauge.ih b/buffers/gauge.ih
index 9b66819bd..69e709ca6 100644
--- a/buffers/gauge.ih
+++ b/buffers/gauge.ih
@@ -1,5 +1,5 @@
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
 
 #include <stdio.h>
diff --git a/buffers/utils.ih b/buffers/utils.ih
index 27575f427..d3efd3fdf 100644
--- a/buffers/utils.ih
+++ b/buffers/utils.ih
@@ -1,12 +1,10 @@
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
-#ifndef _UTILS_H
-#define _UTILS_H
+
 #include <string.h>
 
 #include <global.h>
 #include <xchange/xchange.h>
 
 #include <buffers/utils.h>
-#endif
diff --git a/buffers/utils_generic_exchange.c b/buffers/utils_generic_exchange.c
index 8f9b1d40d..cdcb62365 100644
--- a/buffers/utils_generic_exchange.c
+++ b/buffers/utils_generic_exchange.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 #include "utils.ih"
 
-#ifndef MPI /*Let's deal with this case once and for all*/
+#ifndef TM_USE_MPI /*Let's deal with this case once and for all*/
 void generic_exchange(void *field_in, int bytes_per_site)
 {}
 #else /* MPI */
@@ -161,22 +161,23 @@ void generic_exchange(void *field_in, int bytes_per_site)
 #endif /* _NON_BLOCKING */
 }
 
-#endif /* MPI */
 
+#endif /* MPI */
 
-void copy_gauge_field(gauge_field_t dest, gauge_field_t orig)
+inline void copy_gauge_field(gauge_field_t dest, gauge_field_t orig)
 {
   memmove((void*)dest.field, (void*)orig.field, sizeof(su3_tuple) * VOLUMEPLUSRAND + 1);
 }
 
-void exchange_gauge_field(gauge_field_t target)
+
+inline void exchange_gauge_field(gauge_field_t target)
 {
   generic_exchange((void*)target.field, sizeof(su3_tuple));
 }
 
-void exchange_gauge_field_array(gauge_field_array_t target)
+
+inline void exchange_gauge_field_array(gauge_field_array_t target)
 {
   for (unsigned int idx = 0; idx < target.length; ++idx)
     exchange_gauge_field(target.field_array[idx]);
 }
-
diff --git a/buffers/utils_generic_exchange_nogauge.c b/buffers/utils_generic_exchange_nogauge.c
index f255e1223..eccd25f51 100644
--- a/buffers/utils_generic_exchange_nogauge.c
+++ b/buffers/utils_generic_exchange_nogauge.c
@@ -18,7 +18,8 @@
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
 #include "utils_nogauge.h"
-#ifndef MPI /*Let's deal with this case once and for all*/
+#ifdef TM_USE_BSM
+#ifndef TM_USE_MPI /*Let's deal with this case once and for all*/
 void generic_exchange_nogauge(void *field_in, int bytes_per_site )
 {}
 #else /* MPI */
@@ -104,4 +105,4 @@ void generic_exchange_nogauge(void *field_in, int bytes_per_site )
 }
 
 #endif /* MPI */
-
+#endif /* TM_USE_BSM */
diff --git a/buffers/utils_generic_exchange_nogauge.inc b/buffers/utils_generic_exchange_nogauge.inc
index 8605f3e8a..a4f54443c 100644
--- a/buffers/utils_generic_exchange_nogauge.inc
+++ b/buffers/utils_generic_exchange_nogauge.inc
@@ -1,3 +1,4 @@
+#if defined TM_USE_BSM
 #     if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
        MPI_Isend(buffer[0],          1, slice_T_cont_type, g_nb_t_dn, 83,
 	    g_cart_grid, &request[cntr]);
@@ -72,3 +73,4 @@
       cntr=cntr+2;
       MPI_Waitall(cntr, request, status);
 #    endif
+#endif
diff --git a/buffers/utils_generic_exchange_nonblocking.c b/buffers/utils_generic_exchange_nonblocking.c
index 8aad4b6ba..116576909 100644
--- a/buffers/utils_generic_exchange_nonblocking.c
+++ b/buffers/utils_generic_exchange_nonblocking.c
@@ -18,9 +18,9 @@
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
 #include "utils_nonblocking.ih"
-
-#ifndef MPI /*Let's deal with this case once and for all*/
-void generic_exchange_direction_nonblocking(void *field_in, int bytes_per_site, int direction, MPI_Request *inreq, int* counter)
+#if defined TM_USE_BSM
+#ifndef TM_USE_MPI /*Let's deal with this case once and for all*/
+void generic_exchange_direction_nonblocking(void *field_in, int bytes_per_site, int direction, int* counter)
 {}
 #else /* MPI */
 void generic_exchange_direction_nonblocking(void *field_in, int bytes_per_site, int direction, MPI_Request *inreq, int* counter)
@@ -170,4 +170,4 @@ void generic_exchange_direction_nonblocking(void *field_in, int bytes_per_site,
   }
 }
 #endif /* MPI */
-
+#endif /* TM_USE_BSM */
diff --git a/buffers/utils_nogauge.h b/buffers/utils_nogauge.h
index f98adea47..7295a39c1 100644
--- a/buffers/utils_nogauge.h
+++ b/buffers/utils_nogauge.h
@@ -2,7 +2,7 @@
 #define _UTILS_NOGAUGE_H
 
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
 
 #include <global.h>
diff --git a/buffers/utils_nonblocking.h b/buffers/utils_nonblocking.h
index d1475e394..7a49f0dd4 100644
--- a/buffers/utils_nonblocking.h
+++ b/buffers/utils_nonblocking.h
@@ -1,8 +1,11 @@
 #ifndef _UTILS_NONBLOCKING_H
 #define _UTILS_NONBLOCKING_H
 
+#ifndef TM_USE_MPI
+void generic_exchange_direction_nonblocking(void *field_in, int bytes_per_site, int direction, int *counter);
+#else
 void generic_exchange_direction_nonblocking(void *field_in, int bytes_per_site, int direction, MPI_Request *inreq, int *counter);
-
+#endif
 void generic_exchange_nogauge(void *field_in, int bytes_per_site );
 
 #endif
diff --git a/buffers/utils_nonblocking.ih b/buffers/utils_nonblocking.ih
index 1b22247e0..71f52bb4f 100644
--- a/buffers/utils_nonblocking.ih
+++ b/buffers/utils_nonblocking.ih
@@ -1,5 +1,5 @@
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
 
 #include <string.h>
diff --git a/chebyshev_polynomial.c b/chebyshev_polynomial.c
index bbf6e4218..abd1c45e0 100644
--- a/chebyshev_polynomial.c
+++ b/chebyshev_polynomial.c
@@ -20,7 +20,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/chebyshev_polynomial_nd.c b/chebyshev_polynomial_nd.c
index 05b8a81a2..a58e0c8df 100644
--- a/chebyshev_polynomial_nd.c
+++ b/chebyshev_polynomial_nd.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/check_locallity.c b/check_locallity.c
index a4ca64e13..2ddf5e21f 100644
--- a/check_locallity.c
+++ b/check_locallity.c
@@ -19,7 +19,7 @@
 
 #include"lime.h"
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -27,7 +27,7 @@
 #include <time.h>
 #include <string.h>
 #include <signal.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include "global.h"
@@ -36,7 +36,7 @@
 #include "geometry_eo.h"
 #include "start.h"
 #include "measure_gauge_action.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include "xchange/xchange.h"
 #endif
 #include "read_input.h"
@@ -85,7 +85,7 @@ int main(int argc,char *argv[]) {
 #ifdef _GAUGE_COPY
   int kb=0;
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
   double atime=0., etime=0.;
 #endif
 #ifdef _KOJAK_INST
@@ -95,8 +95,7 @@ int main(int argc,char *argv[]) {
 
   DUM_DERI = 6;
   /* DUM_DERI + 2 is enough (not 7) */
-  DUM_SOLVER = DUM_DERI+2;
-  DUM_MATRIX = DUM_SOLVER+6;
+  DUM_MATRIX = DUM_DERI+8;
   /* DUM_MATRIX + 2 is enough (not 6) */
   NO_OF_SPINORFIELDS = DUM_MATRIX+2;
 
@@ -104,7 +103,7 @@ int main(int argc,char *argv[]) {
   g_use_clover_flag = 0;
   g_nr_of_psf = 1;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Init(&argc, &argv);
 #endif
 
@@ -146,7 +145,7 @@ int main(int argc,char *argv[]) {
 
   g_dbw2rand = 0;
 
-#ifndef MPI
+#ifndef TM_USE_MPI
   g_dbw2rand = 0;
 #endif
 
@@ -219,7 +218,7 @@ int main(int argc,char *argv[]) {
     if (g_proc_id == 0){
       printf("done!\n"); fflush(stdout);
     }
-#ifdef MPI
+#ifdef TM_USE_MPI
     xchange_gauge(g_gauge_field);
 #endif
 
@@ -253,7 +252,8 @@ int main(int argc,char *argv[]) {
     if(even_odd_flag) {
       i = invert_eo(g_spinor_field[2], g_spinor_field[3], g_spinor_field[0], g_spinor_field[1], 
 		    solver_precision, max_solver_iterations, solver_flag, g_relative_precision_flag,
-		    sub_evs_cg_flag, even_odd_flag, 0, NULL, -1);
+		    sub_evs_cg_flag, even_odd_flag, 0, NULL, -1,
+            NO_EXT_INV, SLOPPY_DOUBLE, NO_COMPRESSION);
       convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], g_spinor_field[2], g_spinor_field[3]);
     }
 
@@ -294,7 +294,7 @@ int main(int argc,char *argv[]) {
     nstore+=Nsave;
   }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Finalize();
 #endif
   free_gauge_field();
diff --git a/configure.in b/configure.in
index 722f5ea9a..66c02a071 100644
--- a/configure.in
+++ b/configure.in
@@ -3,7 +3,7 @@
 #
 AC_PREREQ(2.59)
 AC_INIT(tmLQCD, 5.2.0, curbach@gmx.de)
-AC_CONFIG_HEADER(config.h)
+AC_CONFIG_HEADER(include/tmlqcd_config.h)
 AC_CONFIG_SRCDIR([hmc_tm.c])
 AC_CANONICAL_HOST()
 AC_PREFIX_DEFAULT($HOME)
@@ -32,6 +32,7 @@ fi
 AC_PROG_MAKE_SET
 AC_PROG_RANLIB
 AC_CHECK_PROG(CCDEP, gcc, "gcc", "$CC")
+AC_CHECK_PROG(CXXDEP, g++, "g++", "$CXX")
 #(endian="", AC_DEFINE(LITTLE_ENDIAN,1,The endian of the architechture))
 
 # AC_PROG_FC([ifort gfortran])
@@ -41,7 +42,7 @@ LDFLAGS="$LDFLAGS -L\${HOME}/lib -L\${top_builddir}/lib"
 CCLD=${CC}
 
 # compilation in operator is slowest so we do it first, saves time in parallel compiles
-USESUBDIRS="operator linalg solver monomial buffers cu io xchange init rational wrapper"
+USESUBDIRS="operator linalg solver monomial buffers cu io meas xchange init rational wrapper contractions"
 
 AC_CHECK_HEADERS([stdint.h],
 [ dnl for inttypes.h and stdint.h for uint_xxx types
@@ -104,11 +105,43 @@ AC_ARG_ENABLE(mpi,
   enable_mpi=$enableval, enable_mpi=yes)
 if test $enable_mpi = yes; then
   AC_MSG_RESULT(yes)
-  AC_DEFINE(MPI,1,Compile with MPI support)
+  AC_DEFINE(TM_USE_MPI,1,Compile with MPI support)
 else
   AC_MSG_RESULT(no)
 fi
 
+
+AC_MSG_CHECKING(whether we want to use BSM operators with two separate gauge fields)
+AC_ARG_ENABLE(bsm,
+  AS_HELP_STRING([--enable-bsm],[enable BSM operators using two separate gauge fields [default=yes]]),
+  enable_BSM=$enableval, enable_BSM=yes)
+if test $enable_BSM = yes; then
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(TM_USE_BSM,1,BSM operators with two separate gauge fields)
+else
+  AC_MSG_RESULT(no)
+fi
+
+AC_MSG_CHECKING(whether we want to use DDalphaAMG)
+AC_ARG_WITH(DDalphaAMG,
+            AS_HELP_STRING([--with-DDalphaAMG[=dir]], [use DDalphaAMG, to be found in dir]),
+             [AC_MSG_RESULT(yes)
+              DDalphaAMG_AVAILABLE=1
+              DDalphaAMG_INTERFACE="DDalphaAMG_interface"
+              AC_DEFINE(DDalphaAMG,1,Using DDalphaAMG)
+              DDalphaAMG_dir=$withval
+              LDFLAGS="$LDFLAGS -L${DDalphaAMG_dir}/lib"
+              INCLUDES="$INCLUDES -I${DDalphaAMG_dir}/include/"
+              AC_CHECK_LIB([DDalphaAMG],
+                           [DDalphaAMG_finalize],
+                           [],
+                           [AC_MSG_ERROR([library DDalphaAMG was not found])])],
+             [AC_MSG_RESULT(no)
+              DDalphaAMG_AVAILABLE=0
+              DDalphaAMG_INTERFACE="DDalphaAMG_interface"
+              ])
+
+
 AC_MSG_CHECKING(whether to use QPX intrinsics)
 AC_ARG_ENABLE(qpx,
   AS_HELP_STRING([--enable-qpx], [enable use of qpx intrinsics [default=no]]),
@@ -143,7 +176,7 @@ AC_ARG_ENABLE(omp,
   enable_omp=$enableval, enable_omp=yes)
 if test $enable_omp = yes; then
   AC_MSG_RESULT(yes)
-  AC_DEFINE(OMP,1,Compile with OpenMP support)
+  AC_DEFINE(TM_USE_OMP,1,Compile with OpenMP support)
   AC_CHECK_HEADERS([omp.h],,[AC_MSG_ERROR([Cannot find OpenMP headers!])])
   AC_OPENMP
 # -- AC_OPENMP provides a compiler-dependent OPENMP_CFLAGS so we can set it here
@@ -349,44 +382,65 @@ AC_SUBST(INCLUDES)
 AC_SUBST(AUTOCONF)
 AC_SUBST(SOLVEROUT)
 AC_SUBST(CCDEP)
+AC_SUBST(CXXDEP)
 AC_SUBST(CCLD)
 AC_SUBST(DEPFLAGS)
+AC_SUBST(CXXDEPFLAGS)
 AC_SUBST(DEBUG_FLAG)
 AC_SUBST(PROFILE_FLAG)
 AC_SUBST(XCHANGELIB)
 AC_SUBST(XCHANGEDIR)
+AC_SUBST(MEASDIR)
 AC_SUBST(XLIB)
 AC_SUBST([LEMON_AVAILABLE])
 AC_SUBST(SPI_FILES)
+AC_SUBST(QUDA_INTERFACE)
+AC_SUBST(QPHIX_INTERFACE)
+AC_SUBST(QPHIX_PROGRAMS)
+AC_SUBST(DDalphaAMG_INTERFACE)
 
-INCLUDES="$INCLUDES -I\$(HOME)/include/ -I. -I\${abs_top_builddir}/  -I\${abs_top_srcdir}/ -I${lime_dir}/include/ -I${lemon_dir}/include/"
+INCLUDES="$INCLUDES -I\$(HOME)/include/ -I. -I\${abs_top_builddir}/  -I\${abs_top_builddir}/include/ -I\${abs_top_srcdir}/ -I${lime_dir}/include/ -I${lemon_dir}/include/"
 DEPFLAGS="$DEPFLAGS"
 
 AC_MSG_CHECKING(what alignment we want for arrays)
 AC_ARG_ENABLE(alignment,
-  [AS_HELP_STRING([--enable-alignment[=n]], [Automatically or expliclty align arrays to byte number: auto, none, 16, 32 [default=auto]])],
+  [AS_HELP_STRING([--enable-alignment[=n]], [Automatically or expliclty align arrays to byte number: auto, none, 16, 32, 64 [default=auto]])],
   withalign=$enableval, withalign=auto)
 if test "$withalign" = "none"; then
   AC_MSG_RESULT(none)
   withalign=1
   AC_DEFINE(ALIGN_BASE, 0x00, [Align base])
   AC_DEFINE(ALIGN, [])
+  AC_DEFINE(ALIGN_BASE32, 0x00, [Align base32])
+  AC_DEFINE(ALIGN32, [], [])
 elif test $withalign = 16; then
   AC_MSG_RESULT(16 bytes)
   AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
   AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
+  AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])
+  AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))], [])
 elif test $withalign = 32; then
   AC_MSG_RESULT(32 bytes)
   AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
   AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
+  AC_DEFINE(ALIGN_BASE32, 0x1F, [Align base32])
+  AC_DEFINE(ALIGN32, [__attribute__ ((aligned (32)))], [])
+elif test $withalign = 64; then
+  AC_MSG_RESULT(64 bytes)
+  AC_DEFINE(ALIGN_BASE, 0x3F, [Align base])
+  AC_DEFINE(ALIGN, [__attribute__ ((aligned (64)))])
+  AC_DEFINE(ALIGN_BASE32, 0x3F, [Align base32])
+  AC_DEFINE(ALIGN32, [__attribute__ ((aligned (64)))], [])
 elif test $withalign = auto; then
   withautoalign=1
   AC_MSG_RESULT(auto)
   AC_DEFINE(ALIGN_BASE, 0x00, [Align base])
   AC_DEFINE(ALIGN, [], [])
+  AC_DEFINE(ALIGN_BASE32, 0x00, [Align base32])
+  AC_DEFINE(ALIGN32, [], [])
 else
   AC_MSG_RESULT(Unusable value for array alignment)
-  AC_MSG_ERROR([Allowed values are: auto, none, 16, 32])
+  AC_MSG_ERROR([Allowed values are: auto, none, 16, 32, 64])
 fi
 
 dnl in the following we check for extra options
@@ -404,6 +458,9 @@ if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then
         AC_MSG_RESULT(increasing array alignment to 16 bytes for P4 instructions)
         AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
         AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
+        AC_MSG_RESULT(increasing array 32 bit alignment to 16 bytes for P4 instructions)
+        AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base])
+        AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
         withautoalign=16
       fi
     elif test $withalign -lt 16; then
@@ -425,6 +482,9 @@ if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then
         AC_MSG_RESULT(increasing array alignment to 16 bytes for Opteron instructions)
         AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
         AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
+        AC_MSG_RESULT(increasing array 32 bit alignment to 16 bytes for Opteron instructions)
+        AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])
+        AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
         withautoalign=16
       fi
     elif test $withalign -lt 16; then
@@ -466,6 +526,9 @@ if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then
         AC_MSG_RESULT(increasing array alignment to 16 bytes for SSE instructions)
         AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
         AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
+        AC_MSG_RESULT(increasing 32bit array alignment to 16 bytes for SSE instructions)
+        AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])	
+        AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
         withautoalign=16
       fi
     fi
@@ -479,6 +542,9 @@ if test $enable_qpx = yes; then
       AC_MSG_RESULT(increasing array alignment to 32 bytes for use of QPX instructions on BG/Q)
       AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
       AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
+      AC_MSG_RESULT(increasing 32bit array alignment to 16 bytes for use of QPX instructions on BG/Q)
+      AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])
+      AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
       withautoalign=32
     fi
   elif test $withalign -lt 32; then
@@ -493,7 +559,7 @@ if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os
     if test $withautoalign -lt 16; then
       AC_MSG_RESULT(increasing array alignment to 16 bytes for BG/L optimization)
       AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
-      AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
+      AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))], [Align base])
       withautoalign=16
     fi
   fi
@@ -502,7 +568,7 @@ elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_
     if test $withautoalign -lt 16; then
       AC_MSG_RESULT(increasing array alignment to 16 bytes for BG/P optimization)
       AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
-      AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
+      AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))], [Align base])
       withautoalign=16
     fi
   fi
@@ -512,6 +578,9 @@ elif test "$host_cpu" = "powerpc64" && test "$host_vendor" = "unknown" && test "
       AC_MSG_RESULT(increasing array alignment to 32 bytes for BG/Q and generic POWER optimization)
       AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
       AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
+      AC_MSG_RESULT(increasing array 32 bit alignment to 16 bytes for BG/Q and generic POWER optimization)
+      AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base])
+      AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
       withautoalign=32
     fi
   fi
@@ -584,6 +653,10 @@ dnl the GNU compiler
     if test $enable_mpi = yes; then
       CCDEP="gcc"
     fi
+    CXXDEP="$CXX"
+    if test $enable_mpi = yes; then
+      CXXDEP="g++"
+    fi
     DEBUG_FLAG="-g"
 dnl other compilers
   else
@@ -606,12 +679,24 @@ dnl check for icc
       DEBUG_FLAG="-g"
       PROFILE_FLAG="-p -g"
       CCDEP="$CC"
+      CXXDEP="$CXX"
 
     else
+      # other compilers might support SSE inline assembly too
+      # (the cray compiler, for example)
+      if test $enable_sse3 = yes; then
+        echo Using SSE3 and SSE2 macros!
+        AC_DEFINE(SSE3,1,Compile with SSE3 support)
+      elif test $enable_sse2 = yes; then
+        echo Using SSE2 macros only!
+        AC_DEFINE(SSE2,1,Compile with SSE2 support)
+      fi
+
       DEPFLAGS="-M"
       CFLAGS="$CFLAGS -O"
       DEBUG_FLAG="-g"
       CCDEP="$CC"
+      CXXDEP="$CXX"
     fi
   fi
 
@@ -746,6 +831,7 @@ else
   SOPTARGS=
 fi
 
+CXXDEPFLAGS="$DEPFLAGS --std=c++11"
 
 AC_MSG_CHECKING(whether we want to switch on optimisation)
 AC_ARG_ENABLE(optimize,
@@ -871,6 +957,92 @@ AC_SUBST(GPUCFLAGS)
 AC_SUBST(GPUMPICOMPILER)
 
 
+# QUDA library for GPUs: needs the CUDA runtime and a C++ link step
+AC_MSG_CHECKING(whether we want to use QUDA GPU)
+AC_ARG_WITH(qudadir,
+            AS_HELP_STRING([--with-qudadir[=dir]], [use QUDA, to be found in dir]),
+             [AC_MSG_RESULT(yes)
+              QUDA_AVAILABLE=1
+              AC_DEFINE(TM_USE_QUDA,1,Using QUDA GPU)
+              quda_dir=$withval
+              LDFLAGS="$LDFLAGS -L${quda_dir}/lib"
+              INCLUDES="$INCLUDES -I${quda_dir}/include/"
+              QUDA_INTERFACE="quda_interface"
+              AC_MSG_CHECKING([where to search for CUDA libs])
+              AC_ARG_WITH(cudadir,
+                AS_HELP_STRING([--with-cudadir[=dir]], [if using QUDA, then set CUDA lib dir [default=/usr/local/cuda/lib]]),
+                cuda_dir=$withval, cuda_dir="/usr/local/cuda/lib")
+              AC_MSG_RESULT($cuda_dir)
+              LDFLAGS="$LDFLAGS -L$cuda_dir -lcuda -lcublas"
+              AC_CHECK_LIB([cudart],
+                           [cudaMalloc],
+                           [],
+                           [AC_MSG_ERROR([Can't link a simple program against library cudart.])]
+                           )
+              # The QUDA link test has to be compiled and linked as C++
+              AC_LANG_PUSH([C++])
+              AC_CHECK_LIB([quda],
+                           [freeGaugeQuda],
+                           [],
+                           [AC_MSG_ERROR([Can't link a simple program against library libquda. (Did you set CXX properly?)])]
+                           )
+              AC_LANG_POP([C++])
+              # Back to the previous language for later checks. QUDA itself needs the C++ linker
+              CCLD=${CXX}
+             ],
+             [AC_MSG_RESULT(no)
+              QUDA_AVAILABLE=0
+              QUDA_INTERFACE=""
+              ]
+              )
+AC_SUBST([QUDA_AVAILABLE])
+
+
+# QPhiX library for Intel Xeon and Xeon Phis
+AC_MSG_CHECKING(whether we want to use QPhiX)
+AC_ARG_WITH(qphixdir,
+            AS_HELP_STRING([--with-qphixdir[=dir]], [use QPhiX, to be found in dir]),
+             [AC_MSG_RESULT(yes)
+              QPHIX_AVAILABLE=1
+              AC_DEFINE(TM_USE_QPHIX,1,Using QPhiX)
+              qphix_dir=$withval
+              LDFLAGS="$LDFLAGS -L${qphix_dir}/lib -lqphix_solver -lqphix_codegen"
+              INCLUDES="$INCLUDES -I${qphix_dir}/include/"
+              QPHIX_INTERFACE="qphix_interface"
+              QPHIX_PROGRAMS=""
+              # Due to github issue #404, the qphix test_Dslash code has been disabled by BaKo
+              # for the time being
+              # it should be updated to make use of the QPhiX internal interfaces
+              # for passing full lattice spinors
+              # "qphix_test_Dslash"
+
+              # QMP: TODO AC_CHECK_LIB
+              AC_MSG_CHECKING([where to search for QMP libs])
+              AC_ARG_WITH(qmpdir,
+                          AS_HELP_STRING([--with-qmpdir[=dir]], [if using QPhiX, then set QMP lib dir]),
+                          qmp_dir=$withval
+                          LDFLAGS="$LDFLAGS -L${qmp_dir}/lib -lqmp"
+                          INCLUDES="$INCLUDES -I${qmp_dir}/include/"
+                          )
+              AC_MSG_RESULT(${qmp_dir:-none given})
+
+              AC_MSG_CHECKING([which SOALEN to use for QPhiX])
+              AC_ARG_ENABLE(qphix-soalen,
+                            AS_HELP_STRING([--enable-qphix-soalen], [if using QPhiX, set SOALEN [default=4]]),
+                            enable_qphix_soalen=$enableval, enable_qphix_soalen=4)
+              AC_MSG_RESULT($enable_qphix_soalen)
+              AC_DEFINE_UNQUOTED(QPHIX_SOALEN, ${enable_qphix_soalen}, Structure of Array length to use with QPhiX)
+
+              AC_PROG_CXX
+              # QPhiX needs to be linked with the C++ linker
+              CCLD=${CXX}
+             ],
+             [AC_MSG_RESULT(no)
+              QPHIX_AVAILABLE=0
+              QPHIX_INTERFACE=""])
+AC_SUBST([QPHIX_AVAILABLE])
+
+
 AC_MSG_CHECKING(checking consistency)
 if test $enable_mpi = yes ; then
  if test $enable_iig = yes && test $withpersistent = yes ; then
@@ -911,7 +1083,7 @@ if test ! -e tests/regressions; then
 fi
 
 
-LIBS="-lhmc -lmonomial -loperator -lsolver -linit -llinalg -lhmc -lxchange -lrational -lio -lbuffers $LIBS"
+LIBS="-lhmc -lmonomial -loperator -lsolver -linit -lbuffers -lmeas -llinalg -lhmc -lxchange -lrational -lio -lcontractions  $LIBS"
 AUTOCONF=autoconf
 
 for i in $USESUBDIRS
diff --git a/contractions/Makefile.in b/contractions/Makefile.in
new file mode 100644
index 000000000..cc4192562
--- /dev/null
+++ b/contractions/Makefile.in
@@ -0,0 +1,84 @@
+
+srcdir = @srcdir@
+top_builddir =  @top_builddir@
+abs_top_builddir = @abs_top_builddir@
+top_srcdir = @top_srcdir@
+abs_top_srcdir = @abs_top_srcdir@
+subdir = contractions
+builddir = @builddir@
+
+CFLAGS = @CFLAGS@
+DEPFLAGS = @DEPFLAGS@
+LDFLAGS = @LDFLAGS@
+DEFS = @DEFS@
+OPTARGS = @OPTARGS@
+SOPTARGS = @SOPTARGS@
+
+AR = @AR@
+RANLIB = @RANLIB@
+CC = @CC@
+CCDEP = @CCDEP@
+CCLD = ${CC}
+LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@
+LEX = @LEX@
+AUTOCONF = @AUTOCONF@
+
+# include paths and the compile command used by the object rule below
+INCLUDES = @INCLUDES@
+LDADD =
+COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS}
+
+# the static library built in this directory and its sources, listed without extension
+LIBRARIES = libcontractions
+libcontractions_TARGETS = contractions_helper contractions_FP contractions_checks contractions_currentdensityextended
+
+libcontractions_OBJECTS = $(addsuffix .o, ${libcontractions_TARGETS})
+
+# default rule
+
+all: Makefile dep libcontractions.a
+
+# rules for debugging
+debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@
+debug all-debug: all
+
+# rules for profiling information
+profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@
+profile all-profile: all
+
+# include the generated dependency files of this library; the leading dash tolerates missing files
+-include $(addsuffix .d,${libcontractions_TARGETS})
+
+include ${top_srcdir}/Makefile.global
+
+# rule to compile objects
+${libcontractions_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
+	$(COMPILE) ${OPTARGS} -c $<
+
+# rule to make libcontractions and copy it into ../lib
+libcontractions.a: ${libcontractions_OBJECTS} Makefile
+	@rm -f libcontractions.a
+	@${AR} cru libcontractions.a ${libcontractions_OBJECTS}
+	@$(RANLIB) libcontractions.a
+	@cp libcontractions.a ../lib/libcontractions.a
+
+# rule to generate .d files
+$(addsuffix .d, ${libcontractions_TARGETS}) : %.d: ${srcdir}/%.c Makefile
+	@${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@
+
+# rule to make dependencies
+dep: ${addsuffix .d, ${libcontractions_TARGETS}}
+
+# rules to clean
+
+compile-clean: Makefile
+	rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d
+
+clean: compile-clean
+	rm -f $(addsuffix .a, ${LIBRARIES})
+	rm -f ../lib/libcontractions.a
+
+distclean: clean
+	rm -f Makefile
+
+.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug
diff --git a/contractions/contractions_FP.c b/contractions/contractions_FP.c
new file mode 100644
index 000000000..9f7dfc117
--- /dev/null
+++ b/contractions/contractions_FP.c
@@ -0,0 +1,1773 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017 Ferenc Pittler
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_BSM
+#include"lime.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <errno.h>
+#include <time.h>
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+#include "global.h"
+#include "getopt.h"
+#include "default_input_values.h"
+#include "read_input.h"
+#include "su3.h"
+#include "operator/tm_operators.h"
+#include "linalg_eo.h"
+#include "geometry_eo.h"
+#include "linalg/assign.h"
+#include "operator/D_psi.h"
+#include "operator/D_psi_BSM.h"
+#include "operator/D_psi_BSM2b.h"
+#include "operator/D_psi_BSM2f.h"
+#include "operator/D_psi_BSM2m.h"
+#include "operator/Dov_psi.h"
+#include "operator/tm_operators_nd.h"
+#include "operator/Hopping_Matrix.h"
+#include "invert_eo.h"
+#include "invert_doublet_eo.h"
+#include "invert_overlap.h"
+#include "invert_clover_eo.h"
+#include "init/init_scalar_field.h"
+#include "init/init_bsm_2hop_lookup.h"
+#include "boundary.h"
+#include "start.h"
+#include "solver/solver.h"
+#include "xchange/xchange_gauge.h"
+#include "prepare_source.h"
+#include <io/params.h>
+#include <io/gauge.h>
+#include <io/spinor.h>
+#include <io/utils.h>
+#include "io/scalar.h"
+#include "buffers/utils_nonblocking.h"
+#include "buffers/utils_nogauge.h"
+#include "test/overlaptests.h"
+#include "solver/index_jd.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
+#include "operator.h"
+#include "gettime.h"
+#include "measure_gauge_action.h"
+#include "mpi_init.h"
+#include "init/init_geometry_indices.h"
+#include "init/init_openmp.h"
+#include "init/init_gauge_field.h"
+#include "init/init_spinor_field.h"
+#include "init/init_bispinor_field.h"
+#include "solver/solver_field.h"
+#include "ranlxd.h"
+#include "contractions/contractions_helper.h"
+
+extern int DAGGER;
+extern int NO_DAGG;
+
+extern int GAMMA_UP;
+extern int GAMMA_DN;
+extern int NO_GAMMA;
+
+extern int WITH_SCALAR;
+extern int NO_SCALAR;
+
+extern int TYPE_A;
+extern int TYPE_B;
+
+extern int TYPE_1;
+extern int TYPE_2;
+extern int TYPE_3;
+extern int TYPE_4;
+
+extern int TYPE_I;
+extern int TYPE_II;
+   
+extern int RIGHT;
+extern int LEFT;
+
+
+void density_density_1234_s0s0( bispinor ** propfields, int type_1234, _Complex double **results ){
+   /* Density-density correlator built with taui_scalarfield_spinor_s0s0 for the combination
+    * selected by type_1234 (TYPE_1 .. TYPE_4; any other value terminates the program).
+    * *results is allocated here with T_global entries and is not freed by this function;
+    * entry i receives flavortrace[i]/4, and the process with g_cart_id == 0 prints the
+    * same numbers tagged DDS0S0. */
+   int ix,i;
+   int f1,c1,s1;
+   int spinorstart=0, spinorend=4;
+   bispinor running;
+   _Complex double *colortrace, *spacetrace, *spinortrace, *flavortrace;
+   int type;
+   colortrace  = malloc(sizeof(*colortrace)*8);
+   spacetrace  = malloc(sizeof(*spacetrace)*8*T_global);
+   spinortrace = malloc(sizeof(*spinortrace)*2*T_global);
+   flavortrace = malloc(sizeof(*flavortrace)*T_global);
+
+   *results = malloc(sizeof(**results)*T_global);
+   if (*results == NULL){
+     printf("Error in memory allocation for results in s0s0\n");
+     exit(1);
+   }
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) )
+   {
+     printf("Error in mem allocation in density_density_1234_s0s0\n");
+     exit(1);
+   }
+   // range of s1 traced over: 0,1 for TYPE_1/TYPE_3 and 2,3 for TYPE_2/TYPE_4
+   if ( ( type_1234 == TYPE_1 )|| ( type_1234 == TYPE_3 ) ) {
+     spinorstart=0;
+     spinorend  =2;
+   }
+   else if ( ( type_1234 == TYPE_2) || (type_1234 == TYPE_4) ){
+     spinorstart=2;
+     spinorend  =4;
+   }
+   else{
+     fprintf(stdout,"Wrong argument for type_1234, it can only be TYPE_1, TYPE_2, TYPE_3 or TYPE_4\n");
+     exit(1);
+   }
+
+   for (i=0; i<T_global; ++i)
+      flavortrace[i]=0.;
+   // Trace over flavor space
+   for (f1=0; f1<2; ++f1){
+      // Trace over the spinor indices
+      for (i=0; i<2*T_global; ++i)
+         spinortrace[i]=0.;
+
+      for (s1= spinorstart; s1<spinorend; ++s1){
+
+         // Trace over the spatial indices
+         for (i=0; i<8*T_global; ++i)
+            spacetrace[i]=0.;
+
+         for (ix = 0; ix< VOLUME; ++ix){
+
+            // Trace over the color indices for each site
+            for (i=0; i<8; ++i)
+               colortrace[i]=0.;
+            for (c1=0; c1<3; ++c1){
+/*
+       TYPE  1 OR  2            (1-g5)/2*S(x  ,ytilde) fixed indices (c1, s1, f1)
+       TYPE  3 OR  4            (1+g5)/2*S(x  ,ytilde) running indices bispinor
+*/
+
+               // keep only s2,s3 (TYPE_1/2) or s0,s1 (TYPE_3/4) of both sp_up and sp_dn
+               if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_2) ){
+                 _vector_null( running.sp_up.s0 );
+                 _vector_null( running.sp_up.s1 );
+                 _vector_assign( running.sp_up.s2, propfields[12*s1+4*c1+2*f1][ix].sp_up.s2 );
+                 _vector_assign( running.sp_up.s3, propfields[12*s1+4*c1+2*f1][ix].sp_up.s3 );
+                 _vector_null( running.sp_dn.s0 );
+                 _vector_null( running.sp_dn.s1 );
+                 _vector_assign( running.sp_dn.s2, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s2 );
+                 _vector_assign( running.sp_dn.s3, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s3 );
+               }
+               else { // TYPE_3 or TYPE_4, the only values left after the check above
+                 _vector_null( running.sp_up.s2 );
+                 _vector_null( running.sp_up.s3 );
+                 _vector_assign( running.sp_up.s0, propfields[12*s1+4*c1+2*f1][ix].sp_up.s0 );
+                 _vector_assign( running.sp_up.s1, propfields[12*s1+4*c1+2*f1][ix].sp_up.s1 );
+                 _vector_null( running.sp_dn.s2 );
+                 _vector_null( running.sp_dn.s3 );
+                 _vector_assign( running.sp_dn.s0, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s0 );
+                 _vector_assign( running.sp_dn.s1, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s1 );
+               }
+
+/*
+       TYPE  1 OR  2     phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
+       TYPE  3 OR  4     tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
+*/
+               if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_2)){
+                 taui_scalarfield_spinor_s0s0( &running, &running, GAMMA_DN, ix, NODIR, DAGGER );
+               }
+               else {
+                 taui_scalarfield_spinor_s0s0( &running, &running, GAMMA_UP, ix, NODIR, NO_DAGG);
+               }
+/*
+       TYPE  1 OR  2     S(ytilde, x)*phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
+       TYPE  3 OR  4     S(ytilde, x)*tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
+*/
+               multiply_backward_propagator(&running, propfields, &running, ix, NODIR );
+
+               trace_in_color(colortrace,&running,c1);
+
+            }  // End of trace color
+
+            trace_in_space(spacetrace,colortrace,ix);
+
+         } // End of trace space
+
+         // Gather the results from all nodes to complete the trace in space
+
+#if defined TM_USE_MPI
+         /* Reduce the whole buffer in place with a single collective instead of 8*T_global
+          * one-element calls; g_cart_grid is the communicator density_density_1234 uses
+          * for the same reduction. */
+         MPI_Allreduce(MPI_IN_PLACE, spacetrace, 8*T_global, MPI_DOUBLE_COMPLEX, MPI_SUM,
+                       g_cart_grid);
+#endif
+         trace_in_spinor(spinortrace, spacetrace, s1);
+      } // End of trace in spinor space
+
+      // Apply the scalar field factor written leftmost in the expressions below
+/*
+       TYPE  1      tau_i*phi(ytilde)*       (1+gamma5)/2*S(ytilde, x)*phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
+       TYPE  2      phi(ytilde)^dagger*tau_i*(1-gamma5)/2*S(ytilde, x)*phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
+       TYPE  3      tau_i*phi(ytilde)*       (1+gamma5)/2*S(ytilde, x)*tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
+       TYPE  4      phi(ytilde)^dagger*tau_i*(1-gamma5)/2*S(ytilde, x)*tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
+*/
+      if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_3) ){
+         taui_scalarfield_flavoronly_s0s0( spinortrace, NO_DAGG );
+      }
+      else { // TYPE_2 or TYPE_4
+         taui_scalarfield_flavoronly_s0s0( spinortrace, DAGGER  );
+      }
+
+      trace_in_flavor( flavortrace, spinortrace, f1 );
+   } // End of trace in flavor space
+
+   type = type_1234 == TYPE_1 ? 1 : type_1234 == TYPE_2 ? 2 : type_1234 == TYPE_3 ? 3 : 4 ;
+   if (g_cart_id == 0){ printf( "Density Density correlator type (%d) results\n", type);}
+   for (i=0; i<T_global; ++i){
+      if (g_cart_id == 0){
+        printf("DDS0S0 %d %.3d %10.10e %10.10e\n", type, i, creal(flavortrace[i])/4.,cimag(flavortrace[i])/4.);
+        fflush(stdout);
+      }
+      (*results)[i]=flavortrace[i]/4.;
+   }
+
+   free(flavortrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+
+}
+
+
+void density_density_1234( bispinor ** propfields, int type_1234, _Complex double  **results ){
+   /* Density-density correlators for the three tau matrices (tauindex 0..2) and their sum,
+    * for the combination selected by type_1234 (TYPE_1 .. TYPE_4; anything else exits).
+    * *results is allocated here with 4*T_global entries and is not freed by this function:
+    * entries [tauindex*T_global + i] hold flavortrace[i]/4 for that tau matrix, entries
+    * [3*T_global + i] hold the sum over the three matrices divided by 4. */
+   int ix,i;
+   int f1,c1,s1,tauindex;
+   int spinorstart=0, spinorend=4;
+   bispinor running;
+   _Complex double *colortrace, *spacetrace, *spinortrace, *flavortrace, *paulitrace;
+   int type;
+
+   colortrace  = malloc(sizeof(*colortrace)*8);
+   spacetrace  = malloc(sizeof(*spacetrace)*8*T_global);
+   spinortrace = malloc(sizeof(*spinortrace)*2*T_global);
+   flavortrace = malloc(sizeof(*flavortrace)*T_global);
+   paulitrace  = malloc(sizeof(*paulitrace)*T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) || (paulitrace == NULL) )
+   {
+     printf("Error in mem allocation in density_density_1234\n");
+     exit(1);
+   }
+   *results = malloc(sizeof(**results)*4*T_global);
+   if (*results == NULL){
+     printf("Not enough memory for the results\n");
+     exit(1);
+   }
+   // range of s1 traced over: 0,1 for TYPE_1/TYPE_3 and 2,3 for TYPE_2/TYPE_4
+   if ( ( type_1234 == TYPE_1 )|| ( type_1234 == TYPE_3 ) ) {
+     spinorstart=0;
+     spinorend  =2;
+   }
+   else if ( ( type_1234 == TYPE_2) || (type_1234 == TYPE_4) ){
+     spinorstart=2;
+     spinorend  =4;
+   }
+   else{
+     if (g_cart_id ==0) fprintf(stdout, "Wrong argument for type_1234, it can only be TYPE_1, TYPE_2, TYPE_3, TYPE_4\n");
+     exit(1);
+   }
+
+   // Trace over the Pauli matrices
+   for (i=0; i<T_global; ++i)
+      paulitrace[i]=0.;
+
+   for (tauindex=0; tauindex<3; ++tauindex){
+
+      // Trace over up and down flavors
+      for (i=0; i<T_global; ++i)
+         flavortrace[i]=0.;
+
+      for (f1=0; f1<2; ++f1){
+         // Trace over the spinor indices: only the two spinor components
+         // that appear in the final spinor are traced over
+         for (i=0; i<2*T_global; ++i)
+            spinortrace[i]=0.;
+
+         for (s1= spinorstart; s1<spinorend; ++s1){
+
+            // Trace over the spatial indices
+            for (i=0; i<8*T_global; ++i)
+               spacetrace[i]=0.;
+
+            for (ix = 0; ix< VOLUME; ++ix){
+
+               // Trace over the color indices for each site
+
+               for (i=0; i<8; ++i)
+                  colortrace[i]=0.;
+               for (c1=0; c1<3; ++c1){
+/*
+       TYPE  1 OR  2            (1-g5)/2*S(x  ,ytilde) fixed indices (c1, s1, f1)
+       TYPE  3 OR  4            (1+g5)/2*S(x  ,ytilde) running indices bispinor
+*/
+
+                  // keep only s2,s3 (TYPE_1/2) or s0,s1 (TYPE_3/4) of both sp_up and sp_dn
+                  if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_2) ){
+                    _vector_null( running.sp_up.s0 );
+                    _vector_null( running.sp_up.s1 );
+                    _vector_assign( running.sp_up.s2, propfields[12*s1+4*c1+2*f1][ix].sp_up.s2 );
+                    _vector_assign( running.sp_up.s3, propfields[12*s1+4*c1+2*f1][ix].sp_up.s3 );
+                    _vector_null( running.sp_dn.s0 );
+                    _vector_null( running.sp_dn.s1 );
+                    _vector_assign( running.sp_dn.s2, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s2 );
+                    _vector_assign( running.sp_dn.s3, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s3 );
+                  }
+                  else { // TYPE_3 or TYPE_4, the only values left after the check above
+                    _vector_null( running.sp_up.s2 );
+                    _vector_null( running.sp_up.s3 );
+                    _vector_assign( running.sp_up.s0, propfields[12*s1+4*c1+2*f1][ix].sp_up.s0 );
+                    _vector_assign( running.sp_up.s1, propfields[12*s1+4*c1+2*f1][ix].sp_up.s1 );
+                    _vector_null( running.sp_dn.s2 );
+                    _vector_null( running.sp_dn.s3 );
+                    _vector_assign( running.sp_dn.s0, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s0 );
+                    _vector_assign( running.sp_dn.s1, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s1 );
+                  }
+
+/*
+       TYPE  1 OR  2     phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
+       TYPE  3 OR  4     tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
+*/
+                  if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_2)){
+                    taui_scalarfield_spinor( &running, &running, GAMMA_DN, tauindex, ix, NODIR, DAGGER );
+                  }
+                  else {
+                    taui_scalarfield_spinor( &running, &running, GAMMA_UP, tauindex, ix, NODIR, NO_DAGG);
+                  }
+/*
+       TYPE  1 OR  2     S(ytilde, x)*phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
+       TYPE  3 OR  4     S(ytilde, x)*tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
+*/
+                  multiply_backward_propagator(&running, propfields, &running, ix, NODIR );
+
+                  // delta( color component of bispinor running, c1) for all spinor and flavor indices
+                  trace_in_color(colortrace,&running,c1);
+
+               }  // End of trace color
+               // sum over all lattice sites the result of the color trace
+               trace_in_space(spacetrace,colortrace,ix);
+
+            } // End of trace space
+
+            // Gather the results from all nodes to complete the trace in space
+#if defined TM_USE_MPI
+            /* One in-place collective over the whole buffer replaces 8*T_global separate
+             * one-element reductions; each element is still summed over the same
+             * communicator. */
+            MPI_Allreduce(MPI_IN_PLACE, spacetrace, 8*T_global, MPI_DOUBLE_COMPLEX, MPI_SUM,
+                          g_cart_grid);
+#endif
+            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+            trace_in_spinor(spinortrace, spacetrace, s1);
+         } // End of trace in spinor space
+/*
+       TYPE  1      tau_i*phi(ytilde)*       (1+gamma5)/2*S(ytilde, x)*phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
+       TYPE  2      phi(ytilde)^dagger*tau_i*(1-gamma5)/2*S(ytilde, x)*phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
+       TYPE  3      tau_i*phi(ytilde)*       (1+gamma5)/2*S(ytilde, x)*tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
+       TYPE  4      phi(ytilde)^dagger*tau_i*(1-gamma5)/2*S(ytilde, x)*tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
+*/
+         if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_3) ){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG, LEFT );
+         }
+         else { // TYPE_2 or TYPE_4
+           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER, LEFT  );
+         }
+         // delta(flavor component in spinortrace, f1) for all time slices
+         trace_in_flavor( flavortrace, spinortrace, f1 );
+      } // End of trace in flavor space
+      type = type_1234 == TYPE_1 ? 1 : type_1234 == TYPE_2 ? 2 : type_1234 == TYPE_3 ? 3 : 4 ;
+      if (g_cart_id == 0){printf("Density Density correlator type (%d) for tau matrix %d results\n", type, tauindex);}
+      for (i=0; i<T_global; ++i){
+        if (g_cart_id == 0){
+         printf( "DDTAU%dTAU%d %d %.3d %10.10e %10.10e\n", tauindex,tauindex,type, i, creal(flavortrace[i])/4.,cimag(flavortrace[i])/4.);
+         fflush(stdout);
+        }
+        (*results)[i+T_global*tauindex]=flavortrace[i]/4.;
+      }
+      // sum for all Pauli matrices
+      for (i=0;i<T_global; ++i)
+         paulitrace[i]+=flavortrace[i];
+   } // End of trace for Pauli matrices
+
+   type = type_1234 == TYPE_1 ? 1 : type_1234 == TYPE_2 ? 2 : type_1234 == TYPE_3 ? 3 : 4 ;
+   if (g_cart_id == 0){printf("Density Density correlator type (%d) results\n", type);}
+   for (i=0; i<T_global; ++i){
+      if (g_cart_id == 0){
+        printf("DD %d %.3d %10.10e %10.10e\n", type, i, creal(paulitrace[i])/4.,cimag(paulitrace[i])/4.);
+        fflush(stdout);
+      }
+      (*results)[i+3*T_global]=paulitrace[i]/4.;
+   }
+   free(flavortrace);
+   free(paulitrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+
+}
+/*
+ * giancarlodensity
+ *
+ * For every flavor index (0,1), spinor index (2,3), local lattice site and
+ * color index (0,1,2) the routine
+ *   - copies the s0 and s1 components (sp_up and sp_dn) of
+ *     propfields[12*spin+4*col+2*flav] at that site into a scratch bispinor
+ *     and zeroes its s2 and s3 components,
+ *   - applies mult_phi(..., NO_DAGG), wrapped on both sides by
+ *     taui_spinor(..., 2) when tau3 == 1,
+ *   - applies multiply_backward_propagator(..., NODIR),
+ * and accumulates the result through trace_in_color, trace_in_space,
+ * trace_in_spinor and finally trace_in_flavor.  Before the flavor trace
+ * mult_phi_flavoronly(..., DAGGER) is applied, again wrapped by
+ * mult_taui_flavoronly(..., 2) when tau3 == 1.
+ *
+ * propfields : propagator fields, addressed as [12*spin+4*col+2*flav][site]
+ * tau3       : 1 switches the extra tau insertions on, any other value off
+ * results    : on return *results points to a freshly malloc'ed array with
+ *              T_global entries holding flavortrace[t]/4; it is not freed here
+ *
+ * The process with g_cart_id == 0 additionally prints every entry as a
+ * "GIANCARLO" line.  The program exits with status 1 if an allocation fails.
+ */
+void giancarlodensity( bispinor ** propfields, int tau3, _Complex double  **results ){
+   const int with_tau3  = (tau3 == 1);
+   const int first_spin = 2;
+   const int last_spin  = 4;
+   int site, k;
+   int flav, col, spin;
+   bispinor running;
+
+   _Complex double *colortrace  = (_Complex double *)malloc(sizeof(_Complex double) *8);
+   _Complex double *spacetrace  = (_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
+   _Complex double *spinortrace = (_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   _Complex double *flavortrace = (_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+   if ( !colortrace || !spacetrace || !spinortrace || !flavortrace ){
+     printf("Error in mem allocation in giancarlo\n");
+     exit(1);
+   }
+
+   *results = (_Complex double *)malloc(sizeof(_Complex double)*T_global);
+   if ( !*results ){
+     printf("Not enough memory for the results\n");
+     exit(1);
+   }
+
+   /* Accumulator of the flavor trace, one entry per time slice */
+   for (k = 0; k < T_global; ++k){
+     flavortrace[k] = 0.;
+   }
+
+   for (flav = 0; flav < 2; ++flav){
+
+     /* Accumulator of the spinor trace; only the spinor indices
+        first_spin .. last_spin-1 are summed */
+     for (k = 0; k < 2*T_global; ++k){
+       spinortrace[k] = 0.;
+     }
+
+     for (spin = first_spin; spin < last_spin; ++spin){
+
+       /* Accumulator of the sum over the local lattice sites */
+       for (k = 0; k < 8*T_global; ++k){
+         spacetrace[k] = 0.;
+       }
+
+       for (site = 0; site < VOLUME; ++site){
+
+         /* Accumulator of the color trace at this site */
+         for (k = 0; k < 8; ++k){
+           colortrace[k] = 0.;
+         }
+
+         for (col = 0; col < 3; ++col){
+           const int field = 12*spin + 4*col + 2*flav;
+
+           /* Keep the s0,s1 components of both flavor halves, drop s2,s3 */
+           _vector_assign( running.sp_up.s0, propfields[field][site].sp_up.s0 );
+           _vector_assign( running.sp_up.s1, propfields[field][site].sp_up.s1 );
+           _vector_null( running.sp_up.s2 );
+           _vector_null( running.sp_up.s3 );
+           _vector_assign( running.sp_dn.s0, propfields[field][site].sp_dn.s0 );
+           _vector_assign( running.sp_dn.s1, propfields[field][site].sp_dn.s1 );
+           _vector_null( running.sp_dn.s2 );
+           _vector_null( running.sp_dn.s3 );
+
+           /* Scalar field insertion, optionally sandwiched between two tau insertions */
+           if ( with_tau3 ){
+             taui_spinor( &running, &running, 2);
+           }
+           mult_phi(&running, &running, site, NO_DAGG);
+           if ( with_tau3 ){
+             taui_spinor( &running, &running, 2);
+           }
+
+           multiply_backward_propagator(&running, propfields, &running, site, NODIR );
+
+           /* delta( color component of bispinor running, col ) for all spinor and flavor indices */
+           trace_in_color(colortrace, &running, col);
+         }
+
+         /* Add the color trace of this site to the sum over sites */
+         trace_in_space(spacetrace, colortrace, site);
+       }
+
+       /* Combine the partial sums of all processes */
+#if defined TM_USE_MPI
+       for (k = 0; k < 8*T_global; ++k){
+          _Complex double summed;
+          MPI_Allreduce(&spacetrace[k], &summed, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+          spacetrace[k] = summed;
+       }
+#endif
+       /* delta( spinor components of spacetrace, spin ) for all time slices and flavor indices */
+       trace_in_spinor(spinortrace, spacetrace, spin);
+     }
+
+     /* Flavor-only scalar field insertion, optionally sandwiched between two tau insertions */
+     if ( with_tau3 ){
+       mult_taui_flavoronly(spinortrace, 2);
+     }
+     mult_phi_flavoronly(spinortrace, DAGGER);
+     if ( with_tau3 ){
+       mult_taui_flavoronly(spinortrace, 2);
+     }
+
+     trace_in_flavor( flavortrace, spinortrace, flav );
+   }
+
+   if (g_cart_id == 0){
+     printf("Giancarlo correlator  (%s) for tau3 results\n", with_tau3 ? "with" : "without" );
+     fflush(stdout);
+   }
+   for (k = 0; k < T_global; ++k){
+     if (g_cart_id == 0){
+       printf( "GIANCARLO %.3d %10.10e %10.10e\n", k, creal(flavortrace[k])/4.,cimag(flavortrace[k])/4.);
+       fflush(stdout);
+     }
+     (*results)[k] = flavortrace[k]/4.;
+   }
+
+   free(colortrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(flavortrace);
+
+}
+/*
+ * density_ptau_density_vector
+ *
+ * For each Pauli index tauindex in {0,1,2} the routine accumulates, per time
+ * slice, the trace over flavor (f1), spinor (s1 = 0..3), local lattice sites
+ * and color (c1) of the chain annotated step by step in the loop body:
+ *   TYPE_1: tau_i*gamma5*S(ytilde,x)*{tau_i;phi(x)}       *(1+g5)/2*S(x,ytilde)
+ *   TYPE_2: tau_i*gamma5*S(ytilde,x)*{tau_i;phi^dagger(x)}*(1-g5)/2*S(x,ytilde)
+ *
+ * propfields : propagator fields, addressed as [12*s1+4*c1+2*f1][ix]
+ * type_12    : TYPE_1 or TYPE_2; any other value terminates the program on
+ *              every process with exit status 1
+ * results    : on return *results points to a freshly malloc'ed array with
+ *              4*T_global entries that is not freed here:
+ *                [i + tauindex*T_global] = flavortrace[i]/4 for tauindex 0..2
+ *                [i + 3*T_global]        = (sum over tauindex)/4
+ *
+ * The process with g_cart_id == 0 also prints the values ("P<n>DP<n>" lines
+ * per Pauli index, "PDP" lines for the sum).  The program exits with status 1
+ * if an allocation fails.
+ */
+void density_ptau_density_vector( bispinor **propfields, int type_12,_Complex double **results){
+
+   int ix,i;
+   int f1,c1,s1,tauindex;
+   const int spinorstart=0, spinorend=4;
+   bispinor running;
+
+   _Complex double *colortrace;
+   _Complex double *spacetrace;
+   _Complex double *spinortrace;
+   _Complex double *flavortrace;
+   _Complex double *paulitrace;
+   int type;
+
+   /* Validate the type once, before any work, and leave on ALL processes.
+      Exiting on process 0 only would let the remaining processes continue
+      with an uninitialised bispinor and then wait in MPI_Allreduce for a
+      process that is already gone. */
+   if ( (type_12 != TYPE_1) && (type_12 != TYPE_2) ){
+     if (g_cart_id ==0) {printf("Wrong type of argument in ptau density contraction\n");}
+     exit(1);
+   }
+   type = (type_12 == TYPE_1) ? 1 : 2;
+
+   colortrace= (_Complex double *)malloc(sizeof(_Complex double)*8);
+   spacetrace= (_Complex double *)malloc(sizeof(_Complex double)*8*T_global);
+   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+   paulitrace= (_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) || (paulitrace == NULL) )
+   {
+     printf("Error in mem allocation in density_ptau_density_vector\n");
+     exit(1);
+   }
+   *results=(_Complex double *)malloc(sizeof(_Complex double)*4*T_global);
+   if (*results == NULL){
+     printf("Error in ptau anti commutator memory allocation routine\n");
+     exit(1);
+   }
+
+//Trace over the Pauli matrices
+   for (i=0; i<T_global; ++i)
+      paulitrace[i]=0.;
+   for (tauindex=0; tauindex<3; ++tauindex){
+//Trace over up and down flavors
+     for (i=0; i<T_global; ++i)
+       flavortrace[i]=0.;
+     for (f1=0; f1<2; ++f1){
+//Trace over the spinor indices; all four spinor components are summed here
+       for (i=0; i<2*T_global; ++i)
+         spinortrace[i]=0.;
+       for (s1= spinorstart; s1<spinorend; ++s1){
+//Trace over the spatial indices
+         for (i=0; i<8*T_global; ++i)
+           spacetrace[i]=0.;
+         for (ix = 0; ix< VOLUME; ++ix){
+//Trace over the color indices for each sites
+           for (i=0; i<8; ++i)
+             colortrace[i]=0.;
+           for (c1=0; c1<3; ++c1){
+/*
+       TYPE  1         (1+g5)/2*S(x  ,ytilde) fixed indices (c1, s1, f1)
+       TYPE  2         (1-g5)/2*S(x  ,ytilde) running indices bispinor
+*/
+             if ( type_12 == TYPE_1 ){
+                 //keep the s0,s1 components, drop s2,s3
+                 _vector_null( running.sp_up.s2 );
+                 _vector_null( running.sp_up.s3 );
+                 _vector_assign( running.sp_up.s0, propfields[12*s1+4*c1+2*f1][ix].sp_up.s0 );
+                 _vector_assign( running.sp_up.s1, propfields[12*s1+4*c1+2*f1][ix].sp_up.s1 );
+                 _vector_null( running.sp_dn.s2 );
+                 _vector_null( running.sp_dn.s3 );
+                 _vector_assign( running.sp_dn.s0, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s0 );
+                 _vector_assign( running.sp_dn.s1, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s1 );
+             }
+             else{
+                 //TYPE_2 (guaranteed by the check at the top): keep s2,s3, drop s0,s1
+                 _vector_null( running.sp_up.s0 );
+                 _vector_null( running.sp_up.s1 );
+                 _vector_assign( running.sp_up.s2, propfields[12*s1+4*c1+2*f1][ix].sp_up.s2 );
+                 _vector_assign( running.sp_up.s3, propfields[12*s1+4*c1+2*f1][ix].sp_up.s3 );
+                 _vector_null( running.sp_dn.s0 );
+                 _vector_null( running.sp_dn.s1 );
+                 _vector_assign( running.sp_dn.s2, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s2 );
+                 _vector_assign( running.sp_dn.s3, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s3 );
+             }
+/*
+       TYPE  1         {tau_i;phi(x)}       *(1+g5)/2*S(x  ,ytilde) fixed indices (c1, s1, f1)
+       TYPE  2         {tau_i;phi^dagger(x)}*(1-g5)/2*S(x  ,ytilde) running indices bispinor
+*/
+
+             if ( type_12 == TYPE_1 ){
+                 phix_taui_anti_commutator_bispinor( &running,tauindex, GAMMA_UP, NO_DAGG, ix );
+             }
+             else{
+                 phix_taui_anti_commutator_bispinor( &running,tauindex, GAMMA_DN, DAGGER, ix );
+             }
+/*
+       TYPE  1     S(ytilde, x)*{tau_i;phi(x)}       *(1+g5)/2*S(x  ,ytilde) fixed indices (c1, s1, f1)
+       TYPE  2     S(ytilde, x)*{tau_i;phi^dagger(x)}*(1-g5)/2*S(x  ,ytilde) running indices bispinor
+*/
+             multiply_backward_propagator(&running, propfields, &running, ix, NODIR );
+
+/*
+       TYPE  1     gamma5*S(ytilde, x)*{tau_i;phi(x)}       *(1+g5)/2*S(x  ,ytilde) fixed indices (c1, s1, f1)
+       TYPE  2     gamma5*S(ytilde, x)*{tau_i;phi^dagger(x)}*(1-g5)/2*S(x  ,ytilde) running indices bispinor
+*/
+             bispinor_timesgamma5( &running );
+
+/*
+       TYPE  1     tau_i*gamma5*S(ytilde, x)*{tau_i;phi(x)}       *(1+g5)/2*S(x  ,ytilde) fixed indices (c1, s1, f1)
+       TYPE  2     tau_i*gamma5*S(ytilde, x)*{tau_i;phi^dagger(x)}*(1-g5)/2*S(x  ,ytilde) running indices bispinor
+*/
+             bispinor_taui( &running, tauindex);
+
+
+                  //delta( color component of bispinor running, c1) for all spinor and flavor indices
+             trace_in_color(colortrace,&running,c1);
+
+           }  //End of trace color
+               //sum over all lattice sites the result of the color trace
+           trace_in_space(spacetrace,colortrace,ix);
+
+         } //End of trace space
+
+//Gather the results from all nodes to complete the trace in space
+#if defined TM_USE_MPI
+         for (i=0; i<8*T_global; ++i){
+           _Complex double tmp;
+           MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+           spacetrace[i]= tmp;
+         }
+#endif
+            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+         trace_in_spinor(spinortrace, spacetrace, s1);
+       }//End of trace in spinor space
+
+         //delta(flavor component in spinortrace, f1) for all time slices
+       trace_in_flavor( flavortrace, spinortrace, f1 );
+     } //End of trace in flavor space
+     if (g_cart_id == 0){printf("DensityP Vector Density correlator type (%s) results\n", type_12 == TYPE_1 ? "1" : "2");}
+     for (i=0; i<T_global; ++i){
+       if (g_cart_id == 0){
+         printf("P%dDP%d %d %.3d %10.10e %10.10e\n", tauindex+1, tauindex+1, type, i, creal(flavortrace[i])/4.,cimag(flavortrace[i])/4.);
+         fflush(stdout);
+       }
+     }
+     //sum for all Pauli matrices and store the per-matrix result
+     for (i=0;i<T_global; ++i){
+       paulitrace[i]+=flavortrace[i];
+       (*results)[i+tauindex*T_global]=flavortrace[i]/4.;
+     }
+   } //End of trace for Pauli matrices
+
+   if (g_cart_id == 0){printf("DensityP Vector Density correlator type (%s) results\n", type_12 == TYPE_1 ? "1" : "2");}
+   for (i=0; i<T_global; ++i){
+      if (g_cart_id == 0){
+        printf("PDP %d %.3d %10.10e %10.10e\n", type, i, creal(paulitrace[i])/4.,cimag(paulitrace[i])/4.);
+        fflush(stdout);
+      }
+      //the sum over the three Pauli matrices goes into the last T_global entries
+      (*results)[i+3*T_global]=paulitrace[i]/4.;
+   }
+   free(flavortrace);
+   free(paulitrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+}
+/*
+ * density_density_1234_sxsx
+ *
+ * For each Pauli index tauindex in {0,1,2} the routine accumulates, per time
+ * slice, the trace over flavor, spinor, local lattice sites and color of
+ *   TYPE 1  tau_i*phi(ytilde)*       (1+gamma5)/2*S(ytilde, x)*phi^dagger(x)*tau_i*(1-g5)/2*S(x,ytilde)
+ *   TYPE 2  phi(ytilde)^dagger*tau_i*(1-gamma5)/2*S(ytilde, x)*phi^dagger(x)*tau_i*(1-g5)/2*S(x,ytilde)
+ *   TYPE 3  tau_i*phi(ytilde)*       (1+gamma5)/2*S(ytilde, x)*tau_i*phi(x)       *(1+g5)/2*S(x,ytilde)
+ *   TYPE 4  phi(ytilde)^dagger*tau_i*(1-gamma5)/2*S(ytilde, x)*tau_i*phi(x)       *(1+g5)/2*S(x,ytilde)
+ *
+ * propfields : propagator fields, addressed as [12*spin+4*col+2*flav][site]
+ * type_1234  : TYPE_1 .. TYPE_4; TYPE_1/TYPE_3 sum the spinor indices 0,1 and
+ *              TYPE_2/TYPE_4 the spinor indices 2,3; any other value
+ *              terminates the program with exit status 1
+ * results    : on return *results points to a freshly malloc'ed array with
+ *              4*T_global entries that is not freed here:
+ *                [t + tauindex*T_global] = flavortrace[t]/4 for tauindex 0..2
+ *                [t + 3*T_global]        = (sum over tauindex)/4
+ *
+ * The process with g_cart_id == 0 also prints the values ("DDS<n>S<n>" lines
+ * per Pauli index, "DD" lines for the sum).  The program exits with status 1
+ * if an allocation fails.
+ */
+void density_density_1234_sxsx( bispinor ** propfields, int type_1234, _Complex double **results ){
+   int site, t;
+   int flav, col, spin, tauindex;
+   int first_spin = 0, last_spin = 4;
+   int lower_source;   /* TYPE_1 or TYPE_2: source keeps the s2,s3 components */
+   int plain_sink;     /* TYPE_1 or TYPE_3: flavor-only insertion without dagger */
+   int type;
+   const char *label;
+   bispinor running;
+
+   _Complex double *colortrace  = (_Complex double *)malloc(sizeof(_Complex double)*8);
+   _Complex double *spacetrace  = (_Complex double *)malloc(sizeof(_Complex double)*8*T_global);
+   _Complex double *spinortrace = (_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   _Complex double *flavortrace = (_Complex double *)malloc(sizeof(_Complex double)*T_global);
+   _Complex double *paulitrace  = (_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+   if ( !colortrace || !spacetrace || !spinortrace || !flavortrace || !paulitrace ){
+     printf("Error in mem allocation in density_density_1234_sxsx\n");
+     exit(1);
+   }
+
+   *results = (_Complex double *)malloc(sizeof(_Complex double)*4*T_global);
+   if ( !*results ){
+     printf("Error in sxsx\n");
+     exit(1);
+   }
+
+   /* Select the range of spinor indices to sum over, rejecting unknown types */
+   if ( ( type_1234 == TYPE_1 ) || ( type_1234 == TYPE_3 ) ){
+     first_spin = 0;
+     last_spin  = 2;
+   }
+   else if ( ( type_1234 == TYPE_2 ) || ( type_1234 == TYPE_4 ) ){
+     first_spin = 2;
+     last_spin  = 4;
+   }
+   else{
+     if (g_cart_id ==0) fprintf(stdout, "Wrong arument for type_1234, it can only be TYPE_1, TYPE_2, TYPE_3, TYPE_4\n");
+     exit(1);
+   }
+
+   /* From here on type_1234 is one of the four known types */
+   lower_source = ( type_1234 == TYPE_1 ) || ( type_1234 == TYPE_2 );
+   plain_sink   = ( type_1234 == TYPE_1 ) || ( type_1234 == TYPE_3 );
+
+   if      ( type_1234 == TYPE_1 ){ type = 1; label = "1"; }
+   else if ( type_1234 == TYPE_2 ){ type = 2; label = "2"; }
+   else if ( type_1234 == TYPE_3 ){ type = 3; label = "3"; }
+   else                           { type = 4; label = "4"; }
+
+   /* Accumulator of the sum over the three Pauli matrices */
+   for (t = 0; t < T_global; ++t){
+     paulitrace[t] = 0.;
+   }
+
+   for (tauindex = 0; tauindex < 3; ++tauindex){
+
+     /* Accumulator of the flavor trace */
+     for (t = 0; t < T_global; ++t){
+       flavortrace[t] = 0.;
+     }
+
+     for (flav = 0; flav < 2; ++flav){
+
+       /* Accumulator of the spinor trace */
+       for (t = 0; t < 2*T_global; ++t){
+         spinortrace[t] = 0.;
+       }
+
+       for (spin = first_spin; spin < last_spin; ++spin){
+
+         /* Accumulator of the sum over the local lattice sites */
+         for (t = 0; t < 8*T_global; ++t){
+           spacetrace[t] = 0.;
+         }
+
+         for (site = 0; site < VOLUME; ++site){
+
+           /* Accumulator of the color trace at this site */
+           for (t = 0; t < 8; ++t){
+             colortrace[t] = 0.;
+           }
+
+           for (col = 0; col < 3; ++col){
+             const int field = 12*spin + 4*col + 2*flav;
+
+             if ( lower_source ){
+               /* TYPE 1 or 2:  phi^dagger(x)*tau_i*(1-g5)/2*S(x,ytilde) */
+               _vector_null( running.sp_up.s0 );
+               _vector_null( running.sp_up.s1 );
+               _vector_null( running.sp_dn.s0 );
+               _vector_null( running.sp_dn.s1 );
+               _vector_assign( running.sp_up.s2, propfields[field][site].sp_up.s2 );
+               _vector_assign( running.sp_up.s3, propfields[field][site].sp_up.s3 );
+               _vector_assign( running.sp_dn.s2, propfields[field][site].sp_dn.s2 );
+               _vector_assign( running.sp_dn.s3, propfields[field][site].sp_dn.s3 );
+
+               taui_scalarfield_spinor( &running, &running, GAMMA_DN, tauindex, site, NODIR, DAGGER );
+             }
+             else{
+               /* TYPE 3 or 4:  tau_i*phi(x)*(1+g5)/2*S(x,ytilde) */
+               _vector_null( running.sp_up.s2 );
+               _vector_null( running.sp_up.s3 );
+               _vector_null( running.sp_dn.s2 );
+               _vector_null( running.sp_dn.s3 );
+               _vector_assign( running.sp_up.s0, propfields[field][site].sp_up.s0 );
+               _vector_assign( running.sp_up.s1, propfields[field][site].sp_up.s1 );
+               _vector_assign( running.sp_dn.s0, propfields[field][site].sp_dn.s0 );
+               _vector_assign( running.sp_dn.s1, propfields[field][site].sp_dn.s1 );
+
+               taui_scalarfield_spinor( &running, &running, GAMMA_UP, tauindex, site, NODIR, NO_DAGG);
+             }
+
+             /* Prepend S(ytilde, x) */
+             multiply_backward_propagator(&running, propfields, &running, site, NODIR );
+
+             /* delta( color component of bispinor running, col ) for all spinor and flavor indices */
+             trace_in_color(colortrace, &running, col);
+           }
+
+           /* Add the color trace of this site to the sum over sites */
+           trace_in_space(spacetrace, colortrace, site);
+         }
+
+         /* Combine the partial sums of all processes */
+#if defined TM_USE_MPI
+         for (t = 0; t < 8*T_global; ++t){
+           _Complex double summed;
+           MPI_Allreduce(&spacetrace[t], &summed, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+           spacetrace[t] = summed;
+         }
+#endif
+         /* delta( spinor components of spacetrace, spin ) for all time slices and flavor indices */
+         trace_in_spinor(spinortrace, spacetrace, spin);
+       }
+
+       /* Flavor-only insertion at ytilde: tau_i*phi for TYPE 1/3, phi^dagger*tau_i for TYPE 2/4 */
+       if ( plain_sink ){
+         taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG, LEFT );
+       }
+       else{
+         taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER, LEFT  );
+       }
+
+       /* delta( flavor component in spinortrace, flav ) for all time slices */
+       trace_in_flavor( flavortrace, spinortrace, flav );
+     }
+
+     if (g_cart_id == 0){
+       printf("Density Density correlator type (%s) results\n", label);
+     }
+     for (t = 0; t < T_global; ++t){
+       if (g_cart_id == 0){
+         printf("DDS%dS%d %d %.3d %10.10e %10.10e\n", tauindex+1, tauindex+1, type, t, creal(flavortrace[t])/4.,cimag(flavortrace[t])/4.);
+         fflush(stdout);
+       }
+       /* Add to the Pauli sum and store the per-matrix result */
+       paulitrace[t] += flavortrace[t];
+       (*results)[t+tauindex*T_global] = flavortrace[t]/4.;
+     }
+   }
+
+   if (g_cart_id == 0){
+     printf("Density Density correlator type (%s) results\n", label);
+   }
+   for (t = 0; t < T_global; ++t){
+     if (g_cart_id == 0){
+       printf("DD %d %.3d %10.10e %10.10e\n", type, t, creal(paulitrace[t])/4.,cimag(paulitrace[t])/4.);
+       fflush(stdout);
+     }
+     /* The sum over the three Pauli matrices goes into the last T_global entries */
+     (*results)[t+3*T_global] = paulitrace[t]/4.;
+   }
+
+   free(colortrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(flavortrace);
+   free(paulitrace);
+}
+
+
+
+
+/*
+ * vector_axial_current_density_1234
+ *
+ * Accumulates, per time slice, the trace over flavor, spinor, local lattice
+ * sites and color of a current-density contraction and stores it divided by 4.
+ *
+ * propfields           : propagator fields, addressed as [12*s1+4*c1+2*f1][ix]
+ * type_1234            : TYPE_1 .. TYPE_4; any other value terminates the
+ *                        program with exit status 1
+ *                          TYPE_1/TYPE_3 sum the spinor indices 0,1,
+ *                          TYPE_2/TYPE_4 the spinor indices 2,3;
+ *                          TYPE_1/TYPE_2 take the propagator at ix, multiply it
+ *                          by the gauge link (NO_DAGG) and use TDOWN in the
+ *                          backward propagator,
+ *                          TYPE_3/TYPE_4 take the propagator at g_idn[ix][TUP],
+ *                          multiply it by the gauge link (DAGGER) and use NODIR
+ * taudensity           : index handed to phi0_taui_commutator /
+ *                        phi0_taui_anticommutator
+ * taucurrent           : index handed to bispinor_taui for the current
+ * vectororaxial        : 1 inserts an additional gamma5 before gamma0; the
+ *                        printed label is "VECTORCURRENT" for 0 and
+ *                        "AXIALCURRENT" for every other value
+ * scalarorpseudoscalar : 0 applies phi0_taui_commutator, 1 applies
+ *                        phi0_taui_anticommutator (NO_DAGG for TYPE_1/TYPE_3,
+ *                        DAGGER for TYPE_2/TYPE_4); other values apply neither
+ * results              : on return *results points to a freshly malloc'ed
+ *                        array with T_global entries (flavortrace[i]/4) that
+ *                        is not freed here
+ *
+ * With TM_USE_MPI the propagator fields of the selected spinor range are
+ * exchanged in direction TDOWN before the contraction starts.  The process
+ * with g_cart_id == 0 also prints the values.  The program exits with status
+ * 1 if an allocation fails.
+ */
+void vector_axial_current_density_1234( bispinor ** propfields, int type_1234,int taudensity, int taucurrent, int vectororaxial, int scalarorpseudoscalar, _Complex double **results ){
+   int ix,i;
+   int f1,c1,s1;
+   int spinorstart=0, spinorend=4;
+   bispinor running;
+
+   _Complex double *colortrace;
+   _Complex double *spacetrace;
+   _Complex double *spinortrace;
+   _Complex double *flavortrace;
+   int type;
+
+#if defined TM_USE_MPI
+   int count;
+   /* Fixed-size request/status buffers: no heap allocation that could fail
+      unnoticed before being handed to MPI */
+   MPI_Status  statuses[8];
+   MPI_Request request[8];
+#endif
+
+
+   colortrace=(_Complex double *)malloc(sizeof(_Complex double) *8);
+   spacetrace=(_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
+   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) )
+   {
+     printf("Error in mem allocation in vector_axial_current_density_1234\n");
+     exit(1);
+   }
+   *results=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+   if (*results == NULL){
+     printf("Error in vector current density\n");
+     exit(1);
+   }
+
+
+   /* Select the range of spinor indices to sum over, rejecting unknown types */
+   if ( ( type_1234 == TYPE_1 )|| ( type_1234 == TYPE_3 ) ) {
+     spinorstart=0;
+     spinorend  =2;
+   }
+   else if ( ( type_1234 == TYPE_2) || (type_1234 == TYPE_4) ){
+     spinorstart=2;
+     spinorend  =4;
+   }
+   else{
+     if (g_cart_id ==0) fprintf(stdout, "Wrong arument for type_1234, it can only be TYPE_1, TYPE_2, TYPE_3, TYPE_4\n");
+     exit(1);
+   }
+
+//Doing the neccessary communication
+#if defined TM_USE_MPI
+   for (s1=spinorstart; s1<spinorend; ++s1)
+      for (c1=0; c1<3; ++c1)
+         for (f1=0; f1<2; ++f1){
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN, request, &count );
+            MPI_Waitall( count, request, statuses);
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN, request, &count);
+            MPI_Waitall( count, request, statuses);
+         }
+#endif
+
+
+//Trace over up and down flavors
+   for (i=0; i<T_global; ++i)
+     flavortrace[i]=0.;
+   for (f1=0; f1<2; ++f1){
+//Trace over the spinor indices you have to trace only over those two spinor
+//component that appear in the final spinor
+     for (i=0; i<2*T_global; ++i)
+       spinortrace[i]=0.;
+
+     for (s1= spinorstart; s1<spinorend; ++s1){
+//Trace over the spatial indices
+       for (i=0; i<8*T_global; ++i)
+         spacetrace[i]=0.;
+       for (ix = 0; ix< VOLUME; ++ix){
+//Trace over the color indices for each sites
+         for (i=0; i<8; ++i)
+           colortrace[i]=0.;
+         for (c1=0; c1<3; ++c1){
+//Multiplication with the gauge link g_gauge_field[g_idn[ix][TUP]][TUP]
+           if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_2) ){
+            bispinor_mult_su3matrix( &running, &propfields[12*s1+4*c1+2*f1][ix], &g_gauge_field[g_idn[ix][TUP]][TUP], NO_DAGG);
+           }
+           else if ((type_1234 == TYPE_3) || ( type_1234 == TYPE_4)){
+            bispinor_mult_su3matrix( &running, &propfields[12*s1+4*c1+2*f1][g_idn[ix][TUP]],  &g_gauge_field[g_idn[ix][TUP]][TUP], DAGGER);
+           }
+
+//Additional gamma5 when vectororaxial == 1
+           if (vectororaxial == 1){
+            bispinor_timesgamma5(&running);
+           }
+//Multiplication with gamma0
+           bispinor_timesgamma0(&running);
+
+//Multiplication with tau_i input parameter for the current
+           bispinor_taui(&running, taucurrent);
+
+//Backward propagator multiplication
+           if (type_1234 == TYPE_1 || type_1234 == TYPE_2){
+              multiply_backward_propagator(&running, propfields, &running, ix, TDOWN );
+           }
+           else if (type_1234 == TYPE_3 || type_1234 == TYPE_4){
+              multiply_backward_propagator(&running, propfields, &running, ix, NODIR );
+           }
+           trace_in_color(colortrace,&running,c1);
+         }  //End of trace color
+         trace_in_space(spacetrace,colortrace,ix);
+       } //End of trace space
+//Gather the results from all nodes to complete the trace in space
+#if defined TM_USE_MPI
+       for (i=0; i<8*T_global; ++i){
+         _Complex double tmp;
+        MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+        spacetrace[i]= tmp;
+       }
+#endif
+           // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+       trace_in_spinor(spinortrace, spacetrace, s1);
+
+     }//End of trace in spinor space
+
+//Flavor-only insertion selected by scalarorpseudoscalar; values other than 0 and 1 insert nothing
+     if (scalarorpseudoscalar == 0){
+       phi0_taui_commutator( spinortrace, taudensity );
+     }
+     else if (scalarorpseudoscalar == 1){
+      if ( type_1234 == TYPE_1 || type_1234 == TYPE_3 ){
+       phi0_taui_anticommutator( spinortrace, taudensity, NO_DAGG );
+      }
+      else if ( type_1234 == TYPE_2 || type_1234 == TYPE_4 ){
+       phi0_taui_anticommutator( spinortrace, taudensity, DAGGER );
+      }
+     }
+     //delta(flavor component in spinortrace, f1) for all time slices
+     trace_in_flavor( flavortrace, spinortrace, f1 );
+   } //End of trace in flavor space
+   type = type_1234 == TYPE_1 ? 1 : type_1234 == TYPE_2 ? 2 : type_1234 == TYPE_3 ? 3 : 4 ;
+   if (g_cart_id == 0){printf("Vector current Density correlator type (%s) for tau matrix current %d density %d results\n", type_1234 == TYPE_1 ? "1" : type_1234 == TYPE_2 ? "2" : type_1234 == TYPE_3 ? "3" : "4", taucurrent, taudensity);}
+   for (i=0; i<T_global; ++i){
+     if (g_cart_id == 0){
+      if (vectororaxial == 0)
+        printf("VECTORCURRENT%dDENSITY%d %d %.3d %10.10e %10.10e\n", taucurrent,taudensity, type, i, creal(flavortrace[i])/4.,cimag(flavortrace[i])/4.);
+      else
+       printf("AXIALCURRENT%dDENSITY%d %d %.3d %10.10e %10.10e\n", taucurrent,taudensity, type, i, creal(flavortrace[i])/4.,cimag(flavortrace[i])/4.);
+      fflush(stdout);
+     }
+     (*results)[i]=flavortrace[i]/4.;
+   }
+   free(flavortrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+}
+
+
+
+/*
+ * vector_2_psueodoscalar_1_12
+ *
+ * Test variant of the vector current - density contraction with the current
+ * index fixed to 1 (bispinor_taui(..., 1)).  It accumulates, per time slice,
+ * the trace over flavor, all four spinor indices, local lattice sites and
+ * color and stores it divided by 4.
+ *
+ * propfields : propagator fields, addressed as [12*s1+4*c1+2*f1][ix]
+ * type_12    : TYPE_1 takes the propagator at ix, multiplies it by the gauge
+ *              link g_gauge_field[g_idn[ix][TUP]][TUP] (NO_DAGG) and uses
+ *              TDOWN in the backward propagator;
+ *              TYPE_2 takes the propagator at g_idn[ix][TUP], multiplies it
+ *              by the same link (DAGGER) and uses NODIR;
+ *              any other value terminates the program with exit status 1
+ * results    : on return *results points to a freshly malloc'ed array with
+ *              T_global entries (flavortrace[i]/4) that is not freed here
+ *
+ * With TM_USE_MPI all propagator fields are exchanged in direction TDOWN
+ * before the contraction starts.  The process with g_cart_id == 0 also
+ * prints the values.  The program exits with status 1 if an allocation fails.
+ */
+void vector_2_psueodoscalar_1_12( bispinor ** propfields, int type_12,_Complex double **results ){
+   int ix,i;
+   int f1,c1,s1;
+   int fx;
+   int spinorstart=0, spinorend=4;
+   bispinor running;
+
+   _Complex double *colortrace;
+   _Complex double *spacetrace;
+   _Complex double *spinortrace;
+   _Complex double *flavortrace;
+   int type;
+
+#if defined TM_USE_MPI
+   int count;
+   /* Fixed-size request/status buffers: no heap allocation that could fail
+      unnoticed before being handed to MPI */
+   MPI_Status  statuses[8];
+   MPI_Request request[8];
+#endif
+
+   /* Validate the type once, before any work.  Merely printing a message and
+      carrying on would feed an uninitialised bispinor into the contraction. */
+   if ( (type_12 != TYPE_1) && (type_12 != TYPE_2) ){
+     if (g_cart_id == 0) printf("Error in indexing in test routine\n");
+     exit(1);
+   }
+
+   colortrace=(_Complex double *)malloc(sizeof(_Complex double) *8);
+   spacetrace=(_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
+   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) )
+   {
+     printf("Error in mem allocation in vector_2_psueodoscalar_1_12\n");
+     exit(1);
+   }
+   *results=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+   if (*results == NULL){
+     printf("Error in vector current density\n");
+     exit(1);
+   }
+
+   spinorstart=0;
+   spinorend  =4;
+
+//Doing the neccessary communication
+#if defined TM_USE_MPI
+   for (s1=spinorstart; s1<spinorend; ++s1)
+      for (c1=0; c1<3; ++c1)
+         for (f1=0; f1<2; ++f1){
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN, request, &count );
+            MPI_Waitall( count, request, statuses);
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN, request, &count);
+            MPI_Waitall( count, request, statuses);
+         }
+#endif
+
+
+//Trace over up and down flavors
+   for (i=0; i<T_global; ++i)
+     flavortrace[i]=0.;
+   for (f1=0; f1<2; ++f1){
+//Trace over the spinor indices; all four spinor components are summed here
+     for (i=0; i<2*T_global; ++i)
+       spinortrace[i]=0.;
+
+     for (s1= spinorstart; s1<spinorend; ++s1){
+//Trace over the spatial indices
+       for (i=0; i<8*T_global; ++i)
+         spacetrace[i]=0.;
+       for (ix = 0; ix< VOLUME; ++ix){
+//Trace over the color indices for each sites
+         for (i=0; i<8; ++i)
+           colortrace[i]=0.;
+         for (c1=0; c1<3; ++c1){
+//Multiplication with the gauge link g_gauge_field[g_idn[ix][TUP]][TUP]
+           if ( type_12 == TYPE_1 ){
+            bispinor_mult_su3matrix( &running, &propfields[12*s1+4*c1+2*f1][ix], &g_gauge_field[g_idn[ix][TUP]][TUP], NO_DAGG);
+           }
+           else{
+            //TYPE_2, guaranteed by the check at the top
+            bispinor_mult_su3matrix( &running, &propfields[12*s1+4*c1+2*f1][g_idn[ix][TUP]],  &g_gauge_field[g_idn[ix][TUP]][TUP], DAGGER);
+           }
+
+//Multiplication with gamma0
+           bispinor_timesgamma0(&running);
+
+//Multiplication with tau_i, fixed to index 1 in this test routine
+           bispinor_taui(&running, 1);
+
+//Backward propagator multiplication
+           if ( type_12 == TYPE_1 ){
+              multiply_backward_propagator(&running, propfields, &running, ix, TDOWN );
+           }
+           else{
+              multiply_backward_propagator(&running, propfields, &running, ix, NODIR );
+           }
+           trace_in_color(colortrace,&running,c1);
+         }  //End of trace color
+         trace_in_space(spacetrace,colortrace,ix);
+       } //End of trace space
+//Gather the results from all nodes to complete the trace in space
+#if defined TM_USE_MPI
+       for (i=0; i<8*T_global; ++i){
+         _Complex double tmp;
+        MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+        spacetrace[i]= tmp;
+       }
+#endif
+//Flip the sign of entries 2 and 3 of every block of four; entries 0 and 1 stay unchanged
+       for(i=0;i<T_global;++i){
+         for (fx=0;fx<2;++fx){
+           spacetrace[8*i+4*fx+2]*=-1.;
+           spacetrace[8*i+4*fx+3]*=-1.;
+         }
+       }
+      // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+       trace_in_spinor(spinortrace, spacetrace, s1);
+
+     }//End of trace in spinor space
+//Swap the two entries stored per time slice in spinortrace
+     for (i=0;i<T_global;++i){
+       _Complex double tmpup=spinortrace[2*i+0];
+       _Complex double tmpdn=spinortrace[2*i+1];
+       spinortrace[2*i+0]=tmpdn;
+       spinortrace[2*i+1]=tmpup;
+     }
+
+     //delta(flavor component in spinortrace, f1) for all time slices
+     trace_in_flavor( flavortrace, spinortrace, f1 );
+   } //End of trace in flavor space
+   type = type_12 == TYPE_1 ? 1 : 2;
+   if (g_cart_id == 0){printf("TESTING Vector current Density correlator type (%s) for tau matrix current %d density %d results\n", type_12 == TYPE_1 ? "1" : "2", 1, 0);}
+   for (i=0; i<T_global; ++i){
+     if (g_cart_id == 0){
+       printf("VECTORCURRENT%dDENSITY%d %d %.3d %10.10e %10.10e\n", 1,0, type, i, creal(flavortrace[i])/4.,cimag(flavortrace[i])/4.);
+       fflush(stdout);
+     }
+     (*results)[i]=flavortrace[i]/4.;
+   }
+   free(flavortrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+}
+/* Vector density-density correlator of type 1..4; taudensity is forwarded
+ * unchanged to phix_taui_commutator_bispinor() and phi0_taui_commutator().
+ * On return *results points to a malloc'ed array of T_global entries holding
+ * flavortrace[t]/4; it is not freed here (presumably the caller frees it).
+ * Calls exit(1) on allocation failure or if type_1234 is not TYPE_1..TYPE_4. */
+void vector_density_density_1234( bispinor ** propfields, int type_1234,int taudensity, _Complex double **results ){
+   int ix,i;
+   int f1,c1,s1;
+   int spinorstart=0, spinorend=4;
+   bispinor running;
+   _Complex double *colortrace;
+   _Complex double *spacetrace;
+   _Complex double *spinortrace;
+   _Complex double *flavortrace;
+   int type;
+
+   colortrace=(_Complex double *)malloc(sizeof(_Complex double) *8);
+   spacetrace=(_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
+   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) )
+   {
+     printf("Error in mem allocation in vector_density_density_1234\n");
+     exit(1);
+   }
+   *results=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+   if (*results == NULL){
+     printf("Not enough memory for results in vector_density_density_1234\n");
+     exit(1);
+   }
+
+   // Types 1 and 3 restrict the s1 loop to 0,1; types 2 and 4 restrict it to 2,3
+   if ( ( type_1234 == TYPE_1 )|| ( type_1234 == TYPE_3 ) ) {
+     spinorstart=0;
+     spinorend  =2;
+   }
+   else if ( ( type_1234 == TYPE_2) || (type_1234 == TYPE_4) ){
+     spinorstart=2;
+     spinorend  =4;
+   }
+   else{
+     if (g_cart_id ==0) fprintf(stdout, "Wrong argument for type_1234, it can only be TYPE_1, TYPE_2, TYPE_3, TYPE_4\n");
+     exit(1);
+   }
+
+//Trace over up and down flavors
+   for (i=0; i<T_global; ++i)
+     flavortrace[i]=0.;
+   for (f1=0; f1<2; ++f1){
+//Trace over the spinor indices: only the two values of s1 selected above
+//(spinorstart .. spinorend-1) enter the sum
+     for (i=0; i<2*T_global; ++i)
+       spinortrace[i]=0.;
+
+     for (s1= spinorstart; s1<spinorend; ++s1){
+//Trace over the spatial indices
+       for (i=0; i<8*T_global; ++i)
+         spacetrace[i]=0.;
+       for (ix = 0; ix< VOLUME; ++ix){
+//Trace over the color indices for each sites
+         for (i=0; i<8; ++i)
+           colortrace[i]=0.;
+         for (c1=0; c1<3; ++c1){
+           // propfields entry selected by the fixed indices (s1, c1, f1), at site ix
+           const bispinor *src = &propfields[12*s1+4*c1+2*f1][ix];
+
+           _bispinor_null(running);
+
+           if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_2) ){
+             // copy only the s0,s1 components, then apply the GAMMA_UP commutator
+             _vector_assign(running.sp_up.s0, src->sp_up.s0);
+             _vector_assign(running.sp_up.s1, src->sp_up.s1);
+
+             _vector_assign(running.sp_dn.s0, src->sp_dn.s0);
+             _vector_assign(running.sp_dn.s1, src->sp_dn.s1);
+
+             phix_taui_commutator_bispinor( &running, taudensity, GAMMA_UP, ix );
+           }
+           else if ((type_1234 == TYPE_3) || ( type_1234 == TYPE_4)){
+             // copy only the s2,s3 components, then apply the GAMMA_DN commutator
+             _vector_assign(running.sp_up.s2, src->sp_up.s2);
+             _vector_assign(running.sp_up.s3, src->sp_up.s3);
+
+             _vector_assign(running.sp_dn.s2, src->sp_dn.s2);
+             _vector_assign(running.sp_dn.s3, src->sp_dn.s3);
+
+             phix_taui_commutator_bispinor( &running, taudensity, GAMMA_DN, ix );
+           }
+
+           multiply_backward_propagator(&running, propfields, &running, ix, NODIR );
+
+           //delta( color component of bispinor running, c1) for all spinor and flavor indices
+           trace_in_color(colortrace,&running,c1);
+         }  //End of trace color
+         //sum over all lattice sites the result of the color trace
+         trace_in_space(spacetrace,colortrace,ix);
+
+       } //End of trace space
+//Gather the results from all nodes to complete the trace in space
+#if defined TM_USE_MPI
+       for (i=0; i<8*T_global; ++i){
+         _Complex double tmp;
+         MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+         spacetrace[i]= tmp;
+       }
+#endif
+       // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+       trace_in_spinor(spinortrace, spacetrace, s1);
+     }//End of trace in spinor space
+
+     phi0_taui_commutator( spinortrace, taudensity );
+
+     //delta(flavor component in spinortrace, f1) for all time slices
+     trace_in_flavor( flavortrace, spinortrace, f1 );
+   } //End of trace in flavor space
+   type = type_1234 == TYPE_1 ? 1 : type_1234 == TYPE_2 ? 2 : type_1234 == TYPE_3 ? 3 : 4 ;
+   if (g_cart_id == 0){printf("Vector Density Density correlator type (%s) for tau matrix density %d results\n", type_1234 == TYPE_1 ? "1" : type_1234 == TYPE_2 ? "2" : type_1234 == TYPE_3 ? "3" : "4", taudensity);}
+   for (i=0; i<T_global; ++i){
+     if (g_cart_id == 0){
+      printf("VECTORDENSITY%dDENSITY%d %d %.3d %10.10e %10.10e\n", taudensity,taudensity, type, i, creal(flavortrace[i])/4.,cimag(flavortrace[i])/4.);
+      fflush(stdout);
+     }
+     (*results)[i]=flavortrace[i]/4.;
+   }
+   free(flavortrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+}
+
+
+
+/* Naive-Dirac current-density correlator for type_12 in {TYPE_I, TYPE_II} and
+ * type_ab in {TYPE_A, TYPE_B}.  On return *results points to a malloc'ed array
+ * of 4*T_global entries: block k=0..2 holds flavortrace/4 for tau index k and
+ * block 3 holds paulitrace/4 (their sum).  The array is not freed here.
+ * Calls exit(1) on allocation failure; on an invalid type every rank exits. */
+void naivedirac_current_density_12ab( bispinor ** propfields, int type_12, int type_ab, _Complex double **results ){
+   int ix,i;
+   int f1,c1,s1,tauindex;
+   int spinorstart=0, spinorend=4;
+   su3 * restrict upm;
+   bispinor running;
+#if defined TM_USE_MPI
+   int count;
+   MPI_Status  statuses[8];
+   MPI_Request request[8];
+#endif
+   _Complex double *colortrace;
+   _Complex double *spacetrace;
+   _Complex double *spinortrace;
+   _Complex double *flavortrace;
+   _Complex double *paulitrace;
+
+   // Reject bad type arguments on every rank, before any communication: the
+   // contraction below leaves parts of 'running' unset for an unknown type_12.
+   if ( (type_12 != TYPE_I) && (type_12 != TYPE_II) ){
+     if (g_cart_id == 0){ fprintf(stdout, "Wrong argument for type_12, it can only be TYPE_I or TYPE_II\n"); }
+     exit(1);
+   }
+   if ( type_ab == TYPE_A ) {
+     spinorstart=0;
+     spinorend  =2;
+   }
+   else if ( type_ab == TYPE_B ){
+     spinorstart=2;
+     spinorend  =4;
+   }
+   else{
+     if (g_cart_id == 0){ fprintf(stdout, "Wrong argument for type_ab, it can only be TYPE_A or TYPE_B\n"); }
+     exit(1);
+   }
+
+   colortrace=(_Complex double *)malloc(sizeof(_Complex double) *8);
+   spacetrace=(_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
+   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+   paulitrace= (_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) || (paulitrace == NULL) )
+   {
+     printf("Error in mem allocation in naivedirac_current_density_12ab\n");
+     exit(1);
+   }
+   *results=(_Complex double *)malloc(sizeof(_Complex double)*T_global*4);
+   if ( *results == NULL){
+     printf("Not enough memory for results in current density naive\n");
+     exit(1);
+   }
+
+//Doing the necessary communication
+#if defined TM_USE_MPI
+   for (s1=spinorstart; s1<spinorend; ++s1)
+      for (c1=0; c1<3; ++c1)
+         for (f1=0; f1<2; ++f1){
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN, request, &count );
+            MPI_Waitall( count, request, statuses);
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN, request, &count);
+            MPI_Waitall( count, request, statuses);
+         }
+#endif
+
+//Trace over the Pauli matrices
+   for (i=0; i<T_global; ++i)
+      paulitrace[i]=0.;
+
+   for (tauindex=0; tauindex<3; ++tauindex){
+      for (i=0; i<T_global; ++i)
+         flavortrace[i]=0.;
+
+      for (f1=0; f1<2; ++f1){
+//Trace over the spinor indices
+         for (i=0; i<2*T_global; ++i)
+            spinortrace[i]=0.;
+
+         for (s1= spinorstart; s1<spinorend; ++s1){
+//Trace over the spatial indices
+            for (i=0; i<8*T_global; ++i)
+               spacetrace[i]=0.;
+            for (ix = 0; ix< VOLUME; ++ix){
+//Trace over the color indices for each sites
+               for (i=0; i<8; ++i)
+                  colortrace[i]=0.;
+               for (c1=0; c1<3; ++c1){
+/*
+       TYPE  IA OR  IB     U0(x-0)*       (1-g5)/2*S(x  ,ytilde) fixed indices (c1, s1, f1)
+       TYPE IIA OR IIB     U0^dagger(x-0)*(1-g5)/2*S(x-0,ytilde) running indices bispinor
+*/
+                  upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
+
+//for the up quark
+                  _vector_null( running.sp_up.s0 );
+                  _vector_null( running.sp_up.s1 );
+
+                  if  ( type_12 == TYPE_I ){
+                    _su3_multiply( running.sp_up.s2, (*upm), propfields[12*s1+4*c1+2*f1][ix].sp_up.s2 );
+                    _su3_multiply( running.sp_up.s3, (*upm), propfields[12*s1+4*c1+2*f1][ix].sp_up.s3 );
+
+                    _complex_times_vector(running.sp_up.s2,phase_0,running.sp_up.s2);
+                    _complex_times_vector(running.sp_up.s3,phase_0,running.sp_up.s3);
+                  }
+                  else if ( type_12 == TYPE_II ){
+                    _su3_inverse_multiply( running.sp_up.s2, (*upm), propfields[12*s1+4*c1+2*f1][g_idn[ix][TUP]].sp_up.s2 );
+                    _su3_inverse_multiply( running.sp_up.s3, (*upm), propfields[12*s1+4*c1+2*f1][g_idn[ix][TUP]].sp_up.s3 );
+
+                    _complexcjg_times_vector(running.sp_up.s2,phase_0,running.sp_up.s2);
+                    _complexcjg_times_vector(running.sp_up.s3,phase_0,running.sp_up.s3);
+                  }
+
+//for the dn quark
+                  _vector_null( running.sp_dn.s0 );
+                  _vector_null( running.sp_dn.s1 );
+                  if  ( type_12 == TYPE_I ){
+                    _su3_multiply( running.sp_dn.s2, (*upm), propfields[12*s1+4*c1+2*f1][ix].sp_dn.s2 );
+                    _su3_multiply( running.sp_dn.s3, (*upm), propfields[12*s1+4*c1+2*f1][ix].sp_dn.s3 );
+
+                    _complex_times_vector(running.sp_dn.s2,phase_0,running.sp_dn.s2);
+                    _complex_times_vector(running.sp_dn.s3,phase_0,running.sp_dn.s3);
+                  }
+                  else if ( type_12 == TYPE_II ){
+                    _su3_inverse_multiply( running.sp_dn.s2, (*upm), propfields[12*s1+4*c1+2*f1][g_idn[ix][TUP]].sp_dn.s2 );
+                    _su3_inverse_multiply( running.sp_dn.s3, (*upm), propfields[12*s1+4*c1+2*f1][g_idn[ix][TUP]].sp_dn.s3 );
+
+                    _complexcjg_times_vector(running.sp_dn.s2,phase_0,running.sp_dn.s2);
+                    _complexcjg_times_vector(running.sp_dn.s3,phase_0,running.sp_dn.s3);
+                  }
+
+/*
+       TYPE  IA OR  IB     gamma0*U0(x-0)*       (1-g5)/2*S(x  ,ytilde)
+       TYPE IIA OR IIB     gamma0*U0^dagger(x-0)*(1-g5)/2*S(x-0,ytilde)
+*/
+                  _vector_add_assign(running.sp_up.s0, running.sp_up.s2);
+                  _vector_add_assign(running.sp_up.s1, running.sp_up.s3);
+                  _vector_null(running.sp_up.s2);
+                  _vector_null(running.sp_up.s3);
+
+                  _vector_add_assign(running.sp_dn.s0, running.sp_dn.s2);
+                  _vector_add_assign(running.sp_dn.s1, running.sp_dn.s3);
+                  _vector_null(running.sp_dn.s2);
+                  _vector_null(running.sp_dn.s3);
+
+/*
+       TYPE  IA OR  IB     tau_i*gamma0*U0(x-0)*       (1-g5)/2*S(x  ,ytilde)
+       TYPE IIA OR IIB     tau_i*gamma0*U0^dagger(x-0)*(1-g5)/2*S(x-0,ytilde)
+*/
+                  taui_spinor( &running, &running, tauindex  );
+
+/*
+       TYPE  IA OR  IB     S(ytilde, x-0)* tau_i*gamma0*U0(x-0)*       (1-g5)/2*S(x  ,ytilde)
+       TYPE IIA OR IIB     S(ytilde, x  )* tau_i*gamma0*U0^dagger(x-0)*(1-g5)/2*S(x-0,ytilde)
+*/
+                  if ( type_12 == TYPE_I ){
+                    multiply_backward_propagator(&running, propfields, &running, ix, TDOWN);
+                  }
+                  else if ( type_12 == TYPE_II ){
+                    multiply_backward_propagator(&running, propfields, &running, ix,NODIR);
+                  }
+                  //delta( color component of bispinor running, c1) for all spinor and flavor indices
+                  trace_in_color(colortrace,&running,c1);
+
+               }  //End of trace color
+               //sum over all lattice sites the result of the color trace
+               trace_in_space(spacetrace,colortrace,ix);
+
+            } //End of trace space
+
+//Gather the results from all nodes to complete the trace in space
+#if defined TM_USE_MPI
+            for (i=0; i<8*T_global; ++i){
+               _Complex double tmp;
+               MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+               spacetrace[i]= tmp;
+            }
+#endif
+            // delta (spinor components of spacetrace, s1) for all time slices and flavor components
+            trace_in_spinor(spinortrace, spacetrace, s1);
+
+         }//End of trace in spinor space
+
+/*
+       TYPE  IA tau_i*phi(ytilde)        *  (1+gamma5)/2  *   S(ytilde, x-0)*   tau_i*gamma0*U0(x-0)*       (1-g5)/2*   S(x  ,ytilde)
+       TYPE  IB phi ^dagger(ytilde)*tau_i *  (1-gamma5)/2  *   S(ytilde, x-0)*   tau_i*gamma0*U0(x-0)*       (1-g5)/2*   S(x  ,ytilde)
+
+       TYPE IIA tau_i*phi(ytilde)        *  (1+gamma5)/2  *   S(ytilde, x  )*   tau_i*gamma0*U0^dagger(x-0)*(1-g5)/2*   S(x-0,ytilde)
+       TYPE IIB phi^dagger(ytilde)*tau_i *  (1-gamma5)/2  *   S(ytilde, x  )*   tau_i*gamma0*U0^dagger(x-0)*(1-g5)/2*   S(x-0,ytilde)
+
+*/
+         if ( type_ab == TYPE_A ){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG, LEFT );
+         }
+         else if ( type_ab == TYPE_B){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER, LEFT  );
+         }
+         //delta(flavor component in spinortrace, f1) for all time slices
+         trace_in_flavor( flavortrace, spinortrace, f1 );
+      } //End of trace in flavor space
+      //Store the result for this Pauli matrix in block 'tauindex' of *results
+      if (g_cart_id == 0){printf("NaiveDirac Current Density correlator type (%s %s) for tau matrixes %d results\n", type_12 == TYPE_I ? "I" : "II",type_ab == TYPE_A ? "a" :"b", tauindex);}
+      for (i=0; i<T_global; ++i){
+        if (g_cart_id == 0){
+          printf("DCDTAU%dTAU%d %d %d %.3d %10.10e %10.10e\n",tauindex, tauindex,type_12, type_ab,  i, creal(flavortrace[i])/4.,cimag(flavortrace[i])/4.0);
+        }
+        (*results)[i+tauindex*T_global]=flavortrace[i]/4.;
+      }
+      //sum for all Pauli matrices
+      for (i=0;i<T_global; ++i)
+         paulitrace[i]+=flavortrace[i];
+   } //End of trace for Pauli matrices
+
+   if (g_cart_id == 0){printf( "NaiveDirac Current Density correlator type (%s %s) results\n", type_12 == TYPE_I ? "I" : "II",type_ab == TYPE_A ? "a" :"b");}
+   //The Pauli-summed correlator goes into the fourth block of *results (offset 3*T_global)
+   for (i=0; i<T_global; ++i){
+      if (g_cart_id == 0){
+        printf("DCD %d %d %.3d %10.10e %10.10e\n",type_12, type_ab,  i, creal(paulitrace[i])/4.,cimag(paulitrace[i])/4.0);
+      }
+      (*results)[i+3*T_global]=paulitrace[i]/4.;
+   }
+   free(flavortrace);
+   free(paulitrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+
+}
+
+/* Vector (vectororaxial != 1) or axial (vectororaxial == 1) current-current
+ * correlator of type 1..4 built from the two propagator sets; taucurrent is
+ * forwarded to bispinor_taui().  On return *results points to a malloc'ed array
+ * of T_global entries holding flavortrace[t]/4; it is not freed here.
+ * Calls exit(1) on allocation failure; on an invalid type_1234 every rank exits. */
+void vector_axial_current_current_1234( bispinor ** propfields_source_zero, bispinor ** propfields_source_ntmone, int type_1234, int taucurrent, int vectororaxial, _Complex double **results ){
+   int ix,i;
+   int f1,c1,s1;
+   int spinorstart=0, spinorend=4;
+   bispinor running;
+
+   _Complex double *colortrace;
+   _Complex double *spacetrace;
+   _Complex double *spinortrace;
+   _Complex double *flavortrace;
+   int type;
+   su3 untminusonet;
+
+#if defined TM_USE_MPI
+   int count;
+   MPI_Status  statuses[8];
+   MPI_Request request[8];
+#endif
+
+   // Validate the type once, on every rank, instead of inside the site loop
+   if ( (type_1234 != TYPE_1) && (type_1234 != TYPE_2) && (type_1234 != TYPE_3) && (type_1234 != TYPE_4) ){
+     if (g_cart_id == 0){ printf("Wrong type index in current current correlator\n"); }
+     exit(1);
+   }
+
+   untminusonet.c00=g_gauge_field[g_idn[0][TUP]][TUP].c00;
+   untminusonet.c01=g_gauge_field[g_idn[0][TUP]][TUP].c01;
+   untminusonet.c02=g_gauge_field[g_idn[0][TUP]][TUP].c02;
+   untminusonet.c10=g_gauge_field[g_idn[0][TUP]][TUP].c10;
+   untminusonet.c11=g_gauge_field[g_idn[0][TUP]][TUP].c11;
+   untminusonet.c12=g_gauge_field[g_idn[0][TUP]][TUP].c12;
+   untminusonet.c20=g_gauge_field[g_idn[0][TUP]][TUP].c20;
+   untminusonet.c21=g_gauge_field[g_idn[0][TUP]][TUP].c21;
+   untminusonet.c22=g_gauge_field[g_idn[0][TUP]][TUP].c22;
+
+#if defined TM_USE_MPI
+   MPI_Bcast(&untminusonet.c00, 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+   MPI_Bcast(&untminusonet.c01, 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+   MPI_Bcast(&untminusonet.c02, 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+   MPI_Bcast(&untminusonet.c10, 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+   MPI_Bcast(&untminusonet.c11, 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+   MPI_Bcast(&untminusonet.c12, 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+   MPI_Bcast(&untminusonet.c20, 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+   MPI_Bcast(&untminusonet.c21, 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+   MPI_Bcast(&untminusonet.c22, 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+#endif
+
+   colortrace=(_Complex double *)malloc(sizeof(_Complex double) *8);
+   spacetrace=(_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
+   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) )
+   {
+     printf("Error in mem allocation in vector_axial_current_current_1234\n");
+     exit(1);
+   }
+   *results=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+   if (*results == NULL){
+     printf("Not enough memory for results in vector_axial_current_current_1234\n");
+     exit(1);
+   }
+   // spinorstart/spinorend keep their initial values (0 and 4) in this routine
+
+//Doing the necessary communication
+#if defined TM_USE_MPI
+   for (s1=spinorstart; s1<spinorend; ++s1)
+      for (c1=0; c1<3; ++c1)
+         for (f1=0; f1<2; ++f1){
+            count=0;
+            generic_exchange_direction_nonblocking( propfields_source_ntmone[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN, request, &count );
+            MPI_Waitall( count, request, statuses);
+            count=0;
+            generic_exchange_direction_nonblocking( propfields_source_zero[12*s1 + 4*c1 + 2*f1 + 0]  , sizeof(bispinor), TDOWN, request, &count );
+            MPI_Waitall( count, request, statuses);
+            count=0;
+            generic_exchange_direction_nonblocking( propfields_source_ntmone[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN, request, &count );
+            MPI_Waitall( count, request, statuses);
+            count=0;
+            generic_exchange_direction_nonblocking( propfields_source_zero[12*s1 + 4*c1 + 2*f1 + 1],   sizeof(bispinor), TDOWN, request, &count );
+            MPI_Waitall( count, request, statuses);
+         }
+#endif
+//Trace over up and down flavors
+   for (i=0; i<T_global; ++i)
+     flavortrace[i]=0.;
+   for (f1=0; f1<2; ++f1){
+//Trace over the spinor indices; in this correlator the s1 loop runs over
+//all four values (spinorstart=0, spinorend=4)
+     for (i=0; i<2*T_global; ++i)
+       spinortrace[i]=0.;
+     for (s1= spinorstart; s1<spinorend; ++s1){
+//Trace over the spatial indices
+       for (i=0; i<8*T_global; ++i)
+         spacetrace[i]=0.;
+       for (ix = 0; ix< VOLUME; ++ix){
+//Trace over the color indices for each sites
+         for (i=0; i<8; ++i)
+           colortrace[i]=0.;
+         for (c1=0; c1<3; ++c1){
+           _bispinor_null(running);
+           if ( type_1234 == TYPE_1 ){
+             bispinor_mult_su3matrix( &running, &propfields_source_ntmone[12*s1+4*c1+2*f1][ix], &g_gauge_field[g_idn[ix][TUP]][TUP], NO_DAGG);
+           }
+           else if ( type_1234 == TYPE_2 ){
+             bispinor_mult_su3matrix( &running, &propfields_source_ntmone[12*s1+4*c1+2*f1][g_idn[ix][TUP]], &g_gauge_field[g_idn[ix][TUP]][TUP], DAGGER);
+           }
+           else if ( type_1234 == TYPE_3 ){
+             bispinor_mult_su3matrix( &running, &propfields_source_zero[12*s1+4*c1+2*f1][ix],   &g_gauge_field[g_idn[ix][TUP]][TUP], NO_DAGG);
+           }
+           else if ( type_1234 == TYPE_4 ){
+             bispinor_mult_su3matrix( &running, &propfields_source_zero[12*s1+4*c1+2*f1][g_idn[ix][TUP]],  &g_gauge_field[g_idn[ix][TUP]][TUP], DAGGER);
+           }
+           if (vectororaxial == 1){
+            bispinor_timesgamma5(&running);
+           }
+//Multiplication with gamma0
+           bispinor_timesgamma0(&running);
+
+//Multiplication with tau_i input parameter for the current
+           bispinor_taui(&running, taucurrent);
+
+//Backward propagator multiplication
+           if ( type_1234 == TYPE_1 ){
+              multiply_backward_propagator(&running, propfields_source_zero, &running, ix, TDOWN );
+              bispinor_mult_su3matrix( &running, &running, &untminusonet, NO_DAGG);
+           }
+           else if ( type_1234 == TYPE_2 ){
+              multiply_backward_propagator(&running, propfields_source_zero, &running, ix, NODIR );
+              bispinor_mult_su3matrix( &running, &running, &untminusonet, NO_DAGG);
+           }
+           else if ( type_1234 == TYPE_3 ){
+              multiply_backward_propagator(&running, propfields_source_ntmone, &running, ix, TDOWN );
+              bispinor_mult_su3matrix( &running, &running, &untminusonet, DAGGER);
+           }
+           else if ( type_1234 == TYPE_4 ){
+              multiply_backward_propagator(&running, propfields_source_ntmone, &running, ix, NODIR );
+              bispinor_mult_su3matrix( &running, &running, &untminusonet, DAGGER);
+           }
+
+           if (vectororaxial == 1){
+            bispinor_timesgamma5(&running);
+           }
+//Multiplication with gamma0
+           bispinor_timesgamma0(&running);
+
+//Multiplication with tau_i input parameter for the current
+           bispinor_taui(&running, taucurrent);
+
+           trace_in_color(colortrace,&running,c1);
+         }  //End of trace color
+         trace_in_space(spacetrace,colortrace,ix);
+       } //End of trace space
+//Gather the results from all nodes to complete the trace in space
+#if defined TM_USE_MPI
+       for (i=0; i<8*T_global; ++i){
+         _Complex double tmp;
+         MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+         spacetrace[i]= tmp;
+       }
+#endif
+       // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+       trace_in_spinor(spinortrace, spacetrace, s1);
+     }//End of trace in spinor space
+     //delta(flavor component in spinortrace, f1) for all time slices
+     trace_in_flavor( flavortrace, spinortrace, f1 );
+   } //End of trace in flavor space
+   type = type_1234 == TYPE_1 ? 1 : type_1234 == TYPE_2 ? 2 : type_1234 == TYPE_3 ? 3 : 4 ;
+   if (g_cart_id == 0){printf("%s current current correlator type (%s) for tau matrix current %d results\n", vectororaxial==1 ? "Axial" : "Vector", type_1234 == TYPE_1 ? "1" : type_1234 == TYPE_2 ? "2" : type_1234 == TYPE_3 ? "3" : "4", taucurrent);}
+   for (i=0; i<T_global; ++i){
+     if (g_cart_id == 0){
+       printf("%sCURRENT%dCURRENT%d %d %.3d %10.10e %10.10e\n", vectororaxial==1 ? "AXIAL" : "VECTOR", taucurrent,taucurrent, type, i, creal(flavortrace[i])/4.,cimag(flavortrace[i])/4.);
+       fflush(stdout);
+     }
+     (*results)[i]=flavortrace[i]/4.;
+   }
+   free(flavortrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+}
+#endif
diff --git a/contractions/contractions_FP.h b/contractions/contractions_FP.h
new file mode 100644
index 000000000..b0e87a77c
--- /dev/null
+++ b/contractions/contractions_FP.h
@@ -0,0 +1,38 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Ferenc Pittler
+ *         
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _CONTRACTIONS_FP_H
+#define _CONTRACTIONS_FP_H
+void density_density_1234(bispinor **propagators, int type, _Complex double **res);
+void density_density_1234_s0s0( bispinor ** propagators, int type, _Complex double **res );
+void density_density_1234_sxsx( bispinor ** propagators, int type, _Complex double **res );
+void vector_2_psueodoscalar_1_12( bispinor ** propfields, int type_12,_Complex double **results );
+void naivedirac_current_density_12ab( bispinor ** propagators, int type_12, int type_ab,  _Complex double **tes ); /* *tes is malloc'ed inside: 4*T_global entries (tau 0..2 blocks, then their sum) */
+void naivedirac_current_density_12ab_lr( bispinor ** propagators, int type_12, int type_ab,  _Complex double **tes );
+void wilsonterm_current_density_312ab( bispinor ** propagators, int type_12, int type_ab, _Complex double **res );
+void wilsonterm_current_density_412ab( bispinor ** propagators, int type_12, int type_ab, _Complex double **res );
+void wilsonterm_current_density_512ab( bispinor ** propagators, int type_12, int type_ab, _Complex double **res );
+void wilsonterm_current_density_612ab( bispinor ** propagators, int type_12, int type_ab, _Complex double **res );
+void vector_axial_current_density_1234( bispinor ** propfields, int type_1234,int taudensity, int taucurrent, int vectororaxial, int scalarorpseudoscalar,_Complex double **results );
+void vector_density_density_1234( bispinor ** propfields, int type_1234,int taudensity, _Complex double **results ); /* *results is malloc'ed inside: T_global entries */
+void vector_axial_current_current_1234( bispinor ** propfields_source_zero, bispinor ** propfields_source_ntmone, int type_1234, int taucurrent, int vectororaxial, _Complex double **results ); /* *results is malloc'ed inside: T_global entries; vectororaxial==1 selects the axial current */
+void giancarlodensity( bispinor ** propfields, int tau3, _Complex double  **results );
+void density_ptau_density_vector( bispinor **propfields, int type_12, _Complex double **results);
+#endif
diff --git a/contractions/contractions_checks.c b/contractions/contractions_checks.c
new file mode 100644
index 000000000..ba4ebf074
--- /dev/null
+++ b/contractions/contractions_checks.c
@@ -0,0 +1,1606 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017 Ferenc Pittler
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_BSM
+#include"lime.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <errno.h>
+#include <time.h>
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+#include "global.h"
+#include "getopt.h"
+#include "default_input_values.h"
+#include "read_input.h"
+#include "su3.h"
+#include "operator/tm_operators.h"
+#include "linalg_eo.h"
+#include "geometry_eo.h"
+#include "linalg/assign.h"
+#include "operator/D_psi.h"
+#include "operator/D_psi_BSM.h"
+#include "operator/D_psi_BSM2b.h"
+#include "operator/D_psi_BSM2f.h"
+#include "operator/D_psi_BSM2m.h"
+#include "operator/Dov_psi.h"
+#include "operator/tm_operators_nd.h"
+#include "operator/Hopping_Matrix.h"
+#include "invert_eo.h"
+#include "invert_doublet_eo.h"
+#include "invert_overlap.h"
+#include "invert_clover_eo.h"
+#include "init/init_scalar_field.h"
+#include "init/init_bsm_2hop_lookup.h"
+#include "boundary.h"
+#include "start.h"
+#include "solver/solver.h"
+#include "xchange/xchange_gauge.h"
+#include "prepare_source.h"
+#include <io/params.h>
+#include <io/gauge.h>
+#include <io/spinor.h>
+#include <io/utils.h>
+#include "io/scalar.h"
+#include "buffers/utils_nonblocking.h"
+#include "buffers/utils_nogauge.h"
+#include "test/overlaptests.h"
+#include "solver/index_jd.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
+#include "operator.h"
+#include "gettime.h"
+#include "measure_gauge_action.h"
+#include "mpi_init.h"
+#include "init/init_geometry_indices.h"
+#include "init/init_openmp.h"
+#include "init/init_gauge_field.h"
+#include "init/init_spinor_field.h"
+#include "init/init_bispinor_field.h"
+#include "solver/solver_field.h"
+/* indexing of propfields;
+   
+   propagator for  (dagger or nondagger source)
+              for  flavor component f
+              for  color  component c    
+              for  spinor component s
+   is the following bispinor array of size VOLUME(PLUSRAND)
+
+   propfields[12*s + 4*c + 2*f + (dagg ? 1 : 0)]
+     
+ */
+/**************************
+Multiplication with the backward propagator
+
+S == matrix element of D^-1 between the following states
+
+S( ytilde , x+-dir )       psi   x
+   flavor2, flavor1    x         flavor1
+   spinor2, spinor1              spinor1
+   color 2, color 1              color1
+
+=
+Stilde* (x+-dir , ytilde)      psi   x
+         flavor1, flavor2  x         flavor1
+         spinor1, spinor2            spinor1  
+         color 1, color 2            color1
+where Stilde is the matrix element of D^dagger^-1 between
+the corresponding states
+
+**************************/
+
+static void trace_in_spinor_and_color( _Complex double *c, bispinor **prop, int ix, int f3, int f4, int f6, int f1){ /* Overwrites c[ix] with a contraction, at site ix, of prop[12*alpha2+4*c1+2*f1] with the complex conjugate of prop[12*alpha2+4*c1+2*f3+1]; f6 / f4 pick sp_up (0) or sp_dn (1) of the first / conjugated factor. */
+     int alpha2; /* "s" of the propfields slot 12*s+4*c+2*f+dagg; only the values 0 and 1 are visited */
+     int c1; /* "c" of the propfields slot; runs over 0,1,2 */
+     c[ix]=0.; /* previous content of c[ix] is discarded; only this one element of c is touched */
+     for (alpha2=0; alpha2<2;++alpha2)
+       for (c1=0; c1<3; ++c1){
+          if ( (f6 == 0) && (f4==0) ){ /* sp_up x conj(sp_up); only the s2 and s3 members, colours c0..c2, enter the sum */
+             c[ix]+= prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c2)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c2);
+          }
+          if ( (f6 == 1) && (f4==0) ){ /* sp_dn x conj(sp_up) */
+             c[ix]+= prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c2)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c2);
+          }
+          if ( (f6 == 0) && (f4==1) ){ /* sp_up x conj(sp_dn) */
+             c[ix]+= prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c2)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c2);
+          }
+          if ( (f6 == 1) && (f4==1) ){ /* sp_dn x conj(sp_dn) */
+             c[ix]+= prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c2)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c2);
+          }
+       } /* the four tests are independent ifs; if f6 or f4 is outside {0,1} none fires and c[ix] stays 0 */
+}
+static void trace_in_spinor_and_color62a( _Complex double *c, bispinor **prop, int ix, int f3, int f4, int f6, int f1){ /* Overwrites c[ix]: prop[..+2*f1] is read at g_iup[ix][TUP], acted on by the links stored at ix and at g_idn[ix][TUP], scaled by phase_00, then contracted with conj(prop[..+2*f3+1]) read at g_idn[ix][TUP]; f6 / f4 pick sp_up (0) or sp_dn (1) of the first / conjugated factor. */
+     int alpha2; /* "s" of the propfields slot 12*s+4*c+2*f+dagg; only 0 and 1 are visited */
+     int c1; /* "c" of the propfields slot; runs over 0,1,2 */
+     c[ix]=0.; /* previous content of c[ix] is discarded */
+     bispinor running; /* scratch for the transported first factor; only its s2 and s3 members are read back in this function */
+     su3 * restrict upm; /* points at the g_gauge_field entry currently being applied */
+     su3_vector tmpvec; /* separate destination so that _su3_multiply never has the same vector as input and output */
+     for (alpha2=0; alpha2<2;++alpha2)
+       for (c1=0; c1<3; ++c1){
+          if ( (f6 == 0) && (f4==0) ){ /* sp_up x conj(sp_up) */
+              upm = &g_gauge_field[ix][TUP]; /* first link: entry stored at ix, direction TUP */
+
+              _vector_null( running.sp_up.s0 ); /* s0 and s1 are cleared here but never read afterwards in this function */
+              _vector_null( running.sp_up.s1 );
+              _vector_null( running.sp_up.s2 );
+              _vector_null( running.sp_up.s3 );
+
+
+              _su3_multiply( running.sp_up.s2, (*upm), prop[12*alpha2+4*c1+2*f1][g_iup[ix][TUP]].sp_up.s2 ); /* presumably running = (*upm) * prop(g_iup[ix][TUP]); macro is defined outside this block -- confirm */
+              _su3_multiply( running.sp_up.s3, (*upm), prop[12*alpha2+4*c1+2*f1][g_iup[ix][TUP]].sp_up.s3 );
+
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP]; /* second link: entry stored at g_idn[ix][TUP], direction TUP */
+
+              _vector_null( tmpvec );
+              _su3_multiply(tmpvec, (*upm), running.sp_up.s2); /* apply the second link to the result of the first */
+              _vector_assign( running.sp_up.s2, tmpvec);
+
+              _vector_null( tmpvec );
+              _su3_multiply(tmpvec, (*upm), running.sp_up.s3);
+              _vector_assign( running.sp_up.s3, tmpvec);
+
+              _complex_times_vector(running.sp_up.s2,phase_00,running.sp_up.s2); /* scale by phase_00 (defined outside this block); destination and source are the same vector */
+              _complex_times_vector(running.sp_up.s3,phase_00,running.sp_up.s3);
+
+ 
+              c[ix]+= running.sp_up.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s2.c0) /* conjugated factor is read at g_idn[ix][TUP], not at ix */
+                     +running.sp_up.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s2.c1)
+                     +running.sp_up.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s2.c2)
+                     +running.sp_up.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s3.c0)
+                     +running.sp_up.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s3.c1)
+                     +running.sp_up.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s3.c2);
+          }
+          if ( (f6 == 1) && (f4==0) ){ /* sp_dn x conj(sp_up); same steps as the first branch */
+              upm = &g_gauge_field[ix][TUP];
+
+              _vector_null( running.sp_dn.s0 );
+              _vector_null( running.sp_dn.s1 );
+              _vector_null( running.sp_dn.s2 );
+              _vector_null( running.sp_dn.s3 );
+
+
+              _su3_multiply( running.sp_dn.s2, (*upm), prop[12*alpha2+4*c1+2*f1][g_iup[ix][TUP]].sp_dn.s2 );
+              _su3_multiply( running.sp_dn.s3, (*upm), prop[12*alpha2+4*c1+2*f1][g_iup[ix][TUP]].sp_dn.s3 );
+
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
+
+              _vector_null( tmpvec );
+              _su3_multiply(tmpvec, (*upm), running.sp_dn.s2);
+              _vector_assign( running.sp_dn.s2, tmpvec);
+
+              _vector_null( tmpvec );
+              _su3_multiply(tmpvec, (*upm), running.sp_dn.s3);
+              _vector_assign( running.sp_dn.s3, tmpvec);
+
+              _complex_times_vector(running.sp_dn.s2,phase_00,running.sp_dn.s2);
+              _complex_times_vector(running.sp_dn.s3,phase_00,running.sp_dn.s3);
+
+
+              c[ix]+= running.sp_dn.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s2.c0)
+                     +running.sp_dn.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s2.c1)
+                     +running.sp_dn.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s2.c2)
+                     +running.sp_dn.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s3.c0)
+                     +running.sp_dn.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s3.c1)
+                     +running.sp_dn.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s3.c2);
+          }
+          if ( (f6 == 0) && (f4==1) ){ /* sp_up x conj(sp_dn); same steps as the first branch */
+              upm = &g_gauge_field[ix][TUP];
+
+              _vector_null( running.sp_up.s0 );
+              _vector_null( running.sp_up.s1 );
+              _vector_null( running.sp_up.s2 );
+              _vector_null( running.sp_up.s3 );
+
+
+              _su3_multiply( running.sp_up.s2, (*upm), prop[12*alpha2+4*c1+2*f1][g_iup[ix][TUP]].sp_up.s2 );
+              _su3_multiply( running.sp_up.s3, (*upm), prop[12*alpha2+4*c1+2*f1][g_iup[ix][TUP]].sp_up.s3 );
+
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
+
+              _vector_null( tmpvec );
+              _su3_multiply(tmpvec, (*upm), running.sp_up.s2);
+              _vector_assign( running.sp_up.s2, tmpvec);
+
+              _vector_null( tmpvec );
+              _su3_multiply(tmpvec, (*upm), running.sp_up.s3);
+              _vector_assign( running.sp_up.s3, tmpvec);
+
+              _complex_times_vector(running.sp_up.s2,phase_00,running.sp_up.s2);
+              _complex_times_vector(running.sp_up.s3,phase_00,running.sp_up.s3);
+
+
+              c[ix]+= running.sp_up.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s2.c0)
+                     +running.sp_up.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s2.c1)
+                     +running.sp_up.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s2.c2)
+                     +running.sp_up.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s3.c0)
+                     +running.sp_up.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s3.c1)
+                     +running.sp_up.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s3.c2);
+          }
+          if ( (f6 == 1) && (f4==1) ){ /* sp_dn x conj(sp_dn); same steps as the first branch */
+              upm = &g_gauge_field[ix][TUP];
+
+              _vector_null( running.sp_dn.s0 );
+              _vector_null( running.sp_dn.s1 );
+              _vector_null( running.sp_dn.s2 );
+              _vector_null( running.sp_dn.s3 );
+
+
+              _su3_multiply( running.sp_dn.s2, (*upm), prop[12*alpha2+4*c1+2*f1][g_iup[ix][TUP]].sp_dn.s2 );
+              _su3_multiply( running.sp_dn.s3, (*upm), prop[12*alpha2+4*c1+2*f1][g_iup[ix][TUP]].sp_dn.s3 );
+
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
+
+              _vector_null( tmpvec );
+              _su3_multiply(tmpvec, (*upm), running.sp_dn.s2);
+              _vector_assign( running.sp_dn.s2, tmpvec);
+
+              _vector_null( tmpvec );
+              _su3_multiply(tmpvec, (*upm), running.sp_dn.s3);
+              _vector_assign( running.sp_dn.s3, tmpvec);
+
+              _complex_times_vector(running.sp_dn.s2,phase_00,running.sp_dn.s2);
+              _complex_times_vector(running.sp_dn.s3,phase_00,running.sp_dn.s3);
+
+
+              c[ix]+= running.sp_dn.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s2.c0)
+                     +running.sp_dn.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s2.c1)
+                     +running.sp_dn.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s2.c2)
+                     +running.sp_dn.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s3.c0)
+                     +running.sp_dn.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s3.c1)
+                     +running.sp_dn.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s3.c2);
+          }
+       } /* if f6 or f4 is outside {0,1} no branch fires and c[ix] stays 0 */
+}
+static void trace_in_spinor_and_color61b( _Complex double *c, bispinor **prop, int ix, int f3, int f4, int f6, int f1){ /* Overwrites c[ix] with a contraction, at site ix, of prop[12*alpha2+4*c1+2*f1] with conj(prop[12*alpha2+4*c1+2*f3+1]); the summand is the same as in trace_in_spinor_and_color, but alpha2 runs over 2 and 3 instead of 0 and 1. */
+     int alpha2; /* "s" of the propfields slot 12*s+4*c+2*f+dagg; only 2 and 3 are visited here */
+     int c1; /* "c" of the propfields slot; runs over 0,1,2 */
+     c[ix]=0.; /* previous content of c[ix] is discarded */
+     for (alpha2=2; alpha2<4;++alpha2) /* note the start value 2 */
+       for (c1=0; c1<3; ++c1){
+          if ( (f6 == 0) && (f4==0) ){ /* sp_up x conj(sp_up); only the s2 and s3 members, colours c0..c2, enter the sum */
+             c[ix]+= prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c2)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c2);
+          }
+          if ( (f6 == 1) && (f4==0) ){ /* sp_dn x conj(sp_up) */
+             c[ix]+= prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s2.c2)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_up.s3.c2);
+          }
+          if ( (f6 == 0) && (f4==1) ){ /* sp_up x conj(sp_dn) */
+             c[ix]+= prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c2)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_up.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c2);
+          }
+          if ( (f6 == 1) && (f4==1) ){ /* sp_dn x conj(sp_dn) */
+             c[ix]+= prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s2.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s2.c2)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c0*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c0)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c1*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c1)
+                    +prop[12*alpha2+4*c1+2*f1][ix].sp_dn.s3.c2*conj(prop[12*alpha2+4*c1+2*f3+1][ix].sp_dn.s3.c2);
+          }
+       } /* if f6 or f4 is outside {0,1} no branch fires and c[ix] stays 0 */
+}
+static void trace_in_spinor_and_color1a( _Complex double *c, bispinor **prop, int ix, int f3, int f4, int f6, int f1){ /* Overwrites c[ix]: the s2,s3 members of prop[..+2*f1] at ix are acted on by the link stored at g_idn[ix][TUP], scaled by phase_0, moved into the s0,s1 members, then contracted with the s0,s1 members of conj(prop[..+2*f3+1]) read at g_idn[ix][TUP]; f6 / f4 pick sp_up (0) or sp_dn (1) of the first / conjugated factor. */
+     int alpha1; /* "s" of the propfields slot 12*s+4*c+2*f+dagg; only 0 and 1 are visited */
+     int c1; /* "c" of the propfields slot; runs over 0,1,2 */
+     c[ix]=0.; /* previous content of c[ix] is discarded */
+     bispinor running; /* scratch for the transformed first factor */
+     su3 * restrict upm; /* points at the g_gauge_field entry being applied */
+     for (alpha1=0; alpha1<2;++alpha1)
+       for (c1=0; c1<3; ++c1){
+          if ( (f6 == 0) && (f4==0) ){ /* sp_up x conj(sp_up) */
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP]; /* link entry stored at g_idn[ix][TUP], direction TUP */
+
+              _vector_null( running.sp_up.s0 );
+              _vector_null( running.sp_up.s1 );
+              _vector_null( running.sp_up.s2 );
+              _vector_null( running.sp_up.s3 );
+
+
+              _su3_multiply( running.sp_up.s2, (*upm), prop[12*alpha1+4*c1+2*f1][ix].sp_up.s2 ); /* presumably running = (*upm) * prop(ix); macro is defined outside this block -- confirm */
+              _su3_multiply( running.sp_up.s3, (*upm), prop[12*alpha1+4*c1+2*f1][ix].sp_up.s3 );
+
+              _complex_times_vector(running.sp_up.s2,phase_0,running.sp_up.s2); /* scale by phase_0 (defined outside this block) */
+              _complex_times_vector(running.sp_up.s3,phase_0,running.sp_up.s3);
+ 
+
+              _vector_add_assign(running.sp_up.s0, running.sp_up.s2); /* s0 was just cleared, so this presumably amounts to s0 = s2 (assuming the macro adds its 2nd argument to its 1st) */
+              _vector_add_assign(running.sp_up.s1, running.sp_up.s3); /* likewise s1 = s3 */
+              _vector_null(running.sp_up.s2); /* s2 and s3 are not read again before the next clear */
+              _vector_null(running.sp_up.s3);
+
+              c[ix]+= running.sp_up.s0.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c0) /* conjugated factor: s0,s1 members read at g_idn[ix][TUP] */
+                     +running.sp_up.s0.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c1)
+                     +running.sp_up.s0.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c2)
+                     +running.sp_up.s1.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c0)
+                     +running.sp_up.s1.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c1)
+                     +running.sp_up.s1.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c2);
+          }
+          if ( (f6 == 1) && (f4==0) ){ /* sp_dn x conj(sp_up); same steps as the first branch */
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
+
+              _vector_null( running.sp_dn.s0 );
+              _vector_null( running.sp_dn.s1 );
+              _vector_null( running.sp_dn.s2 );
+              _vector_null( running.sp_dn.s3 );
+
+
+              _su3_multiply( running.sp_dn.s2, (*upm), prop[12*alpha1+4*c1+2*f1][ix].sp_dn.s2 );
+              _su3_multiply( running.sp_dn.s3, (*upm), prop[12*alpha1+4*c1+2*f1][ix].sp_dn.s3 );
+
+
+              _complex_times_vector(running.sp_dn.s2,phase_0,running.sp_dn.s2);
+              _complex_times_vector(running.sp_dn.s3,phase_0,running.sp_dn.s3);
+
+              _vector_add_assign(running.sp_dn.s0, running.sp_dn.s2);
+              _vector_add_assign(running.sp_dn.s1, running.sp_dn.s3);
+              _vector_null(running.sp_dn.s2);
+              _vector_null(running.sp_dn.s3);
+
+              c[ix]+= running.sp_dn.s0.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c0)
+                     +running.sp_dn.s0.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c1)
+                     +running.sp_dn.s0.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c2)
+                     +running.sp_dn.s1.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c0)
+                     +running.sp_dn.s1.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c1)
+                     +running.sp_dn.s1.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c2);
+          }
+          if ( (f6 == 0) && (f4==1) ){ /* sp_up x conj(sp_dn); same steps as the first branch */
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
+
+              _vector_null( running.sp_up.s0 );
+              _vector_null( running.sp_up.s1 );
+              _vector_null( running.sp_up.s2 );
+              _vector_null( running.sp_up.s3 );
+
+
+              _su3_multiply( running.sp_up.s2, (*upm), prop[12*alpha1+4*c1+2*f1][ix].sp_up.s2 );
+              _su3_multiply( running.sp_up.s3, (*upm), prop[12*alpha1+4*c1+2*f1][ix].sp_up.s3 );
+
+              _complex_times_vector(running.sp_up.s2,phase_0,running.sp_up.s2);
+              _complex_times_vector(running.sp_up.s3,phase_0,running.sp_up.s3);
+
+              _vector_add_assign(running.sp_up.s0, running.sp_up.s2);
+              _vector_add_assign(running.sp_up.s1, running.sp_up.s3);
+              _vector_null(running.sp_up.s2);
+              _vector_null(running.sp_up.s3);
+
+              c[ix]+= running.sp_up.s0.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c0)
+                     +running.sp_up.s0.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c1)
+                     +running.sp_up.s0.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c2)
+                     +running.sp_up.s1.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c0)
+                     +running.sp_up.s1.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c1)
+                     +running.sp_up.s1.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c2);
+          }
+          if ( (f6 == 1) && (f4==1) ){ /* sp_dn x conj(sp_dn); same steps as the first branch */
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
+
+              _vector_null( running.sp_dn.s0 );
+              _vector_null( running.sp_dn.s1 );
+              _vector_null( running.sp_dn.s2 );
+              _vector_null( running.sp_dn.s3 );
+
+
+              _su3_multiply( running.sp_dn.s2, (*upm), prop[12*alpha1+4*c1+2*f1][ix].sp_dn.s2 );
+              _su3_multiply( running.sp_dn.s3, (*upm), prop[12*alpha1+4*c1+2*f1][ix].sp_dn.s3 );
+
+              _complex_times_vector(running.sp_dn.s2,phase_0,running.sp_dn.s2);
+              _complex_times_vector(running.sp_dn.s3,phase_0,running.sp_dn.s3);
+
+
+              _vector_add_assign(running.sp_dn.s0, running.sp_dn.s2);
+              _vector_add_assign(running.sp_dn.s1, running.sp_dn.s3);
+              _vector_null(running.sp_dn.s2);
+              _vector_null(running.sp_dn.s3);
+
+              c[ix]+= running.sp_dn.s0.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c0)
+                     +running.sp_dn.s0.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c1)
+                     +running.sp_dn.s0.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c2)
+                     +running.sp_dn.s1.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c0)
+                     +running.sp_dn.s1.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c1)
+                     +running.sp_dn.s1.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c2);
+          }
+       } /* if f6 or f4 is outside {0,1} no branch fires and c[ix] stays 0 */
+}
+
+
+
+static void trace_in_spinor_and_color3a( _Complex double *c, bispinor **prop, int ix, int f3, int f4, int f6, int f1){ /* Overwrites c[ix]: the s0,s1 members of prop[..+2*f1] are read at g_iup[ix][TUP], acted on by the links stored at ix and at g_idn[ix][TUP], scaled by phase_00, then contracted with the s0,s1 members of conj(prop[..+2*f3+1]) read at g_idn[ix][TUP]; f6 / f4 pick sp_up (0) or sp_dn (1) of the first / conjugated factor. */
+     int alpha1; /* "s" of the propfields slot 12*s+4*c+2*f+dagg; only 0 and 1 are visited */
+     int c1; /* "c" of the propfields slot; runs over 0,1,2 */
+     c[ix]=0.; /* previous content of c[ix] is discarded */
+     bispinor running; /* result after both links; only its s0 and s1 members are read back */
+     su3 * restrict upm; /* points at the g_gauge_field entry being applied */
+     bispinor tmp; /* result after the first link, kept separate so each _su3_multiply has distinct input and output */
+     for (alpha1=0; alpha1<2;++alpha1)
+       for (c1=0; c1<3; ++c1){
+          if ( (f6 == 0) && (f4==0) ){ /* sp_up x conj(sp_up) */
+              upm = &g_gauge_field[ix][TUP]; /* first link: entry stored at ix, direction TUP */
+
+              _vector_null( tmp.sp_up.s2 ); /* s2 and s3 of tmp and running are cleared but never read in this function */
+              _vector_null( tmp.sp_up.s3 );
+              _vector_null( tmp.sp_up.s0 );
+              _vector_null( tmp.sp_up.s1 );
+
+              _vector_null( running.sp_up.s0 );
+              _vector_null( running.sp_up.s1 );
+              _vector_null( running.sp_up.s2 );
+              _vector_null( running.sp_up.s3 );
+
+
+
+              _su3_multiply( tmp.sp_up.s0, (*upm), prop[12*alpha1+4*c1+2*f1][g_iup[ix][TUP]].sp_up.s0 ); /* presumably tmp = (*upm) * prop(g_iup[ix][TUP]); macro is defined outside this block -- confirm */
+              _su3_multiply( tmp.sp_up.s1, (*upm), prop[12*alpha1+4*c1+2*f1][g_iup[ix][TUP]].sp_up.s1 );
+
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP]; /* second link: entry stored at g_idn[ix][TUP], direction TUP */
+
+              _su3_multiply( running.sp_up.s0, (*upm), tmp.sp_up.s0 ); /* apply the second link to the result of the first */
+              _su3_multiply( running.sp_up.s1, (*upm), tmp.sp_up.s1 );
+
+              _complex_times_vector(running.sp_up.s0,phase_00,running.sp_up.s0); /* scale by phase_00 (defined outside this block) */
+              _complex_times_vector(running.sp_up.s1,phase_00,running.sp_up.s1);
+
+
+
+              c[ix]+= running.sp_up.s0.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c0) /* conjugated factor is read at g_idn[ix][TUP] */
+                     +running.sp_up.s0.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c1)
+                     +running.sp_up.s0.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c2)
+                     +running.sp_up.s1.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c0)
+                     +running.sp_up.s1.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c1)
+                     +running.sp_up.s1.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c2);
+          }
+          if ( (f6 == 1) && (f4==0) ){ /* sp_dn x conj(sp_up); same steps as the first branch */
+              upm = &g_gauge_field[ix][TUP];
+
+              _vector_null( tmp.sp_dn.s2 );
+              _vector_null( tmp.sp_dn.s3 );
+              _vector_null( tmp.sp_dn.s0 );
+              _vector_null( tmp.sp_dn.s1 );
+
+              _vector_null( running.sp_dn.s0 );
+              _vector_null( running.sp_dn.s1 );
+              _vector_null( running.sp_dn.s2 );
+              _vector_null( running.sp_dn.s3 );
+
+
+              _su3_multiply( tmp.sp_dn.s0, (*upm), prop[12*alpha1+4*c1+2*f1][g_iup[ix][TUP]].sp_dn.s0 );
+              _su3_multiply( tmp.sp_dn.s1, (*upm), prop[12*alpha1+4*c1+2*f1][g_iup[ix][TUP]].sp_dn.s1 );
+
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
+
+              _su3_multiply( running.sp_dn.s0, (*upm), tmp.sp_dn.s0 );
+              _su3_multiply( running.sp_dn.s1, (*upm), tmp.sp_dn.s1 );
+
+
+              _complex_times_vector(running.sp_dn.s0,phase_00,running.sp_dn.s0);
+              _complex_times_vector(running.sp_dn.s1,phase_00,running.sp_dn.s1);
+
+
+              c[ix]+= running.sp_dn.s0.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c0)
+                     +running.sp_dn.s0.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c1)
+                     +running.sp_dn.s0.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s0.c2)
+                     +running.sp_dn.s1.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c0)
+                     +running.sp_dn.s1.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c1)
+                     +running.sp_dn.s1.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_up.s1.c2);
+
+          }
+          if ( (f6 == 0) && (f4==1) ){ /* sp_up x conj(sp_dn); same steps as the first branch */
+              upm = &g_gauge_field[ix][TUP];
+
+              _vector_null( tmp.sp_up.s2 );
+              _vector_null( tmp.sp_up.s3 );
+              _vector_null( tmp.sp_up.s0 );
+              _vector_null( tmp.sp_up.s1 );
+
+              _vector_null( running.sp_up.s0 );
+              _vector_null( running.sp_up.s1 );
+              _vector_null( running.sp_up.s2 );
+              _vector_null( running.sp_up.s3 );
+
+              _su3_multiply( tmp.sp_up.s0, (*upm), prop[12*alpha1+4*c1+2*f1][g_iup[ix][TUP]].sp_up.s0 );
+              _su3_multiply( tmp.sp_up.s1, (*upm), prop[12*alpha1+4*c1+2*f1][g_iup[ix][TUP]].sp_up.s1 );
+
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
+
+              _su3_multiply( running.sp_up.s0, (*upm), tmp.sp_up.s0 );
+              _su3_multiply( running.sp_up.s1, (*upm), tmp.sp_up.s1 );
+
+
+              _complex_times_vector(running.sp_up.s0,phase_00,running.sp_up.s0);
+              _complex_times_vector(running.sp_up.s1,phase_00,running.sp_up.s1);
+
+
+              c[ix]+= running.sp_up.s0.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c0)
+                     +running.sp_up.s0.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c1)
+                     +running.sp_up.s0.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c2)
+                     +running.sp_up.s1.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c0)
+                     +running.sp_up.s1.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c1)
+                     +running.sp_up.s1.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c2);
+          }
+          if ( (f6 == 1) && (f4==1) ){ /* sp_dn x conj(sp_dn); same steps as the first branch */
+              upm = &g_gauge_field[ix][TUP];
+
+              _vector_null( tmp.sp_dn.s2 );
+              _vector_null( tmp.sp_dn.s3 );
+              _vector_null( tmp.sp_dn.s0 );
+              _vector_null( tmp.sp_dn.s1 );
+
+              _vector_null( running.sp_dn.s0 );
+              _vector_null( running.sp_dn.s1 );
+              _vector_null( running.sp_dn.s2 );
+              _vector_null( running.sp_dn.s3 );
+
+
+              _su3_multiply( tmp.sp_dn.s0, (*upm), prop[12*alpha1+4*c1+2*f1][g_iup[ix][TUP]].sp_dn.s0 );
+              _su3_multiply( tmp.sp_dn.s1, (*upm), prop[12*alpha1+4*c1+2*f1][g_iup[ix][TUP]].sp_dn.s1 );
+
+              upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
+
+              _su3_multiply( running.sp_dn.s0, (*upm), tmp.sp_dn.s0 );
+              _su3_multiply( running.sp_dn.s1, (*upm), tmp.sp_dn.s1 );
+
+
+              _complex_times_vector(running.sp_dn.s0,phase_00,running.sp_dn.s0);
+              _complex_times_vector(running.sp_dn.s1,phase_00,running.sp_dn.s1);
+               
+              c[ix]+= running.sp_dn.s0.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c0)
+                     +running.sp_dn.s0.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c1)
+                     +running.sp_dn.s0.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s0.c2)
+                     +running.sp_dn.s1.c0*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c0)
+                     +running.sp_dn.s1.c1*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c1)
+                     +running.sp_dn.s1.c2*conj(prop[12*alpha1+4*c1+2*f3+1][g_idn[ix][TUP]].sp_dn.s1.c2);
+
+          }
+       } /* if f6 or f4 is outside {0,1} no branch fires and c[ix] stays 0 */
+}
+
+/*
+ * Computes a correlator per global time slice from the propagator fields and
+ * the scalar field, and prints it on the process with g_cart_id == 0 under
+ * the tag "WCDPR1".
+ *
+ * propfields: array of bispinor fields; passed unchanged to
+ *             trace_in_spinor_and_color3a and, with MPI, indexed as
+ *             12*s1 + 4*c1 + 2*f1 + {0,1} for the exchange calls.
+ *
+ * Outline:
+ *   1. (MPI only) call generic_exchange_direction_nonblocking on the
+ *      propagator fields for the TUP and TDOWN directions.
+ *   2. Build phimatrix[0..3] at every local site from the four components of
+ *      g_smeared_scalar_field (smearedcorrelator_BSM == 1) or g_scalar_field.
+ *   3. Copy phimatrix at local site 0 into phimatrixspatialnull and, with
+ *      MPI, broadcast the values held by rank 0 of g_cart_grid.
+ *   4. Fill sixteen per-site arrays with trace_in_spinor_and_color3a.
+ *   5. Accumulate the three combinations labelled tau_1, tau2, tau3 into
+ *      final_corr[g_coord[ix][TUP]].
+ *   6. (MPI only) sum final_corr over all processes of MPI_COMM_WORLD, then
+ *      print real and imaginary parts divided by 4.
+ *
+ * NOTE(review): the malloc results are used without a NULL check.
+ */
+void wilsoncurrent31a_petros( bispinor **propfields )
+{
+
+    _Complex double **phimatrix=(_Complex double **)malloc(sizeof(_Complex double *)*4);
+
+    // Per-site traces: the digits abcd in Cabcd are the last four arguments
+    // handed to trace_in_spinor_and_color3a further down.
+    _Complex double *C0000=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0001=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0010=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0011=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0100=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0101=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0110=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0111=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1000=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1001=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1010=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1011=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1100=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1101=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1110=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1111=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+
+    // One accumulator per global time slice.
+    _Complex double *final_corr=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+    // Only four values: kept in automatic storage so nothing has to be freed
+    // (the former heap buffer was never released).
+    _Complex double phimatrixspatialnull[4];
+
+    int ix;
+
+// Doing the necessary communication
+#if defined TM_USE_MPI
+   int s1,c1,f1;
+   int count;
+   MPI_Status  statuses[8];
+   MPI_Request *request;
+   request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
+   for (s1=0; s1<2; ++s1)
+      for (c1=0; c1<3; ++c1)
+         for (f1=0; f1<2; ++f1){
+            // Each exchange is started and then waited for before the next one begins.
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TUP   , request, &count );
+            MPI_Waitall( count, request, statuses);
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN , request, &count );
+            MPI_Waitall( count, request, statuses);
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN , request, &count );
+            MPI_Waitall( count, request, statuses);
+         }
+   free(request);
+#endif
+
+    // phimatrix[k][ix]: four complex entries per site built from the four
+    // real scalar-field components (indexed as 2*a+b in the sums below).
+    for (ix=0;ix<4;++ix)
+       phimatrix[ix]=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    for (ix=0;ix<VOLUME;++ix)
+    {
+       if (smearedcorrelator_BSM == 1){
+         phimatrix[0][ix]= 1.*g_smeared_scalar_field[0][ix] + I*g_smeared_scalar_field[3][ix];
+         phimatrix[1][ix]= 1.*g_smeared_scalar_field[2][ix] + I*g_smeared_scalar_field[1][ix];
+         phimatrix[2][ix]=-1.*g_smeared_scalar_field[2][ix] + I*g_smeared_scalar_field[1][ix];
+         phimatrix[3][ix]= 1.*g_smeared_scalar_field[0][ix] - I*g_smeared_scalar_field[3][ix];
+       }
+       else{
+         phimatrix[0][ix]= 1.*g_scalar_field[0][ix] + I*g_scalar_field[3][ix];
+         phimatrix[1][ix]= 1.*g_scalar_field[2][ix] + I*g_scalar_field[1][ix];
+         phimatrix[2][ix]=-1.*g_scalar_field[2][ix] + I*g_scalar_field[1][ix];
+         phimatrix[3][ix]= 1.*g_scalar_field[0][ix] - I*g_scalar_field[3][ix];
+       }
+    }
+
+    // Values at local site 0; with MPI every process then receives the
+    // values held by rank 0 of g_cart_grid.
+    for (ix=0;ix<4;++ix)
+       phimatrixspatialnull[ix]=phimatrix[ix][0];
+
+#if defined TM_USE_MPI
+    for (ix=0;ix<4;++ix)
+       MPI_Bcast(&phimatrixspatialnull[ix], 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+#endif
+
+    for (ix=0; ix<VOLUME; ++ix){
+       trace_in_spinor_and_color3a(C0000,propfields,ix,0,0,0,0);
+       trace_in_spinor_and_color3a(C0001,propfields,ix,0,0,0,1);
+       trace_in_spinor_and_color3a(C0010,propfields,ix,0,0,1,0);
+       trace_in_spinor_and_color3a(C0011,propfields,ix,0,0,1,1);
+       trace_in_spinor_and_color3a(C0100,propfields,ix,0,1,0,0);
+       trace_in_spinor_and_color3a(C0101,propfields,ix,0,1,0,1);
+       trace_in_spinor_and_color3a(C0110,propfields,ix,0,1,1,0);
+       trace_in_spinor_and_color3a(C0111,propfields,ix,0,1,1,1);
+       trace_in_spinor_and_color3a(C1000,propfields,ix,1,0,0,0);
+       trace_in_spinor_and_color3a(C1001,propfields,ix,1,0,0,1);
+       trace_in_spinor_and_color3a(C1010,propfields,ix,1,0,1,0);
+       trace_in_spinor_and_color3a(C1011,propfields,ix,1,0,1,1);
+       trace_in_spinor_and_color3a(C1100,propfields,ix,1,1,0,0);
+       trace_in_spinor_and_color3a(C1101,propfields,ix,1,1,0,1);
+       trace_in_spinor_and_color3a(C1110,propfields,ix,1,1,1,0);
+       trace_in_spinor_and_color3a(C1111,propfields,ix,1,1,1,1);
+
+    }
+    for (ix=0; ix<T_global; ++ix)
+       final_corr[ix]=0.;
+    // Every local site adds its contribution to the slot g_coord[ix][TUP]
+    // (presumably the global time coordinate of the site -- confirm).
+    for (ix=0; ix<VOLUME; ++ix){
+
+//tau_1
+       final_corr[g_coord[ix][TUP]]+=  1.*phimatrixspatialnull[1*2+0]*C0000[ix]*phimatrix[1*2+0][ix]
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0010[ix]*phimatrix[1*2+1][ix]
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0100[ix]*phimatrix[0*2+0][ix]
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0110[ix]*phimatrix[0*2+1][ix]
+
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1000[ix]*phimatrix[1*2+0][ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1010[ix]*phimatrix[1*2+1][ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1100[ix]*phimatrix[0*2+0][ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1110[ix]*phimatrix[0*2+1][ix]
+
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0001[ix]*phimatrix[1*2+0][ix]
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0011[ix]*phimatrix[1*2+1][ix]
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0101[ix]*phimatrix[0*2+0][ix]
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0111[ix]*phimatrix[0*2+1][ix]
+
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1001[ix]*phimatrix[1*2+0][ix]
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1011[ix]*phimatrix[1*2+1][ix]
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1101[ix]*phimatrix[0*2+0][ix]
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1111[ix]*phimatrix[0*2+1][ix];
+
+//tau2
+       final_corr[g_coord[ix][TUP]]+= -1.*phimatrixspatialnull[1*2+0]*C0000[ix]*phimatrix[1*2+0][ix]
+                                     +-1.*phimatrixspatialnull[1*2+0]*C0010[ix]*phimatrix[1*2+1][ix]
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0100[ix]*phimatrix[0*2+0][ix]
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0110[ix]*phimatrix[0*2+1][ix]
+
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1000[ix]*phimatrix[1*2+0][ix]
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1010[ix]*phimatrix[1*2+1][ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1100[ix]*phimatrix[0*2+0][ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1110[ix]*phimatrix[0*2+1][ix]
+
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0001[ix]*phimatrix[1*2+0][ix]
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0011[ix]*phimatrix[1*2+1][ix]
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0101[ix]*phimatrix[0*2+0][ix]
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0111[ix]*phimatrix[0*2+1][ix]
+
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1001[ix]*phimatrix[1*2+0][ix]
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1011[ix]*phimatrix[1*2+1][ix]
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1101[ix]*phimatrix[0*2+0][ix]
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1111[ix]*phimatrix[0*2+1][ix];
+//tau3
+       final_corr[g_coord[ix][TUP]]+=  1.*phimatrixspatialnull[0*2+0]*C0000[ix]*phimatrix[0*2+0][ix]
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0010[ix]*phimatrix[0*2+1][ix]
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0100[ix]*phimatrix[1*2+0][ix]
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0110[ix]*phimatrix[1*2+1][ix]
+
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1000[ix]*phimatrix[0*2+0][ix]
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1010[ix]*phimatrix[0*2+1][ix]
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1100[ix]*phimatrix[1*2+0][ix]
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1110[ix]*phimatrix[1*2+1][ix]
+
+                                     +-1.*phimatrixspatialnull[1*2+0]*C0001[ix]*phimatrix[0*2+0][ix]
+                                     +-1.*phimatrixspatialnull[1*2+0]*C0011[ix]*phimatrix[0*2+1][ix]
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0101[ix]*phimatrix[1*2+0][ix]
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0111[ix]*phimatrix[1*2+1][ix]
+
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1001[ix]*phimatrix[0*2+0][ix]
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1011[ix]*phimatrix[0*2+1][ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1101[ix]*phimatrix[1*2+0][ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1111[ix]*phimatrix[1*2+1][ix];
+
+    }
+#if defined TM_USE_MPI
+    // Sum the per-process partial results, one time slice at a time.
+    for (ix=0; ix<T_global; ++ix){
+       _Complex double tmp;
+       MPI_Allreduce(&final_corr[ix], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+       final_corr[ix]= tmp;
+    }
+#endif
+    // Only the process with g_cart_id == 0 prints; values are divided by 4.
+    if (g_cart_id == 0){
+      printf("Wilson current  Density correlator type a la Petros (1) results\n");
+      for (ix=0; ix<T_global; ++ix){
+        printf("WCDPR1 0 0 %.3d %10.10e %10.10e\n", ix, creal(final_corr[ix])/4.,cimag(final_corr[ix])/4.);
+      }
+    }
+
+
+    free(C0000);
+    free(C0001);
+    free(C0010);
+    free(C0011);
+    free(C0100);
+    free(C0101);
+    free(C0110);
+    free(C0111);
+    free(C1000);
+    free(C1001);
+    free(C1010);
+    free(C1011);
+    free(C1100);
+    free(C1101);
+    free(C1110);
+    free(C1111);
+
+    for (ix=0;ix<4;++ix)
+       free(phimatrix[ix]);
+    free(phimatrix);
+    free(final_corr);
+
+}
+
+
+/*
+ * Computes a correlator per global time slice from the propagator fields and
+ * the scalar field, and prints it on the process with g_cart_id == 0 under
+ * the tag "DD 1".
+ *
+ * propfields: array of bispinor fields; passed unchanged to
+ *             trace_in_spinor_and_color.
+ *
+ * Outline:
+ *   1. Build phimatrix[0..3] at every local site from the four components of
+ *      g_smeared_scalar_field (smearedcorrelator_BSM == 1) or g_scalar_field.
+ *   2. Copy phimatrix at local site 0 into phimatrixspatialnull and, with
+ *      MPI, broadcast the values held by rank 0 of g_cart_grid.
+ *   3. Fill sixteen per-site arrays with trace_in_spinor_and_color.
+ *   4. Accumulate the three combinations labelled tau_1, tau_2, tau3 into
+ *      final_corr[g_coord[ix][TUP]]; each term multiplies a site-0 matrix
+ *      entry, a trace and the complex conjugate of a per-site matrix entry.
+ *   5. (MPI only) sum final_corr over all processes of MPI_COMM_WORLD, then
+ *      print real and imaginary parts divided by 4.
+ *
+ * NOTE(review): the malloc results are used without a NULL check.
+ */
+void density_density_1234_petros( bispinor **propfields )
+{
+
+    _Complex double **phimatrix=(_Complex double **)malloc(sizeof(_Complex double *)*4);
+
+    // Per-site traces: the digits abcd in Cabcd are the last four arguments
+    // handed to trace_in_spinor_and_color further down.
+    _Complex double *C0000=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0001=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0010=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0011=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0100=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0101=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0110=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0111=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1000=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1001=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1010=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1011=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1100=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1101=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1110=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1111=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+
+    // One accumulator per global time slice.
+    _Complex double *final_corr=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+    // Only four values: kept in automatic storage so nothing has to be freed
+    // (the former heap buffer was never released).
+    _Complex double phimatrixspatialnull[4];
+
+    int ix;
+
+
+    // phimatrix[k][ix]: four complex entries per site built from the four
+    // real scalar-field components (indexed as 2*a+b in the sums below).
+    for (ix=0;ix<4;++ix)
+       phimatrix[ix]=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    for (ix=0;ix<VOLUME;++ix)
+    {
+       if (smearedcorrelator_BSM == 1){
+         phimatrix[0][ix]= 1.*g_smeared_scalar_field[0][ix] + I*g_smeared_scalar_field[3][ix];
+         phimatrix[1][ix]= 1.*g_smeared_scalar_field[2][ix] + I*g_smeared_scalar_field[1][ix];
+         phimatrix[2][ix]=-1.*g_smeared_scalar_field[2][ix] + I*g_smeared_scalar_field[1][ix];
+         phimatrix[3][ix]= 1.*g_smeared_scalar_field[0][ix] - I*g_smeared_scalar_field[3][ix];
+       }
+       else{
+         phimatrix[0][ix]= 1.*g_scalar_field[0][ix] + I*g_scalar_field[3][ix];
+         phimatrix[1][ix]= 1.*g_scalar_field[2][ix] + I*g_scalar_field[1][ix];
+         phimatrix[2][ix]=-1.*g_scalar_field[2][ix] + I*g_scalar_field[1][ix];
+         phimatrix[3][ix]= 1.*g_scalar_field[0][ix] - I*g_scalar_field[3][ix];
+       }
+    }
+
+    // Values at local site 0; with MPI every process then receives the
+    // values held by rank 0 of g_cart_grid.
+    for (ix=0;ix<4;++ix)
+       phimatrixspatialnull[ix]=phimatrix[ix][0];
+
+#if defined TM_USE_MPI
+    for (ix=0;ix<4;++ix){
+       MPI_Bcast(&phimatrixspatialnull[ix], 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+    }
+#endif
+
+    for (ix=0; ix<VOLUME; ++ix){
+       trace_in_spinor_and_color(C0000,propfields,ix,0,0,0,0);
+       trace_in_spinor_and_color(C0001,propfields,ix,0,0,0,1);
+       trace_in_spinor_and_color(C0010,propfields,ix,0,0,1,0);
+       trace_in_spinor_and_color(C0011,propfields,ix,0,0,1,1);
+       trace_in_spinor_and_color(C0100,propfields,ix,0,1,0,0);
+       trace_in_spinor_and_color(C0101,propfields,ix,0,1,0,1);
+       trace_in_spinor_and_color(C0110,propfields,ix,0,1,1,0);
+       trace_in_spinor_and_color(C0111,propfields,ix,0,1,1,1);
+       trace_in_spinor_and_color(C1000,propfields,ix,1,0,0,0);
+       trace_in_spinor_and_color(C1001,propfields,ix,1,0,0,1);
+       trace_in_spinor_and_color(C1010,propfields,ix,1,0,1,0);
+       trace_in_spinor_and_color(C1011,propfields,ix,1,0,1,1);
+       trace_in_spinor_and_color(C1100,propfields,ix,1,1,0,0);
+       trace_in_spinor_and_color(C1101,propfields,ix,1,1,0,1);
+       trace_in_spinor_and_color(C1110,propfields,ix,1,1,1,0);
+       trace_in_spinor_and_color(C1111,propfields,ix,1,1,1,1);
+
+    }
+    for (ix=0; ix<T_global; ++ix)
+       final_corr[ix]=0.;
+    // Every local site adds its contribution to the slot g_coord[ix][TUP]
+    // (presumably the global time coordinate of the site -- confirm).
+    for (ix=0; ix<VOLUME; ++ix){
+
+//tau_1
+       final_corr[g_coord[ix][TUP]]+=  1.*phimatrixspatialnull[1*2+0]*C0010[ix]*conj(phimatrix[0*2+0][ix])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0000[ix]*conj(phimatrix[1*2+0][ix])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0110[ix]*conj(phimatrix[0*2+1][ix])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0100[ix]*conj(phimatrix[1*2+1][ix])
+
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1010[ix]*conj(phimatrix[0*2+0][ix])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1000[ix]*conj(phimatrix[1*2+0][ix])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1110[ix]*conj(phimatrix[0*2+1][ix])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1100[ix]*conj(phimatrix[1*2+1][ix])
+
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0011[ix]*conj(phimatrix[0*2+0][ix])
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0001[ix]*conj(phimatrix[1*2+0][ix])
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0111[ix]*conj(phimatrix[0*2+1][ix])
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0101[ix]*conj(phimatrix[1*2+1][ix])
+
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1011[ix]*conj(phimatrix[0*2+0][ix])
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1001[ix]*conj(phimatrix[1*2+0][ix])
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1111[ix]*conj(phimatrix[0*2+1][ix])
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1101[ix]*conj(phimatrix[1*2+1][ix]);
+
+//tau_2
+       final_corr[g_coord[ix][TUP]]+= -1.*phimatrixspatialnull[1*2+0]*C0010[ix]*conj(phimatrix[0*2+0][ix])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0000[ix]*conj(phimatrix[1*2+0][ix])
+                                     +-1.*phimatrixspatialnull[1*2+0]*C0110[ix]*conj(phimatrix[0*2+1][ix])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0100[ix]*conj(phimatrix[1*2+1][ix])
+
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1010[ix]*conj(phimatrix[0*2+0][ix])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1000[ix]*conj(phimatrix[1*2+0][ix])
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1110[ix]*conj(phimatrix[0*2+1][ix])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1100[ix]*conj(phimatrix[1*2+1][ix])
+
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0011[ix]*conj(phimatrix[0*2+0][ix])
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0001[ix]*conj(phimatrix[1*2+0][ix])
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0111[ix]*conj(phimatrix[0*2+1][ix])
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0101[ix]*conj(phimatrix[1*2+1][ix])
+
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1011[ix]*conj(phimatrix[0*2+0][ix])
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1001[ix]*conj(phimatrix[1*2+0][ix])
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1111[ix]*conj(phimatrix[0*2+1][ix])
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1101[ix]*conj(phimatrix[1*2+1][ix]);
+
+//tau3
+       final_corr[g_coord[ix][TUP]]+=  1.*phimatrixspatialnull[0*2+0]*C0000[ix]*conj(phimatrix[0*2+0][ix])
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0010[ix]*conj(phimatrix[1*2+0][ix])
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0100[ix]*conj(phimatrix[0*2+1][ix])
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0110[ix]*conj(phimatrix[1*2+1][ix])
+
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1000[ix]*conj(phimatrix[0*2+0][ix])
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1010[ix]*conj(phimatrix[1*2+0][ix])
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1100[ix]*conj(phimatrix[0*2+1][ix])
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1110[ix]*conj(phimatrix[1*2+1][ix])
+
+                                     +-1.*phimatrixspatialnull[1*2+0]*C0001[ix]*conj(phimatrix[0*2+0][ix])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0011[ix]*conj(phimatrix[1*2+0][ix])
+                                     +-1.*phimatrixspatialnull[1*2+0]*C0101[ix]*conj(phimatrix[0*2+1][ix])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0111[ix]*conj(phimatrix[1*2+1][ix])
+
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1001[ix]*conj(phimatrix[0*2+0][ix])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1011[ix]*conj(phimatrix[1*2+0][ix])
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1101[ix]*conj(phimatrix[0*2+1][ix])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1111[ix]*conj(phimatrix[1*2+1][ix]);
+
+    }
+#if defined TM_USE_MPI
+    // Sum the per-process partial results, one time slice at a time.
+    for (ix=0; ix<T_global; ++ix){
+       _Complex double tmp;
+       MPI_Allreduce(&final_corr[ix], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+       final_corr[ix]= tmp;
+    }
+#endif
+    // Only the process with g_cart_id == 0 prints; values are divided by 4.
+    if (g_cart_id == 0){
+      printf("Density Density correlator type a la Petros (1) results\n");
+      for (ix=0; ix<T_global; ++ix){
+        printf("DD 1 %.3d %10.10e %10.10e\n", ix, creal(final_corr[ix])/4.,cimag(final_corr[ix])/4.);
+      }
+    }
+
+
+    free(C0000);
+    free(C0001);
+    free(C0010);
+    free(C0011);
+    free(C0100);
+    free(C0101);
+    free(C0110);
+    free(C0111);
+    free(C1000);
+    free(C1001);
+    free(C1010);
+    free(C1011);
+    free(C1100);
+    free(C1101);
+    free(C1110);
+    free(C1111);
+
+    for (ix=0;ix<4;++ix)
+       free(phimatrix[ix]);
+    free(phimatrix);
+    free(final_corr);
+
+}
+
+
+
+/*
+ * Computes a correlator per global time slice from the propagator fields and
+ * the scalar field at local site 0, and prints it on the process with
+ * g_cart_id == 0 under the tag "DCD 0 0".
+ *
+ * propfields: array of bispinor fields; passed unchanged to
+ *             trace_in_spinor_and_color1a.
+ *
+ * Outline:
+ *   1. Build the four complex entries phimatrixspatialnull[0..3] from the
+ *      components of g_smeared_scalar_field (smearedcorrelator_BSM == 1) or
+ *      g_scalar_field at local site 0 and, with MPI, broadcast the values
+ *      held by rank 0 of g_cart_grid.
+ *   2. Fill sixteen per-site arrays with trace_in_spinor_and_color1a.
+ *   3. Accumulate the three combinations labelled tau_1, tau_2, tau_3 into
+ *      final_corr[g_coord[ix][TUP]].
+ *   4. (MPI only) sum final_corr over all processes of MPI_COMM_WORLD, then
+ *      print real and imaginary parts divided by 4.
+ *
+ * NOTE(review): the malloc results are used without a NULL check.
+ */
+void diraccurrent1a_petros( bispinor **propfields )
+{
+
+    // Per-site traces: the digits abcd in Cabcd are the last four arguments
+    // handed to trace_in_spinor_and_color1a further down.
+    _Complex double *C0000=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0001=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0010=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0011=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0100=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0101=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0110=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0111=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1000=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1001=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1010=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1011=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1100=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1101=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1110=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1111=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+
+    // One accumulator per global time slice.
+    _Complex double *final_corr=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+    // Only four values: kept in automatic storage so nothing has to be freed
+    // (the former heap buffer was never released).
+    _Complex double phimatrixspatialnull[4];
+
+    int ix;
+
+
+    // The sums below use the scalar-field matrix at local site 0 only, so the
+    // four entries are formed directly instead of filling a per-site table
+    // that would be read at index 0 and then discarded.
+    if (smearedcorrelator_BSM == 1){
+      phimatrixspatialnull[0]= 1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+      phimatrixspatialnull[1]= 1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+      phimatrixspatialnull[2]=-1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+      phimatrixspatialnull[3]= 1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+    }
+    else{
+      phimatrixspatialnull[0]= 1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+      phimatrixspatialnull[1]= 1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+      phimatrixspatialnull[2]=-1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+      phimatrixspatialnull[3]= 1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+    }
+
+#if defined TM_USE_MPI
+    // Every process receives the values held by rank 0 of g_cart_grid.
+    for (ix=0;ix<4;++ix){
+       MPI_Bcast(&phimatrixspatialnull[ix], 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+    }
+#endif
+
+    for (ix=0; ix<VOLUME; ++ix){
+       trace_in_spinor_and_color1a(C0000,propfields,ix,0,0,0,0);
+       trace_in_spinor_and_color1a(C0001,propfields,ix,0,0,0,1);
+       trace_in_spinor_and_color1a(C0010,propfields,ix,0,0,1,0);
+       trace_in_spinor_and_color1a(C0011,propfields,ix,0,0,1,1);
+       trace_in_spinor_and_color1a(C0100,propfields,ix,0,1,0,0);
+       trace_in_spinor_and_color1a(C0101,propfields,ix,0,1,0,1);
+       trace_in_spinor_and_color1a(C0110,propfields,ix,0,1,1,0);
+       trace_in_spinor_and_color1a(C0111,propfields,ix,0,1,1,1);
+       trace_in_spinor_and_color1a(C1000,propfields,ix,1,0,0,0);
+       trace_in_spinor_and_color1a(C1001,propfields,ix,1,0,0,1);
+       trace_in_spinor_and_color1a(C1010,propfields,ix,1,0,1,0);
+       trace_in_spinor_and_color1a(C1011,propfields,ix,1,0,1,1);
+       trace_in_spinor_and_color1a(C1100,propfields,ix,1,1,0,0);
+       trace_in_spinor_and_color1a(C1101,propfields,ix,1,1,0,1);
+       trace_in_spinor_and_color1a(C1110,propfields,ix,1,1,1,0);
+       trace_in_spinor_and_color1a(C1111,propfields,ix,1,1,1,1);
+    }
+    for (ix=0; ix<T_global; ++ix)
+       final_corr[ix]=0.;
+    // Every local site adds its contribution to the slot g_coord[ix][TUP]
+    // (presumably the global time coordinate of the site -- confirm).
+    for (ix=0; ix<VOLUME; ++ix){
+
+//tau_1
+       final_corr[g_coord[ix][TUP]]+=  1.*phimatrixspatialnull[1*2+0]*C0010[ix]
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0100[ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1010[ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1100[ix]
+
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0011[ix]
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0101[ix]
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1011[ix]
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1101[ix];
+
+//tau_2
+       final_corr[g_coord[ix][TUP]]+= -1.*phimatrixspatialnull[1*2+0]*C0010[ix]
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0100[ix]
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1010[ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1100[ix]
+
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0011[ix]
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0101[ix]
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1011[ix]
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1101[ix];
+//tau_3
+       final_corr[g_coord[ix][TUP]]+=  1.*phimatrixspatialnull[0*2+0]*C0000[ix]
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0110[ix]
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1000[ix]
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1110[ix]
+
+                                     +-1.*phimatrixspatialnull[1*2+0]*C0001[ix]
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0111[ix]
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1001[ix]
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1111[ix];
+
+    }
+#if defined TM_USE_MPI
+    // Sum the per-process partial results, one time slice at a time.
+    for (ix=0; ix<T_global; ++ix){
+       _Complex double tmp;
+       MPI_Allreduce(&final_corr[ix], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+       final_corr[ix]= tmp;
+    }
+#endif
+    // Only the process with g_cart_id == 0 prints; values are divided by 4.
+    if (g_cart_id == 0){
+      printf("Dirac Current Density correlator type a la Petros (1) results\n");
+      for (ix=0; ix<T_global; ++ix){
+        printf("DCD 0 0  %.3d %10.10e %10.10e\n", ix, creal(final_corr[ix])/4.,cimag(final_corr[ix])/4.);
+      }
+    }
+
+
+    free(C0000);
+    free(C0001);
+    free(C0010);
+    free(C0011);
+    free(C0100);
+    free(C0101);
+    free(C0110);
+    free(C0111);
+    free(C1000);
+    free(C1001);
+    free(C1010);
+    free(C1011);
+    free(C1100);
+    free(C1101);
+    free(C1110);
+    free(C1111);
+
+    free(final_corr);
+
+}
+
+void wilsoncurrent61a_petros( bispinor **propfields )
+{
+#if defined TM_USE_MPI
+    int count;
+    MPI_Status  statuses[8];
+    MPI_Request *request;
+    request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
+#endif
+
+    _Complex double **phimatrix=(_Complex double **)malloc(sizeof(_Complex double *)*4);
+
+    _Complex double *C0000=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0001=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0010=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0011=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0100=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0101=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0110=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C0111=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1000=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1001=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1010=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1011=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1100=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1101=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1110=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+    _Complex double *C1111=(_Complex double *)malloc(sizeof(_Complex double)*VOLUME);
+
+    _Complex double *final_corr=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+    _Complex double *phimatrixspatialnull=(_Complex double *)malloc(sizeof(_Complex double)*4);
+
+    int ix;
+
+
+    for (ix=0;ix<4;++ix)
+       phimatrix[ix]=(_Complex double *)malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    for (ix=0;ix<VOLUME;++ix)
+    {
+       if (smearedcorrelator_BSM == 1){
+         phimatrix[0][ix]= 1.*g_smeared_scalar_field[0][ix] + I*g_smeared_scalar_field[3][ix];
+         phimatrix[1][ix]= 1.*g_smeared_scalar_field[2][ix] + I*g_smeared_scalar_field[1][ix];
+         phimatrix[2][ix]=-1.*g_smeared_scalar_field[2][ix] + I*g_smeared_scalar_field[1][ix];
+         phimatrix[3][ix]= 1.*g_smeared_scalar_field[0][ix] - I*g_smeared_scalar_field[3][ix];
+       }
+       else{
+         phimatrix[0][ix]= 1.*g_scalar_field[0][ix] + I*g_scalar_field[3][ix];
+         phimatrix[1][ix]= 1.*g_scalar_field[2][ix] + I*g_scalar_field[1][ix];
+         phimatrix[2][ix]=-1.*g_scalar_field[2][ix] + I*g_scalar_field[1][ix];
+         phimatrix[3][ix]= 1.*g_scalar_field[0][ix] - I*g_scalar_field[3][ix];
+       }
+    }
+
+    for (ix=0;ix<4;++ix)
+       phimatrixspatialnull[ix]=phimatrix[ix][0];
+
+#if defined TM_USE_MPI
+    for (ix=0; ix<4; ++ix){
+      count=0;
+      generic_exchange_direction_nonblocking( phimatrix[ix], sizeof(_Complex double), TDOWN   , request, &count );
+      MPI_Waitall( count, request, statuses);
+      count=0;
+    }
+    for (ix=0;ix<4;++ix){
+       MPI_Bcast(&phimatrixspatialnull[ix], 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+    }
+#endif
+
+    for (ix=0; ix<VOLUME; ++ix){
+       trace_in_spinor_and_color(C0000,propfields,ix,0,0,0,0);
+       trace_in_spinor_and_color(C0001,propfields,ix,0,0,0,1);
+       trace_in_spinor_and_color(C0010,propfields,ix,0,0,1,0);
+       trace_in_spinor_and_color(C0011,propfields,ix,0,0,1,1);
+       trace_in_spinor_and_color(C0100,propfields,ix,0,1,0,0);
+       trace_in_spinor_and_color(C0101,propfields,ix,0,1,0,1);
+       trace_in_spinor_and_color(C0110,propfields,ix,0,1,1,0);
+       trace_in_spinor_and_color(C0111,propfields,ix,0,1,1,1);
+       trace_in_spinor_and_color(C1000,propfields,ix,1,0,0,0);
+       trace_in_spinor_and_color(C1001,propfields,ix,1,0,0,1);
+       trace_in_spinor_and_color(C1010,propfields,ix,1,0,1,0);
+       trace_in_spinor_and_color(C1011,propfields,ix,1,0,1,1);
+       trace_in_spinor_and_color(C1100,propfields,ix,1,1,0,0);
+       trace_in_spinor_and_color(C1101,propfields,ix,1,1,0,1);
+       trace_in_spinor_and_color(C1110,propfields,ix,1,1,1,0);
+       trace_in_spinor_and_color(C1111,propfields,ix,1,1,1,1);
+
+    }
+    for (ix=0; ix<T_global; ++ix)
+       final_corr[ix]=0.;
+    for (ix=0; ix<VOLUME; ++ix){
+
+//tau_1
+       final_corr[g_coord[ix][TUP]]+=  1.*phimatrixspatialnull[1*2+0]*C0010[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0000[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0110[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0100[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]])
+
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1010[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1000[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1110[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1100[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]])
+
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0011[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0001[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0111[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0101[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]])
+
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1011[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1001[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1111[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1101[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]]);
+
+//tau_2
+       final_corr[g_coord[ix][TUP]]+= -1.*phimatrixspatialnull[1*2+0]*C0010[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0000[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[1*2+0]*C0110[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0100[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]])
+
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1010[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1000[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1110[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1100[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]])
+
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0011[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0001[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0111[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0101[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]])
+
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1011[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1001[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1111[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1101[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]]);
+
+//tau3
+       final_corr[g_coord[ix][TUP]]+=  1.*phimatrixspatialnull[0*2+0]*C0000[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0010[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[0*2+0]*C0100[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[0*2+0]*C0110[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]])
+
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1000[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1010[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[0*2+1]*C1100[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[0*2+1]*C1110[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]])
+
+                                     +-1.*phimatrixspatialnull[1*2+0]*C0001[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0011[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[1*2+0]*C0101[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+0]*C0111[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]])
+
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1001[ix]*conj(phimatrix[0*2+0][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1011[ix]*conj(phimatrix[1*2+0][g_idn[ix][TUP]])
+                                     +-1.*phimatrixspatialnull[1*2+1]*C1101[ix]*conj(phimatrix[0*2+1][g_idn[ix][TUP]])
+                                     + 1.*phimatrixspatialnull[1*2+1]*C1111[ix]*conj(phimatrix[1*2+1][g_idn[ix][TUP]]);
+
+    }
+#if defined TM_USE_MPI
+    for (ix=0; ix<T_global; ++ix){
+       _Complex double tmp;
+       MPI_Allreduce(&final_corr[ix], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+       final_corr[ix]= tmp;
+    }
+#endif
+    if (g_cart_id == 0){printf("Wilson Current Density correlator type 61a a la Petros (1) results\n");}
+      for (ix=0; ix<T_global; ++ix){
+        if (g_cart_id == 0){
+        printf("WCDPL2 1 %.3d %10.10e %10.10e\n", ix, creal(final_corr[ix])/4.,cimag(final_corr[ix])/4.);
+      }
+    }
+
+
+    free(C0000);
+    free(C0001);
+    free(C0010);
+    free(C0011);
+    free(C0100);
+    free(C0101);
+    free(C0110);
+    free(C0111);
+    free(C1000);
+    free(C1001);
+    free(C1010);
+    free(C1011);
+    free(C1100);
+    free(C1101);
+    free(C1110);
+    free(C1111);
+
+    for (ix=0;ix<4;++ix)
+       free(phimatrix[ix]);
+    free(phimatrix);
+    free(final_corr);
+#if defined TM_USE_MPI
+    free(request);
+#endif
+
+}
+
+
+/*
+ * wilsoncurrent62a_petros - cross-check of the "Wilson current density, type 62a" correlator.
+ *
+ * propfields: propagator fields handed unchanged to trace_in_spinor_and_color62a();
+ *             with TM_USE_MPI the fields 12*s1+4*c1+2*f1+{0,1} (s1<2, c1<3, f1<2)
+ *             are first passed to generic_exchange_direction_nonblocking() (TUP resp. TDOWN).
+ *
+ * Steps:
+ *  1. build the four entries of the 2x2 scalar matrix phi(x) from g_scalar_field
+ *     (or g_smeared_scalar_field when smearedcorrelator_BSM == 1);
+ *  2. remember phi at local site index 0 (with MPI: the values of rank 0 of g_cart_grid, broadcast);
+ *  3. fill the 16 fields C_abcd(x) with trace_in_spinor_and_color62a();
+ *  4. for every local site x accumulate the tau_1, tau_2 and tau_3 combinations of
+ *     phi(0) * C_abcd(x') * conj(phi(x')), x' = g_idn[x][TUP], into final_corr[g_coord[x][TUP]];
+ *  5. sum over processes (MPI only) and let the process with g_cart_id == 0 print
+ *     one "WCDPL2 1 t Re Im" line per time slice, values divided by 4.
+ *
+ * Terminates with exit(1) if a work buffer cannot be allocated.
+ */
+void wilsoncurrent62a_petros( bispinor **propfields )
+{
+#if defined TM_USE_MPI
+    int count;
+    int s1,c1,f1;
+    MPI_Status  statuses[8];
+    MPI_Request request[8];   /* fixed size: automatic storage, nothing to free */
+#endif
+    int ix;
+    int alloc_failed=0;
+
+    /* Small fixed-size scratch lives on the stack; the original heap copy of
+       phimatrixspatialnull was never freed. */
+    _Complex double *phimatrix[4]={NULL, NULL, NULL, NULL};
+    _Complex double phimatrixspatialnull[4];
+
+    _Complex double *C0000=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C0001=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C0010=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C0011=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C0100=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C0101=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C0110=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C0111=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C1000=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C1001=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C1010=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C1011=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C1100=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C1101=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C1110=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+    _Complex double *C1111=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+
+    /* Same 16 buffers in the order C0000 ... C1111, for the checks, exchanges and frees below. */
+    _Complex double *const traces[16]={ C0000, C0001, C0010, C0011, C0100, C0101, C0110, C0111,
+                                        C1000, C1001, C1010, C1011, C1100, C1101, C1110, C1111 };
+
+    _Complex double *final_corr=malloc(sizeof(_Complex double)*T_global);
+
+    for (ix=0;ix<4;++ix){
+       phimatrix[ix]=malloc(sizeof(_Complex double)*VOLUMEPLUSRAND);
+       if (phimatrix[ix] == NULL)
+          alloc_failed=1;
+    }
+    for (ix=0;ix<16;++ix){
+       if (traces[ix] == NULL)
+          alloc_failed=1;
+    }
+    if (final_corr == NULL)
+       alloc_failed=1;
+    if (alloc_failed){
+       printf("Error in mem allocation in wilsoncurrent62a_petros\n");
+       exit(1);
+    }
+
+    /* Entries of the 2x2 scalar matrix, stored as phimatrix[2*i+j][site]. */
+    for (ix=0;ix<VOLUME;++ix)
+    {
+       if (smearedcorrelator_BSM == 1){
+         phimatrix[0][ix]= 1.*g_smeared_scalar_field[0][ix] + I*g_smeared_scalar_field[3][ix];
+         phimatrix[1][ix]= 1.*g_smeared_scalar_field[2][ix] + I*g_smeared_scalar_field[1][ix];
+         phimatrix[2][ix]=-1.*g_smeared_scalar_field[2][ix] + I*g_smeared_scalar_field[1][ix];
+         phimatrix[3][ix]= 1.*g_smeared_scalar_field[0][ix] - I*g_smeared_scalar_field[3][ix];
+       }
+       else{
+         phimatrix[0][ix]= 1.*g_scalar_field[0][ix] + I*g_scalar_field[3][ix];
+         phimatrix[1][ix]= 1.*g_scalar_field[2][ix] + I*g_scalar_field[1][ix];
+         phimatrix[2][ix]=-1.*g_scalar_field[2][ix] + I*g_scalar_field[1][ix];
+         phimatrix[3][ix]= 1.*g_scalar_field[0][ix] - I*g_scalar_field[3][ix];
+       }
+    }
+
+    /* phi at local site index 0; with MPI every process takes the values of rank 0 (see MPI_Bcast below). */
+    for (ix=0;ix<4;++ix)
+       phimatrixspatialnull[ix]=phimatrix[ix][0];
+
+#if defined TM_USE_MPI
+    for (ix=0; ix<4; ++ix){
+      count=0;
+      generic_exchange_direction_nonblocking( phimatrix[ix], sizeof(_Complex double), TDOWN   , request, &count );
+      MPI_Waitall( count, request, statuses);
+    }
+    for (ix=0;ix<4;++ix){
+       MPI_Bcast(&phimatrixspatialnull[ix], 1, MPI_DOUBLE_COMPLEX, 0, g_cart_grid);
+    }
+    for (s1=0; s1<2; ++s1)
+       for (c1=0; c1<3; ++c1)
+          for (f1=0; f1<2; ++f1){
+             count=0;
+             generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TUP   , request, &count );
+             MPI_Waitall( count, request, statuses);
+             count=0;
+             generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN , request, &count );
+             MPI_Waitall( count, request, statuses);
+          }
+#endif
+
+    for (ix=0; ix<VOLUME; ++ix){
+       trace_in_spinor_and_color62a(C0000,propfields,ix,0,0,0,0);
+       trace_in_spinor_and_color62a(C0001,propfields,ix,0,0,0,1);
+       trace_in_spinor_and_color62a(C0010,propfields,ix,0,0,1,0);
+       trace_in_spinor_and_color62a(C0011,propfields,ix,0,0,1,1);
+       trace_in_spinor_and_color62a(C0100,propfields,ix,0,1,0,0);
+       trace_in_spinor_and_color62a(C0101,propfields,ix,0,1,0,1);
+       trace_in_spinor_and_color62a(C0110,propfields,ix,0,1,1,0);
+       trace_in_spinor_and_color62a(C0111,propfields,ix,0,1,1,1);
+       trace_in_spinor_and_color62a(C1000,propfields,ix,1,0,0,0);
+       trace_in_spinor_and_color62a(C1001,propfields,ix,1,0,0,1);
+       trace_in_spinor_and_color62a(C1010,propfields,ix,1,0,1,0);
+       trace_in_spinor_and_color62a(C1011,propfields,ix,1,0,1,1);
+       trace_in_spinor_and_color62a(C1100,propfields,ix,1,1,0,0);
+       trace_in_spinor_and_color62a(C1101,propfields,ix,1,1,0,1);
+       trace_in_spinor_and_color62a(C1110,propfields,ix,1,1,1,0);
+       trace_in_spinor_and_color62a(C1111,propfields,ix,1,1,1,1);
+    }
+
+#if defined TM_USE_MPI
+    /* The sums below read C_abcd at g_idn[ix][TUP], so each trace field is exchanged
+       in direction TDOWN first (same order as the former 16 copy-pasted calls). */
+    for (ix=0; ix<16; ++ix){
+      count=0;
+      generic_exchange_direction_nonblocking( traces[ix], sizeof(_Complex double), TDOWN   , request, &count );
+      MPI_Waitall( count, request, statuses);
+    }
+#endif
+
+    for (ix=0; ix<T_global; ++ix)
+       final_corr[ix]=0.;
+    for (ix=0; ix<VOLUME; ++ix){
+       /* Looked up once per site instead of once per term; the terms and their order are unchanged. */
+       const int t  =g_coord[ix][TUP];
+       const int xdn=g_idn[ix][TUP];
+
+//tau_1
+       final_corr[t]+=  1.*phimatrixspatialnull[1*2+0]*C0010[xdn]*conj(phimatrix[0*2+0][xdn])
+                      + 1.*phimatrixspatialnull[1*2+0]*C0000[xdn]*conj(phimatrix[1*2+0][xdn])
+                      + 1.*phimatrixspatialnull[1*2+0]*C0110[xdn]*conj(phimatrix[0*2+1][xdn])
+                      + 1.*phimatrixspatialnull[1*2+0]*C0100[xdn]*conj(phimatrix[1*2+1][xdn])
+
+                      + 1.*phimatrixspatialnull[1*2+1]*C1010[xdn]*conj(phimatrix[0*2+0][xdn])
+                      + 1.*phimatrixspatialnull[1*2+1]*C1000[xdn]*conj(phimatrix[1*2+0][xdn])
+                      + 1.*phimatrixspatialnull[1*2+1]*C1110[xdn]*conj(phimatrix[0*2+1][xdn])
+                      + 1.*phimatrixspatialnull[1*2+1]*C1100[xdn]*conj(phimatrix[1*2+1][xdn])
+
+                      + 1.*phimatrixspatialnull[0*2+0]*C0011[xdn]*conj(phimatrix[0*2+0][xdn])
+                      + 1.*phimatrixspatialnull[0*2+0]*C0001[xdn]*conj(phimatrix[1*2+0][xdn])
+                      + 1.*phimatrixspatialnull[0*2+0]*C0111[xdn]*conj(phimatrix[0*2+1][xdn])
+                      + 1.*phimatrixspatialnull[0*2+0]*C0101[xdn]*conj(phimatrix[1*2+1][xdn])
+
+                      + 1.*phimatrixspatialnull[0*2+1]*C1011[xdn]*conj(phimatrix[0*2+0][xdn])
+                      + 1.*phimatrixspatialnull[0*2+1]*C1001[xdn]*conj(phimatrix[1*2+0][xdn])
+                      + 1.*phimatrixspatialnull[0*2+1]*C1111[xdn]*conj(phimatrix[0*2+1][xdn])
+                      + 1.*phimatrixspatialnull[0*2+1]*C1101[xdn]*conj(phimatrix[1*2+1][xdn]);
+
+//tau_2
+       final_corr[t]+= -1.*phimatrixspatialnull[1*2+0]*C0010[xdn]*conj(phimatrix[0*2+0][xdn])
+                      + 1.*phimatrixspatialnull[1*2+0]*C0000[xdn]*conj(phimatrix[1*2+0][xdn])
+                      +-1.*phimatrixspatialnull[1*2+0]*C0110[xdn]*conj(phimatrix[0*2+1][xdn])
+                      + 1.*phimatrixspatialnull[1*2+0]*C0100[xdn]*conj(phimatrix[1*2+1][xdn])
+
+                      +-1.*phimatrixspatialnull[1*2+1]*C1010[xdn]*conj(phimatrix[0*2+0][xdn])
+                      + 1.*phimatrixspatialnull[1*2+1]*C1000[xdn]*conj(phimatrix[1*2+0][xdn])
+                      +-1.*phimatrixspatialnull[1*2+1]*C1110[xdn]*conj(phimatrix[0*2+1][xdn])
+                      + 1.*phimatrixspatialnull[1*2+1]*C1100[xdn]*conj(phimatrix[1*2+1][xdn])
+
+                      + 1.*phimatrixspatialnull[0*2+0]*C0011[xdn]*conj(phimatrix[0*2+0][xdn])
+                      +-1.*phimatrixspatialnull[0*2+0]*C0001[xdn]*conj(phimatrix[1*2+0][xdn])
+                      + 1.*phimatrixspatialnull[0*2+0]*C0111[xdn]*conj(phimatrix[0*2+1][xdn])
+                      +-1.*phimatrixspatialnull[0*2+0]*C0101[xdn]*conj(phimatrix[1*2+1][xdn])
+
+                      + 1.*phimatrixspatialnull[0*2+1]*C1011[xdn]*conj(phimatrix[0*2+0][xdn])
+                      +-1.*phimatrixspatialnull[0*2+1]*C1001[xdn]*conj(phimatrix[1*2+0][xdn])
+                      + 1.*phimatrixspatialnull[0*2+1]*C1111[xdn]*conj(phimatrix[0*2+1][xdn])
+                      +-1.*phimatrixspatialnull[0*2+1]*C1101[xdn]*conj(phimatrix[1*2+1][xdn]);
+
+//tau3
+       final_corr[t]+=  1.*phimatrixspatialnull[0*2+0]*C0000[xdn]*conj(phimatrix[0*2+0][xdn])
+                      +-1.*phimatrixspatialnull[0*2+0]*C0010[xdn]*conj(phimatrix[1*2+0][xdn])
+                      + 1.*phimatrixspatialnull[0*2+0]*C0100[xdn]*conj(phimatrix[0*2+1][xdn])
+                      +-1.*phimatrixspatialnull[0*2+0]*C0110[xdn]*conj(phimatrix[1*2+1][xdn])
+
+                      + 1.*phimatrixspatialnull[0*2+1]*C1000[xdn]*conj(phimatrix[0*2+0][xdn])
+                      +-1.*phimatrixspatialnull[0*2+1]*C1010[xdn]*conj(phimatrix[1*2+0][xdn])
+                      + 1.*phimatrixspatialnull[0*2+1]*C1100[xdn]*conj(phimatrix[0*2+1][xdn])
+                      +-1.*phimatrixspatialnull[0*2+1]*C1110[xdn]*conj(phimatrix[1*2+1][xdn])
+
+                      +-1.*phimatrixspatialnull[1*2+0]*C0001[xdn]*conj(phimatrix[0*2+0][xdn])
+                      + 1.*phimatrixspatialnull[1*2+0]*C0011[xdn]*conj(phimatrix[1*2+0][xdn])
+                      +-1.*phimatrixspatialnull[1*2+0]*C0101[xdn]*conj(phimatrix[0*2+1][xdn])
+                      + 1.*phimatrixspatialnull[1*2+0]*C0111[xdn]*conj(phimatrix[1*2+1][xdn])
+
+                      +-1.*phimatrixspatialnull[1*2+1]*C1001[xdn]*conj(phimatrix[0*2+0][xdn])
+                      + 1.*phimatrixspatialnull[1*2+1]*C1011[xdn]*conj(phimatrix[1*2+0][xdn])
+                      +-1.*phimatrixspatialnull[1*2+1]*C1101[xdn]*conj(phimatrix[0*2+1][xdn])
+                      + 1.*phimatrixspatialnull[1*2+1]*C1111[xdn]*conj(phimatrix[1*2+1][xdn]);
+
+    }
+#if defined TM_USE_MPI
+    /* Complete the per-time-slice sums across all processes. */
+    for (ix=0; ix<T_global; ++ix){
+       _Complex double tmp;
+       MPI_Allreduce(&final_corr[ix], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+       final_corr[ix]= tmp;
+    }
+#endif
+    /* Only the process with g_cart_id == 0 reports the result. */
+    if (g_cart_id == 0){
+       printf("Wilson Current Density correlator type 62a a la Petros (1) results\n");
+       for (ix=0; ix<T_global; ++ix){
+          printf("WCDPL2 1 %.3d %10.10e %10.10e\n", ix, creal(final_corr[ix])/4.,cimag(final_corr[ix])/4.);
+       }
+    }
+
+    for (ix=0;ix<16;++ix)
+       free(traces[ix]);
+    for (ix=0;ix<4;++ix)
+       free(phimatrix[ix]);
+    free(final_corr);
+
+}
+#endif
diff --git a/contractions/contractions_checks.h b/contractions/contractions_checks.h
new file mode 100644
index 000000000..dfa600f2b
--- /dev/null
+++ b/contractions/contractions_checks.h
@@ -0,0 +1,28 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Ferenc Pittler
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _CONTRACTIONS_CHECK_H   /* include guard; prototypes of the "a la Petros" contraction routines */
+#define _CONTRACTIONS_CHECK_H   /* NOTE(review): no #include in this header, so `bispinor` must already be declared by the includer */
+void density_density_1234_petros( bispinor **propfields );   /* takes the array of propagator fields; definition not visible from this header */
+void diraccurrent1a_petros( bispinor **propfields );         /* takes the array of propagator fields; definition not visible from this header */
+void wilsoncurrent31a_petros( bispinor **propfields );       /* takes the array of propagator fields; definition not visible from this header */
+void wilsoncurrent61a_petros( bispinor **propfields );       /* prints the "type 61a a la Petros" correlator, one line per time slice, on the g_cart_id == 0 process */
+void wilsoncurrent62a_petros( bispinor **propfields );       /* prints the "type 62a a la Petros" correlator, one line per time slice, on the g_cart_id == 0 process */
+#endif   /* _CONTRACTIONS_CHECK_H */
diff --git a/contractions/contractions_currentdensityextended.c b/contractions/contractions_currentdensityextended.c
new file mode 100644
index 000000000..d2f80d87c
--- /dev/null
+++ b/contractions/contractions_currentdensityextended.c
@@ -0,0 +1,950 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017 Ferenc Pittler
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_BSM
+#include"lime.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <errno.h>
+#include <time.h>
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+#include "global.h"
+#include "getopt.h"
+#include "default_input_values.h"
+#include "read_input.h"
+#include "su3.h"
+#include "su3spinor.h"
+#include "operator/tm_operators.h"
+#include "linalg_eo.h"
+#include "geometry_eo.h"
+#include "linalg/assign.h"
+#include "operator/D_psi.h"
+#include "operator/D_psi_BSM.h"
+#include "operator/D_psi_BSM2b.h"
+#include "operator/D_psi_BSM2f.h"
+#include "operator/D_psi_BSM2m.h"
+#include "operator/Dov_psi.h"
+#include "operator/tm_operators_nd.h"
+#include "operator/Hopping_Matrix.h"
+#include "invert_eo.h"
+#include "invert_doublet_eo.h"
+#include "invert_overlap.h"
+#include "invert_clover_eo.h"
+#include "init/init_scalar_field.h"
+#include "init/init_bsm_2hop_lookup.h"
+#include "boundary.h"
+#include "start.h"
+#include "solver/solver.h"
+#include "xchange/xchange_gauge.h"
+#include "prepare_source.h"
+#include <io/params.h>
+#include <io/gauge.h>
+#include <io/spinor.h>
+#include <io/utils.h>
+#include "io/scalar.h"
+#include "buffers/utils_nonblocking.h"
+#include "buffers/utils_nogauge.h"
+#include "test/overlaptests.h"
+#include "solver/index_jd.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
+#include "operator.h"
+#include "gettime.h"
+#include "measure_gauge_action.h"
+#include "mpi_init.h"
+#include "init/init_geometry_indices.h"
+#include "init/init_openmp.h"
+#include "init/init_gauge_field.h"
+#include "init/init_spinor_field.h"
+#include "init/init_bispinor_field.h"
+#include "solver/solver_field.h"
+#include "ranlxd.h"
+#include "contractions/contractions_helper.h"
+
+extern int DAGGER;    /* all selectors in this list are only declared here; they are defined in another translation unit */
+extern int NO_DAGG;   /* DAGGER / NO_DAGG are passed as an argument to the taui_scalarfield_*() helpers in this file */
+/* GAMMA_UP is passed to taui_scalarfield_spinor() in wilsonterm_current_density_312ab(). */
+extern int GAMMA_UP;
+extern int GAMMA_DN;
+extern int NO_GAMMA;
+
+extern int WITH_SCALAR;
+extern int NO_SCALAR;
+/* TYPE_A / TYPE_B select the spinor index range (0..1 or 2..3) in wilsonterm_current_density_312ab(). */
+extern int TYPE_A;
+extern int TYPE_B;
+/* TYPE_1 / TYPE_2 select which propagator site is used in wilsonterm_current_density_312ab(): g_iup (with two gauge links) or g_idn. */
+extern int TYPE_1;
+extern int TYPE_2;
+extern int TYPE_3;
+extern int TYPE_4;
+
+extern int TYPE_I;
+extern int TYPE_II;
+   
+extern int RIGHT;
+extern int LEFT;    /* LEFT is passed to taui_scalarfield_flavoronly() in wilsonterm_current_density_312ab() */
+
+
+void wilsonterm_current_density_312ab( bispinor ** propfields, int type_12, int type_ab, _Complex double **results ){
+   int ix,i;
+   int f1,c1,s1,tauindex;
+   int spinorstart=0, spinorend=4;
+   bispinor running;
+#if defined TM_USE_MPI
+   int count;
+   MPI_Status  statuses[8];
+   MPI_Request *request;
+   request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
+#endif
+   _Complex double *colortrace;
+   _Complex double *spacetrace;
+   _Complex double *spinortrace;
+   _Complex double *flavortrace;
+   _Complex double *paulitrace;
+
+   colortrace= (_Complex double *)malloc(sizeof(_Complex double) *8);
+   spacetrace= (_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
+   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+   paulitrace= (_Complex double *)malloc(sizeof(_Complex double)*T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) || (paulitrace == NULL) )
+   {
+     printf("Error in mem allocation in wilsonterm_current_density_312ab\n");
+     exit(1);
+   }
+   *results=(_Complex double *)malloc(sizeof(_Complex double)*4*T_global);
+   if (*results == NULL){
+      printf("Not enough memory for results in current density three\n");
+      exit(1);
+   }
+   if ( type_ab == TYPE_A ) {
+      spinorstart=0;
+      spinorend  =2;
+   }
+   else if ( type_ab == TYPE_B ){
+      spinorstart=2;
+      spinorend  =4;
+   }
+   else{
+      if (g_cart_id == 0){fprintf(stdout,"Wrong argument for type_1234, it can only be TYPE_1, TYPE_2,  TYPE_3 or TYPE_4 \n"); exit(1);}
+   }
+
+
+// Doing the necessary communication
+#if defined TM_USE_MPI
+   for (s1=spinorstart; s1<spinorend; ++s1)
+      for (c1=0; c1<3; ++c1)
+         for (f1=0; f1<2; ++f1){
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TUP   , request, &count );
+            MPI_Waitall( count, request, statuses); 
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN , request, &count );
+            MPI_Waitall( count, request, statuses);
+            count=0;
+            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN , request, &count );
+            MPI_Waitall( count, request, statuses);
+         }
+   free(request);
+#endif
+//Trace over the Pauli matrices
+   for (i=0; i<T_global; ++i){
+      paulitrace[i]=0.;
+   }
+   for (tauindex= 0; tauindex <3; ++tauindex){
+//Trace over flavour degrees of freedom
+      for (i=0; i<T_global; ++i)
+         flavortrace[i]=0.;
+
+      for (f1=0; f1<2; ++f1){
+
+//Trace over spinor indices
+         for (i=0; i<2*T_global; ++i){
+            spinortrace[i]=0.;
+         }
+
+         for (s1=spinorstart; s1<spinorend; ++s1){
+
+//Trace over spatial indices
+            for (i=0; i<8*T_global; ++i){
+               spacetrace[i]=0.;
+            }
+            for (ix=0; ix<VOLUME; ++ix){
+
+//Trace over the color indices for each sites
+               for (i=0; i<8; ++i)
+                  colortrace[i]=0.;
+               for (c1=0; c1<3; ++c1){
+/*   
+       TYPE III.1.a OR  III.1.b     U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
+       TYPE III.2.a OR  III.2.b                    (1+gamma5)/2 *  S(x-0,ytilde)
+*/
+                  _bispinor_null(running);
+                  
+                  if ( type_12 == TYPE_1){
+                    bispinor_spinup_mult_su3matrix( &running, &propfields[12*s1 + 4*c1 + 2*f1][g_iup[ix][TUP]], &g_gauge_field[ix][TUP], NO_DAGG);
+
+                    bispinor_spinup_mult_su3matrix( &running, &running, &g_gauge_field[g_idn[ix][TUP]][TUP], NO_DAGG);
+
+                  }
+                  else if ( type_12 == TYPE_2){
+
+                    _vector_assign( running.sp_up.s0, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_up.s0 );
+                    _vector_assign( running.sp_up.s1, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_up.s1 );
+                    _vector_assign( running.sp_dn.s0, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_dn.s0 );
+                    _vector_assign( running.sp_dn.s1, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_dn.s1 );
+                  }
+/*   
+       TYPE III.1.a OR  III.1.b     tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
+       TYPE III.2.a OR  III.2.b     tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
+*/
+                  taui_scalarfield_spinor( &running, &running, GAMMA_UP, tauindex, ix, NODIR, NO_DAGG);
+
+/*   
+       TYPE III.1.a OR  III.1.b     S(ytilde, x-0)*tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
+       TYPE III.2.a OR  III.2.b     S(ytilde, x-0)*tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
+*/
+                  multiply_backward_propagator(&running, propfields, &running, ix, TDOWN);
+
+                  //delta( color component of bispinor running, c1) for all spinor and flavor indices                  
+                  trace_in_color(colortrace, &running, c1 );
+               } //End of trace color
+               //sum over all lattice sites the result of the color trace
+               trace_in_space( spacetrace, colortrace, ix);
+            }  //End of trace in space
+
+//Gather the results from all nodes to complete the trace in space
+#if defined TM_USE_MPI
+            for (i=0; i<8*T_global; ++i){
+               _Complex double tmp;
+               MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+               spacetrace[i]= tmp;
+            }
+#endif
+            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices 
+            trace_in_spinor(spinortrace, spacetrace, s1);
+
+         } //End of trace in spinor space
+         
+/*   
+       TYPE III.1.a                 tau_i*phi(ytilde)*       (1+gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
+       TYPE III.1.b                 phi^dagger(ytilde)*tau_i*(1-gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
+
+       TYPE III.2.a                 tau_i*phi(ytilde)*       (1+gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
+       TYPE III.2.b                 phi^dagger(ytilde)*tau_i*(1-gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
+
+*/ 
+         if ( type_ab == TYPE_A ){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG, LEFT );
+         }
+         else if ( type_ab == TYPE_B){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER, LEFT  );
+         }
+         //delta(flavor component in spinortrace, f1) for all time slices
+         trace_in_flavor( flavortrace, spinortrace, f1 );
+
+      } //End of trace in flavor space
+      //sum for all Pauli matrices
+      for (int ii=0; ii<T_global; ++ii){
+        (*results)[ii+tauindex*T_global]=flavortrace[ii]/4.;
+      }
+      for (i=0;i<T_global; ++i)
+         paulitrace[i]+=flavortrace[i];
+   } //End of trace for Pauli matrices
+
+ 
+   if (g_cart_id == 0){printf("Wilson term Dirac Current Density correlator typeIII results= %s %s\n", type_12 == TYPE_1 ? "1" : "2",type_ab == TYPE_A ? "a" :"b");}
+   for (i=0; i<T_global; ++i){
+      if (g_cart_id == 0){
+        printf("WCDPR1 %d %d %.3d %10.10e %10.10e\n", type_12, type_ab, i, creal(paulitrace[i])/4., cimag(paulitrace[i])/4.);
+      }
+      for (int ii=0; ii<T_global; ++ii){
+        (*results)[ii+3*T_global]=paulitrace[ii]/4.;
+      }
+   }
+   free(flavortrace);
+   free(paulitrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+}
+
+/*
+ * Wilson-term current-density correlator, "typeIV" contraction (output rows tagged WCDPR2).
+ *
+ *   propfields  propagator fields, indexed as propfields[12*s1 + 4*c1 + 2*f1 (+1)][site]
+ *   type_12     TYPE_1 or TYPE_2, selects the contraction variant; for any other value
+ *               nothing is computed and the contents of *results stay uninitialised
+ *   type_ab     TYPE_A (s1 runs over 0,1) or TYPE_B (s1 runs over 2,3); otherwise exit(1)
+ *   results     out: *results is malloc'ed here (4*T_global entries) and not freed here.
+ *               [tau*T_global + t], tau=0..2: flavortrace for Pauli index tau, divided by 4;
+ *               [3*T_global + t]: sum of the three flavortraces, divided by 4.
+ * Any allocation failure ends the program with exit(1).
+ */
+void wilsonterm_current_density_412ab( bispinor ** propfields, int type_12, int type_ab, _Complex double **results ){
+   int ix,i;
+   int f1,c1,s1,tauindex;
+   int spinorstart=0, spinorend=4;
+   bispinor **starting2d;
+   bispinor running;
+#if defined TM_USE_MPI
+   int count;
+   MPI_Status  statuses[8];
+   // Automatic storage, so that no exit path has to remember to free the request array.
+   MPI_Request request[8];
+#endif
+   _Complex double *colortrace  = malloc(sizeof(*colortrace)  * 8);
+   _Complex double *spacetrace  = malloc(sizeof(*spacetrace)  * 8 * T_global);
+   _Complex double *spinortrace = malloc(sizeof(*spinortrace) * 2 * T_global);
+   _Complex double *flavortrace = malloc(sizeof(*flavortrace) * T_global);
+   _Complex double *paulitrace  = malloc(sizeof(*paulitrace)  * T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) || (paulitrace == NULL) )
+   {
+     printf("Error in mem allocation\n");
+     exit(1);
+   }
+   *results = malloc(sizeof(**results) * 4 * T_global);
+   if ( *results == NULL ){
+     printf("Not enough memory in wilson current density 4\n");
+     exit(1);
+   }
+   if ( type_ab == TYPE_A ) {
+        spinorstart=0;
+        spinorend  =2;
+   }
+   else if ( type_ab == TYPE_B ){
+        spinorstart=2;
+        spinorend  =4;
+   }
+   else{
+       if (g_cart_id == 0) fprintf(stdout,"Wrong argument for type_ab, it can only be TYPE_A or TYPE_B \n");
+       exit(1);
+   }
+
+   if (type_12 == TYPE_1){
+     for (i=0; i<T_global; ++i)
+       paulitrace[i]=0.;
+     for (tauindex=0; tauindex<3; ++tauindex){
+       for (i=0; i<T_global; ++i)
+         flavortrace[i]=0.;
+       for (f1=0; f1<2; ++f1){
+         for (i=0; i<2*T_global; ++i)
+           spinortrace[i]=0.;
+         for (s1= spinorstart; s1<spinorend; ++s1){
+           for (i=0; i<8*T_global; ++i)
+             spacetrace[i]=0.;
+           for (ix = 0; ix< VOLUME; ++ix){
+             for (i=0; i<8; ++i)
+               colortrace[i]=0.;
+             for (c1=0; c1<3; ++c1){
+               _vector_null( running.sp_up.s2 );
+               _vector_null( running.sp_up.s3 );
+               _vector_assign( running.sp_up.s0, propfields[12*s1+4*c1+2*f1][ix].sp_up.s0 );
+               _vector_assign( running.sp_up.s1, propfields[12*s1+4*c1+2*f1][ix].sp_up.s1 );
+               _vector_null( running.sp_dn.s2 );
+               _vector_null( running.sp_dn.s3 );
+               _vector_assign( running.sp_dn.s0, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s0 );
+               _vector_assign( running.sp_dn.s1, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s1 );
+
+               taui_scalarfield_spinor( &running, &running, GAMMA_UP, tauindex, ix, TDOWN, NO_DAGG );
+
+               multiply_backward_propagator(&running, propfields, &running, ix, NODIR );
+
+               trace_in_color(colortrace,&running,c1);
+             }  //End of trace color
+              //sum over all lattice sites the result of the color trace
+             trace_in_space(spacetrace,colortrace,ix);
+           } //End of trace space
+#if defined TM_USE_MPI
+           for (i=0; i<8*T_global; ++i){
+             _Complex double tmp;
+             MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+             spacetrace[i]= tmp;
+           }
+#endif
+            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+           trace_in_spinor(spinortrace, spacetrace, s1);
+         }//End of trace in spinor space
+         if ( type_ab == TYPE_A ){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG, LEFT);
+         }
+         else if ( type_ab == TYPE_B ){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER, LEFT  );
+         }
+         trace_in_flavor( flavortrace, spinortrace, f1 );
+       } //End of trace in flavor space
+      //sum for all Pauli matrices
+       for (i=0;i<T_global; ++i){
+         paulitrace[i]+=flavortrace[i];
+         (*results)[i+tauindex*T_global] = flavortrace[i]/4.;
+       }
+     } //End of trace for Pauli matrices
+     if  (g_cart_id == 0){printf("Wilson term Dirac Current Density correlator typeIV results= %s %s\n", "1",type_ab == TYPE_A ? "a" :"b");}
+     for (i=0; i<T_global; ++i){
+       if (g_cart_id == 0){
+        printf("WCDPR2 %d %d %.3d %10.10e %10.10e\n", type_12, type_ab, i, creal(paulitrace[i])/4., cimag(paulitrace[i])/4.);
+       }
+       (*results)[i+3*T_global]=paulitrace[i]/4.;
+     }
+   }
+   // TYPE_2: per-site products go into scratch fields starting2d, are exchanged (TDOWN) and then traced at g_idn[ix][TUP].
+   if (type_12 == TYPE_2 ){
+      starting2d=(bispinor **)malloc(sizeof(bispinor *)*3);
+      if (starting2d == NULL){
+        if (g_cart_id == 0){
+          printf("Memory allocation failure in extended current density contractions IV\n");
+        }
+        exit(1);  // every rank has to stop here; the loop below dereferences starting2d
+      }
+      for (i=0; i<3; ++i){
+        starting2d[i] =(bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND);
+        if (starting2d[i] == NULL){
+          if (g_cart_id == 0) printf("Memory allocation error starting2d IV\n");
+          exit(1);
+        }
+        for (ix=0; ix<VOLUME; ++ix){
+          _bispinor_null(starting2d[i][ix]);
+        }
+      }
+//Doing the neccesary communication
+#if defined TM_USE_MPI
+      for (s1=spinorstart; s1<spinorend; ++s1)
+        for (c1=0; c1<3; ++c1)
+           for (f1=0; f1<2; ++f1){
+             count=0;
+             generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN   , request, &count );
+             MPI_Waitall( count, request, statuses);
+             count=0;
+             generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TUP     , request, &count );
+             MPI_Waitall( count, request, statuses);
+          }
+#endif
+      for (i=0; i<T_global; ++i)
+        paulitrace[i]=0.;
+      for (tauindex=0; tauindex<3; ++tauindex){
+        for (i=0; i<T_global; ++i)
+          flavortrace[i]=0.;
+        for (f1=0; f1<2; ++f1){
+          for (i=0; i<2*T_global; ++i)
+            spinortrace[i]=0.;
+          for (s1= spinorstart; s1<spinorend; ++s1){
+            for (i=0; i<8*T_global; ++i)
+              spacetrace[i]=0.;
+            for (ix = 0; ix< VOLUME; ++ix){
+              for (i=0; i<8; ++i)
+                colortrace[i]=0.;
+              for (c1=0; c1<3; ++c1){
+
+                _bispinor_null(starting2d[c1][ix]);
+
+                bispinor_spinup_mult_su3matrix( &starting2d[c1][ix], &propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]], &g_gauge_field[g_idn[ix][TUP]][TUP], DAGGER );
+
+                bispinor_spinup_mult_su3matrix( &starting2d[c1][ix], &starting2d[c1][ix], &g_gauge_field[ix][TUP], DAGGER );
+
+                taui_scalarfield_spinor( &starting2d[c1][ix], &starting2d[c1][ix], GAMMA_UP, tauindex, ix, NODIR, NO_DAGG );
+
+                multiply_backward_propagator(&starting2d[c1][ix], propfields, &starting2d[c1][ix], ix, TUP);
+              }
+            }
+#if defined TM_USE_MPI
+            for (c1=0; c1<3; ++c1){
+              count=0;
+              generic_exchange_direction_nonblocking( starting2d[c1], sizeof(bispinor), TDOWN, request, &count );
+              MPI_Waitall( count, request, statuses);
+            }
+#endif
+            for (ix=0; ix<VOLUME; ++ix){
+              for (i=0; i<8; ++i)
+                colortrace[i]=0.;
+              for (c1=0; c1<3; ++c1)
+                trace_in_color(colortrace,&starting2d[c1][g_idn[ix][TUP]],c1);
+              trace_in_space(spacetrace,colortrace,ix);
+            }
+#if defined TM_USE_MPI
+            for (i=0; i<8*T_global; ++i){
+              _Complex double tmp;
+              MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+              spacetrace[i]= tmp;
+            }
+#endif
+            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+            trace_in_spinor(spinortrace, spacetrace, s1);
+          }//End of trace in spinor space
+          if ( type_ab == TYPE_A ){
+            taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG, LEFT );
+          }
+          else if ( type_ab == TYPE_B ){
+            taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER, LEFT  );
+          }
+          trace_in_flavor( flavortrace, spinortrace, f1 );
+        } //End of trace in flavor space
+      //sum for all Pauli matrices
+        for (i=0;i<T_global; ++i){
+          paulitrace[i]+=flavortrace[i];
+          (*results)[i+tauindex*T_global]=flavortrace[i]/4.;
+        }
+      } //End of trace for Pauli matrices
+      if  (g_cart_id == 0){printf("Wilson term Dirac Current Density correlator typeIV results= %s %s\n", type_12 == TYPE_1 ? "1" : "2",type_ab == TYPE_A ? "a" :"b");}
+      for (i=0; i<T_global; ++i){
+        if (g_cart_id == 0){
+          printf("WCDPR2 %d %d %.3d %10.10e %10.10e\n", type_12, type_ab, i, creal(paulitrace[i])/4., cimag(paulitrace[i])/4.);
+        }
+        (*results)[i+3*T_global]=paulitrace[i]/4.;
+      }
+      for (i=0; i<3; ++i){
+        free(starting2d[i]);
+      }
+      free(starting2d);
+   }
+   free(flavortrace);
+   free(paulitrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+}
+
+/*
+ * Wilson-term current-density correlator, "typeV" contraction (output rows tagged WCDPL1).
+ *
+ *   propfields  propagator fields, indexed as propfields[12*s1 + 4*c1 + 2*f1 (+1)][site]
+ *   type_12     TYPE_1 or TYPE_2, selects the contraction variant; other values are not rejected
+ *   type_ab     TYPE_A (s1 runs over 0,1) or TYPE_B (s1 runs over 2,3); otherwise exit(1)
+ *   results     out: *results is malloc'ed here (4*T_global entries) and not freed here.
+ *               [tau*T_global + t], tau=0..2: flavortrace for Pauli index tau, divided by 4;
+ *               [3*T_global + t]: sum of the three flavortraces, divided by 4.
+ * Any allocation failure ends the program with exit(1).
+ */
+void wilsonterm_current_density_512ab( bispinor ** propfields, int type_12, int type_ab, _Complex  double **results ){
+   int ix,i;
+   int f1,c1,s1,tauindex;
+   int spinorstart=0, spinorend=4;
+   bispinor running;
+#if defined TM_USE_MPI
+   int count;
+   MPI_Status  statuses[8];
+   MPI_Request request[8];  // automatic storage: no allocation to check, nothing to free
+#endif
+   _Complex double *colortrace  = malloc(sizeof(*colortrace)  * 8);
+   _Complex double *spacetrace  = malloc(sizeof(*spacetrace)  * 8 * T_global);
+   _Complex double *spinortrace = malloc(sizeof(*spinortrace) * 2 * T_global);
+   _Complex double *flavortrace = malloc(sizeof(*flavortrace) * T_global);
+   _Complex double *paulitrace  = malloc(sizeof(*paulitrace)  * T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) || (paulitrace == NULL) )
+   {
+     printf("Error in mem allocation\n");
+     exit(1);
+   }
+   *results = malloc(sizeof(**results) * 4 * T_global);
+   if (*results == NULL){
+     printf("not enough memory in current density five\n");
+     exit(1);  // without this the result loops below would write through a NULL pointer
+   }
+
+   if ( type_ab == TYPE_A ) {
+     spinorstart=0;
+     spinorend  =2;
+   }
+   else if ( type_ab == TYPE_B ){
+     spinorstart=2;
+     spinorend  =4;
+   }
+   else{
+     if (g_cart_id == 0) fprintf(stdout,"Wrong argument for type_ab, it can only be TYPE_A or TYPE_B \n");
+     exit(1);
+   }
+#if defined TM_USE_MPI
+//Doing the neccesary communication
+   for (s1=spinorstart; s1<spinorend; ++s1)
+     for (c1=0; c1<3; ++c1)
+       for (f1=0; f1<2; ++f1){
+           count=0;
+           generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN , request, &count );
+           MPI_Waitall( count, request, statuses);
+           count=0;
+           generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TUP   , request, &count );
+           MPI_Waitall( count, request, statuses);
+           count=0;
+           generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN , request, &count);
+           MPI_Waitall( count, request, statuses);
+       }
+#endif
+   for (i=0; i<T_global; ++i)
+       paulitrace[i]=0.;
+// Trace over the Pauli matrices
+   for (tauindex=0; tauindex<3; ++tauindex){
+
+//Trace over flavour degrees of freedom
+      for (i=0; i<T_global; ++i)
+         flavortrace[i]=0.;
+
+      for (f1=0; f1<2; ++f1){
+
+//Trace over spinor indices
+         for (i=0; i<2*T_global; ++i){
+            spinortrace[i]=0.;
+         }
+
+         for (s1=spinorstart; s1<spinorend; ++s1){
+
+//Trace over spatial indices
+            for (i=0; i<8*T_global; ++i){
+               spacetrace[i]=0.;
+            }
+            for (ix=0; ix<VOLUME; ++ix){
+
+//Trace over the color indices for each sites
+               for (i=0; i<8; ++i)
+                  colortrace[i]=0.;
+               for (c1=0; c1<3; ++c1){
+/*
+       TYPE V.1.a OR  V.1.b     U0^dagger(x)*U0^dagger(x-0)* (1-gamma5)/2 *  S(x-0,ytilde)
+       TYPE V.2.a OR  V.2.b                                  (1-gamma5)/2 *  S(x-0,ytilde)
+*/
+                  _bispinor_null(running);
+
+                  if ( type_12 == TYPE_1){
+                    // NOTE(review): 612ab pairs GAMMA_DN with bispinor_spindown_mult_su3matrix; confirm "spinup" is intended here
+                    bispinor_spinup_mult_su3matrix( &running, &propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]], &g_gauge_field[g_idn[ix][TUP]][TUP], DAGGER );
+
+                    bispinor_spinup_mult_su3matrix( &running, &running, &g_gauge_field[ix][TUP], DAGGER );
+
+                  }
+                  else if ( type_12 == TYPE_2){
+                    _vector_assign( running.sp_up.s2, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_up.s2 );
+                    _vector_assign( running.sp_up.s3, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_up.s3 );
+
+                    _vector_assign( running.sp_dn.s2, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_dn.s2 );
+                    _vector_assign( running.sp_dn.s3, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_dn.s3 );
+                  }
+/*
+       TYPE V.1.a OR  V.1.b     phi^dagger(x)*tau_i*     U0^dagger(x)*U0^dagger(x-0)* (1-gamma5)/2 *  S(x-0,ytilde)
+       TYPE V.2.a OR  V.2.b     phi^dagger(x)*tau_i                                        (1-gamma5)/2 *  S(x-0,ytilde)
+*/
+                  taui_scalarfield_spinor( &running, &running, GAMMA_DN, tauindex, ix, NODIR, DAGGER);
+
+/*
+       TYPE V.1.a OR  V.1.b     S(ytilde,x+0)*phi^dagger(x)*tau_i*     U0^dagger(x)*U0^dagger(x-0)* (1-gamma5)/2 *  S(x-0,ytilde)
+       TYPE V.2.a OR  V.2.b     S(ytilde,x-0)*phi^dagger(x)*tau_i                                   (1-gamma5)/2 *  S(x-0,ytilde)
+*/
+
+                  if (type_12 == TYPE_1){
+                    multiply_backward_propagator(&running, propfields, &running, ix, TUP);
+                  }
+                  else if (type_12 == TYPE_2){
+                    multiply_backward_propagator(&running, propfields, &running, ix, TDOWN);
+                  }
+/*
+       TYPE V.1.a                 tau_i*phi(ytilde)*       (1+gamma5)/2* S(ytilde, x+0)*tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
+       TYPE V.1.b                 phi^dagger(ytilde)*tau_i*(1-gamma5)/2* S(ytilde, x+0)*tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
+
+       TYPE V.2.a                 tau_i*phi(ytilde)*       (1+gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
+       TYPE V.2.b                 phi^dagger(ytilde)*tau_i*(1-gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
+       NOTE(review): these four lines disagree with the step comments above (phi^dagger, U0^dagger, 1-gamma5); confirm.
+*/
+                  //delta( color component of bispinor running, c1) for all spinor and flavor indices
+                  trace_in_color(colortrace, &running, c1 );
+               } //End of trace color
+               //sum over all lattice sites the result of the color trace
+               trace_in_space( spacetrace, colortrace, ix);
+            }  //End of trace in space
+#if defined TM_USE_MPI
+//Gather the results from all nodes to complete the trace in space (same communicator as the sibling contractions)
+            for (i=0; i<8*T_global; ++i){
+               _Complex double tmp;
+               MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+               spacetrace[i]= tmp;
+            }
+#endif
+            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+            trace_in_spinor(spinortrace, spacetrace, s1);
+
+         } //End of trace in spinor space
+
+         if ( type_ab == TYPE_A ){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG, LEFT );
+         }
+         else if ( type_ab == TYPE_B){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER, LEFT  );
+         }
+         //delta(flavor component in spinortrace, f1) for all time slices
+         trace_in_flavor( flavortrace, spinortrace, f1 );
+
+      } //End of trace in flavor space
+
+      for (i=0;i<T_global; ++i){
+         paulitrace[i]+=flavortrace[i];
+         (*results)[i+tauindex*T_global]= flavortrace[i]/4.;
+      }
+   } //End of trace for Pauli matrices
+
+   if (g_cart_id == 0){printf("Wilson term Dirac Current Density correlator typeV results= %s %s\n", type_12 == TYPE_1 ? "1" : "2",type_ab == TYPE_A ? "a" :"b");}
+   for (i=0; i<T_global; ++i){
+      if (g_cart_id == 0){
+        printf("WCDPL1 %d %d %.3d %10.10e %10.10e\n", type_12, type_ab, i, creal(paulitrace[i])/4., cimag(paulitrace[i])/4.);
+      }
+      (*results)[i+3*T_global] = paulitrace[i]/4.;
+   }
+   free(flavortrace);
+   free(paulitrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+}
+/*
+ * Wilson-term current-density correlator, "typeVI" contraction (output rows tagged WCDPL2).
+ *
+ *   propfields  propagator fields, indexed as propfields[12*s1 + 4*c1 + 2*f1 (+1)][site]
+ *   type_12     TYPE_1 or TYPE_2, selects the contraction variant; for any other value
+ *               nothing is computed and the contents of *results stay uninitialised
+ *   type_ab     TYPE_A (s1 runs over 0,1) or TYPE_B (s1 runs over 2,3); otherwise exit(1)
+ *   results     out: *results is malloc'ed here (4*T_global entries) and not freed here.
+ *               [tau*T_global + t], tau=0..2: flavortrace for Pauli index tau, divided by 4;
+ *               [3*T_global + t]: sum of the three flavortraces, divided by 4.
+ * Any allocation failure ends the program with exit(1).
+ */
+void wilsonterm_current_density_612ab( bispinor ** propfields, int type_12, int type_ab, _Complex double **results ){
+   int ix,i;
+   int f1,c1,s1,tauindex;
+   int spinorstart=0, spinorend=4;
+   bispinor **starting2d;
+   bispinor running;
+#if defined TM_USE_MPI
+   int count;
+   MPI_Status  statuses[8];
+   // Automatic storage, so that no exit path has to remember to free the request array.
+   MPI_Request request[8];
+#endif
+   _Complex double *colortrace  = malloc(sizeof(*colortrace)  * 8);
+   _Complex double *spacetrace  = malloc(sizeof(*spacetrace)  * 8 * T_global);
+   _Complex double *spinortrace = malloc(sizeof(*spinortrace) * 2 * T_global);
+   _Complex double *flavortrace = malloc(sizeof(*flavortrace) * T_global);
+   _Complex double *paulitrace  = malloc(sizeof(*paulitrace)  * T_global);
+
+   if ( (colortrace == NULL) || (spacetrace == NULL) || (spinortrace == NULL) || (flavortrace == NULL) || (paulitrace == NULL) )
+   {
+     printf("Error in mem allocation\n");
+     exit(1);
+   }
+   *results = malloc(sizeof(**results) * 4 * T_global);
+   if (*results == NULL){
+     printf("Not enough memory in current density six \n");
+     exit(1);
+   }
+   if ( type_ab == TYPE_A ) {
+        spinorstart=0;
+        spinorend  =2;
+   }
+   else if ( type_ab == TYPE_B ){
+        spinorstart=2;
+        spinorend  =4;
+   }
+   else{
+       if (g_cart_id == 0) fprintf(stdout,"Wrong argument for type_ab, it can only be TYPE_A or TYPE_B \n");
+       exit(1);
+   }
+   if (type_12 == TYPE_1){
+     for (i=0; i<T_global; ++i)
+       paulitrace[i]=0.;
+     for (tauindex=0; tauindex<3; ++tauindex){
+       for (i=0; i<T_global; ++i)
+         flavortrace[i]=0.;
+       for (f1=0; f1<2; ++f1){
+         for (i=0; i<2*T_global; ++i)
+           spinortrace[i]=0.;
+         for (s1= spinorstart; s1<spinorend; ++s1){
+           for (i=0; i<8*T_global; ++i)
+             spacetrace[i]=0.;
+           for (ix = 0; ix< VOLUME; ++ix){
+             for (i=0; i<8; ++i)
+               colortrace[i]=0.;
+             for (c1=0; c1<3; ++c1){
+               _bispinor_null(running);
+               _vector_assign( running.sp_up.s2, propfields[12*s1+4*c1+2*f1][ix].sp_up.s2 );
+               _vector_assign( running.sp_up.s3, propfields[12*s1+4*c1+2*f1][ix].sp_up.s3 );
+               _vector_assign( running.sp_dn.s2, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s2 );
+               _vector_assign( running.sp_dn.s3, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s3 );
+
+               taui_scalarfield_spinor( &running, &running, GAMMA_DN, tauindex, ix, TDOWN, DAGGER );
+
+               multiply_backward_propagator(&running, propfields, &running, ix, NODIR );
+
+               trace_in_color(colortrace,&running,c1);
+             }  //End of trace color
+              //sum over all lattice sites the result of the color trace
+             trace_in_space(spacetrace,colortrace,ix);
+           } //End of trace space
+#if defined TM_USE_MPI
+           for (i=0; i<8*T_global; ++i){
+             _Complex double tmp;
+             MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+             spacetrace[i]= tmp;
+           }
+#endif
+            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+           trace_in_spinor(spinortrace, spacetrace, s1);
+         }//End of trace in spinor space
+         if ( type_ab == TYPE_A ){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG, LEFT );
+         }
+         else if ( type_ab == TYPE_B ){
+           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER, LEFT  );
+         }
+         trace_in_flavor( flavortrace, spinortrace, f1 );
+       } //End of trace in flavor space
+      //sum for all Pauli matrices
+       for (i=0;i<T_global; ++i){
+         paulitrace[i]+=flavortrace[i];
+         (*results)[i+tauindex*T_global]=flavortrace[i]/4.;
+       }
+     } //End of trace for Pauli matrices
+     if  (g_cart_id == 0){printf("Wilson term Dirac Current Density correlator typeVI results= %s %s\n", type_12 == TYPE_1 ? "1" : "2",type_ab == TYPE_A ? "a" :"b");}
+     for (i=0; i<T_global; ++i){
+       if (g_cart_id == 0){
+        printf("WCDPL2 %d %d %.3d %10.10e %10.10e\n", type_12, type_ab, i, creal(paulitrace[i])/4., cimag(paulitrace[i])/4.);
+       }
+       (*results)[i+3*T_global]=paulitrace[i]/4.;
+     }
+   }
+   // TYPE_2: per-site products go into scratch fields starting2d, are exchanged (TDOWN) and then traced at g_idn[ix][TUP].
+   if (type_12 == TYPE_2 ){
+      starting2d=(bispinor **)malloc(sizeof(bispinor *)*3);
+      if (starting2d == NULL){
+        if (g_cart_id ==0){
+          printf("Error in allocating temporary fields for bispinor starting2d type Vi\n");
+        }
+        exit(1);  // every rank has to stop here; the loop below dereferences starting2d
+      }
+      for (i=0; i<3; ++i){
+        starting2d[i] =(bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND);
+        if (starting2d[i] == NULL){
+          if (g_cart_id == 0) printf("Memory allocation error starting2d VI\n");
+          exit(1);
+        }
+        for (ix=0; ix<VOLUME; ++ix){
+          _bispinor_null(starting2d[i][ix]);
+        }
+      }
+//Doing the neccesary communication
+#if defined TM_USE_MPI
+      for (s1=spinorstart; s1<spinorend; ++s1)
+        for (c1=0; c1<3; ++c1)
+           for (f1=0; f1<2; ++f1){
+             count=0;
+             generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TUP   , request, &count );
+             MPI_Waitall( count, request, statuses);
+             count=0;
+             generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN   , request, &count );
+             MPI_Waitall( count, request, statuses);
+          }
+#endif
+      for (i=0; i<T_global; ++i)
+        paulitrace[i]=0.;
+      for (tauindex=0; tauindex<3; ++tauindex){
+        for (i=0; i<T_global; ++i)
+          flavortrace[i]=0.;
+        for (f1=0; f1<2; ++f1){
+          for (i=0; i<2*T_global; ++i)
+            spinortrace[i]=0.;
+          for (s1= spinorstart; s1<spinorend; ++s1){
+            for (i=0; i<8*T_global; ++i)
+              spacetrace[i]=0.;
+            for (ix = 0; ix< VOLUME; ++ix){
+              for (i=0; i<8; ++i)
+                colortrace[i]=0.;
+              for (c1=0; c1<3; ++c1){
+                _bispinor_null( starting2d[c1][ix] );
+
+                bispinor_spindown_mult_su3matrix( &starting2d[c1][ix], &propfields[12*s1 + 4*c1 + 2*f1][g_iup[ix][TUP]], &g_gauge_field[ix][TUP], NO_DAGG );
+
+                bispinor_spindown_mult_su3matrix( &starting2d[c1][ix], &starting2d[c1][ix], &g_gauge_field[g_idn[ix][TUP]][TUP], NO_DAGG );
+
+                taui_scalarfield_spinor( &starting2d[c1][ix], &starting2d[c1][ix], GAMMA_DN, tauindex, ix, NODIR, DAGGER );
+
+                multiply_backward_propagator(&starting2d[c1][ix], propfields, &starting2d[c1][ix], ix, TDOWN);
+
+              }
+            }
+#if defined TM_USE_MPI
+            for (c1=0; c1<3; ++c1){
+              count=0;
+              generic_exchange_direction_nonblocking( starting2d[c1], sizeof(bispinor), TDOWN, request, &count );
+              MPI_Waitall( count, request, statuses);
+            }
+#endif
+            for (ix=0; ix<VOLUME; ++ix){
+              for (i=0; i<8; ++i)
+                colortrace[i]=0.;
+              for (c1=0; c1<3; ++c1)
+                trace_in_color(colortrace,&starting2d[c1][g_idn[ix][TUP]],c1);
+              trace_in_space(spacetrace,colortrace,ix);
+            }
+#if defined TM_USE_MPI
+            for (i=0; i<8*T_global; ++i){
+              _Complex double tmp;
+              MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, g_cart_grid);
+              spacetrace[i]= tmp;
+            }
+#endif
+            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
+            trace_in_spinor(spinortrace, spacetrace, s1);
+          }//End of trace in spinor space
+          if ( type_ab == TYPE_A ){
+            taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG, LEFT );
+          }
+          else if ( type_ab == TYPE_B ){
+            taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER, LEFT  );
+          }
+          trace_in_flavor( flavortrace, spinortrace, f1 );
+        } //End of trace in flavor space
+      //sum for all Pauli matrices
+        for (i=0;i<T_global; ++i){
+          paulitrace[i]+=flavortrace[i];
+          (*results)[i+tauindex*T_global]=flavortrace[i]/4.;
+        }
+      } //End of trace for Pauli matrices
+      if  (g_cart_id == 0){printf("Wilson term Dirac Current Density correlator typeVI results= %s %s\n", type_12 == TYPE_1 ? "1" : "2",type_ab == TYPE_A ? "a" :"b");}
+       for (i=0; i<T_global; ++i){
+         if (g_cart_id == 0){
+          printf("WCDPL2 %d %d %.3d %10.10e %10.10e\n", type_12, type_ab, i, creal(paulitrace[i])/4., cimag(paulitrace[i])/4.);
+         }
+         (*results)[i+3*T_global]= paulitrace[i]/4.;
+       }
+       for (i=0; i<3; ++i){
+         free(starting2d[i]);
+       }
+       free(starting2d);
+   }
+   free(flavortrace);
+   free(paulitrace);
+   free(spacetrace);
+   free(spinortrace);
+   free(colortrace);
+}
+#endif
diff --git a/contractions/contractions_currentdensityextended.h b/contractions/contractions_currentdensityextended.h
new file mode 100644
index 000000000..f6d18e174
--- /dev/null
+++ b/contractions/contractions_currentdensityextended.h
@@ -0,0 +1,27 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Ferenc Pittler
+ *         
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _CONTRACTIONS_CURRENTDENSITYEXTENDED_H /* NOTE(review): no #include here; `bispinor` must already be declared by the includer */
+#define _CONTRACTIONS_CURRENTDENSITYEXTENDED_H
+void wilsonterm_current_density_312ab( bispinor ** propagators, int type_12, int type_ab, _Complex double **res ); /* "typeIII" contraction; writes its values through *res */
+void wilsonterm_current_density_412ab( bispinor ** propagators, int type_12, int type_ab, _Complex double **res ); /* "typeIV" contraction; mallocs *res (4*T_global entries) and does not free it */
+void wilsonterm_current_density_512ab( bispinor ** propagators, int type_12, int type_ab, _Complex double **res ); /* "typeV" contraction; mallocs *res (4*T_global entries) and does not free it */
+void wilsonterm_current_density_612ab( bispinor ** propagators, int type_12, int type_ab, _Complex double **res ); /* "typeVI" contraction; mallocs *res (4*T_global entries) and does not free it */
+#endif /* _CONTRACTIONS_CURRENTDENSITYEXTENDED_H */
diff --git a/contractions/contractions_helper.c b/contractions/contractions_helper.c
new file mode 100644
index 000000000..4ef12a2e9
--- /dev/null
+++ b/contractions/contractions_helper.c
@@ -0,0 +1,1661 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017 Ferenc Pittler
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+
+#ifdef TM_USE_BSM
+#include"lime.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <errno.h>
+#include <time.h>
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+#include "global.h"
+#include "getopt.h"
+#include "default_input_values.h"
+#include "read_input.h"
+#include "su3.h"
+#include "operator/tm_operators.h"
+#include "linalg_eo.h"
+#include "geometry_eo.h"
+#include "linalg/assign.h"
+#include "operator/D_psi.h"
+#include "operator/D_psi_BSM.h"
+#include "operator/D_psi_BSM2b.h"
+#include "operator/D_psi_BSM2f.h"
+#include "operator/D_psi_BSM2m.h"
+#include "operator/Dov_psi.h"
+#include "operator/tm_operators_nd.h"
+#include "operator/Hopping_Matrix.h"
+#include "invert_eo.h"
+#include "invert_doublet_eo.h"
+#include "invert_overlap.h"
+#include "invert_clover_eo.h"
+#include "init/init_scalar_field.h"
+#include "init/init_bsm_2hop_lookup.h"
+#include "boundary.h"
+#include "start.h"
+#include "solver/solver.h"
+#include "xchange/xchange_gauge.h"
+#include "prepare_source.h"
+#include <io/params.h>
+#include <io/gauge.h>
+#include <io/spinor.h>
+#include <io/utils.h>
+#include "io/scalar.h"
+#include "buffers/utils_nonblocking.h"
+#include "buffers/utils_nogauge.h"
+#include "test/overlaptests.h"
+#include "solver/index_jd.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
+#include "operator.h"
+#include "gettime.h"
+#include "measure_gauge_action.h"
+#include "mpi_init.h"
+#include "init/init_geometry_indices.h"
+#include "init/init_openmp.h"
+#include "init/init_gauge_field.h"
+#include "init/init_spinor_field.h"
+#include "init/init_bispinor_field.h"
+#include "solver/solver_field.h"
+#include "ranlxd.h"
+
+/* Indexing of propfields:
+
+   the propagator for a (daggered or non-daggered) source
+              with flavor component f
+              with color  component c
+              with spinor component s
+   is the following bispinor array of size VOLUME(PLUSRAND):
+
+   propfields[12*s + 4*c + 2*f + (dagg ? 1 : 0)]
+
+ */
+/**************************
+Multiplication with the backward propagator
+
+S == matrix element of D^-1 between the following states
+
+S( ytilde , x+-dir )       psi   x
+   flavor2, flavor1    x         flavor1
+   spinor2, spinor1              spinor1
+   color 2, color 1              color1
+
+=
+Stilde* (x+-dir , ytilde)      psi   x
+         flavor1, flavor2  x         flavor1
+         spinor1, spinor2            spinor1  
+         color 1, color 2            color1
+where Stilde is the matrix element of (D^dagger)^-1 between
+the corresponding states
+
+**************************/
+/* Run-time selector values, defined in another translation unit. They are plain ints, not constants. */
+extern int DAGGER;
+extern int NO_DAGG;
+/* DAGGER / NO_DAGG above are compared with the dagger/dagg arguments of the routines in this file. */
+extern int GAMMA_UP;
+extern int GAMMA_DN;
+extern int NO_GAMMA;
+/* NOTE(review): GAMMA_* and *_SCALAR look like insertion selectors -- usage not visible here, confirm with callers. */
+extern int WITH_SCALAR;
+extern int NO_SCALAR;
+
+extern int TYPE_A;
+extern int TYPE_B;
+
+extern int TYPE_1;
+extern int TYPE_2;
+extern int TYPE_3;
+extern int TYPE_4;
+
+extern int TYPE_I;
+extern int TYPE_II;
+/* RIGHT / LEFT are compared with the dir argument of taui_scalarfield_flavoronly(). */
+extern int RIGHT;
+extern int LEFT;
+
+
+_Complex double bispinor_scalar_product ( bispinor *s1, bispinor *s2 ){
+   /* Inner product <s1|s2>: every component of s1 is complex-conjugated, s2 enters as is.
+      The 24 products are summed left to right, sp_up.s0.c0 first and sp_dn.s3.c2 last. */
+   return conj(s1->sp_up.s0.c0) * s2->sp_up.s0.c0 + conj(s1->sp_up.s0.c1) * s2->sp_up.s0.c1 + conj(s1->sp_up.s0.c2) * s2->sp_up.s0.c2 +
+          conj(s1->sp_up.s1.c0) * s2->sp_up.s1.c0 + conj(s1->sp_up.s1.c1) * s2->sp_up.s1.c1 + conj(s1->sp_up.s1.c2) * s2->sp_up.s1.c2 +
+          conj(s1->sp_up.s2.c0) * s2->sp_up.s2.c0 + conj(s1->sp_up.s2.c1) * s2->sp_up.s2.c1 + conj(s1->sp_up.s2.c2) * s2->sp_up.s2.c2 +
+          conj(s1->sp_up.s3.c0) * s2->sp_up.s3.c0 + conj(s1->sp_up.s3.c1) * s2->sp_up.s3.c1 + conj(s1->sp_up.s3.c2) * s2->sp_up.s3.c2 +
+          conj(s1->sp_dn.s0.c0) * s2->sp_dn.s0.c0 + conj(s1->sp_dn.s0.c1) * s2->sp_dn.s0.c1 + conj(s1->sp_dn.s0.c2) * s2->sp_dn.s0.c2 +
+          conj(s1->sp_dn.s1.c0) * s2->sp_dn.s1.c0 + conj(s1->sp_dn.s1.c1) * s2->sp_dn.s1.c1 + conj(s1->sp_dn.s1.c2) * s2->sp_dn.s1.c2 +
+          conj(s1->sp_dn.s2.c0) * s2->sp_dn.s2.c0 + conj(s1->sp_dn.s2.c1) * s2->sp_dn.s2.c1 + conj(s1->sp_dn.s2.c2) * s2->sp_dn.s2.c2 +
+          conj(s1->sp_dn.s3.c0) * s2->sp_dn.s3.c0 + conj(s1->sp_dn.s3.c1) * s2->sp_dn.s3.c1 + conj(s1->sp_dn.s3.c2) * s2->sp_dn.s3.c2;
+}
+void multiply_backward_propagator( bispinor *dest, bispinor **propagator, bispinor *source, int idx, int dir){
+   int propcoord;
+   bispinor source_copy;
+   if (dir == NODIR){
+      propcoord=idx;
+   } 
+   else if (dir == TUP){
+      propcoord=g_iup[idx][TUP];
+   }
+   else if (dir == TDOWN){
+      propcoord=g_idn[idx][TUP];
+   }
+   else{
+      propcoord=0;
+      if (g_cart_id == 0){ fprintf(stderr,"Wrong direction in multiply backward prop\n"); 
+                           exit(1); }
+   }
+
+   _spinor_assign( source_copy.sp_dn, source->sp_dn);
+   _spinor_assign( source_copy.sp_up, source->sp_up);
+
+   dest->sp_up.s0.c0= bispinor_scalar_product ( &propagator[ 1][propcoord], &source_copy );
+   dest->sp_up.s0.c1= bispinor_scalar_product ( &propagator[ 5][propcoord], &source_copy );
+   dest->sp_up.s0.c2= bispinor_scalar_product ( &propagator[ 9][propcoord], &source_copy );
+
+   dest->sp_up.s1.c0= bispinor_scalar_product ( &propagator[13][propcoord], &source_copy );
+   dest->sp_up.s1.c1= bispinor_scalar_product ( &propagator[17][propcoord], &source_copy );
+   dest->sp_up.s1.c2= bispinor_scalar_product ( &propagator[21][propcoord], &source_copy );
+
+   dest->sp_up.s2.c0= bispinor_scalar_product ( &propagator[25][propcoord], &source_copy );
+   dest->sp_up.s2.c1= bispinor_scalar_product ( &propagator[29][propcoord], &source_copy );
+   dest->sp_up.s2.c2= bispinor_scalar_product ( &propagator[33][propcoord], &source_copy );
+
+   dest->sp_up.s3.c0= bispinor_scalar_product ( &propagator[37][propcoord], &source_copy );
+   dest->sp_up.s3.c1= bispinor_scalar_product ( &propagator[41][propcoord], &source_copy );
+   dest->sp_up.s3.c2= bispinor_scalar_product ( &propagator[45][propcoord], &source_copy );
+
+   dest->sp_dn.s0.c0= bispinor_scalar_product ( &propagator[ 3][propcoord], &source_copy );
+   dest->sp_dn.s0.c1= bispinor_scalar_product ( &propagator[ 7][propcoord], &source_copy );
+   dest->sp_dn.s0.c2= bispinor_scalar_product ( &propagator[11][propcoord], &source_copy );
+
+   dest->sp_dn.s1.c0= bispinor_scalar_product ( &propagator[15][propcoord], &source_copy );
+   dest->sp_dn.s1.c1= bispinor_scalar_product ( &propagator[19][propcoord], &source_copy );
+   dest->sp_dn.s1.c2= bispinor_scalar_product ( &propagator[23][propcoord], &source_copy );
+
+   dest->sp_dn.s2.c0= bispinor_scalar_product ( &propagator[27][propcoord], &source_copy );
+   dest->sp_dn.s2.c1= bispinor_scalar_product ( &propagator[31][propcoord], &source_copy );
+   dest->sp_dn.s2.c2= bispinor_scalar_product ( &propagator[35][propcoord], &source_copy );
+
+   dest->sp_dn.s3.c0= bispinor_scalar_product ( &propagator[39][propcoord], &source_copy );
+   dest->sp_dn.s3.c1= bispinor_scalar_product ( &propagator[43][propcoord], &source_copy );
+   dest->sp_dn.s3.c2= bispinor_scalar_product ( &propagator[47][propcoord], &source_copy );
+}
+void bispinor_mult_su3matrix( bispinor *dest, bispinor *source, su3 *a, int dagger){
+   /* Applies the colour matrix *a to all eight colour vectors of *source and stores the
+      result in *dest: _su3_inverse_multiply followed by _complexcjg_times_vector with phase_0
+      when dagger == DAGGER, otherwise _su3_multiply followed by _complex_times_vector with
+      phase_0. Each vector is finished (matrix, then phase) before the next one is started.
+      NOTE(review): phase_0 is a global not declared in this file -- presumably the
+      time-direction boundary phase; confirm before using this for other directions. */
+   bispinor in;   /* copy of the input, so dest may alias source */
+   _spinor_assign(in.sp_up, source->sp_up);
+   _spinor_assign(in.sp_dn, source->sp_dn);
+
+   if (dagger != DAGGER){
+     _su3_multiply(dest->sp_up.s0, *a, in.sp_up.s0);
+     _complex_times_vector(dest->sp_up.s0, phase_0, dest->sp_up.s0);
+     _su3_multiply(dest->sp_up.s1, *a, in.sp_up.s1);
+     _complex_times_vector(dest->sp_up.s1, phase_0, dest->sp_up.s1);
+     _su3_multiply(dest->sp_up.s2, *a, in.sp_up.s2);
+     _complex_times_vector(dest->sp_up.s2, phase_0, dest->sp_up.s2);
+     _su3_multiply(dest->sp_up.s3, *a, in.sp_up.s3);
+     _complex_times_vector(dest->sp_up.s3, phase_0, dest->sp_up.s3);
+
+     _su3_multiply(dest->sp_dn.s0, *a, in.sp_dn.s0);
+     _complex_times_vector(dest->sp_dn.s0, phase_0, dest->sp_dn.s0);
+     _su3_multiply(dest->sp_dn.s1, *a, in.sp_dn.s1);
+     _complex_times_vector(dest->sp_dn.s1, phase_0, dest->sp_dn.s1);
+     _su3_multiply(dest->sp_dn.s2, *a, in.sp_dn.s2);
+     _complex_times_vector(dest->sp_dn.s2, phase_0, dest->sp_dn.s2);
+     _su3_multiply(dest->sp_dn.s3, *a, in.sp_dn.s3);
+     _complex_times_vector(dest->sp_dn.s3, phase_0, dest->sp_dn.s3);
+   }
+   else{
+     _su3_inverse_multiply(dest->sp_up.s0, *a, in.sp_up.s0);
+     _complexcjg_times_vector(dest->sp_up.s0, phase_0, dest->sp_up.s0);
+     _su3_inverse_multiply(dest->sp_up.s1, *a, in.sp_up.s1);
+     _complexcjg_times_vector(dest->sp_up.s1, phase_0, dest->sp_up.s1);
+     _su3_inverse_multiply(dest->sp_up.s2, *a, in.sp_up.s2);
+     _complexcjg_times_vector(dest->sp_up.s2, phase_0, dest->sp_up.s2);
+     _su3_inverse_multiply(dest->sp_up.s3, *a, in.sp_up.s3);
+     _complexcjg_times_vector(dest->sp_up.s3, phase_0, dest->sp_up.s3);
+
+     _su3_inverse_multiply(dest->sp_dn.s0, *a, in.sp_dn.s0);
+     _complexcjg_times_vector(dest->sp_dn.s0, phase_0, dest->sp_dn.s0);
+     _su3_inverse_multiply(dest->sp_dn.s1, *a, in.sp_dn.s1);
+     _complexcjg_times_vector(dest->sp_dn.s1, phase_0, dest->sp_dn.s1);
+     _su3_inverse_multiply(dest->sp_dn.s2, *a, in.sp_dn.s2);
+     _complexcjg_times_vector(dest->sp_dn.s2, phase_0, dest->sp_dn.s2);
+     _su3_inverse_multiply(dest->sp_dn.s3, *a, in.sp_dn.s3);
+     _complexcjg_times_vector(dest->sp_dn.s3, phase_0, dest->sp_dn.s3);
+   }
+}
+void bispinor_spinup_mult_su3matrix( bispinor *dest, bispinor *source, su3 *a, int dagger){
+   /* Applies *a and the phase_0 factor to spin components s0 and s1 of both flavours only:
+      _su3_inverse_multiply + _complexcjg_times_vector when dagger == DAGGER, otherwise
+      _su3_multiply + _complex_times_vector.
+      NOTE(review): s2 and s3 of *dest are never written here and keep whatever the caller
+      left in them -- confirm that callers do not expect them to be zeroed. */
+   bispinor in;
+   _bispinor_null(in);
+   _vector_assign(in.sp_up.s0, source->sp_up.s0);
+   _vector_assign(in.sp_up.s1, source->sp_up.s1);
+   _vector_assign(in.sp_dn.s0, source->sp_dn.s0);
+   _vector_assign(in.sp_dn.s1, source->sp_dn.s1);
+
+   if (dagger != DAGGER){
+     _su3_multiply(dest->sp_up.s0, *a, in.sp_up.s0);
+     _complex_times_vector(dest->sp_up.s0, phase_0, dest->sp_up.s0);
+     _su3_multiply(dest->sp_up.s1, *a, in.sp_up.s1);
+     _complex_times_vector(dest->sp_up.s1, phase_0, dest->sp_up.s1);
+
+     _su3_multiply(dest->sp_dn.s0, *a, in.sp_dn.s0);
+     _complex_times_vector(dest->sp_dn.s0, phase_0, dest->sp_dn.s0);
+     _su3_multiply(dest->sp_dn.s1, *a, in.sp_dn.s1);
+     _complex_times_vector(dest->sp_dn.s1, phase_0, dest->sp_dn.s1);
+   }
+   else{
+     _su3_inverse_multiply(dest->sp_up.s0, *a, in.sp_up.s0);
+     _complexcjg_times_vector(dest->sp_up.s0, phase_0, dest->sp_up.s0);
+     _su3_inverse_multiply(dest->sp_up.s1, *a, in.sp_up.s1);
+     _complexcjg_times_vector(dest->sp_up.s1, phase_0, dest->sp_up.s1);
+
+     _su3_inverse_multiply(dest->sp_dn.s0, *a, in.sp_dn.s0);
+     _complexcjg_times_vector(dest->sp_dn.s0, phase_0, dest->sp_dn.s0);
+     _su3_inverse_multiply(dest->sp_dn.s1, *a, in.sp_dn.s1);
+     _complexcjg_times_vector(dest->sp_dn.s1, phase_0, dest->sp_dn.s1);
+   }
+}
+
+void bispinor_spindown_mult_su3matrix( bispinor *dest, bispinor *source, su3 *a, int dagger){
+   /* Applies *a and the phase_0 factor to spin components s2 and s3 of both flavours only:
+      _su3_inverse_multiply + _complexcjg_times_vector when dagger == DAGGER, otherwise
+      _su3_multiply + _complex_times_vector.
+      NOTE(review): s0 and s1 of *dest are never written here and keep whatever the caller
+      left in them -- confirm that callers do not expect them to be zeroed. */
+   bispinor in;
+   _bispinor_null(in);
+   _vector_assign(in.sp_up.s2, source->sp_up.s2);
+   _vector_assign(in.sp_up.s3, source->sp_up.s3);
+   _vector_assign(in.sp_dn.s2, source->sp_dn.s2);
+   _vector_assign(in.sp_dn.s3, source->sp_dn.s3);
+
+   if (dagger != DAGGER){
+     _su3_multiply(dest->sp_up.s2, *a, in.sp_up.s2);
+     _complex_times_vector(dest->sp_up.s2, phase_0, dest->sp_up.s2);
+     _su3_multiply(dest->sp_up.s3, *a, in.sp_up.s3);
+     _complex_times_vector(dest->sp_up.s3, phase_0, dest->sp_up.s3);
+
+     _su3_multiply(dest->sp_dn.s2, *a, in.sp_dn.s2);
+     _complex_times_vector(dest->sp_dn.s2, phase_0, dest->sp_dn.s2);
+     _su3_multiply(dest->sp_dn.s3, *a, in.sp_dn.s3);
+     _complex_times_vector(dest->sp_dn.s3, phase_0, dest->sp_dn.s3);
+   }
+   else{
+     _su3_inverse_multiply(dest->sp_up.s2, *a, in.sp_up.s2);
+     _complexcjg_times_vector(dest->sp_up.s2, phase_0, dest->sp_up.s2);
+     _su3_inverse_multiply(dest->sp_up.s3, *a, in.sp_up.s3);
+     _complexcjg_times_vector(dest->sp_up.s3, phase_0, dest->sp_up.s3);
+
+     _su3_inverse_multiply(dest->sp_dn.s2, *a, in.sp_dn.s2);
+     _complexcjg_times_vector(dest->sp_dn.s2, phase_0, dest->sp_dn.s2);
+     _su3_inverse_multiply(dest->sp_dn.s3, *a, in.sp_dn.s3);
+     _complexcjg_times_vector(dest->sp_dn.s3, phase_0, dest->sp_dn.s3);
+   }
+}
+
+
+void bispinor_timesgamma0( bispinor *dest){
+   /* In place, for both flavours: exchange spin components s0 <-> s2 and s1 <-> s3
+      (no sign or phase is applied). */
+   su3_vector held;
+   _vector_assign(  held, dest->sp_up.s0);
+   _vector_assign(  dest->sp_up.s0, dest->sp_up.s2);
+   _vector_assign(  dest->sp_up.s2, held);
+   _vector_assign(  held, dest->sp_up.s1);
+   _vector_assign(  dest->sp_up.s1, dest->sp_up.s3);
+   _vector_assign(  dest->sp_up.s3, held);
+
+   _vector_assign(  held, dest->sp_dn.s0);
+   _vector_assign(  dest->sp_dn.s0, dest->sp_dn.s2);
+   _vector_assign(  dest->sp_dn.s2, held);
+   _vector_assign(  held, dest->sp_dn.s1);
+   _vector_assign(  dest->sp_dn.s1, dest->sp_dn.s3);
+   _vector_assign(  dest->sp_dn.s3, held);
+}
+void bispinor_timesgamma5( bispinor *dest){
+   /* In place, for both flavours: multiply spin components s2 and s3 by -1;
+      s0 and s1 are left as they are. */
+   _vector_mul(dest->sp_dn.s3, -1, dest->sp_dn.s3);
+   _vector_mul(dest->sp_dn.s2, -1, dest->sp_dn.s2);
+   _vector_mul(dest->sp_up.s3, -1, dest->sp_up.s3);
+   _vector_mul(dest->sp_up.s2, -1, dest->sp_up.s2);
+}
+void bispinor_taui( bispinor *dest, int tauindex){
+   /* In-place flavour rotation of *dest, acting on the (sp_up, sp_dn) pair:
+        tauindex 0:  up <- dn,       dn <- up
+        tauindex 1:  up <- -i * dn,  dn <- +i * up
+        tauindex 2:  up unchanged,   dn <- -dn
+      Any other tauindex leaves *dest as it is. */
+   bispinor in;
+   _spinor_assign(in.sp_up,  dest->sp_up);
+   _spinor_assign(in.sp_dn,  dest->sp_dn);
+   if (tauindex == 0){
+     _spinor_assign( dest->sp_up, in.sp_dn);
+     _spinor_assign( dest->sp_dn, in.sp_up);
+   }
+   else if (tauindex == 1){
+     _vector_mul(dest->sp_up.s0, -1.*I, in.sp_dn.s0);
+     _vector_mul(dest->sp_up.s1, -1.*I, in.sp_dn.s1);
+     _vector_mul(dest->sp_up.s2, -1.*I, in.sp_dn.s2);
+     _vector_mul(dest->sp_up.s3, -1.*I, in.sp_dn.s3);
+     _vector_mul(dest->sp_dn.s0, +1.*I, in.sp_up.s0);
+     _vector_mul(dest->sp_dn.s1, +1.*I, in.sp_up.s1);
+     _vector_mul(dest->sp_dn.s2, +1.*I, in.sp_up.s2);
+     _vector_mul(dest->sp_dn.s3, +1.*I, in.sp_up.s3);
+   }
+   else if (tauindex == 2){
+     _vector_mul(dest->sp_dn.s0, -1, in.sp_dn.s0);
+     _vector_mul(dest->sp_dn.s1, -1, in.sp_dn.s1);
+     _vector_mul(dest->sp_dn.s2, -1, in.sp_dn.s2);
+     _vector_mul(dest->sp_dn.s3, -1, in.sp_dn.s3);
+   }
+}
+
+/* dest is both input and output: for each i in [0, T_global) the pair (dest[2i], dest[2i+1])
+   is replaced by the 2x2 matrix (a11 a12; a21 a22) applied to it. The matrix is built from
+   components 0..3 of the scalar field at site index 0 -- g_smeared_scalar_field when
+   smearedcorrelator_BSM == 1, g_scalar_field otherwise -- and is selected by
+   dir (LEFT/RIGHT), dagger (DAGGER/NO_DAGG) and tauindex (0, 1, 2).
+   NOTE(review): any other selector combination leaves the matrix at zero, so dest is
+   silently zeroed; confirm this is intended.
+   Each pair is staged in two locals inside the loop, so no heap scratch buffer (and no
+   allocation-failure path that only one MPI rank would act on) is needed. */
+void taui_scalarfield_flavoronly( _Complex double *dest, int tauindex, int dagger, int dir ){
+   _Complex double a11=0.0, a12=0.0, a21=0.0, a22=0.0;
+   int i;
+
+   if (dir == LEFT){
+     if (dagger == DAGGER){
+       if (tauindex == 0){
+         if (smearedcorrelator_BSM == 1){
+           a11=  -1.*g_smeared_scalar_field[2][0] - I*g_smeared_scalar_field[1][0];
+           a12=  +1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+
+           a21=  +1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+           a22=  +1.*g_smeared_scalar_field[2][0] - I*g_smeared_scalar_field[1][0];
+         }
+         else{
+           a11=  -1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
+           a12=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+
+           a21=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+           a22=  +1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
+         }
+       }
+       else  if (tauindex == 1){
+         if (smearedcorrelator_BSM == 1) {
+           a11=  +1.*g_smeared_scalar_field[1][0] - I*g_smeared_scalar_field[2][0];
+           a12=  -1.*g_smeared_scalar_field[3][0] - I*g_smeared_scalar_field[0][0];
+
+           a21=  -1.*g_smeared_scalar_field[3][0] + I*g_smeared_scalar_field[0][0];
+           a22=  -1.*g_smeared_scalar_field[1][0] - I*g_smeared_scalar_field[2][0];
+         }
+         else{
+           a11=  +1.*g_scalar_field[1][0] - I*g_scalar_field[2][0];
+           a12=  -1.*g_scalar_field[3][0] - I*g_scalar_field[0][0];
+
+           a21=  -1.*g_scalar_field[3][0] + I*g_scalar_field[0][0];
+           a22=  -1.*g_scalar_field[1][0] - I*g_scalar_field[2][0];
+         }
+       }
+       else  if (tauindex == 2){
+         if (smearedcorrelator_BSM == 1){ 
+           a11=  +1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+           a12=  +1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+
+           a21=  +1.*g_smeared_scalar_field[2][0] - I*g_smeared_scalar_field[1][0];
+           a22=  -1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+         }
+         else{
+           a11=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+           a12=  +1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+
+           a21=  +1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
+           a22=  -1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+         }
+       }
+     }
+     else if (dagger == NO_DAGG){
+       if (tauindex == 0){
+         if (smearedcorrelator_BSM == 1){
+           a11=  -1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+           a12=  +1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+
+           a21=  +1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+           a22=  +1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+         }
+         else{
+           a11=  -1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+           a12=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+
+           a21=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+           a22=  +1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+         }
+       }
+       else if (tauindex == 1){
+         if (smearedcorrelator_BSM == 1){
+           a11=  +1.*g_smeared_scalar_field[1][0] + I*g_smeared_scalar_field[2][0];
+           a12=  -1.*g_smeared_scalar_field[3][0] - I*g_smeared_scalar_field[0][0];
+
+           a21=  -1.*g_smeared_scalar_field[3][0] + I*g_smeared_scalar_field[0][0];
+           a22=  -1.*g_smeared_scalar_field[1][0] + I*g_smeared_scalar_field[2][0];
+         }
+         else{
+           a11=  +1.*g_scalar_field[1][0] + I*g_scalar_field[2][0];
+           a12=  -1.*g_scalar_field[3][0] - I*g_scalar_field[0][0];
+
+           a21=  -1.*g_scalar_field[3][0] + I*g_scalar_field[0][0];
+           a22=  -1.*g_scalar_field[1][0] + I*g_scalar_field[2][0];
+         }
+       }
+       else if (tauindex == 2){
+         if (smearedcorrelator_BSM == 1){
+           a11=  +1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+           a12=  +1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+
+           a21=  +1.*g_smeared_scalar_field[2][0] - I*g_smeared_scalar_field[1][0];
+           a22=  -1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+         }
+         else{
+           a11=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+           a12=  +1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+
+           a21=  +1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
+           a22=  -1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+         }
+       }
+     }
+   }
+   else if ( dir == RIGHT ){
+     if (dagger == DAGGER){
+       if (tauindex == 0){
+         if (smearedcorrelator_BSM == 1){
+           a11=  +1.*g_smeared_scalar_field[2][0] - I*g_smeared_scalar_field[1][0];
+           a12=  +1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+
+           a21=  +1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+           a22=  -1.*g_smeared_scalar_field[2][0] - I*g_smeared_scalar_field[1][0];
+         }
+         else{
+           a11=  +1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
+           a12=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+
+           a21=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+           a22=  -1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
+         }
+       }
+       else  if (tauindex == 1){
+         if (smearedcorrelator_BSM == 1) {
+           a11=  -1.*g_smeared_scalar_field[1][0] - I*g_smeared_scalar_field[2][0];
+           a12=  +1.*g_smeared_scalar_field[3][0] - I*g_smeared_scalar_field[0][0];
+
+           a21=  +1.*g_smeared_scalar_field[3][0] + I*g_smeared_scalar_field[0][0];
+           a22=  +1.*g_smeared_scalar_field[1][0] - I*g_smeared_scalar_field[2][0];
+         }
+         else{
+           a11=  -1.*g_scalar_field[1][0] - I*g_scalar_field[2][0];
+           a12=  +1.*g_scalar_field[3][0] - I*g_scalar_field[0][0];
+
+           a21=  +1.*g_scalar_field[3][0] + I*g_scalar_field[0][0];
+           a22=  +1.*g_scalar_field[1][0] - I*g_scalar_field[2][0];
+         }
+       }
+       else  if (tauindex == 2){
+         if (smearedcorrelator_BSM == 1){
+           a11=  +1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+           a12=  -1.*g_smeared_scalar_field[2][0] - I*g_smeared_scalar_field[1][0];
+
+           a21=  -1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+           a22=  -1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+         }
+         else{
+           a11=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+           a12=  -1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
+
+           a21=  -1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+           a22=  -1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+         }
+       }
+     }
+     else if (dagger == NO_DAGG){
+       if (tauindex == 0){
+         if (smearedcorrelator_BSM == 1){
+           a11=  +1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+           a12=  +1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+
+           a21=  +1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+           a22=  -1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+         }
+         else{
+           a11=  +1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+           a12=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+
+           a21=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+           a22=  -1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+         }
+       }
+       else if (tauindex == 1){
+         if (smearedcorrelator_BSM == 1){
+           a11=  -1.*g_smeared_scalar_field[1][0] + I*g_smeared_scalar_field[2][0];
+           a12=  +1.*g_smeared_scalar_field[3][0] - I*g_smeared_scalar_field[0][0];
+
+           a21=  +1.*g_smeared_scalar_field[3][0] + I*g_smeared_scalar_field[0][0];
+           a22=  +1.*g_smeared_scalar_field[1][0] + I*g_smeared_scalar_field[2][0];
+         }
+         else{
+           a11=  -1.*g_scalar_field[1][0] + I*g_scalar_field[2][0];
+           a12=  +1.*g_scalar_field[3][0] - I*g_scalar_field[0][0];
+
+           a21=  +1.*g_scalar_field[3][0] + I*g_scalar_field[0][0];
+           a22=  +1.*g_scalar_field[1][0] + I*g_scalar_field[2][0];
+         }
+       }
+       else if (tauindex == 2){
+         if (smearedcorrelator_BSM == 1){
+           a11=  +1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+           a12=  -1.*g_smeared_scalar_field[2][0] - I*g_smeared_scalar_field[1][0];
+
+           a21=  -1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+           a22=  -1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+         }
+         else{
+           a11=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+           a12=  -1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
+
+           a21=  -1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+           a22=  -1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+         }
+       }
+     }
+   }
+   for (i=0; i<T_global; ++i){
+     const _Complex double up = dest[2*i + 0];   /* read both inputs before overwriting them */
+     const _Complex double dn = dest[2*i + 1];
+     dest[2*i +0]= a11* up + a12* dn;
+     dest[2*i +1]= a21* up + a22* dn;
+   }
+}
+/* dest is both input and output: for each i in [0, T_global) the pair (dest[2i], dest[2i+1]) is
+   replaced by a 2x2 matrix applied to it. The matrix is built from components 0..3 of the scalar
+   field at site index 0 (g_smeared_scalar_field when smearedcorrelator_BSM == 1, otherwise
+   g_scalar_field); dagger selects the DAGGER or NO_DAGG form. Each pair is staged in two locals,
+   so no heap scratch buffer (and no unchecked allocation) is needed. */
+void taui_scalarfield_flavoronly_s0s0( _Complex double *dest, int dagger ){
+   _Complex double a11=0.0, a12=0.0, a21=0.0, a22=0.0;
+   int i;
+   if (dagger == DAGGER){
+     if (smearedcorrelator_BSM == 1){
+       a11=  +1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+       a12=  -1.*g_smeared_scalar_field[2][0] - I*g_smeared_scalar_field[1][0];
+
+       a21=  +1.*g_smeared_scalar_field[2][0] - I*g_smeared_scalar_field[1][0];
+       a22=  +1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+     }
+     else{
+       a11=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+       a12=  -1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
+
+       a21=  +1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
+       a22=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+     }
+   }
+   else if (dagger==NO_DAGG){
+     if (smearedcorrelator_BSM == 1){
+       a11=  +1.*g_smeared_scalar_field[0][0] + I*g_smeared_scalar_field[3][0];
+       a12=  +1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+
+       a21=  -1.*g_smeared_scalar_field[2][0] + I*g_smeared_scalar_field[1][0];
+       a22=  +1.*g_smeared_scalar_field[0][0] - I*g_smeared_scalar_field[3][0];
+     }
+     else{
+       a11=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
+       a12=  +1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+
+       a21=  -1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
+       a22=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
+     }
+   }
+   else{
+      a11=0.;
+      a12=0.;
+      a21=0.;
+      a22=0.;
+      if (g_cart_id == 0){printf("Wrong Dagger index\n"); exit(1);}
+   }
+   for (i=0; i<T_global; ++i){
+     const _Complex double up = dest[2*i + 0];   /* read both inputs before overwriting them */
+     const _Complex double dn = dest[2*i + 1];
+     dest[2*i +0]= a11* up + a12* dn;
+     dest[2*i +1]= a21* up + a22* dn;
+   }
+}
+void mult_phi_flavoronly( _Complex double *dest, int dagg){
+   /* In place: for each i in [0, T_global) the pair (dest[2i], dest[2i+1]) is replaced by the
+      2x2 matrix (a11 a12; a21 a22) applied to it. The matrix is built from components 0..3 of
+      the scalar field at site index 0 (g_smeared_scalar_field when smearedcorrelator_BSM == 1,
+      otherwise g_scalar_field); dagg selects the NO_DAGG or DAGGER form. For any other dagg
+      only the process with g_cart_id == 0 reports the error and exits. Each pair is staged in
+      two locals, so no heap scratch buffer is needed. */
+   _Complex double a11=0.0, a12=0.0, a21=0.0, a22=0.0;
+   int i;
+
+   if ( dagg == NO_DAGG ){
+     if ( smearedcorrelator_BSM == 1 ){
+       a11=+1.*g_smeared_scalar_field[0][0]+1.*I*g_smeared_scalar_field[3][0];
+       a12=+1.*g_smeared_scalar_field[2][0]+1.*I*g_smeared_scalar_field[1][0];
+       a21=-1.*g_smeared_scalar_field[2][0]+1.*I*g_smeared_scalar_field[1][0];
+       a22=+1.*g_smeared_scalar_field[0][0]-1.*I*g_smeared_scalar_field[3][0];
+     }
+     else{
+       a11=+1.*g_scalar_field[0][0]+1.*I*g_scalar_field[3][0];
+       a12=+1.*g_scalar_field[2][0]+1.*I*g_scalar_field[1][0];
+       a21=-1.*g_scalar_field[2][0]+1.*I*g_scalar_field[1][0];
+       a22=+1.*g_scalar_field[0][0]-1.*I*g_scalar_field[3][0];
+     }
+   }
+   else if (dagg == DAGGER){
+     if ( smearedcorrelator_BSM == 1 ){
+       a11=+1.*g_smeared_scalar_field[0][0]-1.*I*g_smeared_scalar_field[3][0];
+       a12=-1.*g_smeared_scalar_field[2][0]-1.*I*g_smeared_scalar_field[1][0];
+       a21=+1.*g_smeared_scalar_field[2][0]-1.*I*g_smeared_scalar_field[1][0];
+       a22=+1.*g_smeared_scalar_field[0][0]+1.*I*g_smeared_scalar_field[3][0];
+     }
+     else{
+       a11=+1.*g_scalar_field[0][0]-1.*I*g_scalar_field[3][0];
+       a12=-1.*g_scalar_field[2][0]-1.*I*g_scalar_field[1][0];
+       a21=+1.*g_scalar_field[2][0]-1.*I*g_scalar_field[1][0];
+       a22=+1.*g_scalar_field[0][0]+1.*I*g_scalar_field[3][0];
+     }
+   }
+   else{
+     if (g_cart_id == 0) {printf("Error in giving the index in mult_phi_flavoronly\n");
+                          exit(1);
+                         }
+   }
+   for (i=0; i<T_global; ++i){
+     const _Complex double up = dest[2*i + 0];   /* read both inputs before overwriting them */
+     const _Complex double dn = dest[2*i + 1];
+     dest[2*i +0]= a11* up + a12* dn;
+     dest[2*i +1]= a21* up + a22* dn;
+   }
+}
+void mult_taui_flavoronly( _Complex double *dest, int tauindex){
+   /* In place: for each i in [0, T_global) the pair (dest[2i], dest[2i+1]) is replaced by a
+      constant 2x2 matrix applied to it:
+        tauindex 0: (0 1; 1 0)    tauindex 1: (0 -i; i 0)    tauindex 2: (1 0; 0 -1)
+      For any other tauindex only the process with g_cart_id == 0 reports the error and exits.
+      Each pair is staged in two locals, so no heap scratch buffer is needed. */
+   _Complex double a11=0.0, a12=0.0, a21=0.0, a22=0.0;
+   int i;
+
+   if ( tauindex == 2 ){
+     a11=+1.;
+     a12= 0.;
+     a21= 0.;
+     a22=-1.;
+   }
+   else if ( tauindex == 1 ){
+     a11=    0.;
+     a12= -1.*I;
+     a21=     I;
+     a22=     0;
+   }
+   else if ( tauindex == 0 ){
+     a11= 0.;
+     a12=+1.;
+     a21=+1.;
+     a22= 0.;
+   }
+   else{
+     if (g_cart_id == 0) {printf("Error in giving the tauindex in mult_taui_flavoronly\n");
+                          exit(1);
+                         }
+   }
+   for (i=0; i<T_global; ++i){
+     const _Complex double up = dest[2*i + 0];   /* read both inputs before overwriting them */
+     const _Complex double dn = dest[2*i + 1];
+     dest[2*i +0]= a11* up + a12* dn;
+     dest[2*i +1]= a21* up + a22* dn;
+   }
+}
+void mult_phi( bispinor *dest, bispinor *source, int ix, int dagg){
+   bispinor tmp;
+   _spinor_assign(tmp.sp_up, source->sp_up);
+   _spinor_assign(tmp.sp_dn, source->sp_dn);
+   _Complex double a11=0., a12=0., a21=0., a22=0.;
+
+   if ( dagg == NO_DAGG ){
+     if ( smearedcorrelator_BSM == 1 ){
+       a11=+1.*g_smeared_scalar_field[0][ix]+1.*I*g_smeared_scalar_field[3][ix];
+       a12=+1.*g_smeared_scalar_field[2][ix]+1.*I*g_smeared_scalar_field[1][ix];
+       a21=-1.*g_smeared_scalar_field[2][ix]+1.*I*g_smeared_scalar_field[1][ix];
+       a22=+1.*g_smeared_scalar_field[0][ix]-1.*I*g_smeared_scalar_field[3][ix];
+     }
+     else{
+       a11=+1.*g_scalar_field[0][ix]+1.*I*g_scalar_field[3][ix];
+       a12=+1.*g_scalar_field[2][ix]+1.*I*g_scalar_field[1][ix];
+       a21=-1.*g_scalar_field[2][ix]+1.*I*g_scalar_field[1][ix];
+       a22=+1.*g_scalar_field[0][ix]-1.*I*g_scalar_field[3][ix];
+     }
+   }
+   else if (dagg == DAGGER){
+     if ( smearedcorrelator_BSM == 1 ){
+       a11=+1.*g_smeared_scalar_field[0][ix]-1.*I*g_smeared_scalar_field[3][ix];
+       a12=-1.*g_smeared_scalar_field[2][ix]-1.*I*g_smeared_scalar_field[1][ix];
+       a21=+1.*g_smeared_scalar_field[2][ix]-1.*I*g_smeared_scalar_field[1][ix];
+       a22=+1.*g_smeared_scalar_field[0][ix]+1.*I*g_smeared_scalar_field[3][ix];
+     }
+     else{
+       a11=+1.*g_scalar_field[0][ix]-1.*I*g_scalar_field[3][ix];
+       a12=-1.*g_scalar_field[2][ix]-1.*I*g_scalar_field[1][ix];
+       a21=+1.*g_scalar_field[2][ix]-1.*I*g_scalar_field[1][ix];
+       a22=+1.*g_scalar_field[0][ix]+1.*I*g_scalar_field[3][ix];
+     }
+   }
+   else{
+     if (g_cart_id == 0) {printf("Error in giving the index in mult_phi_flavoronly\n");
+                          exit(1);
+                         }
+   }
+   dest->sp_up.s0.c0 = a11 * tmp.sp_up.s0.c0 + a12 * tmp.sp_dn.s0.c0;
+   dest->sp_up.s0.c1 = a11 * tmp.sp_up.s0.c1 + a12 * tmp.sp_dn.s0.c1;
+   dest->sp_up.s0.c2 = a11 * tmp.sp_up.s0.c2 + a12 * tmp.sp_dn.s0.c2;
+
+   dest->sp_up.s1.c0 = a11 * tmp.sp_up.s1.c0 + a12 * tmp.sp_dn.s1.c0;
+   dest->sp_up.s1.c1 = a11 * tmp.sp_up.s1.c1 + a12 * tmp.sp_dn.s1.c1;
+   dest->sp_up.s1.c2 = a11 * tmp.sp_up.s1.c2 + a12 * tmp.sp_dn.s1.c2;
+
+   dest->sp_up.s2.c0 = a11 * tmp.sp_up.s2.c0 + a12 * tmp.sp_dn.s2.c0;
+   dest->sp_up.s2.c1 = a11 * tmp.sp_up.s2.c1 + a12 * tmp.sp_dn.s2.c1;
+   dest->sp_up.s2.c2 = a11 * tmp.sp_up.s2.c2 + a12 * tmp.sp_dn.s2.c2;
+
+   dest->sp_up.s3.c0 = a11 * tmp.sp_up.s3.c0 + a12 * tmp.sp_dn.s3.c0;
+   dest->sp_up.s3.c1 = a11 * tmp.sp_up.s3.c1 + a12 * tmp.sp_dn.s3.c1;
+   dest->sp_up.s3.c2 = a11 * tmp.sp_up.s3.c2 + a12 * tmp.sp_dn.s3.c2;
+
+   dest->sp_dn.s0.c0 = a21 * tmp.sp_up.s0.c0 + a22 * tmp.sp_dn.s0.c0;
+   dest->sp_dn.s0.c1 = a21 * tmp.sp_up.s0.c1 + a22 * tmp.sp_dn.s0.c1;
+   dest->sp_dn.s0.c2 = a21 * tmp.sp_up.s0.c2 + a22 * tmp.sp_dn.s0.c2;
+
+   dest->sp_dn.s1.c0 = a21 * tmp.sp_up.s1.c0 + a22 * tmp.sp_dn.s1.c0;
+   dest->sp_dn.s1.c1 = a21 * tmp.sp_up.s1.c1 + a22 * tmp.sp_dn.s1.c1;
+   dest->sp_dn.s1.c2 = a21 * tmp.sp_up.s1.c2 + a22 * tmp.sp_dn.s1.c2;
+
+   dest->sp_dn.s2.c0 = a21 * tmp.sp_up.s2.c0 + a22 * tmp.sp_dn.s2.c0;
+   dest->sp_dn.s2.c1 = a21 * tmp.sp_up.s2.c1 + a22 * tmp.sp_dn.s2.c1;
+   dest->sp_dn.s2.c2 = a21 * tmp.sp_up.s2.c2 + a22 * tmp.sp_dn.s2.c2;
+
+   dest->sp_dn.s3.c0 = a21 * tmp.sp_up.s3.c0 + a22 * tmp.sp_dn.s3.c0;
+   dest->sp_dn.s3.c1 = a21 * tmp.sp_up.s3.c1 + a22 * tmp.sp_dn.s3.c1;
+   dest->sp_dn.s3.c2 = a21 * tmp.sp_up.s3.c2 + a22 * tmp.sp_dn.s3.c2;
+
+}
+void taui_spinor( bispinor *dest, bispinor *source, int tauindex ){
+
+   su3_vector tmp2;
+   bispinor tmp;
+   _spinor_assign(tmp.sp_up, source->sp_up);
+   _spinor_assign(tmp.sp_dn, source->sp_dn);
+
+
+   if (tauindex == 0 ){
+    _vector_assign(tmp2        , tmp.sp_up.s0);
+    _vector_assign(tmp.sp_up.s0, tmp.sp_dn.s0);
+    _vector_assign(tmp.sp_dn.s0, tmp2);
+
+    _vector_assign(tmp2        , tmp.sp_up.s1);
+    _vector_assign(tmp.sp_up.s1, tmp.sp_dn.s1);
+    _vector_assign(tmp.sp_dn.s1, tmp2);
+
+    _vector_assign(tmp2        , tmp.sp_up.s2);
+    _vector_assign(tmp.sp_up.s2, tmp.sp_dn.s2);
+    _vector_assign(tmp.sp_dn.s2, tmp2);
+
+    _vector_assign(tmp2        , tmp.sp_up.s3);
+    _vector_assign(tmp.sp_up.s3, tmp.sp_dn.s3);
+    _vector_assign(tmp.sp_dn.s3, tmp2);
+
+ 
+    _spinor_assign(dest->sp_up, tmp.sp_up);
+    _spinor_assign(dest->sp_dn, tmp.sp_dn);
+   }
+   else if (tauindex == 1 ){
+    _vector_assign(tmp2             ,tmp.sp_up.s0);
+    _vector_i_mul( tmp.sp_up.s0, -1 ,tmp.sp_dn.s0);
+    _vector_i_mul( tmp.sp_dn.s0, +1 ,tmp2);
+
+    _vector_assign(tmp2             ,tmp.sp_up.s1);
+    _vector_i_mul( tmp.sp_up.s1, -1 ,tmp.sp_dn.s1);
+    _vector_i_mul( tmp.sp_dn.s1, +1, tmp2);   
+
+    _vector_assign(tmp2             ,tmp.sp_up.s2);
+    _vector_i_mul( tmp.sp_up.s2, -1 ,tmp.sp_dn.s2);
+    _vector_i_mul( tmp.sp_dn.s2, +1 ,tmp2);
+
+    _vector_assign(tmp2             ,tmp.sp_up.s3);
+    _vector_i_mul( tmp.sp_up.s3, -1 ,tmp.sp_dn.s3);
+    _vector_i_mul( tmp.sp_dn.s3, +1, tmp2);
+     
+    _spinor_assign(dest->sp_up, tmp.sp_up);
+    _spinor_assign(dest->sp_dn, tmp.sp_dn);
+   }
+   else if (tauindex == 2 ){
+    _vector_mul(tmp.sp_dn.s0, -1, tmp.sp_dn.s0);
+    _vector_mul(tmp.sp_dn.s1, -1, tmp.sp_dn.s1);
+
+    _vector_mul(tmp.sp_dn.s2, -1, tmp.sp_dn.s2);
+    _vector_mul(tmp.sp_dn.s3, -1, tmp.sp_dn.s3);
+
+     
+    _spinor_assign(dest->sp_up, tmp.sp_up);
+    _spinor_assign(dest->sp_dn, tmp.sp_dn);
+   }
+}
+
+void phi0_taui_commutator( _Complex double *dest,int tauindex ){
+
+   _Complex double *source_copy;
+   _Complex double a11=0.0, a12=0.0, a21=0.0, a22=0.0;
+   int i;
+
+   source_copy=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   if (source_copy == NULL){ 
+     printf("Error in mem allcoation in phi0 tau3 commutator\n");
+     exit(1);
+   }
+   for (i=0; i<2*T_global; ++i)
+     source_copy[i]=dest[i];
+   if (tauindex == 2){
+     if (smearedcorrelator_BSM == 1){
+       a11=0.;
+       a12=-2.*g_smeared_scalar_field[2][0]-2.*I*g_smeared_scalar_field[1][0];
+       a21=-2.*g_smeared_scalar_field[2][0]+2.*I*g_smeared_scalar_field[1][0];
+       a22=0.;
+     }
+     else{
+       a11=0.;
+       a12=-2.*g_scalar_field[2][0]-2.*I*g_scalar_field[1][0];
+       a21=-2.*g_scalar_field[2][0]+2.*I*g_scalar_field[1][0];
+       a22=0.;
+     }
+   }
+   if (tauindex == 1){
+     if (smearedcorrelator_BSM == 1){
+       a11=-2.*g_smeared_scalar_field[1][0];
+       a12=+2.*g_smeared_scalar_field[3][0];
+       a21=+2.*g_smeared_scalar_field[3][0];
+       a22=+2.*g_smeared_scalar_field[1][0];
+     }
+     else{
+       a11=-2.*g_scalar_field[1][0];
+       a12=+2.*g_scalar_field[3][0];
+       a21=+2.*g_scalar_field[3][0];
+       a22=+2.*g_scalar_field[1][0];
+     }
+   }
+   if (tauindex == 0){
+     if (smearedcorrelator_BSM == 1){
+       a11=+2.*  g_smeared_scalar_field[2][0];
+       a12=+2.*I*g_smeared_scalar_field[3][0];
+       a21=-2.*I*g_smeared_scalar_field[3][0];
+       a22=-2.*  g_smeared_scalar_field[2][0];
+     }
+     else{
+       a11=+2.*  g_scalar_field[2][0];
+       a12=+2.*I*g_scalar_field[3][0];
+       a21=-2.*I*g_scalar_field[3][0];
+       a22=-2.*  g_scalar_field[2][0];
+     }
+   }
+
+
+   for (i=0; i<T_global; ++i){
+     dest[2*i +0]= a11* source_copy[2*i + 0] + a12* source_copy[2*i + 1];
+     dest[2*i +1]= a21* source_copy[2*i + 0] + a22* source_copy[2*i + 1];
+   }
+   free(source_copy);
+}
+//This routine computes the commutator between Phi(x)and tau^i
+//times a bispinor vector
+//Here Phi(x) is represented by a matrix
+//
+//(+phi_0+i*phi_3   phi_2+iphi_1)
+//(-phi_2+i*phi_1   phi_0-iphi_3)
+//
+void phix_taui_commutator_bispinor( bispinor *dest,int tauindex, int gamma5, int ix ){
+
+   bispinor source_copy;
+   bispinor tmpbi2;
+   _Complex double a11=0.0, a12=0.0, a21=0.0, a22=0.0;
+
+   _spinor_assign(source_copy.sp_up, dest->sp_up);
+   _spinor_assign(source_copy.sp_dn, dest->sp_dn);
+
+   if (tauindex == 2){
+     if (smearedcorrelator_BSM == 1){
+       a11=0.;
+       a12=-2.*g_smeared_scalar_field[2][ix]-2.*I*g_smeared_scalar_field[1][ix];
+       a21=-2.*g_smeared_scalar_field[2][ix]+2.*I*g_smeared_scalar_field[1][ix];
+       a22=0.;
+     }
+     else{
+       a11=0.;
+       a12=-2.*g_scalar_field[2][ix]-2.*I*g_scalar_field[1][ix];
+       a21=-2.*g_scalar_field[2][ix]+2.*I*g_scalar_field[1][ix];
+       a22=0.;
+     }
+   }
+   else if (tauindex == 1){
+     if (smearedcorrelator_BSM == 1){
+       a11=-2.*g_smeared_scalar_field[1][ix];
+       a12=+2.*g_smeared_scalar_field[3][ix];
+       a21=+2.*g_smeared_scalar_field[3][ix];
+       a22=+2.*g_smeared_scalar_field[1][ix];
+     }
+     else{
+       a11=-2.*g_scalar_field[1][ix];
+       a12=+2.*g_scalar_field[3][ix];
+       a21=+2.*g_scalar_field[3][ix];
+       a22=+2.*g_scalar_field[1][ix];
+     }
+   }
+   else if (tauindex == 0){
+     if (smearedcorrelator_BSM == 1){
+       a11=+2.*  g_smeared_scalar_field[2][ix];
+       a12=+2.*I*g_smeared_scalar_field[3][ix];
+       a21=-2.*I*g_smeared_scalar_field[3][ix];
+       a22=-2.*  g_smeared_scalar_field[2][ix];
+     }
+     else{
+       a11=+2.*  g_scalar_field[2][ix];
+       a12=+2.*I*g_scalar_field[3][ix];
+       a21=-2.*I*g_scalar_field[3][ix];
+       a22=-2.*  g_scalar_field[2][ix];
+     }
+   }
+   else {
+     if (g_cart_id == 0){ 
+       printf("Wrong Pauli matrix index\n");
+       exit(1);
+     }    
+   }
+   _spinor_null(tmpbi2.sp_up);
+   _spinor_null(tmpbi2.sp_dn);
+
+   if ( gamma5 == GAMMA_UP){
+     _vector_mul_complex(    tmpbi2.sp_up.s0, a11, source_copy.sp_up.s0);
+     _vector_add_mul_complex(tmpbi2.sp_up.s0, a12, source_copy.sp_dn.s0);
+
+     _vector_mul_complex    (tmpbi2.sp_dn.s0, a21, source_copy.sp_up.s0);
+     _vector_add_mul_complex(tmpbi2.sp_dn.s0, a22, source_copy.sp_dn.s0);
+
+     _vector_mul_complex(    tmpbi2.sp_up.s1, a11, source_copy.sp_up.s1);
+     _vector_add_mul_complex(tmpbi2.sp_up.s1, a12, source_copy.sp_dn.s1);
+
+     _vector_mul_complex    (tmpbi2.sp_dn.s1, a21, source_copy.sp_up.s1);
+     _vector_add_mul_complex(tmpbi2.sp_dn.s1, a22, source_copy.sp_dn.s1);
+   }
+   else if  ( gamma5 == GAMMA_DN ){
+     _vector_mul_complex(    tmpbi2.sp_up.s2, a11, source_copy.sp_up.s2);
+     _vector_add_mul_complex(tmpbi2.sp_up.s2, a12, source_copy.sp_dn.s2);
+
+     _vector_mul_complex    (tmpbi2.sp_dn.s2, a21, source_copy.sp_up.s2);
+     _vector_add_mul_complex(tmpbi2.sp_dn.s2, a22, source_copy.sp_dn.s2);
+
+     _vector_mul_complex(    tmpbi2.sp_up.s3, a11, source_copy.sp_up.s3);
+     _vector_add_mul_complex(tmpbi2.sp_up.s3, a12, source_copy.sp_dn.s3);
+
+     _vector_mul_complex    (tmpbi2.sp_dn.s3, a21, source_copy.sp_up.s3);
+     _vector_add_mul_complex(tmpbi2.sp_dn.s3, a22, source_copy.sp_dn.s3);
+   }
+   else if ( gamma5 == NO_GAMMA ){
+     _spinor_mul_complex    (tmpbi2.sp_up,    a11, source_copy.sp_up);
+     _spinor_add_mul_complex(tmpbi2.sp_up,    a12, source_copy.sp_dn);
+
+     _spinor_mul_complex    (tmpbi2.sp_dn,    a21, source_copy.sp_up);
+     _spinor_add_mul_complex(tmpbi2.sp_dn,    a22, source_copy.sp_dn);
+   }
+
+   _spinor_assign(dest->sp_up, tmpbi2.sp_up);
+   _spinor_assign(dest->sp_dn, tmpbi2.sp_dn);
+
+}
+
+void phix_taui_anti_commutator_bispinor( bispinor *dest,int tauindex, int gamma5, int dagger,int ix ){
+
+   bispinor source_copy;
+   bispinor tmpbi2;
+   _Complex double a11=0.0, a12=0.0, a21=0.0, a22=0.0;
+
+   _spinor_assign(source_copy.sp_up, dest->sp_up);
+   _spinor_assign(source_copy.sp_dn, dest->sp_dn);
+
+   if (dagger == NO_DAGG){
+     if (tauindex == 2){
+       if (smearedcorrelator_BSM == 1){
+         a11=+2.*(g_smeared_scalar_field[0][ix]+I*g_smeared_scalar_field[3][ix]);
+         a12=0.;
+         a21=0.;
+         a22=-2.*(g_smeared_scalar_field[0][ix]-I*g_smeared_scalar_field[3][ix]);
+       }
+       else{
+         a11=2.*(g_scalar_field[0][ix]+I*g_scalar_field[3][ix]);
+         a12=0.;
+         a21=0.;
+         a22=-2*(g_scalar_field[0][ix]-I*g_scalar_field[3][ix]);
+       }
+     }
+     else if (tauindex == 1){
+       if (smearedcorrelator_BSM == 1){
+         a11=+2.*I*g_smeared_scalar_field[2][ix];
+         a12=-2.*I*g_smeared_scalar_field[0][ix];
+         a21=+2.*I*g_smeared_scalar_field[0][ix];
+         a22=+2.*I*g_smeared_scalar_field[2][ix];
+       }
+       else{
+         a11=+2.*I*g_scalar_field[2][ix];
+         a12=-2.*I*g_scalar_field[0][ix];
+         a21=+2.*I*g_scalar_field[0][ix];
+         a22=+2.*I*g_scalar_field[2][ix];
+       }
+     }
+     else if (tauindex == 0){
+       if (smearedcorrelator_BSM == 1){
+         a11=+2.*I*g_smeared_scalar_field[1][ix];
+         a12=+2.  *g_smeared_scalar_field[0][ix];
+         a21=+2.  *g_smeared_scalar_field[0][ix];
+         a22=+2.*I*g_smeared_scalar_field[1][ix];
+       }
+       else{
+         a11=+2.*I*g_scalar_field[1][ix];
+         a12=+2.  *g_scalar_field[0][ix];
+         a21=+2.  *g_scalar_field[0][ix];
+         a22=+2.*I*g_scalar_field[1][ix];
+       }
+     }
+     else {
+       if (g_cart_id == 0){
+         printf("Wrong Pauli matrix index\n");
+         exit(1);
+       }
+     }
+   }
+   else if (dagger==DAGGER){
+     if (tauindex == 2){
+       if (smearedcorrelator_BSM == 1){
+         a11=+2.*(g_smeared_scalar_field[0][ix]-I*g_smeared_scalar_field[3][ix]);
+         a12=0.;
+         a21=0.;
+         a22=-2.*(g_smeared_scalar_field[0][ix]+I*g_smeared_scalar_field[3][ix]);
+       }
+       else{
+         a11=2.*(g_scalar_field[0][ix]-I*g_scalar_field[3][ix]);
+         a12=0.;
+         a21=0.;
+         a22=-2*(g_scalar_field[0][ix]+I*g_scalar_field[3][ix]);
+       }
+     }
+     else if (tauindex == 1){
+       if (smearedcorrelator_BSM == 1){
+         a11=-2.*I*g_smeared_scalar_field[2][ix];
+         a12=-2.*I*g_smeared_scalar_field[0][ix];
+         a21=+2.*I*g_smeared_scalar_field[0][ix];
+         a22=-2.*I*g_smeared_scalar_field[2][ix];
+       }
+       else{
+         a11=-2.*I*g_scalar_field[2][ix];
+         a12=-2.*I*g_scalar_field[0][ix];
+         a21=+2.*I*g_scalar_field[0][ix];
+         a22=-2.*I*g_scalar_field[2][ix];
+       }
+     }
+     else if (tauindex == 0){
+       if (smearedcorrelator_BSM == 1){
+         a11=-2.*I*g_smeared_scalar_field[1][ix];
+         a12=+2.  *g_smeared_scalar_field[0][ix];
+         a21=+2.  *g_smeared_scalar_field[0][ix];
+         a22=-2.*I*g_smeared_scalar_field[1][ix];
+       }
+       else{
+         a11=-2.*I*g_scalar_field[1][ix];
+         a12=+2.  *g_scalar_field[0][ix];
+         a21=+2.  *g_scalar_field[0][ix];
+         a22=-2.*I*g_scalar_field[1][ix];
+       }
+     }
+     else {
+       if (g_cart_id == 0){
+         printf("Wrong Pauli matrix index\n");
+         exit(1);
+       }
+     }  
+   }
+   else{
+     if (g_cart_id == 0){
+       printf("Anticommutator phi tau has to be either dagger or not\n");
+       exit(1);
+     }
+   }
+
+   _spinor_null(tmpbi2.sp_up);
+   _spinor_null(tmpbi2.sp_dn);
+
+   if ( gamma5 == GAMMA_UP){
+     _vector_mul_complex(    tmpbi2.sp_up.s0, a11, source_copy.sp_up.s0);
+     _vector_add_mul_complex(tmpbi2.sp_up.s0, a12, source_copy.sp_dn.s0);
+
+     _vector_mul_complex    (tmpbi2.sp_dn.s0, a21, source_copy.sp_up.s0);
+     _vector_add_mul_complex(tmpbi2.sp_dn.s0, a22, source_copy.sp_dn.s0);
+
+     _vector_mul_complex(    tmpbi2.sp_up.s1, a11, source_copy.sp_up.s1);
+     _vector_add_mul_complex(tmpbi2.sp_up.s1, a12, source_copy.sp_dn.s1);
+
+     _vector_mul_complex    (tmpbi2.sp_dn.s1, a21, source_copy.sp_up.s1);
+     _vector_add_mul_complex(tmpbi2.sp_dn.s1, a22, source_copy.sp_dn.s1);
+   }
+   else if  ( gamma5 == GAMMA_DN ){
+     _vector_mul_complex(    tmpbi2.sp_up.s2, a11, source_copy.sp_up.s2);
+     _vector_add_mul_complex(tmpbi2.sp_up.s2, a12, source_copy.sp_dn.s2);
+
+     _vector_mul_complex    (tmpbi2.sp_dn.s2, a21, source_copy.sp_up.s2);
+     _vector_add_mul_complex(tmpbi2.sp_dn.s2, a22, source_copy.sp_dn.s2);
+
+     _vector_mul_complex(    tmpbi2.sp_up.s3, a11, source_copy.sp_up.s3);
+     _vector_add_mul_complex(tmpbi2.sp_up.s3, a12, source_copy.sp_dn.s3);
+
+     _vector_mul_complex    (tmpbi2.sp_dn.s3, a21, source_copy.sp_up.s3);
+     _vector_add_mul_complex(tmpbi2.sp_dn.s3, a22, source_copy.sp_dn.s3);
+   }
+   else if ( gamma5 == NO_GAMMA ){
+     _spinor_mul_complex    (tmpbi2.sp_up,    a11, source_copy.sp_up);
+     _spinor_add_mul_complex(tmpbi2.sp_up,    a12, source_copy.sp_dn);
+
+     _spinor_mul_complex    (tmpbi2.sp_dn,    a21, source_copy.sp_up);
+     _spinor_add_mul_complex(tmpbi2.sp_dn,    a22, source_copy.sp_dn);
+   }
+
+   _spinor_assign(dest->sp_up, tmpbi2.sp_up);
+   _spinor_assign(dest->sp_dn, tmpbi2.sp_dn);
+
+}
+
+
+void phi0_taui_anticommutator( _Complex double *dest, int tauindex, int dagger ){
+
+   _Complex double *source_copy;
+   _Complex double a11=0.0, a12=0.0, a21=0.0, a22=0.0;
+   int i;
+
+   source_copy=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
+   if (source_copy == NULL){
+     printf("Error in mem allcoation in phi0 tau3 commutator\n");
+     exit(1);
+   }
+   for (i=0; i<2*T_global; ++i)
+     source_copy[i]=dest[i];
+   if ( tauindex == 2){
+     if (dagger == NO_DAGG){
+       if (smearedcorrelator_BSM == 1){
+         a11=2.*(+1.*g_smeared_scalar_field[0][0]+1.*I*g_smeared_scalar_field[3][0]);
+         a12=0.;
+         a21=0.;
+         a22=2.*(-1.*g_smeared_scalar_field[0][0]+1.*I*g_smeared_scalar_field[3][0]);
+       }
+       else{
+         a11=2.*(+1.*g_scalar_field[0][0]+1.*I*g_scalar_field[3][0]);
+         a12=0.;
+         a21=0.;
+         a22=2.*(-1.*g_scalar_field[0][0]+1.*I*g_scalar_field[3][0]);
+       }
+     }
+     if (dagger == DAGGER){
+       if (smearedcorrelator_BSM == 1){
+         a11=2.*(+1.*g_smeared_scalar_field[0][0]-1.*I*g_smeared_scalar_field[3][0]);
+         a12=0.;
+         a21=0.;
+         a22=2.*(-1.*g_smeared_scalar_field[0][0]-1.*I*g_smeared_scalar_field[3][0]);
+       }
+       else{
+         a11=2.*(+1.*g_scalar_field[0][0]-1.*I*g_scalar_field[3][0]);
+         a12=0.;
+         a21=0.;
+         a22=2.*(-1.*g_scalar_field[0][0]-1.*I*g_scalar_field[3][0]);
+       } 
+
+     }
+   }
+   if ( tauindex == 1){
+     if (dagger == NO_DAGG){
+       if (smearedcorrelator_BSM == 1){
+         a11=+2.*I*g_smeared_scalar_field[2][0];
+         a12=-2.*I*g_smeared_scalar_field[0][0];
+         a21=+2.*I*g_smeared_scalar_field[0][0];
+         a22=+2.*I*g_smeared_scalar_field[2][0];
+       }
+       else{
+         a11=+2.*I*g_scalar_field[2][0];
+         a12=-2.*I*g_scalar_field[0][0];
+         a21=+2.*I*g_scalar_field[0][0];
+         a22=+2.*I*g_scalar_field[2][0];
+       }
+     }
+     if (dagger == DAGGER){
+       if (smearedcorrelator_BSM == 1){
+         a11=-2.*I*g_smeared_scalar_field[2][0];
+         a12=-2.*I*g_smeared_scalar_field[0][0];
+         a21=+2.*I*g_smeared_scalar_field[0][0];
+         a22=-2.*I*g_smeared_scalar_field[2][0];
+       }
+       else{
+         a11=-2.*I*g_scalar_field[2][0];
+         a12=-2.*I*g_scalar_field[0][0];
+         a21=+2.*I*g_scalar_field[0][0];
+         a22=-2.*I*g_scalar_field[2][0];
+       }
+
+     } 
+   }
+
+   if ( tauindex == 0){
+     if (dagger == NO_DAGG){
+       if (smearedcorrelator_BSM == 1){
+         a11=2.*I*g_smeared_scalar_field[1][0];
+         a12=2.*  g_smeared_scalar_field[0][0];
+         a21=2.*  g_smeared_scalar_field[0][0];
+         a22=2.*I*g_smeared_scalar_field[1][0];
+       }
+       else{
+         a11=2.*I*g_scalar_field[1][0];
+         a12=2.*  g_scalar_field[0][0];
+         a21=2.*  g_scalar_field[0][0];
+         a22=2.*I*g_scalar_field[1][0];
+       }
+     }
+     if (dagger == DAGGER){
+       if (smearedcorrelator_BSM == 1){
+         a11=-2.*I*g_smeared_scalar_field[1][0];
+         a12=2.*  g_smeared_scalar_field[0][0];
+         a21=2.*  g_smeared_scalar_field[0][0];
+         a22=-2.*I*g_smeared_scalar_field[1][0];
+       }
+       else{
+         a11=-2.*I*g_scalar_field[1][0];
+         a12=2.*  g_scalar_field[0][0];
+         a21=2.*  g_scalar_field[0][0];
+         a22=-2.*I*g_scalar_field[1][0];
+       }
+     }
+   }
+
+//   printf("a11=%e %e\n", creal(a11), cimag(a11));
+//   printf("a12=%e %e\n", creal(a12), cimag(a12));
+//   printf("a21=%e %e\n", creal(a21), cimag(a21));
+//   printf("a22=%e %e\n", creal(a22), cimag(a22));
+
+
+   for (i=0; i<T_global; ++i){
+     dest[2*i +0]= a11* source_copy[2*i + 0] + a12* source_copy[2*i + 1];
+     dest[2*i +1]= a21* source_copy[2*i + 0] + a22* source_copy[2*i + 1];
+   }
+   free(source_copy);
+}
+
+
+void taui_scalarfield_spinor_s0s0( bispinor *dest, bispinor *source, int gamma5, int idx, int direction, int dagger){
+
+  bispinor tmp;
+  bispinor tmpbi2;
+  _Complex double a11=0., a12=0., a21=0., a22=0.;
+
+  int scalarcoord;
+
+  _spinor_assign(tmp.sp_up, source->sp_up);
+  _spinor_assign(tmp.sp_dn, source->sp_dn);
+
+ if (direction == NODIR)
+   scalarcoord=idx;
+ else if (direction<4){
+   scalarcoord= g_iup[idx][direction];
+ }
+ else if (direction<8){
+   scalarcoord= g_idn[idx][7-direction];
+ }
+ else{
+   scalarcoord=0;
+   if (g_cart_id == 0) {printf("Wrong direction in tau scalar field spinor\n"); exit(1);}
+ }
+ if (dagger == DAGGER){
+   if (smearedcorrelator_BSM == 1){
+     a11=  +1.*g_smeared_scalar_field[0][scalarcoord] - I*g_smeared_scalar_field[3][scalarcoord];
+     a12=  -1.*g_smeared_scalar_field[2][scalarcoord] - I*g_smeared_scalar_field[1][scalarcoord];
+
+     a21=  +1.*g_smeared_scalar_field[2][scalarcoord] - I*g_smeared_scalar_field[1][scalarcoord];
+     a22=  +1.*g_smeared_scalar_field[0][scalarcoord] + I*g_smeared_scalar_field[3][scalarcoord];
+
+   }
+   else{
+     a11=  +1.*g_scalar_field[0][scalarcoord] - I*g_scalar_field[3][scalarcoord];
+     a12=  -1.*g_scalar_field[2][scalarcoord] - I*g_scalar_field[1][scalarcoord];
+
+     a21=  +1.*g_scalar_field[2][scalarcoord] - I*g_scalar_field[1][scalarcoord];
+     a22=  +1.*g_scalar_field[0][scalarcoord] + I*g_scalar_field[3][scalarcoord];
+   }
+ }
+ else if (dagger == NO_DAGG){
+   if (smearedcorrelator_BSM == 1){
+     a11=  +1.*g_smeared_scalar_field[0][scalarcoord] + I*g_smeared_scalar_field[3][scalarcoord];
+     a12=  +1.*g_smeared_scalar_field[2][scalarcoord] + I*g_smeared_scalar_field[1][scalarcoord];
+
+     a21=  -1.*g_smeared_scalar_field[2][scalarcoord] + I*g_smeared_scalar_field[1][scalarcoord];
+     a22=  +1.*g_smeared_scalar_field[0][scalarcoord] - I*g_smeared_scalar_field[3][scalarcoord];
+
+   }
+   else{
+     a11=  +1.*g_scalar_field[0][scalarcoord] + I*g_scalar_field[3][scalarcoord];
+     a12=  +1.*g_scalar_field[2][scalarcoord] + I*g_scalar_field[1][scalarcoord];
+
+     a21=  -1.*g_scalar_field[2][scalarcoord] + I*g_scalar_field[1][scalarcoord];
+     a22=  +1.*g_scalar_field[0][scalarcoord] - I*g_scalar_field[3][scalarcoord];
+   }
+ }
+ else {
+   fprintf(stdout, "The sixth argument must be either DAGGER or NO_DAGG\n");
+ }
+ _spinor_null(tmpbi2.sp_up);
+ _spinor_null(tmpbi2.sp_dn);
+
+ if ( gamma5 == GAMMA_UP){
+  _vector_mul_complex(    tmpbi2.sp_up.s0, a11, tmp.sp_up.s0);
+  _vector_add_mul_complex(tmpbi2.sp_up.s0, a12, tmp.sp_dn.s0);
+
+  _vector_mul_complex    (tmpbi2.sp_dn.s0, a21, tmp.sp_up.s0);
+  _vector_add_mul_complex(tmpbi2.sp_dn.s0, a22, tmp.sp_dn.s0);
+
+  _vector_mul_complex(    tmpbi2.sp_up.s1, a11, tmp.sp_up.s1);
+  _vector_add_mul_complex(tmpbi2.sp_up.s1, a12, tmp.sp_dn.s1);
+
+  _vector_mul_complex    (tmpbi2.sp_dn.s1, a21, tmp.sp_up.s1);
+  _vector_add_mul_complex(tmpbi2.sp_dn.s1, a22, tmp.sp_dn.s1);
+ }
+ else if  ( gamma5 == GAMMA_DN ){
+  _vector_mul_complex(    tmpbi2.sp_up.s2, a11, tmp.sp_up.s2);
+  _vector_add_mul_complex(tmpbi2.sp_up.s2, a12, tmp.sp_dn.s2);
+
+  _vector_mul_complex    (tmpbi2.sp_dn.s2, a21, tmp.sp_up.s2);
+  _vector_add_mul_complex(tmpbi2.sp_dn.s2, a22, tmp.sp_dn.s2);
+
+  _vector_mul_complex(    tmpbi2.sp_up.s3, a11, tmp.sp_up.s3);
+  _vector_add_mul_complex(tmpbi2.sp_up.s3, a12, tmp.sp_dn.s3);
+
+  _vector_mul_complex    (tmpbi2.sp_dn.s3, a21, tmp.sp_up.s3);
+  _vector_add_mul_complex(tmpbi2.sp_dn.s3, a22, tmp.sp_dn.s3);
+ }
+ else if ( gamma5 == NO_GAMMA ){
+  _spinor_mul_complex    (tmpbi2.sp_up,    a11, tmp.sp_up);
+  _spinor_add_mul_complex(tmpbi2.sp_up,    a12, tmp.sp_dn);
+
+  _spinor_mul_complex    (tmpbi2.sp_dn,    a21, tmp.sp_up);
+  _spinor_add_mul_complex(tmpbi2.sp_dn,    a22, tmp.sp_dn);
+ }
+
+ _spinor_assign(dest->sp_up, tmpbi2.sp_up);
+ _spinor_assign(dest->sp_dn, tmpbi2.sp_dn);
+
+}
+void taui_scalarfield_spinor( bispinor *dest, bispinor *source, int gamma5, int tauindex, int idx, int direction, int dagger){
+    
+  bispinor tmp;
+  bispinor tmpbi2;
+  _Complex double a11=0.0, a12=0.0, a21=0.0, a22=0.0;
+
+  int scalarcoord;
+
+  _spinor_assign(tmp.sp_up, source->sp_up);
+  _spinor_assign(tmp.sp_dn, source->sp_dn);
+
+ if (direction == NODIR)
+   scalarcoord=idx;
+ else if (direction == TUP ){
+   scalarcoord= g_iup[idx][TUP];
+ }
+ else if (direction == TDOWN){
+   scalarcoord= g_idn[idx][TUP];
+ }
+ else{
+   scalarcoord=0;
+   if (g_cart_id == 0) {printf("Wrong direction in tau scalar field spinor\n"); exit(1);}
+ }
+ if (dagger == DAGGER){
+  if (tauindex == 0){
+   if (smearedcorrelator_BSM  == 1){
+     a11=  -1.*g_smeared_scalar_field[2][scalarcoord] - I*g_smeared_scalar_field[1][scalarcoord];
+     a12=  +1.*g_smeared_scalar_field[0][scalarcoord] - I*g_smeared_scalar_field[3][scalarcoord];
+
+     a21=  +1.*g_smeared_scalar_field[0][scalarcoord] + I*g_smeared_scalar_field[3][scalarcoord];
+     a22=  +1.*g_smeared_scalar_field[2][scalarcoord] - I*g_smeared_scalar_field[1][scalarcoord];
+
+   }
+   else{
+     a11=  -1.*g_scalar_field[2][scalarcoord] - I*g_scalar_field[1][scalarcoord];
+     a12=  +1.*g_scalar_field[0][scalarcoord] - I*g_scalar_field[3][scalarcoord];
+
+     a21=  +1.*g_scalar_field[0][scalarcoord] + I*g_scalar_field[3][scalarcoord];
+     a22=  +1.*g_scalar_field[2][scalarcoord] - I*g_scalar_field[1][scalarcoord];
+   }
+  }
+  else  if (tauindex == 1){
+   if (smearedcorrelator_BSM  == 1){
+     a11=  +1.*g_smeared_scalar_field[1][scalarcoord] - I*g_smeared_scalar_field[2][scalarcoord];
+     a12=  -1.*g_smeared_scalar_field[3][scalarcoord] - I*g_smeared_scalar_field[0][scalarcoord];
+
+     a21=  -1.*g_smeared_scalar_field[3][scalarcoord] + I*g_smeared_scalar_field[0][scalarcoord];
+     a22=  -1.*g_smeared_scalar_field[1][scalarcoord] - I*g_smeared_scalar_field[2][scalarcoord];
+
+   }
+   else{
+     a11=  +1.*g_scalar_field[1][scalarcoord] - I*g_scalar_field[2][scalarcoord];
+     a12=  -1.*g_scalar_field[3][scalarcoord] - I*g_scalar_field[0][scalarcoord];
+
+     a21=  -1.*g_scalar_field[3][scalarcoord] + I*g_scalar_field[0][scalarcoord];
+     a22=  -1.*g_scalar_field[1][scalarcoord] - I*g_scalar_field[2][scalarcoord];
+   }
+  }
+  else  if (tauindex == 2){
+   if (smearedcorrelator_BSM  == 1){
+     a11=  +1.*g_smeared_scalar_field[0][scalarcoord] - I*g_smeared_scalar_field[3][scalarcoord];
+     a12=  +1.*g_smeared_scalar_field[2][scalarcoord] + I*g_smeared_scalar_field[1][scalarcoord];
+
+     a21=  +1.*g_smeared_scalar_field[2][scalarcoord] - I*g_smeared_scalar_field[1][scalarcoord];
+     a22=  -1.*g_smeared_scalar_field[0][scalarcoord] - I*g_smeared_scalar_field[3][scalarcoord];
+
+   }
+   else{
+     a11=  +1.*g_scalar_field[0][scalarcoord] - I*g_scalar_field[3][scalarcoord];
+     a12=  +1.*g_scalar_field[2][scalarcoord] + I*g_scalar_field[1][scalarcoord];
+
+     a21=  +1.*g_scalar_field[2][scalarcoord] - I*g_scalar_field[1][scalarcoord];
+     a22=  -1.*g_scalar_field[0][scalarcoord] - I*g_scalar_field[3][scalarcoord];
+   }
+  }
+ }
+ else if (dagger == NO_DAGG){
+  if (tauindex == 0){
+   if (smearedcorrelator_BSM  == 1){
+     a11=  -1.*g_smeared_scalar_field[2][scalarcoord] + I*g_smeared_scalar_field[1][scalarcoord];
+     a12=  +1.*g_smeared_scalar_field[0][scalarcoord] - I*g_smeared_scalar_field[3][scalarcoord];
+
+     a21=  +1.*g_smeared_scalar_field[0][scalarcoord] + I*g_smeared_scalar_field[3][scalarcoord];
+     a22=  +1.*g_smeared_scalar_field[2][scalarcoord] + I*g_smeared_scalar_field[1][scalarcoord];
+
+   }
+   else{
+     a11=  -1.*g_scalar_field[2][scalarcoord] + I*g_scalar_field[1][scalarcoord];
+     a12=  +1.*g_scalar_field[0][scalarcoord] - I*g_scalar_field[3][scalarcoord];
+
+     a21=  +1.*g_scalar_field[0][scalarcoord] + I*g_scalar_field[3][scalarcoord];
+     a22=  +1.*g_scalar_field[2][scalarcoord] + I*g_scalar_field[1][scalarcoord];
+   }
+  }
+  else if (tauindex == 1){
+   if (smearedcorrelator_BSM  == 1){
+     a11=  +1.*g_smeared_scalar_field[1][scalarcoord] + I*g_smeared_scalar_field[2][scalarcoord];
+     a12=  -1.*g_smeared_scalar_field[3][scalarcoord] - I*g_smeared_scalar_field[0][scalarcoord];
+
+     a21=  -1.*g_smeared_scalar_field[3][scalarcoord] + I*g_smeared_scalar_field[0][scalarcoord];
+     a22=  -1.*g_smeared_scalar_field[1][scalarcoord] + I*g_smeared_scalar_field[2][scalarcoord];
+   }
+   else{
+     a11=  +1.*g_scalar_field[1][scalarcoord] + I*g_scalar_field[2][scalarcoord];
+     a12=  -1.*g_scalar_field[3][scalarcoord] - I*g_scalar_field[0][scalarcoord];
+
+     a21=  -1.*g_scalar_field[3][scalarcoord] + I*g_scalar_field[0][scalarcoord];
+     a22=  -1.*g_scalar_field[1][scalarcoord] + I*g_scalar_field[2][scalarcoord];
+   }
+  }
+  else if (tauindex == 2){
+   if (smearedcorrelator_BSM  == 1){
+     a11=  +1.*g_smeared_scalar_field[0][scalarcoord] + I*g_smeared_scalar_field[3][scalarcoord];
+     a12=  +1.*g_smeared_scalar_field[2][scalarcoord] + I*g_smeared_scalar_field[1][scalarcoord];
+
+     a21=  +1.*g_smeared_scalar_field[2][scalarcoord] - I*g_smeared_scalar_field[1][scalarcoord];
+     a22=  -1.*g_smeared_scalar_field[0][scalarcoord] + I*g_smeared_scalar_field[3][scalarcoord];
+   }
+   else{
+     a11=  +1.*g_scalar_field[0][scalarcoord] + I*g_scalar_field[3][scalarcoord];
+     a12=  +1.*g_scalar_field[2][scalarcoord] + I*g_scalar_field[1][scalarcoord];
+
+     a21=  +1.*g_scalar_field[2][scalarcoord] - I*g_scalar_field[1][scalarcoord];
+     a22=  -1.*g_scalar_field[0][scalarcoord] + I*g_scalar_field[3][scalarcoord];
+   }
+  }
+ }
+ _spinor_null(tmpbi2.sp_up);
+ _spinor_null(tmpbi2.sp_dn);
+ 
+ if ( gamma5 == GAMMA_UP){
+  _vector_mul_complex(    tmpbi2.sp_up.s0, a11, tmp.sp_up.s0);
+  _vector_add_mul_complex(tmpbi2.sp_up.s0, a12, tmp.sp_dn.s0);
+
+  _vector_mul_complex    (tmpbi2.sp_dn.s0, a21, tmp.sp_up.s0);
+  _vector_add_mul_complex(tmpbi2.sp_dn.s0, a22, tmp.sp_dn.s0);
+
+  _vector_mul_complex(    tmpbi2.sp_up.s1, a11, tmp.sp_up.s1);
+  _vector_add_mul_complex(tmpbi2.sp_up.s1, a12, tmp.sp_dn.s1);
+
+  _vector_mul_complex    (tmpbi2.sp_dn.s1, a21, tmp.sp_up.s1);
+  _vector_add_mul_complex(tmpbi2.sp_dn.s1, a22, tmp.sp_dn.s1);
+ }
+ else if  ( gamma5 == GAMMA_DN ){
+  _vector_mul_complex(    tmpbi2.sp_up.s2, a11, tmp.sp_up.s2);
+  _vector_add_mul_complex(tmpbi2.sp_up.s2, a12, tmp.sp_dn.s2);
+
+  _vector_mul_complex    (tmpbi2.sp_dn.s2, a21, tmp.sp_up.s2);
+  _vector_add_mul_complex(tmpbi2.sp_dn.s2, a22, tmp.sp_dn.s2);
+
+  _vector_mul_complex(    tmpbi2.sp_up.s3, a11, tmp.sp_up.s3);
+  _vector_add_mul_complex(tmpbi2.sp_up.s3, a12, tmp.sp_dn.s3);
+
+  _vector_mul_complex    (tmpbi2.sp_dn.s3, a21, tmp.sp_up.s3);
+  _vector_add_mul_complex(tmpbi2.sp_dn.s3, a22, tmp.sp_dn.s3);
+ }
+ else if ( gamma5 == NO_GAMMA ){
+  _spinor_mul_complex    (tmpbi2.sp_up,    a11, tmp.sp_up);
+  _spinor_add_mul_complex(tmpbi2.sp_up,    a12, tmp.sp_dn);
+
+  _spinor_mul_complex    (tmpbi2.sp_dn,    a21, tmp.sp_up);
+  _spinor_add_mul_complex(tmpbi2.sp_dn,    a22, tmp.sp_dn);
+ }
+
+ _spinor_assign(dest->sp_up, tmpbi2.sp_up);
+ _spinor_assign(dest->sp_dn, tmpbi2.sp_dn);
+
+}
+// Accumulates one spinor component of src into dest.
+//
+// For every t in [0, T_global) and flavor f in {0, 1}:
+//   dest[2*t + f] += src[8*t + 4*f + spinorindex]
+// dest is added to, not overwritten, so the caller decides its initial value.
+void trace_in_spinor( _Complex double *dest, _Complex double *src, int spinorindex){
+   int t;
+   for (t=0; t<T_global; ++t){
+     const int first = 8*t + spinorindex;   // flavor 0; flavor 1 sits 4 entries later
+     dest[2*t    ] += src[first    ];
+     dest[2*t + 1] += src[first + 4];
+   }
+}
+void trace_in_color(_Complex double *dest, bispinor *src, int colorindex){
+   if      ( colorindex == 0 ){
+     dest[0]+= src->sp_up.s0.c0;
+     dest[1]+= src->sp_up.s1.c0;
+     dest[2]+= src->sp_up.s2.c0;
+     dest[3]+= src->sp_up.s3.c0;
+     dest[4]+= src->sp_dn.s0.c0;
+     dest[5]+= src->sp_dn.s1.c0;
+     dest[6]+= src->sp_dn.s2.c0;
+     dest[7]+= src->sp_dn.s3.c0;
+
+   }
+   else if ( colorindex == 1 ){
+     dest[0]+= src->sp_up.s0.c1;
+     dest[1]+= src->sp_up.s1.c1;
+     dest[2]+= src->sp_up.s2.c1;
+     dest[3]+= src->sp_up.s3.c1;
+     dest[4]+= src->sp_dn.s0.c1;
+     dest[5]+= src->sp_dn.s1.c1;
+     dest[6]+= src->sp_dn.s2.c1;
+     dest[7]+= src->sp_dn.s3.c1;
+   }
+   else if ( colorindex == 2 ){
+     dest[0]+= src->sp_up.s0.c2;
+     dest[1]+= src->sp_up.s1.c2;
+     dest[2]+= src->sp_up.s2.c2;
+     dest[3]+= src->sp_up.s3.c2;
+     dest[4]+= src->sp_dn.s0.c2;
+     dest[5]+= src->sp_dn.s1.c2;
+     dest[6]+= src->sp_dn.s2.c2;
+     dest[7]+= src->sp_dn.s3.c2;
+   }
+}
+// Adds the 8 entries of source onto the 8-entry slot of dest that starts at
+// offset g_coord[idx][TUP]*8.
+void trace_in_space(_Complex double *dest, _Complex double *source, int idx){
+     _Complex double *slot = &dest[g_coord[idx][TUP]*8];
+     int k = 0;
+     while (k < 8){
+       slot[k] += source[k];
+       ++k;
+     }
+}
+// Accumulates flavor component f1 of source into dest:
+//   dest[t] += source[2*t + f1]   for every t in [0, T_global).
+void trace_in_flavor(_Complex double *dest, _Complex double *source, int f1){
+     int t = 0;
+     while (t < T_global){
+        dest[t] += source[2*t + f1];
+        ++t;
+     }
+}
+#endif
diff --git a/contractions/contractions_helper.h b/contractions/contractions_helper.h
new file mode 100644
index 000000000..52a05624a
--- /dev/null
+++ b/contractions/contractions_helper.h
@@ -0,0 +1,48 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Ferenc Pittler
+ *         
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _CONTRACTIONS_HELPER_H /* include guard; this header has no #include of its own, so bispinor and su3 must already be declared by the including file */
+#define _CONTRACTIONS_HELPER_H
+
+_Complex double bispinor_scalar_product ( bispinor *s1, bispinor *s2 );
+void multiply_backward_propagator( bispinor *dest, bispinor **propagator, bispinor *source, int idx, int dir);
+void bispinor_mult_su3matrix( bispinor *dest, bispinor *source, su3 *a, int dagger);
+void bispinor_spindown_mult_su3matrix( bispinor *dest, bispinor *source, su3 *a, int dagger);
+void bispinor_spinup_mult_su3matrix( bispinor *dest, bispinor *source, su3 *a, int dagger);
+void bispinor_timesgamma0( bispinor *dest);
+void bispinor_timesgamma5( bispinor *dest);
+void bispinor_taui( bispinor *dest, int tauindex);
+void taui_scalarfield_flavoronly( _Complex double *dest, int tauindex, int dagger, int dir );
+void taui_scalarfield_flavoronly_s0s0( _Complex double *dest, int dagger );
+void mult_phi_flavoronly( _Complex double *dest, int dagg);
+void mult_taui_flavoronly( _Complex double *dest, int tauindex);
+void mult_phi( bispinor *dest, bispinor *source, int ix, int dagg);
+void taui_spinor( bispinor *dest, bispinor *source, int tauindex );
+void phi0_taui_commutator( _Complex double *dest,int tauindex );
+void phix_taui_commutator_bispinor( bispinor *dest,int tauindex, int gamma5, int ix );
+void phi0_taui_anticommutator( _Complex double *dest, int tauindex, int dagger );
+void taui_scalarfield_spinor_s0s0( bispinor *dest, bispinor *source, int gamma5, int idx, int direction, int dagger);
+void taui_scalarfield_spinor( bispinor *dest, bispinor *source, int gamma5, int tauindex, int idx, int direction, int dagger);
+void trace_in_spinor( _Complex double *dest, _Complex double *src, int spinorindex); /* dest[2*t+f] += src[8*t+4*f+spinorindex] for t < T_global, f in {0,1} */
+void trace_in_color(_Complex double *dest, bispinor *src, int colorindex); /* dest[0..7] += colour component colorindex of sp_up.s0..s3, sp_dn.s0..s3 */
+void trace_in_space(_Complex double *dest, _Complex double *source, int idx); /* dest[g_coord[idx][TUP]*8 + i] += source[i] for i < 8 */
+void trace_in_flavor(_Complex double *dest, _Complex double *source, int f1); /* dest[i] += source[2*i+f1] for i < T_global */
+void phix_taui_anti_commutator_bispinor( bispinor *dest,int tauindex, int gamma5, int dagger,int ix );
+#endif /* _CONTRACTIONS_HELPER_H */
diff --git a/contractions_BSM.c b/contractions_BSM.c
index 27850bdf5..5b4121ba6 100644
--- a/contractions_BSM.c
+++ b/contractions_BSM.c
@@ -1,6 +1,6 @@
 /***********************************************************************
  *
- * Copyright (C) 2009 Carsten Urbach
+ * Copyright (C) 2017 Ferenc Pittler
  *
  * This file is part of tmLQCD.
  *
@@ -17,9 +17,8 @@
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
-
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include"lime.h"
 #include <stdlib.h>
@@ -28,7 +27,7 @@
 #include <math.h>
 #include <errno.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include "global.h"
@@ -42,10 +41,13 @@
 #include "geometry_eo.h"
 #include "linalg/assign.h"
 #include "operator/D_psi.h"
+#ifdef TM_USE_BSM
+#include "operator/D_psi_BSM3.h"
 #include "operator/D_psi_BSM.h"
 #include "operator/D_psi_BSM2b.h"
 #include "operator/D_psi_BSM2f.h"
 #include "operator/D_psi_BSM2m.h"
+#endif
 #include "operator/Dov_psi.h"
 #include "operator/tm_operators_nd.h"
 #include "operator/Hopping_Matrix.h"
@@ -53,42 +55,68 @@
 #include "invert_doublet_eo.h"
 #include "invert_overlap.h"
 #include "invert_clover_eo.h"
+#ifdef TM_USE_BSM
 #include "init/init_scalar_field.h"
 #include "init/init_bsm_2hop_lookup.h"
+#endif
 #include "boundary.h"
 #include "start.h"
 #include "solver/solver.h"
+#include "xchange/xchange_gauge.h"
+#include "prepare_source.h"
 #include <io/params.h>
 #include <io/gauge.h>
 #include <io/spinor.h>
 #include <io/utils.h>
+#ifdef TM_USE_BSM
+#include "io/scalar.h"
+#endif
+#include "buffers/utils_nonblocking.h"
+#include "buffers/utils_nogauge.h"
 #include "test/overlaptests.h"
 #include "solver/index_jd.h"
 #include "operator/clovertm_operators.h"
 #include "operator/clover_leaf.h"
 #include "operator.h"
 #include "gettime.h"
+#include "measure_gauge_action.h"
+#include "mpi_init.h"
+#include "init/init_geometry_indices.h"
+#include "init/init_openmp.h"
+#include "init/init_gauge_field.h"
+#include "init/init_spinor_field.h"
+#include "init/init_bispinor_field.h"
+#include "contractions/contractions_checks.h"
+#include "contractions/contractions_FP.h"
+#include "solver/solver_field.h"
+#include "source_generation.h"
+#include "ranlxd.h"
 
-#define DAGGER 1
-#define NO_DAGG 0 
+int DAGGER; /* formerly "#define DAGGER 1"; now a file-scope object that main() sets to 1 in the TM_USE_BSM build. NOTE(review): presumably made an object so other translation units can refer to it - confirm */
+int NO_DAGG; /* formerly "#define NO_DAGG 0"; main() sets it to 0 in the TM_USE_BSM build */
 
-#define GAMMA_UP 1
-#define GAMMA_DN -1
-#define NO_GAMMA 0
+int GAMMA_UP; /* replaces the macro of the same name; main() sets it to 1 in the TM_USE_BSM build */
+int GAMMA_DN; /* replaces the macro of the same name; main() sets it to -1 in the TM_USE_BSM build */
+int NO_GAMMA; /* replaces the macro of the same name; main() sets it to 0 in the TM_USE_BSM build */
 
-#define WITH_SCALAR 1
-#define NO_SCALAR 0
+int WITH_SCALAR; /* replaces the macro of the same name; main() sets it to 1 in the TM_USE_BSM build */
+int NO_SCALAR; /* replaces the macro of the same name; main() sets it to 0 in the TM_USE_BSM build */
 
-#define TYPE_A 1
-#define TYPE_B 0
+int TYPE_A; /* replaces the macro of the same name; main() sets it to 1 in the TM_USE_BSM build */
+int TYPE_B; /* main() sets it to 0 */
+
+int TYPE_1; /* main() sets it to 1; note the numbering is not sequential */
+int TYPE_2; /* main() sets it to 0 */
+int TYPE_3; /* main() sets it to 2 */
+int TYPE_4; /* main() sets it to 3 */
+
+int TYPE_I; /* main() sets it to 1 */
+int TYPE_II; /* main() sets it to 0 */
+
+int RIGHT; /* new in this change (no earlier macro); main() sets it to 1 */
+int LEFT; /* new in this change (no earlier macro); main() sets it to 0 */
 
-#define TYPE_1 1
-#define TYPE_2 0
-#define TYPE_3 2
-#define TYPE_4 3
 
-#define TYPE_I 1
-#define TYPE_II 0
 static void usage()
 {
   fprintf(stdout, "Options: [-f input-filename]\n");
@@ -141,1746 +169,1889 @@ extern int nstore;
 int check_geometry();
 static void set_default_filenames(char ** input_filename, char ** filename);
 static void process_args(int argc, char *argv[], char ** input_filename, char ** filename);
-/* indexing of propfields;
-   
-   propagator for  (dagger or nondagger source)
-              for  flavor component f
-              for  color  component c    
-              for  spinor component s
-   is the following bispinor array of size VOLUME(PLUSRAND)
-
-   propfields[12*s + 4*c + 2*f + dagg ? 1: 0]  
-     
- */
-/**************************
-Multiplication with the backward propagator
-
-S == matrix element of D^-1 between the following states
-
-S( ytilde , x+-dir )       psi   x
-   flavor2, flavor1    x         flavor1
-   spinor2, spinor1              spinor1
-   color 2, color 1              color1
-
-=
-Stilde* (x+-dir , ytilde)      psi   x
-         flavor1, flavor2  x         flavor1
-         spinor1, spinor2            spinor1  
-         color 1, color 2            color1
-where Stilde is the matrix element of D^dagger^-1 between 
-the correspondig states
-
-**************************/
-_Complex double bispinor_scalar_product ( bispinor *s1, bispinor *s2 ){
-   _Complex double res=0.0;
-   res   =s2->sp_up.s0.c0 * conj(s1->sp_up.s0.c0) + s2->sp_up.s0.c1 * conj(s1->sp_up.s0.c1) + s2->sp_up.s0.c2 * conj(s1->sp_up.s0.c2) +
-          s2->sp_up.s1.c0 * conj(s1->sp_up.s1.c0) + s2->sp_up.s1.c1 * conj(s1->sp_up.s1.c1) + s2->sp_up.s1.c2 * conj(s1->sp_up.s1.c2) +
-          s2->sp_up.s2.c0 * conj(s1->sp_up.s2.c0) + s2->sp_up.s2.c1 * conj(s1->sp_up.s2.c1) + s2->sp_up.s2.c2 * conj(s1->sp_up.s2.c2) +
-          s2->sp_up.s3.c0 * conj(s1->sp_up.s3.c0) + s2->sp_up.s3.c1 * conj(s1->sp_up.s3.c1) + s2->sp_up.s3.c2 * conj(s1->sp_up.s3.c2) +
-          s2->sp_dn.s0.c0 * conj(s1->sp_dn.s0.c0) + s2->sp_dn.s0.c1 * conj(s1->sp_dn.s0.c1) + s2->sp_dn.s0.c2 * conj(s1->sp_dn.s0.c2) +
-          s2->sp_dn.s1.c0 * conj(s1->sp_dn.s1.c0) + s2->sp_dn.s1.c1 * conj(s1->sp_dn.s1.c1) + s2->sp_dn.s1.c2 * conj(s1->sp_dn.s1.c2) +
-          s2->sp_dn.s2.c0 * conj(s1->sp_dn.s2.c0) + s2->sp_dn.s2.c1 * conj(s1->sp_dn.s2.c1) + s2->sp_dn.s2.c2 * conj(s1->sp_dn.s2.c2) +
-          s2->sp_dn.s3.c0 * conj(s1->sp_dn.s3.c0) + s2->sp_dn.s3.c1 * conj(s1->sp_dn.s3.c1) + s2->sp_dn.s3.c2 * conj(s1->sp_dn.s3.c2);
-   return res;
-}
-void multiply_backward_propagator( bispinor *dest, bispinor **propagator, bispinor *source, int idx, int dir){
-   int propcoord;
-   bispinor source_copy;
-   if (dir == NODIR){
-      propcoord=idx;
-   }
-   else if (dir == TUP){
-      propcoord=g_iup[idx][TUP];
-   }
-   else if (dir == TDOWN){
-      propcoord=g_idn[idx][TUP];
-   }
-   _spinor_assign( source_copy.sp_dn, source->sp_dn);
-   _spinor_assign( source_copy.sp_up, source->sp_up);
-
-   dest->sp_up.s0.c0= bispinor_scalar_product ( &propagator[ 1][propcoord], &source_copy );
-   dest->sp_up.s0.c1= bispinor_scalar_product ( &propagator[ 5][propcoord], &source_copy );
-   dest->sp_up.s0.c2= bispinor_scalar_product ( &propagator[ 9][propcoord], &source_copy );
-
-   dest->sp_up.s1.c0= bispinor_scalar_product ( &propagator[13][propcoord], &source_copy );
-   dest->sp_up.s1.c1= bispinor_scalar_product ( &propagator[17][propcoord], &source_copy );
-   dest->sp_up.s1.c2= bispinor_scalar_product ( &propagator[21][propcoord], &source_copy );
-
-   dest->sp_up.s2.c0= bispinor_scalar_product ( &propagator[25][propcoord], &source_copy );
-   dest->sp_up.s2.c1= bispinor_scalar_product ( &propagator[29][propcoord], &source_copy );
-   dest->sp_up.s2.c2= bispinor_scalar_product ( &propagator[33][propcoord], &source_copy );
-
-   dest->sp_up.s3.c0= bispinor_scalar_product ( &propagator[37][propcoord], &source_copy );
-   dest->sp_up.s3.c1= bispinor_scalar_product ( &propagator[41][propcoord], &source_copy );
-   dest->sp_up.s3.c2= bispinor_scalar_product ( &propagator[45][propcoord], &source_copy );
-
-   dest->sp_dn.s0.c0= bispinor_scalar_product ( &propagator[ 3][propcoord], &source_copy );
-   dest->sp_dn.s0.c1= bispinor_scalar_product ( &propagator[ 7][propcoord], &source_copy );
-   dest->sp_dn.s0.c2= bispinor_scalar_product ( &propagator[11][propcoord], &source_copy );
-
-   dest->sp_dn.s1.c0= bispinor_scalar_product ( &propagator[15][propcoord], &source_copy );
-   dest->sp_dn.s1.c1= bispinor_scalar_product ( &propagator[19][propcoord], &source_copy );
-   dest->sp_dn.s1.c2= bispinor_scalar_product ( &propagator[23][propcoord], &source_copy );
-
-   dest->sp_dn.s2.c0= bispinor_scalar_product ( &propagator[27][propcoord], &source_copy );
-   dest->sp_dn.s2.c1= bispinor_scalar_product ( &propagator[31][propcoord], &source_copy );
-   dest->sp_dn.s2.c2= bispinor_scalar_product ( &propagator[35][propcoord], &source_copy );
-
-   dest->sp_dn.s3.c0= bispinor_scalar_product ( &propagator[39][propcoord], &source_copy );
-   dest->sp_dn.s3.c1= bispinor_scalar_product ( &propagator[43][propcoord], &source_copy );
-   dest->sp_dn.s3.c2= bispinor_scalar_product ( &propagator[47][propcoord], &source_copy );
+#ifndef TM_USE_BSM
+int main(int argc, char *argv[]){ /* Stub entry point, compiled only when TM_USE_BSM is not defined; argc and argv are unused. */
+  printf("Works only with BSM operators switched on \n"); /* only action of the stub; control then falls off the end of main, which yields exit status 0 */
 }
-//dest used as a source, an output it is overwritten
-void taui_scalarfield_flavoronly( _Complex double *dest, int tauindex, int dagger ){
-   _Complex double *source_copy;
-   _Complex double a11, a12, a21, a22;
-   int i;
-  
-   source_copy=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
-   for (i=0; i<2*T_global; ++i)
-     source_copy[i]=dest[i];
-   if (dagger == DAGGER){
-     if (tauindex == 0){
-       a11=  -1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
-       a12=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
-
-       a21=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
-       a22=  +1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
-     }
-     else  if (tauindex == 1){
-       a11=  +1.*g_scalar_field[1][0] - I*g_scalar_field[2][0];
-       a12=  -1.*g_scalar_field[3][0] - I*g_scalar_field[0][0];
-
-       a21=  -1.*g_scalar_field[3][0] + I*g_scalar_field[0][0];
-       a22=  -1.*g_scalar_field[1][0] - I*g_scalar_field[2][0];
-
-     }
-     else  if (tauindex == 2){
-       a11=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
-       a12=  +1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
-
-       a21=  +1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
-       a22=  -1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
-     }
-   }
-   else if (dagger == NO_DAGG){
-     if (tauindex == 0){
-      a11=  -1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
-      a12=  +1.*g_scalar_field[0][0] - I*g_scalar_field[3][0];
-
-      a21=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
-      a22=  +1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
-     }
-     if (tauindex == 1){
-      a11=  +1.*g_scalar_field[1][0] + I*g_scalar_field[2][0];
-      a12=  -1.*g_scalar_field[3][0] - I*g_scalar_field[0][0];
-
-      a21=  -1.*g_scalar_field[3][0] + I*g_scalar_field[0][0];
-      a22=  -1.*g_scalar_field[1][0] + I*g_scalar_field[2][0];
-     }
-     if (tauindex == 2){
-      a11=  +1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
-      a12=  +1.*g_scalar_field[2][0] + I*g_scalar_field[1][0];
-
-      a21=  +1.*g_scalar_field[2][0] - I*g_scalar_field[1][0];
-      a22=  -1.*g_scalar_field[0][0] + I*g_scalar_field[3][0];
-     }
-   }
-   for (i=0; i<T_global; ++i){
-     dest[2*i +0]= a11* source_copy[2*i + 0] + a12* source_copy[2*i + 1];
-     dest[2*i +1]= a21* source_copy[2*i + 0] + a22* source_copy[2*i + 1];
-   }
-   free(source_copy);  
-}
-void taui_spinor( bispinor *dest, bispinor *source, int tauindex){
+#else
+int main(int argc, char *argv[]){
+  FILE *parameterfile = NULL;
+  FILE *out=NULL;
+  char datafilename[206];
+  char parameterfilename[206];
+  char conf_filename[50];
+  char scalar_filename[50];
+  char * input_filename = NULL;
+  char * filename = NULL;
+  double plaquette_energy;
+  int i,j,isample=0,op_id=0;
+  char prop_fname[200];
+  char contractions_fname[200];
+  int src_idx, pos;
+//  int count;
+  int status_geo;
+  int ix;
+  _Complex double *current,*pseudoscalar,*scalar,*temp;
+  _Complex double *current1,*current2,*current3;
+  _Complex double *pscalar1,*pscalar2,*pscalar3;
+  _Complex double *scalar1, *scalar2, *scalar3 ;
+#if defined TM_USE_MPI
 
-   su3_vector tmp2;
-   bispinor tmp;
-   _spinor_assign(tmp.sp_up, source->sp_up);
-   _spinor_assign(tmp.sp_dn, source->sp_dn);
+  MPI_Init(&argc, &argv);
 
-   if (tauindex == 0 ){
-    _vector_assign(tmp2        , tmp.sp_up.s0);
-    _vector_assign(tmp.sp_up.s0, tmp.sp_dn.s0);
-    _vector_assign(tmp.sp_dn.s0, tmp2);
+  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
+#endif
+  process_args(argc, argv, &input_filename,&filename);
+  set_default_filenames(&input_filename, &filename);
+//Setting default constants
 
-    _vector_assign(tmp2        , tmp.sp_up.s1);
-    _vector_assign(tmp.sp_up.s1, tmp.sp_dn.s1);
-    _vector_assign(tmp.sp_dn.s1, tmp2);
- 
-    _spinor_assign(dest->sp_up, tmp.sp_up);
-    _spinor_assign(dest->sp_dn, tmp.sp_dn);
-   }
-   else if (tauindex == 1 ){
-    _vector_assign(tmp2             ,tmp.sp_up.s0);
-    _vector_i_mul( tmp.sp_up.s0, -1 ,tmp.sp_dn.s0);
-    _vector_i_mul( tmp.sp_dn.s0, +1, tmp2);
-
-    _vector_assign(tmp2             ,tmp.sp_up.s1);
-    _vector_i_mul( tmp.sp_up.s1, -1 ,tmp.sp_dn.s1);
-    _vector_i_mul( tmp.sp_dn.s1, +1, tmp2);   
-     
-    _spinor_assign(dest->sp_up, tmp.sp_up);
-    _spinor_assign(dest->sp_dn, tmp.sp_dn);
-   }
-   else if (tauindex == 2 ){
-    _vector_mul(tmp.sp_dn.s0, -1, tmp.sp_dn.s0);
-    _vector_mul(tmp.sp_dn.s1, -1, tmp.sp_dn.s1);
-     
-    _spinor_assign(dest->sp_up, tmp.sp_up);
-    _spinor_assign(dest->sp_dn, tmp.sp_dn);
-   }
-}
-void taui_scalarfield_spinor( bispinor *dest, bispinor *source, int gamma5, int tauindex, int idx, int direction, int dagger){
-    
-  su3_vector tmp2;
-  bispinor tmp;
-  bispinor tmpbi2;
-  spinor tmp1;
-  _spinor_assign(tmp.sp_up, source->sp_up);
-  _spinor_assign(tmp.sp_dn, source->sp_dn);
-  _Complex double a11, a12, a21, a22;
-
- int scalarcoord;
- if (direction == NODIR)
-   scalarcoord=idx;
- else if (direction<4){
-   scalarcoord= g_iup[idx][direction];
- }
- else if (direction<8){
-   scalarcoord= g_idn[idx][7-direction];
- }
- if (dagger == DAGGER){
-  if (tauindex == 0){
-   a11=  -1.*g_scalar_field[2][scalarcoord] - I*g_scalar_field[1][scalarcoord];
-   a12=  +1.*g_scalar_field[0][scalarcoord] - I*g_scalar_field[3][scalarcoord];
-
-   a21=  +1.*g_scalar_field[0][scalarcoord] + I*g_scalar_field[3][scalarcoord];
-   a22=  +1.*g_scalar_field[2][scalarcoord] - I*g_scalar_field[1][scalarcoord];
-  }
-  else  if (tauindex == 1){
-   a11=  +1.*g_scalar_field[1][scalarcoord] - I*g_scalar_field[2][scalarcoord];
-   a12=  -1.*g_scalar_field[3][scalarcoord] - I*g_scalar_field[0][scalarcoord];
+  DAGGER=1;
+  NO_DAGG=0; 
 
-   a21=  -1.*g_scalar_field[3][scalarcoord] + I*g_scalar_field[0][scalarcoord];
-   a22=  -1.*g_scalar_field[1][scalarcoord] - I*g_scalar_field[2][scalarcoord];
-  }
-  else  if (tauindex == 2){
-   a11=  +1.*g_scalar_field[0][scalarcoord] - I*g_scalar_field[3][scalarcoord];
-   a12=  +1.*g_scalar_field[2][scalarcoord] + I*g_scalar_field[1][scalarcoord];
+  GAMMA_UP=1;
+  GAMMA_DN=-1;
+  NO_GAMMA=0;
+
+  WITH_SCALAR=1;
+  NO_SCALAR=0;
+
+  TYPE_A=1;
+  TYPE_B=0;
+
+  TYPE_1=1;
+  TYPE_2=0;
+  TYPE_3=2;
+  TYPE_4=3;
 
-   a21=  +1.*g_scalar_field[2][scalarcoord] - I*g_scalar_field[1][scalarcoord];
-   a22=  -1.*g_scalar_field[0][scalarcoord] - I*g_scalar_field[3][scalarcoord];
+  TYPE_I=1;
+  TYPE_II=0;
+  
+  RIGHT=1;
+  LEFT=0;
+
+  /* Read the input file */
+  if ( (i = read_input(input_filename)) != 0)
+  {
+      fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename);
+      exit(-1);
   }
- }
- else if (dagger == NO_DAGG){
-  if (tauindex == 0){
-   a11=  -1.*g_scalar_field[2][scalarcoord] + I*g_scalar_field[1][scalarcoord];
-   a12=  +1.*g_scalar_field[0][scalarcoord] - I*g_scalar_field[3][scalarcoord];
-
-   a21=  +1.*g_scalar_field[0][scalarcoord] + I*g_scalar_field[3][scalarcoord];
-   a22=  +1.*g_scalar_field[2][scalarcoord] + I*g_scalar_field[1][scalarcoord];
+
+  if(g_proc_id==0)
+  {
+      fprintf(stdout, "#parameter  rho_BSM set to %f\n",  rho_BSM);
+      fprintf(stdout, "#parameter  eta_BSM set to %f\n",  eta_BSM);
+      fprintf(stdout, "#parameter   m0_BSM set to %f\n",   m0_BSM);
+      fprintf(stdout, "#parameter mu03_BSM set to %f\n", mu03_BSM);
+      fprintf(stdout, "#parameter mu01_BSM set to %f\n", mu01_BSM);
   }
-  if (tauindex == 1){
-   a11=  +1.*g_scalar_field[1][scalarcoord] + I*g_scalar_field[2][scalarcoord];
-   a12=  -1.*g_scalar_field[3][scalarcoord] - I*g_scalar_field[0][scalarcoord];
 
-   a21=  -1.*g_scalar_field[3][scalarcoord] + I*g_scalar_field[0][scalarcoord];
-   a22=  -1.*g_scalar_field[1][scalarcoord] + I*g_scalar_field[2][scalarcoord];
+#ifdef TM_USE_OMP
+  init_openmp();
+#endif
+  tmlqcd_mpi_init(argc, argv);
+
+  if(g_proc_id == 0)
+  {
+      fprintf(stdout,"# The number of processes is %d \n",g_nproc);
+      fprintf(stdout,"# The lattice size is %d x %d x %d x %d\n",
+         (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ));
+      fprintf(stdout,"# The local lattice size is %d x %d x %d x %d\n",
+        (int)(T), (int)(LX), (int)(LY),(int) LZ);
+      fflush(stdout);
   }
-  if (tauindex == 2){
-   a11=  +1.*g_scalar_field[0][scalarcoord] + I*g_scalar_field[3][scalarcoord];
-   a12=  +1.*g_scalar_field[2][scalarcoord] + I*g_scalar_field[1][scalarcoord];
 
-   a21=  +1.*g_scalar_field[2][scalarcoord] - I*g_scalar_field[1][scalarcoord];
-   a22=  -1.*g_scalar_field[0][scalarcoord] + I*g_scalar_field[3][scalarcoord];
+
+  g_dbw2rand = 0;
+
+  /* starts the single and double precision random number */
+  /* generator                                            */
+  start_ranlux(rlxd_level, random_seed);
+
+
+#ifdef _GAUGE_COPY
+  j = init_gauge_field(VOLUMEPLUSRAND, 1);
+#else
+  j = init_gauge_field(VOLUMEPLUSRAND, 0);
+#endif
+
+  if (j != 0)
+  {
+      fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
+      exit(-1);
   }
- }
- _spinor_null(tmpbi2.sp_up);
- _spinor_null(tmpbi2.sp_dn);
- 
- if ( gamma5 == GAMMA_UP){
-  _vector_mul_complex(    tmpbi2.sp_up.s0, a11, tmp.sp_up.s0);
-  _vector_add_mul_complex(tmpbi2.sp_up.s0, a12, tmp.sp_dn.s0);
 
-  _vector_mul_complex    (tmpbi2.sp_dn.s0, a21, tmp.sp_up.s0);
-  _vector_add_mul_complex(tmpbi2.sp_dn.s0, a22, tmp.sp_dn.s0);
+  init_geometry_indices(VOLUMEPLUSRAND);
 
-  _vector_mul_complex(    tmpbi2.sp_up.s1, a11, tmp.sp_up.s1);
-  _vector_add_mul_complex(tmpbi2.sp_up.s1, a12, tmp.sp_dn.s1);
+/* Initialising the spinor fields */
+#if (defined SSE || defined SSE2 || SSE3)
+  signal(SIGILL, &catch_ill_inst);
+#endif
 
-  _vector_mul_complex    (tmpbi2.sp_dn.s1, a21, tmp.sp_up.s1);
-  _vector_add_mul_complex(tmpbi2.sp_dn.s1, a22, tmp.sp_dn.s1);
- }
- else if  ( gamma5 == GAMMA_DN ){
-  _vector_mul_complex(    tmpbi2.sp_up.s2, a11, tmp.sp_up.s2);
-  _vector_add_mul_complex(tmpbi2.sp_up.s2, a12, tmp.sp_dn.s2);
+  DUM_DERI = 8;
+  DUM_MATRIX = DUM_DERI + 5;
+#if ((defined BGL && defined XLC) || defined _USE_TSPLITPAR)
+  NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
+#else
+  NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
+#endif
+  for(j = 0; j < no_operators; j++) if(!operator_list[j].even_odd_flag) even_odd_flag = 0;
 
-  _vector_mul_complex    (tmpbi2.sp_dn.s2, a21, tmp.sp_up.s2);
-  _vector_add_mul_complex(tmpbi2.sp_dn.s2, a22, tmp.sp_dn.s2);
+#ifndef TM_USE_MPI
+  g_dbw2rand = 0;
+#endif
 
-  _vector_mul_complex(    tmpbi2.sp_up.s3, a11, tmp.sp_up.s3);
-  _vector_add_mul_complex(tmpbi2.sp_up.s3, a12, tmp.sp_dn.s3);
+  if (even_odd_flag)
+  {
+      j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS);
+  }
+  else
+  {
+      j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
+  }
+  if (j != 0)
+  {
+      fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
+      exit(-1);
+  }
+  j = init_bispinor_field(VOLUMEPLUSRAND, 4);
+  if ( j!= 0)
+  {
+      fprintf(stderr, "Not enough memory for bispinor fields! Aborting...\n");
+      exit(0);
+  }
 
-  _vector_mul_complex    (tmpbi2.sp_dn.s3, a21, tmp.sp_up.s3);
-  _vector_add_mul_complex(tmpbi2.sp_dn.s3, a22, tmp.sp_dn.s3);
- }
- else if ( gamma5 == NO_GAMMA ){
-  _spinor_mul_complex    (tmpbi2.sp_up,    a11, tmp.sp_up);
-  _spinor_add_mul_complex(tmpbi2.sp_up,    a12, tmp.sp_dn);
+  int numbScalarFields = 4;
+  j = init_scalar_field(VOLUMEPLUSRAND, numbScalarFields);
+  if ( j!= 0)
+  {
+      fprintf(stderr, "Not enough memory for scalar fields! Aborting...\n");
+      exit(0);
+  }
 
-  _spinor_mul_complex    (tmpbi2.sp_dn,    a21, tmp.sp_up);
-  _spinor_add_mul_complex(tmpbi2.sp_dn,    a22, tmp.sp_dn);
- }
+  spinor ** temp_field = NULL;
+  init_solver_field(&temp_field, VOLUMEPLUSRAND, 2);
 
- _spinor_assign(dest->sp_up, tmpbi2.sp_up);
- _spinor_assign(dest->sp_dn, tmpbi2.sp_dn);
 
-}
-void trace_in_spinor( _Complex double *dest, _Complex double *src, int spinorindex){
-   int tind, find;
-   for (tind=0; tind<T_global; ++tind)
-     for (find=0; find<2; ++find){ 
-       dest[2*tind+find]+=src[8*tind+4*find+spinorindex];
-     }
-}
-void trace_in_color(_Complex double *dest, bispinor *src, int colorindex){
-   if      ( colorindex == 0 ){
-     dest[0]+= src->sp_up.s0.c0;
-     dest[1]+= src->sp_up.s1.c0;
-     dest[2]+= src->sp_up.s2.c0;
-     dest[3]+= src->sp_up.s3.c0;
-     dest[4]+= src->sp_dn.s0.c0;
-     dest[5]+= src->sp_dn.s1.c0;
-     dest[6]+= src->sp_dn.s2.c0;
-     dest[7]+= src->sp_dn.s3.c0;
-
-   }
-   else if ( colorindex == 1 ){
-     dest[0]+= src->sp_up.s0.c1;
-     dest[1]+= src->sp_up.s1.c1;
-     dest[2]+= src->sp_up.s2.c1;
-     dest[3]+= src->sp_up.s3.c1;
-     dest[4]+= src->sp_dn.s0.c1;
-     dest[5]+= src->sp_dn.s1.c1;
-     dest[6]+= src->sp_dn.s2.c1;
-     dest[7]+= src->sp_dn.s3.c1;
-   }
-   else if ( colorindex == 2 ){
-     dest[0]+= src->sp_up.s0.c2;
-     dest[1]+= src->sp_up.s1.c2;
-     dest[2]+= src->sp_up.s2.c2;
-     dest[3]+= src->sp_up.s3.c2;
-     dest[4]+= src->sp_dn.s0.c2;
-     dest[5]+= src->sp_dn.s1.c2;
-     dest[6]+= src->sp_dn.s2.c2;
-     dest[7]+= src->sp_dn.s3.c2;
-   }
-}
-void trace_in_space(_Complex double *dest, _Complex double *source, int idx){
-     int i;
-     for (i=0; i<8;++i){
-       dest[g_coord[idx][TUP]*8+i]+= source[i];
-     }
-}
-void trace_in_flavor(_Complex double *dest, _Complex double *source, int f1){
-     int i;
-     for (i=0; i<T_global; ++i){
-        dest[i]+= source[2*i+f1];
-     }
-}
+  /* define the geometry */
 
-void density_density_1234( bispinor ** propfields, int type_1234 ){
-   int ix,i;
-   int f1,c1,s1,tauindex;
-   int spinorstart, spinorend;
-   su3 * restrict upm;
-   bispinor running;
-
-   _Complex double *colortrace;
-   _Complex double *spacetrace;
-   _Complex double *spinortrace;
-   _Complex double *flavortrace;
-   _Complex double *paulitrace;
-
-   colortrace=(_Complex double *)malloc(sizeof(_Complex double) *8);
-   spacetrace=(_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
-   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
-   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
-   paulitrace= (_Complex double *)malloc(sizeof(_Complex double)*T_global);
-
-   if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_3 )) {
-     spinorstart=0;
-     spinorend  =2;
-   }
-   else if ( (type_1234 == TYPE_2) || (type_1234 == TYPE_4) ){
-     spinorstart=2;
-     spinorend  =4;
-   }
-
-//Trace over the Pauli matrices
-   for (i=0; i<T_global; ++i)
-      paulitrace[i]=0.;
-
-   for (tauindex=0; tauindex<3; ++tauindex){
-
-//Trace over up and down flavors
-      for (i=0; i<T_global; ++i)
-         flavortrace[i]=0.;
-
-      for (f1=0; f1<2; ++f1){
-
-//Trace over the spinor indices you have to trace only over those two spinor 
-//component that appear in the final spinor
-         for (i=0; i<2*T_global; ++i)
-            spinortrace[i]=0.;
-
-         for (s1= spinorstart; s1<spinorend; ++s1){
-
-//Trace over the spatial indices
-            for (i=0; i<8*T_global; ++i)
-               spacetrace[i]=0.;
-  
-          for (ix = 0; ix< VOLUME; ++ix){
-
-//Trace over the color indices for each sites
-
-               for (i=0; i<8; ++i)
-                  colortrace[i]=0.;
-               for (c1=0; c1<3; ++c1){
-/*   
-       TYPE  1 OR  2            (1-g5)/2*S(x  ,ytilde) fixed indices (c1, s1, f1)
-       TYPE  3 OR  4            (1+g5)/2*S(x  ,ytilde) running indices bispinor
-*/
-                  upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
-
-//for the up quark
-                  if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_2) ){
-                    _vector_null( running.sp_up.s0 );
-                    _vector_null( running.sp_up.s1 );
-                    _vector_assign( running.sp_up.s2, propfields[12*s1+4*c1+2*f1][ix].sp_up.s2 );
-                    _vector_assign( running.sp_up.s3, propfields[12*s1+4*c1+2*f1][ix].sp_up.s3 );
-                    _vector_null( running.sp_dn.s0 );
-                    _vector_null( running.sp_dn.s1 );
-                    _vector_assign( running.sp_dn.s2, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s2 );
-                    _vector_assign( running.sp_dn.s3, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s3 );
-                  }
-                  if ((type_1234 == TYPE_3) || (type_1234 == TYPE_4)){
-                    _vector_null( running.sp_up.s2 );
-                    _vector_null( running.sp_up.s3 );
-                    _vector_assign( running.sp_up.s0, propfields[12*s1+4*c1+2*f1][ix].sp_up.s0 );
-                    _vector_assign( running.sp_up.s1, propfields[12*s1+4*c1+2*f1][ix].sp_up.s1 );
-                    _vector_null( running.sp_dn.s2 );
-                    _vector_null( running.sp_dn.s3 );
-                    _vector_assign( running.sp_dn.s0, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s0 );
-                    _vector_assign( running.sp_dn.s1, propfields[12*s1+4*c1+2*f1][ix].sp_dn.s1 );
-                  }
+  geometry();
+  g_kappa=-1;
+  if ((g_cart_id == 0) && (g_kappa != -1))
+  {
+      fprintf(stdout, "#error anti-periodic boundary condition is implemented via g_kappa %e\n",g_kappa);
+      exit(1);
+  }
+  boundary(g_kappa);
 
-/*   
-       TYPE  1 OR  2     phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
-       TYPE  3 OR  4     tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
-*/
-                  if (( type_1234 == TYPE_1) || ( type_1234 == TYPE_2) ){
-                    taui_scalarfield_spinor( &running, &running, GAMMA_DN, tauindex, ix, NODIR, DAGGER );
-                  }
-                  else if ( (type_1234 == TYPE_3) || (type_1234 == TYPE_4) ){
-                    taui_scalarfield_spinor( &running, &running, GAMMA_UP, tauindex, ix, NODIR, NO_DAGG);
-                  }
-/*   
-       TYPE  1 OR  2     S(ytilde, x)*phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
-       TYPE  3 OR  4     S(ytilde, x)*tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
-*/
-                  multiply_backward_propagator(&running, propfields, &running, ix, NODIR );
-
-                  //delta( color component of bispinor running, c1) for all spinor and flavor indices
-                  trace_in_color(colortrace,&running,c1);
-
-               }  //End of trace color
-               //sum over all lattice sites the result of the color trace
-               trace_in_space(spacetrace,colortrace,ix);
-
-            } //End of trace space
-
-//Gather the results from all nodes to complete the trace in space
-            for (i=0; i<8*T_global; ++i){
-               _Complex double tmp;
-               MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
-               spacetrace[i]= tmp;
-            }
-            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
-            trace_in_spinor(spinortrace, spacetrace, s1);
-
-         }//End of trace in spinor space
-/*   
-       TYPE  1      tau_i*phi(ytilde)*       (1+gamma5)/2*S(ytilde, x)*phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
-       TYPE  2      phi(ytilde)^dagger*tau_i*(1-gamma5)/2*S(ytilde, x)*phi^dagger(x)*tau_i*  (1-g5)/2*S(x  ,ytilde)
-       TYPE  3      tau_i*phi(ytilde)*       (1+gamma5)/2*S(ytilde, x)*tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
-       TYPE  4      phi(ytilde)^dagger*tau_i*(1-gamma5)/2*S(ytilde, x)*tau_i*phi(x)          (1+g5)/2*S(x  ,ytilde)
-*/
-         if ( (type_1234 == TYPE_1) || (type_1234 == TYPE_3) ){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG );
-         }
-         else if ( ( type_1234 == TYPE_4) || ( type_1234 == TYPE_2 )){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER  );
-         }
-         //delta(flavor component in spinortrace, f1) for all time slices
-         trace_in_flavor( flavortrace, spinortrace, f1 );
-      }  //End of trace in flavor space
-      //sum for all Pauli matrices
-      for (i=0;i<T_global; ++i)
-         paulitrace[i]+=flavortrace[i];
-   } //End of trace for Pauli matrices
-
-   if (g_cart_id == 0){printf("Density Density correlator type (%s) results\n", type_1234 == TYPE_1 ? "1" : type_1234 == TYPE_2 ? "2" : type_1234 == TYPE_3 ? "3" : "4");}
-   for (i=0; i<T_global; ++i){
-      if (g_cart_id == 0){
-        printf("%3d %10.10e %10.10e\n", i, creal(paulitrace[i]),cimag(paulitrace[i]));
-      }
-   }
-   free(flavortrace);
-   free(paulitrace);
-   free(spacetrace);
-   free(spinortrace);
-   free(colortrace);
+  status_geo = check_geometry();
+  if (status_geo != 0)
+  {
+      fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n");
+      exit(1);
+  }
 
-}
 
+  if (Nsave == 0) {
+    Nsave = 1;
+  }
 
+  g_mu = g_mu1;
 
-void naivedirac_current_density_12ab( bispinor ** propfields, int type_12, int type_ab ){
-   int ix,i;
-   int f1,c1,s1,tauindex;
-   int spinorstart, spinorend;
-   su3 * restrict upm;
-   bispinor running;
-   int count;
-   MPI_Status  statuses[8];
-   MPI_Request *request;
-   request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
-
-   _Complex double *colortrace;
-   _Complex double *spacetrace;
-   _Complex double *spinortrace;
-   _Complex double *flavortrace;
-   _Complex double *paulitrace;
-
-   colortrace=(_Complex double *)malloc(sizeof(_Complex double) *8);
-   spacetrace=(_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
-   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
-   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
-   paulitrace= (_Complex double *)malloc(sizeof(_Complex double)*T_global);
-
-   if ( type_ab == TYPE_A ) {
-     spinorstart=0;
-     spinorend  =2;
-   }
-   else if ( type_ab == TYPE_B ){
-     spinorstart=2;
-     spinorend  =4;
-   }
-//Doing the neccessary communication
-   for (s1=spinorstart; s1<spinorend; ++s1)
-      for (c1=0; c1<3; ++c1)
-         for (f1=0; f1<2; ++f1){
-            count=0;
-            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN, request, &count );
-            MPI_Waitall( count, request, statuses);
-            count=0; 
-            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN, request, &count);
-            MPI_Waitall( count, request, statuses);
-         }
-   free(request);
-   
-//Trace over the Pauli matrices
-   for (i=0; i<T_global; ++i)
-      paulitrace[i]=0.;
-
-   for (tauindex=0; tauindex<3; ++tauindex){
-      for (i=0; i<T_global; ++i)
-         flavortrace[i]=0.;
-
-      for (f1=0; f1<2; ++f1){
-
-//Trace over the spinor indices
-         for (i=0; i<2*T_global; ++i)
-            spinortrace[i]=0.;
-
-         for (s1= spinorstart; s1<spinorend; ++s1){
-
-//Trace over the spatial indices
-            for (i=0; i<8*T_global; ++i)
-               spacetrace[i]=0.;
-            for (ix = 0; ix< VOLUME; ++ix){
-
-//Trace over the color indices for each sites
-
-               for (i=0; i<8; ++i) 
-                  colortrace[i]=0.;
-               for (c1=0; c1<3; ++c1){    
-/*   
-       TYPE  IA OR  IB     U0(x-0)*       (1-g5)/2*S(x  ,ytilde) fixed indices (c1, s1, f1)
-       TYPE IIA OR IIB     U0^dagger(x-0)*(1-g5)/2*S(x-0,ytilde) running indices bispinor
-*/
-                  upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
-
-//for the up quark
-                  _vector_null( running.sp_up.s0 ); 
-                  _vector_null( running.sp_up.s1 ); 
-             
-                  if  ( type_12 == TYPE_I ){
-                    _su3_multiply( running.sp_up.s2, (*upm), propfields[12*s1+4*c1+2*f1][ix].sp_up.s2 ); 
-                    _su3_multiply( running.sp_up.s3, (*upm), propfields[12*s1+4*c1+2*f1][ix].sp_up.s3 ); 
-                  }
-                  else if ( type_12 == TYPE_II ){
-                    _su3_inverse_multiply( running.sp_up.s2, (*upm), propfields[12*s1+4*c1+2*f1][g_idn[ix][TUP]].sp_up.s2 ); 
-                    _su3_inverse_multiply( running.sp_up.s3, (*upm), propfields[12*s1+4*c1+2*f1][g_idn[ix][TUP]].sp_up.s3 ); 
-                  }
+  if (g_cart_id == 0)
+  {
+    /*construct the filenames for the observables and the parameters*/
+      strncpy(datafilename, filename, 200);
+      strcat(datafilename, ".data");
+      strncpy(parameterfilename, filename, 200);
+      strcat(parameterfilename, ".para");
 
-//for the up quark
-                  _vector_null( running.sp_dn.s0 ); 
-                  _vector_null( running.sp_dn.s1 ); 
-                  if  ( type_12 == TYPE_I ){
-                    _su3_multiply( running.sp_dn.s2, (*upm), propfields[12*s1+4*c1+2*f1][ix].sp_dn.s2 ); 
-                    _su3_multiply( running.sp_dn.s3, (*upm), propfields[12*s1+4*c1+2*f1][ix].sp_dn.s3 ); 
-                  }
-                  else if ( type_12 == TYPE_II ){
-                    _su3_inverse_multiply( running.sp_dn.s2, (*upm), propfields[12*s1+4*c1+2*f1][g_idn[ix][TUP]].sp_dn.s2 );
-                    _su3_inverse_multiply( running.sp_dn.s3, (*upm), propfields[12*s1+4*c1+2*f1][g_idn[ix][TUP]].sp_dn.s3 );
-                  }
+      parameterfile = fopen(parameterfilename, "w");
+      write_first_messages(parameterfile, "invert", git_hash);
+      fclose(parameterfile);
+  }
 
-/*   
-       TYPE  IA OR  IB     gamma0*U0(x-0)*       (1-g5)/2*S(x  ,ytilde)
-       TYPE IIA OR IIB     gamma0*U0^dagger(x-0)*(1-g5)/2*S(x-0,ytilde)
-*/
-                  _vector_add_assign(running.sp_up.s0, running.sp_up.s2);
-                  _vector_add_assign(running.sp_up.s1, running.sp_up.s3);
-                  _vector_null(running.sp_up.s2);
-                  _vector_null(running.sp_up.s3);
-
-                  _vector_add_assign(running.sp_dn.s0, running.sp_dn.s2);
-                  _vector_add_assign(running.sp_dn.s1, running.sp_dn.s3);
-                  _vector_null(running.sp_dn.s2);
-                  _vector_null(running.sp_dn.s3);
-
-/*   
-       TYPE  IA OR  IB     tau_i*gamma0*U0(x-0)*       (1-g5)/2*S(x  ,ytilde)
-       TYPE IIA OR IIB     tau_i*gamma0*U0^dagger(x-0)*(1-g5)/2*S(x-0,ytilde)
-*/
-                  taui_spinor( &running, &running, tauindex  );
-
-/*   
-       TYPE  IA OR  IB     S(ytilde, x-0)* tau_i*gamma0*U0(x-0)*       (1-g5)/2*S(x  ,ytilde)
-       TYPE IIA OR IIB     S(ytilde, x  )* tau_i*gamma0*U0^dagger(x-0)*(1-g5)/2*S(x-0,ytilde)
-*/
-                  if ( type_12 == TYPE_I ){ 
-                    multiply_backward_propagator(&running, propfields, &running, ix, TDOWN);
-                  }
-                  else if ( type_12 == TYPE_II ){
-                    multiply_backward_propagator(&running, propfields, &running, ix,NODIR);
-                  }
-                  //delta( color component of bispinor running, c1) for all spinor and flavor indices
-                  trace_in_color(colortrace,&running,c1);
-
-               }  //End of trace color
-               //sum over all lattice sites the result of the color trace
-               trace_in_space(spacetrace,colortrace,ix);
-
-            } //End of trace space
-
-//Gather the results from all nodes to complete the trace in space
-            for (i=0; i<8*T_global; ++i){
-               _Complex double tmp;
-               MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
-               spacetrace[i]= tmp;
-            }
-            // delta (spinor components of spacetrace, s1) for all time slices and flavor components
-            trace_in_spinor(spinortrace, spacetrace, s1);
-
-         }//End of trace in spinor space
-   
-/*   
-       TYPE  IA tau_i*phi(ytilde)        *  (1+gamma5)/2  *   S(ytilde, x-0)*   tau_i*gamma0*U0(x-0)*       (1-g5)/2*   S(x  ,ytilde)
-       TYPE  IB phi^dagger(ytilde)*tau_i *  (1-gamma5)/2  *   S(ytilde, x-0)*   tau_i*gamma0*U0(x-0)*       (1-g5)/2*   S(x  ,ytilde)
-
-       TYPE IIA tau_i*phi(ytilde)        *  (1+gamma5)/2  *   S(ytilde, x  )*   tau_i*gamma0*U0^dagger(x-0)*(1-g5)/2*   S(x-0,ytilde)
-       TYPE IIB phi^dagger(ytilde)*tau_i *  (1-gamma5)/2  *   S(ytilde, x  )*   tau_i*gamma0*U0^dagger(x-0)*(1-g5)/2*   S(x-0,ytilde)
-
-*/
-         if ( type_ab == TYPE_A ){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG );
-         }
-         else if ( type_ab == TYPE_B){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER  );
-         }
-         //delta(flavor component in spinortrace, f1) for all time slices 
-         trace_in_flavor( flavortrace, spinortrace, f1 );
-      } //End of trace in flavor space
-      //sum for all Pauli matrices
-      for (i=0;i<T_global; ++i)
-         paulitrace[i]+=flavortrace[i];
-   } //End of trace for Pauli matrices
-   
-   if (g_cart_id == 0){printf("NaiveDirac Current Density correlator type (%s %s) results\n", type_12 == TYPE_I ? "I" : "II",type_ab == TYPE_A ? 'a' :'b');}
-   for (i=0; i<T_global; ++i){
-      if (g_cart_id == 0){
-        printf("%3d %10.10e %10.10e\n", i, creal(paulitrace[i]),cimag(paulitrace[i]));
-      }
-   }
-   free(flavortrace);
-   free(paulitrace);
-   free(spacetrace);
-   free(spinortrace);
-   free(colortrace);
+  init_operators();
 
-}
-void wilsonterm_current_density_312ab( bispinor ** propfields, int type_12, int type_ab ){
-   int ix,i;
-   int f1,c1,s1,tauindex;
-   int spinorstart, spinorend;
-   su3 * restrict upm;
-   su3_vector tmpvec;
-   bispinor running;
-   int count;
-   MPI_Status  statuses[8];
-   MPI_Request *request;
-   request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
-   _Complex double *colortrace;
-   _Complex double *spacetrace;
-   _Complex double *spinortrace;
-   _Complex double *flavortrace;
-   _Complex double *paulitrace;
-
-   colortrace= (_Complex double *)malloc(sizeof(_Complex double) *8);
-   spacetrace= (_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
-   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
-   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
-   paulitrace= (_Complex double *)malloc(sizeof(_Complex double)*T_global);
-
-   if ( type_ab == TYPE_A ) {
-      spinorstart=0;
-      spinorend  =2;
-   }
-   else if ( type_ab == TYPE_B ){
-      spinorstart=2;
-      spinorend  =4;
-   }
-// Doing the neccessary communication
-   for (s1=spinorstart; s1<spinorend; ++s1)
-      for (c1=0; c1<3; ++c1)
-         for (f1=0; f1<2; ++f1){
-            count=0;
-            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TUP   , request, &count );
-            MPI_Waitall( count, request, statuses);
-            count=0;
-            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN , request, &count );
-            MPI_Waitall( count, request, statuses);
-            count=0;
-            generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN , request, &count);
-            MPI_Waitall( count, request, statuses);
-         }
-   free(request);
-
-//Trace over the Pauli matrices
-   for (i=0; i<T_global; ++i){
-      paulitrace[i]=0.;
-   }
-   for (tauindex= 0; tauindex <3; ++tauindex){
-//Trace over flavour degrees of freedom
-      for (i=0; i<T_global; ++i)
-         flavortrace[i]=0.;
-
-      for (f1=0; f1<2; ++f1){
-
-//Trace over spinor indices
-         for (i=0; i<8*T_global; ++i){
-            spinortrace[i]=0.;
-         }
-
-         for (s1=spinorstart; s1<spinorend; ++s1){
-
-//Trace over spatial indices
-            for (i=0; i<8*T_global; ++i){
-               spacetrace[i]=0.;
-            }
-            for (ix=0; ix<VOLUME; ++ix){
-
-//Trace over the color indices for each sites
-               for (i=0; i<8; ++i)
-                  colortrace[i]=0.;
-               for (c1=0; c1<3; ++c1){
-/*   
-       TYPE III.1.a OR  III.1.b     U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
-       TYPE III.2.a OR  III.2.b                    (1+gamma5)/2 *  S(x-0,ytilde)
-*/
-                  _vector_null(running.sp_up.s2);
-                  _vector_null(running.sp_up.s3);
-                  _vector_null(running.sp_dn.s2);
-                  _vector_null(running.sp_dn.s3);
-
-                  if ( type_12 == TYPE_1){
-                    upm = &g_gauge_field[ix][TUP];
-//for the up quark
-                    _su3_multiply(running.sp_up.s0, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_iup[ix][TUP]].sp_up.s0);
-                    _su3_multiply(running.sp_up.s1, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_iup[ix][TUP]].sp_up.s1);
-
-//for the down quark
-                    _su3_multiply(running.sp_dn.s0, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_iup[ix][TUP]].sp_dn.s0);
-                    _su3_multiply(running.sp_dn.s1, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_iup[ix][TUP]].sp_dn.s1);
-
-                    upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
-
-//for the up quark
-
-                    _su3_multiply(tmpvec, (*upm), running.sp_up.s0);
-                    _vector_assign(  running.sp_up.s0, tmpvec);
-                    _su3_multiply(tmpvec, (*upm), running.sp_up.s1);
-                    _vector_assign(  running.sp_up.s1, tmpvec);
-
-//for the down quark
-
-                    _su3_multiply(tmpvec, (*upm), running.sp_dn.s0);
-                    _vector_assign(  running.sp_dn.s0, tmpvec);
-                    _su3_multiply(tmpvec, (*upm), running.sp_dn.s1);
-                    _vector_assign(  running.sp_dn.s1, tmpvec);
-                  }
-                  else if ( type_12 == TYPE_2){
-                    _vector_assign( running.sp_up.s0, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_up.s0 );
-                    _vector_assign( running.sp_up.s1, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_up.s1 );
-                    _vector_assign( running.sp_dn.s0, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_dn.s0 );
-                    _vector_assign( running.sp_dn.s1, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_dn.s1 );
-                  }
-/*   
-       TYPE III.1.a OR  III.1.b     tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
-       TYPE III.2.a OR  III.2.b     tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
-*/
-                  taui_scalarfield_spinor( &running, &running, GAMMA_UP, tauindex, ix, NODIR, NO_DAGG);
-
-/*   
-       TYPE III.1.a OR  III.1.b     S(ytilde, x-0)*tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
-       TYPE III.2.a OR  III.2.b     S(ytilde, x-0)*tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
-*/
-                  multiply_backward_propagator(&running, propfields, &running, ix,-1);
-                  //delta( color component of bispinor running, c1) for all spinor and flavor indices                  
-                  trace_in_color(colortrace, &running, c1 );
-               } //End of trace color
-               //sum over all lattice sites the result of the color trace
-               trace_in_space( spacetrace, colortrace, ix);
-            }  //End of trace in space
-
-//Gather the results from all nodes to complete the trace in space
-            for (i=0; i<8*T_global; ++i){
-               _Complex double tmp;
-               MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
-               spacetrace[i]= tmp;
-            }
-            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices 
-            trace_in_spinor(spinortrace, spacetrace, s1);
-
-         } //End of trace in spinor space
-
-/*   
-       TYPE III.1.a                 tau_i*phi(ytilde)*       (1+gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
-       TYPE III.1.b                 phi^dagger(ytilde)*tau_i*(1-gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
-
-       TYPE III.2.a                 tau_i*phi(ytilde)*       (1+gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
-       TYPE III.2.b                 phi^dagger(ytilde)*tau_i*(1-gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
-
-*/
-         if ( type_ab == TYPE_A ){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG );
-         }
-         else if ( type_ab == TYPE_B){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER  );
-         }
-         //delta(flavor component in spinortrace, f1) for all time slices
-         trace_in_flavor( flavortrace, spinortrace, f1 );
-
-      } //End of trace in flavor space
-      //sum for all Pauli matrices
-      for (i=0;i<T_global; ++i)
-         paulitrace[i]+=flavortrace[i];
-   } //End of trace for Pauli matrices
 
- 
-   if (g_cart_id == 0){printf("Wilson term Dirac Current Density correlator typeIII results= %10.10e %10.10e\n", type_12 == TYPE_1 ? '1' : '2',type_ab == TYPE_A ? 'a' :'b');}
-   for (i=0; i<T_global; ++i){
-      if (g_cart_id == 0){
-        printf("%3d %10.10e %10.10e\n", i, creal(paulitrace[i]), cimag(paulitrace[i]));
+  for (j = 0; j < Nmeas; j++)
+  {
+      sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore);
+      if (g_cart_id == 0)
+      {
+          printf("#\n# Trying to read gauge field from file %s in %s precision.\n",
+            conf_filename, (gauge_precision_read_flag == 32 ? "single" : "double"));
+          fflush(stdout);
       }
-   }
-   free(flavortrace);
-   free(paulitrace);
-   free(spacetrace);
-   free(spinortrace);
-   free(colortrace);
-}
-void wilsonterm_current_density_412ab( bispinor ** propfields, int type_12, int type_ab ){
-   int ix,i;
-   int f1,c1,s1,tauindex;
-   int spinorstart, spinorend;
-   bispinor **propsecneighbour;
-   bispinor **tmpbisp2d;
-   su3 * restrict upm;
-   bispinor running;
-   su3_vector tmpvec;
-   int count;
-   MPI_Status  statuses[8];
-   MPI_Request *request;
-   _Complex double *colortrace;
-   _Complex double *spacetrace;
-   _Complex double *spinortrace;
-   _Complex double *flavortrace;
-   _Complex double *paulitrace;
-
-   colortrace= (_Complex double *)malloc(sizeof(_Complex double) *8);
-   spacetrace= (_Complex double *)malloc(sizeof(_Complex double) *8*T_global);
-   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
-   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
-   paulitrace= (_Complex double *)malloc(sizeof(_Complex double)*T_global);
-
-   request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
-   if ( type_ab == TYPE_A ) {
-        spinorstart=0;
-        spinorend  =2;
-   }
-   else if ( type_ab == TYPE_B ){
-        spinorstart=2;
-        spinorend  =4;
-   }
-
-   if (type_12 == TYPE_2){
-/**********************************
-Creating U^dagger(x-0)*U^dagger(x-2*0)*S(x-2*0,ytilde) in three steps:
-1; Creating U^dagger(x)*S(x,ytilde)
-2; Creating U^dagger(x+0)Ü0^dagger(x)*S(x, ytilde)
-3; Gathering two times in direction TDOWN
-***********************************/
-      tmpbisp2d= (bispinor **)malloc(sizeof(bispinor *)*24);
-      propsecneighbour=(bispinor **)malloc(sizeof(bispinor *)*24);
-      for (i=0; i<24; ++i){
-        propsecneighbour[i]=(bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND);
-        tmpbisp2d[i]=(bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND); 
+      if ( (i = read_gauge_field(conf_filename,g_gauge_field) ) !=0)
+      {
+          fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename);
+          exit(-2);
       }
-      for (i=0; i<24; ++i)
-        for (ix=0; ix<VOLUME; ++ix)
-          _bispinor_null(tmpbisp2d[i][ix]);
-
-      for (ix = 0; ix< VOLUME; ++ix)
-        for (s1=spinorstart;s1<spinorend; ++s1)
-          for (c1=0; c1<3; ++c1)
-            for (f1=0; f1<2; ++f1){
-
-              upm = &g_gauge_field[ix][TUP];
-
-              _vector_null(tmpbisp2d[12*f1 + 3*s1 + c1][ix].sp_up.s2);
-              _vector_null(tmpbisp2d[12*f1 + 3*s1 + c1][ix].sp_up.s3);
-              _su3_inverse_multiply(tmpbisp2d[12*f1+3*s1+c1][ix].sp_up.s0, (*upm), propfields[12*s1 + 4*c1 + 2*f1][ix].sp_up.s0);
-              _su3_inverse_multiply(tmpbisp2d[12*f1+3*s1+c1][ix].sp_up.s1, (*upm), propfields[12*s1 + 4*c1 + 2*f1][ix].sp_up.s1);
-
-              _vector_null(tmpbisp2d[12*f1 + 3*s1 + c1][ix].sp_dn.s2);
-              _vector_null(tmpbisp2d[12*f1 + 3*s1 + c1][ix].sp_dn.s3);
-              _su3_inverse_multiply(tmpbisp2d[12*f1+3*s1+c1][ix].sp_dn.s0, (*upm), propfields[12*s1 + 4*c1 + 2*f1][ix].sp_dn.s0);
-              _su3_inverse_multiply(tmpbisp2d[12*f1+3*s1+c1][ix].sp_dn.s1, (*upm), propfields[12*s1 + 4*c1 + 2*f1][ix].sp_dn.s1);
-
-               upm = &g_gauge_field[g_iup[ix][TUP]][TUP];
-               
-               _vector_null( tmpvec );
-               _su3_inverse_multiply(tmpvec, (*upm), tmpbisp2d[12*f1+3*s1+c1][ix].sp_up.s0);
-               _vector_assign(  tmpbisp2d[12*f1+3*s1+c1][ix].sp_up.s0, tmpvec);
-               _vector_null( tmpvec );
-               _su3_inverse_multiply(tmpvec, (*upm), tmpbisp2d[12*f1+3*s1+c1][ix].sp_up.s1);
-               _vector_assign(  tmpbisp2d[12*f1+3*s1+c1][ix].sp_up.s1, tmpvec);
-
-               _vector_null( tmpvec );
-               _su3_inverse_multiply(tmpvec, (*upm), tmpbisp2d[12*f1+3*s1+c1][ix].sp_dn.s0);
-               _vector_assign(  tmpbisp2d[12*f1+3*s1+c1][ix].sp_dn.s0, tmpvec);
-               _vector_null( tmpvec );
-               _su3_inverse_multiply(tmpvec, (*upm), tmpbisp2d[12*f1+3*s1+c1][ix].sp_dn.s1);
-               _vector_assign(  tmpbisp2d[12*f1+3*s1+c1][ix].sp_dn.s1, tmpvec);
-            }
-      for (s1=spinorstart;s1<spinorend; ++s1)
-        for (c1=0; c1<3; ++c1)
-          for (f1=0; f1<2; ++f1){
-            count=0;
-            generic_exchange_direction_nonblocking( tmpbisp2d[12*f1+3*s1+c1], sizeof(bispinor), TDOWN, request, &count );
-            MPI_Waitall( count, request, statuses);
-          }
-      for (s1=spinorstart;s1<spinorend; ++s1)
-        for (c1=0; c1<3; ++c1)
-          for (f1=0; f1<2; ++f1){
-            for (ix=0; ix<VOLUMEPLUSRAND; ++ix)
-               _bispinor_null(propsecneighbour[12*f1+3*s1+c1][ix]);
-            for (ix=0; ix<VOLUME; ++ix){
-               _spinor_assign( propsecneighbour[12*f1+3*s1+c1][ix].sp_up, tmpbisp2d[12*f1+3*s1+c1][g_idn[ix][TUP]].sp_up);
-               _spinor_assign( propsecneighbour[12*f1+3*s1+c1][ix].sp_dn, tmpbisp2d[12*f1+3*s1+c1][g_idn[ix][TUP]].sp_dn);
-            }
-          }
-      for (s1=spinorstart;s1<spinorend; ++s1)
-       for (c1=0; c1<3; ++c1)
-         for (f1=0; f1<2; ++f1){
-           count=0;
-           generic_exchange_direction_nonblocking( propsecneighbour[12*f1+3*s1+c1], sizeof(bispinor), TDOWN, request, &count );
-           MPI_Waitall( count, request, statuses);
-        }
-   }
-   for (i=0; i<T_global; ++i)
-      paulitrace[i]=0.;
-
-// Trace over the Pauli matrices
-   for (tauindex=0; tauindex<3; ++tauindex){
-
-//Trace over flavour degrees of freedom
-      for (i=0; i<T_global; ++i)
-         flavortrace[i]=0.;
-
-      for (f1=0; f1<2; ++f1){
-
-//Trace over spinor indices
-         for (i=0; i<8*T_global; ++i){
-            spinortrace[i]=0.;
-         }
-
-         for (s1=spinorstart; s1<spinorend; ++s1){
-
-//Trace over spatial indices
-            for (i=0; i<8*T_global; ++i){
-               spacetrace[i]=0.;
-            }
-            for (ix=0; ix<VOLUME; ++ix){
-
-//Trace over the color indices for each sites
-               for (i=0; i<8; ++i)
-                  colortrace[i]=0.;
-               for (c1=0; c1<3; ++c1){
-
-/*   
-       TYPE IV.1.a OR  IV.1.b                                     (1+gamma5)/2*S(x    ,ytilde)
-       TYPE IV.2.a OR  IV.2.b     U0^dagger(x-0)*U0^dagger(x-2*0)*(1+gamma5)/2*S(x-2*0,ytilde)
-*/
-                  _vector_null(running.sp_up.s2);
-                  _vector_null(running.sp_up.s3);
-                  _vector_null(running.sp_dn.s2);
-                  _vector_null(running.sp_dn.s3);
-                  if ( type_12 == TYPE_2){
-//for the up quark
-                     _vector_assign( running.sp_up.s0, propsecneighbour[12*f1 + 3*s1 + c1][g_idn[ix][TUP]].sp_up.s0);
-                     _vector_assign( running.sp_up.s1, propsecneighbour[12*f1 + 3*s1 + c1][g_idn[ix][TUP]].sp_up.s1);
-
-//for the down quark
-                     _vector_assign( running.sp_dn.s0, propsecneighbour[12*f1 + 3*s1 + c1][g_idn[ix][TUP]].sp_dn.s0);
-                     _vector_assign( running.sp_dn.s1, propsecneighbour[12*f1 + 3*s1 + c1][g_idn[ix][TUP]].sp_dn.s1);
-                  }
-                  else if ( type_12 == TYPE_1){
-                     _vector_assign( running.sp_up.s0, propfields[12*s1 + 4*c1 + 2*f1][ix].sp_up.s0 );
-                     _vector_assign( running.sp_up.s1, propfields[12*s1 + 4*c1 + 2*f1][ix].sp_up.s1 );
 
-                     _vector_assign( running.sp_dn.s0, propfields[12*s1 + 4*c1 + 2*f1][ix].sp_dn.s0 );
-                     _vector_assign( running.sp_dn.s1, propfields[12*s1 + 4*c1 + 2*f1][ix].sp_dn.s1 );
-                  }
-/*   
-       TYPE IV.1.a OR  IV.1.b   tau_i*phi(x)*                                  (1+gamma5)/2*S(x+   ,ytilde)
-       TYPE IV.2.a OR  IV.2.b   tau_i*phi(x)*  U0^dagger(x-0)*U0^dagger(x-2*0)*(1+gamma5)/2*S(x-2*0,ytilde)
-*/
-                  taui_scalarfield_spinor( &running, &running, GAMMA_UP, tauindex, ix, TDOWN, NO_DAGG);
-
-/*   
-       TYPE IV.1.a OR  IV.1.b   S(ytilde, x)*tau_i*phi(x)*                                  (1+gamma5)/2*S(x+   ,ytilde)
-       TYPE IV.2.a OR  IV.2.b   S(ytilde, x)*tau_i*phi(x)*  U0^dagger(x-0)*U0^dagger(x-2*0)*(1+gamma5)/2*S(x-2*0,ytilde)
-*/
-
-                  multiply_backward_propagator(&running, propfields, &running, ix, NODIR);
-
-/*   
-       TYPE IV.1.a tau_i*phi(ytilde)*         S(ytilde, x)*tau_i*phi(x)*                                  (1+gamma5)/2*S(x+   ,ytilde)
-       TYPE IV.1.b phi^dagger(ytilde)*tau_i*  S(ytilde, x)*tau_i*phi(x)*                                  (1+gamma5)/2*S(x+   ,ytilde)
-
-       TYPE IV.2.a tau_i*phi(ytilde)*         S(ytilde, x)*tau_i*phi(x)*  U0^dagger(x-0)*U0^dagger(x-2*0)*(1+gamma5)/2*S(x-2*0,ytilde)
-       TYPE IV.2.b phi^dagger(ytilde)*tau_i*  S(ytilde, x)*tau_i*phi(x)*  U0^dagger(x-0)*U0^dagger(x-2*0)*(1+gamma5)/2*S(x-2*0,ytilde)
-*/
-                  //delta( color component of bispinor running, c1) for all spinor and flavor indices
-                  trace_in_color(colortrace, &running, c1 );
-               } //End of trace color
-               //sum over all lattice sites the result of the color trace
-               trace_in_space( spacetrace, colortrace, ix);
-            }  //End of trace in space
-
-//Gather the results from all nodes to complete the trace in space
-            for (i=0; i<8*T_global; ++i){
-               _Complex double tmp;
-               MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
-               spacetrace[i]= tmp;
-            }
-            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
-            trace_in_spinor(spinortrace, spacetrace, s1);
-
-         } //End of trace in spinor space
-
-         if ( type_ab == TYPE_A ){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG );
-         }
-         else if ( type_ab == TYPE_B){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER  );
-         }
-         //delta(flavor component in spinortrace, f1) for all time slices
-         trace_in_flavor( flavortrace, spinortrace, f1 );
-
-      } //End of trace in flavor space
-      //sum for all Pauli matrices
-      for (i=0;i<T_global; ++i)
-         paulitrace[i]+=flavortrace[i];
-   } //End of trace for Pauli matrices
-
-
-   if (g_cart_id == 0){printf("Wilson term Dirac Current Density correlator typeIV results= %10.10e %10.10e\n", type_12 == TYPE_1 ? '1' : '2',type_ab == TYPE_A ? 'a' :'b');}
-   for (i=0; i<T_global; ++i){
-      if (g_cart_id == 0){
-        printf("%3d %10.10e %10.10e\n", i, creal(paulitrace[i]), cimag(paulitrace[i]));
+      snprintf(conf_filename, 50, "%s_smeared.%.4d", gauge_input_filename, nstore);
+      if (g_cart_id == 0) {
+        printf("#\n# Trying to read smeared gauge field from file %s in %s precision.\n",
+                conf_filename, (gauge_precision_read_flag == 32 ? "single" : "double"));
+        fflush(stdout);
       }
-   }
-   free(flavortrace);
-   free(paulitrace);
-   free(spacetrace);
-   free(spinortrace);
-   free(colortrace);
-
-   if (type_12 == TYPE_2){
-     for(i=0;i<24;++i){
-       free(tmpbisp2d[i]);
-       free(propsecneighbour[i]);
-     }
-     free(tmpbisp2d);
-     free(propsecneighbour);
-   }
-   free(request); 
-}
-void wilsonterm_current_density_512ab( bispinor ** propfields, int type_12, int type_ab ){
-   int ix,i;
-   int f1,c1,s1,tauindex;
-   int spinorstart, spinorend;
-   su3 * restrict upm;
-   bispinor running;
-   su3_vector tmpvec;
-   int count;
-   MPI_Status  statuses[8];
-   MPI_Request *request;
-   request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
-
-   _Complex double *colortrace;
-   _Complex double *spacetrace;
-   _Complex double *spinortrace;
-   _Complex double *flavortrace;
-   _Complex double *paulitrace;
-
-   colortrace= (_Complex double *)malloc(sizeof(_Complex double)*8);
-   spacetrace= (_Complex double *)malloc(sizeof(_Complex double)*8*T_global);
-   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
-   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
-   paulitrace= (_Complex double *)malloc(sizeof(_Complex double)*T_global);
-
-   if ( type_ab == TYPE_A ) {
-     spinorstart=0;
-     spinorend  =2;
-   }
-   else if ( type_ab == TYPE_B ){
-     spinorstart=2;
-     spinorend  =4;
-   }
-//Doing the neccesary communication
-   for (s1=spinorstart; s1<spinorend; ++s1)
-     for (c1=0; c1<3; ++c1)
-       for (f1=0; f1<2; ++f1){
-           count=0;
-           generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TDOWN , request, &count );
-           MPI_Waitall( count, request, statuses);
-           count=0;
-           generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TUP   , request, &count );
-           MPI_Waitall( count, request, statuses);
-           count=0;
-           generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN , request, &count);
-           MPI_Waitall( count, request, statuses);
-       }
-   free(request);
-// Trace over the Pauli matrices
-   for (tauindex=0; tauindex<3; ++tauindex){
-
-//Trace over flavour degrees of freedom
-      for (i=0; i<T_global; ++i)
-         flavortrace[i]=0.;
-
-      for (f1=0; f1<2; ++f1){
-
-//Trace over spinor indices
-         for (i=0; i<8*T_global; ++i){
-            spinortrace[i]=0.;
-         }
-
-         for (s1=spinorstart; s1<spinorend; ++s1){
-
-//Trace over spatial indices
-            for (i=0; i<8*T_global; ++i){
-               spacetrace[i]=0.;
-            }
-            for (ix=0; ix<VOLUME; ++ix){
-
-//Trace over the color indices for each sites
-               for (i=0; i<8; ++i)
-                  colortrace[i]=0.;
-               for (c1=0; c1<3; ++c1){
-/*   
-       TYPE V.1.a OR  V.1.b     U0^dagger(x)*U0^dagger(x-0)* (1-gamma5)/2 *  S(x-0,ytilde)
-       TYPE V.2.a OR  V.2.b                                  (1-gamma5)/2 *  S(x-0,ytilde)
-*/
-                  _vector_null(running.sp_up.s0);
-                  _vector_null(running.sp_up.s1);
-                  _vector_null(running.sp_dn.s0);
-                  _vector_null(running.sp_dn.s1);
-
-                  if ( type_12 == TYPE_1){
-                    upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
-//for the up quark
-                    _su3_inverse_multiply(running.sp_up.s2, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_up.s2);
-                    _su3_inverse_multiply(running.sp_up.s3, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_up.s3);
-
-//for the down quark
-                    _su3_inverse_multiply(running.sp_dn.s2, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_dn.s2);
-                    _su3_inverse_multiply(running.sp_dn.s3, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_dn.s3);
-
-                    upm = &g_gauge_field[ix][TUP];
-
-//for the up quark
-                    _vector_null( tmpvec );
-                    _su3_inverse_multiply(tmpvec, (*upm), running.sp_up.s0);
-                    _vector_assign(  running.sp_up.s0, tmpvec);
- 
-                    _vector_null( tmpvec );
-                    _su3_inverse_multiply(tmpvec, (*upm), running.sp_up.s1);
-                    _vector_assign(  running.sp_up.s1, tmpvec);
 
-//for the down quark
+      if( (i = read_gauge_field(conf_filename,g_smeared_gauge_field)) !=0) {
+          fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename);
+          exit(-2);
+      }
 
-                    _vector_null( tmpvec );
-                    _su3_inverse_multiply(tmpvec, (*upm), running.sp_dn.s0);
-                    _vector_assign(  running.sp_dn.s0, tmpvec);
 
-                    _vector_null( tmpvec );
-                    _su3_inverse_multiply(tmpvec, (*upm), running.sp_dn.s1);
-                    _vector_assign(  running.sp_dn.s1, tmpvec);
-                  }
-                  else if ( type_12 == TYPE_2){
-                    _vector_assign( running.sp_up.s0, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_up.s2 );
-                    _vector_assign( running.sp_up.s1, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_up.s3 );
+      if (g_cart_id == 0) {
+          printf("# Finished reading gauge field.\n");
+          fflush(stdout);
+      }
 
-                    _vector_assign( running.sp_dn.s0, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_dn.s2 );
-                    _vector_assign( running.sp_dn.s1, propfields[12*s1 + 4*c1 + 2*f1][g_idn[ix][TUP]].sp_dn.s3 );
-                  }
-/*   
-       TYPE V.1.a OR  V.1.b     phi^dagger(x)*tau_i*     U0^dagger(x)*U0^dagger(x-0)* (1-gamma5)/2 *  S(x-0,ytilde)
-       TYPE V.2.a OR  V.2.b     phi^dagger(x)*tau_i                                        (1-gamma5)/2 *  S(x-0,ytilde)
-*/
-                 
-                  taui_scalarfield_spinor( &running, &running, GAMMA_DN, tauindex, ix, NODIR, DAGGER);
+#ifdef TM_USE_MPI
+      xchange_gauge(g_gauge_field);
+      xchange_gauge(g_smeared_gauge_field);
 
-/*   
-       TYPE V.1.a OR  V.1.b     S(ytilde,x+0)*phi^dagger(x)*tau_i*     U0^dagger(x)*U0^dagger(x-0)* (1-gamma5)/2 *  S(x-0,ytilde)
-       TYPE V.2.a OR  V.2.b     S(ytilde,x-0)*phi^dagger(x)*tau_i                                   (1-gamma5)/2 *  S(x-0,ytilde)
-*/
+#endif
+    /*compute the energy of the gauge field*/
+      plaquette_energy = measure_plaquette( (const su3**) g_gauge_field);
 
-                  if (type_12 == TYPE_1){
-                    multiply_backward_propagator(&running, propfields, &running, ix, TUP);
-                  }
-                  else if (type_12 == TYPE_2){
-                    multiply_backward_propagator(&running, propfields, &running, ix, TDOWN);
-                  }
-/*   
-       TYPE V.1.a                 tau_i*phi(ytilde)*       (1+gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
-       TYPE V.1.b                 phi^dagger(ytilde)*tau_i*(1-gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*U0(x-0)*U0(x)* (1+gamma5)/2 *  S(x+0,ytilde)
-
-       TYPE V.2.a                 tau_i*phi(ytilde)*       (1+gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
-       TYPE V.2.b                 phi^dagger(ytilde)*tau_i*(1-gamma5)/2* S(ytilde, x-0)*tau_i*phi(x)*               (1+gamma5)/2 *  S(x-0,ytilde)
-
-*/
-                  //delta( color component of bispinor running, c1) for all spinor and flavor indices
-                  trace_in_color(colortrace, &running, c1 );
-               } //End of trace color
-               //sum over all lattice sites the result of the color trace
-               trace_in_space( spacetrace, colortrace, ix);
-            }  //End of trace in space
-
-//Gather the results from all nodes to complete the trace in space
-            for (i=0; i<8*T_global; ++i){
-               _Complex double tmp;
-               MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
-               spacetrace[i]= tmp;
-            }
-            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
-            trace_in_spinor(spinortrace, spacetrace, s1);
-
-         } //End of trace in spinor space
-
-         if ( type_ab == TYPE_A ){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG );
-         }
-         else if ( type_ab == TYPE_B){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER  );
-         }
-         //delta(flavor component in spinortrace, f1) for all time slices
-         trace_in_flavor( flavortrace, spinortrace, f1 );
-
-      } //End of trace in flavor space
-
-      for (i=0;i<T_global; ++i)
-         paulitrace[i]+=flavortrace[i];
-   } //End of trace for Pauli matrices
-
-
-   if (g_cart_id == 0){printf("Wilson term Dirac Current Density correlator typeV results= %10.10e %10.10e\n", type_12 == TYPE_1 ? '1' : '2',type_ab == TYPE_A ? 'a' :'b');}
-   for (i=0; i<T_global; ++i){
-      if (g_cart_id == 0){
-        printf("%3d %10.10e %10.10e\n", i, creal(paulitrace[i]), cimag(paulitrace[i]));
+
+      if (g_cart_id == 0) {
+          printf("# The computed plaquette value is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc));
+          fflush(stdout);
       }
-   }
-   free(flavortrace);
-   free(paulitrace);
-   free(spacetrace);
-   free(spinortrace);
-   free(colortrace);
-}
-void wilsonterm_current_density_612ab( bispinor ** propfields, int type_12, int type_ab ){
-   int ix,i;
-   int f1,c1,s1,tauindex;
-   int spinorstart, spinorend;
-   bispinor **starting2d;
-   bispinor **running2d;
-   bispinor **tmpbisp2d;
-   su3 * restrict upm;
-   su3_vector tmpvec;
-   int count;
-   MPI_Status  statuses[8];
-   MPI_Request *request;
-   _Complex double *colortrace;
-   _Complex double *spacetrace;
-   _Complex double *spinortrace;
-   _Complex double *flavortrace;
-   _Complex double *paulitrace;
-
-   colortrace= (_Complex double *)malloc(sizeof(_Complex double)*8);
-   spacetrace= (_Complex double *)malloc(sizeof(_Complex double)*8*T_global);
-   spinortrace=(_Complex double *)malloc(sizeof(_Complex double)*2*T_global);
-   flavortrace=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
-   paulitrace= (_Complex double *)malloc(sizeof(_Complex double)*T_global);
-
-
-   request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
-   if ( type_ab == TYPE_A ) {
-        spinorstart=0;
-        spinorend  =2;
-   }
-   else if ( type_ab == TYPE_B ){
-        spinorstart=2;
-        spinorend  =4;
-   }
-   tmpbisp2d= (bispinor **)malloc(sizeof(bispinor *)*24);
-   running2d= (bispinor **)malloc(sizeof(bispinor *)*24);
-   starting2d=(bispinor **)malloc(sizeof(bispinor *)*24);
-   for (i=0; i<24; ++i){
-     tmpbisp2d[i] =(bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND);
-     starting2d[i]=(bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND);
-     running2d[i] =(bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND);
-     for (ix=0; ix<VOLUME; ++ix){
-       _bispinor_null( running2d[i][ix]);
-       _bispinor_null(starting2d[i][ix]);
-       _bispinor_null( tmpbisp2d[i][ix]);
-     }
-   } 
-//Doing the neccesary communication
-   for (s1=spinorstart; s1<spinorend; ++s1)
-     for (c1=0; c1<3; ++c1)
-       for (f1=0; f1<2; ++f1){
-           count=0;
-           generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 0], sizeof(bispinor), TUP   , request, &count );
-           MPI_Waitall( count, request, statuses);
-           count=0;
-           generic_exchange_direction_nonblocking( propfields[12*s1 + 4*c1 + 2*f1 + 1], sizeof(bispinor), TDOWN , request, &count );
-           MPI_Waitall( count, request, statuses);
-   }
-/*************************
-Creating U0(x-2*0)U0(x-0)*S(x, ytilde) in two steps:
-1; Doing the product U0(x-0)*U0(x)*S( x+0, ytilde)
-2; Gathering in direction TDOWN
-**************************/
-   if (type_12 == TYPE_2){
-      for (ix = 0; ix< VOLUME; ++ix)
-        for (s1=spinorstart;s1<spinorend; ++s1)
-          for (c1=0; c1<3; ++c1)
-            for (f1=0; f1<2; ++f1){
-
-               upm = &g_gauge_field[ix][TUP];
-
-               _su3_multiply(starting2d[12*f1+3*s1+c1][ix].sp_up.s2, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_iup[ix][TUP]].sp_up.s2);
-               _su3_multiply(starting2d[12*f1+3*s1+c1][ix].sp_up.s3, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_iup[ix][TUP]].sp_up.s3);
-
-               _su3_multiply(starting2d[12*f1+3*s1+c1][ix].sp_dn.s2, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_iup[ix][TUP]].sp_dn.s2);
-               _su3_multiply(starting2d[12*f1+3*s1+c1][ix].sp_dn.s3, (*upm), propfields[12*s1 + 4*c1 + 2*f1][g_iup[ix][TUP]].sp_dn.s3);
-
-               upm = &g_gauge_field[g_idn[ix][TUP]][TUP];
-
-               _su3_multiply(tmpvec, (*upm), starting2d[12*f1+3*s1+c1][ix].sp_up.s2);
-               _vector_assign(  starting2d[12*f1+3*s1+c1][ix].sp_up.s2, tmpvec);
-               _su3_multiply(tmpvec, (*upm), starting2d[12*f1+3*s1+c1][ix].sp_up.s3);
-               _vector_assign(  starting2d[12*f1+3*s1+c1][ix].sp_up.s3, tmpvec);
-
-               _su3_multiply(tmpvec, (*upm), starting2d[12*f1+3*s1+c1][ix].sp_dn.s2);
-               _vector_assign(  starting2d[12*f1+3*s1+c1][ix].sp_dn.s2, tmpvec);
-               _su3_multiply(tmpvec, (*upm), starting2d[12*f1+3*s1+c1][ix].sp_dn.s3);
-               _vector_assign(  starting2d[12*f1+3*s1+c1][ix].sp_dn.s3, tmpvec);
-            }
-      for (s1=spinorstart;s1<spinorend; ++s1)
-        for (c1=0; c1<3; ++c1)
-          for (f1=0; f1<2; ++f1){
-            count=0;
-            generic_exchange_direction_nonblocking( starting2d[12*f1+3*s1+c1], sizeof(bispinor), TDOWN, request, &count );
-            MPI_Waitall( count, request, statuses);
+
+      plaquette_energy = measure_plaquette( (const su3**) g_smeared_gauge_field);
+
+
+      if (g_cart_id == 0) {
+          printf("# The computed plaquette value for smeared gauge field is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc));
+          fflush(stdout);
       }
-   }
-// Trace over the Pauli matrices
-   for (tauindex=0; tauindex<3; ++tauindex){
-
-//Trace over flavour degrees of freedom
-      for (i=0; i<T_global; ++i)
-         flavortrace[i]=0.;
-
-      for (f1=0; f1<2; ++f1){
-
-//Trace over spinor indices
-         for (i=0; i<8*T_global; ++i){
-            spinortrace[i]=0.;
-         }
-
-         for (s1=spinorstart; s1<spinorend; ++s1){
-
-//Trace over spatial indices
-            for (i=0; i<8*T_global; ++i){
-               spacetrace[i]=0.;
-            }
-            for (ix=0; ix<VOLUME; ++ix){
-
-//Trace over the color indices for each sites
-               for (c1=0; c1<3; ++c1){
-/*   
-       TYPE VI.1.a OR  VI.1.b                                     (1-gamma5)/2*S(x    ,ytilde)
-       TYPE VI.2.a OR  VI.2.b                   U0(x-2*0)*U0(x-0)*(1-gamma5)/2*S(x    ,ytilde)
-*/
-                  _vector_null(running2d[12*f1 + 3*s1 + c1][ix].sp_up.s0);
-                  _vector_null(running2d[12*f1 + 3*s1 + c1][ix].sp_up.s1);
-                  _vector_null(running2d[12*f1 + 3*s1 + c1][ix].sp_dn.s0);
-                  _vector_null(running2d[12*f1 + 3*s1 + c1][ix].sp_dn.s1);
-                  if ( type_12 == TYPE_2){
-//for the up quark
-                    _vector_assign( running2d[12*f1+3*s1+c1][ix].sp_up.s2, starting2d[12*f1 + 3*s1 + c1][g_idn[ix][TUP]].sp_up.s2);
-                    _vector_assign( running2d[12*f1+3*s1+c1][ix].sp_up.s3, starting2d[12*f1 + 3*s1 + c1][g_idn[ix][TUP]].sp_up.s3);
-
-//for the down quark
-                    _vector_assign( running2d[12*f1+3*s1+c1][ix].sp_dn.s2, starting2d[12*f1 + 3*s1 + c1][g_idn[ix][TUP]].sp_dn.s2);
-                    _vector_assign( running2d[12*f1+3*s1+c1][ix].sp_dn.s3, starting2d[12*f1 + 3*s1 + c1][g_idn[ix][TUP]].sp_dn.s3);
 
-                  }
-                  else if ( type_12 == TYPE_1){
-                    _vector_assign( running2d[12*f1+3*s1+c1][ix].sp_up.s2, propfields[12*s1 + 4*c1 + 2*f1][ix].sp_up.s2 );
-                    _vector_assign( running2d[12*f1+3*s1+c1][ix].sp_up.s3, propfields[12*s1 + 4*c1 + 2*f1][ix].sp_up.s3 );
+      if(SourceInfo.type == 1) {
+          index_start = 0;
+          index_end = 1;
+      }
 
-                    _vector_assign( running2d[12*f1+3*s1+c1][ix].sp_dn.s2, propfields[12*s1 + 4*c1 + 2*f1][ix].sp_dn.s2 );
-                    _vector_assign( running2d[12*f1+3*s1+c1][ix].sp_dn.s3, propfields[12*s1 + 4*c1 + 2*f1][ix].sp_dn.s3 );
-                  }
-/*   
-       TYPE VI.1.a OR  VI.1.b   phi^dagger(x-0)*tau_i*                    (1-gamma5)/2*S(x,ytilde)
-       TYPE VI.2.a OR  VI.2.b   phi^dagger(x-0)*tau_i*  U0(x-2*0)*U0(x-0)*(1-gamma5)/2*S(x,ytilde)
-*/
-                  taui_scalarfield_spinor( &running2d[12*f1 + 3*s1 +c1][ix], &running2d[12*f1 + 3*s1 +c1][ix], GAMMA_DN, tauindex, ix, TDOWN, DAGGER);
 
+      if (g_cart_id == 0) {
+          fprintf(stdout, "#\n"); /*Indicate starting of the operator part*/
+      }
+      for (op_id =0; op_id < no_operators; op_id++){
+          if ( (operator_list[op_id].type== BSM2f) || (operator_list[op_id].type == BSM3) ){
+              if (operator_list[op_id].type== BSM2f){
+                init_D_psi_BSM2f();
+              }
+              else {
+                init_D_psi_BSM3();
+                init_sw_fields(VOLUME);
+
+                //Note here the factor of 1/2. has been applied since
+                //the routine assign_mul_one_sw_pm_imu_site_lexic computes
+                //1+i *csw*\sum_{\mu,nu} \sigma_mu,nuF_mu,nu/2.
+                sw_term( (const su3**) g_smeared_gauge_field, 1.,  csw_BSM/2.);
+              }
+              operator_list[op_id].prop_zero=(bispinor  **)malloc(sizeof(bispinor*)*48);
+              if (operator_list[op_id].prop_zero == NULL){
+                printf("Error in memory allocation for storing the propagators\n");
+                exit(1);
+              }
+              for (int ii=0; ii<48; ++ii){
+                operator_list[op_id].prop_zero[ii]=(bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND);
+                if ( operator_list[op_id].prop_zero[ii] == NULL ){
+                  printf("Error in allocating memory for propagators\n");
+                  exit(1);
+                }
+              }
+              if ( ( vectorcurrentcurrent_BSM == 1 ) || ( axialcurrentcurrent_BSM == 1 )){
+                operator_list[op_id].prop_ntmone=(bispinor  **)malloc(sizeof(bispinor*)*48);
+                if (operator_list[op_id].prop_ntmone == NULL){
+                  printf("Error in memory allocation for storing the propagators\n");
+                  exit(1);
+                }
+                for (int ii=0; ii<48; ++ii){
+                  operator_list[op_id].prop_ntmone[ii]=(bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND);
+                  if ( operator_list[op_id].prop_ntmone[ii] == NULL ){
+                    printf("Error in allocating memory for propagators\n");
+                    exit(1);
+                  }
+                }
+              }
+          }
+          boundary( operator_list[op_id].kappa);
+          g_kappa = operator_list[op_id].kappa;
+          if (g_cart_id ==0) {fprintf(stdout, "#kappa value=%e\n", g_kappa);}
+          g_mu = 0.;
+          if (g_cart_id == 0) printf("# npergauge=%d\n", operator_list[op_id].npergauge);
+
+          if (g_cart_id == 0) printf("# Starting scalar counter is %d for gauge field %d \n", nscalar, nstore );
+          /* support multiple inversions for the BSM operator, one for each scalar field */
+
+          for(int i_pergauge = 0; i_pergauge < operator_list[op_id].npergauge; ++i_pergauge){
+             /* set scalar field counter to InitialScalarCounter */
+             int iscalar = nscalar+j*operator_list[op_id].nscalarstep*operator_list[op_id].npergauge+i_pergauge*operator_list[op_id].nscalarstep;
+             operator_list[op_id].n = iscalar;
+             // read scalar field
+             if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 )
+             {
+                for( int s = 0; s < 4; s++) { ranlxd(g_scalar_field[s], VOLUME); }
+             }
+             else if ( strcmp(scalar_input_filename, "create_unit_scalarfield") == 0 ){             
+                unit_scalar_field(g_scalar_field);
+             }
+             else
+             {
+                snprintf(scalar_filename, 50, "%s.%.8d", scalar_input_filename, iscalar);
+                if (g_cart_id == 0)
+                {
+                    printf("#\n# Trying to read scalar field from file %s in %s precision.\n",
+                       scalar_filename, (scalar_precision_read_flag == 32 ? "single" : "double"));
+                    fflush(stdout);
+                }
+                int i;
+                double read_end, read_begin=gettime();
+
+                if( (i = read_scalar_field_parallel(scalar_filename,g_scalar_field)) !=0)
+                {
+                    fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename);
+                    exit(-2);
+                }
+                read_end=gettime();
+
+                if (g_cart_id == 0) {
+                   printf("# Finished reading scalar field in %.4e seconds.\n",read_end-read_begin); 
+                   fflush(stdout);
+                }
+
+             }//End of reading scalar field
+
+#if defined TM_USE_MPI
+             for( int s=0; s<4; s++ )
+               generic_exchange_nogauge(g_scalar_field[s], sizeof(scalar));
+#endif
+             for( isample = 0; isample < no_samples; isample++)
+             {
+               if (propagatorsonthefly_BSM == 1){
+
+                 if ((g_cart_id == 0 ) && ( (index_start != 0) || (index_end!= 12) ))
+                 {
+                    fprintf(stderr, "Contraction can be computed only with full set of point propagators\n");
+                    exit(1);
+                 }
+
+                 for(ix = index_start; ix < index_end; ix++) {
+                    if (g_cart_id == 0) {
+                      fprintf(stdout, "#\n"); /*Indicate starting of new index*/
+                    }
+
+                    /* we use g_spinor_field[0-7] for sources and props for the moment */
+                    /* 0-3 in case of 1 flavour  */
+                    /* 0-7 in case of 2 flavours */
+
+                    prepare_source(nstore, isample, ix, op_id, read_source_flag, source_location, 0);
+
+//                  if (g_cart_id == 0) printf("Source has been prepared\n\n\n");
+                    //randomize initial guess for eigcg if needed-----experimental
+                    if( (operator_list[op_id].solver == INCREIGCG) && (operator_list[op_id].solver_params.eigcg_rand_guess_opt) )
+                    { //randomize the initial guess
+                        gaussian_volume_source( operator_list[op_id].prop0, operator_list[op_id].prop1,isample,ix,0); //need to check this
+                    } 
+		    
+		    operator_list[op_id].inverter(op_id, index_start, 1);
+
+                 }//end of loop for spinor and color source degrees of freedom
+
+                 if ( ( vectorcurrentcurrent_BSM == 1 ) || ( axialcurrentcurrent_BSM == 1 )){
+                    int tindex=source_location/(LX*g_nproc_x*LY*g_nproc_y*LZ*g_nproc_z);
+                    int tnewindex=(tindex-1+T_global)%T_global;
+                    int spatialindex=source_location % (LX*g_nproc_x*LY*g_nproc_y*LZ*g_nproc_z);
+                    int backsource=tnewindex*(LX*g_nproc_x*LY*g_nproc_y*LZ*g_nproc_z)+spatialindex;
+
+                    for(ix = index_start; ix < index_end; ix++) {
+                      if (g_cart_id == 0) {
+                        fprintf(stdout, "#\n"); /*Indicate starting of new index*/
+                      }
+
+                      /* we use g_spinor_field[0-7] for sources and props for the moment */
+                      /* 0-3 in case of 1 flavour  */
+                      /* 0-7 in case of 2 flavours */
+
+                      prepare_source(nstore, isample, ix, op_id, read_source_flag, backsource, 0);
+
+                      //randomize initial guess for eigcg if needed-----experimental
+                      if( (operator_list[op_id].solver == INCREIGCG) && (operator_list[op_id].solver_params.eigcg_rand_guess_opt) )
+                      { //randomize the initial guess
+                        gaussian_volume_source( operator_list[op_id].prop0, operator_list[op_id].prop1,isample,ix,0); //need to check this
+                      }
+
+                      operator_list[op_id].inverter(op_id, index_start, 1);
+                    }//end of loop for spinor and color source degrees of freedom
+
+                 }//end of vectorcurrentcurrent_BSM == 1
 
                }
+               else{
+                 for(src_idx = 0; src_idx < 12; src_idx++ )
+                 {
+                    snprintf(prop_fname,200,"bsm2prop.%.4d.%.2d.%02d.%.8d.inverted",nstore, isample, src_idx, iscalar);
 
-            }
-/*   
-       TYPE VI.1.a OR  VI.1.b   S(ytilde, x)    *phi^dagger(x-0)*tau_i*                    (1-gamma5)/2*S(x,ytilde)
-       TYPE VI.2.a OR  VI.2.b   S(ytilde, x-2*0)*phi^dagger(x-0)*tau_i*  U0(x-2*0)*U0(x-0)*(1-gamma5)/2*S(x,ytilde)
-*/
+                    for(pos = 0; pos < 8; ){
+                       printf("READCHECK: Propagator in pos %02d from file %s\n", pos/2,prop_fname);
 
-/**************************
-Multiplication with Stilde(x-2*0,ytilde)P(x) in three steps:
-1; Gathering P(x) from direction +0
-2; Multiplying Stilde(x-O,ytilde) with P(x+0)
-3; Gathering The result in direction -0
+//read the propagator from source d to sink d 
+                       read_spinor(g_spinor_field[0], g_spinor_field[1], prop_fname, pos);
+                       convert_eo_to_lexic(temp_field[0], g_spinor_field[0], g_spinor_field[1]);
+                       pos+=1;
 
-**************************/
+//read the propagator from source d to sink u
+                       read_spinor(g_spinor_field[0], g_spinor_field[1], prop_fname, pos);
+                       convert_eo_to_lexic(temp_field[1], g_spinor_field[0], g_spinor_field[1]);
+                       pos+=1;
 
+//create a bispinor first insert sink u then sink d
+//Store them in such a way that the u-ones should come first
+                       compact(operator_list[op_id].prop_zero[pos > 4 ? src_idx*4+pos/2-3 : src_idx*4+pos/2+ 1], temp_field[1], temp_field[0]);
+                    }
 
-            if ( type_12 == TYPE_2 ) {
-              for (c1=0; c1<3; ++c1){
-                 count=0;
-                 generic_exchange_direction_nonblocking( running2d[12*f1+3*s1+c1], sizeof(bispinor), TUP, request, &count );
-                 MPI_Waitall( count, request, statuses);
-              }
-            }
-            for (ix = 0; ix< VOLUME; ++ix){
-               for (c1=0; c1<3; ++c1){
-                  if (type_12 == TYPE_2){
-                    multiply_backward_propagator(&tmpbisp2d[12*f1+3*s1+c1][ix], propfields, &running2d[12*f1+3*s1+c1][g_iup[ix][TUP]], ix, TDOWN);
-                  }
-                  else if (type_12 == TYPE_1){
-                    multiply_backward_propagator(&running2d[12*f1+3*s1+c1][ix], propfields, &running2d[12*f1+3*s1+c1][ix]            , ix, NODIR);
-                  }
+                 }//end of loop for spinor and color source degrees of freedom
+
+                 if ( (vectorcurrentcurrent_BSM == 1 ) || ( axialcurrentcurrent_BSM == 1 )){
+                   for(src_idx = 0; src_idx < 12; src_idx++ )
+                   {
+                      snprintf(prop_fname,200,"bsm2prop.%.4d.%.2d.%02d.%.8d.inverted",nstore, T_global-1, src_idx, iscalar);
+                      for(pos = 0; pos < 8; ){
+                        printf("READCHECK: Propagator in pos %02d from file %s\n", pos/2,prop_fname);
+//read the propagator from source d to sink d 
+                        read_spinor(g_spinor_field[0], g_spinor_field[1], prop_fname, pos);
+                        convert_eo_to_lexic(temp_field[0], g_spinor_field[0], g_spinor_field[1]);
+                        pos+=1;
+
+//read the propagator from source d to sink u
+                        read_spinor(g_spinor_field[0], g_spinor_field[1], prop_fname, pos);
+                        convert_eo_to_lexic(temp_field[1], g_spinor_field[0], g_spinor_field[1]);
+                        pos+=1;
+//create a bispinor first insert sink u then sink d
+//Store them in such a way that the u-ones should come first
+                        compact(operator_list[op_id].prop_ntmone[pos > 4 ? src_idx*4+pos/2-3 : src_idx*4+pos/2+ 1], temp_field[1], temp_field[0]);
+                     }
+
+                   }//end of loop for spinor and color source degrees of freedom
+                 }
                }
-            }
-            if ( type_12 == TYPE_2 ) {
-               for (c1=0; c1<3; ++c1){
-                  count=0;
-                  generic_exchange_direction_nonblocking( tmpbisp2d[12*f1+3*s1+c1], sizeof(bispinor), TDOWN, request, &count );
-                  MPI_Waitall( count, request, statuses);
+               if (g_cart_id == 0){
+                    snprintf(contractions_fname,200,"bsmcontractions.%.4d.%d.%.8d",nstore, isample, iscalar);
                }
-               for (ix = 0; ix< VOLUME; ++ix){
-                  for (c1=0; c1<3; ++c1){
-                     if (type_12 == TYPE_2){
-                        _spinor_assign( running2d[12*f1+3*s1+c1][ix].sp_up, tmpbisp2d[12*f1 + 3*s1 +c1][g_idn[ix][TUP]].sp_up );
-                        _spinor_assign( running2d[12*f1+3*s1+c1][ix].sp_dn, tmpbisp2d[12*f1 + 3*s1 +c1][g_idn[ix][TUP]].sp_dn );
-                     }
-                  }
+
+               if (smearedcorrelator_BSM == 1){
+                smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                if (g_cart_id == 0) printf("Smeared : %e\t Non smeared %e\n", g_smeared_scalar_field[0][0],g_scalar_field[0][0]);
+                 for ( int s=0; s<4; s++ )
+                   generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(scalar));
                }
-            }
-
-            for (ix = 0; ix< VOLUME; ++ix){
-//Trace over the color indices for each sites
-               for (i=0; i<8; ++i)
-                  colortrace[i]=0.;
-               for (c1=0; c1<3; ++c1){
-                  //delta( color component of bispinor running, c1) for all spinor and flavor indices
-                  trace_in_color(colortrace,&running2d[12*f1 + 3*s1 +c1][ix],c1);
-               
-               }  //End of trace in color
-               //sum over all lattice sites the result of the color trace
-               trace_in_space(spacetrace,colortrace,ix);
-
-            } //End of trace in space
-
-
-//Gather the results from all nodes to complete the trace in space
-            for (i=0; i<T_global; ++i){
-               _Complex double tmp;
-               MPI_Allreduce(&spacetrace[i], &tmp, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
-               spacetrace[i]= tmp;
-            }
-            // delta (spinor components of spacetrace, s1) for all time slices and flavor indices
-            trace_in_spinor(spinortrace, spacetrace, s1);
-
-         }//End of trace in spinor space
-
-/*   
-       TYPE VI.1.a    tau_i*phi(ytilde)*       (1+gamma5)/2*S(ytilde, x)        *phi^dagger(x-0)*tau_i*                    (1-gamma5)/2*S(x,ytilde)
-       TYPE VI.1.b    phi^dagger(ytilde)*tau_i*(1-gamma5)/2*S(ytilde, x)        *phi^dagger(x-0)*tau_i*                    (1-gamma5)/2*S(x,ytilde)
-
-       TYPE VI.2.a    tau_i*phi(ytilde)*       (1+gamma5)/2*S(ytilde, x-2*0)    *phi^dagger(x-0)*tau_i*  U0(x-2*0)*U0(x-0)*(1-gamma5)/2*S(x,ytilde)
-       TYPE VI.2.b    phi^dagger(ytilde)*tau_i*(1-gamma5)/2*S(ytilde, x-2*0)    *phi^dagger(x-0)*tau_i*  U0(x-2*0)*U0(x-0)*(1-gamma5)/2*S(x,ytilde)
-
-*/
-         if ( type_ab == TYPE_A ){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, NO_DAGG );
-         }
-         else if ( type_ab == TYPE_B ){
-           taui_scalarfield_flavoronly( spinortrace, tauindex, DAGGER  );
-         }
-         //delta(flavor component in spinortrace, f1) for all time slices
-         trace_in_flavor( flavortrace, spinortrace, f1 );
-
-      } //End of trace in flavor space
-      //sum for all Pauli matrices
-      for (i=0;i<T_global; ++i)
-         paulitrace[i]+=flavortrace[i];
-   } //End of trace for Pauli matrices
-
-
-   if (g_cart_id == 0){printf("Wilson term Dirac Current Density correlator typeVI results= %10.10e %10.10e\n", type_12 == TYPE_1 ? '1' : '2',type_ab == TYPE_A ? 'a' :'b');}
-   for (i=0; i<T_global; ++i){
-      if (g_cart_id == 0){
-        printf("%3d %10.10e %10.10e\n", i, creal(paulitrace[i]), cimag(paulitrace[i]));
-      }
-   }
-   free(flavortrace);
-   free(paulitrace);
-   free(spacetrace);
-   free(spinortrace);
-   free(colortrace);
-   free(request);
-}
-void main(int argc, char *argv[]){
-  FILE *parameterfile = NULL;
-  char datafilename[206];
-  char parameterfilename[206];
-  char conf_filename[50];
-  char scalar_filename[50];
-  char * input_filename = NULL;
-  char * filename = NULL;
-  double plaquette_energy;
-  int i;
-  char prop_fname[200];
-  int src_idx, pos;
-  int count;
-  int status_geo;
-  MPI_Status  statuses[8];
-  MPI_Request *request;
-  spinor *tmpspinoru;
-  spinor *tmpspinord;
-  request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
+               if (g_cart_id == 0) {
+                 printf("Following measurements will be done\n");
+                 if (vectorcurrentcurrent_BSM == 1) printf("#Vectorcurrentcurrent3 correlation function\n");
+                 if (axialcurrentcurrent_BSM == 1) printf("#Axialcurrentcurrent1 correlation function trivial scalar\n");
+                 if (densitydensity_BSM == 1) printf("#Density Density correlation function\n");
+                 if (densitydensity_s0s0_BSM == 1) printf("#Density Density s0s0-p0p0 using trivial scalar field\n");
+                 if (densitydensity_sxsx_BSM == 1) printf("#Density Density sxsx-pxpx using trivial scalar field\n");
+                 if (diraccurrentdensity_BSM == 1) printf("#Dirac current density correlation function\n");
+                 if (wilsoncurrentdensitypr1_BSM == 1) printf("#Wilson  current density PR1 correlation function\n");
+                 if (wilsoncurrentdensitypr2_BSM == 1) printf("#Wilson  current density PR2 correlation function\n");
+                 if (wilsoncurrentdensitypl1_BSM == 1) printf("#Wilson  current density PL1 correlation function\n");
+                 if (wilsoncurrentdensitypl2_BSM == 1) printf("#Wilson  current density PL2 correlation function\n");
+                 if (vectorcurrentdensity_BSM == 1) printf("#JtildeV3 D3, JtildeV1 P2, JtildeV2 P1 to be (JtildeV1 P2 nad JtildeV2 P1 with trivial scalar calculated\n");
+                 if (axialcurrentdensity_BSM == 1) printf("#JtildeA1 P1, JtildeA2 P2 to be calculated\n");
+                 if (pdensityvectordensity_BSM == 1) printf("#P density times vector density (nontrivial scalar) to be calculated\n");
 
-  MPI_Init(&argc, &argv);
+               }
+               scalar=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+               pseudoscalar=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+               current=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+               if ( scalar == NULL || pseudoscalar == NULL || current == NULL){
+                 printf("Error in memory allocation for scalar pseudoscalar and current\n");
+                 exit(1);
+               }
+               pscalar1=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+               pscalar2=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+               pscalar3=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
 
-  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
+               if (pscalar1 == NULL || pscalar2 == NULL || pscalar3 == NULL){
+                 printf("Error in allocating memory for storing pseudoscalar results\n");
+                 exit(1);
+               }
 
-  process_args(argc, argv, &input_filename,&filename);
-  set_default_filenames(&input_filename, &filename);
+               scalar1=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+               scalar2=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+               scalar3=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
 
-  /* Read the input file */
-  if( (i = read_input(input_filename)) != 0) {
-    fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename);
-    exit(-1);
-  }
+               if (scalar1 == NULL || scalar2 == NULL || scalar3 == NULL){
+                 printf("Error in allocating memory for storing scalar results\n");
+                 exit(1);
+               }
 
-  if(g_proc_id==0) {
-   printf("parameter rho_BSM set to %f\n", rho_BSM);
-   printf("parameter eta_BSM set to %f\n", eta_BSM);
-   printf("parameter  m0_BSM set to %f\n",  m0_BSM);
-  }
+               current1=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+               current2=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
+               current3=(_Complex double *)malloc(sizeof(_Complex double)*T_global);
 
-#ifdef OMP
-  init_openmp();
-#endif
-  tmlqcd_mpi_init(argc, argv);
-  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
+               if (current1 == NULL || current2 == NULL || current3 == NULL){
+                 printf("Error in allocating memory for storing current results\n");
+                 exit(1);
+               }
 
-  init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);
 
-  int numbScalarFields = 4;
-  i = init_scalar_field(VOLUMEPLUSRAND, numbScalarFields);
-  if ( i!= 0) {
-    fprintf(stderr, "Not enough memory for scalar fields! Aborting...\n");
-    exit(0);
-  }
-  if(g_proc_id == 0) {
-     fprintf(stdout,"# The number of processes is %d \n",g_nproc);
-     printf("# The lattice size is %d x %d x %d x %d\n",
-        (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ));
-     printf("# The local lattice size is %d x %d x %d x %d\n",
-        (int)(T), (int)(LX), (int)(LY),(int) LZ);
-     fflush(stdout);
-  }
-        /* define the geometry */
-  geometry();
 
-  boundary(-1.0);
+               for (int ii=0;ii<T_global; ++ii){
+                 scalar[ii]=0.0;
+                 pseudoscalar[ii]=0.0;
+                 current[ii]=0.0;
+                 pscalar1[ii]=0.0;
+                 pscalar2[ii]=0.0;
+                 pscalar3[ii]=0.0;
+                 scalar1[ii]=0.0;
+                 scalar2[ii]=0.0;
+                 scalar3[ii]=0.0;
+                 current1[ii]=0.0;
+                 current2[ii]=0.0;
+                 current3[ii]=0.0;
+               }
+               if (g_cart_id == 0){ /* only the process with g_cart_id 0 opens (and later writes) the output file */
+                  out=fopen(contractions_fname,"a"); /* append mode: existing contents are kept */
+                  if (out == NULL){
+                    printf("Error in opening file for storing contractions %s\n",contractions_fname); /* report the path that was actually opened */
+                    exit(1);
+                  }
+               }
+               if ( vectorcurrentcurrent_BSM == 1){
+                 vector_axial_current_current_1234(operator_list[op_id].prop_zero, operator_list[op_id].prop_ntmone, TYPE_1, 2, 0, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_current_1234(operator_list[op_id].prop_zero, operator_list[op_id].prop_ntmone, TYPE_2, 2, 0, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_current_1234(operator_list[op_id].prop_zero, operator_list[op_id].prop_ntmone, TYPE_3, 2, 0, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_current_1234(operator_list[op_id].prop_zero, operator_list[op_id].prop_ntmone, TYPE_4, 2, 0, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+
+                 if (g_cart_id == 0){
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JTILDEV3JTILDEV3\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+               }
+               if ( axialcurrentcurrent_BSM == 1 ){
+
+                 unit_scalar_field(g_scalar_field);
+                 for( int s=0; s<4; s++ ) /* size is taken from the field element: in this scope `scalar` is the local correlator pointer, so sizeof(scalar) is a pointer size */
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(*g_scalar_field[s]));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(*g_smeared_scalar_field[s]));
+                 }
+
+
+                 vector_axial_current_current_1234(operator_list[op_id].prop_zero, operator_list[op_id].prop_ntmone, TYPE_1, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_current_1234(operator_list[op_id].prop_zero, operator_list[op_id].prop_ntmone, TYPE_2, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_current_1234(operator_list[op_id].prop_zero, operator_list[op_id].prop_ntmone, TYPE_3, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_current_1234(operator_list[op_id].prop_zero, operator_list[op_id].prop_ntmone, TYPE_4, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+
+                 if (g_cart_id == 0){
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JTILDEA1JTILDEA1\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+
+                 double read_end, read_begin=gettime();
+                 if( (i = read_scalar_field_parallel(scalar_filename,g_scalar_field)) !=0 )
+                 {
+                    fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename);
+                    exit(-2);
+                 }
+                 read_end=gettime();
+                 if (g_cart_id == 0) {
+                   printf("# Finished reading scalar field in %.4e seconds.\n",read_end-read_begin);
+                   fflush(stdout);
+                 }
+                 for( int s=0; s<4; s++ ) /* size is taken from the field element: in this scope `scalar` is the local correlator pointer, so sizeof(scalar) is a pointer size */
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(*g_scalar_field[s]));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(*g_smeared_scalar_field[s]));
+                 }
+                 
+               }
+               if (giancarlo_BSM == 1){ /* add -temp from giancarlodensity into current, print it on g_cart_id 0, then clear current */
+                 giancarlodensity( operator_list[op_id].prop_zero, 0, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 for (int ii=0; ii<T_global; ++ii){
+                   if (g_cart_id == 0) /* `out` is only opened on g_cart_id 0 */
+                     fprintf(out,"GIANCARLOUNITYNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   current[ii]=0.0; /* reset on every process: the axialcurrentdensity_BSM section adds into current without clearing it first */
+                 }
+               }
+               if (vectorcurrentdensity_BSM == 1){
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+                 giancarlodensity( operator_list[op_id].prop_zero, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"GIANCARLOTAU3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_1,2, 2, 0, 0, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_2,2, 2, 0, 0, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(+1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_3,2, 2, 0, 0, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_4,2, 2, 0, 0, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(+1.)*temp[ii];
+                 }
+                 free(temp);
+                 temp= NULL;
+                 if (g_cart_id == 0){
+//                 fprintf(out,"S1S1nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JTILDEV3DS3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+
+
+                 unit_scalar_field(g_scalar_field);
+                 for( int s=0; s<4; s++ ) /* size is taken from the field element: in this scope `scalar` is the local correlator pointer, so sizeof(scalar) is a pointer size */
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(*g_scalar_field[s]));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(*g_smeared_scalar_field[s]));
+                 }
+
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_1,0, 1, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_2,0, 1, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(+1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_3,0, 1, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_4,0, 1, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(+1.)*temp[ii];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+//                 fprintf(out,"S1S1nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JTILDEV2P1TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_1,1, 0, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_2,1, 0, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(+1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_3,1, 0, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_4,1, 0, 0, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(+1.)*temp[ii];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){         
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JTILDEV1P2TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 } 
+
+
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+
+                 double read_end, read_begin=gettime();
+                 if( (i = read_scalar_field_parallel(scalar_filename,g_scalar_field)) !=0 )
+                 {
+                    fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename);
+                    exit(-2);
+                 }
+                 read_end=gettime();
+                 if (g_cart_id == 0) {
+                   printf("# Finished reading scalar field in %.4e seconds.\n",read_end-read_begin);
+                   fflush(stdout);
+                 }
+                 for( int s=0; s<4; s++ ) /* size is taken from the field element: in this scope `scalar` is the local correlator pointer, so sizeof(scalar) is a pointer size */
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(*g_scalar_field[s]));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(*g_smeared_scalar_field[s]));
+                 }
+                  
+               }
+               if (vectordensitydensity_BSM == 1){
+
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+
+                 vector_density_density_1234(operator_list[op_id].prop_zero, TYPE_1,2, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   scalar[ii]+=(+1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_density_density_1234(operator_list[op_id].prop_zero, TYPE_2,2, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   scalar[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_density_density_1234(operator_list[op_id].prop_zero, TYPE_3,2, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   scalar[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_density_density_1234(operator_list[op_id].prop_zero, TYPE_4,2, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   scalar[ii]+=(+1.)*temp[ii];
+                 }
+                 free(temp);
+                 temp= NULL;
+                 if (g_cart_id == 0){
+//                 fprintf(out,"S1S1nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"VECTORDENSITY3DENSITY3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(scalar[ii]), cimag(scalar[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
 
-  status_geo = check_geometry();
-  if (status_geo != 0) {
-    fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n");
-    exit(1);
-  }
-  if (even_odd_flag) {
-    i = init_spinor_field(VOLUMEPLUSRAND / 2, 2);
-  }
-  else {
-    i = init_spinor_field(VOLUMEPLUSRAND, 2);
-  }
+               }
+               if (axialcurrentdensity_BSM == 1){
+                 unit_scalar_field(g_scalar_field);
+                 for( int s=0; s<4; s++ ) /* size is taken from the field element: in this scope `scalar` is the local correlator pointer, so sizeof(scalar) is a pointer size */
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(*g_scalar_field[s]));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(*g_smeared_scalar_field[s]));
+                 }
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_1, 0, 0, 1, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_2, 0, 0, 1, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(+1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_3, 0, 0, 1, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 vector_axial_current_density_1234(operator_list[op_id].prop_zero, TYPE_4, 0, 0, 1, 1, &temp );
+                 for (int ii=0; ii<T_global; ++ii){
+                   current[ii]+=(+1.)*temp[ii];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+//                 fprintf(out,"S1S1nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JTILDEA1P1TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
 
-  start_ranlux(1, 123456);
-  i = init_bispinor_field(VOLUMEPLUSRAND, 48);
-  if ( i!= 0) {
-    fprintf(stderr, "Not enough memory for bispinor fields! Aborting...\n");
-    exit(0);
-  }
-  sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore);
-  if (g_cart_id == 0) {
-    printf("#\n# Trying to read gauge field from file %s in %s precision.\n",
-           conf_filename, (gauge_precision_read_flag == 32 ? "single" : "double"));
-    fflush(stdout);
-  }
-  if ( (i = read_gauge_field(conf_filename,g_gauge_field)) !=0) {
-    fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename);
-    exit(-2);
-  }
-  if (g_cart_id == 0) {
-    printf("# Finished reading gauge field.\n");
-    fflush(stdout);
-  }
-  sprintf(scalar_filename, "%s.%d", scalar_input_filename, nscalar);
-  if (g_cart_id == 0) {
-    printf("#\n# Trying to read scalar field from file %s in %s precision.\n",
-           scalar_filename, (scalar_precision_read_flag == 32 ? "single" : "double"));
-    fflush(stdout);
-  }
-  if ( (i = read_scalar_field_parallel(scalar_filename,g_scalar_field)) !=0) {
-    fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename);
-    exit(-2);
-  }
-  if (g_cart_id == 0) {
-    printf("# Finished reading scalar field.\n");
-    fflush(stdout);
-  }
-  g_smearedscalar=(scalar *)malloc(sizeof(scalar *)*4);
-  for (i=0; i<4; ++i)
-    g_smearedscalar[i]= (scalar *)malloc(sizeof(scalar)*(VOLUMEPLUSRAND));
-  smear_scalar_fields(g_scalar_field, g_smearedscalar);
+               }
 
-  xchange_gauge(g_gauge_field);
-  /*compute the energy of the gauge field*/
-  plaquette_energy = measure_plaquette( (const su3**) g_gauge_field);
 
 
-  if (g_cart_id == 0) {
-    printf("# The computed plaquette value is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc));
-    fflush(stdout);
-  }
+               if (densitydensity_BSM == 1){
+                 double read_end, read_begin=gettime();
+                 if( (i = read_scalar_field_parallel(scalar_filename,g_scalar_field)) !=0 )
+                 {
+                    fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename);
+                    exit(-2);
+                 }
+                 read_end=gettime();
+                 if (g_cart_id == 0) {
+                   printf("# Finished reading scalar field in %.4e seconds.\n",read_end-read_begin);
+                   fflush(stdout);
+                 }
+                 for( int s=0; s<4; s++ ) /* size is taken from the field element: in this scope `scalar` is the local correlator pointer, so sizeof(scalar) is a pointer size */
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(*g_scalar_field[s]));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(*g_smeared_scalar_field[s]));
+                 }
+
+                 density_density_1234(operator_list[op_id].prop_zero, TYPE_1, &temp);
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=(-1.)*temp[ii           ];
+                   pscalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   pscalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   pseudoscalar[ii] +=(-1.)*temp[ii+3*T_global];
+
+                   scalar1[ii]+=(-1.)*temp[ii           ];
+                   scalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   scalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   scalar[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+//                 density_density_1234_petros(operator_list[op_id].prop);
+                 density_density_1234(operator_list[op_id].prop_zero, TYPE_2, &temp);
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=temp[ii           ];
+                   pscalar2[ii]+=temp[ii+1*T_global];
+                   pscalar3[ii]+=temp[ii+2*T_global];
+                   pseudoscalar[ii] +=temp[ii+3*T_global];
+
+                   scalar1[ii]+=(-1.)*temp[ii           ];
+                   scalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   scalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   scalar[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 density_density_1234(operator_list[op_id].prop_zero, TYPE_3, &temp);
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=temp[ii           ];
+                   pscalar2[ii]+=temp[ii+1*T_global];
+                   pscalar3[ii]+=temp[ii+2*T_global];
+                   pseudoscalar[ii] +=temp[ii+3*T_global];
+
+                   scalar1[ii]+=(-1.)*temp[ii           ];
+                   scalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   scalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   scalar[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 density_density_1234(operator_list[op_id].prop_zero, TYPE_4, &temp);
+                 for (int ii=0; ii<T_global; ++ii){      
+                   pscalar1[ii]+=(-1.)*temp[ii           ];
+                   pscalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   pscalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   pseudoscalar[ii] +=(-1.)*temp[ii+3*T_global];
+
+                   scalar1[ii]+=(-1.)*temp[ii           ];
+                   scalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   scalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   scalar[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+//                 fprintf(out,"S1S1nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"S1S1NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(scalar1[ii]), cimag(scalar1[ii])); 
+                   }
+//                 fprintf(out,"S2S2nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"S2S2NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(scalar2[ii]), cimag(scalar2[ii]));
+                   }
+//                 fprintf(out,"S3S3nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"S3S3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(scalar3[ii]), cimag(scalar3[ii]));
+                   }
+//                 fprintf(out,"SSnontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"SSNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(scalar[ii]), cimag(scalar[ii]));
+                   }
+//                 fprintf(out,"S1S1nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P1P1NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar1[ii]), cimag(pscalar1[ii]));
+                   }
+//                 fprintf(out,"S2S2nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P2P2NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar2[ii]), cimag(pscalar2[ii]));
+                   }
+//                 fprintf(out,"PS3PS3nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P3P3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar3[ii]), cimag(pscalar3[ii]));
+                   }
+//                 fprintf(out,"SSnontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"PPNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pseudoscalar[ii]), cimag(pseudoscalar[ii]));
+                   }
+
+                 }
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+               }
 
-  for( int s=0; s<numbScalarFields; s++ ){
-    count=0;
-    generic_exchange_direction_nonblocking( g_scalar_field[s], sizeof(scalar), TDOWN, request, &count );
-    MPI_Waitall( count, request, statuses);
-    count=0;
-    generic_exchange_direction_nonblocking( g_scalar_field[s], sizeof(scalar), TUP  , request, &count );
-    MPI_Waitall( count, request, statuses);
-  }
+               if (pdensityvectordensity_BSM == 1){ /* writes the P?DP? correlators twice: current smearing setting, then smearing forced off */
+                 double read_end, read_begin=gettime();
+                 if( (i = read_scalar_field_parallel(scalar_filename,g_scalar_field)) !=0 )
+                 {
+                    fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename);
+                    exit(-2);
+                 }
+                 read_end=gettime();
+                 if (g_cart_id == 0) {
+                   printf("# Finished reading scalar field in %.4e seconds.\n",read_end-read_begin);
+                   fflush(stdout);
+                 }
+                 for( int s=0; s<4; s++ ) /* NOTE(review): `scalar` is an accumulator object in this scope; confirm sizeof(scalar) is the intended element size */
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(scalar));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(scalar));
+                 }
+                 /* Pass 1: smearedcorrelator_BSM is left exactly as configured. */
+                 /* All accumulators are cleared; only pscalar1..3 and pseudoscalar are filled below. */
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+                 density_ptau_density_vector( operator_list[op_id].prop_zero, TYPE_1,&temp); /* TYPE_1 enters with weight -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=(-1.)*temp[ii           ];
+                   pscalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   pscalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   pseudoscalar[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp); /* buffer handed back through &temp is released after each call */
+                 density_ptau_density_vector( operator_list[op_id].prop_zero, TYPE_2,&temp); /* TYPE_2 enters with weight +1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=(+1.)*temp[ii           ];
+                   pscalar2[ii]+=(+1.)*temp[ii+1*T_global];
+                   pscalar3[ii]+=(+1.)*temp[ii+2*T_global];
+                   pseudoscalar[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+                   /* pscalar1: sum over temp[0 .. T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P1DP1NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar1[ii]), cimag(pscalar1[ii]));
+                   }
+                   /* pscalar2: sum over temp[T_global .. 2*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P2DP2NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar2[ii]), cimag(pscalar2[ii]));
+                   }
+                   /* pscalar3: sum over temp[2*T_global .. 3*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P3DP3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar3[ii]), cimag(pscalar3[ii]));
+                   }
+                   /* pseudoscalar: sum over temp[3*T_global .. 4*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"PDPNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pseudoscalar[ii]), cimag(pseudoscalar[ii]));
+                   }
+
+                 }
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+                 const int smearing_requested = smearedcorrelator_BSM; /* remember the configured setting so it can be restored */
+                 smearedcorrelator_BSM = 0; /* Pass 2: same correlators with the smearing flag forced off */
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+                 density_ptau_density_vector( operator_list[op_id].prop_zero, TYPE_1,&temp); /* TYPE_1 enters with weight -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=(-1.)*temp[ii           ];
+                   pscalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   pscalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   pseudoscalar[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 density_ptau_density_vector( operator_list[op_id].prop_zero, TYPE_2,&temp); /* TYPE_2 enters with weight +1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=(+1.)*temp[ii           ];
+                   pscalar2[ii]+=(+1.)*temp[ii+1*T_global];
+                   pscalar3[ii]+=(+1.)*temp[ii+2*T_global];
+                   pseudoscalar[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+                   /* pscalar1, flag-off pass */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P1DP1NONSMEAREDNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar1[ii]), cimag(pscalar1[ii]));
+                   }
+                   /* pscalar2, flag-off pass */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P2DP2NONSMEAREDNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar2[ii]), cimag(pscalar2[ii]));
+                   }
+                   /* pscalar3, flag-off pass */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P3DP3NONSMEAREDNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar3[ii]), cimag(pscalar3[ii]));
+                   }
+                   /* pseudoscalar, flag-off pass */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"PDPNONSMEAREDNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pseudoscalar[ii]), cimag(pseudoscalar[ii]));
+                   }
+
+                 }
+                 
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+                 /* Restore the saved value rather than hard-coding 1, so a run with smearing off stays off. */
+                 smearedcorrelator_BSM = smearing_requested;
 
+                 
+               }
 
-  spinor ** temp_field = NULL;
-  init_solver_field(&temp_field, VOLUMEPLUSRAND, 2);
-  for( src_idx = 0; src_idx < 12; src_idx++ ){
-    snprintf(prop_fname,200,"bsm2prop.0400.00.%02d.000.inverted",src_idx);
-    for(pos = 0; pos < 8; ){
-      printf("READCHECK: Propagator in pos %02d from file %s\n", pos/2,prop_fname);
-   
-//read the propagator from source d to sink d 
-      read_spinor(g_spinor_field[0], g_spinor_field[1], prop_fname, pos);
-      convert_eo_to_lexic(temp_field[0], g_spinor_field[0], g_spinor_field[1]);
-      pos+=1;
 
-//read the propagator from source d to sink u
-      read_spinor(g_spinor_field[0], g_spinor_field[1], prop_fname, pos);
-      convert_eo_to_lexic(temp_field[1], g_spinor_field[0], g_spinor_field[1]);
-      pos+=1;
-//create a bispinor first insert sink u then sink d
-//Store them in such a way that the u-ones should come first
-      compact(g_bispinor_field[pos > 4 ? src_idx*4+pos/2-3 : src_idx*4+pos/2+ 1], temp_field[1], temp_field[0]);
-    }
-  }
-  if (g_cart_id == 0) printf("Reading is successfull\n");
+               if (densitydensity_s0s0_BSM == 1){ /* writes S0S0TRIVIAL and P0P0TRIVIAL, then reloads the scalar field from file */
+                 /* Runs after unit_scalar_field(g_scalar_field); the field from scalar_filename is read back at the end of the block. */
+                 unit_scalar_field(g_scalar_field);
+                 for( int s=0; s<4; s++ ) /* NOTE(review): `scalar` is an accumulator object in this scope; confirm sizeof(scalar) is the intended element size */
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(scalar));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM);
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(scalar));
+                 }
+                 density_density_1234_s0s0(operator_list[op_id].prop_zero, TYPE_1, &temp); /* weights: pseudoscalar -1, scalar -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pseudoscalar[ii] +=(-1.0)*temp[ii];
+                   scalar[ii] +=(-1.)*temp[ii];
+                 }
+                 free(temp);                  
+                 density_density_1234_s0s0(operator_list[op_id].prop_zero, TYPE_2, &temp); /* weights: pseudoscalar +1, scalar -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pseudoscalar[ii] +=temp[ii];
+                   scalar[ii] +=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 density_density_1234_s0s0(operator_list[op_id].prop_zero, TYPE_3, &temp); /* weights: pseudoscalar +1, scalar -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pseudoscalar[ii] +=temp[ii];
+                   scalar[ii] +=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 density_density_1234_s0s0(operator_list[op_id].prop_zero, TYPE_4, &temp); /* weights: pseudoscalar -1, scalar -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pseudoscalar[ii] +=(-1.)*temp[ii];
+                   scalar[ii] +=(-1.)*temp[ii];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+                   /* scalar accumulator, one line per time slice */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"S0S0TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(scalar[ii]), cimag(scalar[ii]));
+                   }
+                   /* pseudoscalar accumulator, one line per time slice */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P0P0TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pseudoscalar[ii]), cimag(pseudoscalar[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){ /* clear every accumulator for the next measurement block */
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+                 double read_end, read_begin=gettime(); /* timing for the re-read of the scalar field below */
+                 if( (i = read_scalar_field_parallel(scalar_filename,g_scalar_field)) !=0 )
+                 {
+                    fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename);
+                    exit(-2);
+                 }
+                 read_end=gettime();
+                 if (g_cart_id == 0) {
+                   printf("# Finished reading scalar field in %.4e seconds.\n",read_end-read_begin);
+                   fflush(stdout);
+                 }
+                 for( int s=0; s<4; s++ )
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(scalar));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(scalar));
+                 }
+               }
+ 
+               if (densitydensity_sxsx_BSM ==1){ /* writes the S?S?TRIVIAL and P?P?TRIVIAL correlators, then reloads the scalar field from file */
+                 unit_scalar_field(g_scalar_field);
+                 for( int s=0; s<4; s++ ) /* NOTE(review): `scalar` is an accumulator object in this scope; confirm sizeof(scalar) is the intended element size */
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(scalar));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(scalar));
+                 }
+                 density_density_1234_sxsx(operator_list[op_id].prop_zero, TYPE_1, &temp); /* weights: pscalar*/pseudoscalar -1, scalar* -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=(-1.)*temp[ii           ];
+                   pscalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   pscalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   pseudoscalar[ii] +=(-1.)*temp[ii+3*T_global];
+
+                   scalar1[ii]+=(-1.)*temp[ii           ];
+                   scalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   scalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   scalar[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 density_density_1234_sxsx(operator_list[op_id].prop_zero, TYPE_2, &temp); /* weights: pscalar*/pseudoscalar +1, scalar* -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=temp[ii           ];
+                   pscalar2[ii]+=temp[ii+1*T_global];
+                   pscalar3[ii]+=temp[ii+2*T_global];
+                   pseudoscalar[ii] +=temp[ii+3*T_global];
+
+                   scalar1[ii]+=(-1.)*temp[ii           ];
+                   scalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   scalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   scalar[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 density_density_1234_sxsx(operator_list[op_id].prop_zero, TYPE_3, &temp); /* weights: pscalar*/pseudoscalar +1, scalar* -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=temp[ii           ];
+                   pscalar2[ii]+=temp[ii+1*T_global];
+                   pscalar3[ii]+=temp[ii+2*T_global];
+                   pseudoscalar[ii] +=temp[ii+3*T_global];
+
+                   scalar1[ii]+=(-1.)*temp[ii           ];
+                   scalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   scalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   scalar[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 density_density_1234_sxsx(operator_list[op_id].prop_zero, TYPE_4, &temp); /* weights: pscalar*/pseudoscalar -1, scalar* -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   pscalar1[ii]+=(-1.)*temp[ii           ];
+                   pscalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   pscalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   pseudoscalar[ii] +=(-1.)*temp[ii+3*T_global];
+
+                   scalar1[ii]+=(-1.)*temp[ii           ];
+                   scalar2[ii]+=(-1.)*temp[ii+1*T_global];
+                   scalar3[ii]+=(-1.)*temp[ii+2*T_global];
+                   scalar[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+                   /* scalar1: built from temp[0 .. T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"S1S1TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(scalar1[ii]), cimag(scalar1[ii]));
+                   }
+                   /* scalar2: built from temp[T_global .. 2*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"S2S2TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(scalar2[ii]), cimag(scalar2[ii]));
+                   }
+                   /* scalar3: built from temp[2*T_global .. 3*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"S3S3TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(scalar3[ii]), cimag(scalar3[ii]));
+                   }
+                   /* scalar: built from temp[3*T_global .. 4*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"SSTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(scalar[ii]), cimag(scalar[ii]));
+                   }
+                   /* pscalar1: same temp slice as scalar1, different TYPE weights */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P1P1TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar1[ii]), cimag(pscalar1[ii]));
+                   }
+                   /* pscalar2: same temp slice as scalar2, different TYPE weights */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P2P2TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar2[ii]), cimag(pscalar2[ii]));
+                   }
+                   /* pscalar3: same temp slice as scalar3, different TYPE weights */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"P3P3TRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pscalar3[ii]), cimag(pscalar3[ii]));
+                   }
+                   /* pseudoscalar: same temp slice as scalar, different TYPE weights */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"PPTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(pseudoscalar[ii]), cimag(pseudoscalar[ii]));
+                   }
+
+                 }
+                 for (int ii=0;ii<T_global; ++ii){ /* clear every accumulator for the next measurement block */
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+
+                 double read_end, read_begin=gettime(); /* timing for the re-read of the scalar field below */
+                 if( (i = read_scalar_field_parallel(scalar_filename,g_scalar_field)) !=0 )
+                 {
+                    fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename);
+                    exit(-2);
+                 }
+                 read_end=gettime();
+                 if (g_cart_id == 0) {
+                   printf("# Finished reading scalar field in %.4e seconds.\n",read_end-read_begin);
+                   fflush(stdout);
+                 }
+                 for( int s=0; s<4; s++ )
+                   generic_exchange_nogauge(g_scalar_field[s], sizeof(scalar));
+                 if (smearedcorrelator_BSM == 1){
+                   smear_scalar_fields_correlator(g_smeared_scalar_field, g_scalar_field, timesmearcorrelator_BSM );
+                   for ( int s=0; s<4; s++ )
+                    generic_exchange_nogauge(g_smeared_scalar_field[s], sizeof(scalar));
+                 }
+               }
+               if (diraccurrentdensity_BSM == 1){ /* writes J1D1/J2D2/J3D3/JD NONTRIVIAL; assumes current* start at zero -- TODO confirm initial clearing */
+                 naivedirac_current_density_12ab( operator_list[op_id].prop_zero, TYPE_I , TYPE_A, &temp); /* (I,A): weight -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(-1.)*temp[ii           ];
+                   current2[ii]+=(-1.)*temp[ii+1*T_global];
+                   current3[ii]+=(-1.)*temp[ii+2*T_global];
+                   current[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+//                 diraccurrent1a_petros( operator_list[op_id].prop );
+                 naivedirac_current_density_12ab( operator_list[op_id].prop_zero, TYPE_I , TYPE_B, &temp); /* (I,B): weight +1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(+1.)*temp[ii           ];
+                   current2[ii]+=(+1.)*temp[ii+1*T_global];
+                   current3[ii]+=(+1.)*temp[ii+2*T_global];
+                   current[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 naivedirac_current_density_12ab( operator_list[op_id].prop_zero, TYPE_II, TYPE_A, &temp); /* (II,A): weight -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(-1.)*temp[ii           ];
+                   current2[ii]+=(-1.)*temp[ii+1*T_global];
+                   current3[ii]+=(-1.)*temp[ii+2*T_global];
+                   current[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 naivedirac_current_density_12ab( operator_list[op_id].prop_zero, TYPE_II, TYPE_B, &temp); /* (II,B): weight +1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(+1.)*temp[ii           ];
+                   current2[ii]+=(+1.)*temp[ii+1*T_global];
+                   current3[ii]+=(+1.)*temp[ii+2*T_global];
+                   current[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+                   /* current1: built from temp[0 .. T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"J1D1NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current1[ii]), cimag(current1[ii]));
+                   }
+                   /* current2: built from temp[T_global .. 2*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"J2D2NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current2[ii]), cimag(current2[ii]));
+                   }
+                   /* current3: built from temp[2*T_global .. 3*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"J3D3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current3[ii]), cimag(current3[ii]));
+                   }
+                   /* current: built from temp[3*T_global .. 4*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JDNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){ /* clear every accumulator for the next measurement block */
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+               }
+               if (wilsoncurrentdensitypr1_BSM == 1){ /* writes the JWPR1* correlators; assumes current* start at zero -- TODO confirm initial clearing */
+                 wilsonterm_current_density_312ab( operator_list[op_id].prop_zero, TYPE_1, TYPE_A, &temp); /* (1,A): weight -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(-1.)*temp[ii           ];
+                   current2[ii]+=(-1.)*temp[ii+1*T_global];
+                   current3[ii]+=(-1.)*temp[ii+2*T_global];
+                   current[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+//                 wilsoncurrent_density_3_petros( operator_list[op_id].prop );
+                 wilsonterm_current_density_312ab( operator_list[op_id].prop_zero, TYPE_1, TYPE_B, &temp); /* (1,B): weight +1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(+1.)*temp[ii           ];
+                   current2[ii]+=(+1.)*temp[ii+1*T_global];
+                   current3[ii]+=(+1.)*temp[ii+2*T_global];
+                   current[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 wilsonterm_current_density_312ab( operator_list[op_id].prop_zero, TYPE_2, TYPE_A, &temp); /* (2,A): weight +1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(+1.)*temp[ii           ];
+                   current2[ii]+=(+1.)*temp[ii+1*T_global];
+                   current3[ii]+=(+1.)*temp[ii+2*T_global];
+                   current[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+
+                 wilsonterm_current_density_312ab( operator_list[op_id].prop_zero, TYPE_2, TYPE_B, &temp); /* (2,B): weight -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(-1.)*temp[ii           ];
+                   current2[ii]+=(-1.)*temp[ii+1*T_global];
+                   current3[ii]+=(-1.)*temp[ii+2*T_global];
+                   current[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+                   /* current1: built from temp[0 .. T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPR11D1NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current1[ii]), cimag(current1[ii]));
+                   }
+                   /* current2: built from temp[T_global .. 2*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPR12D2NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current2[ii]), cimag(current2[ii]));
+                   }
+                   /* current3: built from temp[2*T_global .. 3*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPR13D3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current3[ii]), cimag(current3[ii]));
+                   }
+                   /* current: built from temp[3*T_global .. 4*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPR1DNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){ /* clear every accumulator for the next measurement block */
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+               }
+               if (wilsoncurrentdensitypr2_BSM == 1){ /* writes the JWPR2* correlators; assumes current* start at zero -- TODO confirm initial clearing */
+                 wilsonterm_current_density_412ab( operator_list[op_id].prop_zero, TYPE_1, TYPE_A, &temp); /* (1,A): weight -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(-1.)*temp[ii           ];
+                   current2[ii]+=(-1.)*temp[ii+1*T_global];
+                   current3[ii]+=(-1.)*temp[ii+2*T_global];
+                   current[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 wilsonterm_current_density_412ab( operator_list[op_id].prop_zero, TYPE_1, TYPE_B, &temp); /* (1,B): weight +1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(+1.)*temp[ii           ];
+                   current2[ii]+=(+1.)*temp[ii+1*T_global];
+                   current3[ii]+=(+1.)*temp[ii+2*T_global];
+                   current[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 wilsonterm_current_density_412ab( operator_list[op_id].prop_zero, TYPE_2, TYPE_A, &temp); /* (2,A): weight +1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(+1.)*temp[ii           ];
+                   current2[ii]+=(+1.)*temp[ii+1*T_global];
+                   current3[ii]+=(+1.)*temp[ii+2*T_global];
+                   current[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 wilsonterm_current_density_412ab( operator_list[op_id].prop_zero, TYPE_2, TYPE_B, &temp); /* (2,B): weight -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(-1.)*temp[ii           ];
+                   current2[ii]+=(-1.)*temp[ii+1*T_global];
+                   current3[ii]+=(-1.)*temp[ii+2*T_global];
+                   current[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+                   /* current1: built from temp[0 .. T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPR21D1NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current1[ii]), cimag(current1[ii]));
+                   }
+                   /* current2: tag was "JWPR2D2NONTRIVIAL"; aligned with the JWPR21D1/JWPR23D3 siblings */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPR22D2NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current2[ii]), cimag(current2[ii]));
+                   }
+                   /* current3: built from temp[2*T_global .. 3*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPR23D3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current3[ii]), cimag(current3[ii]));
+                   }
+                   /* current: built from temp[3*T_global .. 4*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPR2DNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){ /* clear every accumulator for the next measurement block */
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+
+               }
+               if (wilsoncurrentdensitypl1_BSM == 1){ /* writes the JWPL1* correlators; assumes current* start at zero -- TODO confirm initial clearing */
+                 wilsonterm_current_density_512ab( operator_list[op_id].prop_zero, TYPE_1, TYPE_A, &temp); /* (1,A): weight -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(-1.)*temp[ii           ];
+                   current2[ii]+=(-1.)*temp[ii+1*T_global];
+                   current3[ii]+=(-1.)*temp[ii+2*T_global];
+                   current[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 wilsonterm_current_density_512ab( operator_list[op_id].prop_zero, TYPE_1, TYPE_B, &temp); /* (1,B): weight +1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(+1.)*temp[ii           ];
+                   current2[ii]+=(+1.)*temp[ii+1*T_global];
+                   current3[ii]+=(+1.)*temp[ii+2*T_global];
+                   current[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 wilsonterm_current_density_512ab( operator_list[op_id].prop_zero, TYPE_2, TYPE_A, &temp); /* (2,A): weight +1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(+1.)*temp[ii           ];
+                   current2[ii]+=(+1.)*temp[ii+1*T_global];
+                   current3[ii]+=(+1.)*temp[ii+2*T_global];
+                   current[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 wilsonterm_current_density_512ab( operator_list[op_id].prop_zero, TYPE_2, TYPE_B, &temp); /* (2,B): weight -1 */
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(-1.)*temp[ii           ];
+                   current2[ii]+=(-1.)*temp[ii+1*T_global];
+                   current3[ii]+=(-1.)*temp[ii+2*T_global];
+                   current[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+                   /* current1: built from temp[0 .. T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPL11D1NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current1[ii]), cimag(current1[ii]));
+                   }
+                   /* current2: built from temp[T_global .. 2*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPL12D2NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current2[ii]), cimag(current2[ii]));
+                   }
+                   /* current3: built from temp[2*T_global .. 3*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPL13D3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current3[ii]), cimag(current3[ii]));
+                   }
+                   /* current: built from temp[3*T_global .. 4*T_global-1] */
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPL1DNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){ /* clear every accumulator for the next measurement block */
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+               }
+               if (wilsoncurrentdensitypl2_BSM == 1){
+               //wilsoncurrent61a_petros( operator_list[op_id].prop );
+                 wilsonterm_current_density_612ab( operator_list[op_id].prop_zero, TYPE_1, TYPE_A, &temp);
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(-1.)*temp[ii           ];
+                   current2[ii]+=(-1.)*temp[ii+1*T_global];
+                   current3[ii]+=(-1.)*temp[ii+2*T_global];
+                   current[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 wilsonterm_current_density_612ab( operator_list[op_id].prop_zero, TYPE_1, TYPE_B, &temp);
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(+1.)*temp[ii           ];
+                   current2[ii]+=(+1.)*temp[ii+1*T_global];
+                   current3[ii]+=(+1.)*temp[ii+2*T_global];
+                   current[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+         
+
+               //wilsoncurrent62a_petros( operator_list[op_id].prop );
+                 wilsonterm_current_density_612ab( operator_list[op_id].prop_zero, TYPE_2, TYPE_A, &temp);
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(+1.)*temp[ii           ];
+                   current2[ii]+=(+1.)*temp[ii+1*T_global];
+                   current3[ii]+=(+1.)*temp[ii+2*T_global];
+                   current[ii] +=(+1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+
+                 wilsonterm_current_density_612ab( operator_list[op_id].prop_zero, TYPE_2, TYPE_B, &temp);
+                 for (int ii=0; ii<T_global; ++ii){
+                   current1[ii]+=(-1.)*temp[ii           ];
+                   current2[ii]+=(-1.)*temp[ii+1*T_global];
+                   current3[ii]+=(-1.)*temp[ii+2*T_global];
+                   current[ii] +=(-1.)*temp[ii+3*T_global];
+                 }
+                 free(temp);
+                 if (g_cart_id == 0){
+//                   fprintf(out,"S1S1nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPL21D1NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current1[ii]), cimag(current1[ii]));
+                   }
+//                 fprintf(out,"S2S2nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPL22D2NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current2[ii]), cimag(current2[ii]));
+                   }
+//                 fprintf(out,"PS3PS3nontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPL23D3NONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current3[ii]), cimag(current3[ii]));
+                   }
+//                 fprintf(out,"SSnontrivialscalar:\n");
+                   for (int ii=0; ii<T_global; ++ii){
+                     fprintf(out,"JWPL2DNONTRIVIAL\t%d\t%10.10e\t%10.10e\n", ii, creal(current[ii]), cimag(current[ii]));
+                   }
+                 }
+                 for (int ii=0;ii<T_global; ++ii){
+                   scalar[ii]=0.0;
+                   pseudoscalar[ii]=0.0;
+                   current[ii]=0.0;
+                   pscalar1[ii]=0.0;
+                   pscalar2[ii]=0.0;
+                   pscalar3[ii]=0.0;
+                   scalar1[ii]=0.0;
+                   scalar2[ii]=0.0;
+                   scalar3[ii]=0.0;
+                   current1[ii]=0.0;
+                   current2[ii]=0.0;
+                   current3[ii]=0.0;
+                 }
+
+               }
+//               density_density_1234_petros(operator_list[op_id].prop);
+               if (g_cart_id == 0){
+                 fclose(out);
+               }
+               free(scalar);
+               free(pseudoscalar);
+               free(current);
+               free(pscalar1);
+               free(pscalar2);
+               free(pscalar3);
+               free(scalar1);
+               free(scalar2);
+               free(scalar3);
+               free(current1);
+               free(current2);
+               free(current3);
+
+             } //End of loop over samples
+
+          } //End loop over scalar fields
+          if ( ( operator_list[op_id].type == BSM2f ) || ( operator_list[op_id].type == BSM3 )){
+             if ( operator_list[op_id].type == BSM2f ){
+               free_D_psi_BSM2f();
+             }
+             else {
+               free_D_psi_BSM3();
+             }
+             for (int ii=0; ii<48; ++ii)
+               free(operator_list[op_id].prop_zero[ii]);
+             free(operator_list[op_id].prop_zero);
+             if ( ( vectorcurrentcurrent_BSM == 1 ) || ( axialcurrentcurrent_BSM == 1 ) ){
+               for (int ii=0; ii<48; ++ii)
+                 free(operator_list[op_id].prop_ntmone[ii]);
+               free(operator_list[op_id].prop_ntmone);
+             }
+          }
+
+      }//End loop over operators
+
+      nstore+=Nsave;
+  }//End of loop over gauges
+
   finalize_solver(temp_field,2);
-  free(request);
   free_gauge_field();
   free_geometry_indices();
   free_bispinor_field();
   free_scalar_field();
-  int ii;	
-  for ( ii= 0; ii< 4; ++ii)
-     free(g_smearedscalar[ii]);
-  free(g_smearedscalar);
+  free_spinor_field();
+#if defined TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
-
+#endif
 }
+#endif
diff --git a/cu/Makefile.in b/cu/Makefile.in
index 7d63019cc..01e1b3599 100644
--- a/cu/Makefile.in
+++ b/cu/Makefile.in
@@ -44,7 +44,7 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-${libcu_OBJECTS}: %.o : ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${libcu_OBJECTS}: %.o : ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) -c $<
 
 
diff --git a/default_input_values.h b/default_input_values.h
index 9ceacc72e..e8d697e53 100644
--- a/default_input_values.h
+++ b/default_input_values.h
@@ -50,13 +50,37 @@
 #define _default_g_mu1 0.0
 #define _default_g_mu2 0.0
 #define _default_g_mu3 0.0
+#define _default_g_shift 0.0
 #define _default_c_sw -1.0
 #define _default_g_beta 6.0
 #define _default_g_N_s 20
 #define _default_g_dflgcr_flag 0
+#define _default_little_evenodd 0
+#define _default_usePL 0
+#define _default_little_solver 0
+#define _default_little_gmres_m_parameter 50
+#define _default_little_solver_max_iter 20
+#define _default_little_solver_low_prec 1.0e-2
+#define _default_little_solver_high_prec 1.0e-10
+
+#define _default_Msap_precon 1
+#define _default_NiterMsap 3
+#define _default_NcycleMsap 2
+#define _default_kappa_Msap -1.
+#define _default_mu_Msap -20.
+
+#define _default_NiterMsap_dflgen 4
+#define _default_NcycleMsap_dflgen 4
+#define _default_NsmoothMsap_dflgen 2
+#define _default_kappa_dflgen -1.
+#define _default_mu_dflgen -20.
+#define _default_kappa_dfl -1.
+#define _default_mu_dfl -20.
+
 #define _default_random_seed 123456
 #define _default_rlxd_level 1
-#define _default_solver_flag 1
+#define _default_solver_flag 1 // this is CG (see solver/solver_types.h)
+#define _default_nd_solver_flag 15 // this is CGMMSND (see solver/solver_types.h)
 #define _default_startoption 0
 #define _default_Ntherm 0
 #define _default_Nmeas 1
@@ -77,9 +101,35 @@
 #define _default_X1 0.
 #define _default_X2 0.
 #define _default_X3 0.
+#ifdef TM_USE_BSM
+#define _default_csw_BSM 1.0
 #define _default_rho_BSM 1.0
 #define _default_eta_BSM 1.0
 #define _default_m0_BSM  0.0
+#define _default_c5phi_BSM 0.0
+#define _default_mu03_BSM  0.0
+#define _default_r_BSM 0.0
+#define _default_mu01_BSM  0.0
+#define _default_smearedcorrelator_BSM 0
+#define _default_propagatorsonthefly_BSM 0
+#define _default_densitydensity_BSM 0
+#define _default_densitydensity_s0s0_BSM 0
+#define _default_densitydensity_sxsx_BSM 0
+#define _default_diraccurrentdensity_BSM 0
+#define _default_wilsoncurrentdensitypr1_BSM 0
+#define _default_wilsoncurrentdensitypr2_BSM 0
+#define _default_wilsoncurrentdensitypl1_BSM 0
+#define _default_axialcurrentdensity_BSM 0
+#define _default_pdensityvectordensity_BSM 0
+#define _default_vectorcurrentdensity_BSM 0
+#define _default_axialcurrentcurrent_BSM 0
+
+#define _default_vectordensitydensity_BSM 0
+#define _default_wilsoncurrentdensitypl2_BSM 0
+#define _default_vectorcurrentcurrent_BSM 0
+#define _default_giancarlo_BSM 0
+#define _default_timesmearcorrelator_BSM 0
+#endif
 #define _default_max_solver_iterations 5000
 #define _default_solver_precision 1.e-15
 #define _default_g_rgi_C1 0.
@@ -107,9 +157,12 @@
 #define _default_scalar_precision_read_flag 64
 #define _default_scalar_precision_write_flag 64
 #define _default_g_disable_IO_checks 0
-#define _default_prop_precision_flag 32
+#define _default_prop_precision_flag 64
+#define _default_write_prop_flag 1
 #define _default_reproduce_randomnumber_flag 1
 #define _default_g_sloppy_precision_flag 0
+#define _default_operator_sloppy_precision_flag 0
+#define _default_compression_type 18
 #define _default_stout_rho 0.1
 #define _default_rho 0.
 #define _default_rho2 0.
@@ -165,9 +218,41 @@
 /* default GPU values */
 #define _default_device_num -1
 
+#define _default_min_innersolver_it 10
+#define _default_max_mms_shifts 6
+
 /* default OpenMP values */
 #define _default_omp_num_threads 0
 
+/* default mixed precision solver values */
+#define _default_mixcg_innereps 5.0e-5
+#define _default_mixcg_maxinnersolverit 5000
+
 #define _default_use_preconditioning 0
 
+#define _default_external_inverter 0
+
+#define _default_subprocess_flag 0
+#define _default_lowmem_flag 0
+
+/* default input values for QUDA interface */
+/* These follow the recommendations of https://github.com/lattice/quda/wiki/Multigrid-Solver */
+#define _default_quda_mg_n_level 2
+#define _default_quda_mg_n_vec 24
+#define _default_quda_mg_mu_factor 8.0
+#define _default_quda_mg_setup_tol 1e-6
+#define _default_quda_mg_setup_maxiter 1000
+#define _default_quda_mg_coarse_solver_tol 0.25
+#define _default_quda_mg_coarse_solver_maxiter 75
+#define _default_quda_mg_smoother_tol 0.25
+#define _default_quda_mg_nu_pre 0
+#define _default_quda_mg_nu_post 4
+#define _default_quda_mg_omega 0.85
+#define _default_quda_mg_enable_size_three_blocks 0
+#define _default_quda_mg_reset_setup_threshold 0.0
+
+// gradient flow measurement step size and maximum flow time
+#define _default_gf_eps 0.01
+#define _default_gf_tmax 9.99
+
 #endif
diff --git a/deriv_Sb.c b/deriv_Sb.c
index ddb6b453b..3b408ea4a 100644
--- a/deriv_Sb.c
+++ b/deriv_Sb.c
@@ -39,7 +39,7 @@
  ************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -98,7 +98,7 @@ void deriv_Sb(const int ieo, spinor * const l, spinor * const k,
   } 
 
   /* for parallelization */
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_2fields(k, l, ieo);
 #endif
   /************** loop over all lattice sites ****************/
@@ -408,11 +408,11 @@ void deriv_Sb(const int ieo, spinor * const l, spinor * const k,
   }
 #endif
   /* for parallelization */
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_2fields(k, l, ieo);
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #pragma omp parallel
   {
@@ -427,7 +427,7 @@ void deriv_Sb(const int ieo, spinor * const l, spinor * const k,
   spinor * restrict sp ALIGN;
   spinor * restrict sm ALIGN;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -451,7 +451,7 @@ void deriv_Sb(const int ieo, spinor * const l, spinor * const k,
   } 
 
   /************** loop over all lattice sites ****************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(icx = ioff; icx < (VOLUME/2+ioff); icx++){
@@ -636,7 +636,7 @@ void deriv_Sb(const int ieo, spinor * const l, spinor * const k,
     /****************** end of loop ************************/
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
diff --git a/deriv_Sb_D_psi.c b/deriv_Sb_D_psi.c
index f1febc8c9..09b8141f3 100644
--- a/deriv_Sb_D_psi.c
+++ b/deriv_Sb_D_psi.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -78,7 +78,7 @@ void deriv_Sb_D_psi(spinor * const l, spinor * const k,
   ioff2=(VOLUME+RAND)/2-ioff;
 
   /* for parallelization */
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_field(k, ieo);
   xchange_field(l, (ieo+1)%2);
 #endif
@@ -384,12 +384,12 @@ void deriv_Sb_D_psi(spinor * const l, spinor * const k,
 #endif
 
   /* for parallelization */
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_lexicfield(k);
   xchange_lexicfield(l);
 #endif
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #pragma omp parallel
   {
@@ -405,7 +405,7 @@ void deriv_Sb_D_psi(spinor * const l, spinor * const k,
   spinor * restrict sp ALIGN;
   spinor * restrict sm ALIGN;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -417,7 +417,7 @@ void deriv_Sb_D_psi(spinor * const l, spinor * const k,
 #endif
 
   /************** loop over all lattice sites ****************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(ix = 0; ix < (VOLUME); ix++){
@@ -578,7 +578,7 @@ void deriv_Sb_D_psi(spinor * const l, spinor * const k,
 #pragma pomp inst end(derivSb)
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /*OpenMP closing brace */
 #endif
 }
diff --git a/doc/DDalphaAMG.tex b/doc/DDalphaAMG.tex
new file mode 100644
index 000000000..0a5f2cd64
--- /dev/null
+++ b/doc/DDalphaAMG.tex
@@ -0,0 +1,146 @@
+%author: Simone Bacchio <s.bacchio@gmail.com>
+%date: 07/2016
+
+\subsection{DDalphaAMG: A library for multigrid preconditioning on LQCD}
+
+
+DD-$\alpha$AMG~\cite{Frommer:2013fsa} is an Adaptive Aggregation-based Domain Decomposition Multigrid method for Lattice QCD. A library named DDalphaAMG is publicly available\footnote{\url{https://github.com/DDalphaAMG/DDalphaAMG}} and it contains the full method with additional development tools. DD-$\alpha$AMG has been successfully extended to $N_f=2$ twisted mass fermions in~\cite{Alexandrou:2016}.
+
+%\subsubsection{Design goals of the interface}
+
+\subsubsection{Installation}
+
+Download the Twisted Mass version of the DDalphaAMG library at 
+\begin{Verbatim}[fontsize=\small]
+https://github.com/sbacchio/DDalphaAMG
+\end{Verbatim}
+The Makefile is set up for compilation in an Intel environment. To use a different environment, or simply to adjust some variables, edit the first lines of the Makefile:
+\begin{Verbatim}[fontsize=\small]
+CC = mpiicc
+
+# --- CFLAGS -----------------------------------------                          
+CFLAGS_gnu = -std=gnu99 -Wall -pedantic -fopenmp -O3 -ffast-math -msse4.2
+CFLAGS_intel = -std=gnu99 -Wall -pedantic -qopenmp -O3  -xHOST
+CFLAGS = $(CFLAGS_intel)
+\end{Verbatim}
+The library can be installed with
+\begin{Verbatim}[fontsize=\small]
+make -j library LIMEDIR="/your/lime/installation/dir" 
+\end{Verbatim}
+and tmLQCD can be configured and compiled by using
+\begin{Verbatim}[fontsize=\small]
+autoreconf -f
+./configure YOUR_OPTIONS --with-DDalphaAMG="/path/to/DDalphaAMG/dir"
+make -j
+\end{Verbatim}
+
+\subsubsection{Usage}
+To call the solver with the standard parameter settings, it is sufficient to select \texttt{DDalphaAMG} as the solver:
+\begin{Verbatim}[fontsize=\small]
+BeginOperator TMWILSON
+  2kappaMu = 0.05
+  kappa = 0.177
+  Solver = DDalphaAMG
+  SolverPrecision = 1e-14
+  MaxSolverIterations = 100
+EndOperator
+\end{Verbatim}
+More options are available and are explained in the next section. At the first call of the solver a setup phase is run; the same setup is then reused for all inversions on the same configuration. Be aware that changing the configuration is currently supported only for HMC simulations, for which specific parameters are defined.
+\subsubsection{More advanced settings}
+For tuning purposes, several parameters of DDalphaAMG can be set inside the section \texttt{DDalphaAMG}. The complete list of implemented parameters is given below:
+\begin{Verbatim}[fontsize=\small]
+BeginDDalphaAMG
+  MGOMPNumThreads = 1
+  MGBlockX = 4
+  MGBlockY = 4
+  MGBlockZ = 4
+  MGBlockT = 4
+  MGNumberOfVectors = 24
+  MGNumberOfLevels = 3
+  MGCoarseMuFactor = 5
+  MGSetupIter = 5
+  MGCoarseSetupIter = 3
+  MGSetup2KappaMu = 0.001
+  MGMixedPrecision = yes
+  MGdtauUpdate = 0.05
+  MGrhoUpdate = 0.0
+  MGUpdateSetupIter = 1
+EndDDalphaAMG
+\end{Verbatim}
+Not all of the parameters have to be set; a default value is defined for each of them. A brief explanation follows:
+\begin{description}
+	\item[\texttt{MGOMPNumThreads:}] the DDalphaAMG library does not benefit from hyper-threading, while most of the applications of tmLQCD do. For this reason the number of OpenMP threads for DDalphaAMG has been separated from the standard \texttt{OMPNumThreads}. If this parameter is not set, the value of \texttt{OMPNumThreads} is used.
+	\item[\texttt{MGBlock?:}]\footnote{\label{fn:Alexandrou:2016} For a better understanding of these parameters we strongly suggest reading the numerical results presented in \cite{Alexandrou:2016}.} block size in the directions X, Y, Z, T. The values have to divide the local size of the lattice; by default an optimal value is used.
+	\item[\texttt{MGNumberOfVectors:}]\footnoteref{fn:Alexandrou:2016} number of vectors used in the fine level. This parameter requires some tuning.
+	\item[\texttt{MGNumberOfLevels:}] number of levels for the multigrid method. Can take values from 1 (no multigrid) to 4. A value of 3 is suggested.
+	\item[\texttt{MGCoarseMuFactor:}]\footnoteref{fn:Alexandrou:2016} multiplicative factor for the twisted mass term $\mu$ on the coarsest level. A good performance is achieved with a value between 3 and 6.
+	\item[\texttt{MGSetupIter, MGCoarseSetupIter:}] number of setup iterations in the fine and coarse grid respectively. For the fine grid a value between 3 and 5 is suggested. For the coarse grid 2 or 3 iterations should be enough.
+	\item[\texttt{MGSetup2KappaMu:}] away from the physical point, the
+          solver can benefit from running the setup with a
+          lower $\mu$, closer to the physical point.
+	\item[\texttt{MGMixedPrecision:}] using the mixed precision solver,
+          a speed-up of 20\% can be achieved. One has to check
+          that the mixed precision solver does not restart more than
+          once and that the restarted relative residual (in double
+          precision) is not orders of magnitude higher than the one in single
+          precision, see Section~\ref{sec:DDalphaAMG_output}. Otherwise
+          the mixed precision solver is not recommended.
+	\item[\texttt{MGdtauUpdate:}] for HMC, the $d\tau$ interval after which the setup is updated. If 0, the setup is updated every time the configuration changes.
+	\item[\texttt{MGrhoUpdate:}] for HMC, the rho value of the monomial at which the setup has to be updated. It can be combined with \texttt{MGdtauUpdate} or used standalone.
+	\item[\texttt{MGUpdateSetupIter:}] for HMC, number of setup iterations to do on the fine level when the setup has to be updated.
+	\item[\texttt{MGNumberOfShifts:}] for MG in multi-shift systems, the number $N$ of shifted linear systems to be solved by DDalphaAMG. MG will solve the $N$ smallest shifts.
+	\item[\texttt{MGMMSMass:}] for MG in multi-shift systems, alternative to the previous. MG will solve all the mass-shifts smaller than the given value.
+\end{description}
+\subsubsection{Output analysis\label{sec:DDalphaAMG_output}}
+When tmLQCD programs are run with the option \texttt{-v}, the full output of DDalphaAMG is shown. Here are some hints on the information given. Just before the setup, the full set of parameters is printed, with an output similar to the following:
+\begin{Verbatim}[fontsize=\small]
++----------------------------------------------------------+
+| 3-level method                                           |
+| postsmoothing K-cycle                                    |
+| FGMRES + red-black multiplicative Schwarz                |
+|          restart length: 10                              |
+|                      m0: -0.430229                       |
+|                     csw: +1.740000                       |
+|                      mu: +0.001200                       |
++----------------------------------------------------------+
+|   preconditioner cycles: 1                               |
+|            inner solver: minimal residual iteration      |
+|               precision: single                          |
++---------------------- depth  0 --------------------------+
+|          global lattice: 96  48  48  48                  |
+|           local lattice: 16  8   8   24                  |
+|           block lattice: 4   4   4   4                   |
+|        post smooth iter: 2                               |
+|     smoother inner iter: 4                               |
+|              setup iter: 3                               |
+|            test vectors: 24                              |
++---------------------- depth  1 --------------------------+
+|          global lattice: 24  12  12  12                  |
+|           local lattice: 4   2   2   6                   |
+|           block lattice: 2   2   2   2                   |
+|        post smooth iter: 2                               |
+|     smoother inner iter: 4                               |
+|              setup iter: 3                               |
+|            test vectors: 28                              |
++---------------------- depth  2 --------------------------+
+|          global lattice: 12  6   6   6                   |
+|           local lattice: 2   1   1   3                   |
+|           block lattice: 1   1   1   1                   |
+|      coarge grid solver: odd even GMRES                  |
+|              iterations: 25                              |
+|                  cycles: 40                              |
+|               tolerance: 5e-02                           |
+|                      mu: +0.012000                       |
++----------------------------------------------------------+
+|          K-cycle length: 5                               |
+|        K-cycle restarts: 2                               |
+|       K-cycle tolerance: 1e-01                           |
++----------------------------------------------------------+
+\end{Verbatim}
+You may want to check that all the parameters agree with what is expected; a good set of parameters is presented in \cite{Alexandrou:2016}.
+\subsubsection{Warnings and error messages}
+
+
+
+
+
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 000000000..88507ad10
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,15 @@
+.PHONY: all clean distclean
+# Builds main.pdf from the LaTeX sources in this directory using latexmk.
+all: main.pdf
+
+# Rebuild whenever any .tex file, the bibliography or this Makefile changes.
+main.pdf: $(wildcard *.tex) bibliography.bib Makefile
+	latexmk -f -bibtex -pdf -pdflatex="pdflatex -interaction=nonstopmode" -use-make main.tex
+# clean: latexmk cleanup, then remove leftovers including main.pdf.
+clean:
+	latexmk -CA
+	rm -f main.spl main.tdo main.pdf *converted-to.pdf
+# distclean: latexmk cleanup, then remove leftovers including main.bbl (main.pdf is not in its rm list).
+distclean:
+	latexmk -CA
+	rm -f main.bbl main.spl main.tdo *converted-to.pdf
diff --git a/doc/bibliography.bib b/doc/bibliography.bib
index c723b47b2..1e4fcf20c 100644
--- a/doc/bibliography.bib
+++ b/doc/bibliography.bib
@@ -1,7610 +1,7906 @@
-@article{Luscher:2012av,
-      author         = "Luscher, Martin and Schaefer, Stefan",
-      title          = "{Lattice QCD with open boundary conditions and
-                        twisted-mass reweighting}",
-      journal        = "Comput.Phys.Commun.",
-      volume         = "184",
-      pages          = "519-528",
-      doi            = "10.1016/j.cpc.2012.10.003",
-      year           = "2013",
-      eprint         = "1206.2809",
+@comment{x-kbibtex-personnameformatting=<%l><, %f>}
+
+@article{Clark:2009wm,
+	archiveprefix = "arXiv",
+	author = "Clark, M.A. and Babich, R. and Barros, K. and Brower, R.C. and Rebbi, C.",
+	doi = "10.1016/j.cpc.2010.05.002",
+	eprint = "0911.3191",
+	journal = "Comput.Phys.Commun.",
+	pages = "1517--1528",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0911.3191;\%\%",
+	title = "{Solving Lattice QCD systems of equations using mixed precision solvers on GPUs}",
+	volume = "181",
+	year = "2010"
+}
+@article{Boyle:2017xcy,
+      author         = "Boyle, Peter and Chuvelev, Michael and Cossu, Guido and
+                        Kelly, Christopher and Lehner, Christoph and Meadows,
+                        Lawrence",
+      title          = "{Accelerating HPC codes on Intel(R) Omni-Path
+                        Architecture networks: From particle physics to Machine
+                        Learning}",
+      year           = "2017",
+      eprint         = "1711.04883",
       archivePrefix  = "arXiv",
-      primaryClass   = "hep-lat",
-      reportNumber   = "CERN-PH-TH-2012-161",
-      SLACcitation   = "%%CITATION = ARXIV:1206.2809;%%",
+      primaryClass   = "cs.DC",
+      SLACcitation   = "%%CITATION = ARXIV:1711.04883;%%"
+}
+ 
+@article{Babich:2011np,
+	archiveprefix = "arXiv",
+	author = "Babich, R. and Clark, M.A. and Joo, B. and Shi, G. and Brower, R.C. and others",
+	eprint = "1109.2935",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:1109.2935;\%\%",
+	title = "{Scaling Lattice QCD beyond 100 GPUs}",
+	year = "2011"
+}
+
+@article{Strelchenko:2013vaa,
+	archiveprefix = "arXiv",
+	author = "Strelchenko, Alexei and Alexandrou, Constantia and Koutsou, Giannis and Aviles-Casco, Alejandro Vaquero",
+	eprint = "1311.4462",
+	journal = "PoS",
+	pages = "415",
+	primaryclass = "hep-lat",
+	reportnumber = "FERMILAB-CONF-13-528-CD",
+	slaccitation = "%\%CITATION = ARXIV:1311.4462;\%\%",
+	title = "{Implementation of the twisted mass fermion operator in the QUDA library}",
+	volume = "LATTICE2013",
+	year = "2014"
 }
+
+@article{Luscher:2012av,
+	archiveprefix = "arXiv",
+	author = "Luscher, Martin and Schaefer, Stefan",
+	doi = "10.1016/j.cpc.2012.10.003",
+	eprint = "1206.2809",
+	journal = "Comput.Phys.Commun.",
+	pages = "519--528",
+	primaryclass = "hep-lat",
+	reportnumber = "CERN-PH-TH-2012-161",
+	slaccitation = "%\%CITATION = ARXIV:1206.2809;\%\%",
+	title = "{Lattice QCD with open boundary conditions and twisted-mass reweighting}",
+	volume = "184",
+	year = "2013"
+}
+
 @article{Luscher:2010ae,
-      author         = "Luscher, Martin",
-      title          = "{Computational Strategies in Lattice QCD}",
-      pages          = "331-399",
-      year           = "2010",
-      eprint         = "1002.4232",
-      archivePrefix  = "arXiv",
-      primaryClass   = "hep-lat",
-      reportNumber   = "CERN-PH-TH-2010-047",
-      SLACcitation   = "%%CITATION = ARXIV:1002.4232;%%",
+	archiveprefix = "arXiv",
+	author = "Luscher, Martin",
+	eprint = "1002.4232",
+	pages = "331--399",
+	primaryclass = "hep-lat",
+	reportnumber = "CERN-PH-TH-2010-047",
+	slaccitation = "%\%CITATION = ARXIV:1002.4232;\%\%",
+	title = "{Computational Strategies in Lattice QCD}",
+	year = "2010"
 }
+
 @article{Clark:2006fx,
-      author         = "Clark, M.A. and Kennedy, A.D.",
-      title          = "{Accelerating dynamical fermion computations using the
-                        rational hybrid Monte Carlo (RHMC) algorithm with multiple
-                        pseudofermion fields}",
-      journal        = "Phys.Rev.Lett.",
-      volume         = "98",
-      pages          = "051601",
-      doi            = "10.1103/PhysRevLett.98.051601",
-      year           = "2007",
-      eprint         = "hep-lat/0608015",
-      archivePrefix  = "arXiv",
-      primaryClass   = "hep-lat",
-      SLACcitation   = "%%CITATION = HEP-LAT/0608015;%%",
-}
-@Article{'tHooft:1971fh,
-     author    = "'t Hooft, G.",
-     title     = "Renormalization of massless Yang-Mills fields",
-     journal   = "Nucl. Phys.",
-     volume    = "B33",
-     year      = "1971",
-     pages     = "173-199",
-     SLACcitation  = "%%CITATION = NUPHA,B33,173;%%"
-}
-@Article{'tHooft:1971rn,
-     author    = "'t Hooft, G.",
-     title     = "Renormalizable lagrangians for massive Yang-Mills fields",
-     journal   = "Nucl. Phys.",
-     volume    = "B35",
-     year      = "1971",
-     pages     = "167-188",
-     SLACcitation  = "%%CITATION = NUPHA,B35,167;%%"
-}
-@Unpublished{'tHooft:1972aa,
-  author = 	 "'t Hooft, G.",
-  title = 	 "",
-  note = 	 "Unpublished remarks at the 1972 Marseille Conference 
-                  on Yang-Mills Fields"
-}
-@Article{'tHooft:1972fi,
-     author    = "'t Hooft, G. and Veltman, M. J. G.",
-     title     = "Regularization and renormalization of gauge fields",
-     journal   = "Nucl. Phys.",
-     volume    = "B44",
-     year      = "1972",
-     pages     = "189-213",
-     SLACcitation  = "%%CITATION = NUPHA,B44,189;%%"
-}
-@Article{Abdel-Rehim:2004gx,
-     author    = "Abdel-Rehim, A. M. and Lewis, R.",
-     title     = "Twisted mass {QCD} for the pion electromagnetic form factor",
-     journal   = "Phys. Rev.",
-     volume    = "D71",
-     year      = "2005",
-     pages     = "014503",
-     eprint    = "hep-lat/0410047",
-     SLACcitation  = "%%CITATION = HEP-LAT 0410047;%%"
-}
-@Article{Abdel-Rehim:2005gz,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  ",
-     title     = "Spectrum of quenched twisted mass lattice QCD at maximal
-                  twist",
-     journal   = "Phys. Rev.",
-     volume    = "D71",
-     year      = "2005",
-     pages     = "094505",
-     eprint    = "hep-lat/0503007",
-     SLACcitation  = "%%CITATION = HEP-LAT/0503007;%%"
-}
-@Article{AbdelRehim:2004sp,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy",
-     title     = "Pion form factor with twisted mass QCD",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "299-301",
-     eprint    = "hep-lat/0408033",
-     SLACcitation  = "%%CITATION = HEP-LAT/0408033;%%"
-}
-@Article{AbdelRehim:2005gq,
-     author    = "Abdel-Rehim, A. M. and Lewis, R. and Woloshyn, R. M.",
-     title     = "Twisted mass lattice QCD and hadron phenomenology",
-     journal   = "Int. J. Mod. Phys.",
-     volume    = "A20",
-     year      = "2005",
-     pages     = "6159-6168",
-     SLACcitation  = "%%CITATION = IMPAE,A20,6159;%%"
-}
-@Article{AbdelRehim:2005gz,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  ",
-     title     = "{Spectrum of quenched twisted mass lattice QCD at maximal
-                  twist}",
-     journal   = "Phys. Rev.",
-     volume    = "D71",
-     year      = "2005",
-     pages     = "094505",
-     eprint    = "hep-lat/0503007",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.71.094505",
-     SLACcitation  = "%%CITATION = HEP-LAT/0503007;%%"
-}
-@Article{AbdelRehim:2005qv,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  ",
-     title     = "The hadron spectrum from twisted mass QCD with a strange
-                  quark",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "032",
-     eprint    = "hep-lat/0509056",
-     SLACcitation  = "%%CITATION = HEP-LAT/0509056;%%"
-}
-@Article{AbdelRehim:2005yx,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  ",
-     title     = "Maximal twist and the spectrum of quenched twisted mass
-                  lattice QCD",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "051",
-     eprint    = "hep-lat/0509098",
-     SLACcitation  = "%%CITATION = HEP-LAT/0509098;%%"
-}
-@Article{AbdelRehim:2006qu,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Petry, Robert G.
-                  and Woloshyn, R. M.",
-     title     = "The spectrum of tmLQCD with quark and link smearing",
-     journal   = "PoS",
-     volume    = "LAT2006",
-     year      = "2006",
-     pages     = "164",
-     eprint    = "hep-lat/0610004",
-     SLACcitation  = "%%CITATION = HEP-LAT/0610004;%%"
-}
-@Article{AbdelRehim:2006ra,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  and Wu, Jackson M. S.",
-     title     = "Lattice QCD with a twisted mass term and a strange quark",
-     journal   = "Eur. Phys. J.",
-     volume    = "A31",
-     year      = "2007",
-     pages     = "773-776",
-     eprint    = "hep-lat/0610090",
-     SLACcitation  = "%%CITATION = HEP-LAT/0610090;%%"
-}
-@Article{AbdelRehim:2006ve,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  and Wu, Jackson M. S.",
-     title     = "Strange quarks in quenched twisted mass lattice QCD",
-     journal   = "Phys. Rev.",
-     volume    = "D74",
-     year      = "2006",
-     pages     = "014507",
-     eprint    = "hep-lat/0601036",
-     SLACcitation  = "%%CITATION = HEP-LAT/0601036;%%"
-}
-@Article{Adler:1974gd,
-     author    = "Adler, Stephen L.",
-     title     = "{Some Simple Vacuum Polarization Phenomenology: e+ e- $\to$
-                  Hadrons: The mu - Mesic Atom x-Ray Discrepancy and (g-2) of
-                  the Muon}",
-     journal   = "Phys. Rev.",
-     volume    = "D10",
-     year      = "1974",
-     pages     = "3714",
-     SLACcitation  = "%%CITATION = PHRVA,D10,3714;%%"
-}
-@Article{Albanese:1987ds,
-     author    = "Albanese, M. and others",
- collaboration = "APE",
-     title     = "Glueball masses and string tension in lattice {QCD}",
-     journal   = "Phys. Lett.",
-     volume    = "B192",
-     year      = "1987",
-     pages     = "163",
-     SLACcitation  = "%%CITATION = PHLTA,B192,163;%%"
-}
-@Article{Alexandrou:2008tn,
-     author    = "Alexandrou, C. and others",
- collaboration = "ETM",
-     title     = "{Light baryon masses with dynamical twisted mass
-                  fermions}",
-     year      = "2008",
-     eprint    = "0803.3190",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0803.3190;%%"
-}
-@Article{AliKhan:2000iv,
-     author    = "Ali Khan, A. and others",
- collaboration = "CP-PACS",
-     title     = "Chiral properties of domain-wall quarks in quenched {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D63",
-     year      = "2001",
-     pages     = "114504",
-     eprint    = "hep-lat/0007014",
-     SLACcitation  = "%%CITATION = HEP-LAT 0007014;%%"
-}
-@Article{AliKhan:2003br,
-     author    = "Ali Khan, A. and others",
- collaboration = "QCDSF",
-     title     = "Accelerating the hybrid Monte Carlo algorithm",
-     journal   = "Phys. Lett.",
-     volume    = "B564",
-     year      = "2003",
-     pages     = "235-240",
-     eprint    = "hep-lat/0303026",
-     SLACcitation  = "%%CITATION = HEP-LAT 0303026;%%"
-}
-@Article{AliKhan:2003mu,
-     author    = "Ali Khan, A. and others",
-     title     = "Accelerating Hasenbusch's acceleration of hybrid Monte
-                  Carlo",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "129",
-     year      = "2004",
-     pages     = "853-855",
-     eprint    = "hep-lat/0309078",
-     SLACcitation  = "%%CITATION = HEP-LAT 0309078;%%"
-}
-@Article{Allton:1993wc,
-     author    = "Allton, C. R. and others",
- collaboration = "UK{QCD}",
-     title     = "Gauge invariant smearing and matrix correlators using
-                  {Wilson} fermions at Beta = 6.2",
-     journal   = "Phys. Rev.",
-     volume    = "D47",
-     year      = "1993",
-     pages     = "5128-5137",
-     eprint    = "hep-lat/9303009",
-     SLACcitation  = "%%CITATION = HEP-LAT 9303009;%%"
-}
-@Article{Allton:2004qq,
-     author    = "Allton, C. R. and others",
- collaboration = "UKQCD",
-     title     = "Improved Wilson QCD simulations with light quark masses",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "014501",
-     eprint    = "hep-lat/0403007",
-     SLACcitation  = "%%CITATION = HEP-LAT/0403007;%%"
-}
-@Article{Aoki:1984qi,
-     author    = "Aoki, S.",
-     title     = "New phase structure for lattice {QCD} with {Wilson} fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D30",
-     year      = "1984",
-     pages     = "2653",
-     SLACcitation  = "%%CITATION = PHRVA,D30,2653;%%"
-}
-@Article{Aoki:1985jj,
-     author    = "Aoki, S. and Higashijima, K.",
-     title     = "The recovery of the chiral symmetry in lattice {Gross-Neveu}
-                  model",
-     journal   = "Prog. Theor. Phys.",
-     volume    = "76",
-     year      = "1986",
-     pages     = "521",
-     SLACcitation  = "%%CITATION = PTPKA,76,521;%%"
-}
-@Article{Aoki:1986ua,
-     author    = "Aoki, Sinya",
-     title     = "NUMERICAL EVIDENCE FOR A PARITY VIOLATING PHASE IN LATTICE
-                  QCD WITH WILSON FERMION",
-     journal   = "Phys. Lett.",
-     volume    = "B190",
-     year      = "1987",
-     pages     = "140",
-     SLACcitation  = "%%CITATION = PHLTA,B190,140;%%"
-}
-@Article{Aoki:1986xr,
-     author    = "Aoki, S.",
-     title     = "A solution to the {U(1)} problem on a lattice",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "57",
-     year      = "1986",
-     pages     = "3136",
-     SLACcitation  = "%%CITATION = PRLTA,57,3136;%%"
-}
-@Article{Aoki:1993vs,
-     author    = "Aoki, S. and Boettcher, S. and Gocksch, A.",
-     title     = "Spontaneous breaking of flavor symmetry and parity in the
-                  Nambu-Jona-Lasinio model with {Wilson} fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B331",
-     year      = "1994",
-     pages     = "157-164",
-     eprint    = "hep-lat/9312084",
-     SLACcitation  = "%%CITATION = HEP-LAT 9312084;%%"
-}
-@Article{Aoki:1995ft,
-     author    = "Aoki, S.",
-     title     = "On the phase structure of {QCD} with {Wilson} fermions",
-     journal   = "Prog. Theor. Phys. Suppl.",
-     volume    = "122",
-     year      = "1996",
-     pages     = "179-186",
-     eprint    = "hep-lat/9509008",
-     SLACcitation  = "%%CITATION = HEP-LAT 9509008;%%"
-}
-@Article{Aoki:1995yf,
-     author    = "Aoki, S. and Ukawa, A. and Umemura, T.",
-     title     = "Finite temperature phase structure of lattice {QCD} with
-                  {Wilson} quark action",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "76",
-     year      = "1996",
-     pages     = "873-876",
-     eprint    = "hep-lat/9508008",
-     SLACcitation  = "%%CITATION = HEP-LAT 9508008;%%"
-}
-@Article{Aoki:1997fm,
-     author    = "Aoki, S.",
-     title     = "Phase structure of lattice {QCD} with {Wilson} fermion at
-                  finite  temperature",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "60A",
-     year      = "1998",
-     pages     = "206-219",
-     eprint    = "hep-lat/9707020",
-     SLACcitation  = "%%CITATION = HEP-LAT 9707020;%%"
-}
-@Article{Aoki:2001xq,
-     author    = "Aoki, S. and others",
- collaboration = "JL{QCD}",
-     title     = "Non-trivial phase structure of {N(f)} = 3 {QCD} with {O(a)}-
-                  improved {Wilson}  fermion at zero temperature",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "263-265",
-     eprint    = "hep-lat/0110088",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110088;%%"
-}
-@Article{Aoki:2002vt,
-     author    = "Aoki, Y. and others",
-     title     = "Domain wall fermions with improved gauge actions",
-     journal   = "Phys. Rev.",
-     volume    = "D69",
-     year      = "2004",
-     pages     = "074504",
-     eprint    = "hep-lat/0211023",
-     SLACcitation  = "%%CITATION = HEP-LAT 0211023;%%"
-}
-@Article{Aoki:2004iq,
-     author    = "Aoki, S. and others",
- collaboration = "JL{QCD}",
-     title     = "Bulk first-order phase transition in three-flavor lattice
-                  {QCD} with  {O(a)}-improved {Wilson} fermion action at zero
-                  temperature",
-     year      = "2004",
-     eprint    = "hep-lat/0409016",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409016;%%"
-}
-@Article{Aoki:2004ta,
-     author    = "Aoki, Sinya and B{\"a}r, Oliver",
-     title     = "Twisted-mass {QCD}, {O}(a) improvement and {Wilson} chiral
-                  perturbation  theory",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "116011",
-     eprint    = "hep-lat/0409006",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409006;%%"
-}
-@Article{Aoki:2005ii,
-     author    = "Aoki, S. and B{\"a}r, O.",
-     title     = "Determining the low energy parameters of {Wilson} chiral
-                  perturbation theory",
-     year      = "2005",
-     eprint    = "hep-lat/0509002",
-     SLACcitation  = "%%CITATION = HEP-LAT 0509002;%%"
-}
-@Article{Arnold:2003sx,
-     author    = "Arnold, Guido and others",
-     title     = "Numerical methods for the QCD overlap operator. II: Optimal
-                  Krylov subspace methods",
-     year      = "2003",
-     eprint    = "hep-lat/0311025",
-     SLACcitation  = "%%CITATION = HEP-LAT 0311025;%%"
-}
-@Article{Atiyah:1971rm,
-     author    = "Atiyah, M. F. and Singer, I. M.",
-     title     = "The Index of elliptic operators. 5",
-     journal   = "Annals Math.",
-     volume    = "93",
-     year      = "1971",
-     pages     = "139-149",
-     SLACcitation  = "%%CITATION = ANMAA,93,139;%%"
-}
-@Article{Aubin:2006cc,
-     author    = "Aubin, C. and Blum, T.",
-     title     = "{Hadronic contributions to the muon g-2 from the lattice}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "162",
-     year      = "2006",
-     pages     = "251-255",
-     SLACcitation  = "%%CITATION = NUPHZ,162,251;%%"
-}
-@Article{Aubin:2006xv,
-     author    = "Aubin, C. and Blum, T.",
-     title     = "{Calculating the hadronic vacuum polarization and leading
-                  hadronic  contribution to the muon anomalous magnetic
-                  moment with improved  staggered quarks}",
-     journal   = "Phys. Rev.",
-     volume    = "D75",
-     year      = "2007",
-     pages     = "114502",
-     eprint    = "hep-lat/0608011",
-     SLACcitation  = "%%CITATION = HEP-LAT/0608011;%%"
-}
-@Article{BAGEL,
- author="P.A. Boyle",
- year=2005,
- eprint=" http://www.ph.ed.ac.uk/\~{ }paboyle/bagel/Bagel.html"
- }
-@Article{Baikov:2004ku,
-     author    = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
-     title     = "{Vacuum polarization in pQCD: First complete O(alpha(s)**4)
-                  result}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "135",
-     year      = "2004",
-     pages     = "243-246",
-     SLACcitation  = "%%CITATION = NUPHZ,135,243;%%"
-}
-@Article{Baikov:2005rw,
-     author    = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
-     title     = "{Scalar correlator at O(alpha(s)**4), Higgs decay into b-
-                  quarks and  bounds on the light quark masses}",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "96",
-     year      = "2006",
-     pages     = "012003",
-     eprint    = "hep-ph/0511063",
-     SLACcitation  = "%%CITATION = HEP-PH/0511063;%%"
-}
-@Article{Baikov:2008jh,
-     author    = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
-     title     = "{Hadronic Z- and tau-Decays in Order alpha_s^4}",
-     year      = "2008",
-     eprint    = "0801.1821",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = ARXIV:0801.1821;%%"
-}
-@Article{Bali:2000vr,
-     author    = "Bali, G. S. and others",
- collaboration = "TXL",
-     title     = "Static potentials and glueball masses from {QCD} simulations
-                  with {Wilson}  sea quarks",
-     journal   = "Phys. Rev.",
-     volume    = "D62",
-     year      = "2000",
-     pages     = "054503",
-     eprint    = "hep-lat/0003012",
-     SLACcitation  = "%%CITATION = HEP-LAT 0003012;%%"
-}
-@Article{Bali:2004pb,
-     author    = "Bali, G. S. and others",
-     title     = "String breaking with dynamical {Wilson} fermions",
-     journal   = "Nucl. Phys. Proc. Supl.",
-     volume    = "140",
-     pages     = "609-611",
-     year      = "2004",
-     eprint    = "hep-lat/0409137",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409137;%%"
-}
-@Article{Bali:2005fu,
-     author    = "Bali, G. S. and Neff, H. and Duessel, T. and
-                  Lippert, T. and Schilling, K.",
- collaboration = "SESAM",
-     title     = "Observation of string breaking in {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D71",
-     year      = "2005",
-     pages     = "114513",
-     eprint    = "hep-lat/0505012",
-     SLACcitation  = "%%CITATION = HEP-LAT 0505012;%%"
-}
-@Article{Bar:2006zj,
-     author    = "B{\"a}r, O. and Jansen, K. and Schaefer, S. and Scorzato, L.
-                  and Shindler, A.",
-     title     = "Overlap fermions on a twisted mass sea",
-     year      = "2006",
-     eprint    = "hep-lat/0609039",
-     SLACcitation  = "%%CITATION = HEP-LAT 0609039;%%"
-}
-@Article{Baxter:1993bv,
-     author    = "Baxter, R. M. and others",
- collaboration = "UK{QCD}",
-     title     = "Quenched heavy light decay constants",
-     journal   = "Phys. Rev.",
-     volume    = "D49",
-     year      = "1994",
-     pages     = "1594-1605",
-     eprint    = "hep-lat/9308020",
-     SLACcitation  = "%%CITATION = HEP-LAT 9308020;%%"
-}
-@Article{Beane:2004tw,
-     author    = "Beane, Silas R.",
-     title     = "{Nucleon masses and magnetic moments in a finite volume}",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "034507",
-     eprint    = "hep-lat/0403015",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.70.034507",
-     SLACcitation  = "%%CITATION = HEP-LAT/0403015;%%"
-}
-@Article{Becher:1999he,
-     author    = "Becher, Thomas and Leutwyler, H.",
-     title     = "Baryon chiral perturbation theory in manifestly Lorentz
-                  invariant form",
-     journal   = "Eur. Phys. J.",
-     volume    = "C9",
-     year      = "1999",
-     pages     = "643-671",
-     eprint    = "hep-ph/9901384",
-     SLACcitation  = "%%CITATION = HEP-PH/9901384;%%"
-}
-@Article{Bietenholz:2004sa,
-     author    = "Bietenholz, W. and others",
- collaboration = "\xlf",
-     title     = "Comparison between overlap and twisted mass fermions
-                  towards the chiral  limit",
-     year      = "2004",
-     eprint    = "hep-lat/0409109",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409109;%%"
-}
-@Article{Bietenholz:2004wv,
-     author    = "Bietenholz, W. and others",
- collaboration = "\xlf",
-     title     = "Going chiral: Overlap versus twisted mass fermions",
-     journal   = "JHEP",
-     volume    = "12",
-     year      = "2004",
-     pages     = "044",
-     eprint    = "hep-lat/0411001",
-     SLACcitation  = "%%CITATION = HEP-LAT 0411001;%%"
-}
-@Article{Blossier:2007vv,
-     author    = "Blossier, B. and others",
- collaboration = "ETM",
-     title     = "{Light quark masses and pseudoscalar decay constants from
-                  Nf=2 Lattice QCD with twisted mass fermions}",
-     year      = "2007",
-     eprint    = "0709.4574",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = ARXIV:0709.4574;%%"
-}
-@Article{Blum:1994eh,
-     author    = "Blum, Tom and others",
-     title     = "QCD thermodynamics with Wilson quarks at large kappa",
-     journal   = "Phys. Rev.",
-     volume    = "D50",
-     year      = "1994",
-     pages     = "3377-3381",
-     eprint    = "hep-lat/9404006",
-     SLACcitation  = "%%CITATION = HEP-LAT 9404006;%%"
-}
-@Article{Blum:2000kn,
-     author    = "Blum, T. and others",
-     title     = "Quenched lattice {QCD} with domain wall fermions and the
-                  chiral limit",
-     journal   = "Phys. Rev.",
-     volume    = "D69",
-     year      = "2004",
-     pages     = "074502",
-     eprint    = "hep-lat/0007038",
-     SLACcitation  = "%%CITATION = HEP-LAT 0007038;%%"
-}
-@Article{Bodin:2005gg,
-     author    = "Bodin, F. and others",
- collaboration = "ApeNEXT",
-     title     = "The {apeNEXT} project",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "176-182",
-     SLACcitation  = "%%CITATION = NUPHZ,140,176;%%"
-}
-@Article{Bolder:2000un,
-     author    = "Bolder, B. and others",
-     title     = "A high precision study of the Q anti-Q potential from
-                  {Wilson} loops in  the regime of string breaking",
-     journal   = "Phys. Rev.",
-     volume    = "D63",
-     year      = "2001",
-     pages     = "074504",
-     eprint    = "hep-lat/0005018",
-     SLACcitation  = "%%CITATION = HEP-LAT 0005018;%%"
-}
-@Article{Boucaud:2007uk,
-     author    = "Boucaud, Ph. and others",
- collaboration = "ETM",
-     title     = "Dynamical twisted mass fermions with light quarks",
-     year      = "2007",
-     eprint    = "hep-lat/0701012",
-     SLACcitation  = "%%CITATION = HEP-LAT 0701012;%%"
-}
-@Article{Boucaud:2008xu,
-     author    = "Boucaud, Ph. and others",
- collaboration = "ETM",
-     title     = "{Dynamical Twisted Mass Fermions with Light Quarks:
-                  Simulation and Analysis Details}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "179",
-     year      = "2008",
-     pages     = "695-715",
-     eprint    = "0803.0224",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     doi       = "10.1016/j.cpc.2008.06.013",
-     SLACcitation  = "%%CITATION = 0803.0224;%%"
-}
-@Article{Boughezal:2006px,
-     author    = "Boughezal, R. and Czakon, M. and Schutzmeier, T.",
-     title     = "{Charm and bottom quark masses from perturbative QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D74",
-     year      = "2006",
-     pages     = "074006",
-     eprint    = "hep-ph/0605023",
-     SLACcitation  = "%%CITATION = HEP-PH/0605023;%%"
-}
-@Article{Boyle:2005fb,
-     author    = "Boyle, P. A. and others",
-     title     = "{QCDOC}: Project status and first results",
-     journal   = "J. Phys. Conf. Ser.",
-     volume    = "16",
-     year      = "2005",
-     pages     = "129-139",
-     SLACcitation  = "%%CITATION = 00462,16,129;%%"
-}
-
-@Article{Brower:1994er,
-     author    = "Brower, R. C. and Levi, A. R. and Orginos, K.",
-     title     = "Extrapolation methods for the Dirac inverter in hybrid
-                  Monte Carlo",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "42",
-     year      = "1995",
-     pages     = "855-857",
-     eprint    = "hep-lat/9412004",
-     SLACcitation  = "%%CITATION = HEP-LAT 9412004;%%"
-}
-
-@Article{Brower:1995vx,
-     author    = "Brower, R. C. and Ivanenko, T. and Levi, A. R. and Orginos,
-                  K. N.",
-     title     = "Chronological inversion method for the Dirac matrix in
-                  hybrid Monte  Carlo",
-     journal   = "Nucl. Phys.",
-     volume    = "B484",
-     year      = "1997",
-     pages     = "353-374",
-     eprint    = "hep-lat/9509012",
-     SLACcitation  = "%%CITATION = HEP-LAT 9509012;%%"
-}
-@Article{Bunk:1995uv,
-     author    = "Bunk, B. and others",
-     title     = "A New simulation algorithm for lattice {QCD} with dynamical
-                  quarks",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "42",
-     year      = "1995",
-     pages     = "49-55",
-     eprint    = "hep-lat/9411016",
-     SLACcitation  = "%%CITATION = HEP-LAT 9411016;%%"
-}
-@Article{Bunk:1998rm,
-     author    = "Bunk, B. and Elser, S. and Frezzotti, R. and Jansen,
-                  K.",
-     title     = "Ordering monomial factors of polynomials in the product
-                  representation",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "118",
-     year      = "1999",
-     pages     = "95-109",
-     eprint    = "hep-lat/9805026",
-     SLACcitation  = "%%CITATION = HEP-LAT 9805026;%%"
-}
-@Article{Burrage:1998a,
-  author       = " K. Burrage and J. Erhel",
-  title        = "On the performance of various adaptive preconditioned GMRES strategies",
-  journal      = "Num. Lin. Alg. with Appl.",
-  year         = "1998",
-  volume       = "5",
-  pages        = "101-121"
-}
-@Article{Campbell:1987nv,
-     author    = "Campbell, N. A. and Huntley, A. and Michael, C.",
-     title     = "Heavy quark potentials and hybrid mesons from SU(3) lattice
-                  gauge theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B306",
-     year      = "1988",
-     pages     = "51",
-     SLACcitation  = "%%CITATION = NUPHA,B306,51;%%"
-}
-@Article{Capitani:2005jp,
-     author    = "Capitani, S. and others",
-     title     = "Parton distribution functions with twisted mass fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B639",
-     year      = "2006",
-     pages     = "520-526",
-     eprint    = "hep-lat/0511013",
-     SLACcitation  = "%%CITATION = HEP-LAT 0511013;%%"
-}
-@Article{Chen:2003im,
-     author    = "Chen, Y. and others",
-     title     = "Chiral logarithms in quenched {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "034502",
-     eprint    = "hep-lat/0304005",
-     SLACcitation  = "%%CITATION = HEP-LAT 0304005;%%"
-}
-@Book{Cheng:2000ct,
-     author    = "Cheng, T. P. and Li, L. F.",
-     title     = "Gauge theory of elementary particle physics: Problems and
-                  solutions",
-     publisher = "Oxford, UK: Clarendon",
-     year      = "2000",
-     pages     = "306",
-     edition   = "",
-}
-@Article{Chetyrkin:1990kr,
-     author    = "Chetyrkin, K. G. and K{\"u}hn, Johann H.",
-     title     = "{Mass corrections to the Z decay rate}",
-     journal   = "Phys. Lett.",
-     volume    = "B248",
-     year      = "1990",
-     pages     = "359-364",
-     SLACcitation  = "%%CITATION = PHLTA,B248,359;%%"
-}
-@Article{Chetyrkin:1996cf,
-     author    = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.",
-     title     = "{Three-loop polarization function and O(alpha(s)**2)
-                  corrections to the  production of heavy quarks}",
-     journal   = "Nucl. Phys.",
-     volume    = "B482",
-     year      = "1996",
-     pages     = "213-240",
-     eprint    = "hep-ph/9606230",
-     SLACcitation  = "%%CITATION = HEP-PH/9606230;%%"
-}
-@Article{Chetyrkin:1997mb,
-     author    = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.",
-     title     = "{Heavy quark current correlators to O(alpha(s)**2)}",
-     journal   = "Nucl. Phys.",
-     volume    = "B505",
-     year      = "1997",
-     pages     = "40-64",
-     eprint    = "hep-ph/9705254",
-     SLACcitation  = "%%CITATION = HEP-PH/9705254;%%"
-}
-@Article{Chetyrkin:1998ix,
-     author    = "Chetyrkin, K. G. and Harlander, R. and Steinhauser, M.",
-     title     = "{Singlet polarization functions at O(alpha(s)**2)}",
-     journal   = "Phys. Rev.",
-     volume    = "D58",
-     year      = "1998",
-     pages     = "014012",
-     eprint    = "hep-ph/9801432",
-     SLACcitation  = "%%CITATION = HEP-PH/9801432;%%"
-}
-@Article{Chetyrkin:2000zk,
-     author    = "Chetyrkin, K. G. and Harlander, R. V. and K{\"u}hn, Johann H.",
-     title     = "{Quartic mass corrections to R(had) at O(alpha(s)**3)}",
-     journal   = "Nucl. Phys.",
-     volume    = "B586",
-     year      = "2000",
-     pages     = "56-72",
-     eprint    = "hep-ph/0005139",
-     SLACcitation  = "%%CITATION = HEP-PH/0005139;%%"
-}
-@Article{Chetyrkin:2006xg,
-     author    = "Chetyrkin, K. G. and K{\"u}hn, J. H. and Sturm, C.",
-     title     = "{Four-loop moments of the heavy quark vacuum polarization
-                  function in  perturbative QCD}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C48",
-     year      = "2006",
-     pages     = "107-110",
-     eprint    = "hep-ph/0604234",
-     SLACcitation  = "%%CITATION = HEP-PH/0604234;%%"
-}
-@Article{Chiarappa:2004ry,
-     author    = "Chiarappa, T. and others",
-     title     = "{Comparing iterative methods for overlap and twisted mass
-                   fermions}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "853-855",
-     eprint    = "hep-lat/0409107",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/j.nuclphysbps.2004.11.281",
-     SLACcitation  = "%%CITATION = HEP-LAT/0409107;%%"
-}
-@Article{Chiarappa:2006ae,
-     author    = "Chiarappa, T. and others",
-     title     = "{Numerical simulation of {QCD} with u, d, s and c quarks in
-                  the twisted-mass {W}ilson formulation}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C50",
-     year      = "2007",
-     pages     = "373-383",
-     eprint    = "hep-lat/0606011",
-     archivePrefix = "arXiv",
-     doi       = "10.1140/epjc/s10052-006-0204-4",
-     SLACcitation  = "%%CITATION = HEP-LAT/0606011;%%"
-}
-@Article{Chiarappa:2006hz,
-     author    = "Chiarappa, T. and others",
-     title     = "{Iterative methods for overlap and twisted mass fermions}",
-     year      = "2008",
-     journal   = "Comput. Sci. Disc.",
-     volume    = "01",
-     pages     = "015001",
-     eprint    = "hep-lat/0609023",
-     archivePrefix = "arXiv",
-     SLACcitation  = "%%CITATION = HEP-LAT/0609023;%%"
-}
-@Article{Cichy:2008gk,
-     author    = "Cichy, K. and Gonzalez Lopez, J. and Jansen, K. and Kujawa,
-                  A. and Shindler, A.",
-     title     = "{Twisted Mass, Overlap and Creutz Fermions: Cut-off Effects
-                  at Tree-level of Perturbation Theory}",
-     journal   = "Nucl. Phys.",
-     volume    = "B800",
-     year      = "2008",
-     pages     = "94-108",
-     eprint    = "0802.3637",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     doi       = "10.1016/j.nuclphysb.2008.03.004",
-     SLACcitation  = "%%CITATION = 0802.3637;%%"
-}
-@Article{Clark:2004cq,
-     author    = "Clark, M. A. and Kennedy, A. D.",
-     title     = "Accelerating fermionic molecular dynamics",
-     year      = "2004",
-     eprint    = "hep-lat/0409134",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409134;%%"
-}
-
-@Article{Clark:2005sq,
-     author    = "Clark, M. A. and de Forcrand, Ph. and Kennedy, A. D.",
-     title     = "Algorithm shootout: R versus RHMC",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2005",
-     pages     = "115",
-     eprint    = "hep-lat/0510004",
-     SLACcitation  = "%%CITATION = HEP-LAT 0510004;%%"
-}
-@Article{Clark:2006fx,
-     author    = "Clark, M. A. and Kennedy, A. D.",
-     title     = "Accelerating dynamical fermion computations using the
-                  rational hybrid {Monte} {Carlo} ({RHMC}) algorithm with multiple
-                  pseudofermion fields",
-     year      = "2006",
-     eprint    = "hep-lat/0608015",
-     SLACcitation  = "%%CITATION = HEP-LAT 0608015;%%"
-}
-@Article{Colangelo:2001df,
-     author    = "Colangelo, G. and Gasser, J. and Leutwyler, H.",
-     title     = "{pi pi scattering}",
-     journal   = "Nucl. Phys.",
-     volume    = "B603",
-     year      = "2001",
-     pages     = "125-179",
-     eprint    = "hep-ph/0103088",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/S0550-3213(01)00147-X",
-     SLACcitation  = "%%CITATION = HEP-PH/0103088;%%"
-}
-@Article{Colangelo:2003hf,
-     author    = "Colangelo, Gilberto and D{\"u}rr, Stephan",
-     title     = "The pion mass in finite volume",
-     journal   = "Eur. Phys. J.",
-     volume    = "C33",
-     year      = "2004",
-     pages     = "543-553",
-     eprint    = "hep-lat/0311023",
-     SLACcitation  = "%%CITATION = HEP-LAT/0311023;%%"
-}
-@Article{Colangelo:2005gd,
-     author    = "Colangelo, Gilberto and D{\"u}rr, Stephan and Haefeli,
-                  Christoph",
-     title     = "Finite volume effects for meson masses and decay
-                  constants",
-     journal   = "Nucl. Phys.",
-     volume    = "B721",
-     year      = "2005",
-     pages     = "136-174",
-     eprint    = "hep-lat/0503014",
-     SLACcitation  = "%%CITATION = HEP-LAT 0503014;%%"
-}
-@Article{Colangelo:2006mp,
-     author    = "Colangelo, Gilberto and Haefeli, Christoph",
-     title     = "{Finite volume effects for the pion mass at two loops}",
-     journal   = "Nucl. Phys.",
-     volume    = "B744",
-     year      = "2006",
-     pages     = "14-33",
-     eprint    = "hep-lat/0602017",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/j.nuclphysb.2006.03.010",
-     SLACcitation  = "%%CITATION = HEP-LAT/0602017;%%"
-}
-@Book{Collins:1994ab,
-     author    = "Collins, J.C.",
-     title     = "Renormalisation",
-     publisher = "Cambridge University Press",
-     series    = "Cambridge Monographs on Mathematical Physics",
-     year      = "1994",
-     edition   = "",
-}
-@Article{Creutz:1984fj,
-     author    = "Creutz, M. and Gocksch, A. and Ogilvie, M. and
-                  Okawa, M.",
-     title     = "Microcanonical renormalization group",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "53",
-     year      = "1984",
-     pages     = "875",
-     SLACcitation  = "%%CITATION = PRLTA,53,875;%%"
-}
-@Article{Creutz:1989wt,
-     author    = "Creutz, M. and Gocksch, A.",
-     title     = "Higher order hybrid monte carlo algorithms",
-     note     = "BNL-42601"
-}
-@Article{Creutz:1996bg,
-     author    = "Creutz, Michael",
-     title     = "Wilson fermions at finite temperature",
-     year      = "1996",
-     eprint    = "hep-lat/9608024",
-     SLACcitation  = "%%CITATION = HEP-LAT 9608024;%%"
-}
-@Article{Creutz:1998ee,
-     author    = "Creutz, M.",
-     title     = "Evaluating Grassmann integrals",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "81",
-     year      = "1998",
-     pages     = "3555-3558",
-     eprint    = "hep-lat/9806037",
-     SLACcitation  = "%%CITATION = HEP-LAT 9806037;%%"
-}
-@Article{Cundy:2005pi,
-     author    = "Cundy, N. and others",
-     title     = "Numerical Methods for the {QCD} Overlap Operator IV: Hybrid
-                  Monte Carlo",
-     year      = "2005",
-     eprint    = "hep-lat/0502007",
-     SLACcitation  = "%%CITATION = HEP-LAT 0502007;%%"
-}
-@Article{David:1984ys,
-     author    = "David, F. and Hamber, H. W.",
-     title     = "Chiral condensate with {Wilson} fermions",
-     journal   = "Nucl. Phys.",
-     volume    = "B248",
-     year      = "1984",
-     pages     = "381",
-     SLACcitation  = "%%CITATION = NUPHA,B248,381;%%"
-}
-@Article{Davies:2008sw,
-     author    = "Davies, C. T. H. and others",
- collaboration = "HPQCD",
-     title     = "{Update: Accurate Determinations of $\alpha_s$ from
-                  Realistic Lattice QCD}",
-     year      = "2008",
-     eprint    = "0807.1687",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0807.1687;%%"
-}
-@Article{DeGrand:1990dk,
-     author    = "DeGrand, T. A. and Rossi, P.",
-     title     = "Conditioning techniques for dynamical fermions",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "60",
-     year      = "1990",
-     pages     = "211-214",
-     SLACcitation  = "%%CITATION = CPHCB,60,211;%%"
-}
-@Article{DeGrand:1990ip,
-     author    = "DeGrand, T. A.",
-     title     = "Resonance masses from Monte Carlo simulations (with
-                  emphasis on the rho meson)",
-     journal   = "Phys. Rev.",
-     volume    = "D43",
-     year      = "1991",
-     pages     = "2296-2300",
-     SLACcitation  = "%%CITATION = PHRVA,D43,2296;%%"
-}
-@Article{DeGrand:2002vu,
-     author    = "DeGrand, Thomas and Hasenfratz, Anna and Kovacs, Tamas G.",
-     title     = "Improving the chiral properties of lattice fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D67",
-     year      = "2003",
-     pages     = "054501",
-     eprint    = "hep-lat/0211006",
-     SLACcitation  = "%%CITATION = HEP-LAT 0211006;%%"
-}
-@Article{DeTar:2007ni,
-     author    = "DeTar, Carleton and Levkova, L.",
-     title     = "Effects of the disconnected flavor singlet corrections on
-                  the hyperfine splitting in charmonium",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "116",
-     eprint    = "0710.1322",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = ARXIV:0710.1322;%%"
-}
-@Article{DelDebbio:2006cn,
-     author    = "Del Debbio, L. and Giusti, L. and Luscher, M. and
-                  Petronzio, R. and Tantalo, N.",
-     title     = "QCD with light Wilson quarks on fine lattices. I: First
-                  experiences and physics results",
-     journal   = "JHEP",
-     volume    = "02",
-     year      = "2007",
-     pages     = "056",
-     eprint    = "hep-lat/0610059",
-     SLACcitation  = "%%CITATION = HEP-LAT 0610059;%%"
-}
-@Article{DellaMorte:2000yp,
-     author    = "Della Morte, M. and Frezzotti, R. and Heitger, J. and Sint,
-                  S.",
-     title     = "Non-perturbative scaling tests of twisted mass {QCD}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "94",
-     year      = "2001",
-     pages     = "617-621",
-     eprint    = "hep-lat/0010091",
-     SLACcitation  = "%%CITATION = HEP-LAT 0010091;%%"
-}
-@Article{DellaMorte:2001tu,
-     author    = "Della Morte, M. and Frezzotti, R. and Heitger, J.",
-     title     = "Quenched twisted mass {QCD} at small quark masses and in
-                  large volume",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "260-262",
-     eprint    = "hep-lat/0110166",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110166;%%"
-}
-
-@Article{DellaMorte:2001ys,
-     author    = "Della Morte, M. and Frezzotti, R. and Heitger,
-                  J. and Sint, S.",
- collaboration = "ALPHA",
-     title     = "Cutoff effects in twisted mass lattice {QCD}",
-     journal   = "JHEP",
-     volume    = "10",
-     year      = "2001",
-     pages     = "041",
-     eprint    = "hep-lat/0108019",
-     SLACcitation  = "%%CITATION = HEP-LAT 0108019;%%"
-}                                                                               
-@Article{DellaMorte:2003jj,
-     author    = "Della Morte, M. and others",
- collaboration = "ALPHA",
-     title     = "Simulating the Schroedinger functional with two pseudo-
-                  fermions",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "156",
-     year      = "2003",
-     pages     = "62-72",
-     eprint    = "hep-lat/0307008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0307008;%%"
-}                                                                               
-@Article{DellaMorte:2003mn,
-     author    = "Della Morte, M. and others",
- collaboration = "ALPHA",
-     title     = "Lattice HQET with exponentially improved statistical
-                  precision",
-     journal   = "Phys. Lett.",
-     volume    = "B581",
-     year      = "2004",
-     pages     = "93-98",
-     eprint    = "hep-lat/0307021",
-     SLACcitation  = "%%CITATION = HEP-LAT 0307021;%%"
-}             
-@Article{DellaMorte:2003mw,
-     author    = "Della Morte, M. and others",
- collaboration = "ALPHA",
-     title     = "Static quarks with improved statistical precision",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "129",
-     year      = "2004",
-     pages     = "346-348",
-     eprint    = "hep-lat/0309080",
-     SLACcitation  = "%%CITATION = HEP-LAT 0309080;%%"
-}                                                                  
-@Article{DellaMorte:2005yc,
-     author    = "Della Morte, M. and Shindler, A. and Sommer,
-                  R.",
-     title     = "On lattice actions for static quarks",
-     year      = "2005",
-     eprint    = "hep-lat/0506008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0506008;%%"
-}
-@Article{Dimopoulos:2006dm,
-     author    = "Dimopoulos, P. and others",
- collaboration = "ALPHA",
-     title     = "A precise determination of B(K) in quenched QCD",
-     journal   = "Nucl. Phys.",
-     volume    = "B749",
-     year      = "2006",
-     pages     = "69-108",
-     eprint    = "hep-ph/0601002",
-     SLACcitation  = "%%CITATION = HEP-PH 0601002;%%"
-}
-@Article{Dimopoulos:2007fn,
-     author    = "Dimopoulos, P. and others",
-     title     = "{Renormalisation of quark bilinears with Nf=2 Wilson
-                  fermions and tree-level improved gauge action}",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "241",
-     eprint    = "0710.0975",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0710.0975;%%"
-}
-@Article{Dimopoulos:2007qy,
-     author    = "Dimopoulos, Petros and Frezzotti, Roberto and Herdoiza,
-                  Gregorio and Urbach, Carsten and Wenger, Urs",
- collaboration = "ETM",
-     title     = "{Scaling and low energy constants in lattice QCD with N_f=2
-                  maximally twisted Wilson quarks}",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "102",
-     eprint    = "0710.2498",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0710.2498;%%"
-}
-@Article{Dimopoulos:2008sy,
-     author    = "Dimopoulos, Petros and others",
- collaboration = "ETM",
-     title     = "{Scaling and chiral extrapolation of pion mass and decay
-                  constant with maximally twisted mass QCD}",
-     year      = "2008",
-     eprint    = "0810.2873",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0810.2873;%%"
-}
-@Article{Dong:2001fm,
-     author    = "Dong, S. J. and others",
-     title     = "Chiral properties of pseudoscalar mesons on a quenched
-                  20**4 lattice  with overlap fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D65",
-     year      = "2002",
-     pages     = "054507",
-     eprint    = "hep-lat/0108020",
-     SLACcitation  = "%%CITATION = HEP-LAT 0108020;%%"
-}
-@Article{Duane:1987de,
-     author    = "Duane, S. and Kennedy, A. D. and Pendleton, B. J. and
-                  Roweth, D.",
-     title     = "{H}ybrid monte carlo",
-     journal   = "Phys. Lett.",
-     volume    = "B195",
-     year      = "1987",
-     pages     = "216-222",
-     SLACcitation  = "%%CITATION = PHLTA,B195,216;%%"
-}
-@Article{Edwards:1996vs,
-     author    = "Edwards, R. G. and Horvath, I. and Kennedy, A. D.",
-     title     = "Instabilities and non-reversibility of molecular dynamics
-                  trajectories",
-     journal   = "Nucl. Phys.",
-     volume    = "B484",
-     year      = "1997",
-     pages     = "375-402",
-     eprint    = "hep-lat/9606004",
-     SLACcitation  = "%%CITATION = HEP-LAT 9606004;%%"
-}
-@Article{Edwards:2004sx,
-     author    = "Edwards, Robert G. and Joo, Balint",
- collaboration = "SciDAC",
-     title     = "The {Chroma} software system for lattice {QCD}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "832",
-     eprint    = "hep-lat/0409003",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409003;%%"
-}
-@Article{Eichten:1989zv,
-     author    = "Eichten, E. and Hill, B.",
-     title     = "An effective field theory for the calculation of matrix
-                  elements involving heavy quarks",
-     journal   = "Phys. Lett.",
-     volume    = "B234",
-     year      = "1990",
-     pages     = "511",
-     SLACcitation  = "%%CITATION = PHLTA,B234,511;%%"
-}
-@Article{Farchioni:2002vn,
-     author    = "Farchioni, F. and Gebert, C. and Montvay, I.
-                  and Scorzato, L.",
-     title     = "Numerical simulation tests with light dynamical quarks",
-     journal   = "Eur. Phys. J.",
-     volume    = "C26",
-     year      = "2002",
-     pages     = "237-251",
-     eprint    = "hep-lat/0206008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0206008;%%"
-}
-@Article{Farchioni:2004fs,
-     author    = "Farchioni, F. and others",
-     title     = "The phase structure of lattice {QCD} with {Wilson} quarks and
-                  renormalization group improved gluons",
-     journal   = "Eur. Phys. J.",
-     volume    = "C42",
-     year      = "2005",
-     pages     = "73-87",
-     eprint    = "hep-lat/0410031",
-     SLACcitation  = "%%CITATION = HEP-LAT 0410031;%%"
-}
-@Article{Farchioni:2004ma,
-     author    = "Farchioni, F. and others",
-     title     = "Exploring the phase structure of lattice {{QCD}} with twisted
-                  mass quarks",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "240-245",
-     eprint    = "hep-lat/0409098",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409098;%%"
-}
-@Article{Farchioni:2004us,
-     author    = "Farchioni, F. and others",
-     title     = "Twisted mass quarks and the phase structure of lattice
-                  {QCD}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C39",
-     year      = "2005",
-     pages     = "421-433",
-     eprint    = "hep-lat/0406039",
-     SLACcitation  = "%%CITATION = HEP-LAT 0406039;%%"
-}
-@Article{Farchioni:2005ec,
-     author    = "Farchioni, Federico and others",
-     title     = "Dynamical twisted mass fermions",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "072",
-     eprint    = "hep-lat/0509131",
-     SLACcitation  = "%%CITATION = HEP-LAT 0509131;%%"
-}
-@Article{Farchioni:2005hf,
-     author    = "Farchioni, F. and others",
-     title     = "Twisted mass fermions: Neutral pion masses from
-                  disconnected contributions",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "033",
-     eprint    = "hep-lat/0509036",
-     SLACcitation  = "%%CITATION = HEP-LAT 0509036;%%"
-}
-@Article{Farchioni:2005tu,
-     author    = "Farchioni, F. and others",
-     title     = "Lattice spacing dependence of the first order phase
-                  transition for  dynamical twisted mass fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B624",
-     year      = "2005",
-     pages     = "324-333",
-     eprint    = "hep-lat/0506025",
-     SLACcitation  = "%%CITATION = HEP-LAT 0506025;%%"
-}
-@Article{Feldmann:1999uf,
-     author    = "Feldmann, Thorsten",
-     title     = "{Quark structure of pseudoscalar mesons}",
-     journal   = "Int. J. Mod. Phys.",
-     volume    = "A15",
-     year      = "2000",
-     pages     = "159-207",
-     eprint    = "hep-ph/9907491",
-     SLACcitation  = "%%CITATION = HEP-PH/9907491;%%"
-}
-@Article{Feynman:1948aa,
-     author    = "Feynman, R. P.",
-     title     = "Space-time approach to non-relativistic quantum mechanics",
-     journal   = "Rev. Mod. Phys.",
-     volume    = "20",
-     year      = "1948",
-     pages     = "367-387",
-     SLACcitation  = "%%CITATION = RMPHA,20,367;%%"
-}
-@Article{Fischer:1996th,
-     author    = "Fischer, S. and others",
-     title     = "A Parallel SSOR Preconditioner for Lattice {QCD}",
-     journal   = "Comp. Phys. Commun.",
-     volume    = "98",
-     year      = "1996",
-     pages     = "20-34",
-     eprint    = "hep-lat/9602019",
-     SLACcitation  = "%%CITATION = HEP-LAT 9602019;%%"
-}
-@Article{Fokkema:1998aa,
-     author    = "Fokkema, D.~R. and Sleijpen, G.~L.~G. and Van~der~Vorst, H.~A.",
-     title     = "{J}acobi-{D}avidson style {QR} and {QZ} algorithms for
-                  the reduction of matrix pencils",
-     journal   = "J. Sci. Comput.",
-     volume    = "20",
-     year      = "1998",
-     pages     = "94-125",
-}
-@Article{Foster:1998vw,
-     author    = "Foster, M. and Michael, C.",
-     collaboration = "UKQCD",
-     title     = "Quark mass dependence of hadron masses from lattice {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D59",
-     year      = "1999",
-     pages     = "074503",
-     eprint    = "hep-lat/9810021",
-     SLACcitation  = "%%CITATION = HEP-LAT 9810021;%%"
-}
-@Article{Freund,
-     author    = "Freund, R.W.",
-     journal   = "in Numerical Linear Algebra, L.\ Reichel, A.\ Ruttan and R.S.\ Varga (eds.)",
-     year      = "1993",
-     pages     = "p. 101",
-}
-@Article{Frezzotti:1997ym,
-     author    = "Frezzotti, R. and Jansen, K.",
-     title     = "A polynomial hybrid Monte Carlo algorithm",
-     journal   = "Phys. Lett.",
-     volume    = "B402",
-     year      = "1997",
-     pages     = "328-334",
-     eprint    = "hep-lat/9702016",
-     SLACcitation  = "%%CITATION = HEP-LAT 9702016;%%"
-}
-@Article{Frezzotti:1998eu,
-     author    = "Frezzotti, R. and Jansen, K.",
-     title     = "The {PHMC} algorithm for simulations of dynamical fermions.
-                  {I}: Description and properties",
-     journal   = "Nucl. Phys.",
-     volume    = "B555",
-     year      = "1999",
-     pages     = "395-431",
-     eprint    = "hep-lat/9808011",
-     SLACcitation  = "%%CITATION = HEP-LAT 9808011;%%"
-}
-@ArticleF{Frezzotti:1998yp,
-     author    = "Frezzotti, R. and Jansen, K.",
-     title     = "The {PHMC} algorithm for simulations of dynamical fermions.
-                  {II}:  Performance analysis",
-     journal   = "Nucl. Phys.",
-     volume    = "B555",
-     year      = "1999",
-     pages     = "432-453",
-     eprint    = "hep-lat/9808038",
-     SLACcitation  = "%%CITATION = HEP-LAT 9808038;%%"
-}
-@Article{Frezzotti:1999vv,
-     author    = "Frezzotti, R. and Grassi, P. A. and Sint,
-                  S. and Weisz, P.",
-     title     = "A local formulation of lattice {QCD} without unphysical
-                  fermion zero modes",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "83",
-     year      = "2000",
-     pages     = "941-946",
-     eprint    = "hep-lat/9909003",
-     SLACcitation  = "%%CITATION = HEP-LAT 9909003;%%"
-}
-@Article{Frezzotti:2000nk,
-     author    = "Frezzotti, R. and Grassi, P. A. and Sint,
-                  S. and Weisz, P.",
- collaboration = "ALPHA",
-     title     = "Lattice {QCD} with a chirally twisted mass term",
-     journal   = "JHEP",
-     volume    = "08",
-     year      = "2001",
-     pages     = "058",
-     eprint    = "hep-lat/0101001",
-     SLACcitation  = "%%CITATION = HEP-LAT 0101001;%%"
-}
-@Article{Frezzotti:2001du,
-     author    = "Frezzotti, R. and Sint, S.",
-     title     = "Some remarks on {O(a)} improved twisted mass {QCD}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "814-816",
-     eprint    = "hep-lat/0110140",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110140;%%"
-}
-@Article{Frezzotti:2001ea,
-     author    = "Frezzotti, R. and Sint, S. and Weisz, P.",
- collaboration = "ALPHA",
-     title     = "{O(a)} improved twisted mass lattice {QCD}",
-     journal   = "JHEP",
-     volume    = "07",
-     year      = "2001",
-     pages     = "048",
-     eprint    = "hep-lat/0104014",
-     SLACcitation  = "%%CITATION = HEP-LAT 0104014;%%"
-}
-@Article{Frezzotti:2003ni,
-     author    = "Frezzotti, R. and Rossi, G. C.",
-     title     = "Chirally improving {Wilson} fermions. {I}: {O(a)} improvement",
-     journal   = "JHEP",
-     volume    = "08",
-     year      = "2004",
-     pages     = "007",
-     eprint    = "hep-lat/0306014",
-     SLACcitation  = "%%CITATION = HEP-LAT 0306014;%%"
-}
-@Article{Frezzotti:2003xj,
-     author    = "Frezzotti, R. and Rossi, G. C.",
-     title     = "Twisted-mass lattice {QCD} with mass non-degenerate quarks",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "128",
-     year      = "2004",
-     pages     = "193-202",
-     eprint    = "hep-lat/0311008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0311008;%%"
-}
-@Article{Frezzotti:2004wz,
-     author    = "Frezzotti, R. and Rossi, G. C.",
-     title     = "Chirally improving {Wilson} fermions. {II}: Four-quark
-                  operators",
-     journal   = "JHEP",
-     volume    = "10",
-     year      = "2004",
-     pages     = "070",
-     eprint    = "hep-lat/0407002",
-     SLACcitation  = "%%CITATION = HEP-LAT 0407002;%%"
-}
-@Article{Frezzotti:2005gi,
-     author    = "Frezzotti, R. and Martinelli, G. and Papinutto, M. and
-                  Rossi, G. C.",
-     title     = "Reducing cutoff effects in maximally twisted lattice {QCD}
-                  close to the  chiral limit",
-     journal   = "JHEP",
-     volume    = "04",
-     year      = "2006",
-     pages     = "038",
-     eprint    = "hep-lat/0503034",
-     SLACcitation  = "%%CITATION = HEP-LAT 0503034;%%"
-}
-@Article{Frezzotti:2007qv,
-     author    = "Frezzotti, R. and Rossi, G.",
-     title     = "{O(a^2) cutoff effects in Wilson fermion simulations}",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "277",
-     eprint    = "0710.2492",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0710.2492;%%"
-}
-@Article{Frezzotti:2008dr,
-     author    = "Frezzotti, R. and Lubicz, V. and Simula, S.",
- collaboration = "ETM",
-     title     = "{Electromagnetic form factor of the pion from twisted-mass
-                  lattice {QCD} at {Nf}=2}",
-     year      = "2008",
-     eprint    = "0812.4042",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0812.4042;%%"
-}
-@Article{Fritzsch:1973pi,
-     author    = "Fritzsch, H. and Gell-Mann, M. and Leutwyler, H.",
-     title     = "Advantages of the color octet gluon picture",
-     journal   = "Phys. Lett.",
-     volume    = "B47",
-     year      = "1973",
-     pages     = "365-368",
-     SLACcitation  = "%%CITATION = PHLTA,B47,365;%%"
-}
-@Article{Frommer:1994vn,
-     author    = "Frommer, A. and Hannemann, V. and Nockel, B. and Lippert,
-                  T. and Schilling, K.",
-     title     = "Accelerating {Wilson} fermion matrix inversions by means of
-                  the stabilized biconjugate gradient algorithm",
-     journal   = "Int. J. Mod. Phys.",
-     volume    = "C5",
-     year      = "1994",
-     pages     = "1073-1088",
-     eprint    = "hep-lat/9404013",
-     SLACcitation  = "%%CITATION = HEP-LAT 9404013;%%"
-}
-@Article{Frommer:1995ik,
-     author    = "Frommer, Andreas and Nockel, Bertold and Gusken, Stephan
-                  and Lippert, Thomas and Schilling, Klaus",
-     title     = "Many masses on one stroke: Economic computation of quark
-                  propagators",
-     journal   = "Int. J. Mod. Phys.",
-     volume    = "C6",
-     year      = "1995",
-     pages     = "627-638",
-     eprint    = "hep-lat/9504020",
-     SLACcitation  = "%%CITATION = HEP-LAT 9504020;%%"
-}
-@Article{Furman:1994ky,
-     author    = "Furman, V. and Shamir, Y.",
-     title     = "Axial symmetries in lattice QCD with Kaplan fermions",
-     journal   = "Nucl. Phys.",
-     volume    = "B439",
-     year      = "1995",
-     pages     = "54-78",
-     eprint    = "hep-lat/9405004",
-     SLACcitation  = "%%CITATION = HEP-LAT 9405004;%%"
-}
-@Article{Garden:1999fg,
-     author    = "Garden, J. and Heitger, J. and Sommer, R. and
-                  Wittig H.",
- collaboration = "ALPHA",
-     title     = "Precision computation of the strange quark's mass in
-                  quenched {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B571",
-     year      = "2000",
-     pages     = "237-256",
-     eprint    = "hep-lat/9906013",
-     SLACcitation  = "%%CITATION = HEP-LAT 9906013;%%"
-}
-@Article{Garron:2003cb,
-     author    = "Garron, N. and Giusti, L. and Hoelbling,
-                  C. and Lellouch, L. and Rebbi, C.",
-     title     = "B(K) from quenched {QCD} with exact chiral symmetry",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "92",
-     year      = "2004",
-     pages     = "042001",
-     eprint    = "hep-ph/0306295",
-     SLACcitation  = "%%CITATION = HEP-PH 0306295;%%"
-}
-@Article{Gasser:1982ap,
-     author    = "Gasser, J. and Leutwyler, H.",
-     title     = "Quark masses",
-     journal   = "Phys. Rept.",
-     volume    = "87",
-     year      = "1982",
-     pages     = "77-169",
-     SLACcitation  = "%%CITATION = PRPLC,87,77;%%"
-}
-
-@Article{Gasser:1983yg,
-     author    = "Gasser, J. and Leutwyler, H.",
-     title     = "Chiral perturbation theory to one loop",
-     journal   = "Ann. Phys.",
-     volume    = "158",
-     year      = "1984",
-     pages     = "142",
-     SLACcitation  = "%%CITATION = APNYA,158,142;%%"
-}
-@Article{Gasser:1985gg,
-     author    = "Gasser, J. and Leutwyler, H.",
-     title     = "Chiral perturbation theory: expansions in the mass of the
-                  strange quark",
-     journal   = "Nucl. Phys.",
-     volume    = "B250",
-     year      = "1985",
-     pages     = "465",
-     SLACcitation  = "%%CITATION = NUPHA,B250,465;%%"
-}
-@Article{Gasser:1986vb,
-     author    = "Gasser, J. and Leutwyler, H.",
-     title     = "LIGHT QUARKS AT LOW TEMPERATURES",
-     journal   = "Phys. Lett.",
-     volume    = "B184",
-     year      = "1987",
-     pages     = "83",
-     SLACcitation  = "%%CITATION = PHLTA,B184,83;%%"
-}
-@Article{Gattringer:2003qx,
-     author    = "Gattringer, C. and others",
- collaboration = "BGR",
-     title     = "Quenched spectroscopy with fixed-point and chirally
-                  improved fermions",
-     journal   = "Nucl. Phys.",
-     volume    = "B677",
-     year      = "2004",
-     pages     = "3-51",
-     eprint    = "hep-lat/0307013",
-     SLACcitation  = "%%CITATION = HEP-LAT 0307013;%%"
-}
-@Article{Gell-Mann:1964nj,
-     author    = "Gell-Mann, M.",
-     title     = "A Schematic model of baryons and mesons",
-     journal   = "Phys. Lett.",
-     volume    = "8",
-     year      = "1964",
-     pages     = "214-215",
-     SLACcitation  = "%%CITATION = PHLTA,8,214;%%"
-}
-@Article{Gell-Mann:1968rz,
-     author    = "Gell-Mann, M. and Oakes, R. J. and Renner, B.",
-     title     = "Behavior of current divergences under SU(3) x SU(3)",
-     journal   = "Phys. Rev.",
-     volume    = "175",
-     year      = "1968",
-     pages     = "2195-2199",
-     SLACcitation  = "%%CITATION = PHRVA,175,2195;%%"
-}
-@PhdThesis{Geus:2002,
-  author = 	 {R. Geus},
-  title = 	 {The Jacobi-Davidson algorithm for solving large
-                  sparse symmetric eigenvalue problems with
-                  application to the design of accelerator cavities}, 
-  school = 	 {Swiss Federal Institute Of Technology Z{\"u}rich},
-  year = 	 {2002},
-  OPTkey = 	 {DISS. ETH NO. 14734},
-  OPTtype = 	 {},
-  OPTaddress = 	 {},
-  OPTmonth = 	 {},
-  OPTnote = 	 {},
-  OPTannote = 	 {}
-}
-@Article{Gimenez:1998ue,
-     author    = "Gimenez, V. and Giusti, L. and Rapuano, F. and Talevi, M.",
-     title     = "Non-perturbative renormalization of quark bilinears",
-     journal   = "Nucl. Phys.",
-     volume    = "B531",
-     year      = "1998",
-     pages     = "429-445",
-     eprint    = "hep-lat/9806006",
-     SLACcitation  = "%%CITATION = HEP-LAT 9806006;%%"
-}
-@Article{Gimenez:2005nt,
-     author    = "Gimenez, V. and Lubicz, V. and Mescia, F. and Porretti, V.
-                  and Reyes, J.",
-     title     = "{Operator product expansion and quark condensate from
-                  lattice QCD in  coordinate space}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C41",
-     year      = "2005",
-     pages     = "535-544",
-     eprint    = "hep-lat/0503001",
-     SLACcitation  = "%%CITATION = HEP-LAT/0503001;%%"
-}
-@Article{Ginsparg:1981bj,
-     author    = "Ginsparg, P. H. and {Wilson}, K. G.",
-     title     = "A remnant of chiral symmetry on the lattice",
-     journal   = "Phys. Rev.",
-     volume    = "D25",
-     year      = "1982",
-     pages     = "2649",
-     SLACcitation  = "%%CITATION = PHRVA,D25,2649;%%"
-}
-@Article{Giusti:1998wy,
-     author    = "Giusti, L. and Rapuano, F. and Talevi, M. and Vladikas, A.
-                  ",
-     title     = "The QCD chiral condensate from the lattice",
-     journal   = "Nucl. Phys.",
-     volume    = "B538",
-     year      = "1999",
-     pages     = "249-277",
-     eprint    = "hep-lat/9807014",
-     SLACcitation  = "%%CITATION = HEP-LAT 9807014;%%"
-}
-@Article{Giusti:2001pk,
-     author    = "Giusti, L. and Hoelbling, C. and Rebbi, C.",
-     title     = "Light quark masses with overlap fermions in quenched {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D64",
-     year      = "2001",
-     pages     = "114508",
-     eprint    = "hep-lat/0108007",
-     note      = "Erratum-ibid.D65:079903,2002",
-     SLACcitation  = "%%CITATION = HEP-LAT 0108007;%%"
-}
-@Article{Giusti:2002sm,
-     author    = "Giusti, L. and Hoelbling, C. and L{\"u}scher, M. and Wittig, H.
-                  ",
-     title     = "Numerical techniques for lattice QCD in the epsilon-
-                  regime",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "153",
-     year      = "2003",
-     pages     = "31-51",
-     eprint    = "hep-lat/0212012",
-     SLACcitation  = "%%CITATION = HEP-LAT 0212012;%%"
-}
-@Article{Giusti:2007hk,
-     author    = "Giusti, Leonardo",
-     title     = "Light dynamical fermions on the lattice: Toward the chiral
-                  regime of QCD",
-     journal   = "PoS.",
-     volume    = "LAT2006",
-     year      = "2007",
-     pages     = "",
-     eprint    = "hep-lat/0702014",
-     SLACcitation  = "%%CITATION = HEP-LAT/0702014;%%"
-}
-@Article{Glassner:1996gz,
-     author    = "Gl{\"a}ssner, U. and others",
-     title     = "How to compute {G}reen's functions for entire mass
-                  trajectories within {K}rylov solvers",
-     year      = "1996",
-     eprint    = "hep-lat/9605008",
-     SLACcitation  = "%%CITATION = HEP-LAT 9605008;%%"
-}
-@Article{Gockeler:1998fn,
-     author    = "G{\"o}ckeler, M. and others",
-     title     = "Scaling of non-perturbatively {O(a)} improved {Wilson}
-                  fermions: Hadron  spectrum, quark masses and decay
-                  constants",
-     journal   = "Phys. Rev.",
-     volume    = "D57",
-     year      = "1998",
-     pages     = "5562-5580",
-     eprint    = "hep-lat/9707021",
-     SLACcitation  = "%%CITATION = HEP-LAT 9707021;%%"
-}
-@Article{Gorishnii:1990vf,
-     author    = "Gorishnii, S. G. and Kataev, A. L. and Larin, S. A.",
-     title     = "{The O (alpha-s**3) corrections to sigma-tot (e+ e- $\to$
-                  hadrons) and Gamma (tau- $\to$ tau-neutrino + hadrons) in
-                  QCD}",
-     journal   = "Phys. Lett.",
-     volume    = "B259",
-     year      = "1991",
-     pages     = "144-150",
-     SLACcitation  = "%%CITATION = PHLTA,B259,144;%%"
-}
-@Article{Greenberg:1964pe,
-     author    = "Greenberg, O. W.",
-     title     = "Spin and unitary spin independence in a paraquark model of
-                  baryons and mesons",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "13",
-     year      = "1964",
-     pages     = "598-602",
-     SLACcitation  = "%%CITATION = PRLTA,13,598;%%"
-}
-@Article{Gregory:2007ce,
-     author    = "Gregory, Eric B. and Irving, Alan and Richards, Chris M.
-                  and McNeile, Craig and Hart, Alistair",
-     title     = "Pseudoscalar Flavor-Singlet Physics with Staggered
-                  Fermions",
-     year      = "2007",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     eprint    = "0710.1725",
-     SLACcitation  = "%%CITATION = ARXIV:0710.1725;%%"
-}
-@Article{Gross:1973id,
-     author    = "Gross, D. J. and Wilczek, F.",
-     title     = "Ultraviolet behavior of non-Abelian gauge theories",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "30",
-     year      = "1973",
-     pages     = "1343-1346",
-     SLACcitation  = "%%CITATION = PRLTA,30,1343;%%"
-}
-@Article{Gross:1973ju,
-     author    = "Gross, D. J. and Wilczek, F.",
-     title     = "Asymptotically free gauge theories. 1",
-     journal   = "Phys. Rev.",
-     volume    = "D8",
-     year      = "1973",
-     pages     = "3633-3652",
-     SLACcitation  = "%%CITATION = PHRVA,D8,3633;%%"
-}
-@Article{Gross:1974jv,
-     author    = "Gross, D. J. and Neveu, A.",
-     title     = "Dynamical symmetry breaking in asymptotically free field
-                  theories",
-     journal   = "Phys. Rev.",
-     volume    = "D10",
-     year      = "1974",
-     pages     = "3235",
-     SLACcitation  = "%%CITATION = PHRVA,D10,3235;%%"
-}
-@Article{Guagnelli:1998ud,
-     author    = "Guagnelli, M. and Sommer, R. and Wittig, H.",
- collaboration = "ALPHA",
-     title     = "Precision computation of a low-energy reference scale in
-                  quenched  lattice {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B535",
-     year      = "1998",
-     pages     = "389-402",
-     eprint    = "hep-lat/9806005",
-     SLACcitation  = "%%CITATION = HEP-LAT 9806005;%%"
-}
-@Article{Guagnelli:2004ga,
-     author    = "Guagnelli, M. and others",
- collaboration = "Zeuthen-Rome (ZeRo)",
-     title     = "Non-perturbative pion matrix element of a twist-2 operator
-                  from the  lattice",
-     journal   = "Eur. Phys. J.",
-     volume    = "C40",
-     year      = "2005",
-     pages     = "69-80",
-     eprint    = "hep-lat/0405027",
-     SLACcitation  = "%%CITATION = HEP-LAT 0405027;%%"
-}
-@Article{Guagnelli:2004ww,
-     author    = "Guagnelli, M. and others",
- collaboration = "Zeuthen-Rome (ZeRo)",
-     title     = "Finite size effects of a pion matrix element",
-     journal   = "Phys. Lett.",
-     volume    = "B597",
-     year      = "2004",
-     pages     = "216-221",
-     eprint    = "hep-lat/0403009",
-     SLACcitation  = "%%CITATION = HEP-LAT 0403009;%%"
-}
-@Article{Guagnelli:2005zc,
-     author    = "Guagnelli, M. and Heitger, J. and Pena, C. and Sint, S. and
-                  Vladikas, A.",
- collaboration = "ALPHA",
-     title     = "Non-perturbative renormalization of left-left four-fermion
-                  operators in  quenched lattice QCD",
-     journal   = "JHEP",
-     volume    = "03",
-     year      = "2006",
-     pages     = "088",
-     eprint    = "hep-lat/0505002",
-     SLACcitation  = "%%CITATION = HEP-LAT 0505002;%%"
-}
-@Article{Gupta:1988js,
-     author    = "Gupta, R. and Kilcup, G. W. and Sharpe, S. R.
-                  ",
-     title     = "Tuning the hybrid monte carlo algorithm",
-     journal   = "Phys. Rev.",
-     volume    = "D38",
-     year      = "1988",
-     pages     = "1278",
-     SLACcitation  = "%%CITATION = PHRVA,D38,1278;%%"
-}
-@Article{Gupta:1989kx,
-     author    = "Gupta, R. and others",
-     title     = "{QCD} with dynamical {Wilson} fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D40",
-     year      = "1989",
-     pages     = "2072",
-     SLACcitation  = "%%CITATION = PHRVA,D40,2072;%%"
-}
-@Article{Gupta:1990ka,
-     author    = "Gupta, S. and Irback, A. and Karsch, F. and
-                  Petersson, B.",
-     title     = "The acceptance probability in the hybrid monte carlo
-                  method",
-     journal   = "Phys. Lett.",
-     volume    = "B242",
-     year      = "1990",
-     pages     = "437-443",
-     SLACcitation  = "%%CITATION = PHLTA,B242,437;%%"
-}
-@Article{Gupta:1991sn,
-     author    = "Gupta, R. and others",
-     title     = "{QCD} with dynamical {Wilson} fermions. 2",
-     journal   = "Phys. Rev.",
-     volume    = "D44",
-     year      = "1991",
-     pages     = "3272-3292",
-     SLACcitation  = "%%CITATION = PHRVA,D44,3272;%%"
-}
-@Unpublished{Gupta:1997nd,
-     author    = "Gupta, R.",
-     title     = "Introduction to lattice {QCD}",
-     year      = "1997",
-     eprint    = "hep-lat/9807028",
-     note      = "Lectures given at Les Houches Summer School in Theoretical Physics, Session 68",
-     SLACcitation  = "%%CITATION = HEP-LAT 9807028;%%"
-}
-@Article{Han:1965pf,
-     author    = "Han, M. Y. and Nambu, Yoichiro",
-     title     = "Three-triplet model with double SU(3) symmetry",
-     journal   = "Phys. Rev.",
-     volume    = "139",
-     year      = "1965",
-     pages     = "B1006-B1010",
-     SLACcitation  = "%%CITATION = PHRVA,139,B1006;%%"
-}
-@Article{Hasenbusch:2001ne,
-     author    = "Hasenbusch, M.",
-     title     = "Speeding up the {H}ybrid-{M}onte-{C}arlo algorithm for dynamical
-                  fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B519",
-     year      = "2001",
-     pages     = "177-182",
-     eprint    = "hep-lat/0107019",
-     SLACcitation  = "%%CITATION = HEP-LAT 0107019;%%"
+	archiveprefix = "arXiv",
+	author = "Clark, M.A. and Kennedy, A.D.",
+	doi = "10.1103/PhysRevLett.98.051601",
+	eprint = "hep-lat/0608015",
+	journal = "Phys.Rev.Lett.",
+	pages = "051601",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = HEP-LAT/0608015;\%\%",
+	title = "{Accelerating dynamical fermion computations using the rational hybrid Monte Carlo (RHMC) algorithm with multiple pseudofermion fields}",
+	volume = "98",
+	year = "2007"
 }
-@article{Hasenbusch:2002ai,
-      author         = "Hasenbusch, M. and Jansen, K.",
-      title          = "{Speeding up lattice QCD simulations with clover improved
-                        Wilson fermions}",
-      journal        = "Nucl.Phys.",
-      volume         = "B659",
-      pages          = "299-320",
-      doi            = "10.1016/S0550-3213(03)00227-X",
-      year           = "2003",
-      eprint         = "hep-lat/0211042",
-      archivePrefix  = "arXiv",
-      primaryClass   = "hep-lat",
-      reportNumber   = "DESY-02-200",
-      SLACcitation   = "%%CITATION = HEP-LAT/0211042;%%",
-}
-@Article{Hasenbusch:2003vg,
-     author    = "Hasenbusch, M.",
-     title     = "Full {QCD} algorithms towards the chiral limit",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "129",
-     year      = "2004",
-     pages     = "27-33",
-     eprint    = "hep-lat/0310029",
-     SLACcitation  = "%%CITATION = HEP-LAT 0310029;%%"
-}
-@Article{Hasenfratz:1998jp,
-     author    = "Hasenfratz, P.",
-     title     = "Lattice {QCD} without tuning, mixing and current
-                  renormalization",
-     journal   = "Nucl. Phys.",
-     volume    = "B525",
-     year      = "1998",
-     pages     = "401-409",
-     eprint    = "hep-lat/9802007",
-     SLACcitation  = "%%CITATION = HEP-LAT 9802007;%%"
-}
-@Article{Hasenfratz:1998ri,
-     author    = "Hasenfratz, P. and Laliena, V. and Niedermayer,
-                  F.",
-     title     = "The index theorem in {QCD} with a finite cut-off",
-     journal   = "Phys. Lett.",
-     volume    = "B427",
-     year      = "1998",
-     pages     = "125-131",
-     eprint    = "hep-lat/9801021",
-     SLACcitation  = "%%CITATION = HEP-LAT 9801021;%%"
-}
-@Article{Hasenfratz:2001hp,
-     author    = "Hasenfratz, A. and Knechtli, F.",
-     title     = "Flavor symmetry and the static potential with hypercubic
-                  blocking",
-     journal   = "Phys. Rev.",
-     volume    = "D64",
-     year      = "2001",
-     pages     = "034504",
-     eprint    = "hep-lat/0103029",
-     SLACcitation  = "%%CITATION = HEP-LAT 0103029;%%"
-}
-@Article{Hasenfratz:2001tw,
-     author    = "Hasenfratz, A. and Hoffmann, R. and Knechtli, F.",
-     title     = "The static potential with hypercubic blocking",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "418-420",
-     eprint    = "hep-lat/0110168",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110168;%%"
-}
-@Article{Hashimoto:2008xg,
-     author    = "Hashimoto, Koichi and Izubuchi, Taku",
-     title     = "{eta' meson from two flavor dynamical domain wall
-                  fermions}",
-     year      = "2008",
-     eprint    = "0803.0186",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = ARXIV:0803.0186;%%"
-}
-@Article{Heitger:2000ay,
-     author    = "Heitger, J. and Sommer, R. and Wittig, H.",
- collaboration = "ALPHA",
-     title     = "Effective chiral Lagrangians and lattice {{QCD}}",
-     journal   = "Nucl. Phys.",
-     volume    = "B588",
-     year      = "2000",
-     pages     = "377-399",
-     eprint    = "hep-lat/0006026",
-     note      = "and references therein",
-     SLACcitation  = "%%CITATION = HEP-LAT 0006026;%%"
-}
-@Article{Hernandez:1998et,
-     author    = "Hernandez, P. and Jansen, K. and L{\"u}scher, M.",
-     title     = "Locality properties of Neuberger's lattice Dirac operator",
-     journal   = "Nucl. Phys.",
-     volume    = "B552",
-     year      = "1999",
-     pages     = "363-378",
-     eprint    = "hep-lat/9808010",
-     SLACcitation  = "%%CITATION = HEP-LAT 9808010;%%"
-}
-@Article{Hernandez:2000sb,
-     author    = "Hernandez, P. and Jansen, K. and Lellouch, L.",
-     title     = "A numerical treatment of Neuberger's lattice Dirac
-                  operator",
-     year      = "2000",
-     eprint    = "hep-lat/0001008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0001008;%%"
-}
-@Article{Hernandez:2001hq,
-     author    = "Hernandez, P. and Jansen, K. and Lellouch, L. and
-                  Wittig, H.",
-     title     = "Scalar condensate and light quark masses from overlap
-                  fermions",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "766-771",
-     eprint    = "hep-lat/0110199",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110199;%%"
-}
-@Article{Hernandez:2001yn,
-     author    = "Hernandez, P. and Jansen, K. and Lellouch, L. and
-                  Wittig, H.",
-     title     = "Non-perturbative renormalization of the quark condensate in
-                  {Ginsparg}-{Wilson} regularizations",
-     journal   = "JHEP",
-     volume    = "07",
-     year      = "2001",
-     pages     = "018",
-     eprint    = "hep-lat/0106011",
-     SLACcitation  = "%%CITATION = HEP-LAT 0106011;%%"
-}
-@Article{Horsley:2004mx,
-     author    = "Horsley, R. and Perlt, H. and Rakow, P. E. L. and
-                  Schierholz, G. and Schiller, A.",
- collaboration = "QCDSF",
-     title     = "One-loop renormalisation of quark bilinears for overlap
-                  fermions with  improved gauge actions",
-     journal   = "Nucl. Phys.",
-     volume    = "B693",
-     year      = "2004",
-     pages     = "3-35",
-     eprint    = "hep-lat/0404007",
-     SLACcitation  = "%%CITATION = HEP-LAT 0404007;%%"
-}
-@Article{Ilgenfritz:2003gw,
-     author    = "Ilgenfritz, E.-M. and Kerler, W. and
-                  M{\"u}ller-Preu{\ss}ker, M. and Sternbeck, A. and St{\"u}ben, H.",
-     title     = "A numerical reinvestigation of the {Aoki} phase with {N(f)} = 2
-                  {Wilson}  fermions at zero temperature",
-     journal   = "Phys. Rev.",
-     volume    = "D69",
-     year      = "2004",
-     pages     = "074511",
-     eprint    = "hep-lat/0309057",
-     SLACcitation  = "%%CITATION = HEP-LAT 0309057;%%"
-}
-@Article{Ilgenfritz:2006tz,
-     author    = "Ilgenfritz, E. -M. and others",
-     title     = "Twisted mass QCD thermodynamics: First results on apeNEXT",
-     year      = "2006",
-     eprint    = "hep-lat/0610112",
-     SLACcitation  = "%%CITATION = HEP-LAT 0610112;%%"
-}
-@Article{Iwasaki:1983ck,
-     author    = "Iwasaki, Y.",
-     title     = "Renormalization group analysis of lattice theories and
-                  improved lattice action. 2. four-dimensional nonabelian
-                  SU(N) gauge model",
-     note     = "UTHEP-118"
-}
-@Article{Iwasaki:1985we,
-     author    = "Iwasaki, Y.",
-     title     = "Renormalization group analysis of lattice theories and
-                  improved lattice action: two-dimensional nonlinear O(N)
-                  sigma model",
-     journal   = "Nucl. Phys.",
-     volume    = "B258",
-     year      = "1985",
-     pages     = "141-156",
-     SLACcitation  = "%%CITATION = NUPHA,B258,141;%%"
-}
-@Article{Iwasaki:1992hn,
-     author    = "Iwasaki, Y. and Kanaya, K. and Sakai, S. and Yoshie, T.",
-     title     = "Quark confinement in multi - flavor quantum
-                  chromodynamics",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "30",
-     year      = "1993",
-     pages     = "327-330",
-     eprint    = "hep-lat/9211035",
-     SLACcitation  = "%%CITATION = HEP-LAT 9211035;%%"
-}
-@Article{Izubuchi:1998hy,
-     author    = "Izubuchi, T. and Noaki, J. and Ukawa, A.",
-     title     = "Two-dimensional lattice Gross-Neveu model with {Wilson}
-                  fermion action at  finite temperature and chemical
-                  potential",
-     journal   = "Phys. Rev.",
-     volume    = "D58",
-     year      = "1998",
-     pages     = "114507",
-     eprint    = "hep-lat/9805019",
-     SLACcitation  = "%%CITATION = HEP-LAT 9805019;%%"
-}
-@Article{Jacobs:1983ph,
-     author    = "Jacobs, L.",
-     title     = "Undoubling chirally symmetric lattice fermions",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "51",
-     year      = "1983",
-     pages     = "172",
-     SLACcitation  = "%%CITATION = PRLTA,51,172;%%"
-}
-@Article{Jagels:1994a,
-     author    = "Jagels, C. F. and Reichel, L.",
-     title     = " fast minimal residual algorithm for shifted unitary matrices",
-     journal   = "Numer. Linear Algebra Appl.",
-     volume    = "1(6)",
-     pages     = "555-570",
-     year      = "1994"
-}
-@Article{Jagels:1994aa,
-     author    = "Jagels, C. F. and Reichel, L.",
-     title     = "A Fast Minimal Residual Algorithm for Shifted Unitary 
-                  Matrices",
-     journal   = "Numerical Linear Algebra with Aplications",
-     volume    = "1(6)",
-     year      = "1994",
-     pages     = "555-570",
-}
-@Article{Jansen:1994ym,
-     author    = "Jansen, K.",
-     title     = "Domain wall fermions and chiral gauge theories",
-     journal   = "Phys. Rept.",
-     volume    = "273",
-     year      = "1996",
-     pages     = "1-54",
-     eprint    = "hep-lat/9410018",
-     SLACcitation  = "%%CITATION = HEP-LAT 9410018;%%"
-}
-@Article{Jansen:1995ck,
-     author    = "Jansen, Karl and others",
-     title     = "Non-perturbative renormalization of lattice QCD at all
-                  scales",
-     journal   = "Phys. Lett.",
-     volume    = "B372",
-     year      = "1996",
-     pages     = "275-282",
-     eprint    = "hep-lat/9512009",
-     SLACcitation  = "%%CITATION = HEP-LAT 9512009;%%"
-}
-@Article{Jansen:1996cq,
-     author    = "Jansen, K. and Liu, C.",
-     title     = "Study of Liapunov exponents and the reversibility of
-                  molecular dynamics  algorithms",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "53",
-     year      = "1997",
-     pages     = "974-976",
-     eprint    = "hep-lat/9607057",
-     SLACcitation  = "%%CITATION = HEP-LAT 9607057;%%"
-}
-@Article{Jansen:1996xp,
-     author    = "Jansen, K.",
-     title     = "Recent developments in fermion simulation algorithms",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "53",
-     year      = "1997",
-     pages     = "127-133",
-     eprint    = "hep-lat/9607051",
-     SLACcitation  = "%%CITATION = HEP-LAT 9607051;%%"
-}
-@Article{Jansen:1997yt,
-     author    = "Jansen, K. and Liu, C.",
-     title     = "Implementation of Symanzik's improvement program for
-                  simulations of  dynamical {Wilson} fermions in lattice {QCD}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "99",
-     year      = "1997",
-     pages     = "221-234",
-     eprint    = "hep-lat/9603008",
-     SLACcitation  = "%%CITATION = HEP-LAT 9603008;%%"
-}
-@Article{Jansen:1998mx,
-     author    = "Jansen, K. and Sommer, R.",
- collaboration = "ALPHA",
-     title     = "O(alpha) improvement of lattice {QCD} with two flavors of
-                  {Wilson} quarks",
-     journal   = "Nucl. Phys.",
-     volume    = "B530",
-     year      = "1998",
-     pages     = "185-203",
-     eprint    = "hep-lat/9803017",
-     SLACcitation  = "%%CITATION = HEP-LAT 9803017;%%"
-}
-@Article{Jansen:2003ir,
-     author    = "Jansen, K. and Shindler, A. and Urbach, C. and
-                  Wetzorke, I.",
- collaboration = "\xlf",
-     title     = "Scaling test for {Wilson} twisted mass {QCD}",
-     journal   = "Phys. Lett.",
-     volume    = "B586",
-     year      = "2004",
-     pages     = "432-438",
-     eprint    = "hep-lat/0312013",
-     SLACcitation  = "%%CITATION = HEP-LAT 0312013;%%"
-}
-@Article{Jansen:2003jq,
-     author    = "Jansen, K. and Nagai, K.-I.",
-     title     = "Reducing residual-mass effects for domain-wall fermions",
-     journal   = "JHEP",
-     volume    = "12",
-     year      = "2003",
-     pages     = "038",
-     eprint    = "hep-lat/0305009",
-     SLACcitation  = "%%CITATION = HEP-LAT 0305009;%%"
-}
-@Article{Jansen:2003nt,
-     author    = "Jansen, K.",
-     title     = "Actions for dynamical fermion simulations: Are we ready to
-                  go?",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "129",
-     year      = "2004",
-     pages     = "3-16",
-     eprint    = "hep-lat/0311039",
-     SLACcitation  = "%%CITATION = HEP-LAT 0311039;%%"
-}
-@Article{Jansen:2005cg,
-     author    = "Jansen, K. and others",
- collaboration = "\xlf",
-     title     = "Flavour breaking effects of {Wilson} twisted mass fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B624",
-     year      = "2005",
-     pages     = "334-341",
-     eprint    = "hep-lat/0507032",
-     SLACcitation  = "%%CITATION = HEP-LAT 0507032;%%"
-}
-@Unpublished{Jansen:2005chi,
-  author = 	 {Jansen, K. and others},
-collaborations = {\xlf},
-  title = 	 {},
-  note = 	 {in preparation},
-  OPTkey = 	 {},
-  OPTmonth = 	 {},
-  year = 	 {2005},
-  OPTannote = 	 {}
-}
-@Article{Jansen:2005gf,
-     author    = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach,
-                  C. and Wetzorke, I.",
- collaboration = "\xlf",
-     title     = "Light quarks with twisted mass fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B619",
-     year      = "2005",
-     pages     = "184-191",
-     eprint    = "hep-lat/0503031",
-     SLACcitation  = "%%CITATION = HEP-LAT 0503031;%%"
-}
-@Article{Jansen:2005kk,
-     author    = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach,
-                  C. and Wetzorke, I.",
- collaboration = "\xlf",
-     title     = "Quenched scaling of {Wilson} twisted mass fermions",
-     journal   = "JHEP",
-     volume    = "09",
-     year      = "2005",
-     pages     = "071",
-     eprint    = "hep-lat/0507010",
-     SLACcitation  = "%%CITATION = HEP-LAT 0507010;%%"
-}
-@Article{Jansen:2005yp,
-     author    = "Jansen, Karl and Shindler, Andrea and Urbach, Carsten and
-                  Wenger, Urs",
-     title     = "{HMC} algorithm with multiple time scale integration and mass
-                  preconditioning",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "118",
-     eprint    = "hep-lat/0510064",
-     SLACcitation  = "%%CITATION = HEP-LAT 0510064;%%"
-}
-@Article{Jansen:2006ks,
-     author    = "Jansen, Karl",
-     title     = "Status report on ILDG activities",
-     year      = "2006",
-     eprint    = "hep-lat/0609012",
-     SLACcitation  = "%%CITATION = HEP-LAT 0609012;%%"
-}
-@Article{Jansen:2006rf,
-     author    = "Jansen, Karl and Urbach, Carsten",
- collaboration = "ETM",
-     title     = "First results with two light flavours of quarks with
-                  maximally twisted mass",
-     year      = "2006",
-     eprint    = "hep-lat/0610015",
-     SLACcitation  = "%%CITATION = HEP-LAT 0610015;%%"
-}
-@Article{Jansen:2008wv,
-     author    = "Jansen, K. and Michael, C. and Urbach, C.",
- collaboration = "ETM",
-     title     = "The eta' meson from lattice {QCD}",
-     year      = "2008",
-     eprint    = "0804.3871",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0804.3871;%%"
-}
-@Article{Jansen:2008zz,
-     author    = "Jansen, K. and Michael, C. and Urbach, C.",
-     title     = "{The eta-prime meson from lattice QCD}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C58",
-     year      = "2008",
-     pages     = "261-269",
-     doi       = "10.1140/epjc/s10052-008-0764-6",
-     SLACcitation  = "%%CITATION = EPHJA,C58,261;%%"
-}
-@Unpublished{Jegerlehner:1996pm,
-     author    = "Jegerlehner, Beat",
-     title     = "Krylov space solvers for shifted linear systems",
-     year      = "1996",
-     eprint    = "hep-lat/9612014",
-     note      = "unpublished",
-     SLACcitation  = "%%CITATION = HEP-LAT 9612014;%%"
-}
-@Article{Jegerlehner:1997rn,
-     author    = "Jegerlehner, B.",
-     title     = "Multiple mass solvers",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "63",
-     year      = "1998",
-     pages     = "958-960",
-     eprint    = "hep-lat/9708029",
-     SLACcitation  = "%%CITATION = HEP-LAT 9708029;%%"
-}
-@Article{Jegerlehner:2003qp,
-     author    = "Jegerlehner, F.",
-     title     = "Theoretical precision in estimates of the hadronic
-                  contributions to  (g-2)mu and alpha(QED)(M(Z))",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "126",
-     year      = "2004",
-     pages     = "325-334",
-     eprint    = "hep-ph/0310234",
-     SLACcitation  = "%%CITATION = HEP-PH 0310234;%%"
-}
-
-@Article{Jenkins:1990jv,
-     author    = "Jenkins, Elizabeth Ellen and Manohar, Aneesh V.",
-     title     = "Baryon chiral perturbation theory using a heavy fermion
-                  Lagrangian",
-     journal   = "Phys. Lett.",
-     volume    = "B255",
-     year      = "1991",
-     pages     = "558-562",
-     SLACcitation  = "%%CITATION = PHLTA,B255,558;%%"
-}
-@Article{Kaiser:1998ds,
-     author    = "Kaiser, Roland and Leutwyler, H.",
-     title     = "{Pseudoscalar decay constants at large N(c)}",
-     year      = "1998",
-     eprint    = "hep-ph/9806336",
-     SLACcitation  = "%%CITATION = HEP-PH/9806336;%%"
-}
-@Article{Kalkreuter:1995mm,
-     author    = "Kalkreuter, Thomas and Simma, Hubert",
-     title     = "An Accelerated conjugate gradient algorithm to compute low
-                  lying eigenvalues: A Study for the Dirac operator in SU(2)
-                  lattice QCD",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "93",
-     year      = "1996",
-     pages     = "33-47",
-     eprint    = "hep-lat/9507023",
-     SLACcitation  = "%%CITATION = HEP-LAT 9507023;%%"
-}
-@Article{Kalkreuter:1996mm,
-     author    = "Kalkreuter, T. and Simma, H.",
-     title     = "An Accelerated conjugate gradient algorithm to compute low
-                  lying eigenvalues: A Study for the Dirac operator in SU(2)
-                  lattice {QCD}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "93",
-     year      = "1996",
-     pages     = "33-47",
-     eprint    = "hep-lat/9507023",
-     SLACcitation  = "%%CITATION = HEP-LAT 9507023;%%"
-}
-@Article{Kaplan:1992bt,
-     author    = "Kaplan, D. B.",
-     title     = "A Method for simulating chiral fermions on the lattice",
-     journal   = "Phys. Lett.",
-     volume    = "B288",
-     year      = "1992",
-     pages     = "342-347",
-     eprint    = "hep-lat/9206013",
-     SLACcitation  = "%%CITATION = HEP-LAT 9206013;%%"
-}
-@Article{Karsten:1980wd,
-     author    = "Karsten, L. H. and Smit, J.",
-     title     = "Lattice fermions: species doubling, chiral invariance, and
-                  the triangle anomaly",
-     journal   = "Nucl. Phys.",
-     volume    = "B183",
-     year      = "1981",
-     pages     = "103",
-     SLACcitation  = "%%CITATION = NUPHA,B183,103;%%"
-}
-@Article{Kennedy:1990bv,
-     author    = "Kennedy, A. D. and Pendleton, B.",
-     title     = "Acceptances and autocorrelations in hybrid Monte Carlo",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "20",
-     year      = "1991",
-     pages     = "118-121",
-     SLACcitation  = "%%CITATION = NUPHZ,20,118;%%"
-}
-@Article{Knechtli:1998gf,
-     author    = "Knechtli, F. and Sommer, R.",
- collaboration = "ALPHA",
-     title     = "String breaking in SU(2) gauge theory with scalar matter
-                  fields",
-     journal   = "Phys. Lett.",
-     volume    = "B440",
-     year      = "1998",
-     pages     = "345-352",
-     eprint    = "hep-lat/9807022",
-     SLACcitation  = "%%CITATION = HEP-LAT 9807022;%%"
-}
-@Article{Knechtli:2000df,
-     author    = "Knechtli, F. and Sommer, R.",
- collaboration = "ALPHA",
-     title     = "String breaking as a mixing phenomenon in the SU(2) Higgs
-                  model",
-     journal   = "Nucl. Phys.",
-     volume    = "B590",
-     year      = "2000",
-     pages     = "309-328",
-     eprint    = "hep-lat/0005021",
-     SLACcitation  = "%%CITATION = HEP-LAT 0005021;%%"
-}
-@Article{Lacock:1994qx,
-     author    = "Lacock, P. and McKerrell, A. and Michael, C. and Stopher,
-                            I. M. and Stephenson, P. W.",
-     collaboration = "UKQCD",
-     title     = "Efficient hadronic operators in lattice gauge theory",
-     journal   = "Phys. Rev.",
-     volume    = "D51",
-     year      = "1995",
-     pages     = "6403-6410",
-     eprint    = "hep-lat/9412079",
-     SLACcitation  = "%%CITATION = HEP-LAT 9412079;%%"
-}
-@Article{Lepage:1992xa,
-     author    = "Lepage, G. Peter and Mackenzie, Paul B.",
-     title     = "On the viability of lattice perturbation theory",
-     journal   = "Phys. Rev.",
-     volume    = "D48",
-     year      = "1993",
-     pages     = "2250-2264",
-     eprint    = "hep-lat/9209022",
-     SLACcitation  = "%%CITATION = HEP-LAT 9209022;%%"
-}
-@Article{Lepage:2001ym,
-     author    = "Lepage, G. P. and others",
-     title     = "{Constrained curve fitting}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "12-20",
-     eprint    = "hep-lat/0110175",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/S0920-5632(01)01638-3",
-     SLACcitation  = "%%CITATION = HEP-LAT/0110175;%%"
-}
-@Article{Lesk:2002gd,
-     author    = "Lesk, V. I. and others",
- collaboration = "CP-PACS",
-     title     = "Flavor singlet meson mass in the continuum limit in two-
-                  flavor lattice QCD",
-     journal   = "Phys. Rev.",
-     volume    = "D67",
-     year      = "2003",
-     pages     = "074503",
-     eprint    = "hep-lat/0211040",
-     SLACcitation  = "%%CITATION = HEP-LAT/0211040;%%"
-}
-@Article{Leutwyler:1997yr,
-     author    = "Leutwyler, H.",
-     title     = "{On the 1/N-expansion in chiral perturbation theory}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "64",
-     year      = "1998",
-     pages     = "223-231",
-     eprint    = "hep-ph/9709408",
-     SLACcitation  = "%%CITATION = HEP-PH/9709408;%%"
-}
-@Article{Leutwyler:2006qq,
-     author    = "Leutwyler, H.",
-     title     = "pi pi scattering",
-     year      = "2006",
-     eprint    = "hep-ph/0612112",
-     SLACcitation  = "%%CITATION = HEP-PH 0612112;%%"
-}
-@Article{Liu:1997fs,
-     author    = "Liu, C. and Jaster, A. and Jansen, K.",
-     title     = "Liapunov exponents and the reversibility of molecular
-                  dynamics  algorithms",
-     journal   = "Nucl. Phys.",
-     volume    = "B524",
-     year      = "1998",
-     pages     = "603-617",
-     eprint    = "hep-lat/9708017",
-     SLACcitation  = "%%CITATION = HEP-LAT 9708017;%%"
-}
-@Article{Luscher:1985dn,
-     author    = "Luscher, M.",
-     title     = "{Volume Dependence of the Energy Spectrum in Massive
-                  Quantum Field Theories. 1. Stable Particle States}",
-     journal   = "Commun. Math. Phys.",
-     volume    = "104",
-     year      = "1986",
-     pages     = "177",
-     doi       = "10.1007/BF01211589",
-     SLACcitation  = "%%CITATION = CMPHA,104,177;%%"
-}
-@Article{Luscher:1990ck,
-     author    = "L{\"u}scher, M. and Wolff, U.",
-     title     = "How to calculate the elastic scattering matrix in two-
-                  dimensional quantum field theories by numerical
-                  simulation",
-     journal   = "Nucl. Phys.",
-     volume    = "B339",
-     year      = "1990",
-     pages     = "222-252",
-     SLACcitation  = "%%CITATION = NUPHA,B339,222;%%"
-}
-@Article{Luscher:1993dy,
-     author    = "Luscher, Martin",
-     title     = "{A Portable high quality random number generator for
-                  lattice field theory simulations}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "79",
-     year      = "1994",
-     pages     = "100-110",
-     eprint    = "hep-lat/9309020",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/0010-4655(94)90232-1",
-     SLACcitation  = "%%CITATION = HEP-LAT/9309020;%%"
-}
-@Article{Luscher:1993xx,
-     author    = "L{\"u}scher, M.",
-     title     = "A New approach to the problem of dynamical quarks in
-                  numerical simulations of lattice {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B418",
-     year      = "1994",
-     pages     = "637-648",
-     eprint    = "hep-lat/9311007",
-     SLACcitation  = "%%CITATION = HEP-LAT 9311007;%%"
-}
-@Article{Luscher:1996sc,
-     author    = "L{\"u}scher, M. and Sint, S. and Sommer, R. and
-                  Weisz, P.",
-     title     = "Chiral symmetry and {O(a)} improvement in lattice {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B478",
-     year      = "1996",
-     pages     = "365-400",
-     eprint    = "hep-lat/9605038",
-     SLACcitation  = "%%CITATION = HEP-LAT 9605038;%%"
-}
-@Article{Luscher:1996ug,
-     author    = "L{\"u}scher, M. and Sint, S. and Sommer, R. and
-                  Weisz, P. and Wolff, U.",
-     title     = "Non-perturbative {O(a)} improvement of lattice {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B491",
-     year      = "1997",
-     pages     = "323-343",
-     eprint    = "hep-lat/9609035",
-     SLACcitation  = "%%CITATION = HEP-LAT 9609035;%%"
-}
-@Article{Luscher:1998pq,
-     author    = "L{\"u}scher, M.",
-     title     = "Exact chiral symmetry on the lattice and the {Ginsparg}-
-                  {Wilson} relation",
-     journal   = "Phys. Lett.",
-     volume    = "B428",
-     year      = "1998",
-     pages     = "342-345",
-     eprint    = "hep-lat/9802011",
-     SLACcitation  = "%%CITATION = HEP-LAT 9802011;%%"
-}
-@Article{Luscher:2001tx,
-     author    = "L{\"u}scher, Martin",
-     title     = "{Lattice QCD on PCs?}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "21-28",
-     eprint    = "hep-lat/0110007",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/S0920-5632(01)01639-5",
-     SLACcitation  = "%%CITATION = HEP-LAT/0110007;%%"
-}
-@Article{Luscher:2003qa,
-     author    = "L{\"u}scher, M.",
-     title     = "Solution of the {D}irac equation in lattice {QCD} using a
-                  domain  decomposition method",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "156",
-     year      = "2004",
-     pages     = "209-220",
-     eprint    = "hep-lat/0310048",
-     SLACcitation  = "%%CITATION = HEP-LAT 0310048;%%"
-}
-@Article{Luscher:2004rx,
-     author    = "L{\"u}scher, M.",
-     title     = "Schwarz-preconditioned {HMC} algorithm for two-flavour
-                  lattice {QCD}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "165",
-     year      = "2005",
-     pages     = "199",
-     eprint    = "hep-lat/0409106",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409106;%%"
-}
-
-@Article{Luscher:2005mv,
-     author    = "L{\"u}scher, Martin",
-     title     = "Lattice {QCD} with light {W}ilson quarks",
-     journal   = "\href{http://pos.sissa.it/archive/conferences/020/008/LAT2005_002.pdf}{PoS(LAT2005)002}", 
-     year      = "2005",
-     eprint    = "hep-lat/0509152",
-     howpublished="Talk presented at International Symposium on Lattice Field Theory (Lattice 2005)",
-     SLACcitation  = "%%CITATION = HEP-LAT 0509152;%%"
-}
-@Article{Luscher:ranluxweb,
-     author    = "L{\"u}scher, M.",
-     title     = "Ranlux random number generator",
-     eprint    = "http://luscher.web.cern.ch/luscher/ranlux/"
-}
-@Article{Luscher:sse,
-     author    = "L{\"u}scher, M.",
-     title     = "Lattice QCD parallel benchmark programs",
-     eprint    = "http://luscher.web.cern.ch/luscher/QCDpbm/"
-}
-@Article{Madras:1988ei,
-     author    = "Madras, N. and Sokal, A. D.",
-     title     = "The Pivot algorithm: a highly efficient Monte Carlo method
-                  for selfavoiding walk",
-     journal   = "J. Statist. Phys.",
-     volume    = "50",
-     year      = "1988",
-     pages     = "109-186",
-     SLACcitation  = "%%CITATION = JSTPB,50,109;%%"
-}
-@Article{Martinelli:1982mw,
-     author    = "Martinelli, G. and Zhang, Yi-Cheng",
-     title     = "THE CONNECTION BETWEEN LOCAL OPERATORS ON THE LATTICE AND
-                  IN THE CONTINUUM AND ITS RELATION TO MESON DECAY
-                  CONSTANTS",
-     journal   = "Phys. Lett.",
-     volume    = "B123",
-     year      = "1983",
-     pages     = "433",
-     SLACcitation  = "%%CITATION = PHLTA,B123,433;%%"
-}
-@Article{Martinelli:1994ty,
-     author    = "Martinelli, G. and Pittori, C. and Sachrajda, Christopher
-                  T. and Testa, M. and Vladikas, A.",
-     title     = "{A General method for nonperturbative renormalization of
-                  lattice operators}",
-     journal   = "Nucl. Phys.",
-     volume    = "B445",
-     year      = "1995",
-     pages     = "81-108",
-     eprint    = "hep-lat/9411010",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/0550-3213(95)00126-D",
-     SLACcitation  = "%%CITATION = HEP-LAT/9411010;%%"
-}
-@Article{McNeile:2000hf,
-     author    = "McNeile, C. and Michael, C.",
-     collaboration = "UKQCD",
-     title     = "The eta and eta' mesons in {QCD}",
-     journal   = "Phys. Lett.",
-     volume    = "B491",
-     year      = "2000",
-     pages     = "123-129",
-     eprint    = "hep-lat/0006020",
-     SLACcitation  = "%%CITATION = HEP-LAT 0006020;%%"
-}
-@Article{McNeile:2000xx,
-     author    = "McNeile, Craig and Michael, Chris",
-     collaboration = "UKQCD",
-     title     = "Mixing of scalar glueballs and flavour-singlet scalar
-                  mesons",
-     journal   = "Phys. Rev.",
-     volume    = "D63",
-     year      = "2001",
-     pages     = "114503",
-     eprint    = "hep-lat/0010019",
-     SLACcitation  = "%%CITATION = HEP-LAT0010019;%%"
-}
-@Article{McNeile:2001cr,
-     author    = "McNeile, C. and Michael, C. and Sharkey, K. J.",
- collaboration = "UKQCD",
-     title     = "The flavor singlet mesons in {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D65",
-     year      = "2002",
-     pages     = "014508",
-     eprint    = "hep-lat/0107003",
-     SLACcitation  = "%%CITATION = HEP-LAT 0107003;%%"
-}
-@Article{McNeile:2002fh,
-     author    = "McNeile, C. and Michael, C.",
- collaboration = "UKQCD",
-     title     = "Hadronic decay of a vector meson from the lattice",
-     journal   = "Phys. Lett.",
-     volume    = "B556",
-     year      = "2003",
-     pages     = "177-184",
-     eprint    = "hep-lat/0212020",
-     SLACcitation  = "%%CITATION = HEP-LAT 0212020;%%"
-}
-@Article{McNeile:2006bz,
-     author    = "McNeile, C. and Michael, C.",
-     collaboration = "UKQCD",
-     title     = "Decay width of light quark hybrid meson from the lattice",
-     journal   = "Phys. Rev.",
-     volume    = "D73",
-     year      = "2006",
-     pages     = "074506",
-     eprint    = "hep-lat/0603007",
-     SLACcitation  = "%%CITATION = HEP-LAT 0603007;%%"
-}
-@Article{Meyer:2006ty,
-     author    = "Meyer, Harvey B. and others",
-     title     = "{Exploring the HMC trajectory-length dependence of
-                  autocorrelation times in lattice QCD}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "176",
-     year      = "2007",
-     pages     = "91-97",
-     eprint    = "hep-lat/0606004",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/j.cpc.2006.08.002",
-     SLACcitation  = "%%CITATION = HEP-LAT/0606004;%%"
-}
-@Article{Michael:1982gb,
-     author    = "Michael, C. and Teasdale, I.",
-     title     = "EXTRACTING GLUEBALL MASSES FROM LATTICE QCD",
-     journal   = "Nucl. Phys.",
-     volume    = "B215",
-     year      = "1983",
-     pages     = "433",
-     SLACcitation  = "%%CITATION = NUPHA,B215,433;%%"
-}
-@Article{Michael:1989mf,
-     author    = "Michael, C.",
-     title     = "Particle decay in lattice gauge theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B327",
-     year      = "1989",
-     pages     = "515",
-     SLACcitation  = "%%CITATION = NUPHA,B327,515;%%"
-}
-@Article{Michael:1991nc,
-     author    = "Michael, C.",
-     title     = "Hadronic forces from the lattice",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "26",
-     year      = "1992",
-     pages     = "417-419",
-     SLACcitation  = "%%CITATION = NUPHZ,26,417;%%"
-}
-@Article{Michael:1993yj,
-     author    = "Michael, Christopher",
-     title     = "{Fitting correlated data}",
-     journal   = "Phys. Rev.",
-     volume    = "D49",
-     year      = "1994",
-     pages     = "2616-2619",
-     eprint    = "hep-lat/9310026",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.49.2616",
-     SLACcitation  = "%%CITATION = HEP-LAT/9310026;%%"
-}
-@Article{Michael:1994sz,
-     author    = "Michael, Christopher and McKerrell, A.",
-     title     = "{Fitting correlated hadron mass spectrum data}",
-     journal   = "Phys. Rev.",
-     volume    = "D51",
-     year      = "1995",
-     pages     = "3745-3750",
-     eprint    = "hep-lat/9412087",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.51.3745",
-     SLACcitation  = "%%CITATION = HEP-LAT/9412087;%%"
-}
-@Article{Michael:2007vn,
-     author    = "Michael, C. and Urbach, C.",
- collaboration = "ETM",
-     title     = "Neutral mesons and disconnected diagrams in Twisted Mass
-                  QCD",
-     journal   = "",
-     volume    = "",
-     pages     = "",
-     year      = "2007",
-     eprint    = "0709.4564",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = ARXIV:0709.4564;%%"
-}
-@Book{Montvay:1994cy,
-     author    = "Montvay, I. and M{\"u}nster, G.",
-     title     = "Quantum fields on a lattice",
-     publisher = "Cambridge University Press",
-     year      = "1994",
-     series    = "Cambridge Monographs on Mathematical Physics",
-}
-@Article{Montvay:1995ea,
-     author    = "Montvay, I.",
-     title     = "An Algorithm for Gluinos on the Lattice",
-     journal   = "Nucl. Phys.",
-     volume    = "B466",
-     year      = "1996",
-     pages     = "259-284",
-     eprint    = "hep-lat/9510042",
-     SLACcitation  = "%%CITATION = HEP-LAT 9510042;%%"
-}
-@Article{Montvay:2005tj,
-     author    = "Montvay, I. and Scholz, E.",
-     title     = "Updating algorithms with multi-step stochastic correction",
-     journal   = "Phys. Lett.",
-     volume    = "B623",
-     year      = "2005",
-     pages     = "73-79",
-     eprint    = "hep-lat/0506006",
-     SLACcitation  = "%%CITATION = HEP-LAT 0506006;%%"
-}
-@Article{Morgan:2002a,
-  author       = "Morgan, R. B.",
-  title        = "GMRES with Deated Restarting",
-  journal      = "SIAM J. Sci. Comput.",
-  volume       = "24",
-  year         = "2002",
-  pages        = "20"
-}
-@Article{Morningstar:2003gk,
-     author    = "Morningstar, Colin and Peardon, Mike J.",
-     title     = "{Analytic smearing of SU(3) link variables in lattice
-                  QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D69",
-     year      = "2004",
-     pages     = "054501",
-     eprint    = "hep-lat/0311018",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.69.054501",
-     SLACcitation  = "%%CITATION = HEP-LAT/0311018;%%"
-}
-@Article{Munster:2004am,
-     author    = "M{\"u}nster, G.",
-     title     = "On the phase structure of twisted mass lattice {QCD}",
-     journal   = "JHEP",
-     volume    = "09",
-     year      = "2004",
-     pages     = "035",
-     eprint    = "hep-lat/0407006",
-     SLACcitation  = "%%CITATION = HEP-LAT 0407006;%%"
-}
-@Article{Munster:2004wt,
-     author    = "M{\"u}nster, Gernot and Schmidt, Christian and Scholz, Enno E.
-                  ",
-     title     = "Chiral perturbation theory for twisted mass {QCD}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "320-322",
-     eprint    = "hep-lat/0409066",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409066;%%"
-}
-@Article{Nagai:2005mi,
-     author    = "Nagai, Kei-ichi and Jansen, Karl",
-     title     = "Two-dimensional lattice Gross-Neveu model with Wilson
-                  twisted mass  fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B633",
-     year      = "2006",
-     pages     = "325-330",
-     eprint    = "hep-lat/0510076",
-     SLACcitation  = "%%CITATION = HEP-LAT 0510076;%%"
-}   
-@Unpublished{Nagai:priv,
-  author = 	 {Nagai, K},
-  title = 	 {Two-dimensional Gross-Neveu model with {Wilson}
-                  twisted mass fermions},
-  note = 	 {private communication},
-  OPTkey = 	 {},
-  OPTmonth = 	 {},
-  OPTyear = 	 {},
-  OPTannote = 	 {}
-}
-@Article{Necco:2001xg,
-     author    = "Necco, S. and Sommer, R.",
-     title     = "The {N(f)} = 0 heavy quark potential from short to
-                  intermediate  distances",
-     journal   = "Nucl. Phys.",
-     volume    = "B622",
-     year      = "2002",
-     pages     = "328-346",
-     eprint    = "hep-lat/0108008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0108008;%%"
-}
-@Article{Necco:2003vh,
-     author    = "Necco, Silvia",
-     journal   = "Nucl. Phys.",
-     volume    = "B683",
-     year      = "2004",
-     pages     = "137-167",
-     eprint    = "hep-lat/0309017",
-     SLACcitation  = "%%CITATION = HEP-LAT 0309017;%%"
-}
-@Article{Neff:2001zr,
-     author    = "Neff, H. and Eicker, N. and Lippert, T. and Negele, J. W.
-                  and Schilling, K.",
-     title     = "On the low fermionic eigenmode dominance in {QCD} on the
-                  lattice",
-     journal   = "Phys. Rev.",
-     volume    = "D64",
-     year      = "2001",
-     pages     = "114509",
-     eprint    = "hep-lat/0106016",
-     SLACcitation  = "%%CITATION = HEP-LAT/0106016;%%"
-}
-@Article{Neuberger:1997fp,
-     author    = "Neuberger, H.",
-     title     = "Exactly massless quarks on the lattice",
-     journal   = "Phys. Lett.",
-     volume    = "B417",
-     year      = "1998",
-     pages     = "141-144",
-     eprint    = "hep-lat/9707022",
-     SLACcitation  = "%%CITATION = HEP-LAT 9707022;%%"
-}
-@Article{Neuberger:1998wv,
-     author    = "Neuberger, H.",
-     title     = "More about exactly massless quarks on the lattice",
-     journal   = "Phys. Lett.",
-     volume    = "B427",
-     year      = "1998",
-     pages     = "353-355",
-     eprint    = "hep-lat/9801031",
-     SLACcitation  = "%%CITATION = HEP-LAT 9801031;%%"
-}
-@Article{Niedermayer:1998bi,
-     author    = "Niedermayer, F.",
-     title     = "Exact chiral symmetry, topological charge and related
-                  topics",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "73",
-     year      = "1999",
-     pages     = "105-119",
-     eprint    = "hep-lat/9810026",
-     SLACcitation  = "%%CITATION = HEP-LAT 9810026;%%"
-}
-@Article{Nielsen:1980rz,
-     author    = "Nielsen, H. B. and Ninomiya, M.",
-     title     = "Absence of neutrinos on a lattice. 1. proof by homotopy
-                  theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B185",
-     year      = "1981",
-     pages     = "20",
-     SLACcitation  = "%%CITATION = NUPHA,B185,20;%%"
-}
-@Article{Nielsen:1981hk,
-     author    = "Nielsen, H. B. and Ninomiya, M.",
-     title     = "No go theorem for regularizing chiral fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B105",
-     year      = "1981",
-     pages     = "219",
-     SLACcitation  = "%%CITATION = PHLTA,B105,219;%%"
-}
-@Article{Nielsen:1981xu,
-     author    = "Nielsen, H. B. and Ninomiya, M.",
-     title     = "Absence of neutrinos on a lattice. 2. intuitive topological
-                  proof",
-     journal   = "Nucl. Phys.",
-     volume    = "B193",
-     year      = "1981",
-     pages     = "173",
-     SLACcitation  = "%%CITATION = NUPHA,B193,173;%%"
-}
-@Article{Noaki:1998zc,
-     author    = "Noaki, J. and Izubuchi, T. and Ukawa, A.",
-     title     = "Two-dimensional Gross-Neveu model with {Wilson} fermion
-                  action at finite temperature and density",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "73",
-     year      = "1999",
-     pages     = "483-485",
-     eprint    = "hep-lat/9809071",
-     SLACcitation  = "%%CITATION = HEP-LAT 9809071;%%"
-}
-@Article{Orginos:2001xa,
-     author    = "Orginos, K.",
- collaboration = "RBC",
-     title     = "Chiral properties of domain wall fermions with improved
-                  gauge actions",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "721-723",
-     eprint    = "hep-lat/0110074",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110074;%%"
-}
-@Article{Orth:2005kq,
-     author    = "Orth, B. and Lippert, T. and Schilling, K.",
-     title     = "Finite-size effects in lattice {QCD} with dynamical {Wilson}
-                  fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D72",
-     year      = "2005",
-     pages     = "014503",
-     eprint    = "hep-lat/0503016",
-     SLACcitation  = "%%CITATION = HEP-LAT 0503016;%%"
-}
-@Article{Osterwalder:1973dx,
-     author    = "Osterwalder, K. and Schrader, R.",
-     title     = "Axioms for euclidean Green's functions",
-     journal   = "Commun. Math. Phys.",
-     volume    = "31",
-     year      = "1973",
-     pages     = "83-112",
-     SLACcitation  = "%%CITATION = CMPHA,31,83;%%"
-}
-@Article{Osterwalder:1975tc,
-     author    = "Osterwalder, K. and Schrader, R.",
-     title     = "Axioms for euclidean Green's functions. 2",
-     journal   = "Commun. Math. Phys.",
-     volume    = "42",
-     year      = "1975",
-     pages     = "281",
-     SLACcitation  = "%%CITATION = CMPHA,42,281;%%"
-}
-@Article{Osterwalder:1977pc,
-     author    = "Osterwalder, K. and Seiler, E.",
-     title     = "Gauge field theories on the lattice",
-     journal   = "Ann. Phys.",
-     volume    = "110",
-     year      = "1978",
-     pages     = "440",
-     SLACcitation  = "%%CITATION = APNYA,110,440;%%"
-}
-@Article{PDBook,
-     author = "Eidelman, S. and others",
-     title = "{Review of Particle Physics}",
-     journal = "{Physics Letters B}",
-     year = "2004",
-     volume = "592",
-     pages = {1+},
-     url = {http://pdg.lbl.gov}
-}
-@Article{Peardon:2002wb,
-     author    = "Peardon, M. J. and Sexton, J.",
- collaboration = "TrinLat",
-     title     = "Multiple molecular dynamics time-scales in hybrid Monte
-                  Carlo fermion simulations",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "119",
-     year      = "2003",
-     pages     = "985-987",
-     eprint    = "hep-lat/0209037",
-     SLACcitation  = "%%CITATION = HEP-LAT 0209037;%%"
-}
-@Book{Peskin:1995ev,
-  author = 	 {Peskin, M. E. and Schroeder, D. V.},
-  title = 	 {An Introduction to quantum field theory},
-  publisher = 	 {Westview Press},
-  year = 	 {1995},
-  OPTkey = 	 {},
-  OPTvolume = 	 {},
-  OPTnumber = 	 {},
-  OPTseries = 	 {Advanced Book Program},
-  OPTaddress = 	 {Boulder, Colorado},
-  OPTedition = 	 {},
-  OPTmonth = 	 {},
-  OPTnote = 	 {},
-  OPTannote = 	 {}
-}
-@Article{Politzer:1973fx,
-     author    = "Politzer, H. D.",
-     title     = "Reliable perturbative results for strong interactions?",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "30",
-     year      = "1973",
-     pages     = "1346-1349",
-     SLACcitation  = "%%CITATION = PRLTA,30,1346;%%"
-}
-@Article{Politzer:1974fr,
-     author    = "Politzer, H. D.",
-     title     = "Asymptotic freedom: an approach to strong interactions",
-     journal   = "Phys. Rept.",
-     volume    = "14",
-     year      = "1974",
-     pages     = "129-180",
-     SLACcitation  = "%%CITATION = PRPLC,14,129;%%"
-}
-@Manual{R:2005,
-    title = {R: A language and environment for statistical computing},
-    author = {{R Development Core Team}},
-    organization = {R Foundation for Statistical Computing},
-    address = {Vienna, Austria},
-    year = {2005},
-    note = {{ISBN} 3-900051-07-0},
-    url = {http://www.R-project.org},
-}
-
-@Book{Rothe:1992wy,
-     author    = "Rothe, H.J.",
-     title     = "Lattice gauge theories",
-     publisher = "World Scientific, Singapore",
-     year      = "1992",
-     pages     = "528",
-     edition   = "",
-}
-@Article{Rupak:2002sm,
-     author    = "Rupak, G. and Shoresh, N.",
-     title     = "Chiral perturbation theory for the {Wilson} lattice action",
-     journal   = "Phys. Rev.",
-     volume    = "D66",
-     year      = "2002",
-     pages     = "054503",
-     eprint    = "hep-lat/0201019",
-     SLACcitation  = "%%CITATION = HEP-LAT 0201019;%%"
-}
-
-@Article{Saad:1993a,
-  author  = "Saad, Y.",
-  title   = "A flexible inner-outer preconditioned GMRES altorithm",
-  journal = "SIAM J. Sci. Comput.",
-  volume  = "14 (2)",
-  year    = "1993",
-  page    = "461-469"  
-}
-@Article{Sachrajda:2004mi,
-     author    = "Sachrajda, C. T. and Villadoro, G.",
-     title     = "{Twisted boundary conditions in lattice simulations}",
-     journal   = "Phys. Lett.",
-     volume    = "B609",
-     year      = "2005",
-     pages     = "73-85",
-     eprint    = "hep-lat/0411033",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/j.physletb.2005.01.033",
-     SLACcitation  = "%%CITATION = HEP-LAT/0411033;%%"
-}
-@Article{Scorzato:2004da,
-     author    = "Scorzato, L.",
-     title     = "Pion mass splitting and phase structure in twisted mass
-                  {QCD}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C37",
-     year      = "2004",
-     pages     = "445-455",
-     eprint    = "hep-lat/0407023",
-     SLACcitation  = "%%CITATION = HEP-LAT 0407023;%%"
-}
-
-@Article{Scorzato:2005rb,
-     author    = "Scorzato, L. and others",
-     title     = "N(f) = 2 lattice {QCD} and chiral perturbation theory",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "153",
-     year      = "2006",
-     pages     = "283-290",
-     eprint    = "hep-lat/0511036",
-     SLACcitation  = "%%CITATION = HEP-LAT 0511036;%%"
-}
-
-@Article{Sexton:1992nu,
-     author    = "Sexton, J. C. and Weingarten, D. H.",
-     title     = "Hamiltonian evolution for the hybrid monte carlo
-                  algorithm",
-     journal   = "Nucl. Phys.",
-     volume    = "B380",
-     year      = "1992",
-     pages     = "665-678",
-     SLACcitation  = "%%CITATION = NUPHA,B380,665;%%"
-}
-
-@Article{Sharpe:1998xm,
-     author    = "Sharpe, S. R. and Singleton, R., Jr.",
-     title     = "Spontaneous flavor and parity breaking with {Wilson}
-                  fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D58",
-     year      = "1998",
-     pages     = "074501",
-     eprint    = "hep-lat/9804028",
-     SLACcitation  = "%%CITATION = HEP-LAT 9804028;%%"
-}
-@Article{Sharpe:2004ny,
-     author    = "Sharpe, S. R. and Wu, Jackson M. S.",
-     title     = "Twisted mass chiral perturbation theory at next-to-leading
-                  order",
-     journal   = "Phys. Rev.",
-     volume    = "D71",
-     year      = "2005",
-     pages     = "074501",
-     eprint    = "hep-lat/0411021",
-     SLACcitation  = "%%CITATION = HEP-LAT 0411021;%%"
-}
-@Article{Sharpe:2004ps,
-     author    = "Sharpe, S. R. and Wu, J. M. S.",
-     title     = "The phase diagram of twisted mass lattice {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "094029",
-     eprint    = "hep-lat/0407025",
-     SLACcitation  = "%%CITATION = HEP-LAT 0407025;%%"
-}
-@Article{Sharpe:2005rq,
-     author    = "Sharpe, Stephen R.",
-     title     = "Observations on discretization errors in twisted-mass
-                  lattice QCD",
-     journal   = "Phys. Rev.",
-     volume    = "D72",
-     year      = "2005",
-     pages     = "074510",
-     eprint    = "hep-lat/0509009",
-     SLACcitation  = "%%CITATION = HEP-LAT 0509009;%%"
-}
-@Article{Sheikholeslami:1985ij,
-     author    = "Sheikholeslami, B. and Wohlert, R.",
-     title     = "Improved continuum limit lattice action for qcd with {Wilson}
-                  fermions",
-     journal   = "Nucl. Phys.",
-     volume    = "B259",
-     year      = "1985",
-     pages     = "572",
-     SLACcitation  = "%%CITATION = NUPHA,B259,572;%%"
-}
-@Article{Shindler:2005vj,
-     author    = "Shindler, Andrea",
-     title     = "Twisted mass lattice {QCD}: Recent developments and results",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "014",
-     eprint    = "hep-lat/0511002",
-     SLACcitation  = "%%CITATION = HEP-LAT 0511002;%%"
-}
-@Article{Shindler:2006tm,
-     author    = "Shindler, A.",
- collaboration = "ETM",
-     title     = "Lattice QCD with light twisted quarks: First results",
-     year      = "2006",
-     eprint    = "hep-ph/0611264",
-     SLACcitation  = "%%CITATION = HEP-PH 0611264;%%"
-}
-@Article{Shindler:2007vp,
-     author    = "Shindler, A.",
-     title     = "{Twisted mass lattice QCD}",
-     journal   = "Phys. Rept.",
-     volume    = "461",
-     year      = "2008",
-     pages     = "37-110",
-     eprint    = "0707.4093",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     doi       = "10.1016/j.physrep.2008.03.001",
-     SLACcitation  = "%%CITATION = 0707.4093;%%"
-}
-@Article{Sleijpen:1996aa,
-     author    = "G. L. G. Sleijpen and H. A. Van der Vorst",
-     title     = "A Jacobi-Davidson iteration method for linear 
-                  eigenvalue problems",
-     journal   = "SIAM Journal on Matrix Analysis and Applications",
-     volume    = "17",
-     year      = "1996",
-     pages     = "401-425",
-}
-@Article{Sommer:1993ce,
-     author    = "Sommer, R.",
-     title     = "A New way to set the energy scale in lattice gauge theories
-                  and its applications to the static force and alpha-s in
-                  SU(2) Yang-Mills theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B411",
-     year      = "1994",
-     pages     = "839-854",
-     eprint    = "hep-lat/9310022",
-     SLACcitation  = "%%CITATION = HEP-LAT 9310022;%%"
-}
-@Article{Sonneveld:1989cgs,
- author = {Peter Sonneveld},
- title = {CGS, a fast Lanczos-type solver for nonsymmetric linear systems},
- journal = {SIAM J. Sci. Stat. Comput.},
- volume = {10},
- number = {1},
- year = {1989},
- issn = {0196-5204},
- pages = {36--52},
- publisher = {Society for Industrial and Applied Mathematics},
- address = {Philadelphia, PA, USA},
- }
-@Article{Sternbeck:2003gy,
-     author    = "Sternbeck, A. and Ilgenfritz, E.-M. and Kerler, W.
-                  and M{\"u}ller-Preu{\ss}ker, M. and St{\"u}ben, H.",
-     title     = "The {Aoki} phase for {N(f)} = 2 {Wilson} fermions revisited",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "129",
-     year      = "2004",
-     pages     = "898-900",
-     eprint    = "hep-lat/0309059",
-     SLACcitation  = "%%CITATION = HEP-LAT 0309059;%%"
-}
-@Article{Sternbeck:2005tk,
-     author    = "Sternbeck, A. and Ilgenfritz, E. -M. and Mueller-Preussker,
-                  M. and Schiller, A.",
-     title     = "{Going infrared in SU(3) Landau gauge gluodynamics}",
-     journal   = "Phys. Rev.",
-     volume    = "D72",
-     year      = "2005",
-     pages     = "014507",
-     eprint    = "hep-lat/0506007",
-     SLACcitation  = "%%CITATION = HEP-LAT/0506007;%%"
-}
-@Conference{Symanzik:1981hc,
-     author    = "Symanzik, K.",
-     title     = "Some topics in quantum field theory",
-     booktitle = "Mathematical problems in theoretical physics",
-     journal   = "Lecture Notes in Physics",
-     volume    = "153",
-     year      = "1981",
-     pages     = "47-58",
-     editor    = "R. Schrader et al.",
-     note      = "Presented at 6th Int. Conf. on Mathematical Physics,
-                  Berlin, West Germany"
-}
-@Article{Symanzik:1983dc,
-     author    = "Symanzik, K.",
-     title     = "Continuum limit and improved action in lattice theories. 1.
-                  principles and phi**4 theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B226",
-     year      = "1983",
-     pages     = "187",
-     SLACcitation  = "%%CITATION = NUPHA,B226,187;%%"
-}
-@Article{Symanzik:1983gh,
-     author    = "Symanzik, K.",
-     title     = "Continuum limit and improved action in lattice theories. 2.
-                  O(N) nonlinear sigma model in perturbation theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B226",
-     year      = "1983",
-     pages     = "205",
-     SLACcitation  = "%%CITATION = NUPHA,B226,205;%%"
-}
-@Article{Takaishi:1996xj,
-     author    = "Takaishi, T.",
-     title     = "Heavy quark potential and effective actions on blocked
-                  configurations",
-     journal   = "Phys. Rev.",
-     volume    = "D54",
-     year      = "1996",
-     pages     = "1050-1053",
-     SLACcitation  = "%%CITATION = PHRVA,D54,1050;%%"
-}
-@Article{Takaishi:2005tz,
-     author    = "Takaishi, T. and de Forcrand, P.",
-     title     = "Testing and tuning new symplectic integrators for hybrid
-                  Monte Carlo algorithm in lattice QCD",
-     year      = "2005",
-     eprint    = "hep-lat/0505020",
-     SLACcitation  = "%%CITATION = HEP-LAT 0505020;%%"
-}
-@Article{Takeda:2004xh,
-     author    = "Takeda, S. and others",
-     title     = "A scaling study of the step scaling function in SU(3) gauge
-                  theory with  improved gauge actions",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "074510",
-     eprint    = "hep-lat/0408010",
-     SLACcitation  = "%%CITATION = HEP-LAT 0408010;%%"
-}
-@Article{Ukawa:2002pc,
-     author    = "Ukawa, A.",
- collaboration = "CP-PACS and JL{QCD}",
-     title     = "Computational cost of full {QCD} simulations experienced by
-                  {CP-PACS and JLQCD Collaborations}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "195-196",
-     SLACcitation  = "%%CITATION = NUPHZ,106,195;%%"
-}
-@Article{Urbach:2005ji,
-     author    = "Urbach, C. and Jansen, K. and Shindler, A. and Wenger, U.",
-     title     = "{HMC} algorithm with multiple time scale integration and mass
-                  preconditioning",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "174",
-     year      = "2006",
-     pages     = "87-98",
-     eprint    = "hep-lat/0506011",
-     SLACcitation  = "%%CITATION = HEP-LAT 0506011;%%"
-}
-@Article{Urbach:2007rt,
-     author    = "Urbach, Carsten",
- collaboration = "ETM",
-     title     = "{Lattice QCD with two light Wilson quarks and maximally
-                  twisted mass}",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "022",
-     eprint    = "0710.1517",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0710.1517;%%"
-}
-@Article{WalkerLoud:2005bt,
-     author    = "Walker-Loud, Andre and Wu, Jackson M. S.",
-     title     = "{Nucleon and Delta masses in twisted mass chiral
-                  perturbation theory}",
-     journal   = "Phys. Rev.",
-     volume    = "D72",
-     year      = "2005",
-     pages     = "014506",
-     eprint    = "hep-lat/0504001",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.72.014506",
-     SLACcitation  = "%%CITATION = HEP-LAT/0504001;%%"
-}
-@Article{Weinberg:1973un,
-     author    = "Weinberg, S.",
-     title     = "Nonabelian gauge theories of the strong interactions",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "31",
-     year      = "1973",
-     pages     = "494-497",
-     SLACcitation  = "%%CITATION = PRLTA,31,494;%%"
-}
-@Article{Weinberg:1978kz,
-     author    = "Weinberg, S.",
-     title     = "Phenomenological Lagrangians",
-     journal   = "Physica",
-     volume    = "A96",
-     year      = "1979",
-     pages     = "327",
-     SLACcitation  = "%%CITATION = PHYSA,A96,327;%%"
-}
-@Book{Weinberg:1995mt,
-     author    = "Weinberg, S.",
-     title     = "The Quantum theory of fields. Vol. 1: Foundations",
-     publisher = "Cambridge University Press",
-     year      = "1995",
-     pages     = "609",
-}
-@Article{Weisz:1982zw,
-     author    = "Weisz, P.",
-     title     = "Continuum limit improved lattice action for pure {Yang-Mills}
-                  theory. 1",
-     journal   = "Nucl. Phys.",
-     volume    = "B212",
-     year      = "1983",
-     pages     = "1",
-     SLACcitation  = "%%CITATION = NUPHA,B212,1;%%"
-}
-@Article{Weisz:1983bn,
-     author    = "Weisz, P. and Wohlert, R.",
-     title     = "Continuum limit improved lattice action for pure {Yang-Mills}
-                  theory. 2",
-     journal   = "Nucl. Phys.",
-     volume    = "B236",
-     year      = 1984,
-     pages     = 397,
-     SLACcitation  = "%%CITATION = NUPHA,B236,397;%%"
-}
-@Article{Wennekers:2005wa,
-     author    = "Wennekers, J. and Wittig, H.",
-     title     = "On the renormalized scalar density in quenched QCD",
-     year      = "2005",
-     eprint    = "hep-lat/0507026",
-     SLACcitation  = "%%CITATION = HEP-LAT 0507026;%%"
-}
-@Article{Weyl:1918ib,
-     author    = "Weyl, H.",
-     title     = "Gravitation und Elektrizit{\"a}t",
-     journal   = "Sitzungsber. Preuss. Akad. Wiss. Berlin (Math. Phys. )",
-     volume    = "1918",
-     year      = "1918",
-     pages     = "465",
-     SLACcitation  = "%%CITATION = SPWPA,1918,465;%%"
-}
-@Article{Weyl:1929fm,
-     author    = "Weyl, H.",
-     title     = "Electron and gravitation",
-     journal   = "Z. Phys.",
-     volume    = "56",
-     year      = "1929",
-     pages     = "330-352",
-     SLACcitation  = "%%CITATION = ZEPYA,56,330;%%"
-}
-@Article{Wilson:1974sk,
-     author    = "Wilson, K. G.",
-     title     = "Confinement of quarks",
-     journal   = "Phys. Rev.",
-     volume    = "D10",
-     year      = "1974",
-     pages     = "2445-2459",
-     SLACcitation  = "%%CITATION = PHRVA,D10,2445;%%"
-}
-@Article{Wilson:1974sk,
-     author    = "Wilson, K. G.",
-     title     = "Confinement of quarks",
-     journal   = "Phys. Rev.",
-     volume    = "D10",
-     year      = "1974",
-     pages     = "2445-2459",
-     SLACcitation  = "%%CITATION = PHRVA,D10,2445;%%"
-}
-@Article{Wilson:1975mb,
-     author    = "Wilson, K. G.",
-     title     = "The renormalization group: Critical phenomena and the kondo
-                  problem",
-     journal   = "Rev. Mod. Phys.",
-     volume    = "47",
-     year      = "1975",
-     pages     = "773",
-     SLACcitation  = "%%CITATION = RMPHA,47,773;%%"
-}
-@Article{Wilson:1975mb,
-     author    = "Wilson, K. G.",
-     title     = "The renormalization group: Critical phenomena and the kondo
-                  problem",
-     journal   = "Rev. Mod. Phys.",
-     volume    = "47",
-     year      = "1975",
-     pages     = "773",
-     SLACcitation  = "%%CITATION = RMPHA,47,773;%%"
-}
-@Article{Wolff:2003sm,
-     author    = "Wolff, U.",
- collaboration = "ALPHA",
-     title     = "Monte Carlo errors with less errors",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "156",
-     year      = "2004",
-     pages     = "143-153",
-     eprint    = "hep-lat/0306017",
-     SLACcitation  = "%%CITATION = HEP-LAT 0306017;%%"
-}
-@Article{Yang:1954ek,
-     author    = "Yang, C.-N. and Mills, R. L.",
-     title     = "Conservation of isotopic spin and isotopic gauge
-                  invariance",
-     journal   = "Phys. Rev.",
-     volume    = "96",
-     year      = "1954",
-     pages     = "191-195",
-     SLACcitation  = "%%CITATION = PHRVA,96,191;%%"
-}
-@Article{Yoshie:2008aw,
-     author    = "Yoshie, Tomoteru",
-     title     = "{Making use of the International Lattice Data Grid}",
-     journal   = "PoS",
-     volume    = "LATTICE2008",
-     year      = "2008",
-     pages     = "019",
-     eprint    = "0812.0849",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0812.0849;%%"
-}
-@Article{Zweig:1964jf,
-     author    = "Zweig, G.",
-     title     = "An SU(3) model for strong interaction symmetry and its
-                  breaking. 2",
-     note     = "CERN-TH-412"
-}
-@Article{cln:web,
-  author = 	 {},
-  eprint =       {http://www.ginac.de/CLN/}
-}
-@Article{deForcrand:1995bs,
-     author    = "de Forcrand, P.",
-     title     = "Progress on lattice {QCD} algorithms",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "47",
-     year      = "1996",
-     pages     = "228-235",
-     eprint    = "hep-lat/9509082",
-     SLACcitation  = "%%CITATION = HEP-LAT 9509082;%%"
-}
-@Article{deForcrand:1996bx,
-     author    = "de Forcrand, P. and others",
- collaboration = "{QCD}-TARO",
-     title     = "Search for effective lattice action of pure {QCD}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "53",
-     year      = "1997",
-     pages     = "938-941",
-     eprint    = "hep-lat/9608094",
-     SLACcitation  = "%%CITATION = HEP-LAT 9608094;%%"
-}
-@Article{deForcrand:1996ck,
-     author    = "de Forcrand, P. and Takaishi, T.",
-     title     = "Fast fermion Monte Carlo",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "53",
-     year      = "1997",
-     pages     = "968-970",
-     eprint    = "hep-lat/9608093",
-     SLACcitation  = "%%CITATION = HEP-LAT 9608093;%%"
-}
-@Article{etmc:asqr,
-     author    = "Frezzotti, R. et al.",
-     title     = "{O(a^2) cutoff effects in Wilson fermion simulations}",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "277",
-     eprint    = "0710.2492",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0710.2492;%%"
-}
-@Article{ildg:web,
-  eprint = 	 {http://cssm.sasr.edu.au/ildg/},
-  author =	 {ILDG working groups}
-}
-@Book{kleinert:1,
-     author    = "Kleinert, H.",
-     title     = "Path integrals in quantum mechanics, statistics and polymer ph
-ysics",
-     publisher = "World Scientific, Singapore",
-     year      = "1995",
-     edition   = "2nd Edition",
-}
-@Article{lapack:web,
-  author = 	 {},
-  eprint =       {http://www.netlib.org/lapack/}
-}
-@Article{lime:web,
-  author = 	 {USQCD},
-  title = 	 {c-lime library},
-  eprint =       {http://usqcd.jlab.org/usqcd-docs/c-lime/}
-}
-@Book{meister:1999,
-  author = 	 {Meister, Andreas},
-  title = 	 {Numerik linearer Gleichungssysteme},
-  publisher = 	 {vieweg},
-  year = 	 {1999},
-  OPTkey = 	 {},
-  OPTvolume = 	 {},
-  OPTnumber = 	 {},
-  OPTseries = 	 {},
-  OPTaddress = 	 {},
-  OPTedition = 	 {},
-  OPTmonth = 	 {},
-  OPTnote = 	 {},
-  OPTannote = 	 {}
-}
-@Manual{minuit,
-  title = 	 {MINUIT home page},
-  note= {\\seal.web.cern.ch/seal/snapshot/work-packages/mathlibs/minuit/home.html}
-}
-@Article{mpi:web,
-  author =       {},
-  title  =       {The message passing interface standard},
-  eprint =       {http://www-unix.mcs.anl.gov/mpi/}
-}
-@PhdThesis{orth:2004phd,
-  author = 	 {Orth, B.},
-  title = 	 {Finite size effects in lattice {QCD}
-                  with dynamical {Wilson} fermions},
-  school = 	 {Bergische Universit{\"a}t Wuppertal},
-  year = 	 {2004},
-  OPTkey = 	 {},
-  OPTtype = 	 {},
-  OPTaddress = 	 {},
-  OPTmonth = 	 {},
-  OPTnote = 	 {},
-  OPTannote = 	 {}
-}
-@PhdThesis{pleiter:phd,
-  author = 	 {Pleiter, D.},
-  title = 	 {XXX},
-  school = 	 {Freie {U}niversit�t {B}erlin},
-  year = 	 {2001}
-}
-@Manual{root,
-  title = 	 {The ROOT system home page},
-  note = {root.cern.ch/}
-}
-
-@Book{saad:2003a,
-     author    = "Y. Saad",
-     title     = "Iterative Methods for sparse linear systems",
-     publisher = "SIAM",
-     year      = "2003",
-     edition   = "2nd",
-}
-
-@Article{scidac,
-  author = 	 {},
-  eprint =       {http://www.scidac.gov/}
-}
-@MastersThesis{urbach:2002aa,
-  author = 	 {Urbach, C.},
-  title = 	 {Untersuchung der {R}eversibilit{\"a}tsverletzung im {H}ybrid
-                  {M}onte {C}arlo {A}lgorithmus},
-  school = 	 {Freie Universit{\"a}t Berlin, Fachbereich Physik},
-  year = 	 {2002}
-}
-
-@Article{'tHooft:1971fh,
-     author    = "'t Hooft, G.",
-     title     = "Renormalization of massless Yang-Mills fields",
-     journal   = "Nucl. Phys.",
-     volume    = "B33",
-     year      = "1971",
-     pages     = "173-199",
-     SLACcitation  = "%%CITATION = NUPHA,B33,173;%%"
-}
-@Article{'tHooft:1971rn,
-     author    = "'t Hooft, G.",
-     title     = "Renormalizable lagrangians for massive Yang-Mills fields",
-     journal   = "Nucl. Phys.",
-     volume    = "B35",
-     year      = "1971",
-     pages     = "167-188",
-     SLACcitation  = "%%CITATION = NUPHA,B35,167;%%"
-}
-@Unpublished{'tHooft:1972aa,
-  author = 	 "'t Hooft, G.",
-  title = 	 "",
-  note = 	 "Unpublished remarks at the 1972 Marseille Conference 
-                  on Yang-Mills Fields"
-}
-@Article{'tHooft:1972fi,
-     author    = "'t Hooft, G. and Veltman, M. J. G.",
-     title     = "Regularization and renormalization of gauge fields",
-     journal   = "Nucl. Phys.",
-     volume    = "B44",
-     year      = "1972",
-     pages     = "189-213",
-     SLACcitation  = "%%CITATION = NUPHA,B44,189;%%"
-}
-@Article{Abdel-Rehim:2004gx,
-     author    = "Abdel-Rehim, A. M. and Lewis, R.",
-     title     = "Twisted mass {QCD} for the pion electromagnetic form factor",
-     journal   = "Phys. Rev.",
-     volume    = "D71",
-     year      = "2005",
-     pages     = "014503",
-     eprint    = "hep-lat/0410047",
-     SLACcitation  = "%%CITATION = HEP-LAT 0410047;%%"
-}
-@Article{Abdel-Rehim:2005gz,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  ",
-     title     = "Spectrum of quenched twisted mass lattice QCD at maximal
-                  twist",
-     journal   = "Phys. Rev.",
-     volume    = "D71",
-     year      = "2005",
-     pages     = "094505",
-     eprint    = "hep-lat/0503007",
-     SLACcitation  = "%%CITATION = HEP-LAT/0503007;%%"
-}
-@Article{AbdelRehim:2004sp,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy",
-     title     = "Pion form factor with twisted mass QCD",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "299-301",
-     eprint    = "hep-lat/0408033",
-     SLACcitation  = "%%CITATION = HEP-LAT/0408033;%%"
-}
-@Article{AbdelRehim:2005gq,
-     author    = "Abdel-Rehim, A. M. and Lewis, R. and Woloshyn, R. M.",
-     title     = "Twisted mass lattice QCD and hadron phenomenology",
-     journal   = "Int. J. Mod. Phys.",
-     volume    = "A20",
-     year      = "2005",
-     pages     = "6159-6168",
-     SLACcitation  = "%%CITATION = IMPAE,A20,6159;%%"
-}
-@Article{AbdelRehim:2005gz,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  ",
-     title     = "{Spectrum of quenched twisted mass lattice QCD at maximal
-                  twist}",
-     journal   = "Phys. Rev.",
-     volume    = "D71",
-     year      = "2005",
-     pages     = "094505",
-     eprint    = "hep-lat/0503007",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.71.094505",
-     SLACcitation  = "%%CITATION = HEP-LAT/0503007;%%"
-}
-@Article{AbdelRehim:2005qv,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  ",
-     title     = "The hadron spectrum from twisted mass QCD with a strange
-                  quark",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "032",
-     eprint    = "hep-lat/0509056",
-     SLACcitation  = "%%CITATION = HEP-LAT/0509056;%%"
-}
-@Article{AbdelRehim:2005yx,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  ",
-     title     = "Maximal twist and the spectrum of quenched twisted mass
-                  lattice QCD",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "051",
-     eprint    = "hep-lat/0509098",
-     SLACcitation  = "%%CITATION = HEP-LAT/0509098;%%"
-}
-@Article{AbdelRehim:2006qu,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Petry, Robert G.
-                  and Woloshyn, R. M.",
-     title     = "The spectrum of tmLQCD with quark and link smearing",
-     journal   = "PoS",
-     volume    = "LAT2006",
-     year      = "2006",
-     pages     = "164",
-     eprint    = "hep-lat/0610004",
-     SLACcitation  = "%%CITATION = HEP-LAT/0610004;%%"
-}
-@Article{AbdelRehim:2006ra,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  and Wu, Jackson M. S.",
-     title     = "Lattice QCD with a twisted mass term and a strange quark",
-     journal   = "Eur. Phys. J.",
-     volume    = "A31",
-     year      = "2007",
-     pages     = "773-776",
-     eprint    = "hep-lat/0610090",
-     SLACcitation  = "%%CITATION = HEP-LAT/0610090;%%"
-}
-@Article{AbdelRehim:2006ve,
-     author    = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.
-                  and Wu, Jackson M. S.",
-     title     = "Strange quarks in quenched twisted mass lattice QCD",
-     journal   = "Phys. Rev.",
-     volume    = "D74",
-     year      = "2006",
-     pages     = "014507",
-     eprint    = "hep-lat/0601036",
-     SLACcitation  = "%%CITATION = HEP-LAT/0601036;%%"
-}
-@Article{Adler:1974gd,
-     author    = "Adler, Stephen L.",
-     title     = "{Some Simple Vacuum Polarization Phenomenology: e+ e- $\to$
-                  Hadrons: The mu - Mesic Atom x-Ray Discrepancy and (g-2) of
-                  the Muon}",
-     journal   = "Phys. Rev.",
-     volume    = "D10",
-     year      = "1974",
-     pages     = "3714",
-     SLACcitation  = "%%CITATION = PHRVA,D10,3714;%%"
-}
-@Article{Albanese:1987ds,
-     author    = "Albanese, M. and others",
- collaboration = "APE",
-     title     = "Glueball masses and string tension in lattice {QCD}",
-     journal   = "Phys. Lett.",
-     volume    = "B192",
-     year      = "1987",
-     pages     = "163",
-     SLACcitation  = "%%CITATION = PHLTA,B192,163;%%"
-}
-@Article{Alexandrou:2008tn,
-     author    = "Alexandrou, C. and others",
- collaboration = "ETM",
-     title     = "{Light baryon masses with dynamical twisted mass
-                  fermions}",
-     year      = "2008",
-     eprint    = "0803.3190",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0803.3190;%%"
-}
-@Article{AliKhan:2000iv,
-     author    = "Ali Khan, A. and others",
- collaboration = "CP-PACS",
-     title     = "Chiral properties of domain-wall quarks in quenched {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D63",
-     year      = "2001",
-     pages     = "114504",
-     eprint    = "hep-lat/0007014",
-     SLACcitation  = "%%CITATION = HEP-LAT 0007014;%%"
-}
-@Article{AliKhan:2003br,
-     author    = "Ali Khan, A. and others",
- collaboration = "QCDSF",
-     title     = "Accelerating the hybrid Monte Carlo algorithm",
-     journal   = "Phys. Lett.",
-     volume    = "B564",
-     year      = "2003",
-     pages     = "235-240",
-     eprint    = "hep-lat/0303026",
-     SLACcitation  = "%%CITATION = HEP-LAT 0303026;%%"
-}
-@Article{AliKhan:2003mu,
-     author    = "Ali Khan, A. and others",
-     title     = "Accelerating Hasenbusch's acceleration of hybrid Monte
-                  Carlo",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "129",
-     year      = "2004",
-     pages     = "853-855",
-     eprint    = "hep-lat/0309078",
-     SLACcitation  = "%%CITATION = HEP-LAT 0309078;%%"
-}
-@Article{Allton:1993wc,
-     author    = "Allton, C. R. and others",
- collaboration = "UK{QCD}",
-     title     = "Gauge invariant smearing and matrix correlators using
-                  {Wilson} fermions at Beta = 6.2",
-     journal   = "Phys. Rev.",
-     volume    = "D47",
-     year      = "1993",
-     pages     = "5128-5137",
-     eprint    = "hep-lat/9303009",
-     SLACcitation  = "%%CITATION = HEP-LAT 9303009;%%"
-}
-@Article{Allton:2004qq,
-     author    = "Allton, C. R. and others",
- collaboration = "UKQCD",
-     title     = "Improved Wilson QCD simulations with light quark masses",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "014501",
-     eprint    = "hep-lat/0403007",
-     SLACcitation  = "%%CITATION = HEP-LAT/0403007;%%"
-}
-@Article{Aoki:1984qi,
-     author    = "Aoki, S.",
-     title     = "New phase structure for lattice {QCD} with {Wilson} fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D30",
-     year      = "1984",
-     pages     = "2653",
-     SLACcitation  = "%%CITATION = PHRVA,D30,2653;%%"
-}
-@Article{Aoki:1985jj,
-     author    = "Aoki, S. and Higashijima, K.",
-     title     = "The recovery of the chiral symmetry in lattice {Gross-Neveu}
-                  model",
-     journal   = "Prog. Theor. Phys.",
-     volume    = "76",
-     year      = "1986",
-     pages     = "521",
-     SLACcitation  = "%%CITATION = PTPKA,76,521;%%"
-}
-@Article{Aoki:1986ua,
-     author    = "Aoki, Sinya",
-     title     = "NUMERICAL EVIDENCE FOR A PARITY VIOLATING PHASE IN LATTICE
-                  QCD WITH WILSON FERMION",
-     journal   = "Phys. Lett.",
-     volume    = "B190",
-     year      = "1987",
-     pages     = "140",
-     SLACcitation  = "%%CITATION = PHLTA,B190,140;%%"
-}
-@Article{Aoki:1986xr,
-     author    = "Aoki, S.",
-     title     = "A solution to the {U(1)} problem on a lattice",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "57",
-     year      = "1986",
-     pages     = "3136",
-     SLACcitation  = "%%CITATION = PRLTA,57,3136;%%"
-}
-@Article{Aoki:1993vs,
-     author    = "Aoki, S. and Boettcher, S. and Gocksch, A.",
-     title     = "Spontaneous breaking of flavor symmetry and parity in the
-                  Nambu-Jona-Lasinio model with {Wilson} fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B331",
-     year      = "1994",
-     pages     = "157-164",
-     eprint    = "hep-lat/9312084",
-     SLACcitation  = "%%CITATION = HEP-LAT 9312084;%%"
-}
-@Article{Aoki:1995ft,
-     author    = "Aoki, S.",
-     title     = "On the phase structure of {QCD} with {Wilson} fermions",
-     journal   = "Prog. Theor. Phys. Suppl.",
-     volume    = "122",
-     year      = "1996",
-     pages     = "179-186",
-     eprint    = "hep-lat/9509008",
-     SLACcitation  = "%%CITATION = HEP-LAT 9509008;%%"
-}
-@Article{Aoki:1995yf,
-     author    = "Aoki, S. and Ukawa, A. and Umemura, T.",
-     title     = "Finite temperature phase structure of lattice {QCD} with
-                  {Wilson} quark action",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "76",
-     year      = "1996",
-     pages     = "873-876",
-     eprint    = "hep-lat/9508008",
-     SLACcitation  = "%%CITATION = HEP-LAT 9508008;%%"
-}
-@Article{Aoki:1997fm,
-     author    = "Aoki, S.",
-     title     = "Phase structure of lattice {QCD} with {Wilson} fermion at
-                  finite  temperature",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "60A",
-     year      = "1998",
-     pages     = "206-219",
-     eprint    = "hep-lat/9707020",
-     SLACcitation  = "%%CITATION = HEP-LAT 9707020;%%"
-}
-@Article{Aoki:2001xq,
-     author    = "Aoki, S. and others",
- collaboration = "JL{QCD}",
-     title     = "Non-trivial phase structure of {N(f)} = 3 {QCD} with {O(a)}-
-                  improved {Wilson}  fermion at zero temperature",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "263-265",
-     eprint    = "hep-lat/0110088",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110088;%%"
-}
-@Article{Aoki:2002vt,
-     author    = "Aoki, Y. and others",
-     title     = "Domain wall fermions with improved gauge actions",
-     journal   = "Phys. Rev.",
-     volume    = "D69",
-     year      = "2004",
-     pages     = "074504",
-     eprint    = "hep-lat/0211023",
-     SLACcitation  = "%%CITATION = HEP-LAT 0211023;%%"
-}
-@Article{Aoki:2004iq,
-     author    = "Aoki, S. and others",
- collaboration = "JL{QCD}",
-     title     = "Bulk first-order phase transition in three-flavor lattice
-                  {QCD} with  {O(a)}-improved {Wilson} fermion action at zero
-                  temperature",
-     year      = "2004",
-     eprint    = "hep-lat/0409016",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409016;%%"
-}
-@Article{Aoki:2004ta,
-     author    = "Aoki, Sinya and B{\"a}r, Oliver",
-     title     = "Twisted-mass {QCD}, {O}(a) improvement and {Wilson} chiral
-                  perturbation  theory",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "116011",
-     eprint    = "hep-lat/0409006",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409006;%%"
-}
-@Article{Aoki:2005ii,
-     author    = "Aoki, S. and B{\"a}r, O.",
-     title     = "Determining the low energy parameters of {Wilson} chiral
-                  perturbation theory",
-     year      = "2005",
-     eprint    = "hep-lat/0509002",
-     SLACcitation  = "%%CITATION = HEP-LAT 0509002;%%"
-}
-@Article{Arnold:2003sx,
-     author    = "Arnold, Guido and others",
-     title     = "Numerical methods for the QCD overlap operator. II: Optimal
-                  Krylov subspace methods",
-     year      = "2003",
-     eprint    = "hep-lat/0311025",
-     SLACcitation  = "%%CITATION = HEP-LAT 0311025;%%"
-}
-@Article{Atiyah:1971rm,
-     author    = "Atiyah, M. F. and Singer, I. M.",
-     title     = "The Index of elliptic operators. 5",
-     journal   = "Annals Math.",
-     volume    = "93",
-     year      = "1971",
-     pages     = "139-149",
-     SLACcitation  = "%%CITATION = ANMAA,93,139;%%"
-}
-@Article{Aubin:2006cc,
-     author    = "Aubin, C. and Blum, T.",
-     title     = "{Hadronic contributions to the muon g-2 from the lattice}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "162",
-     year      = "2006",
-     pages     = "251-255",
-     SLACcitation  = "%%CITATION = NUPHZ,162,251;%%"
-}
-@Article{Aubin:2006xv,
-     author    = "Aubin, C. and Blum, T.",
-     title     = "{Calculating the hadronic vacuum polarization and leading
-                  hadronic  contribution to the muon anomalous magnetic
-                  moment with improved  staggered quarks}",
-     journal   = "Phys. Rev.",
-     volume    = "D75",
-     year      = "2007",
-     pages     = "114502",
-     eprint    = "hep-lat/0608011",
-     SLACcitation  = "%%CITATION = HEP-LAT/0608011;%%"
-}
-@Article{BAGEL,
- author="P.A. Boyle",
- year=2005,
- eprint=" http://www.ph.ed.ac.uk/\~{ }paboyle/bagel/Bagel.html"
- }
-@Article{Baikov:2004ku,
-     author    = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
-     title     = "{Vacuum polarization in pQCD: First complete O(alpha(s)**4)
-                  result}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "135",
-     year      = "2004",
-     pages     = "243-246",
-     SLACcitation  = "%%CITATION = NUPHZ,135,243;%%"
-}
-@Article{Baikov:2005rw,
-     author    = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
-     title     = "{Scalar correlator at O(alpha(s)**4), Higgs decay into b-
-                  quarks and  bounds on the light quark masses}",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "96",
-     year      = "2006",
-     pages     = "012003",
-     eprint    = "hep-ph/0511063",
-     SLACcitation  = "%%CITATION = HEP-PH/0511063;%%"
-}
-@Article{Baikov:2008jh,
-     author    = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
-     title     = "{Hadronic Z- and tau-Decays in Order alpha_s^4}",
-     year      = "2008",
-     eprint    = "0801.1821",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = ARXIV:0801.1821;%%"
-}
-@Article{Bali:2000vr,
-     author    = "Bali, G. S. and others",
- collaboration = "TXL",
-     title     = "Static potentials and glueball masses from {QCD} simulations
-                  with {Wilson}  sea quarks",
-     journal   = "Phys. Rev.",
-     volume    = "D62",
-     year      = "2000",
-     pages     = "054503",
-     eprint    = "hep-lat/0003012",
-     SLACcitation  = "%%CITATION = HEP-LAT 0003012;%%"
-}
-@Article{Bali:2004pb,
-     author    = "Bali, G. S. and others",
-     title     = "String breaking with dynamical {Wilson} fermions",
-     journal   = "Nucl. Phys. Proc. Supl.",
-     volume    = "140",
-     pages     = "609-611",
-     year      = "2004",
-     eprint    = "hep-lat/0409137",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409137;%%"
-}
-@Article{Bali:2005fu,
-     author    = "Bali, G. S. and Neff, H. and Duessel, T. and
-                  Lippert, T. and Schilling, K.",
- collaboration = "SESAM",
-     title     = "Observation of string breaking in {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D71",
-     year      = "2005",
-     pages     = "114513",
-     eprint    = "hep-lat/0505012",
-     SLACcitation  = "%%CITATION = HEP-LAT 0505012;%%"
-}
-@Article{Bar:2006zj,
-     author    = "B{\"a}r, O. and Jansen, K. and Schaefer, S. and Scorzato, L.
-                  and Shindler, A.",
-     title     = "Overlap fermions on a twisted mass sea",
-     year      = "2006",
-     eprint    = "hep-lat/0609039",
-     SLACcitation  = "%%CITATION = HEP-LAT 0609039;%%"
-}
-@Article{Baxter:1993bv,
-     author    = "Baxter, R. M. and others",
- collaboration = "UK{QCD}",
-     title     = "Quenched heavy light decay constants",
-     journal   = "Phys. Rev.",
-     volume    = "D49",
-     year      = "1994",
-     pages     = "1594-1605",
-     eprint    = "hep-lat/9308020",
-     SLACcitation  = "%%CITATION = HEP-LAT 9308020;%%"
-}
-@Article{Beane:2004tw,
-     author    = "Beane, Silas R.",
-     title     = "{Nucleon masses and magnetic moments in a finite volume}",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "034507",
-     eprint    = "hep-lat/0403015",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.70.034507",
-     SLACcitation  = "%%CITATION = HEP-LAT/0403015;%%"
-}
-@Article{Becher:1999he,
-     author    = "Becher, Thomas and Leutwyler, H.",
-     title     = "Baryon chiral perturbation theory in manifestly Lorentz
-                  invariant form",
-     journal   = "Eur. Phys. J.",
-     volume    = "C9",
-     year      = "1999",
-     pages     = "643-671",
-     eprint    = "hep-ph/9901384",
-     SLACcitation  = "%%CITATION = HEP-PH/9901384;%%"
-}
-@Article{Bietenholz:2004sa,
-     author    = "Bietenholz, W. and others",
- collaboration = "\xlf",
-     title     = "Comparison between overlap and twisted mass fermions
-                  towards the chiral  limit",
-     year      = "2004",
-     eprint    = "hep-lat/0409109",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409109;%%"
-}
-@Article{Bietenholz:2004wv,
-     author    = "Bietenholz, W. and others",
- collaboration = "\xlf",
-     title     = "Going chiral: Overlap versus twisted mass fermions",
-     journal   = "JHEP",
-     volume    = "12",
-     year      = "2004",
-     pages     = "044",
-     eprint    = "hep-lat/0411001",
-     SLACcitation  = "%%CITATION = HEP-LAT 0411001;%%"
-}
-@Article{Blossier:2007vv,
-     author    = "Blossier, B. and others",
- collaboration = "ETM",
-     title     = "{Light quark masses and pseudoscalar decay constants from
-                  Nf=2 Lattice QCD with twisted mass fermions}",
-     year      = "2007",
-     eprint    = "0709.4574",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = ARXIV:0709.4574;%%"
-}
-@Article{Blum:1994eh,
-     author    = "Blum, Tom and others",
-     title     = "QCD thermodynamics with Wilson quarks at large kappa",
-     journal   = "Phys. Rev.",
-     volume    = "D50",
-     year      = "1994",
-     pages     = "3377-3381",
-     eprint    = "hep-lat/9404006",
-     SLACcitation  = "%%CITATION = HEP-LAT 9404006;%%"
-}
-@Article{Blum:2000kn,
-     author    = "Blum, T. and others",
-     title     = "Quenched lattice {QCD} with domain wall fermions and the
-                  chiral limit",
-     journal   = "Phys. Rev.",
-     volume    = "D69",
-     year      = "2004",
-     pages     = "074502",
-     eprint    = "hep-lat/0007038",
-     SLACcitation  = "%%CITATION = HEP-LAT 0007038;%%"
-}
-@Article{Bodin:2005gg,
-     author    = "Bodin, F. and others",
- collaboration = "ApeNEXT",
-     title     = "The {apeNEXT} project",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "176-182",
-     SLACcitation  = "%%CITATION = NUPHZ,140,176;%%"
-}
-@Article{Bolder:2000un,
-     author    = "Bolder, B. and others",
-     title     = "A high precision study of the Q anti-Q potential from
-                  {Wilson} loops in  the regime of string breaking",
-     journal   = "Phys. Rev.",
-     volume    = "D63",
-     year      = "2001",
-     pages     = "074504",
-     eprint    = "hep-lat/0005018",
-     SLACcitation  = "%%CITATION = HEP-LAT 0005018;%%"
-}
-@Article{Boucaud:2007uk,
-     author    = "Boucaud, Ph. and others",
- collaboration = "ETM",
-     title     = "Dynamical twisted mass fermions with light quarks",
-     year      = "2007",
-     eprint    = "hep-lat/0701012",
-     SLACcitation  = "%%CITATION = HEP-LAT 0701012;%%"
-}
-@Article{Boucaud:2008xu,
-     author    = "Boucaud, Ph. and others",
- collaboration = "ETM",
-     title     = "{Dynamical Twisted Mass Fermions with Light Quarks:
-                  Simulation and Analysis Details}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "179",
-     year      = "2008",
-     pages     = "695-715",
-     eprint    = "0803.0224",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     doi       = "10.1016/j.cpc.2008.06.013",
-     SLACcitation  = "%%CITATION = 0803.0224;%%"
-}
-@Article{Boughezal:2006px,
-     author    = "Boughezal, R. and Czakon, M. and Schutzmeier, T.",
-     title     = "{Charm and bottom quark masses from perturbative QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D74",
-     year      = "2006",
-     pages     = "074006",
-     eprint    = "hep-ph/0605023",
-     SLACcitation  = "%%CITATION = HEP-PH/0605023;%%"
-}
-@Article{Boyle:2005fb,
-     author    = "Boyle, P. A. and others",
-     title     = "{QCDOC}: Project status and first results",
-     journal   = "J. Phys. Conf. Ser.",
-     volume    = "16",
-     year      = "2005",
-     pages     = "129-139",
-     SLACcitation  = "%%CITATION = 00462,16,129;%%"
-}
-@Article{Brower:1994er,
-     author    = "Brower, R. C. and Levi, A. R. and Orginos, K.",
-     title     = "Extrapolation methods for the Dirac inverter in hybrid
-                  Monte Carlo",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "42",
-     year      = "1995",
-     pages     = "855-857",
-     eprint    = "hep-lat/9412004",
-     SLACcitation  = "%%CITATION = HEP-LAT 9412004;%%"
-}
-
-@Article{Brower:1995vx,
-     author    = "Brower, R. C. and Ivanenko, T. and Levi, A. R. and Orginos,
-                  K. N.",
-     title     = "Chronological inversion method for the Dirac matrix in
-                  hybrid Monte  Carlo",
-     journal   = "Nucl. Phys.",
-     volume    = "B484",
-     year      = "1997",
-     pages     = "353-374",
-     eprint    = "hep-lat/9509012",
-     SLACcitation  = "%%CITATION = HEP-LAT 9509012;%%"
-}
-
-@Article{Bunk:1995uv,
-     author    = "Bunk, B. and others",
-     title     = "A New simulation algorithm for lattice {QCD} with dynamical
-                  quarks",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "42",
-     year      = "1995",
-     pages     = "49-55",
-     eprint    = "hep-lat/9411016",
-     SLACcitation  = "%%CITATION = HEP-LAT 9411016;%%"
-}
-@Article{Bunk:1998rm,
-     author    = "Bunk, B. and Elser, Stephan and Frezzotti, R. and Jansen,
-                  K.",
-     title     = "{Ordering monomial factors of polynomials in the product
-                  representation}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "118",
-     year      = "1999",
-     pages     = "95-109",
-     eprint    = "hep-lat/9805026",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/S0010-4655(99)00198-8",
-     SLACcitation  = "%%CITATION = HEP-LAT/9805026;%%"
-}
-@Article{Bunk:1998rm,
-     author    = "Bunk, B. and Elser, S. and Frezzotti, R. and Jansen,
-                  K.",
-     title     = "Ordering monomial factors of polynomials in the product
-                  representation",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "118",
-     year      = "1999",
-     pages     = "95-109",
-     eprint    = "hep-lat/9805026",
-     SLACcitation  = "%%CITATION = HEP-LAT 9805026;%%"
-}
-@Article{Burrage:1998a,
-  author       = " K. Burrage and J. Erhel",
-  title        = "On the performance of various adaptive preconditioned GMRES strategies",
-  journal      = "Num. Lin. Alg. with Appl.",
-  year         = "1998",
-  volume       = "5",
-  pages        = "101-121"
-}
-@Article{Campbell:1987nv,
-     author    = "Campbell, N. A. and Huntley, A. and Michael, C.",
-     title     = "Heavy quark potentials and hybrid mesons from SU(3) lattice
-                  gauge theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B306",
-     year      = "1988",
-     pages     = "51",
-     SLACcitation  = "%%CITATION = NUPHA,B306,51;%%"
-}
-@Article{Capitani:2005jp,
-     author    = "Capitani, S. and others",
-     title     = "Parton distribution functions with twisted mass fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B639",
-     year      = "2006",
-     pages     = "520-526",
-     eprint    = "hep-lat/0511013",
-     SLACcitation  = "%%CITATION = HEP-LAT 0511013;%%"
-}
-@Article{Chen:2003im,
-     author    = "Chen, Y. and others",
-     title     = "Chiral logarithms in quenched {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "034502",
-     eprint    = "hep-lat/0304005",
-     SLACcitation  = "%%CITATION = HEP-LAT 0304005;%%"
-}
-@Book{Cheng:2000ct,
-     author    = "Cheng, T. P. and Li, L. F.",
-     title     = "Gauge theory of elementary particle physics: Problems and
-                  solutions",
-     publisher = "Oxford, UK: Clarendon",
-     year      = "2000",
-     pages     = "306",
-     edition   = "",
-}
-@Article{Chetyrkin:1990kr,
-     author    = "Chetyrkin, K. G. and K{\"u}hn, Johann H.",
-     title     = "{Mass corrections to the Z decay rate}",
-     journal   = "Phys. Lett.",
-     volume    = "B248",
-     year      = "1990",
-     pages     = "359-364",
-     SLACcitation  = "%%CITATION = PHLTA,B248,359;%%"
-}
-@Article{Chetyrkin:1996cf,
-     author    = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.",
-     title     = "{Three-loop polarization function and O(alpha(s)**2)
-                  corrections to the  production of heavy quarks}",
-     journal   = "Nucl. Phys.",
-     volume    = "B482",
-     year      = "1996",
-     pages     = "213-240",
-     eprint    = "hep-ph/9606230",
-     SLACcitation  = "%%CITATION = HEP-PH/9606230;%%"
-}
-@Article{Chetyrkin:1997mb,
-     author    = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.",
-     title     = "{Heavy quark current correlators to O(alpha(s)**2)}",
-     journal   = "Nucl. Phys.",
-     volume    = "B505",
-     year      = "1997",
-     pages     = "40-64",
-     eprint    = "hep-ph/9705254",
-     SLACcitation  = "%%CITATION = HEP-PH/9705254;%%"
-}
-@Article{Chetyrkin:1998ix,
-     author    = "Chetyrkin, K. G. and Harlander, R. and Steinhauser, M.",
-     title     = "{Singlet polarization functions at O(alpha(s)**2)}",
-     journal   = "Phys. Rev.",
-     volume    = "D58",
-     year      = "1998",
-     pages     = "014012",
-     eprint    = "hep-ph/9801432",
-     SLACcitation  = "%%CITATION = HEP-PH/9801432;%%"
-}
-@Article{Chetyrkin:2000zk,
-     author    = "Chetyrkin, K. G. and Harlander, R. V. and K{\"u}hn, Johann H.",
-     title     = "{Quartic mass corrections to R(had) at O(alpha(s)**3)}",
-     journal   = "Nucl. Phys.",
-     volume    = "B586",
-     year      = "2000",
-     pages     = "56-72",
-     eprint    = "hep-ph/0005139",
-     SLACcitation  = "%%CITATION = HEP-PH/0005139;%%"
-}
-@Article{Chetyrkin:2006xg,
-     author    = "Chetyrkin, K. G. and K{\"u}hn, J. H. and Sturm, C.",
-     title     = "{Four-loop moments of the heavy quark vacuum polarization
-                  function in  perturbative QCD}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C48",
-     year      = "2006",
-     pages     = "107-110",
-     eprint    = "hep-ph/0604234",
-     SLACcitation  = "%%CITATION = HEP-PH/0604234;%%"
-}
-@Article{Chiarappa:2004ry,
-     author    = "Chiarappa, T. and others",
-     title     = "{Comparing iterative methods for overlap and twisted mass
-                   fermions}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "853-855",
-     eprint    = "hep-lat/0409107",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/j.nuclphysbps.2004.11.281",
-     SLACcitation  = "%%CITATION = HEP-LAT/0409107;%%"
-}
-@Article{Chiarappa:2006ae,
-     author    = "Chiarappa, T. and others",
-     title     = "{Numerical simulation of {QCD} with u, d, s and c quarks in
-                  the twisted-mass {W}ilson formulation}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C50",
-     year      = "2007",
-     pages     = "373-383",
-     eprint    = "hep-lat/0606011",
-     archivePrefix = "arXiv",
-     doi       = "10.1140/epjc/s10052-006-0204-4",
-     SLACcitation  = "%%CITATION = HEP-LAT/0606011;%%"
-}
-@Article{Chiarappa:2006hz,
-     author    = "Chiarappa, T. and others",
-     title     = "{Iterative methods for overlap and twisted mass fermions}",
-     year      = "2008",
-     journal   = "Comput. Sci. Disc.",
-     volume    = "01",
-     pages     = "015001",
-     eprint    = "hep-lat/0609023",
-     archivePrefix = "arXiv",
-     SLACcitation  = "%%CITATION = HEP-LAT/0609023;%%"
-}
-@Article{Cichy:2008gk,
-     author    = "Cichy, K. and Gonzalez Lopez, J. and Jansen, K. and Kujawa,
-                  A. and Shindler, A.",
-     title     = "{Twisted Mass, Overlap and Creutz Fermions: Cut-off Effects
-                  at Tree-level of Perturbation Theory}",
-     journal   = "Nucl. Phys.",
-     volume    = "B800",
-     year      = "2008",
-     pages     = "94-108",
-     eprint    = "0802.3637",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     doi       = "10.1016/j.nuclphysb.2008.03.004",
-     SLACcitation  = "%%CITATION = 0802.3637;%%"
-}
-@Article{Clark:2004cq,
-     author    = "Clark, M. A. and Kennedy, A. D.",
-     title     = "Accelerating fermionic molecular dynamics",
-     year      = "2004",
-     eprint    = "hep-lat/0409134",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409134;%%"
-}
-
-@Article{Clark:2005sq,
-     author    = "Clark, M. A. and de Forcrand, Ph. and Kennedy, A. D.",
-     title     = "Algorithm shootout: R versus RHMC",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2005",
-     pages     = "115",
-     eprint    = "hep-lat/0510004",
-     SLACcitation  = "%%CITATION = HEP-LAT 0510004;%%"
-}
-@Article{Clark:2006fx,
-     author    = "Clark, M. A. and Kennedy, A. D.",
-     title     = "{Accelerating Dynamical Fermion Computations using the
-                  Rational Hybrid Monte Carlo (RHMC) Algorithm with Multiple
-                  Pseudofermion Fields}",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "98",
-     year      = "2007",
-     pages     = "051601",
-     eprint    = "hep-lat/0608015",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevLett.98.051601",
-     SLACcitation  = "%%CITATION = HEP-LAT/0608015;%%"
-}
-@Article{Clark:2006wp,
-     author    = "Clark, M. A. and Kennedy, A. D.",
-     title     = "{Accelerating Staggered Fermion Dynamics with the Rational
-                  Hybrid Monte Carlo (RHMC) Algorithm}",
-     journal   = "Phys. Rev.",
-     volume    = "D75",
-     year      = "2007",
-     pages     = "011502",
-     eprint    = "hep-lat/0610047",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.75.011502",
-     SLACcitation  = "%%CITATION = HEP-LAT/0610047;%%"
-}
-@Article{Colangelo:2001df,
-     author    = "Colangelo, G. and Gasser, J. and Leutwyler, H.",
-     title     = "{pi pi scattering}",
-     journal   = "Nucl. Phys.",
-     volume    = "B603",
-     year      = "2001",
-     pages     = "125-179",
-     eprint    = "hep-ph/0103088",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/S0550-3213(01)00147-X",
-     SLACcitation  = "%%CITATION = HEP-PH/0103088;%%"
-}
-@Article{Colangelo:2003hf,
-     author    = "Colangelo, Gilberto and D{\"u}rr, Stephan",
-     title     = "The pion mass in finite volume",
-     journal   = "Eur. Phys. J.",
-     volume    = "C33",
-     year      = "2004",
-     pages     = "543-553",
-     eprint    = "hep-lat/0311023",
-     SLACcitation  = "%%CITATION = HEP-LAT/0311023;%%"
-}
-@Article{Colangelo:2005gd,
-     author    = "Colangelo, Gilberto and D{\"u}rr, Stephan and Haefeli,
-                  Christoph",
-     title     = "Finite volume effects for meson masses and decay
-                  constants",
-     journal   = "Nucl. Phys.",
-     volume    = "B721",
-     year      = "2005",
-     pages     = "136-174",
-     eprint    = "hep-lat/0503014",
-     SLACcitation  = "%%CITATION = HEP-LAT 0503014;%%"
-}
-@Article{Colangelo:2006mp,
-     author    = "Colangelo, Gilberto and Haefeli, Christoph",
-     title     = "{Finite volume effects for the pion mass at two loops}",
-     journal   = "Nucl. Phys.",
-     volume    = "B744",
-     year      = "2006",
-     pages     = "14-33",
-     eprint    = "hep-lat/0602017",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/j.nuclphysb.2006.03.010",
-     SLACcitation  = "%%CITATION = HEP-LAT/0602017;%%"
-}
-@Book{Collins:1994ab,
-     author    = "Collins, J.C.",
-     title     = "Renormalisation",
-     publisher = "Cambridge University Press",
-     series    = "Cambridge Monographs on Mathematical Physics",
-     year      = "1994",
-     edition   = "",
-}
-@Article{Creutz:1984fj,
-     author    = "Creutz, M. and Gocksch, A. and Ogilvie, M. and
-                  Okawa, M.",
-     title     = "Microcanonical renormalization group",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "53",
-     year      = "1984",
-     pages     = "875",
-     SLACcitation  = "%%CITATION = PRLTA,53,875;%%"
-}
-@Article{Creutz:1989wt,
-     author    = "Creutz, M. and Gocksch, A.",
-     title     = "Higher order hybrid monte carlo algorithms",
-     note     = "BNL-42601"
-}
-@Article{Creutz:1996bg,
-     author    = "Creutz, Michael",
-     title     = "Wilson fermions at finite temperature",
-     year      = "1996",
-     eprint    = "hep-lat/9608024",
-     SLACcitation  = "%%CITATION = HEP-LAT 9608024;%%"
-}
-@Article{Creutz:1998ee,
-     author    = "Creutz, M.",
-     title     = "Evaluating Grassmann integrals",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "81",
-     year      = "1998",
-     pages     = "3555-3558",
-     eprint    = "hep-lat/9806037",
-     SLACcitation  = "%%CITATION = HEP-LAT 9806037;%%"
-}
-@Article{Cundy:2005pi,
-     author    = "Cundy, N. and others",
-     title     = "Numerical Methods for the {QCD} Overlap Operator IV: Hybrid
-                  Monte Carlo",
-     year      = "2005",
-     eprint    = "hep-lat/0502007",
-     SLACcitation  = "%%CITATION = HEP-LAT 0502007;%%"
-}
-@Article{David:1984ys,
-     author    = "David, F. and Hamber, H. W.",
-     title     = "Chiral condensate with {Wilson} fermions",
-     journal   = "Nucl. Phys.",
-     volume    = "B248",
-     year      = "1984",
-     pages     = "381",
-     SLACcitation  = "%%CITATION = NUPHA,B248,381;%%"
-}
-@Article{Davies:2008sw,
-     author    = "Davies, C. T. H. and others",
- collaboration = "HPQCD",
-     title     = "{Update: Accurate Determinations of $\alpha_s$ from
-                  Realistic Lattice QCD}",
-     year      = "2008",
-     eprint    = "0807.1687",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0807.1687;%%"
-}
-@Article{DeGrand:1990dk,
-     author    = "DeGrand, T. A. and Rossi, P.",
-     title     = "Conditioning techniques for dynamical fermions",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "60",
-     year      = "1990",
-     pages     = "211-214",
-     SLACcitation  = "%%CITATION = CPHCB,60,211;%%"
-}
-@Article{DeGrand:1990ip,
-     author    = "DeGrand, T. A.",
-     title     = "Resonance masses from Monte Carlo simulations (with
-                  emphasis on the rho meson)",
-     journal   = "Phys. Rev.",
-     volume    = "D43",
-     year      = "1991",
-     pages     = "2296-2300",
-     SLACcitation  = "%%CITATION = PHRVA,D43,2296;%%"
-}
-@Article{DeGrand:2002vu,
-     author    = "DeGrand, Thomas and Hasenfratz, Anna and Kovacs, Tamas G.",
-     title     = "Improving the chiral properties of lattice fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D67",
-     year      = "2003",
-     pages     = "054501",
-     eprint    = "hep-lat/0211006",
-     SLACcitation  = "%%CITATION = HEP-LAT 0211006;%%"
-}
-@Article{DeTar:2007ni,
-     author    = "DeTar, Carleton and Levkova, L.",
-     title     = "Effects of the disconnected flavor singlet corrections on
-                  the hyperfine splitting in charmonium",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "116",
-     eprint    = "0710.1322",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = ARXIV:0710.1322;%%"
-}
-@Article{DelDebbio:2006cn,
-     author    = "Del Debbio, L. and Giusti, L. and L{\"u}scher, M. and
-                  Petronzio, R. and Tantalo, N.",
-     title     = "QCD with light Wilson quarks on fine lattices. I: First
-                  experiences and physics results",
-     journal   = "JHEP",
-     volume    = "02",
-     year      = "2007",
-     pages     = "056",
-     eprint    = "hep-lat/0610059",
-     SLACcitation  = "%%CITATION = HEP-LAT 0610059;%%"
-}
-@Article{DellaMorte:2000yp,
-     author    = "Della Morte, M. and Frezzotti, R. and Heitger, J. and Sint,
-                  S.",
-     title     = "Non-perturbative scaling tests of twisted mass {QCD}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "94",
-     year      = "2001",
-     pages     = "617-621",
-     eprint    = "hep-lat/0010091",
-     SLACcitation  = "%%CITATION = HEP-LAT 0010091;%%"
-}
-@Article{DellaMorte:2001tu,
-     author    = "Della Morte, M. and Frezzotti, R. and Heitger, J.",
-     title     = "Quenched twisted mass {QCD} at small quark masses and in
-                  large volume",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "260-262",
-     eprint    = "hep-lat/0110166",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110166;%%"
-}
-
-@Article{DellaMorte:2001ys,
-     author    = "Della Morte, M. and Frezzotti, R. and Heitger,
-                  J. and Sint, S.",
- collaboration = "ALPHA",
-     title     = "Cutoff effects in twisted mass lattice {QCD}",
-     journal   = "JHEP",
-     volume    = "10",
-     year      = "2001",
-     pages     = "041",
-     eprint    = "hep-lat/0108019",
-     SLACcitation  = "%%CITATION = HEP-LAT 0108019;%%"
-}                                                                               
-@Article{DellaMorte:2003jj,
-     author    = "Della Morte, M. and others",
- collaboration = "ALPHA",
-     title     = "Simulating the Schroedinger functional with two pseudo-
-                  fermions",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "156",
-     year      = "2003",
-     pages     = "62-72",
-     eprint    = "hep-lat/0307008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0307008;%%"
-}                                                                               
-@Article{DellaMorte:2003mn,
-     author    = "Della Morte, M. and others",
- collaboration = "ALPHA",
-     title     = "Lattice HQET with exponentially improved statistical
-                  precision",
-     journal   = "Phys. Lett.",
-     volume    = "B581",
-     year      = "2004",
-     pages     = "93-98",
-     eprint    = "hep-lat/0307021",
-     SLACcitation  = "%%CITATION = HEP-LAT 0307021;%%"
-}             
-@Article{DellaMorte:2003mw,
-     author    = "Della Morte, M. and others",
- collaboration = "ALPHA",
-     title     = "Static quarks with improved statistical precision",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "129",
-     year      = "2004",
-     pages     = "346-348",
-     eprint    = "hep-lat/0309080",
-     SLACcitation  = "%%CITATION = HEP-LAT 0309080;%%"
-}                                                                  
-@Article{DellaMorte:2005yc,
-     author    = "Della Morte, M. and Shindler, A. and Sommer,
-                  R.",
-     title     = "On lattice actions for static quarks",
-     year      = "2005",
-     eprint    = "hep-lat/0506008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0506008;%%"
-}
-@Article{Dimopoulos:2006dm,
-     author    = "Dimopoulos, P. and others",
- collaboration = "ALPHA",
-     title     = "A precise determination of B(K) in quenched QCD",
-     journal   = "Nucl. Phys.",
-     volume    = "B749",
-     year      = "2006",
-     pages     = "69-108",
-     eprint    = "hep-ph/0601002",
-     SLACcitation  = "%%CITATION = HEP-PH 0601002;%%"
-}
-@Article{Dimopoulos:2007fn,
-     author    = "Dimopoulos, P. and others",
-     title     = "{Renormalisation of quark bilinears with Nf=2 Wilson
-                  fermions and tree-level improved gauge action}",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "241",
-     eprint    = "0710.0975",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0710.0975;%%"
-}
-@Article{Dimopoulos:2007qy,
-     author    = "Dimopoulos, Petros and Frezzotti, Roberto and Herdoiza,
-                  Gregorio and Urbach, Carsten and Wenger, Urs",
- collaboration = "ETM",
-     title     = "{Scaling and low energy constants in lattice QCD with N_f=2
-                  maximally twisted Wilson quarks}",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "102",
-     eprint    = "0710.2498",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0710.2498;%%"
-}
-@Article{Dimopoulos:2008sy,
-     author    = "Dimopoulos, Petros and others",
- collaboration = "ETM",
-     title     = "{Scaling and chiral extrapolation of pion mass and decay
-                  constant with maximally twisted mass QCD}",
-     year      = "2008",
-     eprint    = "0810.2873",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0810.2873;%%"
-}
-@Article{Dong:2001fm,
-     author    = "Dong, S. J. and others",
-     title     = "Chiral properties of pseudoscalar mesons on a quenched
-                  20**4 lattice  with overlap fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D65",
-     year      = "2002",
-     pages     = "054507",
-     eprint    = "hep-lat/0108020",
-     SLACcitation  = "%%CITATION = HEP-LAT 0108020;%%"
-}
-@Article{Duane:1987de,
-     author    = "Duane, S. and Kennedy, A. D. and Pendleton, B. J. and
-                  Roweth, D.",
-     title     = "{H}ybrid monte carlo",
-     journal   = "Phys. Lett.",
-     volume    = "B195",
-     year      = "1987",
-     pages     = "216-222",
-     SLACcitation  = "%%CITATION = PHLTA,B195,216;%%"
-}
-@Article{Edwards:1996vs,
-     author    = "Edwards, R. G. and Horvath, I. and Kennedy, A. D.",
-     title     = "Instabilities and non-reversibility of molecular dynamics
-                  trajectories",
-     journal   = "Nucl. Phys.",
-     volume    = "B484",
-     year      = "1997",
-     pages     = "375-402",
-     eprint    = "hep-lat/9606004",
-     SLACcitation  = "%%CITATION = HEP-LAT 9606004;%%"
-}
-@Article{Edwards:2004sx,
-     author    = "Edwards, Robert G. and Joo, Balint",
- collaboration = "SciDAC",
-     title     = "The {Chroma} software system for lattice {QCD}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "832",
-     eprint    = "hep-lat/0409003",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409003;%%"
-}
-@Article{Eichten:1989zv,
-     author    = "Eichten, E. and Hill, B.",
-     title     = "An effective field theory for the calculation of matrix
-                  elements involving heavy quarks",
-     journal   = "Phys. Lett.",
-     volume    = "B234",
-     year      = "1990",
-     pages     = "511",
-     SLACcitation  = "%%CITATION = PHLTA,B234,511;%%"
-}
-@Article{Farchioni:2002vn,
-     author    = "Farchioni, F. and Gebert, C. and Montvay, I.
-                  and Scorzato, L.",
-     title     = "Numerical simulation tests with light dynamical quarks",
-     journal   = "Eur. Phys. J.",
-     volume    = "C26",
-     year      = "2002",
-     pages     = "237-251",
-     eprint    = "hep-lat/0206008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0206008;%%"
-}
-@Article{Farchioni:2004fs,
-     author    = "Farchioni, F. and others",
-     title     = "The phase structure of lattice {QCD} with {Wilson} quarks and
-                  renormalization group improved gluons",
-     journal   = "Eur. Phys. J.",
-     volume    = "C42",
-     year      = "2005",
-     pages     = "73-87",
-     eprint    = "hep-lat/0410031",
-     SLACcitation  = "%%CITATION = HEP-LAT 0410031;%%"
-}
-@Article{Farchioni:2004ma,
-     author    = "Farchioni, F. and others",
-     title     = "Exploring the phase structure of lattice {{QCD}} with twisted
-                  mass quarks",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "240-245",
-     eprint    = "hep-lat/0409098",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409098;%%"
-}
-@Article{Farchioni:2004us,
-     author    = "Farchioni, F. and others",
-     title     = "Twisted mass quarks and the phase structure of lattice
-                  {QCD}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C39",
-     year      = "2005",
-     pages     = "421-433",
-     eprint    = "hep-lat/0406039",
-     SLACcitation  = "%%CITATION = HEP-LAT 0406039;%%"
-}
-@Article{Farchioni:2005ec,
-     author    = "Farchioni, Federico and others",
-     title     = "Dynamical twisted mass fermions",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "072",
-     eprint    = "hep-lat/0509131",
-     SLACcitation  = "%%CITATION = HEP-LAT 0509131;%%"
-}
-@Article{Farchioni:2005hf,
-     author    = "Farchioni, F. and others",
-     title     = "Twisted mass fermions: Neutral pion masses from
-                  disconnected contributions",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "033",
-     eprint    = "hep-lat/0509036",
-     SLACcitation  = "%%CITATION = HEP-LAT 0509036;%%"
-}
-@Article{Farchioni:2005tu,
-     author    = "Farchioni, F. and others",
-     title     = "Lattice spacing dependence of the first order phase
-                  transition for  dynamical twisted mass fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B624",
-     year      = "2005",
-     pages     = "324-333",
-     eprint    = "hep-lat/0506025",
-     SLACcitation  = "%%CITATION = HEP-LAT 0506025;%%"
-}
-@Article{Feldmann:1999uf,
-     author    = "Feldmann, Thorsten",
-     title     = "{Quark structure of pseudoscalar mesons}",
-     journal   = "Int. J. Mod. Phys.",
-     volume    = "A15",
-     year      = "2000",
-     pages     = "159-207",
-     eprint    = "hep-ph/9907491",
-     SLACcitation  = "%%CITATION = HEP-PH/9907491;%%"
-}
-@Article{Feynman:1948aa,
-     author    = "Feynman, R. P.",
-     title     = "Space-time approach to non-relativistic quantum mechanics",
-     journal   = "Rev. Mod. Phys.",
-     volume    = "20",
-     year      = "1948",
-     pages     = "367-387",
-     SLACcitation  = "%%CITATION = RMPHA,20,367;%%"
-}
-@Article{Fischer:1996th,
-     author    = "Fischer, S. and others",
-     title     = "A Parallel SSOR Preconditioner for Lattice {QCD}",
-     journal   = "Comp. Phys. Commun.",
-     volume    = "98",
-     year      = "1996",
-     pages     = "20-34",
-     eprint    = "hep-lat/9602019",
-     SLACcitation  = "%%CITATION = HEP-LAT 9602019;%%"
-}
-@Article{Fokkema:1998aa,
-     author    = "Fokkema, D.~R. and Sleijpen, G.~L.~G. and Van~der~Vorst, H.~A.",
-     title     = "{J}acobi-{D}avidson style {QR} and {QZ} algorithms for
-                  the reduction of matrix pencils",
-     journal   = "J. Sci. Comput.",
-     volume    = "20",
-     year      = "1998",
-     pages     = "94-125",
-}
-@Article{Foster:1998vw,
-     author    = "Foster, M. and Michael, C.",
-     collaboration = "UKQCD",
-     title     = "Quark mass dependence of hadron masses from lattice {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D59",
-     year      = "1999",
-     pages     = "074503",
-     eprint    = "hep-lat/9810021",
-     SLACcitation  = "%%CITATION = HEP-LAT 9810021;%%"
-}
-@Article{Freund,
-     author    = "Freund, R.W.",
-     journal   = "in Numerical Linear Algebra, L.\ Reichel, A.\ Ruttan and R.S.\ Varga (eds.)",
-     year      = "1993",
-     pages     = "p. 101",
-}
-@Article{Frezzotti:1997ym,
-     author    = "Frezzotti, R. and Jansen, K.",
-     title     = "A polynomial hybrid Monte Carlo algorithm",
-     journal   = "Phys. Lett.",
-     volume    = "B402",
-     year      = "1997",
-     pages     = "328-334",
-     eprint    = "hep-lat/9702016",
-     SLACcitation  = "%%CITATION = HEP-LAT 9702016;%%"
-}
-@Article{Frezzotti:1998eu,
-     author    = "Frezzotti, R. and Jansen, K.",
-     title     = "The {PHMC} algorithm for simulations of dynamical fermions.
-                  {I}: Description and properties",
-     journal   = "Nucl. Phys.",
-     volume    = "B555",
-     year      = "1999",
-     pages     = "395-431",
-     eprint    = "hep-lat/9808011",
-     SLACcitation  = "%%CITATION = HEP-LAT 9808011;%%"
-}
-@Article{Frezzotti:1998yp,
-     author    = "Frezzotti, R. and Jansen, K.",
-     title     = "The {PHMC} algorithm for simulations of dynamical fermions.
-                  {II}:  Performance analysis",
-     journal   = "Nucl. Phys.",
-     volume    = "B555",
-     year      = "1999",
-     pages     = "432-453",
-     eprint    = "hep-lat/9808038",
-     SLACcitation  = "%%CITATION = HEP-LAT 9808038;%%"
-}
-@Article{Frezzotti:1999vv,
-     author    = "Frezzotti, R. and Grassi, P. A. and Sint,
-                  S. and Weisz, P.",
-     title     = "A local formulation of lattice {QCD} without unphysical
-                  fermion zero modes",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "83",
-     year      = "2000",
-     pages     = "941-946",
-     eprint    = "hep-lat/9909003",
-     SLACcitation  = "%%CITATION = HEP-LAT 9909003;%%"
-}
-@Article{Frezzotti:2000nk,
-     author    = "Frezzotti, R. and Grassi, P. A. and Sint,
-                  S. and Weisz, P.",
- collaboration = "ALPHA",
-     title     = "Lattice {QCD} with a chirally twisted mass term",
-     journal   = "JHEP",
-     volume    = "08",
-     year      = "2001",
-     pages     = "058",
-     eprint    = "hep-lat/0101001",
-     SLACcitation  = "%%CITATION = HEP-LAT 0101001;%%"
-}
-@Article{Frezzotti:2001du,
-     author    = "Frezzotti, R. and Sint, S.",
-     title     = "Some remarks on {O(a)} improved twisted mass {QCD}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "814-816",
-     eprint    = "hep-lat/0110140",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110140;%%"
-}
-@Article{Frezzotti:2001ea,
-     author    = "Frezzotti, R. and Sint, S. and Weisz, P.",
- collaboration = "ALPHA",
-     title     = "{O(a)} improved twisted mass lattice {QCD}",
-     journal   = "JHEP",
-     volume    = "07",
-     year      = "2001",
-     pages     = "048",
-     eprint    = "hep-lat/0104014",
-     SLACcitation  = "%%CITATION = HEP-LAT 0104014;%%"
-}
-@Article{Frezzotti:2003ni,
-     author    = "Frezzotti, R. and Rossi, G. C.",
-     title     = "Chirally improving {Wilson} fermions. {I}: {O(a)} improvement",
-     journal   = "JHEP",
-     volume    = "08",
-     year      = "2004",
-     pages     = "007",
-     eprint    = "hep-lat/0306014",
-     SLACcitation  = "%%CITATION = HEP-LAT 0306014;%%"
-}
-@Article{Frezzotti:2003xj,
-     author    = "Frezzotti, R. and Rossi, G. C.",
-     title     = "Twisted-mass lattice {QCD} with mass non-degenerate quarks",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "128",
-     year      = "2004",
-     pages     = "193-202",
-     eprint    = "hep-lat/0311008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0311008;%%"
-}
-@Article{Frezzotti:2004wz,
-     author    = "Frezzotti, R. and Rossi, G. C.",
-     title     = "Chirally improving {Wilson} fermions. {II}: Four-quark
-                  operators",
-     journal   = "JHEP",
-     volume    = "10",
-     year      = "2004",
-     pages     = "070",
-     eprint    = "hep-lat/0407002",
-     SLACcitation  = "%%CITATION = HEP-LAT 0407002;%%"
-}
-@Article{Frezzotti:2005gi,
-     author    = "Frezzotti, R. and Martinelli, G. and Papinutto, M. and
-                  Rossi, G. C.",
-     title     = "Reducing cutoff effects in maximally twisted lattice {QCD}
-                  close to the  chiral limit",
-     journal   = "JHEP",
-     volume    = "04",
-     year      = "2006",
-     pages     = "038",
-     eprint    = "hep-lat/0503034",
-     SLACcitation  = "%%CITATION = HEP-LAT 0503034;%%"
-}
-@Article{Frezzotti:2007qv,
-     author    = "Frezzotti, R. and Rossi, G.",
-     title     = "{O(a^2) cutoff effects in Wilson fermion simulations}",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "277",
-     eprint    = "0710.2492",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0710.2492;%%"
-}
-@Article{Frezzotti:2008dr,
-     author    = "Frezzotti, R. and Lubicz, V. and Simula, S.",
- collaboration = "ETM",
-     title     = "{Electromagnetic form factor of the pion from twisted-mass
-                  lattice {QCD} at {Nf}=2}",
-     year      = "2008",
-     eprint    = "0812.4042",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0812.4042;%%"
-}
-@Article{Fritzsch:1973pi,
-     author    = "Fritzsch, H. and Gell-Mann, M. and Leutwyler, H.",
-     title     = "Advantages of the color octet gluon picture",
-     journal   = "Phys. Lett.",
-     volume    = "B47",
-     year      = "1973",
-     pages     = "365-368",
-     SLACcitation  = "%%CITATION = PHLTA,B47,365;%%"
-}
-@Article{Frommer:1994vn,
-     author    = "Frommer, A. and Hannemann, V. and Nockel, B. and Lippert,
-                  T. and Schilling, K.",
-     title     = "Accelerating {Wilson} fermion matrix inversions by means of
-                  the stabilized biconjugate gradient algorithm",
-     journal   = "Int. J. Mod. Phys.",
-     volume    = "C5",
-     year      = "1994",
-     pages     = "1073-1088",
-     eprint    = "hep-lat/9404013",
-     SLACcitation  = "%%CITATION = HEP-LAT 9404013;%%"
-}
-@Article{Frommer:1995ik,
-     author    = "Frommer, Andreas and Nockel, Bertold and Gusken, Stephan
-                  and Lippert, Thomas and Schilling, Klaus",
-     title     = "Many masses on one stroke: Economic computation of quark
-                  propagators",
-     journal   = "Int. J. Mod. Phys.",
-     volume    = "C6",
-     year      = "1995",
-     pages     = "627-638",
-     eprint    = "hep-lat/9504020",
-     SLACcitation  = "%%CITATION = HEP-LAT 9504020;%%"
-}
-@Article{Furman:1994ky,
-     author    = "Furman, V. and Shamir, Y.",
-     title     = "Axial symmetries in lattice QCD with Kaplan fermions",
-     journal   = "Nucl. Phys.",
-     volume    = "B439",
-     year      = "1995",
-     pages     = "54-78",
-     eprint    = "hep-lat/9405004",
-     SLACcitation  = "%%CITATION = HEP-LAT 9405004;%%"
-}
-@Article{Garden:1999fg,
-     author    = "Garden, J. and Heitger, J. and Sommer, R. and
-                  Wittig H.",
- collaboration = "ALPHA",
-     title     = "Precision computation of the strange quark's mass in
-                  quenched {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B571",
-     year      = "2000",
-     pages     = "237-256",
-     eprint    = "hep-lat/9906013",
-     SLACcitation  = "%%CITATION = HEP-LAT 9906013;%%"
-}
-@Article{Garron:2003cb,
-     author    = "Garron, N. and Giusti, L. and Hoelbling,
-                  C. and Lellouch, L. and Rebbi, C.",
-     title     = "B(K) from quenched {QCD} with exact chiral symmetry",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "92",
-     year      = "2004",
-     pages     = "042001",
-     eprint    = "hep-ph/0306295",
-     SLACcitation  = "%%CITATION = HEP-PH 0306295;%%"
-}
-@Article{Gasser:1982ap,
-     author    = "Gasser, J. and Leutwyler, H.",
-     title     = "Quark masses",
-     journal   = "Phys. Rept.",
-     volume    = "87",
-     year      = "1982",
-     pages     = "77-169",
-     SLACcitation  = "%%CITATION = PRPLC,87,77;%%"
-}
-@Article{Gasser:1983yg,
-     author    = "Gasser, J. and Leutwyler, H.",
-     title     = "Chiral perturbation theory to one loop",
-     journal   = "Ann. Phys.",
-     volume    = "158",
-     year      = "1984",
-     pages     = "142",
-     SLACcitation  = "%%CITATION = APNYA,158,142;%%"
-}
-
-@Article{Gasser:1985gg,
-     author    = "Gasser, J. and Leutwyler, H.",
-     title     = "Chiral perturbation theory: expansions in the mass of the
-                  strange quark",
-     journal   = "Nucl. Phys.",
-     volume    = "B250",
-     year      = "1985",
-     pages     = "465",
-     SLACcitation  = "%%CITATION = NUPHA,B250,465;%%"
-}
-@Article{Gasser:1986vb,
-     author    = "Gasser, J. and Leutwyler, H.",
-     title     = "LIGHT QUARKS AT LOW TEMPERATURES",
-     journal   = "Phys. Lett.",
-     volume    = "B184",
-     year      = "1987",
-     pages     = "83",
-     SLACcitation  = "%%CITATION = PHLTA,B184,83;%%"
-}
-@Article{Gattringer:2003qx,
-     author    = "Gattringer, C. and others",
- collaboration = "BGR",
-     title     = "Quenched spectroscopy with fixed-point and chirally
-                  improved fermions",
-     journal   = "Nucl. Phys.",
-     volume    = "B677",
-     year      = "2004",
-     pages     = "3-51",
-     eprint    = "hep-lat/0307013",
-     SLACcitation  = "%%CITATION = HEP-LAT 0307013;%%"
-}
-@Article{Gell-Mann:1964nj,
-     author    = "Gell-Mann, M.",
-     title     = "A Schematic model of baryons and mesons",
-     journal   = "Phys. Lett.",
-     volume    = "8",
-     year      = "1964",
-     pages     = "214-215",
-     SLACcitation  = "%%CITATION = PHLTA,8,214;%%"
-}
-@Article{Gell-Mann:1968rz,
-     author    = "Gell-Mann, M. and Oakes, R. J. and Renner, B.",
-     title     = "Behavior of current divergences under SU(3) x SU(3)",
-     journal   = "Phys. Rev.",
-     volume    = "175",
-     year      = "1968",
-     pages     = "2195-2199",
-     SLACcitation  = "%%CITATION = PHRVA,175,2195;%%"
-}
-@PhdThesis{Geus:2002,
-  author = 	 {R. Geus},
-  title = 	 {The Jacobi-Davidson algorithm for solving large
-                  sparse symmetric eigenvalue problems with
-                  application to the design of accelerator cavities}, 
-  school = 	 {Swiss Federal Institute Of Technology Z{\"u}rich},
-  year = 	 {2002},
-  OPTkey = 	 {DISS. ETH NO. 14734},
-  OPTtype = 	 {},
-  OPTaddress = 	 {},
-  OPTmonth = 	 {},
-  OPTnote = 	 {},
-  OPTannote = 	 {}
-}
-@Article{Gimenez:1998ue,
-     author    = "Gimenez, V. and Giusti, L. and Rapuano, F. and Talevi, M.",
-     title     = "Non-perturbative renormalization of quark bilinears",
-     journal   = "Nucl. Phys.",
-     volume    = "B531",
-     year      = "1998",
-     pages     = "429-445",
-     eprint    = "hep-lat/9806006",
-     SLACcitation  = "%%CITATION = HEP-LAT 9806006;%%"
-}
-@Article{Gimenez:2005nt,
-     author    = "Gimenez, V. and Lubicz, V. and Mescia, F. and Porretti, V.
-                  and Reyes, J.",
-     title     = "{Operator product expansion and quark condensate from
-                  lattice QCD in  coordinate space}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C41",
-     year      = "2005",
-     pages     = "535-544",
-     eprint    = "hep-lat/0503001",
-     SLACcitation  = "%%CITATION = HEP-LAT/0503001;%%"
-}
-@Article{Ginsparg:1981bj,
-     author    = "Ginsparg, P. H. and {Wilson}, K. G.",
-     title     = "A remnant of chiral symmetry on the lattice",
-     journal   = "Phys. Rev.",
-     volume    = "D25",
-     year      = "1982",
-     pages     = "2649",
-     SLACcitation  = "%%CITATION = PHRVA,D25,2649;%%"
-}
-@Article{Giusti:1998wy,
-     author    = "Giusti, L. and Rapuano, F. and Talevi, M. and Vladikas, A.
-                  ",
-     title     = "The QCD chiral condensate from the lattice",
-     journal   = "Nucl. Phys.",
-     volume    = "B538",
-     year      = "1999",
-     pages     = "249-277",
-     eprint    = "hep-lat/9807014",
-     SLACcitation  = "%%CITATION = HEP-LAT 9807014;%%"
-}
-@Article{Giusti:2001pk,
-     author    = "Giusti, L. and Hoelbling, C. and Rebbi, C.",
-     title     = "Light quark masses with overlap fermions in quenched {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D64",
-     year      = "2001",
-     pages     = "114508",
-     eprint    = "hep-lat/0108007",
-     note      = "Erratum-ibid.D65:079903,2002",
-     SLACcitation  = "%%CITATION = HEP-LAT 0108007;%%"
-}
-@Article{Giusti:2002sm,
-     author    = "Giusti, L. and Hoelbling, C. and L{\"u}scher, M. and Wittig, H.
-                  ",
-     title     = "Numerical techniques for lattice QCD in the
-                  epsilon-regime",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "153",
-     year      = "2003",
-     pages     = "31-51",
-     eprint    = "hep-lat/0212012",
-     SLACcitation  = "%%CITATION = HEP-LAT 0212012;%%"
-}
-@Article{Giusti:2007hk,
-     author    = "Giusti, Leonardo",
-     title     = "Light dynamical fermions on the lattice: Toward the chiral
-                  regime of QCD",
-     journal   = "PoS",
-     volume    = "LAT2006",
-     year      = "2007",
-     pages     = "",
-     eprint    = "hep-lat/0702014",
-     SLACcitation  = "%%CITATION = HEP-LAT/0702014;%%"
-}
-@Article{Glassner:1996gz,
-     author    = "Gl{\"a}ssner, U. and others",
-     title     = "How to compute {G}reen's functions for entire mass
-                  trajectories within {K}rylov solvers",
-     year      = "1996",
-     eprint    = "hep-lat/9605008",
-     SLACcitation  = "%%CITATION = HEP-LAT 9605008;%%"
-}
-@Article{Gockeler:1998fn,
-     author    = "G{\"o}ckeler, M. and others",
-     title     = "Scaling of non-perturbatively {O(a)} improved {Wilson}
-                  fermions: Hadron  spectrum, quark masses and decay
-                  constants",
-     journal   = "Phys. Rev.",
-     volume    = "D57",
-     year      = "1998",
-     pages     = "5562-5580",
-     eprint    = "hep-lat/9707021",
-     SLACcitation  = "%%CITATION = HEP-LAT 9707021;%%"
-}
-@Article{Gorishnii:1990vf,
-     author    = "Gorishnii, S. G. and Kataev, A. L. and Larin, S. A.",
-     title     = "{The O (alpha-s**3) corrections to sigma-tot (e+ e- $\to$
-                  hadrons) and Gamma (tau- $\to$ tau-neutrino + hadrons) in
-                  QCD}",
-     journal   = "Phys. Lett.",
-     volume    = "B259",
-     year      = "1991",
-     pages     = "144-150",
-     SLACcitation  = "%%CITATION = PHLTA,B259,144;%%"
-}
-@Article{Greenberg:1964pe,
-     author    = "Greenberg, O. W.",
-     title     = "Spin and unitary spin independence in a paraquark model of
-                  baryons and mesons",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "13",
-     year      = "1964",
-     pages     = "598-602",
-     SLACcitation  = "%%CITATION = PRLTA,13,598;%%"
-}
-@Article{Gregory:2007ce,
-     author    = "Gregory, Eric B. and Irving, Alan and Richards, Chris M.
-                  and McNeile, Craig and Hart, Alistair",
-     title     = "Pseudoscalar Flavor-Singlet Physics with Staggered
-                  Fermions",
-     year      = "2007",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     eprint    = "0710.1725",
-     SLACcitation  = "%%CITATION = ARXIV:0710.1725;%%"
-}
-@Article{Gross:1973id,
-     author    = "Gross, D. J. and Wilczek, F.",
-     title     = "Ultraviolet behavior of non-Abelian gauge theories",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "30",
-     year      = "1973",
-     pages     = "1343-1346",
-     SLACcitation  = "%%CITATION = PRLTA,30,1343;%%"
-}
-@Article{Gross:1973ju,
-     author    = "Gross, D. J. and Wilczek, F.",
-     title     = "Asymptotically free gauge theories. 1",
-     journal   = "Phys. Rev.",
-     volume    = "D8",
-     year      = "1973",
-     pages     = "3633-3652",
-     SLACcitation  = "%%CITATION = PHRVA,D8,3633;%%"
-}
-@Article{Gross:1974jv,
-     author    = "Gross, D. J. and Neveu, A.",
-     title     = "Dynamical symmetry breaking in asymptotically free field
-                  theories",
-     journal   = "Phys. Rev.",
-     volume    = "D10",
-     year      = "1974",
-     pages     = "3235",
-     SLACcitation  = "%%CITATION = PHRVA,D10,3235;%%"
-}
-@Article{Guagnelli:1998ud,
-     author    = "Guagnelli, M. and Sommer, R. and Wittig, H.",
- collaboration = "ALPHA",
-     title     = "Precision computation of a low-energy reference scale in
-                  quenched  lattice {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B535",
-     year      = "1998",
-     pages     = "389-402",
-     eprint    = "hep-lat/9806005",
-     SLACcitation  = "%%CITATION = HEP-LAT 9806005;%%"
-}
-@Article{Guagnelli:2004ga,
-     author    = "Guagnelli, M. and others",
- collaboration = "Zeuthen-Rome (ZeRo)",
-     title     = "Non-perturbative pion matrix element of a twist-2 operator
-                  from the  lattice",
-     journal   = "Eur. Phys. J.",
-     volume    = "C40",
-     year      = "2005",
-     pages     = "69-80",
-     eprint    = "hep-lat/0405027",
-     SLACcitation  = "%%CITATION = HEP-LAT 0405027;%%"
-}
-@Article{Guagnelli:2004ww,
-     author    = "Guagnelli, M. and others",
- collaboration = "Zeuthen-Rome (ZeRo)",
-     title     = "Finite size effects of a pion matrix element",
-     journal   = "Phys. Lett.",
-     volume    = "B597",
-     year      = "2004",
-     pages     = "216-221",
-     eprint    = "hep-lat/0403009",
-     SLACcitation  = "%%CITATION = HEP-LAT 0403009;%%"
-}
-@Article{Guagnelli:2005zc,
-     author    = "Guagnelli, M. and Heitger, J. and Pena, C. and Sint, S. and
-                  Vladikas, A.",
- collaboration = "ALPHA",
-     title     = "Non-perturbative renormalization of left-left four-fermion
-                  operators in  quenched lattice QCD",
-     journal   = "JHEP",
-     volume    = "03",
-     year      = "2006",
-     pages     = "088",
-     eprint    = "hep-lat/0505002",
-     SLACcitation  = "%%CITATION = HEP-LAT 0505002;%%"
-}
-@Article{Gupta:1988js,
-     author    = "Gupta, R. and Kilcup, G. W. and Sharpe, S. R.
-                  ",
-     title     = "Tuning the hybrid monte carlo algorithm",
-     journal   = "Phys. Rev.",
-     volume    = "D38",
-     year      = "1988",
-     pages     = "1278",
-     SLACcitation  = "%%CITATION = PHRVA,D38,1278;%%"
-}
-@Article{Gupta:1989kx,
-     author    = "Gupta, R. and others",
-     title     = "{QCD} with dynamical {Wilson} fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D40",
-     year      = "1989",
-     pages     = "2072",
-     SLACcitation  = "%%CITATION = PHRVA,D40,2072;%%"
-}
-@Article{Gupta:1990ka,
-     author    = "Gupta, S. and Irback, A. and Karsch, F. and
-                  Petersson, B.",
-     title     = "The acceptance probability in the hybrid monte carlo
-                  method",
-     journal   = "Phys. Lett.",
-     volume    = "B242",
-     year      = "1990",
-     pages     = "437-443",
-     SLACcitation  = "%%CITATION = PHLTA,B242,437;%%"
-}
-@Article{Gupta:1991sn,
-     author    = "Gupta, R. and others",
-     title     = "{QCD} with dynamical {Wilson} fermions. 2",
-     journal   = "Phys. Rev.",
-     volume    = "D44",
-     year      = "1991",
-     pages     = "3272-3292",
-     SLACcitation  = "%%CITATION = PHRVA,D44,3272;%%"
-}
-@Unpublished{Gupta:1997nd,
-     author    = "Gupta, R.",
-     title     = "Introduction to lattice {QCD}",
-     year      = "1997",
-     eprint    = "hep-lat/9807028",
-     note      = "Lectures given at Les Houches Summer School in Theoretical Physics, Session 68",
-     SLACcitation  = "%%CITATION = HEP-LAT 9807028;%%"
-}
-@Article{Han:1965pf,
-     author    = "Han, M. Y. and Nambu, Yoichiro",
-     title     = "Three-triplet model with double SU(3) symmetry",
-     journal   = "Phys. Rev.",
-     volume    = "139",
-     year      = "1965",
-     pages     = "B1006-B1010",
-     SLACcitation  = "%%CITATION = PHRVA,139,B1006;%%"
-}
-@Article{Hasenbusch:2001ne,
-     author    = "Hasenbusch, M.",
-     title     = "Speeding up the {H}ybrid-{M}onte-{C}arlo algorithm for dynamical
-                  fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B519",
-     year      = "2001",
-     pages     = "177-182",
-     eprint    = "hep-lat/0107019",
-     SLACcitation  = "%%CITATION = HEP-LAT 0107019;%%"
-}
-@Article{Hasenbusch:2002ai,
-     author    = "Hasenbusch, M. and Jansen, K.",
-     title     = "Speeding up lattice {QCD} simulations with clover-improved
-                  {Wilson} fermions",
-     journal   = "Nucl. Phys.",
-     volume    = "B659",
-     year      = "2003",
-     pages     = "299-320",
-     eprint    = "hep-lat/0211042",
-     SLACcitation  = "%%CITATION = HEP-LAT 0211042;%%"
-}
-@Article{Hasenbusch:2003vg,
-     author    = "Hasenbusch, Martin",
-     title     = "{Full QCD algorithms towards the chiral limit}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "129",
-     year      = "2004",
-     pages     = "27-33",
-     eprint    = "hep-lat/0310029",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/S0920-5632(03)02504-0",
-     SLACcitation  = "%%CITATION = HEP-LAT/0310029;%%"
-}
-@Article{Hasenfratz:1998jp,
-     author    = "Hasenfratz, P.",
-     title     = "Lattice {QCD} without tuning, mixing and current
-                  renormalization",
-     journal   = "Nucl. Phys.",
-     volume    = "B525",
-     year      = "1998",
-     pages     = "401-409",
-     eprint    = "hep-lat/9802007",
-     SLACcitation  = "%%CITATION = HEP-LAT 9802007;%%"
-}
-@Article{Hasenfratz:1998ri,
-     author    = "Hasenfratz, P. and Laliena, V. and Niedermayer,
-                  F.",
-     title     = "The index theorem in {QCD} with a finite cut-off",
-     journal   = "Phys. Lett.",
-     volume    = "B427",
-     year      = "1998",
-     pages     = "125-131",
-     eprint    = "hep-lat/9801021",
-     SLACcitation  = "%%CITATION = HEP-LAT 9801021;%%"
-}
-@Article{Hasenfratz:2001hp,
-     author    = "Hasenfratz, A. and Knechtli, F.",
-     title     = "Flavor symmetry and the static potential with hypercubic
-                  blocking",
-     journal   = "Phys. Rev.",
-     volume    = "D64",
-     year      = "2001",
-     pages     = "034504",
-     eprint    = "hep-lat/0103029",
-     SLACcitation  = "%%CITATION = HEP-LAT 0103029;%%"
-}
-@Article{Hasenfratz:2001tw,
-     author    = "Hasenfratz, A. and Hoffmann, R. and Knechtli, F.",
-     title     = "The static potential with hypercubic blocking",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "418-420",
-     eprint    = "hep-lat/0110168",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110168;%%"
-}
-@Article{Hashimoto:2008xg,
-     author    = "Hashimoto, Koichi and Izubuchi, Taku",
-     title     = "{eta' meson from two flavor dynamical domain wall
-                  fermions}",
-     year      = "2008",
-     eprint    = "0803.0186",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = ARXIV:0803.0186;%%"
-}
-@Article{Heitger:2000ay,
-     author    = "Heitger, J. and Sommer, R. and Wittig, H.",
- collaboration = "ALPHA",
-     title     = "Effective chiral Lagrangians and lattice {{QCD}}",
-     journal   = "Nucl. Phys.",
-     volume    = "B588",
-     year      = "2000",
-     pages     = "377-399",
-     eprint    = "hep-lat/0006026",
-     note      = "and references therein",
-     SLACcitation  = "%%CITATION = HEP-LAT 0006026;%%"
-}
-@Article{Hernandez:1998et,
-     author    = "Hernandez, P. and Jansen, K. and L{\"u}scher, M.",
-     title     = "Locality properties of Neuberger's lattice Dirac operator",
-     journal   = "Nucl. Phys.",
-     volume    = "B552",
-     year      = "1999",
-     pages     = "363-378",
-     eprint    = "hep-lat/9808010",
-     SLACcitation  = "%%CITATION = HEP-LAT 9808010;%%"
-}
-@Article{Hernandez:2000sb,
-     author    = "Hernandez, P. and Jansen, K. and Lellouch, L.",
-     title     = "A numerical treatment of Neuberger's lattice Dirac
-                  operator",
-     year      = "2000",
-     eprint    = "hep-lat/0001008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0001008;%%"
-}
-@Article{Hernandez:2001hq,
-     author    = "Hernandez, P. and Jansen, K. and Lellouch, L. and
-                  Wittig, H.",
-     title     = "Scalar condensate and light quark masses from overlap
-                  fermions",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "766-771",
-     eprint    = "hep-lat/0110199",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110199;%%"
-}
-@Article{Hernandez:2001yn,
-     author    = "Hernandez, P. and Jansen, K. and Lellouch, L. and
-                  Wittig, H.",
-     title     = "Non-perturbative renormalization of the quark condensate in
-                  {Ginsparg}-{Wilson} regularizations",
-     journal   = "JHEP",
-     volume    = "07",
-     year      = "2001",
-     pages     = "018",
-     eprint    = "hep-lat/0106011",
-     SLACcitation  = "%%CITATION = HEP-LAT 0106011;%%"
-}
-@Article{Horsley:2004mx,
-     author    = "Horsley, R. and Perlt, H. and Rakow, P. E. L. and
-                  Schierholz, G. and Schiller, A.",
- collaboration = "QCDSF",
-     title     = "One-loop renormalisation of quark bilinears for overlap
-                  fermions with  improved gauge actions",
-     journal   = "Nucl. Phys.",
-     volume    = "B693",
-     year      = "2004",
-     pages     = "3-35",
-     eprint    = "hep-lat/0404007",
-     SLACcitation  = "%%CITATION = HEP-LAT 0404007;%%"
-}
-@Article{Ilgenfritz:2003gw,
-     author    = "Ilgenfritz, E.-M. and Kerler, W. and
-                  M{\"u}ller-Preu{\ss}ker, M. and Sternbeck, A. and St{\"u}ben, H.",
-     title     = "A numerical reinvestigation of the {Aoki} phase with {N(f)} = 2
-                  {Wilson}  fermions at zero temperature",
-     journal   = "Phys. Rev.",
-     volume    = "D69",
-     year      = "2004",
-     pages     = "074511",
-     eprint    = "hep-lat/0309057",
-     SLACcitation  = "%%CITATION = HEP-LAT 0309057;%%"
-}
-@Article{Ilgenfritz:2006tz,
-     author    = "Ilgenfritz, E. -M. and others",
-     title     = "Twisted mass QCD thermodynamics: First results on apeNEXT",
-     year      = "2006",
-     eprint    = "hep-lat/0610112",
-     SLACcitation  = "%%CITATION = HEP-LAT 0610112;%%"
-}
-@Article{Iwasaki:1983ck,
-     author    = "Iwasaki, Y.",
-     title     = "Renormalization group analysis of lattice theories and
-                  improved lattice action. 2. four-dimensional nonabelian
-                  SU(N) gauge model",
-     note     = "UTHEP-118"
-}
-@Article{Iwasaki:1985we,
-     author    = "Iwasaki, Y.",
-     title     = "Renormalization group analysis of lattice theories and
-                  improved lattice action: two-dimensional nonlinear O(N)
-                  sigma model",
-     journal   = "Nucl. Phys.",
-     volume    = "B258",
-     year      = "1985",
-     pages     = "141-156",
-     SLACcitation  = "%%CITATION = NUPHA,B258,141;%%"
-}
-@Article{Iwasaki:1992hn,
-     author    = "Iwasaki, Y. and Kanaya, K. and Sakai, S. and Yoshie, T.",
-     title     = "Quark confinement in multi - flavor quantum
-                  chromodynamics",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "30",
-     year      = "1993",
-     pages     = "327-330",
-     eprint    = "hep-lat/9211035",
-     SLACcitation  = "%%CITATION = HEP-LAT 9211035;%%"
-}
-@Article{Izubuchi:1998hy,
-     author    = "Izubuchi, T. and Noaki, J. and Ukawa, A.",
-     title     = "Two-dimensional lattice Gross-Neveu model with {Wilson}
-                  fermion action at  finite temperature and chemical
-                  potential",
-     journal   = "Phys. Rev.",
-     volume    = "D58",
-     year      = "1998",
-     pages     = "114507",
-     eprint    = "hep-lat/9805019",
-     SLACcitation  = "%%CITATION = HEP-LAT 9805019;%%"
-}
-@Article{Jacobs:1983ph,
-     author    = "Jacobs, L.",
-     title     = "Undoubling chirally symmetric lattice fermions",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "51",
-     year      = "1983",
-     pages     = "172",
-     SLACcitation  = "%%CITATION = PRLTA,51,172;%%"
-}
-@Article{Jagels:1994a,
-     author    = "Jagels, C. F. and Reichel, L.",
-     title     = "A fast minimal residual algorithm for shifted unitary matrices",
-     journal   = "Numer. Linear Algebra Appl.",
-     volume    = "1(6)",
-     pages     = "555-570",
-     year      = "1994"
-}
-@Article{Jagels:1994aa,
-     author    = "Jagels, C. F. and Reichel, L.",
-     title     = "A Fast Minimal Residual Algorithm for Shifted Unitary 
-                  Matrices",
-     journal   = "Numerical Linear Algebra with Applications",
-     volume    = "1(6)",
-     year      = "1994",
-     pages     = "555-570",
-}
-@Article{Jansen:1994ym,
-     author    = "Jansen, K.",
-     title     = "Domain wall fermions and chiral gauge theories",
-     journal   = "Phys. Rept.",
-     volume    = "273",
-     year      = "1996",
-     pages     = "1-54",
-     eprint    = "hep-lat/9410018",
-     SLACcitation  = "%%CITATION = HEP-LAT 9410018;%%"
-}
-@Article{Jansen:1995ck,
-     author    = "Jansen, Karl and others",
-     title     = "Non-perturbative renormalization of lattice QCD at all
-                  scales",
-     journal   = "Phys. Lett.",
-     volume    = "B372",
-     year      = "1996",
-     pages     = "275-282",
-     eprint    = "hep-lat/9512009",
-     SLACcitation  = "%%CITATION = HEP-LAT 9512009;%%"
-}
-@Article{Jansen:1996cq,
-     author    = "Jansen, K. and Liu, C.",
-     title     = "Study of Liapunov exponents and the reversibility of
-                  molecular dynamics  algorithms",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "53",
-     year      = "1997",
-     pages     = "974-976",
-     eprint    = "hep-lat/9607057",
-     SLACcitation  = "%%CITATION = HEP-LAT 9607057;%%"
-}
-@Article{Jansen:1996xp,
-     author    = "Jansen, K.",
-     title     = "Recent developments in fermion simulation algorithms",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "53",
-     year      = "1997",
-     pages     = "127-133",
-     eprint    = "hep-lat/9607051",
-     SLACcitation  = "%%CITATION = HEP-LAT 9607051;%%"
-}
-@Article{Jansen:1997yt,
-     author    = "Jansen, K. and Liu, C.",
-     title     = "Implementation of Symanzik's improvement program for
-                  simulations of  dynamical {Wilson} fermions in lattice {QCD}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "99",
-     year      = "1997",
-     pages     = "221-234",
-     eprint    = "hep-lat/9603008",
-     SLACcitation  = "%%CITATION = HEP-LAT 9603008;%%"
-}
-@Article{Jansen:1998mx,
-     author    = "Jansen, K. and Sommer, R.",
- collaboration = "ALPHA",
-     title     = "O(a) improvement of lattice {QCD} with two flavors of
-                  {Wilson} quarks",
-     journal   = "Nucl. Phys.",
-     volume    = "B530",
-     year      = "1998",
-     pages     = "185-203",
-     eprint    = "hep-lat/9803017",
-     SLACcitation  = "%%CITATION = HEP-LAT 9803017;%%"
-}
-@Article{Jansen:2003ir,
-     author    = "Jansen, K. and Shindler, A. and Urbach, C. and
-                  Wetzorke, I.",
- collaboration = "\xlf",
-     title     = "Scaling test for {Wilson} twisted mass {QCD}",
-     journal   = "Phys. Lett.",
-     volume    = "B586",
-     year      = "2004",
-     pages     = "432-438",
-     eprint    = "hep-lat/0312013",
-     SLACcitation  = "%%CITATION = HEP-LAT 0312013;%%"
-}
-@Article{Jansen:2003jq,
-     author    = "Jansen, K. and Nagai, K.-I.",
-     title     = "Reducing residual-mass effects for domain-wall fermions",
-     journal   = "JHEP",
-     volume    = "12",
-     year      = "2003",
-     pages     = "038",
-     eprint    = "hep-lat/0305009",
-     SLACcitation  = "%%CITATION = HEP-LAT 0305009;%%"
-}
-@Article{Jansen:2003nt,
-     author    = "Jansen, K.",
-     title     = "Actions for dynamical fermion simulations: Are we ready to
-                  go?",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "129",
-     year      = "2004",
-     pages     = "3-16",
-     eprint    = "hep-lat/0311039",
-     SLACcitation  = "%%CITATION = HEP-LAT 0311039;%%"
-}
-@Article{Jansen:2005cg,
-     author    = "Jansen, K. and others",
- collaboration = "\xlf",
-     title     = "Flavour breaking effects of {Wilson} twisted mass fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B624",
-     year      = "2005",
-     pages     = "334-341",
-     eprint    = "hep-lat/0507032",
-     SLACcitation  = "%%CITATION = HEP-LAT 0507032;%%"
-}
-@Unpublished{Jansen:2005chi,
-  author = 	 {Jansen, K. and others},
-collaboration = {\xlf},
-  title = 	 {},
-  note = 	 {in preparation},
-  OPTkey = 	 {},
-  OPTmonth = 	 {},
-  year = 	 {2005},
-  OPTannote = 	 {}
-}
-@Article{Jansen:2005gf,
-     author    = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach,
-                  C. and Wetzorke, I.",
- collaboration = "\xlf",
-     title     = "Light quarks with twisted mass fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B619",
-     year      = "2005",
-     pages     = "184-191",
-     eprint    = "hep-lat/0503031",
-     SLACcitation  = "%%CITATION = HEP-LAT 0503031;%%"
-}
-@Article{Jansen:2005kk,
-     author    = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach,
-                  C. and Wetzorke, I.",
- collaboration = "\xlf",
-     title     = "Quenched scaling of {Wilson} twisted mass fermions",
-     journal   = "JHEP",
-     volume    = "09",
-     year      = "2005",
-     pages     = "071",
-     eprint    = "hep-lat/0507010",
-     SLACcitation  = "%%CITATION = HEP-LAT 0507010;%%"
-}
-@Article{Jansen:2005yp,
-     author    = "Jansen, Karl and Shindler, Andrea and Urbach, Carsten and
-                  Wenger, Urs",
-     title     = "{HMC} algorithm with multiple time scale integration and mass
-                  preconditioning",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "118",
-     eprint    = "hep-lat/0510064",
-     SLACcitation  = "%%CITATION = HEP-LAT 0510064;%%"
-}
-@Article{Jansen:2006ks,
-     author    = "Jansen, Karl",
-     title     = "Status report on ILDG activities",
-     year      = "2006",
-     eprint    = "hep-lat/0609012",
-     SLACcitation  = "%%CITATION = HEP-LAT 0609012;%%"
-}
-@Article{Jansen:2006rf,
-     author    = "Jansen, Karl and Urbach, Carsten",
- collaboration = "ETM",
-     title     = "First results with two light flavours of quarks with
-                  maximally twisted mass",
-     year      = "2006",
-     eprint    = "hep-lat/0610015",
-     SLACcitation  = "%%CITATION = HEP-LAT 0610015;%%"
-}
-@Article{Jansen:2008wv,
-     author    = "Jansen, K. and Michael, C. and Urbach, C.",
- collaboration = "ETM",
-     title     = "The eta' meson from lattice {QCD}",
-     year      = "2008",
-     eprint    = "0804.3871",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0804.3871;%%"
-}
-@Article{Jansen:2008zz,
-     author    = "Jansen, K. and Michael, C. and Urbach, C.",
-     title     = "{The eta-prime meson from lattice QCD}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C58",
-     year      = "2008",
-     pages     = "261-269",
-     doi       = "10.1140/epjc/s10052-008-0764-6",
-     SLACcitation  = "%%CITATION = EPHJA,C58,261;%%"
-}
-@Unpublished{Jegerlehner:1996pm,
-     author    = "Jegerlehner, Beat",
-     title     = "Krylov space solvers for shifted linear systems",
-     year      = "1996",
-     eprint    = "hep-lat/9612014",
-     note      = "unpublished",
-     SLACcitation  = "%%CITATION = HEP-LAT 9612014;%%"
-}
-@Article{Jegerlehner:1997rn,
-     author    = "Jegerlehner, B.",
-     title     = "Multiple mass solvers",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "63",
-     year      = "1998",
-     pages     = "958-960",
-     eprint    = "hep-lat/9708029",
-     SLACcitation  = "%%CITATION = HEP-LAT 9708029;%%"
-}
-@Article{Jegerlehner:2003qp,
-     author    = "Jegerlehner, F.",
-     title     = "Theoretical precision in estimates of the hadronic
-                  contributions to  (g-2)mu and alpha(QED)(M(Z))",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "126",
-     year      = "2004",
-     pages     = "325-334",
-     eprint    = "hep-ph/0310234",
-     SLACcitation  = "%%CITATION = HEP-PH 0310234;%%"
-}
-@Article{Jenkins:1990jv,
-     author    = "Jenkins, Elizabeth Ellen and Manohar, Aneesh V.",
-     title     = "Baryon chiral perturbation theory using a heavy fermion
-                  Lagrangian",
-     journal   = "Phys. Lett.",
-     volume    = "B255",
-     year      = "1991",
-     pages     = "558-562",
-     SLACcitation  = "%%CITATION = PHLTA,B255,558;%%"
-}
-@Article{Kaiser:1998ds,
-     author    = "Kaiser, Roland and Leutwyler, H.",
-     title     = "{Pseudoscalar decay constants at large N(c)}",
-     year      = "1998",
-     eprint    = "hep-ph/9806336",
-     SLACcitation  = "%%CITATION = HEP-PH/9806336;%%"
-}
-
-@Article{Kalkreuter:1995mm,
-     author    = "Kalkreuter, Thomas and Simma, Hubert",
-     title     = "An Accelerated conjugate gradient algorithm to compute low
-                  lying eigenvalues: A Study for the Dirac operator in SU(2)
-                  lattice QCD",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "93",
-     year      = "1996",
-     pages     = "33-47",
-     eprint    = "hep-lat/9507023",
-     SLACcitation  = "%%CITATION = HEP-LAT 9507023;%%"
-}
-@Article{Kalkreuter:1996mm,
-     author    = "Kalkreuter, T. and Simma, H.",
-     title     = "An Accelerated conjugate gradient algorithm to compute low
-                  lying eigenvalues: A Study for the Dirac operator in SU(2)
-                  lattice {QCD}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "93",
-     year      = "1996",
-     pages     = "33-47",
-     eprint    = "hep-lat/9507023",
-     SLACcitation  = "%%CITATION = HEP-LAT 9507023;%%"
-}
-@Article{Kamleh:2005wg,
-     author    = "Kamleh, W. and Peardon, M. J.",
- collaboration = "TrinLat",
-     title     = "{Polynomial filtering for HMC in lattice QCD}",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "106",
-     SLACcitation  = "%%CITATION = POSCI,LAT2005,106;%%"
-}
-@Article{Kaplan:1992bt,
-     author    = "Kaplan, D. B.",
-     title     = "A Method for simulating chiral fermions on the lattice",
-     journal   = "Phys. Lett.",
-     volume    = "B288",
-     year      = "1992",
-     pages     = "342-347",
-     eprint    = "hep-lat/9206013",
-     SLACcitation  = "%%CITATION = HEP-LAT 9206013;%%"
-}
-@Article{Karsten:1980wd,
-     author    = "Karsten, L. H. and Smit, J.",
-     title     = "Lattice fermions: species doubling, chiral invariance, and
-                  the triangle anomaly",
-     journal   = "Nucl. Phys.",
-     volume    = "B183",
-     year      = "1981",
-     pages     = "103",
-     SLACcitation  = "%%CITATION = NUPHA,B183,103;%%"
-}
-@Article{Kennedy:1990bv,
-     author    = "Kennedy, A. D. and Pendleton, B.",
-     title     = "Acceptances and autocorrelations in hybrid Monte Carlo",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "20",
-     year      = "1991",
-     pages     = "118-121",
-     SLACcitation  = "%%CITATION = NUPHZ,20,118;%%"
-}
-@Article{Knechtli:1998gf,
-     author    = "Knechtli, F. and Sommer, R.",
- collaboration = "ALPHA",
-     title     = "String breaking in SU(2) gauge theory with scalar matter
-                  fields",
-     journal   = "Phys. Lett.",
-     volume    = "B440",
-     year      = "1998",
-     pages     = "345-352",
-     eprint    = "hep-lat/9807022",
-     SLACcitation  = "%%CITATION = HEP-LAT 9807022;%%"
-}
-@Article{Knechtli:2000df,
-     author    = "Knechtli, F. and Sommer, R.",
- collaboration = "ALPHA",
-     title     = "String breaking as a mixing phenomenon in the SU(2) Higgs
-                  model",
-     journal   = "Nucl. Phys.",
-     volume    = "B590",
-     year      = "2000",
-     pages     = "309-328",
-     eprint    = "hep-lat/0005021",
-     SLACcitation  = "%%CITATION = HEP-LAT 0005021;%%"
-}
-@Article{Lacock:1994qx,
-     author    = "Lacock, P. and McKerrell, A. and Michael, C. and Stopher,
-                            I. M. and Stephenson, P. W.",
-     collaboration = "UKQCD",
-     title     = "Efficient hadronic operators in lattice gauge theory",
-     journal   = "Phys. Rev.",
-     volume    = "D51",
-     year      = "1995",
-     pages     = "6403-6410",
-     eprint    = "hep-lat/9412079",
-     SLACcitation  = "%%CITATION = HEP-LAT 9412079;%%"
-}
-@Article{Lepage:1992xa,
-     author    = "Lepage, G. Peter and Mackenzie, Paul B.",
-     title     = "On the viability of lattice perturbation theory",
-     journal   = "Phys. Rev.",
-     volume    = "D48",
-     year      = "1993",
-     pages     = "2250-2264",
-     eprint    = "hep-lat/9209022",
-     SLACcitation  = "%%CITATION = HEP-LAT 9209022;%%"
-}
-@Article{Lepage:2001ym,
-     author    = "Lepage, G. P. and others",
-     title     = "{Constrained curve fitting}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "12-20",
-     eprint    = "hep-lat/0110175",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/S0920-5632(01)01638-3",
-     SLACcitation  = "%%CITATION = HEP-LAT/0110175;%%"
-}
-@Article{Lesk:2002gd,
-     author    = "Lesk, V. I. and others",
- collaboration = "CP-PACS",
-     title     = "Flavor singlet meson mass in the continuum limit in two-
-                  flavor lattice QCD",
-     journal   = "Phys. Rev.",
-     volume    = "D67",
-     year      = "2003",
-     pages     = "074503",
-     eprint    = "hep-lat/0211040",
-     SLACcitation  = "%%CITATION = HEP-LAT/0211040;%%"
-}
-@Article{Leutwyler:1997yr,
-     author    = "Leutwyler, H.",
-     title     = "{On the 1/N-expansion in chiral perturbation theory}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "64",
-     year      = "1998",
-     pages     = "223-231",
-     eprint    = "hep-ph/9709408",
-     SLACcitation  = "%%CITATION = HEP-PH/9709408;%%"
-}
-@Article{Leutwyler:2006qq,
-     author    = "Leutwyler, H.",
-     title     = "pi pi scattering",
-     year      = "2006",
-     eprint    = "hep-ph/0612112",
-     SLACcitation  = "%%CITATION = HEP-PH 0612112;%%"
-}
-@Article{Liu:1997fs,
-     author    = "Liu, C. and Jaster, A. and Jansen, K.",
-     title     = "Liapunov exponents and the reversibility of molecular
-                  dynamics  algorithms",
-     journal   = "Nucl. Phys.",
-     volume    = "B524",
-     year      = "1998",
-     pages     = "603-617",
-     eprint    = "hep-lat/9708017",
-     SLACcitation  = "%%CITATION = HEP-LAT 9708017;%%"
-}
-@Article{Luscher:1985dn,
-     author    = "L{\"u}scher, M.",
-     title     = "{Volume Dependence of the Energy Spectrum in Massive
-                  Quantum Field Theories. 1. Stable Particle States}",
-     journal   = "Commun. Math. Phys.",
-     volume    = "104",
-     year      = "1986",
-     pages     = "177",
-     doi       = "10.1007/BF01211589",
-     SLACcitation  = "%%CITATION = CMPHA,104,177;%%"
-}
-@Article{Luscher:1990ck,
-     author    = "L{\"u}scher, M. and Wolff, U.",
-     title     = "How to calculate the elastic scattering matrix in two-
-                  dimensional quantum field theories by numerical
-                  simulation",
-     journal   = "Nucl. Phys.",
-     volume    = "B339",
-     year      = "1990",
-     pages     = "222-252",
-     SLACcitation  = "%%CITATION = NUPHA,B339,222;%%"
-}
-@Article{Luscher:1993dy,
-     author    = "L{\"u}scher, Martin",
-     title     = "{A Portable high quality random number generator for
-                  lattice field theory simulations}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = 79,
-     year      = 1994,
-     pages     = "100-110",
-     eprint    = "hep-lat/9309020",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/0010-4655(94)90232-1",
-     SLACcitation  = "%%CITATION = HEP-LAT/9309020;%%"
-}
-@Article{Luscher:1993xx,
-     author    = "L{\"u}scher, Martin",
-     title     = "A New approach to the problem of dynamical quarks in
-                  numerical simulations of lattice {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B418",
-     year      = "1994",
-     pages     = "637-648",
-     eprint    = "hep-lat/9311007",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/0550-3213(94)90533-9",
-     SLACcitation  = "%%CITATION = HEP-LAT/9311007;%%"
-}
-@Article{Luscher:1993xx,
-     author    = "L{\"u}scher, M.",
-     title     = "A New approach to the problem of dynamical quarks in
-                  numerical simulations of lattice {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B418",
-     year      = "1994",
-     pages     = "637-648",
-     eprint    = "hep-lat/9311007",
-     SLACcitation  = "%%CITATION = HEP-LAT 9311007;%%"
-}
-@Article{Luscher:1996sc,
-     author    = "L{\"u}scher, M. and Sint, S. and Sommer, R. and
-                  Weisz, P.",
-     title     = "Chiral symmetry and {O(a)} improvement in lattice {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B478",
-     year      = "1996",
-     pages     = "365-400",
-     eprint    = "hep-lat/9605038",
-     SLACcitation  = "%%CITATION = HEP-LAT 9605038;%%"
-}
-@Article{Luscher:1996ug,
-     author    = "L{\"u}scher, M. and Sint, S. and Sommer, R. and
-                  Weisz, P. and Wolff, U.",
-     title     = "Non-perturbative {O(a)} improvement of lattice {QCD}",
-     journal   = "Nucl. Phys.",
-     volume    = "B491",
-     year      = "1997",
-     pages     = "323-343",
-     eprint    = "hep-lat/9609035",
-     SLACcitation  = "%%CITATION = HEP-LAT 9609035;%%"
-}
-@Article{Luscher:1998pq,
-     author    = "L{\"u}scher, M.",
-     title     = "Exact chiral symmetry on the lattice and the {Ginsparg}-
-                  {Wilson} relation",
-     journal   = "Phys. Lett.",
-     volume    = "B428",
-     year      = "1998",
-     pages     = "342-345",
-     eprint    = "hep-lat/9802011",
-     SLACcitation  = "%%CITATION = HEP-LAT 9802011;%%"
-}
-@Article{Luscher:2001tx,
-     author    = "L{\"u}scher, Martin",
-     title     = "{Lattice QCD on PCs?}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "21-28",
-     eprint    = "hep-lat/0110007",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/S0920-5632(01)01639-5",
-     SLACcitation  = "%%CITATION = HEP-LAT/0110007;%%"
-}
-@Article{Luscher:2003qa,
-     author    = "L{\"u}scher, M.",
-     title     = "Solution of the {D}irac equation in lattice {QCD} using a
-                  domain  decomposition method",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "156",
-     year      = "2004",
-     pages     = "209-220",
-     eprint    = "hep-lat/0310048",
-     SLACcitation  = "%%CITATION = HEP-LAT 0310048;%%"
-}
-@Article{Luscher:2004rx,
-     author    = "L{\"u}scher, M.",
-     title     = "Schwarz-preconditioned {HMC} algorithm for two-flavour
-                  lattice {QCD}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "165",
-     year      = "2005",
-     pages     = "199",
-     eprint    = "hep-lat/0409106",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409106;%%"
-}
-
-@Article{Luscher:2005mv,
-     author    = "L{\"u}scher, Martin",
-     title     = "Lattice {QCD} with light {W}ilson quarks",
-     journal   = "\href{http://pos.sissa.it/archive/conferences/020/008/LAT2005_002.pdf}{PoS(LAT2005)002}", 
-     year      = "2005",
-     eprint    = "hep-lat/0509152",
-     howpublished="Talk presented at International Symposium on Lattice Field Theory (Lattice 2005)",
-     SLACcitation  = "%%CITATION = HEP-LAT 0509152;%%"
-}
-@Article{Luscher:2007es,
-     author    = "L{\"u}scher, Martin",
-     title     = "{Deflation acceleration of lattice {QCD} simulations}",
-     journal   = "JHEP",
-     volume    = "12",
-     year      = "2007",
-     pages     = "011",
-     eprint    = "0710.5417",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     doi       = "10.1088/1126-6708/2007/12/011",
-     SLACcitation  = "%%CITATION = 0710.5417;%%"
-}
-@Article{Luscher:ranluxweb,
-     author    = "L{\"u}scher, M.",
-     title     = "Ranlux random number generator",
-     eprint    = "http://luscher.web.cern.ch/luscher/ranlux/"
-}
-@Article{Luscher:sse,
-     author    = "L{\"u}scher, M.",
-     title     = "Lattice QCD parallel benchmark programs",
-     eprint    = "http://luscher.web.cern.ch/luscher/QCDpbm/"
-}
-@Article{Madras:1988ei,
-     author    = "Madras, N. and Sokal, A. D.",
-     title     = "The Pivot algorithm: a highly efficient Monte Carlo method
-                  for selfavoiding walk",
-     journal   = "J. Statist. Phys.",
-     volume    = "50",
-     year      = "1988",
-     pages     = "109-186",
-     SLACcitation  = "%%CITATION = JSTPB,50,109;%%"
-}
-@Article{Martinelli:1982mw,
-     author    = "Martinelli, G. and Zhang, Yi-Cheng",
-     title     = "THE CONNECTION BETWEEN LOCAL OPERATORS ON THE LATTICE AND
-                  IN THE CONTINUUM AND ITS RELATION TO MESON DECAY
-                  CONSTANTS",
-     journal   = "Phys. Lett.",
-     volume    = "B123",
-     year      = "1983",
-     pages     = "433",
-     SLACcitation  = "%%CITATION = PHLTA,B123,433;%%"
-}
-@Article{Martinelli:1994ty,
-     author    = "Martinelli, G. and Pittori, C. and Sachrajda, Christopher
-                  T. and Testa, M. and Vladikas, A.",
-     title     = "{A General method for nonperturbative renormalization of
-                  lattice operators}",
-     journal   = "Nucl. Phys.",
-     volume    = "B445",
-     year      = "1995",
-     pages     = "81-108",
-     eprint    = "hep-lat/9411010",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/0550-3213(95)00126-D",
-     SLACcitation  = "%%CITATION = HEP-LAT/9411010;%%"
-}
-@Article{McNeile:2000hf,
-     author    = "McNeile, C. and Michael, C.",
-     collaboration = "UKQCD",
-     title     = "The eta and eta' mesons in {QCD}",
-     journal   = "Phys. Lett.",
-     volume    = "B491",
-     year      = "2000",
-     pages     = "123-129",
-     eprint    = "hep-lat/0006020",
-     SLACcitation  = "%%CITATION = HEP-LAT 0006020;%%"
-}
-@Article{McNeile:2000xx,
-     author    = "McNeile, Craig and Michael, Chris",
-     collaboration = "UKQCD",
-     title     = "Mixing of scalar glueballs and flavour-singlet scalar
-                  mesons",
-     journal   = "Phys. Rev.",
-     volume    = "D63",
-     year      = "2001",
-     pages     = "114503",
-     eprint    = "hep-lat/0010019",
-     SLACcitation  = "%%CITATION = HEP-LAT 0010019;%%"
-}
-@Article{McNeile:2001cr,
-     author    = "McNeile, C. and Michael, C. and Sharkey, K. J.",
- collaboration = "UKQCD",
-     title     = "The flavor singlet mesons in {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D65",
-     year      = "2002",
-     pages     = "014508",
-     eprint    = "hep-lat/0107003",
-     SLACcitation  = "%%CITATION = HEP-LAT 0107003;%%"
-}
-@Article{McNeile:2002fh,
-     author    = "McNeile, C. and Michael, C.",
- collaboration = "UKQCD",
-     title     = "Hadronic decay of a vector meson from the lattice",
-     journal   = "Phys. Lett.",
-     volume    = "B556",
-     year      = "2003",
-     pages     = "177-184",
-     eprint    = "hep-lat/0212020",
-     SLACcitation  = "%%CITATION = HEP-LAT 0212020;%%"
-}
-@Article{McNeile:2006bz,
-     author    = "McNeile, C. and Michael, C.",
-     collaboration = "UKQCD",
-     title     = "Decay width of light quark hybrid meson from the lattice",
-     journal   = "Phys. Rev.",
-     volume    = "D73",
-     year      = "2006",
-     pages     = "074506",
-     eprint    = "hep-lat/0603007",
-     SLACcitation  = "%%CITATION = HEP-LAT 0603007;%%"
-}
-@Article{Meyer:2006ty,
-     author    = "Meyer, Harvey B. and others",
-     title     = "{Exploring the HMC trajectory-length dependence of
-                  autocorrelation times in lattice QCD}",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "176",
-     year      = "2007",
-     pages     = "91-97",
-     eprint    = "hep-lat/0606004",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/j.cpc.2006.08.002",
-     SLACcitation  = "%%CITATION = HEP-LAT/0606004;%%"
-}
-@Article{Michael:1982gb,
-     author    = "Michael, C. and Teasdale, I.",
-     title     = "EXTRACTING GLUEBALL MASSES FROM LATTICE QCD",
-     journal   = "Nucl. Phys.",
-     volume    = "B215",
-     year      = "1983",
-     pages     = "433",
-     SLACcitation  = "%%CITATION = NUPHA,B215,433;%%"
-}
-@Article{Michael:1989mf,
-     author    = "Michael, C.",
-     title     = "Particle decay in lattice gauge theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B327",
-     year      = "1989",
-     pages     = "515",
-     SLACcitation  = "%%CITATION = NUPHA,B327,515;%%"
-}
-@Article{Michael:1991nc,
-     author    = "Michael, C.",
-     title     = "Hadronic forces from the lattice",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "26",
-     year      = "1992",
-     pages     = "417-419",
-     SLACcitation  = "%%CITATION = NUPHZ,26,417;%%"
-}
-@Article{Michael:1993yj,
-     author    = "Michael, Christopher",
-     title     = "{Fitting correlated data}",
-     journal   = "Phys. Rev.",
-     volume    = "D49",
-     year      = "1994",
-     pages     = "2616-2619",
-     eprint    = "hep-lat/9310026",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.49.2616",
-     SLACcitation  = "%%CITATION = HEP-LAT/9310026;%%"
-}
-@Article{Michael:1994sz,
-     author    = "Michael, Christopher and McKerrell, A.",
-     title     = "{Fitting correlated hadron mass spectrum data}",
-     journal   = "Phys. Rev.",
-     volume    = "D51",
-     year      = "1995",
-     pages     = "3745-3750",
-     eprint    = "hep-lat/9412087",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.51.3745",
-     SLACcitation  = "%%CITATION = HEP-LAT/9412087;%%"
-}
-@Article{Michael:2007vn,
-     author    = "Michael, C. and Urbach, C.",
- collaboration = "ETM",
-     title     = "Neutral mesons and disconnected diagrams in Twisted Mass
-                  QCD",
-     journal   = "",
-     volume    = "",
-     pages     = "",
-     year      = "2007",
-     eprint    = "0709.4564",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = ARXIV:0709.4564;%%"
-}
-@Book{Montvay:1994cy,
-     author    = "Montvay, I. and M{\"u}nster, G.",
-     title     = "Quantum fields on a lattice",
-     publisher = "Cambridge University Press",
-     year      = "1994",
-     series    = "Cambridge Monographs on Mathematical Physics",
-}
-@Article{Montvay:1995ea,
-     author    = "Montvay, I.",
-     title     = "An Algorithm for Gluinos on the Lattice",
-     journal   = "Nucl. Phys.",
-     volume    = "B466",
-     year      = "1996",
-     pages     = "259-284",
-     eprint    = "hep-lat/9510042",
-     SLACcitation  = "%%CITATION = HEP-LAT 9510042;%%"
-}
-@Article{Montvay:2005tj,
-     author    = "Montvay, I. and Scholz, E.",
-     title     = "Updating algorithms with multi-step stochastic correction",
-     journal   = "Phys. Lett.",
-     volume    = "B623",
-     year      = "2005",
-     pages     = "73-79",
-     eprint    = "hep-lat/0506006",
-     SLACcitation  = "%%CITATION = HEP-LAT 0506006;%%"
-}
-@Article{Morgan:2002a,
-  author       = "Morgan, R. B.",
-  title        = "GMRES with Deflated Restarting",
-  journal      = "SIAM J. Sci. Comput.",
-  volume       = "24",
-  year         = "2002",
-  pages        = "20"
-}
-@Article{Morningstar:2003gk,
-     author    = "Morningstar, Colin and Peardon, Mike J.",
-     title     = "{Analytic smearing of SU(3) link variables in lattice
-                  QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D69",
-     year      = "2004",
-     pages     = "054501",
-     eprint    = "hep-lat/0311018",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.69.054501",
-     SLACcitation  = "%%CITATION = HEP-LAT/0311018;%%"
-}
-@Article{Munster:2004am,
-     author    = "M{\"u}nster, G.",
-     title     = "On the phase structure of twisted mass lattice {QCD}",
-     journal   = "JHEP",
-     volume    = "09",
-     year      = "2004",
-     pages     = "035",
-     eprint    = "hep-lat/0407006",
-     SLACcitation  = "%%CITATION = HEP-LAT 0407006;%%"
-}
-@Article{Munster:2004wt,
-     author    = "M{\"u}nster, Gernot and Schmidt, Christian and Scholz, Enno E.
-                  ",
-     title     = "Chiral perturbation theory for twisted mass {QCD}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "140",
-     year      = "2005",
-     pages     = "320-322",
-     eprint    = "hep-lat/0409066",
-     SLACcitation  = "%%CITATION = HEP-LAT 0409066;%%"
-}   
-@Article{Nagai:2005mi,
-     author    = "Nagai, Kei-ichi and Jansen, Karl",
-     title     = "Two-dimensional lattice Gross-Neveu model with Wilson
-                  twisted mass  fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B633",
-     year      = "2006",
-     pages     = "325-330",
-     eprint    = "hep-lat/0510076",
-     SLACcitation  = "%%CITATION = HEP-LAT 0510076;%%"
-}
-@Unpublished{Nagai:priv,
-  author = 	 {Nagai, K},
-  title = 	 {Two-dimensional Gross-Neveu model with {Wilson}
-                  twisted mass fermions},
-  note = 	 {private communication},
-  OPTkey = 	 {},
-  OPTmonth = 	 {},
-  OPTyear = 	 {},
-  OPTannote = 	 {}
-}
-@Article{Necco:2001xg,
-     author    = "Necco, S. and Sommer, R.",
-     title     = "The {N(f)} = 0 heavy quark potential from short to
-                  intermediate  distances",
-     journal   = "Nucl. Phys.",
-     volume    = "B622",
-     year      = "2002",
-     pages     = "328-346",
-     eprint    = "hep-lat/0108008",
-     SLACcitation  = "%%CITATION = HEP-LAT 0108008;%%"
-}
-@Article{Necco:2003vh,
-     author    = "Necco, Silvia",
-     journal   = "Nucl. Phys.",
-     volume    = "B683",
-     year      = "2004",
-     pages     = "137-167",
-     eprint    = "hep-lat/0309017",
-     SLACcitation  = "%%CITATION = HEP-LAT 0309017;%%"
-}
-@Article{Neff:2001zr,
-     author    = "Neff, H. and Eicker, N. and Lippert, T. and Negele, J. W.
-                  and Schilling, K.",
-     title     = "On the low fermionic eigenmode dominance in {QCD} on the
-                  lattice",
-     journal   = "Phys. Rev.",
-     volume    = "D64",
-     year      = "2001",
-     pages     = "114509",
-     eprint    = "hep-lat/0106016",
-     SLACcitation  = "%%CITATION = HEP-LAT/0106016;%%"
-}
-@Article{Neuberger:1997fp,
-     author    = "Neuberger, H.",
-     title     = "Exactly massless quarks on the lattice",
-     journal   = "Phys. Lett.",
-     volume    = "B417",
-     year      = "1998",
-     pages     = "141-144",
-     eprint    = "hep-lat/9707022",
-     SLACcitation  = "%%CITATION = HEP-LAT 9707022;%%"
-}
-@Article{Neuberger:1998wv,
-     author    = "Neuberger, H.",
-     title     = "More about exactly massless quarks on the lattice",
-     journal   = "Phys. Lett.",
-     volume    = "B427",
-     year      = "1998",
-     pages     = "353-355",
-     eprint    = "hep-lat/9801031",
-     SLACcitation  = "%%CITATION = HEP-LAT 9801031;%%"
-}
-@Article{Niedermayer:1998bi,
-     author    = "Niedermayer, F.",
-     title     = "Exact chiral symmetry, topological charge and related
-                  topics",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "73",
-     year      = "1999",
-     pages     = "105-119",
-     eprint    = "hep-lat/9810026",
-     SLACcitation  = "%%CITATION = HEP-LAT 9810026;%%"
-}
-@Article{Nielsen:1980rz,
-     author    = "Nielsen, H. B. and Ninomiya, M.",
-     title     = "Absence of neutrinos on a lattice. 1. proof by homotopy
-                  theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B185",
-     year      = "1981",
-     pages     = "20",
-     SLACcitation  = "%%CITATION = NUPHA,B185,20;%%"
-}
-@Article{Nielsen:1981hk,
-     author    = "Nielsen, H. B. and Ninomiya, M.",
-     title     = "No go theorem for regularizing chiral fermions",
-     journal   = "Phys. Lett.",
-     volume    = "B105",
-     year      = "1981",
-     pages     = "219",
-     SLACcitation  = "%%CITATION = PHLTA,B105,219;%%"
-}
-@Article{Nielsen:1981xu,
-     author    = "Nielsen, H. B. and Ninomiya, M.",
-     title     = "Absence of neutrinos on a lattice. 2. intuitive topological
-                  proof",
-     journal   = "Nucl. Phys.",
-     volume    = "B193",
-     year      = "1981",
-     pages     = "173",
-     SLACcitation  = "%%CITATION = NUPHA,B193,173;%%"
-}
-@Article{Noaki:1998zc,
-     author    = "Noaki, J. and Izubuchi, T. and Ukawa, A.",
-     title     = "Two-dimensional Gross-Neveu model with {Wilson} fermion
-                  action at finite temperature and density",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "73",
-     year      = "1999",
-     pages     = "483-485",
-     eprint    = "hep-lat/9809071",
-     SLACcitation  = "%%CITATION = HEP-LAT 9809071;%%"
-}
-@Article{Orginos:2001xa,
-     author    = "Orginos, K.",
- collaboration = "RBC",
-     title     = "Chiral properties of domain wall fermions with improved
-                  gauge actions",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "721-723",
-     eprint    = "hep-lat/0110074",
-     SLACcitation  = "%%CITATION = HEP-LAT 0110074;%%"
-}
-@Article{Orth:2005kq,
-     author    = "Orth, B. and Lippert, T. and Schilling, K.",
-     title     = "Finite-size effects in lattice {QCD} with dynamical {Wilson}
-                  fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D72",
-     year      = "2005",
-     pages     = "014503",
-     eprint    = "hep-lat/0503016",
-     SLACcitation  = "%%CITATION = HEP-LAT 0503016;%%"
-}
-@Article{Osterwalder:1973dx,
-     author    = "Osterwalder, K. and Schrader, R.",
-     title     = "Axioms for euclidean Green's functions",
-     journal   = "Commun. Math. Phys.",
-     volume    = "31",
-     year      = "1973",
-     pages     = "83-112",
-     SLACcitation  = "%%CITATION = CMPHA,31,83;%%"
-}
-@Article{Osterwalder:1975tc,
-     author    = "Osterwalder, K. and Schrader, R.",
-     title     = "Axioms for euclidean Green's functions. 2",
-     journal   = "Commun. Math. Phys.",
-     volume    = "42",
-     year      = "1975",
-     pages     = "281",
-     SLACcitation  = "%%CITATION = CMPHA,42,281;%%"
-}
-@Article{Osterwalder:1977pc,
-     author    = "Osterwalder, K. and Seiler, E.",
-     title     = "Gauge field theories on the lattice",
-     journal   = "Ann. Phys.",
-     volume    = "110",
-     year      = "1978",
-     pages     = "440",
-     SLACcitation  = "%%CITATION = APNYA,110,440;%%"
-}
-@Article{PDBook,
-     author = "Eidelman, S. and others",
-     title = "{Review of Particle Physics}",
-     journal = "{Physics Letters B}",
-     year = "2004",
-     volume = "592",
-     pages = {1+},
-     url = {http://pdg.lbl.gov}
-}
-@Article{Peardon:2002wb,
-     author    = "Peardon, M. J. and Sexton, J.",
- collaboration = "TrinLat",
-     title     = "Multiple molecular dynamics time-scales in hybrid Monte
-                  Carlo fermion simulations",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "119",
-     year      = "2003",
-     pages     = "985-987",
-     eprint    = "hep-lat/0209037",
-     SLACcitation  = "%%CITATION = HEP-LAT 0209037;%%"
-}
-@Book{Peskin:1995ev,
-  author = 	 {Peskin, M. E. and Schroeder, D. V.},
-  title = 	 {An Introduction to quantum field theory},
-  publisher = 	 {Westview Press},
-  year = 	 {1995},
-  OPTkey = 	 {},
-  OPTvolume = 	 {},
-  OPTnumber = 	 {},
-  OPTseries = 	 {Advanced Book Program},
-  OPTaddress = 	 {Boulder, Colorado},
-  OPTedition = 	 {},
-  OPTmonth = 	 {},
-  OPTnote = 	 {},
-  OPTannote = 	 {}
-}
-@Article{Politzer:1973fx,
-     author    = "Politzer, H. D.",
-     title     = "Reliable perturbative results for strong interactions?",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "30",
-     year      = "1973",
-     pages     = "1346-1349",
-     SLACcitation  = "%%CITATION = PRLTA,30,1346;%%"
-}
-@Article{Politzer:1974fr,
-     author    = "Politzer, H. D.",
-     title     = "Asymptotic freedom: an approach to strong interactions",
-     journal   = "Phys. Rept.",
-     volume    = "14",
-     year      = "1974",
-     pages     = "129-180",
-     SLACcitation  = "%%CITATION = PRPLC,14,129;%%"
-}
-@Manual{R:2005,
-    title = {R: A language and environment for statistical computing},
-    author = {{R Development Core Team}},
-    organization = {R Foundation for Statistical Computing},
-    address = {Vienna, Austria},
-    year = {2005},
-    note = {{ISBN} 3-900051-07-0},
-    url = {http://www.R-project.org},
-}
-
-@Book{Rothe:1992wy,
-     author    = "Rothe, H.J.",
-     title     = "Lattice gauge theories",
-     publisher = "World Scientific, Singapore",
-     year      = "1992",
-     pages     = "528",
-     edition   = "",
-}
-@Article{Rupak:2002sm,
-     author    = "Rupak, G. and Shoresh, N.",
-     title     = "Chiral perturbation theory for the {Wilson} lattice action",
-     journal   = "Phys. Rev.",
-     volume    = "D66",
-     year      = "2002",
-     pages     = "054503",
-     eprint    = "hep-lat/0201019",
-     SLACcitation  = "%%CITATION = HEP-LAT 0201019;%%"
-}
-
-@Article{Saad:1993a,
-  author  = "Saad, Y.",
-  title   = "A flexible inner-outer preconditioned GMRES algorithm",
-  journal = "SIAM J. Sci. Comput.",
-  volume  = "14 (2)",
-  year    = "1993",
-  pages   = "461-469"
-}
-@Article{Sachrajda:2004mi,
-     author    = "Sachrajda, C. T. and Villadoro, G.",
-     title     = "{Twisted boundary conditions in lattice simulations}",
-     journal   = "Phys. Lett.",
-     volume    = "B609",
-     year      = "2005",
-     pages     = "73-85",
-     eprint    = "hep-lat/0411033",
-     archivePrefix = "arXiv",
-     doi       = "10.1016/j.physletb.2005.01.033",
-     SLACcitation  = "%%CITATION = HEP-LAT/0411033;%%"
-}
-@Article{Scorzato:2004da,
-     author    = "Scorzato, L.",
-     title     = "Pion mass splitting and phase structure in twisted mass
-                  {QCD}",
-     journal   = "Eur. Phys. J.",
-     volume    = "C37",
-     year      = "2004",
-     pages     = "445-455",
-     eprint    = "hep-lat/0407023",
-     SLACcitation  = "%%CITATION = HEP-LAT 0407023;%%"
-}
-@Article{Scorzato:2005rb,
-     author    = "Scorzato, L. and others",
-     title     = "N(f) = 2 lattice {QCD} and chiral perturbation theory",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "153",
-     year      = "2006",
-     pages     = "283-290",
-     eprint    = "hep-lat/0511036",
-     SLACcitation  = "%%CITATION = HEP-LAT 0511036;%%"
-}
-
-@Article{Sexton:1992nu,
-     author    = "Sexton, J. C. and Weingarten, D. H.",
-     title     = "Hamiltonian evolution for the hybrid monte carlo
-                  algorithm",
-     journal   = "Nucl. Phys.",
-     volume    = "B380",
-     year      = "1992",
-     pages     = "665-678",
-     SLACcitation  = "%%CITATION = NUPHA,B380,665;%%"
-}
-
-@Article{Sharpe:1998xm,
-     author    = "Sharpe, S. R. and Singleton, R., Jr.",
-     title     = "Spontaneous flavor and parity breaking with {Wilson}
-                  fermions",
-     journal   = "Phys. Rev.",
-     volume    = "D58",
-     year      = "1998",
-     pages     = "074501",
-     eprint    = "hep-lat/9804028",
-     SLACcitation  = "%%CITATION = HEP-LAT 9804028;%%"
-}
-
-@Article{Sharpe:2004ny,
-     author    = "Sharpe, S. R. and Wu, Jackson M. S.",
-     title     = "Twisted mass chiral perturbation theory at next-to-leading
-                  order",
-     journal   = "Phys. Rev.",
-     volume    = "D71",
-     year      = "2005",
-     pages     = "074501",
-     eprint    = "hep-lat/0411021",
-     SLACcitation  = "%%CITATION = HEP-LAT 0411021;%%"
-}
-@Article{Sharpe:2004ps,
-     author    = "Sharpe, S. R. and Wu, J. M. S.",
-     title     = "The phase diagram of twisted mass lattice {QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "094029",
-     eprint    = "hep-lat/0407025",
-     SLACcitation  = "%%CITATION = HEP-LAT 0407025;%%"
-}
-@Article{Sharpe:2005rq,
-     author    = "Sharpe, Stephen R.",
-     title     = "Observations on discretization errors in twisted-mass
-                  lattice QCD",
-     journal   = "Phys. Rev.",
-     volume    = "D72",
-     year      = "2005",
-     pages     = "074510",
-     eprint    = "hep-lat/0509009",
-     SLACcitation  = "%%CITATION = HEP-LAT 0509009;%%"
-}
-@Article{Sheikholeslami:1985ij,
-     author    = "Sheikholeslami, B. and Wohlert, R.",
-     title     = "Improved continuum limit lattice action for qcd with {Wilson}
-                  fermions",
-     journal   = "Nucl. Phys.",
-     volume    = "B259",
-     year      = "1985",
-     pages     = "572",
-     SLACcitation  = "%%CITATION = NUPHA,B259,572;%%"
-}
-@Article{Shindler:2005vj,
-     author    = "Shindler, Andrea",
-     title     = "Twisted mass lattice {QCD}: Recent developments and results",
-     journal   = "PoS",
-     volume    = "LAT2005",
-     year      = "2006",
-     pages     = "014",
-     eprint    = "hep-lat/0511002",
-     SLACcitation  = "%%CITATION = HEP-LAT 0511002;%%"
-}
-@Article{Shindler:2006tm,
-     author    = "Shindler, A.",
- collaboration = "ETM",
-     title     = "Lattice QCD with light twisted quarks: First results",
-     year      = "2006",
-     eprint    = "hep-ph/0611264",
-     SLACcitation  = "%%CITATION = HEP-PH 0611264;%%"
-}
-@Article{Shindler:2007vp,
-     author    = "Shindler, A.",
-     title     = "{Twisted mass lattice QCD}",
-     journal   = "Phys. Rept.",
-     volume    = "461",
-     year      = "2008",
-     pages     = "37-110",
-     eprint    = "0707.4093",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     doi       = "10.1016/j.physrep.2008.03.001",
-     SLACcitation  = "%%CITATION = 0707.4093;%%"
-}
-@Article{Sleijpen:1996aa,
-     author    = "G. L. G. Sleijpen and H. A. Van der Vorst",
-     title     = "A Jacobi-Davidson iteration method for linear 
-                  eigenvalue problems",
-     journal   = "SIAM Journal on Matrix Analysis and Applications",
-     volume    = "17",
-     year      = "1996",
-     pages     = "401-425",
-}
-@Article{Sommer:1993ce,
-     author    = "Sommer, R.",
-     title     = "A New way to set the energy scale in lattice gauge theories
-                  and its applications to the static force and alpha-s in
-                  SU(2) Yang-Mills theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B411",
-     year      = "1994",
-     pages     = "839-854",
-     eprint    = "hep-lat/9310022",
-     SLACcitation  = "%%CITATION = HEP-LAT 9310022;%%"
-}
-@Article{Sonneveld:1989cgs,
- author = {Peter Sonneveld},
- title = {CGS, a fast Lanczos-type solver for nonsymmetric linear systems},
- journal = {SIAM J. Sci. Stat. Comput.},
- volume = {10},
- number = {1},
- year = {1989},
- issn = {0196-5204},
- pages = {36--52},
- publisher = {Society for Industrial and Applied Mathematics},
- address = {Philadelphia, PA, USA},
- }
-@Article{Sternbeck:2003gy,
-     author    = "Sternbeck, A. and Ilgenfritz, E.-M. and Kerler, W.
-                  and M{\"u}ller-Preu{\ss}ker, M. and St{\"u}ben, H.",
-     title     = "The {Aoki} phase for {N(f)} = 2 {Wilson} fermions revisited",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "129",
-     year      = "2004",
-     pages     = "898-900",
-     eprint    = "hep-lat/0309059",
-     SLACcitation  = "%%CITATION = HEP-LAT 0309059;%%"
-}
-@Article{Sternbeck:2005tk,
-     author    = "Sternbeck, A. and Ilgenfritz, E. -M. and Mueller-Preussker,
-                  M. and Schiller, A.",
-     title     = "{Going infrared in SU(3) Landau gauge gluodynamics}",
-     journal   = "Phys. Rev.",
-     volume    = "D72",
-     year      = "2005",
-     pages     = "014507",
-     eprint    = "hep-lat/0506007",
-     SLACcitation  = "%%CITATION = HEP-LAT/0506007;%%"
-}
-@Article{Symanzik:1983dc,
-     author    = "Symanzik, K.",
-     title     = "Continuum limit and improved action in lattice theories. 1.
-                  principles and phi**4 theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B226",
-     year      = "1983",
-     pages     = "187",
-     SLACcitation  = "%%CITATION = NUPHA,B226,187;%%"
-}
-@Conference{Symanzik:1981hc,
-     author    = "Symanzik, K.",
-     title     = "Some topics in quantum field theory",
-     booktitle = "Mathematical problems in theoretical physics",
-     journal   = "Lecture Notes in Physics",
-     volume    = "153",
-     year      = "1981",
-     pages     = "47-58",
-     editor    = "R. Schrader et al.",
-     note      = "Presented at 6th Int. Conf. on Mathematical Physics,
-                  Berlin, West Germany"
-}
-@Article{Symanzik:1983gh,
-     author    = "Symanzik, K.",
-     title     = "Continuum limit and improved action in lattice theories. 2.
-                  O(N) nonlinear sigma model in perturbation theory",
-     journal   = "Nucl. Phys.",
-     volume    = "B226",
-     year      = "1983",
-     pages     = "205",
-     SLACcitation  = "%%CITATION = NUPHA,B226,205;%%"
-}
-@Article{Takaishi:1996xj,
-     author    = "Takaishi, T.",
-     title     = "Heavy quark potential and effective actions on blocked
-                  configurations",
-     journal   = "Phys. Rev.",
-     volume    = "D54",
-     year      = "1996",
-     pages     = "1050-1053",
-     SLACcitation  = "%%CITATION = PHRVA,D54,1050;%%"
-}
-@Article{Takaishi:2005tz,
-     author    = "Takaishi, Tetsuya and de Forcrand, Philippe",
-     title     = "{Testing and tuning new symplectic integrators for hybrid
-                  Monte Carlo  algorithm in lattice QCD}",
-     journal   = "Phys. Rev.",
-     volume    = "E73",
-     year      = "2006",
-     pages     = "036706",
-     eprint    = "hep-lat/0505020",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevE.73.036706",
-     SLACcitation  = "%%CITATION = HEP-LAT/0505020;%%"
-}
-@Article{Takeda:2004xh,
-     author    = "Takeda, S. and others",
-     title     = "A scaling study of the step scaling function in SU(3) gauge
-                  theory with  improved gauge actions",
-     journal   = "Phys. Rev.",
-     volume    = "D70",
-     year      = "2004",
-     pages     = "074510",
-     eprint    = "hep-lat/0408010",
-     SLACcitation  = "%%CITATION = HEP-LAT 0408010;%%"
-}
-@Article{Ukawa:2002pc,
-     author    = "Ukawa, A.",
- collaboration = "CP-PACS and JL{QCD}",
-     title     = "Computational cost of full {QCD} simulations experienced by
-                  {CP-PACS and JLQCD Collaborations}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "106",
-     year      = "2002",
-     pages     = "195-196",
-     SLACcitation  = "%%CITATION = NUPHZ,106,195;%%"
-}
-@Article{Urbach:2005ji,
-     author    = "Urbach, C. and Jansen, K. and Shindler, A. and Wenger, U.",
-     title     = "{HMC} algorithm with multiple time scale integration and mass
-                  preconditioning",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "174",
-     year      = "2006",
-     pages     = "87-98",
-     eprint    = "hep-lat/0506011",
-     SLACcitation  = "%%CITATION = HEP-LAT 0506011;%%"
-}
-@Article{Urbach:2007rt,
-     author    = "Urbach, Carsten",
- collaboration = "ETM",
-     title     = "{Lattice QCD with two light Wilson quarks and maximally
-                  twisted mass}",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "022",
-     eprint    = "0710.1517",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0710.1517;%%"
-}
-@Article{WalkerLoud:2005bt,
-     author    = "Walker-Loud, Andre and Wu, Jackson M. S.",
-     title     = "{Nucleon and Delta masses in twisted mass chiral
-                  perturbation theory}",
-     journal   = "Phys. Rev.",
-     volume    = "D72",
-     year      = "2005",
-     pages     = "014506",
-     eprint    = "hep-lat/0504001",
-     archivePrefix = "arXiv",
-     doi       = "10.1103/PhysRevD.72.014506",
-     SLACcitation  = "%%CITATION = HEP-LAT/0504001;%%"
-}
-@Article{Weinberg:1973un,
-     author    = "Weinberg, S.",
-     title     = "Nonabelian gauge theories of the strong interactions",
-     journal   = "Phys. Rev. Lett.",
-     volume    = "31",
-     year      = "1973",
-     pages     = "494-497",
-     SLACcitation  = "%%CITATION = PRLTA,31,494;%%"
-}
-@Article{Weinberg:1978kz,
-     author    = "Weinberg, S.",
-     title     = "Phenomenological Lagrangians",
-     journal   = "Physica",
-     volume    = "A96",
-     year      = "1979",
-     pages     = "327",
-     SLACcitation  = "%%CITATION = PHYSA,A96,327;%%"
-}
-@Book{Weinberg:1995mt,
-     author    = "Weinberg, S.",
-     title     = "The Quantum theory of fields. Vol. 1: Foundations",
-     publisher = "Cambridge University Press",
-     year      = "1995",
-     pages     = "609",
-}
-@Article{Weisz:1982zw,
-     author    = "Weisz, P.",
-     title     = "Continuum limit improved lattice action for pure {Yang-Mills}
-                  theory. 1",
-     journal   = "Nucl. Phys.",
-     volume    = "B212",
-     year      = "1983",
-     pages     = "1",
-     SLACcitation  = "%%CITATION = NUPHA,B212,1;%%"
-}
-@Article{Weisz:1983bn,
-     author    = "Weisz, P. and Wohlert, R.",
-     title     = "Continuum limit improved lattice action for pure {Yang-Mills}
-                  theory. 2",
-     journal   = "Nucl. Phys.",
-     volume    = "B236",
-     year      = 1984,
-     pages     = 397,
-     SLACcitation  = "%%CITATION = NUPHA,B236,397;%%"
-}
-@Article{Wennekers:2005wa,
-     author    = "Wennekers, J. and Wittig, H.",
-     title     = "On the renormalized scalar density in quenched QCD",
-     year      = "2005",
-     eprint    = "hep-lat/0507026",
-     SLACcitation  = "%%CITATION = HEP-LAT 0507026;%%"
-}
-@Article{Weyl:1918ib,
-     author    = "Weyl, H.",
-     title     = "Gravitation und Elektrizit{\"a}t",
-     journal   = "Sitzungsber. Preuss. Akad. Wiss. Berlin (Math. Phys. )",
-     volume    = "1918",
-     year      = "1918",
-     pages     = "465",
-     SLACcitation  = "%%CITATION = SPWPA,1918,465;%%"
-}
-@Article{Weyl:1929fm,
-     author    = "Weyl, H.",
-     title     = "Electron and gravitation",
-     journal   = "Z. Phys.",
-     volume    = "56",
-     year      = "1929",
-     pages     = "330-352",
-     SLACcitation  = "%%CITATION = ZEPYA,56,330;%%"
-}
-@Article{Wilson:1974sk,
-     author    = "Wilson, K. G.",
-     title     = "Confinement of quarks",
-     journal   = "Phys. Rev.",
-     volume    = "D10",
-     year      = "1974",
-     pages     = "2445-2459",
-     SLACcitation  = "%%CITATION = PHRVA,D10,2445;%%"
-}
-@Article{Wilson:1974sk,
-     author    = "Wilson, K. G.",
-     title     = "Confinement of quarks",
-     journal   = "Phys. Rev.",
-     volume    = "D10",
-     year      = "1974",
-     pages     = "2445-2459",
-     SLACcitation  = "%%CITATION = PHRVA,D10,2445;%%"
-}
-@Article{Wilson:1975mb,
-     author    = "Wilson, K. G.",
-     title     = "The renormalization group: Critical phenomena and the kondo
-                  problem",
-     journal   = "Rev. Mod. Phys.",
-     volume    = "47",
-     year      = "1975",
-     pages     = "773",
-     SLACcitation  = "%%CITATION = RMPHA,47,773;%%"
-}
-@Article{Wilson:1975mb,
-     author    = "Wilson, K. G.",
-     title     = "The renormalization group: Critical phenomena and the kondo
-                  problem",
-     journal   = "Rev. Mod. Phys.",
-     volume    = "47",
-     year      = "1975",
-     pages     = "773",
-     SLACcitation  = "%%CITATION = RMPHA,47,773;%%"
-}
-@Article{Wolff:2003sm,
-     author    = "Wolff, U.",
- collaboration = "ALPHA",
-     title     = "Monte Carlo errors with less errors",
-     journal   = "Comput. Phys. Commun.",
-     volume    = "156",
-     year      = "2004",
-     pages     = "143-153",
-     eprint    = "hep-lat/0306017",
-     SLACcitation  = "%%CITATION = HEP-LAT 0306017;%%"
-}
-@Article{Yang:1954ek,
-     author    = "Yang, C.-N. and Mills, R. L.",
-     title     = "Conservation of isotopic spin and isotopic gauge
-                  invariance",
-     journal   = "Phys. Rev.",
-     volume    = "96",
-     year      = "1954",
-     pages     = "191-195",
-     SLACcitation  = "%%CITATION = PHRVA,96,191;%%"
-}
-@Article{Yoshie:2008aw,
-     author    = "Yoshie, Tomoteru",
-     title     = "{Making use of the International Lattice Data Grid}",
-     journal   = "PoS",
-     volume    = "LATTICE2008",
-     year      = "2008",
-     pages     = "019",
-     eprint    = "0812.0849",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0812.0849;%%"
-}
-@Article{Zweig:1964jf,
-     author    = "Zweig, G.",
-     title     = "An SU(3) model for strong interaction symmetry and its
-                  breaking. 2",
-     note     = "CERN-TH-412"
-}
-@Article{cln:web,
-  author = 	 {},
-  eprint =       {http://www.ginac.de/CLN/}
-}
-@Article{deForcrand:1995bs,
-     author    = "de Forcrand, P.",
-     title     = "Progress on lattice {QCD} algorithms",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "47",
-     year      = "1996",
-     pages     = "228-235",
-     eprint    = "hep-lat/9509082",
-     SLACcitation  = "%%CITATION = HEP-LAT 9509082;%%"
-}
-@Article{deForcrand:1996bx,
-     author    = "de Forcrand, P. and others",
- collaboration = "{QCD}-TARO",
-     title     = "Search for effective lattice action of pure {QCD}",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "53",
-     year      = "1997",
-     pages     = "938-941",
-     eprint    = "hep-lat/9608094",
-     SLACcitation  = "%%CITATION = HEP-LAT 9608094;%%"
-}
-@Article{deForcrand:1996ck,
-     author    = "de Forcrand, P. and Takaishi, T.",
-     title     = "Fast fermion Monte Carlo",
-     journal   = "Nucl. Phys. Proc. Suppl.",
-     volume    = "53",
-     year      = "1997",
-     pages     = "968-970",
-     eprint    = "hep-lat/9608093",
-     SLACcitation  = "%%CITATION = HEP-LAT 9608093;%%"
-}
-@Article{etmc:asqr,
-     author    = "Frezzotti, R. et al.",
-     title     = "{O(a^2) cutoff effects in Wilson fermion simulations}",
-     journal   = "PoS",
-     volume    = "LAT2007",
-     year      = "2007",
-     pages     = "277",
-     eprint    = "0710.2492",
-     archivePrefix = "arXiv",
-     primaryClass  =  "hep-lat",
-     SLACcitation  = "%%CITATION = 0710.2492;%%"
-}
-@Article{ildg:web,
-  eprint = 	 {http://cssm.sasr.edu.au/ildg/},
-  author =	 {}
-}
-@Book{kleinert:1,
-     author    = "Kleinert, H.",
-     title     = "Path integrals in quantum mechanics, statistics and polymer ph
-ysics",
-     publisher = "World Scientific, Singapore",
-     year      = "1995",
-     edition   = "2nd Edition",
-}
-@Article{lapack:web,
-  author = 	 {},
-  eprint =       {http://www.netlib.org/lapack/}
-}
-@Article{lime:web,
-  author = 	 {USQCD},
-  title = 	 {c-lime library},
-  eprint =       {http://usqcd.jlab.org/usqcd-docs/c-lime/}
-}
-@Article{hmc:web,
-  author = 	 {},
-  title = 	 {tmLQCD},
-  eprint =       {http://www.carsten-urbach.eu/}
-}
-@Book{meister:1999,
-  author = 	 {Meister, Andreas},
-  title = 	 {Numerik linearer Gleichungssysteme},
-  publisher = 	 {vieweg},
-  year = 	 {1999},
-  OPTkey = 	 {},
-  OPTvolume = 	 {},
-  OPTnumber = 	 {},
-  OPTseries = 	 {},
-  OPTaddress = 	 {},
-  OPTedition = 	 {},
-  OPTmonth = 	 {},
-  OPTnote = 	 {},
-  OPTannote = 	 {}
-}
-@Manual{minuit,
-  title = 	 {MINUIT home page},
-  note= {\\seal.web.cern.ch/seal/snapshot/work-packages/mathlibs/minuit/home.html}
-}
-@Article{mpi:web,
-  author =       {},
-  title  =       {The message passing interface standard},
-  eprint =       {http://www-unix.mcs.anl.gov/mpi/}
-}
-@PhdThesis{orth:2004phd,
-  author = 	 {Orth, B.},
-  title = 	 {Finite size effects in lattice {QCD}
-                  with dynamical {Wilson} fermions},
-  school = 	 {Bergische Universit{\"a}t Wuppertal},
-  year = 	 {2004},
-  OPTkey = 	 {},
-  OPTtype = 	 {},
-  OPTaddress = 	 {},
-  OPTmonth = 	 {},
-  OPTnote = 	 {},
-  OPTannote = 	 {}
-}
-@PhdThesis{pleiter:phd,
-  author = 	 {Pleiter, D.},
-  title = 	 {XXX},
-  school = 	 {Freie {U}niversit�t {B}erlin},
-  year = 	 {2001}
+
+@article{'tHooft:1971fh,
+	author = "{'t Hooft}, G.",
+	journal = "Nucl. Phys.",
+	pages = "173--199",
+	slaccitation = "%\%CITATION = NUPHA,B33,173;\%\%",
+	title = "{Renormalization of massless Yang-Mills fields}",
+	volume = "B33",
+	year = "1971"
 }
-@book{press:1992,
-	address = {Cambridge, UK},
-	author = {Press, William   and Teukolsky, Saul   and Vetterling, William   and Flannery, Brian  },
-	citeulike-article-id = {767703},
-	edition = {2nd},
-	keywords = {bibtex-import},
-	posted-at = {2006-07-21 00:26:35},
-	priority = {0},
-	publisher = {Cambridge University Press},
-	title = {Numerical Recipes in C},
-	year = {1992}
-}
-@Manual{root,
-  title = 	 {The ROOT system home page},
-  note = {root.cern.ch/}
-}
-
-@Book{saad:2003a,
-     author    = "Y. Saad",
-     title     = "Iterative Methods for sparse linear systems",
-     publisher = "SIAM",
-     year      = "2003",
-     edition   = "2nd",
-}
-
-@Article{scidac,
-  author = 	 {},
-  eprint =       {http://www.scidac.gov/}
-}
-@MastersThesis{urbach:2002aa,
-  author = 	 {Urbach, C.},
-  title = 	 {Untersuchung der {R}eversibilit{\"a}tsverletzung im {H}ybrid
-                  {M}onte {C}arlo {A}lgorithmus},
-  school = 	 {Freie Universit{\"a}t Berlin, Fachbereich Physik},
-  year = 	 {2002}
+
+@article{'tHooft:1971rn,
+	author = "{'t Hooft}, G.",
+	journal = "Nucl. Phys.",
+	pages = "167--188",
+	slaccitation = "%\%CITATION = NUPHA,B35,167;\%\%",
+	title = "{Renormalizable lagrangians for massive Yang-Mills fields}",
+	volume = "B35",
+	year = "1971"
+}
+
+@unpublished{'tHooft:1972aa,
+	author = "{'t Hooft}, G.",
+	note = "Unpublished remarks at the 1972 Marseille Conference on Yang-Mills Fields",
+	title = "{}"
+}
+
+@article{'tHooft:1972fi,
+	author = "{'t Hooft}, G. and Veltman, M. J. G.",
+	journal = "Nucl. Phys.",
+	pages = "189--213",
+	slaccitation = "%\%CITATION = NUPHA,B44,189;\%\%",
+	title = "{Regularization and renormalization of gauge fields}",
+	volume = "B44",
+	year = "1972"
+}
+
+@article{Abdel-Rehim:2004gx,
+	author = "Abdel-Rehim, A. M. and Lewis, R.",
+	eprint = "hep-lat/0410047",
+	journal = "Phys. Rev.",
+	pages = "014503",
+	slaccitation = "%\%CITATION = HEP-LAT 0410047;\%\%",
+	title = "{Twisted mass {QCD} for the pion electromagnetic form factor}",
+	volume = "D71",
+	year = "2005"
+}
+
+@article{Abdel-Rehim:2005gz,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.",
+	eprint = "hep-lat/0503007",
+	journal = "Phys. Rev.",
+	pages = "094505",
+	slaccitation = "%\%CITATION = HEP-LAT/0503007;\%\%",
+	title = "{Spectrum of quenched twisted mass lattice QCD at maximal twist}",
+	volume = "D71",
+	year = "2005"
+}
+
+@article{AbdelRehim:2004sp,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy",
+	eprint = "hep-lat/0408033",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "299--301",
+	slaccitation = "%\%CITATION = HEP-LAT/0408033;\%\%",
+	title = "{Pion form factor with twisted mass QCD}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{AbdelRehim:2005gq,
+	author = "Abdel-Rehim, A. M. and Lewis, R. and Woloshyn, R. M.",
+	journal = "Int. J. Mod. Phys.",
+	pages = "6159--6168",
+	slaccitation = "%\%CITATION = IMPAE,A20,6159;\%\%",
+	title = "{Twisted mass lattice QCD and hadron phenomenology}",
+	volume = "A20",
+	year = "2005"
+}
+
+@article{AbdelRehim:2005gz,
+	archiveprefix = "arXiv",
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.",
+	doi = "10.1103/PhysRevD.71.094505",
+	eprint = "hep-lat/0503007",
+	journal = "Phys. Rev.",
+	pages = "094505",
+	slaccitation = "%\%CITATION = HEP-LAT/0503007;\%\%",
+	title = "{Spectrum of quenched twisted mass lattice QCD at maximal twist}",
+	volume = "D71",
+	year = "2005"
+}
+
+@article{AbdelRehim:2005qv,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.",
+	eprint = "hep-lat/0509056",
+	journal = "PoS",
+	pages = "032",
+	slaccitation = "%\%CITATION = HEP-LAT/0509056;\%\%",
+	title = "{The hadron spectrum from twisted mass QCD with a strange quark}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{AbdelRehim:2005yx,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.",
+	eprint = "hep-lat/0509098",
+	journal = "PoS",
+	pages = "051",
+	slaccitation = "%\%CITATION = HEP-LAT/0509098;\%\%",
+	title = "{Maximal twist and the spectrum of quenched twisted mass lattice QCD}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{AbdelRehim:2006qu,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Petry, Robert G. and Woloshyn, R. M.",
+	eprint = "hep-lat/0610004",
+	journal = "PoS",
+	pages = "164",
+	slaccitation = "%\%CITATION = HEP-LAT/0610004;\%\%",
+	title = "{The spectrum of tmLQCD with quark and link smearing}",
+	volume = "LAT2006",
+	year = "2006"
+}
+
+@article{AbdelRehim:2006ra,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. and Wu, Jackson M. S.",
+	eprint = "hep-lat/0610090",
+	journal = "Eur. Phys. J.",
+	pages = "773--776",
+	slaccitation = "%\%CITATION = HEP-LAT/0610090;\%\%",
+	title = "{Lattice QCD with a twisted mass term and a strange quark}",
+	volume = "A31",
+	year = "2007"
+}
+
+@article{AbdelRehim:2006ve,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. and Wu, Jackson M. S.",
+	eprint = "hep-lat/0601036",
+	journal = "Phys. Rev.",
+	pages = "014507",
+	slaccitation = "%\%CITATION = HEP-LAT/0601036;\%\%",
+	title = "{Strange quarks in quenched twisted mass lattice QCD}",
+	volume = "D74",
+	year = "2006"
+}
+
+@article{Adler:1974gd,
+	author = "Adler, Stephen L.",
+	journal = "Phys. Rev.",
+	pages = "3714",
+	slaccitation = "%\%CITATION = PHRVA,D10,3714;\%\%",
+	title = "{Some Simple Vacuum Polarization Phenomenology: e+ e- $\to$ Hadrons: The mu - Mesic Atom x-Ray Discrepancy and (g-2) of the Muon}",
+	volume = "D10",
+	year = "1974"
+}
+
+@article{Albanese:1987ds,
+	author = "Albanese, M. and others",
+	collaboration = "APE",
+	journal = "Phys. Lett.",
+	pages = "163",
+	slaccitation = "%\%CITATION = PHLTA,B192,163;\%\%",
+	title = "{Glueball masses and string tension in lattice {QCD}}",
+	volume = "B192",
+	year = "1987"
+}
+
+@article{Alexandrou:2008tn,
+	archiveprefix = "arXiv",
+	author = "Alexandrou, C. and others",
+	collaboration = "ETM",
+	eprint = "0803.3190",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0803.3190;\%\%",
+	title = "{Light baryon masses with dynamical twisted mass fermions}",
+	year = "2008"
+}
+
+@article{AliKhan:2000iv,
+	author = "{Ali Khan}, A. and others",
+	collaboration = "CP-PACS",
+	eprint = "hep-lat/0007014",
+	journal = "Phys. Rev.",
+	pages = "114504",
+	slaccitation = "%\%CITATION = HEP-LAT 0007014;\%\%",
+	title = "{Chiral properties of domain-wall quarks in quenched {QCD}}",
+	volume = "D63",
+	year = "2001"
+}
+
+@article{AliKhan:2003br,
+	author = "{Ali Khan}, A. and others",
+	collaboration = "QCDSF",
+	eprint = "hep-lat/0303026",
+	journal = "Phys. Lett.",
+	pages = "235--240",
+	slaccitation = "%\%CITATION = HEP-LAT 0303026;\%\%",
+	title = "{Accelerating the hybrid Monte Carlo algorithm}",
+	volume = "B564",
+	year = "2003"
+}
+
+@article{AliKhan:2003mu,
+	author = "{Ali Khan}, A. and others",
+	eprint = "hep-lat/0309078",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "853--855",
+	slaccitation = "%\%CITATION = HEP-LAT 0309078;\%\%",
+	title = "{Accelerating Hasenbusch's acceleration of hybrid Monte Carlo}",
+	volume = "129",
+	year = "2004"
+}
+
+@article{Allton:1993wc,
+	author = "Allton, C. R. and others",
+	collaboration = "UK{QCD}",
+	eprint = "hep-lat/9303009",
+	journal = "Phys. Rev.",
+	pages = "5128--5137",
+	slaccitation = "%\%CITATION = HEP-LAT 9303009;\%\%",
+	title = "{Gauge invariant smearing and matrix correlators using {Wilson} fermions at Beta = 6.2}",
+	volume = "D47",
+	year = "1993"
+}
+
+@article{Allton:2004qq,
+	author = "Allton, C. R. and others",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0403007",
+	journal = "Phys. Rev.",
+	pages = "014501",
+	slaccitation = "%\%CITATION = HEP-LAT/0403007;\%\%",
+	title = "{Improved Wilson QCD simulations with light quark masses}",
+	volume = "D70",
+	year = "2004"
+}
+
+@article{Aoki:1984qi,
+	author = "Aoki, S.",
+	journal = "Phys. Rev.",
+	pages = "2653",
+	slaccitation = "%\%CITATION = PHRVA,D30,2653;\%\%",
+	title = "{New phase structure for lattice {QCD} with {Wilson} fermions}",
+	volume = "D30",
+	year = "1984"
+}
+
+@article{Aoki:1985jj,
+	author = "Aoki, S. and Higashijima, K.",
+	journal = "Prog. Theor. Phys.",
+	pages = "521",
+	slaccitation = "%\%CITATION = PTPKA,76,521;\%\%",
+	title = "{The recovery of the chiral symmetry in lattice {Gross-Neveu} model}",
+	volume = "76",
+	year = "1986"
+}
+
+@article{Aoki:1986ua,
+	author = "Aoki, Sinya",
+	journal = "Phys. Lett.",
+	pages = "140",
+	slaccitation = "%\%CITATION = PHLTA,B190,140;\%\%",
+	title = "{NUMERICAL EVIDENCE FOR A PARITY VIOLATING PHASE IN LATTICE QCD WITH WILSON FERMION}",
+	volume = "B190",
+	year = "1987"
+}
+
+@article{Aoki:1986xr,
+	author = "Aoki, S.",
+	journal = "Phys. Rev. Lett.",
+	pages = "3136",
+	slaccitation = "%\%CITATION = PRLTA,57,3136;\%\%",
+	title = "{A solution to the {U(1)} problem on a lattice}",
+	volume = "57",
+	year = "1986"
+}
+
+@article{Aoki:1993vs,
+	author = "Aoki, S. and Boettcher, S. and Gocksch, A.",
+	eprint = "hep-lat/9312084",
+	journal = "Phys. Lett.",
+	pages = "157--164",
+	slaccitation = "%\%CITATION = HEP-LAT 9312084;\%\%",
+	title = "{Spontaneous breaking of flavor symmetry and parity in the Nambu-Jona-Lasinio model with {Wilson} fermions}",
+	volume = "B331",
+	year = "1994"
+}
+
+@article{Aoki:1995ft,
+	author = "Aoki, S.",
+	eprint = "hep-lat/9509008",
+	journal = "Prog. Theor. Phys. Suppl.",
+	pages = "179--186",
+	slaccitation = "%\%CITATION = HEP-LAT 9509008;\%\%",
+	title = "{On the phase structure of {QCD} with {Wilson} fermions}",
+	volume = "122",
+	year = "1996"
+}
+
+@article{Aoki:1995yf,
+	author = "Aoki, S. and Ukawa, A. and Umemura, T.",
+	eprint = "hep-lat/9508008",
+	journal = "Phys. Rev. Lett.",
+	pages = "873--876",
+	slaccitation = "%\%CITATION = HEP-LAT 9508008;\%\%",
+	title = "{Finite temperature phase structure of lattice {QCD} with {Wilson} quark action}",
+	volume = "76",
+	year = "1996"
+}
+
+@article{Aoki:1997fm,
+	author = "Aoki, S.",
+	eprint = "hep-lat/9707020",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "206--219",
+	slaccitation = "%\%CITATION = HEP-LAT 9707020;\%\%",
+	title = "{Phase structure of lattice {QCD} with {Wilson} fermion at finite temperature}",
+	volume = "60A",
+	year = "1998"
+}
+
+@article{Aoki:2001xq,
+	author = "Aoki, S. and others",
+	collaboration = "JL{QCD}",
+	eprint = "hep-lat/0110088",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "263--265",
+	slaccitation = "%\%CITATION = HEP-LAT 0110088;\%\%",
+	title = "{Non-trivial phase structure of {N(f)} = 3 {QCD} with {O(a)}- improved {Wilson} fermion at zero temperature}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Aoki:2002vt,
+	author = "Aoki, Y. and others",
+	eprint = "hep-lat/0211023",
+	journal = "Phys. Rev.",
+	pages = "074504",
+	slaccitation = "%\%CITATION = HEP-LAT 0211023;\%\%",
+	title = "{Domain wall fermions with improved gauge actions}",
+	volume = "D69",
+	year = "2004"
+}
+
+@article{Aoki:2004iq,
+	author = "Aoki, S. and others",
+	collaboration = "JL{QCD}",
+	eprint = "hep-lat/0409016",
+	slaccitation = "%\%CITATION = HEP-LAT 0409016;\%\%",
+	title = "{Bulk first-order phase transition in three-flavor lattice {QCD} with {O(a)}-improved {Wilson} fermion action at zero temperature}",
+	year = "2004"
+}
+
+@article{Aoki:2004ta,
+	author = "Aoki, Sinya and B{\"a}r, Oliver",
+	eprint = "hep-lat/0409006",
+	journal = "Phys. Rev.",
+	pages = "116011",
+	slaccitation = "%\%CITATION = HEP-LAT 0409006;\%\%",
+	title = "{Twisted-mass {QCD}, {O}(a) improvement and {Wilson} chiral perturbation theory}",
+	volume = "D70",
+	year = "2004"
+}
+
+@article{Aoki:2005ii,
+	author = "Aoki, S. and B{\"a}r, O.",
+	eprint = "hep-lat/0509002",
+	slaccitation = "%\%CITATION = HEP-LAT 0509002;\%\%",
+	title = "{Determining the low energy parameters of {Wilson} chiral perturbation theory}",
+	year = "2005"
+}
+
+@article{Arnold:2003sx,
+	author = "Arnold, Guido and others",
+	eprint = "hep-lat/0311025",
+	slaccitation = "%\%CITATION = HEP-LAT 0311025;\%\%",
+	title = "{Numerical methods for the QCD overlap operator. II: Optimal Krylov subspace methods}",
+	year = "2003"
+}
+
+@article{Atiyah:1971rm,
+	author = "Atiyah, M. F. and Singer, I. M.",
+	journal = "Annals Math.",
+	pages = "139--149",
+	slaccitation = "%\%CITATION = ANMAA,93,139;\%\%",
+	title = "{The Index of elliptic operators. 5}",
+	volume = "93",
+	year = "1971"
+}
+
+@article{Aubin:2006cc,
+	author = "Aubin, C. and Blum, T.",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "251--255",
+	slaccitation = "%\%CITATION = NUPHZ,162,251;\%\%",
+	title = "{Hadronic contributions to the muon g-2 from the lattice}",
+	volume = "162",
+	year = "2006"
+}
+
+@article{Aubin:2006xv,
+	author = "Aubin, C. and Blum, T.",
+	eprint = "hep-lat/0608011",
+	journal = "Phys. Rev.",
+	pages = "114502",
+	slaccitation = "%\%CITATION = HEP-LAT/0608011;\%\%",
+	title = "{Calculating the hadronic vacuum polarization and leading hadronic contribution to the muon anomalous magnetic moment with improved staggered quarks}",
+	volume = "D75",
+	year = "2007"
+}
+
+@article{BAGEL,
+	author = "Boyle, P.A.",
+	eprint = "http://www.ph.ed.ac.uk/\~{ }paboyle/bagel/Bagel.html",
+	year = 2005
+}
+
+@article{Baikov:2004ku,
+	author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "243--246",
+	slaccitation = "%\%CITATION = NUPHZ,135,243;\%\%",
+	title = "{Vacuum polarization in pQCD: First complete O(alpha(s)**4) result}",
+	volume = "135",
+	year = "2004"
+}
+
+@article{Baikov:2005rw,
+	author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
+	eprint = "hep-ph/0511063",
+	journal = "Phys. Rev. Lett.",
+	pages = "012003",
+	slaccitation = "%\%CITATION = HEP-PH/0511063;\%\%",
+	title = "{Scalar correlator at O(alpha(s)**4), Higgs decay into b- quarks and bounds on the light quark masses}",
+	volume = "96",
+	year = "2006"
+}
+
+@article{Baikov:2008jh,
+	archiveprefix = "arXiv",
+	author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
+	eprint = "0801.1821",
+	primaryclass = "hep-ph",
+	slaccitation = "%\%CITATION = ARXIV:0801.1821;\%\%",
+	title = "{Hadronic Z- and tau-Decays in Order $\alpha_s^4$}",
+	year = "2008"
+}
+
+@article{Bali:2000vr,
+	author = "Bali, G. S. and others",
+	collaboration = "TXL",
+	eprint = "hep-lat/0003012",
+	journal = "Phys. Rev.",
+	pages = "054503",
+	slaccitation = "%\%CITATION = HEP-LAT 0003012;\%\%",
+	title = "{Static potentials and glueball masses from {QCD} simulations with {Wilson} sea quarks}",
+	volume = "D62",
+	year = "2000"
+}
+
+@article{Bali:2004pb,
+	author = "Bali, G. S. and others",
+	eprint = "hep-lat/0409137",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "609--611",
+	slaccitation = "%\%CITATION = HEP-LAT 0409137;\%\%",
+	title = "{String breaking with dynamical {Wilson} fermions}",
+	volume = "140",
+	year = "2004"
+}
+
+@article{Bali:2005fu,
+	author = "Bali, G. S. and Neff, H. and Duessel, T. and Lippert, T. and Schilling, K.",
+	collaboration = "SESAM",
+	eprint = "hep-lat/0505012",
+	journal = "Phys. Rev.",
+	pages = "114513",
+	slaccitation = "%\%CITATION = HEP-LAT 0505012;\%\%",
+	title = "{Observation of string breaking in {QCD}}",
+	volume = "D71",
+	year = "2005"
+}
+
+@article{Bar:2006zj,
+	author = "B{\"a}r, O. and Jansen, K. and Schaefer, S. and Scorzato, L. and Shindler, A.",
+	eprint = "hep-lat/0609039",
+	slaccitation = "%\%CITATION = HEP-LAT 0609039;\%\%",
+	title = "{Overlap fermions on a twisted mass sea}",
+	year = "2006"
+}
+
+@article{Baxter:1993bv,
+	author = "Baxter, R. M. and others",
+	collaboration = "UK{QCD}",
+	eprint = "hep-lat/9308020",
+	journal = "Phys. Rev.",
+	pages = "1594--1605",
+	slaccitation = "%\%CITATION = HEP-LAT 9308020;\%\%",
+	title = "{Quenched heavy light decay constants}",
+	volume = "D49",
+	year = "1994"
+}
+
+@article{Beane:2004tw,
+	archiveprefix = "arXiv",
+	author = "Beane, Silas R.",
+	doi = "10.1103/PhysRevD.70.034507",
+	eprint = "hep-lat/0403015",
+	journal = "Phys. Rev.",
+	pages = "034507",
+	slaccitation = "%\%CITATION = HEP-LAT/0403015;\%\%",
+	title = "{Nucleon masses and magnetic moments in a finite volume}",
+	volume = "D70",
+	year = "2004"
+}
+
+@article{Becher:1999he,
+	author = "Becher, Thomas and Leutwyler, H.",
+	eprint = "hep-ph/9901384",
+	journal = "Eur. Phys. J.",
+	pages = "643--671",
+	slaccitation = "%\%CITATION = HEP-PH/9901384;\%\%",
+	title = "{Baryon chiral perturbation theory in manifestly Lorentz invariant form}",
+	volume = "C9",
+	year = "1999"
+}
+
+@article{Bietenholz:2004sa,
+	author = "Bietenholz, W. and others",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0409109",
+	slaccitation = "%\%CITATION = HEP-LAT 0409109;\%\%",
+	title = "{Comparison between overlap and twisted mass fermions towards the chiral limit}",
+	year = "2004"
+}
+
+@article{Bietenholz:2004wv,
+	author = "Bietenholz, W. and others",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0411001",
+	journal = "JHEP",
+	pages = "044",
+	slaccitation = "%\%CITATION = HEP-LAT 0411001;\%\%",
+	title = "{Going chiral: Overlap versus twisted mass fermions}",
+	volume = "12",
+	year = "2004"
+}
+
+@article{Blossier:2007vv,
+	archiveprefix = "arXiv",
+	author = "Blossier, B. and others",
+	collaboration = "ETM",
+	eprint = "0709.4574",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0709.4574;\%\%",
+	title = "{Light quark masses and pseudoscalar decay constants from Nf=2 Lattice QCD with twisted mass fermions}",
+	year = "2007"
+}
+
+@article{Blum:1994eh,
+	author = "Blum, Tom and others",
+	eprint = "hep-lat/9404006",
+	journal = "Phys. Rev.",
+	pages = "3377--3381",
+	slaccitation = "%\%CITATION = HEP-LAT 9404006;\%\%",
+	title = "{QCD thermodynamics with Wilson quarks at large kappa}",
+	volume = "D50",
+	year = "1994"
+}
+
+@article{Blum:2000kn,
+	author = "Blum, T. and others",
+	eprint = "hep-lat/0007038",
+	journal = "Phys. Rev.",
+	pages = "074502",
+	slaccitation = "%\%CITATION = HEP-LAT 0007038;\%\%",
+	title = "{Quenched lattice {QCD} with domain wall fermions and the chiral limit}",
+	volume = "D69",
+	year = "2004"
+}
+
+@article{Bodin:2005gg,
+	author = "Bodin, F. and others",
+	collaboration = "ApeNEXT",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "176--182",
+	slaccitation = "%\%CITATION = NUPHZ,140,176;\%\%",
+	title = "{The {apeNEXT} project}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{Bolder:2000un,
+	author = "Bolder, B. and others",
+	eprint = "hep-lat/0005018",
+	journal = "Phys. Rev.",
+	pages = "074504",
+	slaccitation = "%\%CITATION = HEP-LAT 0005018;\%\%",
+	title = "{A high precision study of the Q anti-Q potential from {Wilson} loops in the regime of string breaking}",
+	volume = "D63",
+	year = "2001"
+}
+
+@article{Boucaud:2007uk,
+	author = "Boucaud, Ph. and others",
+	collaboration = "ETM",
+	eprint = "hep-lat/0701012",
+	slaccitation = "%\%CITATION = HEP-LAT 0701012;\%\%",
+	title = "{Dynamical twisted mass fermions with light quarks}",
+	year = "2007"
 }
+
+@article{Boucaud:2008xu,
+	archiveprefix = "arXiv",
+	author = "Boucaud, Ph. and others",
+	collaboration = "ETM",
+	doi = "10.1016/j.cpc.2008.06.013",
+	eprint = "0803.0224",
+	journal = "Comput. Phys. Commun.",
+	pages = "695--715",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0803.0224;\%\%",
+	title = "{Dynamical Twisted Mass Fermions with Light Quarks: Simulation and Analysis Details}",
+	volume = "179",
+	year = "2008"
+}
+
+@article{Boughezal:2006px,
+	author = "Boughezal, R. and Czakon, M. and Schutzmeier, T.",
+	eprint = "hep-ph/0605023",
+	journal = "Phys. Rev.",
+	pages = "074006",
+	slaccitation = "%\%CITATION = HEP-PH/0605023;\%\%",
+	title = "{Charm and bottom quark masses from perturbative QCD}",
+	volume = "D74",
+	year = "2006"
+}
+
+@article{Boyle:2005fb,
+	author = "Boyle, P. A. and others",
+	journal = "J. Phys. Conf. Ser.",
+	pages = "129--139",
+	slaccitation = "%\%CITATION = 00462,16,129;\%\%",
+	title = "{{QCDOC}: Project status and first results}",
+	volume = "16",
+	year = "2005"
+}
+
+@article{Brower:1994er,
+	author = "Brower, R. C. and Levi, A. R. and Orginos, K.",
+	eprint = "hep-lat/9412004",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "855--857",
+	slaccitation = "%\%CITATION = HEP-LAT 9412004;\%\%",
+	title = "{Extrapolation methods for the Dirac inverter in hybrid Monte Carlo}",
+	volume = "42",
+	year = "1995"
+}
+
+@article{Brower:1995vx,
+	author = "Brower, R. C. and Ivanenko, T. and Levi, A. R. and Orginos, K. N.",
+	eprint = "hep-lat/9509012",
+	journal = "Nucl. Phys.",
+	pages = "353--374",
+	slaccitation = "%\%CITATION = HEP-LAT 9509012;\%\%",
+	title = "{Chronological inversion method for the Dirac matrix in hybrid Monte Carlo}",
+	volume = "B484",
+	year = "1997"
+}
+
+@article{Bunk:1995uv,
+	author = "Bunk, B. and others",
+	eprint = "hep-lat/9411016",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "49--55",
+	slaccitation = "%\%CITATION = HEP-LAT 9411016;\%\%",
+	title = "{A New simulation algorithm for lattice {QCD} with dynamical quarks}",
+	volume = "42",
+	year = "1995"
+}
+
+@article{Bunk:1998rm,
+	author = "Bunk, B. and Elser, S. and Frezzotti, R. and Jansen, K.",
+	eprint = "hep-lat/9805026",
+	journal = "Comput. Phys. Commun.",
+	pages = "95--109",
+	slaccitation = "%\%CITATION = HEP-LAT 9805026;\%\%",
+	title = "{Ordering monomial factors of polynomials in the product representation}",
+	volume = "118",
+	year = "1999"
+}
+
+@article{Burrage:1998a,
+	author = "Burrage, K. and Erhel, J.",
+	journal = "Num. Lin. Alg. with Appl.",
+	pages = "101--121",
+	title = "{On the performance of various adaptive preconditioned GMRES strategies}",
+	volume = "5",
+	year = "1998"
+}
+
+@article{Campbell:1987nv,
+	author = "Campbell, N. A. and Huntley, A. and Michael, C.",
+	journal = "Nucl. Phys.",
+	pages = "51",
+	slaccitation = "%\%CITATION = NUPHA,B306,51;\%\%",
+	title = "{Heavy quark potentials and hybrid mesons from SU(3) lattice gauge theory}",
+	volume = "B306",
+	year = "1988"
+}
+
+@article{Capitani:2005jp,
+	author = "Capitani, S. and others",
+	eprint = "hep-lat/0511013",
+	journal = "Phys. Lett.",
+	pages = "520--526",
+	slaccitation = "%\%CITATION = HEP-LAT 0511013;\%\%",
+	title = "{Parton distribution functions with twisted mass fermions}",
+	volume = "B639",
+	year = "2006"
+}
+
+@article{Chen:2003im,
+	author = "Chen, Y. and others",
+	eprint = "hep-lat/0304005",
+	journal = "Phys. Rev.",
+	pages = "034502",
+	slaccitation = "%\%CITATION = HEP-LAT 0304005;\%\%",
+	title = "{Chiral logarithms in quenched {QCD}}",
+	volume = "D70",
+	year = "2004"
+}
+
+@book{Cheng:2000ct,
+	author = "Cheng, T. P. and Li, L. F.",
+	edition = "",
+	pages = "306",
+	publisher = "Oxford, UK: Clarendon",
+	title = "{Gauge theory of elementary particle physics: Problems and solutions}",
+	year = "2000"
+}
+
+@article{Chetyrkin:1990kr,
+	author = "Chetyrkin, K. G. and K{\"u}hn, Johann H.",
+	journal = "Phys. Lett.",
+	pages = "359--364",
+	slaccitation = "%\%CITATION = PHLTA,B248,359;\%\%",
+	title = "{Mass corrections to the Z decay rate}",
+	volume = "B248",
+	year = "1990"
+}
+
+@article{Chetyrkin:1996cf,
+	author = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.",
+	eprint = "hep-ph/9606230",
+	journal = "Nucl. Phys.",
+	pages = "213--240",
+	slaccitation = "%\%CITATION = HEP-PH/9606230;\%\%",
+	title = "{Three-loop polarization function and O(alpha(s)**2) corrections to the production of heavy quarks}",
+	volume = "B482",
+	year = "1996"
+}
+
+@article{Chetyrkin:1997mb,
+	author = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.",
+	eprint = "hep-ph/9705254",
+	journal = "Nucl. Phys.",
+	pages = "40--64",
+	slaccitation = "%\%CITATION = HEP-PH/9705254;\%\%",
+	title = "{Heavy quark current correlators to O(alpha(s)**2)}",
+	volume = "B505",
+	year = "1997"
+}
+
+@article{Chetyrkin:1998ix,
+	author = "Chetyrkin, K. G. and Harlander, R. and Steinhauser, M.",
+	eprint = "hep-ph/9801432",
+	journal = "Phys. Rev.",
+	pages = "014012",
+	slaccitation = "%\%CITATION = HEP-PH/9801432;\%\%",
+	title = "{Singlet polarization functions at O(alpha(s)**2)}",
+	volume = "D58",
+	year = "1998"
+}
+
+@article{Chetyrkin:2000zk,
+	author = "Chetyrkin, K. G. and Harlander, R. V. and K{\"u}hn, Johann H.",
+	eprint = "hep-ph/0005139",
+	journal = "Nucl. Phys.",
+	pages = "56--72",
+	slaccitation = "%\%CITATION = HEP-PH/0005139;\%\%",
+	title = "{Quartic mass corrections to R(had) at O(alpha(s)**3)}",
+	volume = "B586",
+	year = "2000"
+}
+
+@article{Chetyrkin:2006xg,
+	author = "Chetyrkin, K. G. and K{\"u}hn, J. H. and Sturm, C.",
+	eprint = "hep-ph/0604234",
+	journal = "Eur. Phys. J.",
+	pages = "107--110",
+	slaccitation = "%\%CITATION = HEP-PH/0604234;\%\%",
+	title = "{Four-loop moments of the heavy quark vacuum polarization function in perturbative QCD}",
+	volume = "C48",
+	year = "2006"
+}
+
+@article{Chiarappa:2004ry,
+	archiveprefix = "arXiv",
+	author = "Chiarappa, T. and others",
+	doi = "10.1016/j.nuclphysbps.2004.11.281",
+	eprint = "hep-lat/0409107",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "853--855",
+	slaccitation = "%\%CITATION = HEP-LAT/0409107;\%\%",
+	title = "{Comparing iterative methods for overlap and twisted mass fermions}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{Chiarappa:2006ae,
+	archiveprefix = "arXiv",
+	author = "Chiarappa, T. and others",
+	doi = "10.1140/epjc/s10052-006-0204-4",
+	eprint = "hep-lat/0606011",
+	journal = "Eur. Phys. J.",
+	pages = "373--383",
+	slaccitation = "%\%CITATION = HEP-LAT/0606011;\%\%",
+	title = "{Numerical simulation of {QCD} with u, d, s and c quarks in the twisted-mass {W}ilson formulation}",
+	volume = "C50",
+	year = "2007"
+}
+
+@article{Chiarappa:2006hz,
+	archiveprefix = "arXiv",
+	author = "Chiarappa, T. and others",
+	eprint = "hep-lat/0609023",
+	journal = "Comput. Sci. Disc.",
+	pages = "015001",
+	slaccitation = "%\%CITATION = HEP-LAT/0609023;\%\%",
+	title = "{Iterative methods for overlap and twisted mass fermions}",
+	volume = "01",
+	year = "2008"
+}
+
+@article{Cichy:2008gk,
+	archiveprefix = "arXiv",
+	author = "Cichy, K. and {Gonzalez Lopez}, J. and Jansen, K. and Kujawa, A. and Shindler, A.",
+	doi = "10.1016/j.nuclphysb.2008.03.004",
+	eprint = "0802.3637",
+	journal = "Nucl. Phys.",
+	pages = "94--108",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0802.3637;\%\%",
+	title = "{Twisted Mass, Overlap and Creutz Fermions: Cut-off Effects at Tree-level of Perturbation Theory}",
+	volume = "B800",
+	year = "2008"
+}
+
+@article{Clark:2004cq,
+	author = "Clark, M. A. and Kennedy, A. D.",
+	eprint = "hep-lat/0409134",
+	slaccitation = "%\%CITATION = HEP-LAT 0409134;\%\%",
+	title = "{Accelerating fermionic molecular dynamics}",
+	year = "2004"
+}
+
+@article{Clark:2005sq,
+	author = "Clark, M. A. and de Forcrand, Ph. and Kennedy, A. D.",
+	eprint = "hep-lat/0510004",
+	journal = "PoS",
+	pages = "115",
+	slaccitation = "%\%CITATION = HEP-LAT 0510004;\%\%",
+	title = "{Algorithm shootout: R versus RHMC}",
+	volume = "LAT2005",
+	year = "2005"
+}
+
+@article{Clark:2006fx,
+	author = "Clark, M. A. and Kennedy, A. D.",
+	eprint = "hep-lat/0608015",
+	slaccitation = "%\%CITATION = HEP-LAT 0608015;\%\%",
+	title = "{Accelerating dynamical fermion computations using the rational hybrid {Monte} {Carlo} ({RHMC}) algorithm with multiple pseudofermion fields}",
+	year = "2006"
+}
+
+@article{Colangelo:2001df,
+	archiveprefix = "arXiv",
+	author = "Colangelo, G. and Gasser, J. and Leutwyler, H.",
+	doi = "10.1016/S0550-3213(01)00147-X",
+	eprint = "hep-ph/0103088",
+	journal = "Nucl. Phys.",
+	pages = "125--179",
+	slaccitation = "%\%CITATION = HEP-PH/0103088;\%\%",
+	title = "{pi pi scattering}",
+	volume = "B603",
+	year = "2001"
+}
+
+@article{Colangelo:2003hf,
+	author = "Colangelo, Gilberto and D{\"u}rr, Stephan",
+	eprint = "hep-lat/0311023",
+	journal = "Eur. Phys. J.",
+	pages = "543--553",
+	slaccitation = "%\%CITATION = HEP-LAT/0311023;\%\%",
+	title = "{The pion mass in finite volume}",
+	volume = "C33",
+	year = "2004"
+}
+
+@article{Colangelo:2005gd,
+	author = "Colangelo, Gilberto and D{\"u}rr, Stephan and Haefeli, Christoph",
+	eprint = "hep-lat/0503014",
+	journal = "Nucl. Phys.",
+	pages = "136--174",
+	slaccitation = "%\%CITATION = HEP-LAT 0503014;\%\%",
+	title = "{Finite volume effects for meson masses and decay constants}",
+	volume = "B721",
+	year = "2005"
+}
+
+@article{Colangelo:2006mp,
+	archiveprefix = "arXiv",
+	author = "Colangelo, Gilberto and Haefeli, Christoph",
+	doi = "10.1016/j.nuclphysb.2006.03.010",
+	eprint = "hep-lat/0602017",
+	journal = "Nucl. Phys.",
+	pages = "14--33",
+	slaccitation = "%\%CITATION = HEP-LAT/0602017;\%\%",
+	title = "{Finite volume effects for the pion mass at two loops}",
+	volume = "B744",
+	year = "2006"
+}
+
+@book{Collins:1994ab,
+	author = "Collins, J.C.",
+	edition = "",
+	publisher = "Cambridge University Press",
+	series = "{Cambridge Monographs on Mathematical Physics}",
+	title = "{Renormalisation}",
+	year = "1994"
+}
+
+@article{Creutz:1984fj,
+	author = "Creutz, M. and Gocksch, A. and Ogilvie, M. and Okawa, M.",
+	journal = "Phys. Rev. Lett.",
+	pages = "875",
+	slaccitation = "%\%CITATION = PRLTA,53,875;\%\%",
+	title = "{Microcanonical renormalization group}",
+	volume = "53",
+	year = "1984"
+}
+
+@article{Creutz:1989wt,
+	author = "Creutz, M. and Gocksch, A.",
+	note = "BNL-42601",
+	title = "{Higher order hybrid monte carlo algorithms}"
+}
+
+@article{Creutz:1996bg,
+	author = "Creutz, Michael",
+	eprint = "hep-lat/9608024",
+	slaccitation = "%\%CITATION = HEP-LAT 9608024;\%\%",
+	title = "{Wilson fermions at finite temperature}",
+	year = "1996"
+}
+
+@article{Creutz:1998ee,
+	author = "Creutz, M.",
+	eprint = "hep-lat/9806037",
+	journal = "Phys. Rev. Lett.",
+	pages = "3555--3558",
+	slaccitation = "%\%CITATION = HEP-LAT 9806037;\%\%",
+	title = "{Evaluating Grassmann integrals}",
+	volume = "81",
+	year = "1998"
+}
+
+@article{Cundy:2005pi,
+	author = "Cundy, N. and others",
+	eprint = "hep-lat/0502007",
+	slaccitation = "%\%CITATION = HEP-LAT 0502007;\%\%",
+	title = "{Numerical Methods for the {QCD} Overlap Operator IV: Hybrid Monte Carlo}",
+	year = "2005"
+}
+
+@article{David:1984ys,
+	author = "David, F. and Hamber, H. W.",
+	journal = "Nucl. Phys.",
+	pages = "381",
+	slaccitation = "%\%CITATION = NUPHA,B248,381;\%\%",
+	title = "{Chiral condensate with {Wilson} fermions}",
+	volume = "B248",
+	year = "1984"
+}
+
+@article{Davies:2008sw,
+	archiveprefix = "arXiv",
+	author = "Davies, C. T. H. and others",
+	collaboration = "HPQCD",
+	eprint = "0807.1687",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0807.1687;\%\%",
+	title = "{Update: Accurate Determinations of $\alpha_s$ from Realistic Lattice QCD}",
+	year = "2008"
+}
+
+@article{DeGrand:1990dk,
+	author = "DeGrand, T. A. and Rossi, P.",
+	journal = "Comput. Phys. Commun.",
+	pages = "211--214",
+	slaccitation = "%\%CITATION = CPHCB,60,211;\%\%",
+	title = "{Conditioning techniques for dynamical fermions}",
+	volume = "60",
+	year = "1990"
+}
+
+@article{DeGrand:1990ip,
+	author = "DeGrand, T. A.",
+	journal = "Phys. Rev.",
+	pages = "2296--2300",
+	slaccitation = "%\%CITATION = PHRVA,D43,2296;\%\%",
+	title = "{Resonance masses from Monte Carlo simulations (with emphasis on the rho meson)}",
+	volume = "D43",
+	year = "1991"
+}
+
+@article{DeGrand:2002vu,
+	author = "DeGrand, Thomas and Hasenfratz, Anna and Kovacs, Tamas G.",
+	eprint = "hep-lat/0211006",
+	journal = "Phys. Rev.",
+	pages = "054501",
+	slaccitation = "%\%CITATION = HEP-LAT 0211006;\%\%",
+	title = "{Improving the chiral properties of lattice fermions}",
+	volume = "D67",
+	year = "2003"
+}
+
+@article{DeTar:2007ni,
+	archiveprefix = "arXiv",
+	author = "DeTar, Carleton and Levkova, L.",
+	eprint = "0710.1322",
+	journal = "PoS",
+	pages = "116",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0710.1322;\%\%",
+	title = "{Effects of the disconnected flavor singlet corrections on the hyperfine splitting in charmonium}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{DelDebbio:2006cn,
+	author = "{Del Debbio}, L. and Giusti, L. and Luscher, M. and Petronzio, R. and Tantalo, N.",
+	eprint = "hep-lat/0610059",
+	journal = "JHEP",
+	pages = "056",
+	slaccitation = "%\%CITATION = HEP-LAT 0610059;\%\%",
+	title = "{QCD with light Wilson quarks on fine lattices. I: First experiences and physics results}",
+	volume = "02",
+	year = "2007"
+}
+
+@article{DellaMorte:2000yp,
+	author = "{Della Morte}, M. and Frezzotti, R. and Heitger, J. and Sint, S.",
+	eprint = "hep-lat/0010091",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "617--621",
+	slaccitation = "%\%CITATION = HEP-LAT 0010091;\%\%",
+	title = "{Non-perturbative scaling tests of twisted mass {QCD}}",
+	volume = "94",
+	year = "2001"
+}
+
+@article{DellaMorte:2001tu,
+	author = "{Della Morte}, M. and Frezzotti, R. and Heitger, J.",
+	eprint = "hep-lat/0110166",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "260--262",
+	slaccitation = "%\%CITATION = HEP-LAT 0110166;\%\%",
+	title = "{Quenched twisted mass {QCD} at small quark masses and in large volume}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{DellaMorte:2001ys,
+	author = "{Della Morte}, M. and Frezzotti, R. and Heitger, J. and Sint, S.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0108019",
+	journal = "JHEP",
+	pages = "041",
+	slaccitation = "%\%CITATION = HEP-LAT 0108019;\%\%",
+	title = "{Cutoff effects in twisted mass lattice {QCD}}",
+	volume = "10",
+	year = "2001"
+}
+
+@article{DellaMorte:2003jj,
+	author = "{Della Morte}, M. and others",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0307008",
+	journal = "Comput. Phys. Commun.",
+	pages = "62--72",
+	slaccitation = "%\%CITATION = HEP-LAT 0307008;\%\%",
+	title = "{Simulating the Schroedinger functional with two pseudo- fermions}",
+	volume = "156",
+	year = "2003"
+}
+
+@article{DellaMorte:2003mn,
+	author = "{Della Morte}, M. and others",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0307021",
+	journal = "Phys. Lett.",
+	pages = "93--98",
+	slaccitation = "%\%CITATION = HEP-LAT 0307021;\%\%",
+	title = "{Lattice HQET with exponentially improved statistical precision}",
+	volume = "B581",
+	year = "2004"
+}
+
+@article{DellaMorte:2003mw,
+	author = "{Della Morte}, M. and others",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0309080",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "346--348",
+	slaccitation = "%\%CITATION = HEP-LAT 0309080;\%\%",
+	title = "{Static quarks with improved statistical precision}",
+	volume = "129",
+	year = "2004"
+}
+
+@article{DellaMorte:2005yc,
+	author = "{Della Morte}, M. and Shindler, A. and Sommer, R.",
+	eprint = "hep-lat/0506008",
+	slaccitation = "%\%CITATION = HEP-LAT 0506008;\%\%",
+	title = "{On lattice actions for static quarks}",
+	year = "2005"
+}
+
+@article{Dimopoulos:2006dm,
+	author = "Dimopoulos, P. and others",
+	collaboration = "ALPHA",
+	eprint = "hep-ph/0601002",
+	journal = "Nucl. Phys.",
+	pages = "69--108",
+	slaccitation = "%\%CITATION = HEP-PH 0601002;\%\%",
+	title = "{A precise determination of B(K) in quenched QCD}",
+	volume = "B749",
+	year = "2006"
+}
+
+@article{Dimopoulos:2007fn,
+	archiveprefix = "arXiv",
+	author = "Dimopoulos, P. and others",
+	eprint = "0710.0975",
+	journal = "PoS",
+	pages = "241",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.0975;\%\%",
+	title = "{Renormalisation of quark bilinears with Nf=2 Wilson fermions and tree-level improved gauge action}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{Dimopoulos:2007qy,
+	archiveprefix = "arXiv",
+	author = "Dimopoulos, Petros and Frezzotti, Roberto and Herdoiza, Gregorio and Urbach, Carsten and Wenger, Urs",
+	collaboration = "ETM",
+	eprint = "0710.2498",
+	journal = "PoS",
+	pages = "102",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.2498;\%\%",
+	title = "{Scaling and low energy constants in lattice QCD with N\_f=2 maximally twisted Wilson quarks}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{Dimopoulos:2008sy,
+	archiveprefix = "arXiv",
+	author = "Dimopoulos, Petros and others",
+	collaboration = "ETM",
+	eprint = "0810.2873",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0810.2873;\%\%",
+	title = "{Scaling and chiral extrapolation of pion mass and decay constant with maximally twisted mass QCD}",
+	year = "2008"
+}
+
+@article{Dong:2001fm,
+	author = "Dong, S. J. and others",
+	eprint = "hep-lat/0108020",
+	journal = "Phys. Rev.",
+	pages = "054507",
+	slaccitation = "%\%CITATION = HEP-LAT 0108020;\%\%",
+	title = "{Chiral properties of pseudoscalar mesons on a quenched 20**4 lattice with overlap fermions}",
+	volume = "D65",
+	year = "2002"
+}
+
+@article{Duane:1987de,
+	author = "Duane, S. and Kennedy, A. D. and Pendleton, B. J. and Roweth, D.",
+	journal = "Phys. Lett.",
+	pages = "216--222",
+	slaccitation = "%\%CITATION = PHLTA,B195,216;\%\%",
+	title = "{{H}ybrid monte carlo}",
+	volume = "B195",
+	year = "1987"
+}
+
+@article{Edwards:1996vs,
+	author = "Edwards, R. G. and Horvath, I. and Kennedy, A. D.",
+	eprint = "hep-lat/9606004",
+	journal = "Nucl. Phys.",
+	pages = "375--402",
+	slaccitation = "%\%CITATION = HEP-LAT 9606004;\%\%",
+	title = "{Instabilities and non-reversibility of molecular dynamics trajectories}",
+	volume = "B484",
+	year = "1997"
+}
+
+@article{Edwards:2004sx,
+	author = "Edwards, Robert G. and Joo, Balint",
+	collaboration = "SciDAC",
+	eprint = "hep-lat/0409003",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "832",
+	slaccitation = "%\%CITATION = HEP-LAT 0409003;\%\%",
+	title = "{The {Chroma} software system for lattice {QCD}}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{Eichten:1989zv,
+	author = "Eichten, E. and Hill, B.",
+	journal = "Phys. Lett.",
+	pages = "511",
+	slaccitation = "%\%CITATION = PHLTA,B234,511;\%\%",
+	title = "{An effective field theory for the calculation of matrix elements involving heavy quarks}",
+	volume = "B234",
+	year = "1990"
+}
+
+@article{Farchioni:2002vn,
+	author = "Farchioni, F. and Gebert, C. and Montvay, I. and Scorzato, L.",
+	eprint = "hep-lat/0206008",
+	journal = "Eur. Phys. J.",
+	pages = "237--251",
+	slaccitation = "%\%CITATION = HEP-LAT 0206008;\%\%",
+	title = "{Numerical simulation tests with light dynamical quarks}",
+	volume = "C26",
+	year = "2002"
+}
+
+@article{Farchioni:2004fs,
+	author = "Farchioni, F. and others",
+	eprint = "hep-lat/0410031",
+	journal = "Eur. Phys. J.",
+	pages = "73--87",
+	slaccitation = "%\%CITATION = HEP-LAT 0410031;\%\%",
+	title = "{The phase structure of lattice {QCD} with {Wilson} quarks and renormalization group improved gluons}",
+	volume = "C42",
+	year = "2005"
+}
+
+@article{Farchioni:2004ma,
+	author = "Farchioni, F. and others",
+	eprint = "hep-lat/0409098",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "240--245",
+	slaccitation = "%\%CITATION = HEP-LAT 0409098;\%\%",
+	title = "{Exploring the phase structure of lattice {{QCD}} with twisted mass quarks}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{Farchioni:2004us,
+	author = "Farchioni, F. and others",
+	eprint = "hep-lat/0406039",
+	journal = "Eur. Phys. J.",
+	pages = "421--433",
+	slaccitation = "%\%CITATION = HEP-LAT 0406039;\%\%",
+	title = "{Twisted mass quarks and the phase structure of lattice {QCD}}",
+	volume = "C39",
+	year = "2005"
+}
+
+@article{Farchioni:2005ec,
+	author = "Farchioni, Federico and others",
+	eprint = "hep-lat/0509131",
+	journal = "PoS",
+	pages = "072",
+	slaccitation = "%\%CITATION = HEP-LAT 0509131;\%\%",
+	title = "{Dynamical twisted mass fermions}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{Farchioni:2005hf,
+	author = "Farchioni, F. and others",
+	eprint = "hep-lat/0509036",
+	journal = "PoS",
+	pages = "033",
+	slaccitation = "%\%CITATION = HEP-LAT 0509036;\%\%",
+	title = "{Twisted mass fermions: Neutral pion masses from disconnected contributions}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{Farchioni:2005tu,
+	author = "Farchioni, F. and others",
+	eprint = "hep-lat/0506025",
+	journal = "Phys. Lett.",
+	pages = "324--333",
+	slaccitation = "%\%CITATION = HEP-LAT 0506025;\%\%",
+	title = "{Lattice spacing dependence of the first order phase transition for dynamical twisted mass fermions}",
+	volume = "B624",
+	year = "2005"
+}
+
+@article{Feldmann:1999uf,
+	author = "Feldmann, Thorsten",
+	eprint = "hep-ph/9907491",
+	journal = "Int. J. Mod. Phys.",
+	pages = "159--207",
+	slaccitation = "%\%CITATION = HEP-PH/9907491;\%\%",
+	title = "{Quark structure of pseudoscalar mesons}",
+	volume = "A15",
+	year = "2000"
+}
+
+@article{Feynman:1948aa,
+	author = "Feynman, R. P.",
+	journal = "Rev. Mod. Phys.",
+	pages = "367--387",
+	slaccitation = "%\%CITATION = RMPHA,20,367;\%\%",
+	title = "{Space-time approach to non-relativistic quantum mechanics}",
+	volume = "20",
+	year = "1948"
+}
+
+@article{Fischer:1996th,
+	author = "Fischer, S. and others",
+	eprint = "hep-lat/9602019",
+	journal = "Comput. Phys. Commun.",
+	pages = "20--34",
+	slaccitation = "%\%CITATION = HEP-LAT 9602019;\%\%",
+	title = "{A Parallel SSOR Preconditioner for Lattice {QCD}}",
+	volume = "98",
+	year = "1996"
+}
+
+@article{Fokkema:1998aa,
+	author = "Fokkema, D.~R. and Sleijpen, G.~L.~G. and Van~der~Vorst, H.~A.",
+	journal = "SIAM J. Sci. Comput.",
+	pages = "94--125",
+	title = "{{J}acobi-{D}avidson style {QR} and {QZ} algorithms for the reduction of matrix pencils}",
+	volume = "20",
+	year = "1998"
+}
+
+@article{Foster:1998vw,
+	author = "Foster, M. and Michael, C.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/9810021",
+	journal = "Phys. Rev.",
+	pages = "074503",
+	slaccitation = "%\%CITATION = HEP-LAT 9810021;\%\%",
+	title = "{Quark mass dependence of hadron masses from lattice {QCD}}",
+	volume = "D59",
+	year = "1999"
+}
+
+@article{Freund,
+	author = "Freund, R.W.",
+	journal = "in Numerical Linear Algebra, L.\ Reichel, A.\ Ruttan and R.S.\ Varga (eds.)",
+	pages = "p. 101",
+	year = "1993"
+}
+
+@article{Frezzotti:1997ym,
+	author = "Frezzotti, R. and Jansen, K.",
+	eprint = "hep-lat/9702016",
+	journal = "Phys. Lett.",
+	pages = "328--334",
+	slaccitation = "%\%CITATION = HEP-LAT 9702016;\%\%",
+	title = "{A polynomial hybrid Monte Carlo algorithm}",
+	volume = "B402",
+	year = "1997"
+}
+
+@article{Frezzotti:1998eu,
+	author = "Frezzotti, R. and Jansen, K.",
+	eprint = "hep-lat/9808011",
+	journal = "Nucl. Phys.",
+	pages = "395--431",
+	slaccitation = "%\%CITATION = HEP-LAT 9808011;\%\%",
+	title = "{The {PHMC} algorithm for simulations of dynamical fermions. {I}: Description and properties}",
+	volume = "B555",
+	year = "1999"
+}
+
+@article{Frezzotti:1998yp,
+	author = "Frezzotti, R. and Jansen, K.",
+	eprint = "hep-lat/9808038",
+	journal = "Nucl. Phys.",
+	pages = "432--453",
+	slaccitation = "%\%CITATION = HEP-LAT 9808038;\%\%",
+	title = "{The {PHMC} algorithm for simulations of dynamical fermions. {II}: Performance analysis}",
+	volume = "B555",
+	year = "1999"
+}
+
+@article{Frezzotti:1999vv,
+	author = "Frezzotti, R. and Grassi, P. A. and Sint, S. and Weisz, P.",
+	eprint = "hep-lat/9909003",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "941--946",
+	slaccitation = "%\%CITATION = HEP-LAT 9909003;\%\%",
+	title = "{A local formulation of lattice {QCD} without unphysical fermion zero modes}",
+	volume = "83",
+	year = "2000"
+}
+
+@article{Frezzotti:2000nk,
+	author = "Frezzotti, R. and Grassi, P. A. and Sint, S. and Weisz, P.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0101001",
+	journal = "JHEP",
+	pages = "058",
+	slaccitation = "%\%CITATION = HEP-LAT 0101001;\%\%",
+	title = "{Lattice {QCD} with a chirally twisted mass term}",
+	volume = "08",
+	year = "2001"
+}
+
+@article{Frezzotti:2001du,
+	author = "Frezzotti, R. and Sint, S.",
+	eprint = "hep-lat/0110140",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "814--816",
+	slaccitation = "%\%CITATION = HEP-LAT 0110140;\%\%",
+	title = "{Some remarks on {O(a)} improved twisted mass {QCD}}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Frezzotti:2001ea,
+	author = "Frezzotti, R. and Sint, S. and Weisz, P.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0104014",
+	journal = "JHEP",
+	pages = "048",
+	slaccitation = "%\%CITATION = HEP-LAT 0104014;\%\%",
+	title = "{{O(a)} improved twisted mass lattice {QCD}}",
+	volume = "07",
+	year = "2001"
+}
+
+@article{Frezzotti:2003ni,
+	author = "Frezzotti, R. and Rossi, G. C.",
+	eprint = "hep-lat/0306014",
+	journal = "JHEP",
+	pages = "007",
+	slaccitation = "%\%CITATION = HEP-LAT 0306014;\%\%",
+	title = "{Chirally improving {Wilson} fermions. {I}: {O(a)} improvement}",
+	volume = "08",
+	year = "2004"
+}
+
+@article{Frezzotti:2003xj,
+	author = "Frezzotti, R. and Rossi, G. C.",
+	eprint = "hep-lat/0311008",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "193--202",
+	slaccitation = "%\%CITATION = HEP-LAT 0311008;\%\%",
+	title = "{Twisted-mass lattice {QCD} with mass non-degenerate quarks}",
+	volume = "128",
+	year = "2004"
+}
+
+@article{Frezzotti:2004wz,
+	author = "Frezzotti, R. and Rossi, G. C.",
+	eprint = "hep-lat/0407002",
+	journal = "JHEP",
+	pages = "070",
+	slaccitation = "%\%CITATION = HEP-LAT 0407002;\%\%",
+	title = "{Chirally improving {Wilson} fermions. {II}: Four-quark operators}",
+	volume = "10",
+	year = "2004"
+}
+
+@article{Frezzotti:2005gi,
+	author = "Frezzotti, R. and Martinelli, G. and Papinutto, M. and Rossi, G. C.",
+	eprint = "hep-lat/0503034",
+	journal = "JHEP",
+	pages = "038",
+	slaccitation = "%\%CITATION = HEP-LAT 0503034;\%\%",
+	title = "{Reducing cutoff effects in maximally twisted lattice {QCD} close to the chiral limit}",
+	volume = "04",
+	year = "2006"
+}
+
+@article{Frezzotti:2007qv,
+	archiveprefix = "arXiv",
+	author = "Frezzotti, R. and Rossi, G.",
+	eprint = "0710.2492",
+	journal = "PoS",
+	pages = "277",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.2492;\%\%",
+	title = "{$O(a^2)$ cutoff effects in Wilson fermion simulations}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{Frezzotti:2008dr,
+	archiveprefix = "arXiv",
+	author = "Frezzotti, R. and Lubicz, V. and Simula, S.",
+	collaboration = "ETM",
+	eprint = "0812.4042",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0812.4042;\%\%",
+	title = "{Electromagnetic form factor of the pion from twisted-mass lattice {QCD} at {Nf}=2}",
+	year = "2008"
+}
+
+@article{Fritzsch:1973pi,
+	author = "Fritzsch, H. and Gell-Mann, M. and Leutwyler, H.",
+	journal = "Phys. Lett.",
+	pages = "365--368",
+	slaccitation = "%\%CITATION = PHLTA,B47,365;\%\%",
+	title = "{Advantages of the color octet gluon picture}",
+	volume = "B47",
+	year = "1973"
+}
+
+@article{Frommer:1994vn,
+	author = "Frommer, A. and Hannemann, V. and Nockel, B. and Lippert, T. and Schilling, K.",
+	eprint = "hep-lat/9404013",
+	journal = "Int. J. Mod. Phys.",
+	pages = "1073--1088",
+	slaccitation = "%\%CITATION = HEP-LAT 9404013;\%\%",
+	title = "{Accelerating {Wilson} fermion matrix inversions by means of the stabilized biconjugate gradient algorithm}",
+	volume = "C5",
+	year = "1994"
+}
+
+@article{Frommer:1995ik,
+	author = "Frommer, Andreas and Nockel, Bertold and Gusken, Stephan and Lippert, Thomas and Schilling, Klaus",
+	eprint = "hep-lat/9504020",
+	journal = "Int. J. Mod. Phys.",
+	pages = "627--638",
+	slaccitation = "%\%CITATION = HEP-LAT 9504020;\%\%",
+	title = "{Many masses on one stroke: Economic computation of quark propagators}",
+	volume = "C6",
+	year = "1995"
+}
+
+@article{Furman:1994ky,
+	author = "Furman, V. and Shamir, Y.",
+	eprint = "hep-lat/9405004",
+	journal = "Nucl. Phys.",
+	pages = "54--78",
+	slaccitation = "%\%CITATION = HEP-LAT 9405004;\%\%",
+	title = "{Axial symmetries in lattice QCD with Kaplan fermions}",
+	volume = "B439",
+	year = "1995"
+}
+
+@article{Garden:1999fg,
+	author = "Garden, J. and Heitger, J. and Sommer, R. and Wittig, H.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/9906013",
+	journal = "Nucl. Phys.",
+	pages = "237--256",
+	slaccitation = "%\%CITATION = HEP-LAT 9906013;\%\%",
+	title = "{Precision computation of the strange quark's mass in quenched {QCD}}",
+	volume = "B571",
+	year = "2000"
+}
+
+@article{Garron:2003cb,
+	author = "Garron, N. and Giusti, L. and Hoelbling, C. and Lellouch, L. and Rebbi, C.",
+	eprint = "hep-ph/0306295",
+	journal = "Phys. Rev. Lett.",
+	pages = "042001",
+	slaccitation = "%\%CITATION = HEP-PH 0306295;\%\%",
+	title = "{B(K) from quenched {QCD} with exact chiral symmetry}",
+	volume = "92",
+	year = "2004"
+}
+
+@article{Gasser:1982ap,
+	author = "Gasser, J. and Leutwyler, H.",
+	journal = "Phys. Rept.",
+	pages = "77--169",
+	slaccitation = "%\%CITATION = PRPLC,87,77;\%\%",
+	title = "{Quark masses}",
+	volume = "87",
+	year = "1982"
+}
+
+@article{Gasser:1983yg,
+	author = "Gasser, J. and Leutwyler, H.",
+	journal = "Ann. Phys.",
+	pages = "142",
+	slaccitation = "%\%CITATION = APNYA,158,142;\%\%",
+	title = "{Chiral perturbation theory to one loop}",
+	volume = "158",
+	year = "1984"
+}
+
+@article{Gasser:1985gg,
+	author = "Gasser, J. and Leutwyler, H.",
+	journal = "Nucl. Phys.",
+	pages = "465",
+	slaccitation = "%\%CITATION = NUPHA,B250,465;\%\%",
+	title = "{Chiral perturbation theory: expansions in the mass of the strange quark}",
+	volume = "B250",
+	year = "1985"
+}
+
+@article{Gasser:1986vb,
+	author = "Gasser, J. and Leutwyler, H.",
+	journal = "Phys. Lett.",
+	pages = "83",
+	slaccitation = "%\%CITATION = PHLTA,B184,83;\%\%",
+	title = "{LIGHT QUARKS AT LOW TEMPERATURES}",
+	volume = "B184",
+	year = "1987"
+}
+
+@article{Gattringer:2003qx,
+	author = "Gattringer, C. and others",
+	collaboration = "BGR",
+	eprint = "hep-lat/0307013",
+	journal = "Nucl. Phys.",
+	pages = "3--51",
+	slaccitation = "%\%CITATION = HEP-LAT 0307013;\%\%",
+	title = "{Quenched spectroscopy with fixed-point and chirally improved fermions}",
+	volume = "B677",
+	year = "2004"
+}
+
+@article{Gell-Mann:1964nj,
+	author = "Gell-Mann, M.",
+	journal = "Phys. Lett.",
+	pages = "214--215",
+	slaccitation = "%\%CITATION = PHLTA,8,214;\%\%",
+	title = "{A Schematic model of baryons and mesons}",
+	volume = "8",
+	year = "1964"
+}
+
+@article{Gell-Mann:1968rz,
+	author = "Gell-Mann, M. and Oakes, R. J. and Renner, B.",
+	journal = "Phys. Rev.",
+	pages = "2195--2199",
+	slaccitation = "%\%CITATION = PHRVA,175,2195;\%\%",
+	title = "{Behavior of current divergences under SU(3) x SU(3)}",
+	volume = "175",
+	year = "1968"
+}
+
+@phdthesis{Geus:2002,
+	author = "Geus, R.",
+	optaddress = "",
+	optannote = "",
+	optkey = "DISS. ETH NO. 14734",
+	optmonth = "",
+	optnote = "",
+	opttype = "",
+	school = "Swiss Federal Institute Of Technology Z{\"u}rich",
+	title = "{The Jacobi-Davidson algorithm for solving large sparse symmetric eigenvalue problems with application to the design of accelerator cavities}",
+	year = "2002"
+}
+
+@article{Gimenez:1998ue,
+	author = "Gimenez, V. and Giusti, L. and Rapuano, F. and Talevi, M.",
+	eprint = "hep-lat/9806006",
+	journal = "Nucl. Phys.",
+	pages = "429--445",
+	slaccitation = "%\%CITATION = HEP-LAT 9806006;\%\%",
+	title = "{Non-perturbative renormalization of quark bilinears}",
+	volume = "B531",
+	year = "1998"
+}
+
+@article{Gimenez:2005nt,
+	author = "Gimenez, V. and Lubicz, V. and Mescia, F. and Porretti, V. and Reyes, J.",
+	eprint = "hep-lat/0503001",
+	journal = "Eur. Phys. J.",
+	pages = "535--544",
+	slaccitation = "%\%CITATION = HEP-LAT/0503001;\%\%",
+	title = "{Operator product expansion and quark condensate from lattice QCD in coordinate space}",
+	volume = "C41",
+	year = "2005"
+}
+
+@article{Ginsparg:1981bj,
+	author = "Ginsparg, P. H. and {Wilson}, K. G.",
+	journal = "Phys. Rev.",
+	pages = "2649",
+	slaccitation = "%\%CITATION = PHRVA,D25,2649;\%\%",
+	title = "{A remnant of chiral symmetry on the lattice}",
+	volume = "D25",
+	year = "1982"
+}
+
+@article{Giusti:1998wy,
+	author = "Giusti, L. and Rapuano, F. and Talevi, M. and Vladikas, A.",
+	eprint = "hep-lat/9807014",
+	journal = "Nucl. Phys.",
+	pages = "249--277",
+	slaccitation = "%\%CITATION = HEP-LAT 9807014;\%\%",
+	title = "{The QCD chiral condensate from the lattice}",
+	volume = "B538",
+	year = "1999"
+}
+
+@article{Giusti:2001pk,
+	author = "Giusti, L. and Hoelbling, C. and Rebbi, C.",
+	eprint = "hep-lat/0108007",
+	journal = "Phys. Rev.",
+	note = "Erratum-ibid.D65:079903,2002",
+	pages = "114508",
+	slaccitation = "%\%CITATION = HEP-LAT 0108007;\%\%",
+	title = "{Light quark masses with overlap fermions in quenched {QCD}}",
+	volume = "D64",
+	year = "2001"
+}
+
+@article{Giusti:2002sm,
+	author = "Giusti, L. and Hoelbling, C. and L{\"u}scher, M. and Wittig, H.",
+	eprint = "hep-lat/0212012",
+	journal = "Comput. Phys. Commun.",
+	pages = "31--51",
+	slaccitation = "%\%CITATION = HEP-LAT 0212012;\%\%",
+	title = "{Numerical techniques for lattice QCD in the epsilon-regime}",
+	volume = "153",
+	year = "2003"
+}
+
+@article{Giusti:2007hk,
+	author = "Giusti, Leonardo",
+	eprint = "hep-lat/0702014",
+	journal = "PoS.",
+	pages = "",
+	slaccitation = "%\%CITATION = HEP-LAT/0702014;\%\%",
+	title = "{Light dynamical fermions on the lattice: Toward the chiral regime of QCD}",
+	volume = "LAT2006",
+	year = "2007"
+}
+
+@article{Glassner:1996gz,
+	author = "Gl{\"a}ssner, U. and others",
+	eprint = "hep-lat/9605008",
+	slaccitation = "%\%CITATION = HEP-LAT 9605008;\%\%",
+	title = "{How to compute {G}reen's functions for entire mass trajectories within {K}rylov solvers}",
+	year = "1996"
+}
+
+@article{Gockeler:1998fn,
+	author = "G{\"o}ckeler, M. and others",
+	eprint = "hep-lat/9707021",
+	journal = "Phys. Rev.",
+	pages = "5562--5580",
+	slaccitation = "%\%CITATION = HEP-LAT 9707021;\%\%",
+	title = "{Scaling of non-perturbatively {O(a)} improved {Wilson} fermions: Hadron spectrum, quark masses and decay constants}",
+	volume = "D57",
+	year = "1998"
+}
+
+@article{Gorishnii:1990vf,
+	author = "Gorishnii, S. G. and Kataev, A. L. and Larin, S. A.",
+	journal = "Phys. Lett.",
+	pages = "144--150",
+	slaccitation = "%\%CITATION = PHLTA,B259,144;\%\%",
+	title = "{The O (alpha-s**3) corrections to sigma-tot (e+ e- $\to$ hadrons) and Gamma (tau- $\to$ tau-neutrino + hadrons) in QCD}",
+	volume = "B259",
+	year = "1991"
+}
+
+@article{Greenberg:1964pe,
+	author = "Greenberg, O. W.",
+	journal = "Phys. Rev. Lett.",
+	pages = "598--602",
+	slaccitation = "%\%CITATION = PRLTA,13,598;\%\%",
+	title = "{Spin and unitary spin independence in a paraquark model of baryons and mesons}",
+	volume = "13",
+	year = "1964"
+}
+
+@article{Gregory:2007ce,
+	archiveprefix = "arXiv",
+	author = "Gregory, Eric B. and Irving, Alan and Richards, Chris M. and McNeile, Craig and Hart, Alistair",
+	eprint = "0710.1725",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0710.1725;\%\%",
+	title = "{Pseudoscalar Flavor-Singlet Physics with Staggered Fermions}",
+	year = "2007"
+}
+
+@article{Gross:1973id,
+	author = "Gross, D. J. and Wilczek, F.",
+	journal = "Phys. Rev. Lett.",
+	pages = "1343--1346",
+	slaccitation = "%\%CITATION = PRLTA,30,1343;\%\%",
+	title = "{Ultraviolet behavior of non-Abelian gauge theories}",
+	volume = "30",
+	year = "1973"
+}
+
+@article{Gross:1973ju,
+	author = "Gross, D. J. and Wilczek, F.",
+	journal = "Phys. Rev.",
+	pages = "3633--3652",
+	slaccitation = "%\%CITATION = PHRVA,D8,3633;\%\%",
+	title = "{Asymptotically free gauge theories. 1}",
+	volume = "D8",
+	year = "1973"
+}
+
+@article{Gross:1974jv,
+	author = "Gross, D. J. and Neveu, A.",
+	journal = "Phys. Rev.",
+	pages = "3235",
+	slaccitation = "%\%CITATION = PHRVA,D10,3235;\%\%",
+	title = "{Dynamical symmetry breaking in asymptotically free field theories}",
+	volume = "D10",
+	year = "1974"
+}
+
+@article{Guagnelli:1998ud,
+	author = "Guagnelli, M. and Sommer, R. and Wittig, H.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/9806005",
+	journal = "Nucl. Phys.",
+	pages = "389--402",
+	slaccitation = "%\%CITATION = HEP-LAT 9806005;\%\%",
+	title = "{Precision computation of a low-energy reference scale in quenched lattice {QCD}}",
+	volume = "B535",
+	year = "1998"
+}
+
+@article{Guagnelli:2004ga,
+	author = "Guagnelli, M. and others",
+	collaboration = "Zeuthen-Rome (ZeRo)",
+	eprint = "hep-lat/0405027",
+	journal = "Eur. Phys. J.",
+	pages = "69--80",
+	slaccitation = "%\%CITATION = HEP-LAT 0405027;\%\%",
+	title = "{Non-perturbative pion matrix element of a twist-2 operator from the lattice}",
+	volume = "C40",
+	year = "2005"
+}
+
+@article{Guagnelli:2004ww,
+	author = "Guagnelli, M. and others",
+	collaboration = "Zeuthen-Rome (ZeRo)",
+	eprint = "hep-lat/0403009",
+	journal = "Phys. Lett.",
+	pages = "216--221",
+	slaccitation = "%\%CITATION = HEP-LAT 0403009;\%\%",
+	title = "{Finite size effects of a pion matrix element}",
+	volume = "B597",
+	year = "2004"
+}
+
+@article{Guagnelli:2005zc,
+	author = "Guagnelli, M. and Heitger, J. and Pena, C. and Sint, S. and Vladikas, A.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0505002",
+	journal = "JHEP",
+	pages = "088",
+	slaccitation = "%\%CITATION = HEP-LAT 0505002;\%\%",
+	title = "{Non-perturbative renormalization of left-left four-fermion operators in quenched lattice QCD}",
+	volume = "03",
+	year = "2006"
+}
+
+@article{Gupta:1988js,
+	author = "Gupta, R. and Kilcup, G. W. and Sharpe, S. R.",
+	journal = "Phys. Rev.",
+	pages = "1278",
+	slaccitation = "%\%CITATION = PHRVA,D38,1278;\%\%",
+	title = "{Tuning the hybrid monte carlo algorithm}",
+	volume = "D38",
+	year = "1988"
+}
+
+@article{Gupta:1989kx,
+	author = "Gupta, R. and others",
+	journal = "Phys. Rev.",
+	pages = "2072",
+	slaccitation = "%\%CITATION = PHRVA,D40,2072;\%\%",
+	title = "{{QCD} with dynamical {Wilson} fermions}",
+	volume = "D40",
+	year = "1989"
+}
+
+@article{Gupta:1990ka,
+	author = "Gupta, S. and Irback, A. and Karsch, F. and Petersson, B.",
+	journal = "Phys. Lett.",
+	pages = "437--443",
+	slaccitation = "%\%CITATION = PHLTA,B242,437;\%\%",
+	title = "{The acceptance probability in the hybrid monte carlo method}",
+	volume = "B242",
+	year = "1990"
+}
+
+@article{Gupta:1991sn,
+	author = "Gupta, R. and others",
+	journal = "Phys. Rev.",
+	pages = "3272--3292",
+	slaccitation = "%\%CITATION = PHRVA,D44,3272;\%\%",
+	title = "{{QCD} with dynamical {Wilson} fermions. 2}",
+	volume = "D44",
+	year = "1991"
+}
+
+@unpublished{Gupta:1997nd,
+	author = "Gupta, R.",
+	eprint = "hep-lat/9807028",
+	note = "Lectures given at Les Houches Summer School in Theoretical Physics, Session 68",
+	slaccitation = "%\%CITATION = HEP-LAT 9807028;\%\%",
+	title = "{Introduction to lattice {QCD}}",
+	year = "1997"
+}
+
+@article{Han:1965pf,
+	author = "Han, M. Y. and Nambu, Yoichiro",
+	journal = "Phys. Rev.",
+	pages = "B1006--B1010",
+	slaccitation = "%\%CITATION = PHRVA,139,B1006;\%\%",
+	title = "{Three-triplet model with double SU(3) symmetry}",
+	volume = "139",
+	year = "1965"
+}
+
+@article{Hasenbusch:2001ne,
+	author = "Hasenbusch, M.",
+	eprint = "hep-lat/0107019",
+	journal = "Phys. Lett.",
+	pages = "177--182",
+	slaccitation = "%\%CITATION = HEP-LAT 0107019;\%\%",
+	title = "{Speeding up the {H}ybrid-{M}onte-{C}arlo algorithm for dynamical fermions}",
+	volume = "B519",
+	year = "2001"
+}
+
+@article{Hasenbusch:2002ai,
+	archiveprefix = "arXiv",
+	author = "Hasenbusch, M. and Jansen, K.",
+	doi = "10.1016/S0550-3213(03)00227-X",
+	eprint = "hep-lat/0211042",
+	journal = "Nucl.Phys.",
+	pages = "299--320",
+	primaryclass = "hep-lat",
+	reportnumber = "DESY-02-200",
+	slaccitation = "%\%CITATION = HEP-LAT/0211042;\%\%",
+	title = "{Speeding up lattice QCD simulations with clover improved Wilson fermions}",
+	volume = "B659",
+	year = "2003"
+}
+
+@article{Hasenbusch:2003vg,
+	author = "Hasenbusch, M.",
+	eprint = "hep-lat/0310029",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "27--33",
+	slaccitation = "%\%CITATION = HEP-LAT 0310029;\%\%",
+	title = "{Full {QCD} algorithms towards the chiral limit}",
+	volume = "129",
+	year = "2004"
+}
+
+@article{Hasenfratz:1998jp,
+	author = "Hasenfratz, P.",
+	eprint = "hep-lat/9802007",
+	journal = "Nucl. Phys.",
+	pages = "401--409",
+	slaccitation = "%\%CITATION = HEP-LAT 9802007;\%\%",
+	title = "{Lattice {QCD} without tuning, mixing and current renormalization}",
+	volume = "B525",
+	year = "1998"
+}
+
+@article{Hasenfratz:1998ri,
+	author = "Hasenfratz, P. and Laliena, V. and Niedermayer, F.",
+	eprint = "hep-lat/9801021",
+	journal = "Phys. Lett.",
+	pages = "125--131",
+	slaccitation = "%\%CITATION = HEP-LAT 9801021;\%\%",
+	title = "{The index theorem in {QCD} with a finite cut-off}",
+	volume = "B427",
+	year = "1998"
+}
+
+@article{Hasenfratz:2001hp,
+	author = "Hasenfratz, A. and Knechtli, F.",
+	eprint = "hep-lat/0103029",
+	journal = "Phys. Rev.",
+	pages = "034504",
+	slaccitation = "%\%CITATION = HEP-LAT 0103029;\%\%",
+	title = "{Flavor symmetry and the static potential with hypercubic blocking}",
+	volume = "D64",
+	year = "2001"
+}
+
+@article{Hasenfratz:2001tw,
+	author = "Hasenfratz, A. and Hoffmann, R. and Knechtli, F.",
+	eprint = "hep-lat/0110168",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "418--420",
+	slaccitation = "%\%CITATION = HEP-LAT 0110168;\%\%",
+	title = "{The static potential with hypercubic blocking}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Hashimoto:2008xg,
+	archiveprefix = "arXiv",
+	author = "Hashimoto, Koichi and Izubuchi, Taku",
+	eprint = "0803.0186",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0803.0186;\%\%",
+	title = "{eta' meson from two flavor dynamical domain wall fermions}",
+	year = "2008"
+}
+
+@article{Heitger:2000ay,
+	author = "Heitger, J. and Sommer, R. and Wittig, H.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0006026",
+	journal = "Nucl. Phys.",
+	note = "and references therein",
+	pages = "377--399",
+	slaccitation = "%\%CITATION = HEP-LAT 0006026;\%\%",
+	title = "{Effective chiral Lagrangians and lattice {{QCD}}}",
+	volume = "B588",
+	year = "2000"
+}
+
+@article{Hernandez:1998et,
+	author = "Hernandez, P. and Jansen, K. and L{\"u}scher, M.",
+	eprint = "hep-lat/9808010",
+	journal = "Nucl. Phys.",
+	pages = "363--378",
+	slaccitation = "%\%CITATION = HEP-LAT 9808010;\%\%",
+	title = "{Locality properties of Neuberger's lattice Dirac operator}",
+	volume = "B552",
+	year = "1999"
+}
+
+@article{Hernandez:2000sb,
+	author = "Hernandez, P. and Jansen, K. and Lellouch, L.",
+	eprint = "hep-lat/0001008",
+	slaccitation = "%\%CITATION = HEP-LAT 0001008;\%\%",
+	title = "{A numerical treatment of Neuberger's lattice Dirac operator}",
+	year = "2000"
+}
+
+@article{Hernandez:2001hq,
+	author = "Hernandez, P. and Jansen, K. and Lellouch, L. and Wittig, H.",
+	eprint = "hep-lat/0110199",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "766--771",
+	slaccitation = "%\%CITATION = HEP-LAT 0110199;\%\%",
+	title = "{Scalar condensate and light quark masses from overlap fermions}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Hernandez:2001yn,
+	author = "Hernandez, P. and Jansen, K. and Lellouch, L. and Wittig, H.",
+	eprint = "hep-lat/0106011",
+	journal = "JHEP",
+	pages = "018",
+	slaccitation = "%\%CITATION = HEP-LAT 0106011;\%\%",
+	title = "{Non-perturbative renormalization of the quark condensate in {Ginsparg}-{Wilson} regularizations}",
+	volume = "07",
+	year = "2001"
+}
+
+@article{Horsley:2004mx,
+	author = "Horsley, R. and Perlt, H. and Rakow, P. E. L. and Schierholz, G. and Schiller, A.",
+	collaboration = "QCDSF",
+	eprint = "hep-lat/0404007",
+	journal = "Nucl. Phys.",
+	pages = "3--35",
+	slaccitation = "%\%CITATION = HEP-LAT 0404007;\%\%",
+	title = "{One-loop renormalisation of quark bilinears for overlap fermions with improved gauge actions}",
+	volume = "B693",
+	year = "2004"
+}
+
+@article{Ilgenfritz:2003gw,
+	author = "Ilgenfritz, E.-M. and Kerler, W. and M{\"u}ller-Preu{\ss}ker, M. and Sternbeck, A. and St{\"u}ben, H.",
+	eprint = "hep-lat/0309057",
+	journal = "Phys. Rev.",
+	pages = "074511",
+	slaccitation = "%\%CITATION = HEP-LAT 0309057;\%\%",
+	title = "{A numerical reinvestigation of the {Aoki} phase with {N(f)} = 2 {Wilson} fermions at zero temperature}",
+	volume = "D69",
+	year = "2004"
+}
+
+@article{Ilgenfritz:2006tz,
+	author = "Ilgenfritz, E. -M. and others",
+	eprint = "hep-lat/0610112",
+	slaccitation = "%\%CITATION = HEP-LAT 0610112;\%\%",
+	title = "{Twisted mass QCD thermodynamics: First results on apeNEXT}",
+	year = "2006"
+}
+
+@article{Iwasaki:1983ck,
+	author = "Iwasaki, Y.",
+	note = "UTHEP-118",
+	title = "{Renormalization group analysis of lattice theories and improved lattice action. 2. four-dimensional nonabelian SU(N) gauge model}"
+}
+
+@article{Iwasaki:1985we,
+	author = "Iwasaki, Y.",
+	journal = "Nucl. Phys.",
+	pages = "141--156",
+	slaccitation = "%\%CITATION = NUPHA,B258,141;\%\%",
+	title = "{Renormalization group analysis of lattice theories and improved lattice action: two-dimensional nonlinear O(N) sigma model}",
+	volume = "B258",
+	year = "1985"
+}
+
+@article{Iwasaki:1992hn,
+	author = "Iwasaki, Y. and Kanaya, K. and Sakai, S. and Yoshie, T.",
+	eprint = "hep-lat/9211035",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "327--330",
+	slaccitation = "%\%CITATION = HEP-LAT 9211035;\%\%",
+	title = "{Quark confinement in multi - flavor quantum chromodynamics}",
+	volume = "30",
+	year = "1993"
+}
+
+@article{Izubuchi:1998hy,
+	author = "Izubuchi, T. and Noaki, J. and Ukawa, A.",
+	eprint = "hep-lat/9805019",
+	journal = "Phys. Rev.",
+	pages = "114507",
+	slaccitation = "%\%CITATION = HEP-LAT 9805019;\%\%",
+	title = "{Two-dimensional lattice Gross-Neveu model with {Wilson} fermion action at finite temperature and chemical potential}",
+	volume = "D58",
+	year = "1998"
+}
+
+@article{Jacobs:1983ph,
+	author = "Jacobs, L.",
+	journal = "Phys. Rev. Lett.",
+	pages = "172",
+	slaccitation = "%\%CITATION = PRLTA,51,172;\%\%",
+	title = "{Undoubling chirally symmetric lattice fermions}",
+	volume = "51",
+	year = "1983"
+}
+
+@article{Jagels:1994a,
+	author = "Jagels, C. F. and Reichel, L.",
+	journal = "Numer. Linear Algebra Appl.",
+	pages = "555--570",
+	title = "{A fast minimal residual algorithm for shifted unitary matrices}",
+	volume = "1(6)",
+	year = "1994"
+}
+
+@article{Jagels:1994aa,
+	author = "Jagels, C. F. and Reichel, L.",
+	journal = "Numerical Linear Algebra with Applications",
+	pages = "555--570",
+	title = "{A Fast Minimal Residual Algorithm for Shifted Unitary Matrices}",
+	volume = "1(6)",
+	year = "1994"
+}
+
+@article{Jansen:1994ym,
+	author = "Jansen, K.",
+	eprint = "hep-lat/9410018",
+	journal = "Phys. Rept.",
+	pages = "1--54",
+	slaccitation = "%\%CITATION = HEP-LAT 9410018;\%\%",
+	title = "{Domain wall fermions and chiral gauge theories}",
+	volume = "273",
+	year = "1996"
+}
+
+@article{Jansen:1995ck,
+	author = "Jansen, Karl and others",
+	eprint = "hep-lat/9512009",
+	journal = "Phys. Lett.",
+	pages = "275--282",
+	slaccitation = "%\%CITATION = HEP-LAT 9512009;\%\%",
+	title = "{Non-perturbative renormalization of lattice QCD at all scales}",
+	volume = "B372",
+	year = "1996"
+}
+
+@article{Jansen:1996cq,
+	author = "Jansen, K. and Liu, C.",
+	eprint = "hep-lat/9607057",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "974--976",
+	slaccitation = "%\%CITATION = HEP-LAT 9607057;\%\%",
+	title = "{Study of Liapunov exponents and the reversibility of molecular dynamics algorithms}",
+	volume = "53",
+	year = "1997"
+}
+
+@article{Jansen:1996xp,
+	author = "Jansen, K.",
+	eprint = "hep-lat/9607051",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "127--133",
+	slaccitation = "%\%CITATION = HEP-LAT 9607051;\%\%",
+	title = "{Recent developments in fermion simulation algorithms}",
+	volume = "53",
+	year = "1997"
+}
+
+@article{Jansen:1997yt,
+	author = "Jansen, K. and Liu, C.",
+	eprint = "hep-lat/9603008",
+	journal = "Comput. Phys. Commun.",
+	pages = "221--234",
+	slaccitation = "%\%CITATION = HEP-LAT 9603008;\%\%",
+	title = "{Implementation of Symanzik's improvement program for simulations of dynamical {Wilson} fermions in lattice {QCD}}",
+	volume = "99",
+	year = "1997"
+}
+
+@article{Jansen:1998mx,
+	author = "Jansen, K. and Sommer, R.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/9803017",
+	journal = "Nucl. Phys.",
+	pages = "185--203",
+	slaccitation = "%\%CITATION = HEP-LAT 9803017;\%\%",
+	title = "{{O(a)} improvement of lattice {QCD} with two flavors of {Wilson} quarks}",
+	volume = "B530",
+	year = "1998"
+}
+
+@article{Jansen:2003ir,
+	author = "Jansen, K. and Shindler, A. and Urbach, C. and Wetzorke, I.",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0312013",
+	journal = "Phys. Lett.",
+	pages = "432--438",
+	slaccitation = "%\%CITATION = HEP-LAT 0312013;\%\%",
+	title = "{Scaling test for {Wilson} twisted mass {QCD}}",
+	volume = "B586",
+	year = "2004"
+}
+
+@article{Jansen:2003jq,
+	author = "Jansen, K. and Nagai, K.-I.",
+	eprint = "hep-lat/0305009",
+	journal = "JHEP",
+	pages = "038",
+	slaccitation = "%\%CITATION = HEP-LAT 0305009;\%\%",
+	title = "{Reducing residual-mass effects for domain-wall fermions}",
+	volume = "12",
+	year = "2003"
+}
+
+@article{Jansen:2003nt,
+	author = "Jansen, K.",
+	eprint = "hep-lat/0311039",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "3--16",
+	slaccitation = "%\%CITATION = HEP-LAT 0311039;\%\%",
+	title = "{Actions for dynamical fermion simulations: Are we ready to go?}",
+	volume = "129",
+	year = "2004"
+}
+
+@article{Jansen:2005cg,
+	author = "Jansen, K. and others",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0507032",
+	journal = "Phys. Lett.",
+	pages = "334--341",
+	slaccitation = "%\%CITATION = HEP-LAT 0507032;\%\%",
+	title = "{Flavour breaking effects of {Wilson} twisted mass fermions}",
+	volume = "B624",
+	year = "2005"
+}
+
+@unpublished{Jansen:2005chi,
+	author = "Jansen, K. and others",
+	collaboration = "\xlf",
+	note = "in preparation",
+	optannote = "",
+	optkey = "",
+	optmonth = "",
+	title = "{}",
+	year = "2005"
+}
+
+@article{Jansen:2005gf,
+	author = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach, C. and Wetzorke, I.",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0503031",
+	journal = "Phys. Lett.",
+	pages = "184--191",
+	slaccitation = "%\%CITATION = HEP-LAT 0503031;\%\%",
+	title = "{Light quarks with twisted mass fermions}",
+	volume = "B619",
+	year = "2005"
+}
+
+@article{Jansen:2005kk,
+	author = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach, C. and Wetzorke, I.",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0507010",
+	journal = "JHEP",
+	pages = "071",
+	slaccitation = "%\%CITATION = HEP-LAT 0507010;\%\%",
+	title = "{Quenched scaling of {Wilson} twisted mass fermions}",
+	volume = "09",
+	year = "2005"
+}
+
+@article{Jansen:2005yp,
+	author = "Jansen, Karl and Shindler, Andrea and Urbach, Carsten and Wenger, Urs",
+	eprint = "hep-lat/0510064",
+	journal = "PoS",
+	pages = "118",
+	slaccitation = "%\%CITATION = HEP-LAT 0510064;\%\%",
+	title = "{{HMC} algorithm with multiple time scale integration and mass preconditioning}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{Jansen:2006ks,
+	author = "Jansen, Karl",
+	eprint = "hep-lat/0609012",
+	slaccitation = "%\%CITATION = HEP-LAT 0609012;\%\%",
+	title = "{Status report on ILDG activities}",
+	year = "2006"
+}
+
+@article{Jansen:2006rf,
+	author = "Jansen, Karl and Urbach, Carsten",
+	collaboration = "ETM",
+	eprint = "hep-lat/0610015",
+	slaccitation = "%\%CITATION = HEP-LAT 0610015;\%\%",
+	title = "{First results with two light flavours of quarks with maximally twisted mass}",
+	year = "2006"
+}
+
+@article{Jansen:2008wv,
+	archiveprefix = "arXiv",
+	author = "Jansen, K. and Michael, C. and Urbach, C.",
+	collaboration = "ETM",
+	eprint = "0804.3871",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0804.3871;\%\%",
+	title = "{The eta' meson from lattice {QCD}}",
+	year = "2008"
+}
+
+@article{Jansen:2008zz,
+	author = "Jansen, K. and Michael, C. and Urbach, C.",
+	doi = "10.1140/epjc/s10052-008-0764-6",
+	journal = "Eur. Phys. J.",
+	pages = "261--269",
+	slaccitation = "%\%CITATION = EPHJA,C58,261;\%\%",
+	title = "{The eta-prime meson from lattice QCD}",
+	volume = "C58",
+	year = "2008"
+}
+
+@unpublished{Jegerlehner:1996pm,
+	author = "Jegerlehner, Beat",
+	eprint = "hep-lat/9612014",
+	note = "unpublished",
+	slaccitation = "%\%CITATION = HEP-LAT 9612014;\%\%",
+	title = "{Krylov space solvers for shifted linear systems}",
+	year = "1996"
+}
+
+@article{Jegerlehner:1997rn,
+	author = "Jegerlehner, B.",
+	eprint = "hep-lat/9708029",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "958--960",
+	slaccitation = "%\%CITATION = HEP-LAT 9708029;\%\%",
+	title = "{Multiple mass solvers}",
+	volume = "63",
+	year = "1998"
+}
+
+@article{Jegerlehner:2003qp,
+	author = "Jegerlehner, F.",
+	eprint = "hep-ph/0310234",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "325--334",
+	slaccitation = "%\%CITATION = HEP-PH 0310234;\%\%",
+	title = "{Theoretical precision in estimates of the hadronic contributions to (g-2)mu and alpha(QED)(M(Z))}",
+	volume = "126",
+	year = "2004"
+}
+
+@article{Jenkins:1990jv,
+	author = "Jenkins, Elizabeth Ellen and Manohar, Aneesh V.",
+	journal = "Phys. Lett.",
+	pages = "558--562",
+	slaccitation = "%\%CITATION = PHLTA,B255,558;\%\%",
+	title = "{Baryon chiral perturbation theory using a heavy fermion Lagrangian}",
+	volume = "B255",
+	year = "1991"
+}
+
+@article{Kaiser:1998ds,
+	author = "Kaiser, Roland and Leutwyler, H.",
+	eprint = "hep-ph/9806336",
+	slaccitation = "%\%CITATION = HEP-PH/9806336;\%\%",
+	title = "{Pseudoscalar decay constants at large N(c)}",
+	year = "1998"
+}
+
+@article{Kalkreuter:1995mm,
+	author = "Kalkreuter, Thomas and Simma, Hubert",
+	eprint = "hep-lat/9507023",
+	journal = "Comput. Phys. Commun.",
+	pages = "33--47",
+	slaccitation = "%\%CITATION = HEP-LAT 9507023;\%\%",
+	title = "{An Accelerated conjugate gradient algorithm to compute low lying eigenvalues: A Study for the Dirac operator in SU(2) lattice QCD}",
+	volume = "93",
+	year = "1996"
+}
+
+@article{Kalkreuter:1996mm,
+	author = "Kalkreuter, T. and Simma, H.",
+	eprint = "hep-lat/9507023",
+	journal = "Comput. Phys. Commun.",
+	pages = "33--47",
+	slaccitation = "%\%CITATION = HEP-LAT 9507023;\%\%",
+	title = "{An Accelerated conjugate gradient algorithm to compute low lying eigenvalues: A Study for the Dirac operator in SU(2) lattice {QCD}}",
+	volume = "93",
+	year = "1996"
+}
+
+@article{Kaplan:1992bt,
+	author = "Kaplan, D. B.",
+	eprint = "hep-lat/9206013",
+	journal = "Phys. Lett.",
+	pages = "342--347",
+	slaccitation = "%\%CITATION = HEP-LAT 9206013;\%\%",
+	title = "{A Method for simulating chiral fermions on the lattice}",
+	volume = "B288",
+	year = "1992"
+}
+
+@article{Karsten:1980wd,
+	author = "Karsten, L. H. and Smit, J.",
+	journal = "Nucl. Phys.",
+	pages = "103",
+	slaccitation = "%\%CITATION = NUPHA,B183,103;\%\%",
+	title = "{Lattice fermions: species doubling, chiral invariance, and the triangle anomaly}",
+	volume = "B183",
+	year = "1981"
+}
+
+@article{Kennedy:1990bv,
+	author = "Kennedy, A. D. and Pendleton, B.",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "118--121",
+	slaccitation = "%\%CITATION = NUPHZ,20,118;\%\%",
+	title = "{Acceptances and autocorrelations in hybrid Monte Carlo}",
+	volume = "20",
+	year = "1991"
+}
+
+@article{Knechtli:1998gf,
+	author = "Knechtli, F. and Sommer, R.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/9807022",
+	journal = "Phys. Lett.",
+	pages = "345--352",
+	slaccitation = "%\%CITATION = HEP-LAT 9807022;\%\%",
+	title = "{String breaking in SU(2) gauge theory with scalar matter fields}",
+	volume = "B440",
+	year = "1998"
+}
+
+@article{Knechtli:2000df,
+	author = "Knechtli, F. and Sommer, R.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0005021",
+	journal = "Nucl. Phys.",
+	pages = "309--328",
+	slaccitation = "%\%CITATION = HEP-LAT 0005021;\%\%",
+	title = "{String breaking as a mixing phenomenon in the SU(2) Higgs model}",
+	volume = "B590",
+	year = "2000"
+}
+
+@article{Lacock:1994qx,
+	author = "Lacock, P. and McKerrell, A. and Michael, C. and Stopher, I. M. and Stephenson, P. W.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/9412079",
+	journal = "Phys. Rev.",
+	pages = "6403--6410",
+	slaccitation = "%\%CITATION = HEP-LAT 9412079;\%\%",
+	title = "{Efficient hadronic operators in lattice gauge theory}",
+	volume = "D51",
+	year = "1995"
+}
+
+@article{Lepage:1992xa,
+	author = "Lepage, G. Peter and Mackenzie, Paul B.",
+	eprint = "hep-lat/9209022",
+	journal = "Phys. Rev.",
+	pages = "2250--2264",
+	slaccitation = "%\%CITATION = HEP-LAT 9209022;\%\%",
+	title = "{On the viability of lattice perturbation theory}",
+	volume = "D48",
+	year = "1993"
+}
+
+@article{Lepage:2001ym,
+	archiveprefix = "arXiv",
+	author = "Lepage, G. P. and others",
+	doi = "10.1016/S0920-5632(01)01638-3",
+	eprint = "hep-lat/0110175",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "12--20",
+	slaccitation = "%\%CITATION = HEP-LAT/0110175;\%\%",
+	title = "{Constrained curve fitting}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Lesk:2002gd,
+	author = "Lesk, V. I. and others",
+	collaboration = "CP-PACS",
+	eprint = "hep-lat/0211040",
+	journal = "Phys. Rev.",
+	pages = "074503",
+	slaccitation = "%\%CITATION = HEP-LAT/0211040;\%\%",
+	title = "{Flavor singlet meson mass in the continuum limit in two-flavor lattice QCD}",
+	volume = "D67",
+	year = "2003"
+}
+
+@article{Leutwyler:1997yr,
+	author = "Leutwyler, H.",
+	eprint = "hep-ph/9709408",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "223--231",
+	slaccitation = "%\%CITATION = HEP-PH/9709408;\%\%",
+	title = "{On the 1/N-expansion in chiral perturbation theory}",
+	volume = "64",
+	year = "1998"
+}
+
+@article{Leutwyler:2006qq,
+	author = "Leutwyler, H.",
+	eprint = "hep-ph/0612112",
+	slaccitation = "%\%CITATION = HEP-PH 0612112;\%\%",
+	title = "{pi pi scattering}",
+	year = "2006"
+}
+
+@article{Liu:1997fs,
+	author = "Liu, C. and Jaster, A. and Jansen, K.",
+	eprint = "hep-lat/9708017",
+	journal = "Nucl. Phys.",
+	pages = "603--617",
+	slaccitation = "%\%CITATION = HEP-LAT 9708017;\%\%",
+	title = "{Liapunov exponents and the reversibility of molecular dynamics algorithms}",
+	volume = "B524",
+	year = "1998"
+}
+
+@article{Luscher:1985dn,
+	author = "L{\"u}scher, M.",
+	doi = "10.1007/BF01211589",
+	journal = "Commun. Math. Phys.",
+	pages = "177",
+	slaccitation = "%\%CITATION = CMPHA,104,177;\%\%",
+	title = "{Volume Dependence of the Energy Spectrum in Massive Quantum Field Theories. 1. Stable Particle States}",
+	volume = "104",
+	year = "1986"
+}
+
+@article{Luscher:1990ck,
+	author = "L{\"u}scher, M. and Wolff, U.",
+	journal = "Nucl. Phys.",
+	pages = "222--252",
+	slaccitation = "%\%CITATION = NUPHA,B339,222;\%\%",
+	title = "{How to calculate the elastic scattering matrix in two-dimensional quantum field theories by numerical simulation}",
+	volume = "B339",
+	year = "1990"
+}
+
+@article{Luscher:1993dy,
+	archiveprefix = "arXiv",
+	author = "L{\"u}scher, Martin",
+	doi = "10.1016/0010-4655(94)90232-1",
+	eprint = "hep-lat/9309020",
+	journal = "Comput. Phys. Commun.",
+	pages = "100--110",
+	slaccitation = "%\%CITATION = HEP-LAT/9309020;\%\%",
+	title = "{A Portable high quality random number generator for lattice field theory simulations}",
+	volume = "79",
+	year = "1994"
+}
+
+@article{Luscher:1993xx,
+	author = "L{\"u}scher, M.",
+	eprint = "hep-lat/9311007",
+	journal = "Nucl. Phys.",
+	pages = "637--648",
+	slaccitation = "%\%CITATION = HEP-LAT 9311007;\%\%",
+	title = "{A New approach to the problem of dynamical quarks in numerical simulations of lattice {QCD}}",
+	volume = "B418",
+	year = "1994"
+}
+
+@article{Luscher:1996sc,
+	author = "L{\"u}scher, M. and Sint, S. and Sommer, R. and Weisz, P.",
+	eprint = "hep-lat/9605038",
+	journal = "Nucl. Phys.",
+	pages = "365--400",
+	slaccitation = "%\%CITATION = HEP-LAT 9605038;\%\%",
+	title = "{Chiral symmetry and {O(a)} improvement in lattice {QCD}}",
+	volume = "B478",
+	year = "1996"
+}
+
+@article{Luscher:1996ug,
+	author = "L{\"u}scher, M. and Sint, S. and Sommer, R. and Weisz, P. and Wolff, U.",
+	eprint = "hep-lat/9609035",
+	journal = "Nucl. Phys.",
+	pages = "323--343",
+	slaccitation = "%\%CITATION = HEP-LAT 9609035;\%\%",
+	title = "{Non-perturbative {O(a)} improvement of lattice {QCD}}",
+	volume = "B491",
+	year = "1997"
+}
+
+@article{Luscher:1998pq,
+	author = "L{\"u}scher, M.",
+	eprint = "hep-lat/9802011",
+	journal = "Phys. Lett.",
+	pages = "342--345",
+	slaccitation = "%\%CITATION = HEP-LAT 9802011;\%\%",
+	title = "{Exact chiral symmetry on the lattice and the {Ginsparg}-{Wilson} relation}",
+	volume = "B428",
+	year = "1998"
+}
+
+@article{Luscher:2001tx,
+	archiveprefix = "arXiv",
+	author = "L{\"u}scher, Martin",
+	doi = "10.1016/S0920-5632(01)01639-5",
+	eprint = "hep-lat/0110007",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "21--28",
+	slaccitation = "%\%CITATION = HEP-LAT/0110007;\%\%",
+	title = "{Lattice QCD on PCs?}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Luscher:2003qa,
+	author = "L{\"u}scher, M.",
+	eprint = "hep-lat/0310048",
+	journal = "Comput. Phys. Commun.",
+	pages = "209--220",
+	slaccitation = "%\%CITATION = HEP-LAT 0310048;\%\%",
+	title = "{Solution of the {D}irac equation in lattice {QCD} using a domain decomposition method}",
+	volume = "156",
+	year = "2004"
+}
+
+@article{Luscher:2004rx,
+	author = "L{\"u}scher, M.",
+	eprint = "hep-lat/0409106",
+	journal = "Comput. Phys. Commun.",
+	pages = "199",
+	slaccitation = "%\%CITATION = HEP-LAT 0409106;\%\%",
+	title = "{Schwarz-preconditioned {HMC} algorithm for two-flavour lattice {QCD}}",
+	volume = "165",
+	year = "2005"
+}
+
+@article{Luscher:2005mv,
+	author = "L{\"u}scher, Martin",
+	eprint = "hep-lat/0509152",
+	howpublished = "Talk presented at International Symposium on Lattice Field Theory (Lattice 2005)",
+	journal = "\href{http://pos.sissa.it/archive/conferences/020/008/LAT2005\_002.pdf}{PoS(LAT2005)002}",
+	slaccitation = "%\%CITATION = HEP-LAT 0509152;\%\%",
+	title = "{Lattice {QCD} with light {W}ilson quarks}",
+	year = "2005"
+}
+
+@article{Luscher:ranluxweb,
+	author = "L{\"u}scher, M.",
+	eprint = "http://luscher.web.cern.ch/luscher/ranlux/",
+	title = "{Ranlux random number generator}"
+}
+
+@article{Luscher:sse,
+	author = "L{\"u}scher, M.",
+	eprint = "http://luscher.web.cern.ch/luscher/QCDpbm/",
+	title = "{Lattice QCD parallel benchmark programs}"
+}
+
+@article{Madras:1988ei,
+	author = "Madras, N. and Sokal, A. D.",
+	journal = "J. Statist. Phys.",
+	pages = "109--186",
+	slaccitation = "%\%CITATION = JSTPB,50,109;\%\%",
+	title = "{The Pivot algorithm: a highly efficient Monte Carlo method for selfavoiding walk}",
+	volume = "50",
+	year = "1988"
+}
+
+@article{Martinelli:1982mw,
+	author = "Martinelli, G. and Zhang, Yi-Cheng",
+	journal = "Phys. Lett.",
+	pages = "433",
+	slaccitation = "%\%CITATION = PHLTA,B123,433;\%\%",
+	title = "{THE CONNECTION BETWEEN LOCAL OPERATORS ON THE LATTICE AND IN THE CONTINUUM AND ITS RELATION TO MESON DECAY CONSTANTS}",
+	volume = "B123",
+	year = "1983"
+}
+
+@article{Martinelli:1994ty,
+	archiveprefix = "arXiv",
+	author = "Martinelli, G. and Pittori, C. and Sachrajda, Christopher T. and Testa, M. and Vladikas, A.",
+	doi = "10.1016/0550-3213(95)00126-D",
+	eprint = "hep-lat/9411010",
+	journal = "Nucl. Phys.",
+	pages = "81--108",
+	slaccitation = "%\%CITATION = HEP-LAT/9411010;\%\%",
+	title = "{A General method for nonperturbative renormalization of lattice operators}",
+	volume = "B445",
+	year = "1995"
+}
+
+@article{McNeile:2000hf,
+	author = "McNeile, C. and Michael, C.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0006020",
+	journal = "Phys. Lett.",
+	pages = "123--129",
+	slaccitation = "%\%CITATION = HEP-LAT 0006020;\%\%",
+	title = "{The eta and eta' mesons in {QCD}}",
+	volume = "B491",
+	year = "2000"
+}
+
+@article{McNeile:2000xx,
+	author = "McNeile, Craig and Michael, Chris",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0010019",
+	journal = "Phys. Rev.",
+	pages = "114503",
+	slaccitation = "%\%CITATION = HEP-LAT0010019;\%\%",
+	title = "{Mixing of scalar glueballs and flavour-singlet scalar mesons}",
+	volume = "D63",
+	year = "2001"
+}
+
+@article{McNeile:2001cr,
+	author = "McNeile, C. and Michael, C. and Sharkey, K. J.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0107003",
+	journal = "Phys. Rev.",
+	pages = "014508",
+	slaccitation = "%\%CITATION = HEP-LAT 0107003;\%\%",
+	title = "{The flavor singlet mesons in {QCD}}",
+	volume = "D65",
+	year = "2002"
+}
+
+@article{McNeile:2002fh,
+	author = "McNeile, C. and Michael, C.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0212020",
+	journal = "Phys. Lett.",
+	pages = "177--184",
+	slaccitation = "%\%CITATION = HEP-LAT 0212020;\%\%",
+	title = "{Hadronic decay of a vector meson from the lattice}",
+	volume = "B556",
+	year = "2003"
+}
+
+@article{McNeile:2006bz,
+	author = "McNeile, C. and Michael, C.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0603007",
+	journal = "Phys. Rev.",
+	pages = "074506",
+	slaccitation = "%\%CITATION = HEP-LAT 0603007;\%\%",
+	title = "{Decay width of light quark hybrid meson from the lattice}",
+	volume = "D73",
+	year = "2006"
+}
+
+@article{Meyer:2006ty,
+	archiveprefix = "arXiv",
+	author = "Meyer, Harvey B. and others",
+	doi = "10.1016/j.cpc.2006.08.002",
+	eprint = "hep-lat/0606004",
+	journal = "Comput. Phys. Commun.",
+	pages = "91--97",
+	slaccitation = "%\%CITATION = HEP-LAT/0606004;\%\%",
+	title = "{Exploring the HMC trajectory-length dependence of autocorrelation times in lattice QCD}",
+	volume = "176",
+	year = "2007"
+}
+
+@article{Michael:1982gb,
+	author = "Michael, C. and Teasdale, I.",
+	journal = "Nucl. Phys.",
+	pages = "433",
+	slaccitation = "%\%CITATION = NUPHA,B215,433;\%\%",
+	title = "{EXTRACTING GLUEBALL MASSES FROM LATTICE QCD}",
+	volume = "B215",
+	year = "1983"
+}
+
+@article{Michael:1989mf,
+	author = "Michael, C.",
+	journal = "Nucl. Phys.",
+	pages = "515",
+	slaccitation = "%\%CITATION = NUPHA,B327,515;\%\%",
+	title = "{Particle decay in lattice gauge theory}",
+	volume = "B327",
+	year = "1989"
+}
+
+@article{Michael:1991nc,
+	author = "Michael, C.",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "417--419",
+	slaccitation = "%\%CITATION = NUPHZ,26,417;\%\%",
+	title = "{Hadronic forces from the lattice}",
+	volume = "26",
+	year = "1992"
+}
+
+@article{Michael:1993yj,
+	archiveprefix = "arXiv",
+	author = "Michael, Christopher",
+	doi = "10.1103/PhysRevD.49.2616",
+	eprint = "hep-lat/9310026",
+	journal = "Phys. Rev.",
+	pages = "2616--2619",
+	slaccitation = "%\%CITATION = HEP-LAT/9310026;\%\%",
+	title = "{Fitting correlated data}",
+	volume = "D49",
+	year = "1994"
+}
+
+@article{Michael:1994sz,
+	archiveprefix = "arXiv",
+	author = "Michael, Christopher and McKerrell, A.",
+	doi = "10.1103/PhysRevD.51.3745",
+	eprint = "hep-lat/9412087",
+	journal = "Phys. Rev.",
+	pages = "3745--3750",
+	slaccitation = "%\%CITATION = HEP-LAT/9412087;\%\%",
+	title = "{Fitting correlated hadron mass spectrum data}",
+	volume = "D51",
+	year = "1995"
+}
+
+@article{Michael:2007vn,
+	archiveprefix = "arXiv",
+	author = "Michael, C. and Urbach, C.",
+	collaboration = "ETM",
+	eprint = "0709.4564",
+	journal = "",
+	pages = "",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0709.4564;\%\%",
+	title = "{Neutral mesons and disconnected diagrams in Twisted Mass QCD}",
+	volume = "",
+	year = "2007"
+}
+
+@book{Montvay:1994cy,
+	author = "Montvay, I. and M{\"u}nster, G.",
+	publisher = "Cambridge University Press",
+	series = "{Cambridge Monographs on Mathematical Physics}",
+	title = "{Quantum fields on a lattice}",
+	year = "1994"
+}
+
+@article{Montvay:1995ea,
+	author = "Montvay, I.",
+	eprint = "hep-lat/9510042",
+	journal = "Nucl. Phys.",
+	pages = "259--284",
+	slaccitation = "%\%CITATION = HEP-LAT 9510042;\%\%",
+	title = "{An Algorithm for Gluinos on the Lattice}",
+	volume = "B466",
+	year = "1996"
+}
+
+@article{Montvay:2005tj,
+	author = "Montvay, I. and Scholz, E.",
+	eprint = "hep-lat/0506006",
+	journal = "Phys. Lett.",
+	pages = "73--79",
+	slaccitation = "%\%CITATION = HEP-LAT 0506006;\%\%",
+	title = "{Updating algorithms with multi-step stochastic correction}",
+	volume = "B623",
+	year = "2005"
+}
+
+@article{Morgan:2002a,
+	author = "Morgan, R. B.",
+	journal = "SIAM J. Sci. Comput.",
+	pages = "20",
+	title = "{GMRES with Deflated Restarting}",
+	volume = "24",
+	year = "2002"
+}
+
+@article{Morningstar:2003gk,
+	archiveprefix = "arXiv",
+	author = "Morningstar, Colin and Peardon, Mike J.",
+	doi = "10.1103/PhysRevD.69.054501",
+	eprint = "hep-lat/0311018",
+	journal = "Phys. Rev.",
+	pages = "054501",
+	slaccitation = "%\%CITATION = HEP-LAT/0311018;\%\%",
+	title = "{Analytic smearing of SU(3) link variables in lattice QCD}",
+	volume = "D69",
+	year = "2004"
+}
+
+@article{Munster:2004am,
+	author = "M{\"u}nster, G.",
+	eprint = "hep-lat/0407006",
+	journal = "JHEP",
+	pages = "035",
+	slaccitation = "%\%CITATION = HEP-LAT 0407006;\%\%",
+	title = "{On the phase structure of twisted mass lattice {QCD}}",
+	volume = "09",
+	year = "2004"
+}
+
+@article{Munster:2004wt,
+	author = "M{\"u}nster, Gernot and Schmidt, Christian and Scholz, Enno E.",
+	eprint = "hep-lat/0409066",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "320--322",
+	slaccitation = "%\%CITATION = HEP-LAT 0409066;\%\%",
+	title = "{Chiral perturbation theory for twisted mass {QCD}}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{Nagai:2005mi,
+	author = "Nagai, Kei-ichi and Jansen, Karl",
+	eprint = "hep-lat/0510076",
+	journal = "Phys. Lett.",
+	pages = "325--330",
+	slaccitation = "%\%CITATION = HEP-LAT 0510076;\%\%",
+	title = "{Two-dimensional lattice Gross-Neveu model with Wilson twisted mass fermions}",
+	volume = "B633",
+	year = "2006"
+}
+
+@unpublished{Nagai:priv,
+	author = "Nagai, K",
+	note = "private communication",
+	optannote = "",
+	optkey = "",
+	optmonth = "",
+	optyear = "",
+	title = "{Two-dimensional Gross-Neveu model with {Wilson} twisted mass fermions}"
+}
+
+@article{Necco:2001xg,
+	author = "Necco, S. and Sommer, R.",
+	eprint = "hep-lat/0108008",
+	journal = "Nucl. Phys.",
+	pages = "328--346",
+	slaccitation = "%\%CITATION = HEP-LAT 0108008;\%\%",
+	title = "{The {N(f)} = 0 heavy quark potential from short to intermediate distances}",
+	volume = "B622",
+	year = "2002"
+}
+
+@article{Necco:2003vh,
+	author = "Necco, Silvia",
+	eprint = "hep-lat/0309017",
+	journal = "Nucl. Phys.",
+	pages = "137--167",
+	slaccitation = "%\%CITATION = HEP-LAT 0309017;\%\%",
+	volume = "B683",
+	year = "2004"
+}
+
+@article{Neff:2001zr,
+	author = "Neff, H. and Eicker, N. and Lippert, T. and Negele, J. W. and Schilling, K.",
+	eprint = "hep-lat/0106016",
+	journal = "Phys. Rev.",
+	pages = "114509",
+	slaccitation = "%\%CITATION = HEP-LAT/0106016;\%\%",
+	title = "{On the low fermionic eigenmode dominance in {QCD} on the lattice}",
+	volume = "D64",
+	year = "2001"
+}
+
+@article{Neuberger:1997fp,
+	author = "Neuberger, H.",
+	eprint = "hep-lat/9707022",
+	journal = "Phys. Lett.",
+	pages = "141--144",
+	slaccitation = "%\%CITATION = HEP-LAT 9707022;\%\%",
+	title = "{Exactly massless quarks on the lattice}",
+	volume = "B417",
+	year = "1998"
+}
+
+@article{Neuberger:1998wv,
+	author = "Neuberger, H.",
+	eprint = "hep-lat/9801031",
+	journal = "Phys. Lett.",
+	pages = "353--355",
+	slaccitation = "%\%CITATION = HEP-LAT 9801031;\%\%",
+	title = "{More about exactly massless quarks on the lattice}",
+	volume = "B427",
+	year = "1998"
+}
+
+@article{Niedermayer:1998bi,
+	author = "Niedermayer, F.",
+	eprint = "hep-lat/9810026",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "105--119",
+	slaccitation = "%\%CITATION = HEP-LAT 9810026;\%\%",
+	title = "{Exact chiral symmetry, topological charge and related topics}",
+	volume = "73",
+	year = "1999"
+}
+
+@article{Nielsen:1980rz,
+	author = "Nielsen, H. B. and Ninomiya, M.",
+	journal = "Nucl. Phys.",
+	pages = "20",
+	slaccitation = "%\%CITATION = NUPHA,B185,20;\%\%",
+	title = "{Absence of neutrinos on a lattice. 1. proof by homotopy theory}",
+	volume = "B185",
+	year = "1981"
+}
+
+@article{Nielsen:1981hk,
+	author = "Nielsen, H. B. and Ninomiya, M.",
+	journal = "Phys. Lett.",
+	pages = "219",
+	slaccitation = "%\%CITATION = PHLTA,B105,219;\%\%",
+	title = "{No go theorem for regularizing chiral fermions}",
+	volume = "B105",
+	year = "1981"
+}
+
+@article{Nielsen:1981xu,
+	author = "Nielsen, H. B. and Ninomiya, M.",
+	journal = "Nucl. Phys.",
+	pages = "173",
+	slaccitation = "%\%CITATION = NUPHA,B193,173;\%\%",
+	title = "{Absence of neutrinos on a lattice. 2. intuitive topological proof}",
+	volume = "B193",
+	year = "1981"
+}
+
+@article{Noaki:1998zc,
+	author = "Noaki, J. and Izubuchi, T. and Ukawa, A.",
+	eprint = "hep-lat/9809071",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "483--485",
+	slaccitation = "%\%CITATION = HEP-LAT 9809071;\%\%",
+	title = "{Two-dimensional Gross-Neveu model with {Wilson} fermion action at finite temperature and density}",
+	volume = "73",
+	year = "1999"
+}
+
+@article{Orginos:2001xa,
+	author = "Orginos, K.",
+	collaboration = "RBC",
+	eprint = "hep-lat/0110074",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "721--723",
+	slaccitation = "%\%CITATION = HEP-LAT 0110074;\%\%",
+	title = "{Chiral properties of domain wall fermions with improved gauge actions}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Orth:2005kq,
+	author = "Orth, B. and Lippert, T. and Schilling, K.",
+	eprint = "hep-lat/0503016",
+	journal = "Phys. Rev.",
+	pages = "014503",
+	slaccitation = "%\%CITATION = HEP-LAT 0503016;\%\%",
+	title = "{Finite-size effects in lattice {QCD} with dynamical {Wilson} fermions}",
+	volume = "D72",
+	year = "2005"
+}
+
+@article{Osterwalder:1973dx,
+	author = "Osterwalder, K. and Schrader, R.",
+	journal = "Commun. Math. Phys.",
+	pages = "83--112",
+	slaccitation = "%\%CITATION = CMPHA,31,83;\%\%",
+	title = "{Axioms for euclidean Green's functions}",
+	volume = "31",
+	year = "1973"
+}
+
+@article{Osterwalder:1975tc,
+	author = "Osterwalder, K. and Schrader, R.",
+	journal = "Commun. Math. Phys.",
+	pages = "281",
+	slaccitation = "%\%CITATION = CMPHA,42,281;\%\%",
+	title = "{Axioms for euclidean Green's functions. 2}",
+	volume = "42",
+	year = "1975"
+}
+
+@article{Osterwalder:1977pc,
+	author = "Osterwalder, K. and Seiler, E.",
+	journal = "Ann. Phys.",
+	pages = "440",
+	slaccitation = "%\%CITATION = APNYA,110,440;\%\%",
+	title = "{Gauge field theories on the lattice}",
+	volume = "110",
+	year = "1978"
+}
+
+@article{PDBook,
+	author = "Eidelman, S. and others",
+	journal = "{Physics Letters B}",
+	pages = "1+",
+	title = "{Review of Particle Physics}",
+	url = "http://pdg.lbl.gov",
+	volume = "592",
+	year = "2004"
+}
+
+@article{Peardon:2002wb,
+	author = "Peardon, M. J. and Sexton, J.",
+	collaboration = "TrinLat",
+	eprint = "hep-lat/0209037",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "985--987",
+	slaccitation = "%\%CITATION = HEP-LAT 0209037;\%\%",
+	title = "{Multiple molecular dynamics time-scales in hybrid Monte Carlo fermion simulations}",
+	volume = "119",
+	year = "2003"
+}
+
+@book{Peskin:1995ev,
+	author = "Peskin, M. E. and Schroeder, D. V.",
+	optaddress = "Boulder, Colorado",
+	optannote = "",
+	optedition = "",
+	optkey = "",
+	optmonth = "",
+	optnote = "",
+	optnumber = "",
+	optseries = "Advanced Book Program",
+	optvolume = "",
+	publisher = "Westview Press",
+	title = "{An Introduction to quantum field theory}",
+	year = "1995"
+}
+
+@article{Politzer:1973fx,
+	author = "Politzer, H. D.",
+	journal = "Phys. Rev. Lett.",
+	pages = "1346--1349",
+	slaccitation = "%\%CITATION = PRLTA,30,1346;\%\%",
+	title = "{Reliable perturbative results for strong interactions?}",
+	volume = "30",
+	year = "1973"
+}
+
+@article{Politzer:1974fr,
+	author = "Politzer, H. D.",
+	journal = "Phys. Rept.",
+	pages = "129--180",
+	slaccitation = "%\%CITATION = PRPLC,14,129;\%\%",
+	title = "{Asymptotic freedom: an approach to strong interactions}",
+	volume = "14",
+	year = "1974"
+}
+
+@manual{R:2005,
+	address = "Vienna, Austria",
+	author = "{R Development Core Team}",
+	note = "{ISBN} 3-900051-07-0",
+	organization = "R Foundation for Statistical Computing",
+	title = "{R: A language and environment for statistical computing}",
+	url = "http://www.R-project.org",
+	year = "2005"
+}
+
+@book{Rothe:1992wy,
+	author = "Rothe, H.J.",
+	edition = "",
+	pages = "528",
+	publisher = "World Scientific, Singapore",
+	title = "{Lattice gauge theories}",
+	year = "1992"
+}
+
+@article{Rupak:2002sm,
+	author = "Rupak, G. and Shoresh, N.",
+	eprint = "hep-lat/0201019",
+	journal = "Phys. Rev.",
+	pages = "054503",
+	slaccitation = "%\%CITATION = HEP-LAT 0201019;\%\%",
+	title = "{Chiral perturbation theory for the {Wilson} lattice action}",
+	volume = "D66",
+	year = "2002"
+}
+
+@article{Saad:1993a,
+	author = "Saad, Y.",
+	journal = "SIAM J. Sci. Comput.",
+	pages = "461--469",
+	title = "{A flexible inner-outer preconditioned GMRES algorithm}",
+	volume = "14 (2)",
+	year = "1993"
+}
+
+@article{Sachrajda:2004mi,
+	archiveprefix = "arXiv",
+	author = "Sachrajda, C. T. and Villadoro, G.",
+	doi = "10.1016/j.physletb.2005.01.033",
+	eprint = "hep-lat/0411033",
+	journal = "Phys. Lett.",
+	pages = "73--85",
+	slaccitation = "%\%CITATION = HEP-LAT/0411033;\%\%",
+	title = "{Twisted boundary conditions in lattice simulations}",
+	volume = "B609",
+	year = "2005"
+}
+
+@article{Scorzato:2004da,
+	author = "Scorzato, L.",
+	eprint = "hep-lat/0407023",
+	journal = "Eur. Phys. J.",
+	pages = "445--455",
+	slaccitation = "%\%CITATION = HEP-LAT 0407023;\%\%",
+	title = "{Pion mass splitting and phase structure in twisted mass {QCD}}",
+	volume = "C37",
+	year = "2004"
+}
+
+@article{Scorzato:2005rb,
+	author = "Scorzato, L. and others",
+	eprint = "hep-lat/0511036",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "283--290",
+	slaccitation = "%\%CITATION = HEP-LAT 0511036;\%\%",
+	title = "{N(f) = 2 lattice {QCD} and chiral perturbation theory}",
+	volume = "153",
+	year = "2006"
+}
+
+@article{Sexton:1992nu,
+	author = "Sexton, J. C. and Weingarten, D. H.",
+	journal = "Nucl. Phys.",
+	pages = "665--678",
+	slaccitation = "%\%CITATION = NUPHA,B380,665;\%\%",
+	title = "{Hamiltonian evolution for the hybrid monte carlo algorithm}",
+	volume = "B380",
+	year = "1992"
+}
+
+@article{Sharpe:1998xm,
+	author = "Sharpe, S. R. and Singleton, R. Jr.",
+	eprint = "hep-lat/9804028",
+	journal = "Phys. Rev.",
+	pages = "074501",
+	slaccitation = "%\%CITATION = HEP-LAT 9804028;\%\%",
+	title = "{Spontaneous flavor and parity breaking with {Wilson} fermions}",
+	volume = "D58",
+	year = "1998"
+}
+
+@article{Sharpe:2004ny,
+	author = "Sharpe, S. R. and Wu, Jackson M. S.",
+	eprint = "hep-lat/0411021",
+	journal = "Phys. Rev.",
+	pages = "074501",
+	slaccitation = "%\%CITATION = HEP-LAT 0411021;\%\%",
+	title = "{Twisted mass chiral perturbation theory at next-to-leading order}",
+	volume = "D71",
+	year = "2005"
+}
+
+@article{Sharpe:2004ps,
+	author = "Sharpe, S. R. and Wu, J. M. S.",
+	eprint = "hep-lat/0407025",
+	journal = "Phys. Rev.",
+	pages = "094029",
+	slaccitation = "%\%CITATION = HEP-LAT 0407025;\%\%",
+	title = "{The phase diagram of twisted mass lattice {QCD}}",
+	volume = "D70",
+	year = "2004"
+}
+
+@article{Sharpe:2005rq,
+	author = "Sharpe, Stephen R.",
+	eprint = "hep-lat/0509009",
+	journal = "Phys. Rev.",
+	pages = "074510",
+	slaccitation = "%\%CITATION = HEP-LAT 0509009;\%\%",
+	title = "{Observations on discretization errors in twisted-mass lattice QCD}",
+	volume = "D72",
+	year = "2005"
+}
+
+@article{Sheikholeslami:1985ij,
+	author = "Sheikholeslami, B. and Wohlert, R.",
+	journal = "Nucl. Phys.",
+	pages = "572",
+	slaccitation = "%\%CITATION = NUPHA,B259,572;\%\%",
+	title = "{Improved continuum limit lattice action for qcd with {Wilson} fermions}",
+	volume = "B259",
+	year = "1985"
+}
+
+@article{Shindler:2005vj,
+	author = "Shindler, Andrea",
+	eprint = "hep-lat/0511002",
+	journal = "PoS",
+	pages = "014",
+	slaccitation = "%\%CITATION = HEP-LAT 0511002;\%\%",
+	title = "{Twisted mass lattice {QCD}: Recent developments and results}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{Shindler:2006tm,
+	author = "Shindler, A.",
+	collaboration = "ETM",
+	eprint = "hep-ph/0611264",
+	slaccitation = "%\%CITATION = HEP-PH 0611264;\%\%",
+	title = "{Lattice QCD with light twisted quarks: First results}",
+	year = "2006"
+}
+
+@article{Shindler:2007vp,
+	archiveprefix = "arXiv",
+	author = "Shindler, A.",
+	doi = "10.1016/j.physrep.2008.03.001",
+	eprint = "0707.4093",
+	journal = "Phys. Rept.",
+	pages = "37--110",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0707.4093;\%\%",
+	title = "{Twisted mass lattice QCD}",
+	volume = "461",
+	year = "2008"
+}
+
+@article{Sleijpen:1996aa,
+	author = "Sleijpen, G. L. G. and der Vorst, H. A. Van",
+	journal = "SIAM Journal on Matrix Analysis and Applications",
+	pages = "401--425",
+	title = "{A Jacobi-Davidson iteration method for linear eigenvalue problems}",
+	volume = "17",
+	year = "1996"
+}
+
+@article{Sommer:1993ce,
+	author = "Sommer, R.",
+	eprint = "hep-lat/9310022",
+	journal = "Nucl. Phys.",
+	pages = "839--854",
+	slaccitation = "%\%CITATION = HEP-LAT 9310022;\%\%",
+	title = "{A New way to set the energy scale in lattice gauge theories and its applications to the static force and alpha-s in SU(2) Yang-Mills theory}",
+	volume = "B411",
+	year = "1994"
+}
+
+@article{Sonneveld:1989cgs,
+	address = "Philadelphia, PA, USA",
+	author = "Sonneveld, Peter",
+	issn = "0196-5204",
+	journal = "SIAM J. Sci. Stat. Comput.",
+	number = "1",
+	pages = "36--52",
+	publisher = "Society for Industrial and Applied Mathematics",
+	title = "{CGS, a fast Lanczos-type solver for nonsymmetric linear systems}",
+	volume = "10",
+	year = "1989"
+}
+
+@article{Sternbeck:2003gy,
+	author = "Sternbeck, A. and Ilgenfritz, E.-M. and Kerler, W. and M{\"u}ller-Preu{\ss}ker, M. and St{\"u}ben, H.",
+	eprint = "hep-lat/0309059",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "898--900",
+	slaccitation = "%\%CITATION = HEP-LAT 0309059;\%\%",
+	title = "{The {Aoki} phase for {N(f)} = 2 {Wilson} fermions revisited}",
+	volume = "129",
+	year = "2004"
+}
+
+@article{Sternbeck:2005tk,
+	author = "Sternbeck, A. and Ilgenfritz, E. -M. and Mueller-Preussker, M. and Schiller, A.",
+	eprint = "hep-lat/0506007",
+	journal = "Phys. Rev.",
+	pages = "014507",
+	slaccitation = "%\%CITATION = HEP-LAT/0506007;\%\%",
+	title = "{Going infrared in SU(3) Landau gauge gluodynamics}",
+	volume = "D72",
+	year = "2005"
+}
+
+@conference{Symanzik:1981hc,
+	author = "Symanzik, K.",
+	booktitle = "{Mathematical problems in theoretical physics}",
+	editor = "Schrader, R. and others",
+	journal = "Lecture Notes in Physics",
+	note = "Presented at 6th Int. Conf. on Mathematical Physics, Berlin, West Germany",
+	pages = "47--58",
+	title = "{Some topics in quantum field theory}",
+	volume = "153",
+	year = "1981"
+}
+
+@article{Symanzik:1983dc,
+	author = "Symanzik, K.",
+	journal = "Nucl. Phys.",
+	pages = "187",
+	slaccitation = "%\%CITATION = NUPHA,B226,187;\%\%",
+	title = "{Continuum limit and improved action in lattice theories. 1. principles and phi**4 theory}",
+	volume = "B226",
+	year = "1983"
+}
+
+@article{Symanzik:1983gh,
+	author = "Symanzik, K.",
+	journal = "Nucl. Phys.",
+	pages = "205",
+	slaccitation = "%\%CITATION = NUPHA,B226,205;\%\%",
+	title = "{Continuum limit and improved action in lattice theories. 2. O(N) nonlinear sigma model in perturbation theory}",
+	volume = "B226",
+	year = "1983"
+}
+
+@article{Takaishi:1996xj,
+	author = "Takaishi, T.",
+	journal = "Phys. Rev.",
+	pages = "1050--1053",
+	slaccitation = "%\%CITATION = PHRVA,D54,1050;\%\%",
+	title = "{Heavy quark potential and effective actions on blocked configurations}",
+	volume = "D54",
+	year = "1996"
+}
+
+@article{Takaishi:2005tz,
+	author = "Takaishi, T. and de Forcrand, P.",
+	eprint = "hep-lat/0505020",
+	slaccitation = "%\%CITATION = HEP-LAT 0505020;\%\%",
+	title = "{Testing and tuning new symplectic integrators for hybrid Monte Carlo algorithm in lattice QCD}",
+	year = "2005"
+}
+
+@article{Takeda:2004xh,
+	author = "Takeda, S. and others",
+	eprint = "hep-lat/0408010",
+	journal = "Phys. Rev.",
+	pages = "074510",
+	slaccitation = "%\%CITATION = HEP-LAT 0408010;\%\%",
+	title = "{A scaling study of the step scaling function in SU(3) gauge theory with improved gauge actions}",
+	volume = "D70",
+	year = "2004"
+}
+
+@article{Ukawa:2002pc,
+	author = "Ukawa, A.",
+	collaboration = "CP-PACS and JL{QCD}",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "195--196",
+	slaccitation = "%\%CITATION = NUPHZ,106,195;\%\%",
+	title = "{Computational cost of full {QCD} simulations experienced by {CP-PACS and JLQCD Collaborations}}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Urbach:2005ji,
+	author = "Urbach, C. and Jansen, K. and Shindler, A. and Wenger, U.",
+	eprint = "hep-lat/0506011",
+	journal = "Comput. Phys. Commun.",
+	pages = "87--98",
+	slaccitation = "%\%CITATION = HEP-LAT 0506011;\%\%",
+	title = "{{HMC} algorithm with multiple time scale integration and mass preconditioning}",
+	volume = "174",
+	year = "2006"
+}
+
+@article{Urbach:2007rt,
+	archiveprefix = "arXiv",
+	author = "Urbach, Carsten",
+	collaboration = "ETM",
+	eprint = "0710.1517",
+	journal = "PoS",
+	pages = "022",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.1517;\%\%",
+	title = "{Lattice QCD with two light Wilson quarks and maximally twisted mass}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{WalkerLoud:2005bt,
+	archiveprefix = "arXiv",
+	author = "Walker-Loud, Andre and Wu, Jackson M. S.",
+	doi = "10.1103/PhysRevD.72.014506",
+	eprint = "hep-lat/0504001",
+	journal = "Phys. Rev.",
+	pages = "014506",
+	slaccitation = "%\%CITATION = HEP-LAT/0504001;\%\%",
+	title = "{Nucleon and Delta masses in twisted mass chiral perturbation theory}",
+	volume = "D72",
+	year = "2005"
+}
+
+@article{Weinberg:1973un,
+	author = "Weinberg, S.",
+	journal = "Phys. Rev. Lett.",
+	pages = "494--497",
+	slaccitation = "%\%CITATION = PRLTA,31,494;\%\%",
+	title = "{Nonabelian gauge theories of the strong interactions}",
+	volume = "31",
+	year = "1973"
+}
+
+@article{Weinberg:1978kz,
+	author = "Weinberg, S.",
+	journal = "Physica",
+	pages = "327",
+	slaccitation = "%\%CITATION = PHYSA,A96,327;\%\%",
+	title = "{Phenomenological Lagrangians}",
+	volume = "A96",
+	year = "1979"
+}
+
+@book{Weinberg:1995mt,
+	author = "Weinberg, S.",
+	pages = "609",
+	publisher = "Cambridge University Press",
+	title = "{The Quantum theory of fields. Vol. 1: Foundations}",
+	year = "1995"
+}
+
+@article{Weisz:1982zw,
+	author = "Weisz, P.",
+	journal = "Nucl. Phys.",
+	pages = "1",
+	slaccitation = "%\%CITATION = NUPHA,B212,1;\%\%",
+	title = "{Continuum limit improved lattice action for pure {Yang-Mills} theory. 1}",
+	volume = "B212",
+	year = "1983"
+}
+
+@article{Weisz:1983bn,
+	author = "Weisz, P. and Wohlert, R.",
+	journal = "Nucl. Phys.",
+	pages = 397,
+	slaccitation = "%\%CITATION = NUPHA,B236,397;\%\%",
+	title = "{Continuum limit improved lattice action for pure {Yang-Mills} theory. 2}",
+	volume = "B236",
+	year = 1984
+}
+
+@article{Wennekers:2005wa,
+	author = "Wennekers, J. and Wittig, H.",
+	eprint = "hep-lat/0507026",
+	slaccitation = "%\%CITATION = HEP-LAT 0507026;\%\%",
+	title = "{On the renormalized scalar density in quenched QCD}",
+	year = "2005"
+}
+
+@article{Weyl:1918ib,
+	author = "Weyl, H.",
+	journal = "Sitzungsber. Preuss. Akad. Wiss. Berlin (Math. Phys. )",
+	pages = "465",
+	slaccitation = "%\%CITATION = SPWPA,1918,465;\%\%",
+	title = "{Gravitation und Elektrizit{\"a}t}",
+	volume = "1918",
+	year = "1918"
+}
+
+@article{Weyl:1929fm,
+	author = "Weyl, H.",
+	journal = "Z. Phys.",
+	pages = "330--352",
+	slaccitation = "%\%CITATION = ZEPYA,56,330;\%\%",
+	title = "{Electron and gravitation}",
+	volume = "56",
+	year = "1929"
+}
+
+@article{Wilson:1974sk,
+	author = "Wilson, K. G.",
+	journal = "Phys. Rev.",
+	pages = "2445--2459",
+	slaccitation = "%\%CITATION = PHRVA,D10,2445;\%\%",
+	title = "{Confinement of quarks}",
+	volume = "D10",
+	year = "1974"
+}
+
+@article{Wilson:1974sk,
+	author = "Wilson, K. G.",
+	journal = "Phys. Rev.",
+	pages = "2445--2459",
+	slaccitation = "%\%CITATION = PHRVA,D10,2445;\%\%",
+	title = "{Confinement of quarks}",
+	volume = "D10",
+	year = "1974"
+}
+
+@article{Wilson:1975mb,
+	author = "Wilson, K. G.",
+	journal = "Rev. Mod. Phys.",
+	pages = "773",
+	slaccitation = "%\%CITATION = RMPHA,47,773;\%\%",
+	title = "{The renormalization group: Critical phenomena and the kondo problem}",
+	volume = "47",
+	year = "1975"
+}
+
+@article{Wilson:1975mb,
+	author = "Wilson, K. G.",
+	journal = "Rev. Mod. Phys.",
+	pages = "773",
+	slaccitation = "%\%CITATION = RMPHA,47,773;\%\%",
+	title = "{The renormalization group: Critical phenomena and the kondo problem}",
+	volume = "47",
+	year = "1975"
+}
+
+@article{Wolff:2003sm,
+	author = "Wolff, U.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0306017",
+	journal = "Comput. Phys. Commun.",
+	pages = "143--153",
+	slaccitation = "%\%CITATION = HEP-LAT 0306017;\%\%",
+	title = "{Monte Carlo errors with less errors}",
+	volume = "156",
+	year = "2004"
+}
+
+@article{Yang:1954ek,
+	author = "Yang, C.-N. and Mills, R. L.",
+	journal = "Phys. Rev.",
+	pages = "191--195",
+	slaccitation = "%\%CITATION = PHRVA,96,191;\%\%",
+	title = "{Conservation of isotopic spin and isotopic gauge invariance}",
+	volume = "96",
+	year = "1954"
+}
+
+@article{Yoshie:2008aw,
+	archiveprefix = "arXiv",
+	author = "Yoshie, Tomoteru",
+	eprint = "0812.0849",
+	journal = "PoS",
+	pages = "019",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0812.0849;\%\%",
+	title = "{Making use of the International Lattice Data Grid}",
+	volume = "LATTICE2008",
+	year = "2008"
+}
+
+@article{Zweig:1964jf,
+	author = "Zweig, G.",
+	note = "CERN-TH-412",
+	title = "{An SU(3) model for strong interaction symmetry and its breaking. 2}"
+}
+
+@article{cln:web,
+	eprint = "http://www.ginac.de/CLN/"
+}
+
+@article{deForcrand:1995bs,
+	author = "de Forcrand, P.",
+	eprint = "hep-lat/9509082",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "228--235",
+	slaccitation = "%\%CITATION = HEP-LAT 9509082;\%\%",
+	title = "{Progress on lattice {QCD} algorithms}",
+	volume = "47",
+	year = "1996"
+}
+
+@article{deForcrand:1996bx,
+	author = "de Forcrand, P. and others",
+	collaboration = "{QCD}-TARO",
+	eprint = "hep-lat/9608094",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "938--941",
+	slaccitation = "%\%CITATION = HEP-LAT 9608094;\%\%",
+	title = "{Search for effective lattice action of pure {QCD}}",
+	volume = "53",
+	year = "1997"
+}
+
+@article{deForcrand:1996ck,
+	author = "de Forcrand, P. and Takaishi, T.",
+	eprint = "hep-lat/9608093",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "968--970",
+	slaccitation = "%\%CITATION = HEP-LAT 9608093;\%\%",
+	title = "{Fast fermion Monte Carlo}",
+	volume = "53",
+	year = "1997"
+}
+
+@article{etmc:asqr,
+	archiveprefix = "arXiv",
+	author = "Frezzotti, R. and others",
+	eprint = "0710.2492",
+	journal = "PoS",
+	pages = "277",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.2492;\%\%",
+	title = "{$O(a^2)$ cutoff effects in Wilson fermion simulations}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{ildg:web,
+	author = "working groups, ILDG",
+	eprint = "http://cssm.sasr.edu.au/ildg/"
+}
+
+@book{kleinert:1,
+	author = "Kleinert, H.",
+	edition = "2nd Edition",
+	publisher = "World Scientific, Singapore",
+	title = "{Path integrals in quantum mechanics, statistics and polymer physics}",
+	year = "1995"
+}
+
+@article{lapack:web,
+	eprint = "http://www.netlib.org/lapack/"
+}
+
+@article{lime:web,
+	author = "USQCD",
+	eprint = "http://usqcd.jlab.org/usqcd-docs/c-lime/",
+	title = "{c-lime library}"
+}
+
+@book{meister:1999,
+	author = "Meister, Andreas",
+	optaddress = "",
+	optannote = "",
+	optedition = "",
+	optkey = "",
+	optmonth = "",
+	optnote = "",
+	optnumber = "",
+	optseries = "",
+	optvolume = "",
+	publisher = "Vieweg",
+	title = "{Numerik linearer Gleichungssysteme}",
+	year = "1999"
+}
+
+@manual{minuit,
+	note = "\\seal.web.cern.ch/seal/snapshot/work-packages/mathlibs/minuit/home.html",
+	title = "{MINUIT home page}"
+}
+
+@article{mpi:web,
+	eprint = "http://www-unix.mcs.anl.gov/mpi/",
+	title = "{The message passing interface standard}"
+}
+
+@phdthesis{orth:2004phd,
+	author = "Orth, B.",
+	optaddress = "",
+	optannote = "",
+	optkey = "",
+	optmonth = "",
+	optnote = "",
+	opttype = "",
+	school = "Bergische Universit{\"a}t Wuppertal",
+	title = "{Finite size effects in lattice {QCD} with dynamical {Wilson} fermions}",
+	year = "2004"
+}
+
+@phdthesis{pleiter:phd,
+	author = "Pleiter, D.",
+	school = "Freie {U}niversit{\"a}t {B}erlin",
+	title = "{XXX}",
+	year = "2001"
+}
+
+@manual{root,
+	note = "root.cern.ch/",
+	title = "{The ROOT system home page}"
+}
+
+@book{saad:2003a,
+	author = "Saad, Y.",
+	edition = "2nd",
+	publisher = "SIAM",
+	title = "{Iterative Methods for sparse linear systems}",
+	year = "2003"
+}
+
+@article{scidac,
+	eprint = "http://www.scidac.gov/"
+}
+
+@mastersthesis{urbach:2002aa,
+	author = "Urbach, C.",
+	school = "Freie Universit{\"a}t Berlin, Fachbereich Physik",
+	title = "{Untersuchung der {R}eversibilit{\"a}tsverletzung im {H}ybrid {M}onte {C}arlo {A}lgorithmus}",
+	year = "2002"
+}
+
+@article{'tHooft:1971fh,
+	author = "{'t Hooft}, G.",
+	journal = "Nucl. Phys.",
+	pages = "173--199",
+	slaccitation = "%\%CITATION = NUPHA,B33,173;\%\%",
+	title = "{Renormalization of massless Yang-Mills fields}",
+	volume = "B33",
+	year = "1971"
+}
+
+@article{'tHooft:1971rn,
+	author = "{'t Hooft}, G.",
+	journal = "Nucl. Phys.",
+	pages = "167--188",
+	slaccitation = "%\%CITATION = NUPHA,B35,167;\%\%",
+	title = "{Renormalizable lagrangians for massive Yang-Mills fields}",
+	volume = "B35",
+	year = "1971"
+}
+
+@unpublished{'tHooft:1972aa,
+	author = "{'t Hooft}, G.",
+	note = "Unpublished remarks at the 1972 Marseille Conference on Yang-Mills Fields",
+	title = "{}"
+}
+
+@article{'tHooft:1972fi,
+	author = "{'t Hooft}, G. and Veltman, M. J. G.",
+	journal = "Nucl. Phys.",
+	pages = "189--213",
+	slaccitation = "%\%CITATION = NUPHA,B44,189;\%\%",
+	title = "{Regularization and renormalization of gauge fields}",
+	volume = "B44",
+	year = "1972"
+}
+
+@article{Abdel-Rehim:2004gx,
+	author = "Abdel-Rehim, A. M. and Lewis, R.",
+	eprint = "hep-lat/0410047",
+	journal = "Phys. Rev.",
+	pages = "014503",
+	slaccitation = "%\%CITATION = HEP-LAT 0410047;\%\%",
+	title = "{Twisted mass {QCD} for the pion electromagnetic form factor}",
+	volume = "D71",
+	year = "2005"
+}
+
+@article{Abdel-Rehim:2005gz,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.",
+	eprint = "hep-lat/0503007",
+	journal = "Phys. Rev.",
+	pages = "094505",
+	slaccitation = "%\%CITATION = HEP-LAT/0503007;\%\%",
+	title = "{Spectrum of quenched twisted mass lattice QCD at maximal twist}",
+	volume = "D71",
+	year = "2005"
+}
+
+@article{AbdelRehim:2004sp,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy",
+	eprint = "hep-lat/0408033",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "299--301",
+	slaccitation = "%\%CITATION = HEP-LAT/0408033;\%\%",
+	title = "{Pion form factor with twisted mass QCD}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{AbdelRehim:2005gq,
+	author = "Abdel-Rehim, A. M. and Lewis, R. and Woloshyn, R. M.",
+	journal = "Int. J. Mod. Phys.",
+	pages = "6159--6168",
+	slaccitation = "%\%CITATION = IMPAE,A20,6159;\%\%",
+	title = "{Twisted mass lattice QCD and hadron phenomenology}",
+	volume = "A20",
+	year = "2005"
+}
+
+@article{AbdelRehim:2005gz,
+	archiveprefix = "arXiv",
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.",
+	doi = "10.1103/PhysRevD.71.094505",
+	eprint = "hep-lat/0503007",
+	journal = "Phys. Rev.",
+	pages = "094505",
+	slaccitation = "%\%CITATION = HEP-LAT/0503007;\%\%",
+	title = "{Spectrum of quenched twisted mass lattice QCD at maximal twist}",
+	volume = "D71",
+	year = "2005"
+}
+
+@article{AbdelRehim:2005qv,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.",
+	eprint = "hep-lat/0509056",
+	journal = "PoS",
+	pages = "032",
+	slaccitation = "%\%CITATION = HEP-LAT/0509056;\%\%",
+	title = "{The hadron spectrum from twisted mass QCD with a strange quark}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{AbdelRehim:2005yx,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M.",
+	eprint = "hep-lat/0509098",
+	journal = "PoS",
+	pages = "051",
+	slaccitation = "%\%CITATION = HEP-LAT/0509098;\%\%",
+	title = "{Maximal twist and the spectrum of quenched twisted mass lattice QCD}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{AbdelRehim:2006qu,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Petry, Robert G. and Woloshyn, R. M.",
+	eprint = "hep-lat/0610004",
+	journal = "PoS",
+	pages = "164",
+	slaccitation = "%\%CITATION = HEP-LAT/0610004;\%\%",
+	title = "{The spectrum of tmLQCD with quark and link smearing}",
+	volume = "LAT2006",
+	year = "2006"
+}
+
+@article{AbdelRehim:2006ra,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. and Wu, Jackson M. S.",
+	eprint = "hep-lat/0610090",
+	journal = "Eur. Phys. J.",
+	pages = "773--776",
+	slaccitation = "%\%CITATION = HEP-LAT/0610090;\%\%",
+	title = "{Lattice QCD with a twisted mass term and a strange quark}",
+	volume = "A31",
+	year = "2007"
+}
+
+@article{AbdelRehim:2006ve,
+	author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. and Wu, Jackson M. S.",
+	eprint = "hep-lat/0601036",
+	journal = "Phys. Rev.",
+	pages = "014507",
+	slaccitation = "%\%CITATION = HEP-LAT/0601036;\%\%",
+	title = "{Strange quarks in quenched twisted mass lattice QCD}",
+	volume = "D74",
+	year = "2006"
+}
+
+@article{Adler:1974gd,
+	author = "Adler, Stephen L.",
+	journal = "Phys. Rev.",
+	pages = "3714",
+	slaccitation = "%\%CITATION = PHRVA,D10,3714;\%\%",
+	title = "{Some Simple Vacuum Polarization Phenomenology: e+ e- $\to$ Hadrons: The mu - Mesic Atom x-Ray Discrepancy and (g-2) of the Muon}",
+	volume = "D10",
+	year = "1974"
+}
+
+@article{Albanese:1987ds,
+	author = "Albanese, M. and others",
+	collaboration = "APE",
+	journal = "Phys. Lett.",
+	pages = "163",
+	slaccitation = "%\%CITATION = PHLTA,B192,163;\%\%",
+	title = "{Glueball masses and string tension in lattice {QCD}}",
+	volume = "B192",
+	year = "1987"
+}
+
+@article{Alexandrou:2008tn,
+	archiveprefix = "arXiv",
+	author = "Alexandrou, C. and others",
+	collaboration = "ETM",
+	eprint = "0803.3190",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0803.3190;\%\%",
+	title = "{Light baryon masses with dynamical twisted mass fermions}",
+	year = "2008"
+}
+
+@article{AliKhan:2000iv,
+	author = "{Ali Khan}, A. and others",
+	collaboration = "CP-PACS",
+	eprint = "hep-lat/0007014",
+	journal = "Phys. Rev.",
+	pages = "114504",
+	slaccitation = "%\%CITATION = HEP-LAT 0007014;\%\%",
+	title = "{Chiral properties of domain-wall quarks in quenched {QCD}}",
+	volume = "D63",
+	year = "2001"
+}
+
+@article{AliKhan:2003br,
+	author = "{Ali Khan}, A. and others",
+	collaboration = "QCDSF",
+	eprint = "hep-lat/0303026",
+	journal = "Phys. Lett.",
+	pages = "235--240",
+	slaccitation = "%\%CITATION = HEP-LAT 0303026;\%\%",
+	title = "{Accelerating the hybrid Monte Carlo algorithm}",
+	volume = "B564",
+	year = "2003"
+}
+
+@article{AliKhan:2003mu,
+	author = "{Ali Khan}, A. and others",
+	eprint = "hep-lat/0309078",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "853--855",
+	slaccitation = "%\%CITATION = HEP-LAT 0309078;\%\%",
+	title = "{Accelerating Hasenbusch's acceleration of hybrid Monte Carlo}",
+	volume = "129",
+	year = "2004"
+}
+
+@article{Allton:1993wc,
+	author = "Allton, C. R. and others",
+	collaboration = "UK{QCD}",
+	eprint = "hep-lat/9303009",
+	journal = "Phys. Rev.",
+	pages = "5128--5137",
+	slaccitation = "%\%CITATION = HEP-LAT 9303009;\%\%",
+	title = "{Gauge invariant smearing and matrix correlators using {Wilson} fermions at Beta = 6.2}",
+	volume = "D47",
+	year = "1993"
+}
+
+@article{Allton:2004qq,
+	author = "Allton, C. R. and others",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0403007",
+	journal = "Phys. Rev.",
+	pages = "014501",
+	slaccitation = "%\%CITATION = HEP-LAT/0403007;\%\%",
+	title = "{Improved Wilson QCD simulations with light quark masses}",
+	volume = "D70",
+	year = "2004"
+}
+
+@article{Aoki:1984qi,
+	author = "Aoki, S.",
+	journal = "Phys. Rev.",
+	pages = "2653",
+	slaccitation = "%\%CITATION = PHRVA,D30,2653;\%\%",
+	title = "{New phase structure for lattice {QCD} with {Wilson} fermions}",
+	volume = "D30",
+	year = "1984"
+}
+
+@article{Aoki:1985jj,
+	author = "Aoki, S. and Higashijima, K.",
+	journal = "Prog. Theor. Phys.",
+	pages = "521",
+	slaccitation = "%\%CITATION = PTPKA,76,521;\%\%",
+	title = "{The recovery of the chiral symmetry in lattice {Gross-Neveu} model}",
+	volume = "76",
+	year = "1986"
+}
+
+@article{Aoki:1986ua,
+	author = "Aoki, Sinya",
+	journal = "Phys. Lett.",
+	pages = "140",
+	slaccitation = "%\%CITATION = PHLTA,B190,140;\%\%",
+	title = "{NUMERICAL EVIDENCE FOR A PARITY VIOLATING PHASE IN LATTICE QCD WITH WILSON FERMION}",
+	volume = "B190",
+	year = "1987"
+}
+
+@article{Aoki:1986xr,
+	author = "Aoki, S.",
+	journal = "Phys. Rev. Lett.",
+	pages = "3136",
+	slaccitation = "%\%CITATION = PRLTA,57,3136;\%\%",
+	title = "{A solution to the {U(1)} problem on a lattice}",
+	volume = "57",
+	year = "1986"
+}
+
+@article{Aoki:1993vs,
+	author = "Aoki, S. and Boettcher, S. and Gocksch, A.",
+	eprint = "hep-lat/9312084",
+	journal = "Phys. Lett.",
+	pages = "157--164",
+	slaccitation = "%\%CITATION = HEP-LAT 9312084;\%\%",
+	title = "{Spontaneous breaking of flavor symmetry and parity in the Nambu-Jona-Lasinio model with {Wilson} fermions}",
+	volume = "B331",
+	year = "1994"
+}
+
+@article{Aoki:1995ft,
+	author = "Aoki, S.",
+	eprint = "hep-lat/9509008",
+	journal = "Prog. Theor. Phys. Suppl.",
+	pages = "179--186",
+	slaccitation = "%\%CITATION = HEP-LAT 9509008;\%\%",
+	title = "{On the phase structure of {QCD} with {Wilson} fermions}",
+	volume = "122",
+	year = "1996"
+}
+
+@article{Aoki:1995yf,
+	author = "Aoki, S. and Ukawa, A. and Umemura, T.",
+	eprint = "hep-lat/9508008",
+	journal = "Phys. Rev. Lett.",
+	pages = "873--876",
+	slaccitation = "%\%CITATION = HEP-LAT 9508008;\%\%",
+	title = "{Finite temperature phase structure of lattice {QCD} with {Wilson} quark action}",
+	volume = "76",
+	year = "1996"
+}
+
+@article{Aoki:1997fm,
+	author = "Aoki, S.",
+	eprint = "hep-lat/9707020",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "206--219",
+	slaccitation = "%\%CITATION = HEP-LAT 9707020;\%\%",
+	title = "{Phase structure of lattice {QCD} with {Wilson} fermion at finite temperature}",
+	volume = "60A",
+	year = "1998"
+}
+
+@article{Aoki:2001xq,
+	author = "Aoki, S. and others",
+	collaboration = "JL{QCD}",
+	eprint = "hep-lat/0110088",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "263--265",
+	slaccitation = "%\%CITATION = HEP-LAT 0110088;\%\%",
+	title = "{Non-trivial phase structure of {N(f)} = 3 {QCD} with {O(a)}-improved {Wilson} fermion at zero temperature}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Aoki:2002vt,
+	author = "Aoki, Y. and others",
+	eprint = "hep-lat/0211023",
+	journal = "Phys. Rev.",
+	pages = "074504",
+	slaccitation = "%\%CITATION = HEP-LAT 0211023;\%\%",
+	title = "{Domain wall fermions with improved gauge actions}",
+	volume = "D69",
+	year = "2004"
+}
+
+@article{Aoki:2004iq,
+	author = "Aoki, S. and others",
+	collaboration = "JL{QCD}",
+	eprint = "hep-lat/0409016",
+	slaccitation = "%\%CITATION = HEP-LAT 0409016;\%\%",
+	title = "{Bulk first-order phase transition in three-flavor lattice {QCD} with {O(a)}-improved {Wilson} fermion action at zero temperature}",
+	year = "2004"
+}
+
+@article{Aoki:2004ta,
+	author = "Aoki, Sinya and B{\"a}r, Oliver",
+	eprint = "hep-lat/0409006",
+	journal = "Phys. Rev.",
+	pages = "116011",
+	slaccitation = "%\%CITATION = HEP-LAT 0409006;\%\%",
+	title = "{Twisted-mass {QCD}, {O}(a) improvement and {Wilson} chiral perturbation theory}",
+	volume = "D70",
+	year = "2004"
+}
+
+@article{Aoki:2005ii,
+	author = "Aoki, S. and B{\"a}r, O.",
+	eprint = "hep-lat/0509002",
+	slaccitation = "%\%CITATION = HEP-LAT 0509002;\%\%",
+	title = "{Determining the low energy parameters of {Wilson} chiral perturbation theory}",
+	year = "2005"
+}
+
+@article{Arnold:2003sx,
+	author = "Arnold, Guido and others",
+	eprint = "hep-lat/0311025",
+	slaccitation = "%\%CITATION = HEP-LAT 0311025;\%\%",
+	title = "{Numerical methods for the QCD overlap operator. II: Optimal Krylov subspace methods}",
+	year = "2003"
+}
+
+@article{Atiyah:1971rm,
+	author = "Atiyah, M. F. and Singer, I. M.",
+	journal = "Annals Math.",
+	pages = "139--149",
+	slaccitation = "%\%CITATION = ANMAA,93,139;\%\%",
+	title = "{The Index of elliptic operators. 5}",
+	volume = "93",
+	year = "1971"
+}
+
+@article{Aubin:2006cc,
+	author = "Aubin, C. and Blum, T.",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "251--255",
+	slaccitation = "%\%CITATION = NUPHZ,162,251;\%\%",
+	title = "{Hadronic contributions to the muon g-2 from the lattice}",
+	volume = "162",
+	year = "2006"
+}
+
+@article{Aubin:2006xv,
+	author = "Aubin, C. and Blum, T.",
+	eprint = "hep-lat/0608011",
+	journal = "Phys. Rev.",
+	pages = "114502",
+	slaccitation = "%\%CITATION = HEP-LAT/0608011;\%\%",
+	title = "{Calculating the hadronic vacuum polarization and leading hadronic contribution to the muon anomalous magnetic moment with improved staggered quarks}",
+	volume = "D75",
+	year = "2007"
+}
+
+@article{BAGEL,
+	author = "Boyle, P.A.",
+	eprint = "http://www.ph.ed.ac.uk/\~{ }paboyle/bagel/Bagel.html",
+	year = 2005
+}
+
+@article{Baikov:2004ku,
+	author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "243--246",
+	slaccitation = "%\%CITATION = NUPHZ,135,243;\%\%",
+	title = "{Vacuum polarization in pQCD: First complete O(alpha(s)**4) result}",
+	volume = "135",
+	year = "2004"
+}
+
+@article{Baikov:2005rw,
+	author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
+	eprint = "hep-ph/0511063",
+	journal = "Phys. Rev. Lett.",
+	pages = "012003",
+	slaccitation = "%\%CITATION = HEP-PH/0511063;\%\%",
+	title = "{Scalar correlator at O(alpha(s)**4), Higgs decay into b-quarks and bounds on the light quark masses}",
+	volume = "96",
+	year = "2006"
+}
+
+@article{Baikov:2008jh,
+	archiveprefix = "arXiv",
+	author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.",
+	eprint = "0801.1821",
+	primaryclass = "hep-ph",
+	slaccitation = "%\%CITATION = ARXIV:0801.1821;\%\%",
+	title = "{Hadronic Z- and tau-Decays in Order $\alpha_s^4$}",
+	year = "2008"
+}
+
+@article{Bali:2000vr,
+	author = "Bali, G. S. and others",
+	collaboration = "TXL",
+	eprint = "hep-lat/0003012",
+	journal = "Phys. Rev.",
+	pages = "054503",
+	slaccitation = "%\%CITATION = HEP-LAT 0003012;\%\%",
+	title = "{Static potentials and glueball masses from {QCD} simulations with {Wilson} sea quarks}",
+	volume = "D62",
+	year = "2000"
+}
+
+@article{Bali:2004pb,
+	author = "Bali, G. S. and others",
+	eprint = "hep-lat/0409137",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "609--611",
+	slaccitation = "%\%CITATION = HEP-LAT 0409137;\%\%",
+	title = "{String breaking with dynamical {Wilson} fermions}",
+	volume = "140",
+	year = "2004"
+}
+
+@article{Bali:2005fu,
+	author = "Bali, G. S. and Neff, H. and Duessel, T. and Lippert, T. and Schilling, K.",
+	collaboration = "SESAM",
+	eprint = "hep-lat/0505012",
+	journal = "Phys. Rev.",
+	pages = "114513",
+	slaccitation = "%\%CITATION = HEP-LAT 0505012;\%\%",
+	title = "{Observation of string breaking in {QCD}}",
+	volume = "D71",
+	year = "2005"
+}
+
+@article{Bar:2006zj,
+	author = "B{\"a}r, O. and Jansen, K. and Schaefer, S. and Scorzato, L. and Shindler, A.",
+	eprint = "hep-lat/0609039",
+	slaccitation = "%\%CITATION = HEP-LAT 0609039;\%\%",
+	title = "{Overlap fermions on a twisted mass sea}",
+	year = "2006"
+}
+
+@article{Baxter:1993bv,
+	author = "Baxter, R. M. and others",
+	collaboration = "UK{QCD}",
+	eprint = "hep-lat/9308020",
+	journal = "Phys. Rev.",
+	pages = "1594--1605",
+	slaccitation = "%\%CITATION = HEP-LAT 9308020;\%\%",
+	title = "{Quenched heavy light decay constants}",
+	volume = "D49",
+	year = "1994"
+}
+
+@article{Beane:2004tw,
+	archiveprefix = "arXiv",
+	author = "Beane, Silas R.",
+	doi = "10.1103/PhysRevD.70.034507",
+	eprint = "hep-lat/0403015",
+	journal = "Phys. Rev.",
+	pages = "034507",
+	slaccitation = "%\%CITATION = HEP-LAT/0403015;\%\%",
+	title = "{Nucleon masses and magnetic moments in a finite volume}",
+	volume = "D70",
+	year = "2004"
+}
+
+@article{Becher:1999he,
+	author = "Becher, Thomas and Leutwyler, H.",
+	eprint = "hep-ph/9901384",
+	journal = "Eur. Phys. J.",
+	pages = "643--671",
+	slaccitation = "%\%CITATION = HEP-PH/9901384;\%\%",
+	title = "{Baryon chiral perturbation theory in manifestly Lorentz invariant form}",
+	volume = "C9",
+	year = "1999"
+}
+
+@article{Bietenholz:2004sa,
+	author = "Bietenholz, W. and others",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0409109",
+	slaccitation = "%\%CITATION = HEP-LAT 0409109;\%\%",
+	title = "{Comparison between overlap and twisted mass fermions towards the chiral limit}",
+	year = "2004"
+}
+
+@article{Bietenholz:2004wv,
+	author = "Bietenholz, W. and others",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0411001",
+	journal = "JHEP",
+	pages = "044",
+	slaccitation = "%\%CITATION = HEP-LAT 0411001;\%\%",
+	title = "{Going chiral: Overlap versus twisted mass fermions}",
+	volume = "12",
+	year = "2004"
+}
+
+@article{Blossier:2007vv,
+	archiveprefix = "arXiv",
+	author = "Blossier, B. and others",
+	collaboration = "ETM",
+	eprint = "0709.4574",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0709.4574;\%\%",
+	title = "{Light quark masses and pseudoscalar decay constants from Nf=2 Lattice QCD with twisted mass fermions}",
+	year = "2007"
+}
+
+@article{Blum:1994eh,
+	author = "Blum, Tom and others",
+	eprint = "hep-lat/9404006",
+	journal = "Phys. Rev.",
+	pages = "3377--3381",
+	slaccitation = "%\%CITATION = HEP-LAT 9404006;\%\%",
+	title = "{QCD thermodynamics with Wilson quarks at large kappa}",
+	volume = "D50",
+	year = "1994"
+}
+
+@article{Blum:2000kn,
+	author = "Blum, T. and others",
+	eprint = "hep-lat/0007038",
+	journal = "Phys. Rev.",
+	pages = "074502",
+	slaccitation = "%\%CITATION = HEP-LAT 0007038;\%\%",
+	title = "{Quenched lattice {QCD} with domain wall fermions and the chiral limit}",
+	volume = "D69",
+	year = "2004"
+}
+
+@article{Bodin:2005gg,
+	author = "Bodin, F. and others",
+	collaboration = "ApeNEXT",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "176--182",
+	slaccitation = "%\%CITATION = NUPHZ,140,176;\%\%",
+	title = "{The {apeNEXT} project}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{Bolder:2000un,
+	author = "Bolder, B. and others",
+	eprint = "hep-lat/0005018",
+	journal = "Phys. Rev.",
+	pages = "074504",
+	slaccitation = "%\%CITATION = HEP-LAT 0005018;\%\%",
+	title = "{A high precision study of the Q anti-Q potential from {Wilson} loops in the regime of string breaking}",
+	volume = "D63",
+	year = "2001"
+}
+
+@article{Boucaud:2007uk,
+	author = "Boucaud, Ph. and others",
+	collaboration = "ETM",
+	eprint = "hep-lat/0701012",
+	slaccitation = "%\%CITATION = HEP-LAT 0701012;\%\%",
+	title = "{Dynamical twisted mass fermions with light quarks}",
+	year = "2007"
+}
+
+@article{Boucaud:2008xu,
+	archiveprefix = "arXiv",
+	author = "Boucaud, Ph. and others",
+	collaboration = "ETM",
+	doi = "10.1016/j.cpc.2008.06.013",
+	eprint = "0803.0224",
+	journal = "Comput. Phys. Commun.",
+	pages = "695--715",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0803.0224;\%\%",
+	title = "{Dynamical Twisted Mass Fermions with Light Quarks: Simulation and Analysis Details}",
+	volume = "179",
+	year = "2008"
+}
+
+@article{Boughezal:2006px,
+	author = "Boughezal, R. and Czakon, M. and Schutzmeier, T.",
+	eprint = "hep-ph/0605023",
+	journal = "Phys. Rev.",
+	pages = "074006",
+	slaccitation = "%\%CITATION = HEP-PH/0605023;\%\%",
+	title = "{Charm and bottom quark masses from perturbative QCD}",
+	volume = "D74",
+	year = "2006"
+}
+
+@article{Boyle:2005fb,
+	author = "Boyle, P. A. and others",
+	journal = "J. Phys. Conf. Ser.",
+	pages = "129--139",
+	slaccitation = "%\%CITATION = 00462,16,129;\%\%",
+	title = "{{QCDOC}: Project status and first results}",
+	volume = "16",
+	year = "2005"
+}
+
+@article{Brower:1994er,
+	author = "Brower, R. C. and Levi, A. R. and Orginos, K.",
+	eprint = "hep-lat/9412004",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "855--857",
+	slaccitation = "%\%CITATION = HEP-LAT 9412004;\%\%",
+	title = "{Extrapolation methods for the Dirac inverter in hybrid Monte Carlo}",
+	volume = "42",
+	year = "1995"
+}
+
+@article{Brower:1995vx,
+	author = "Brower, R. C. and Ivanenko, T. and Levi, A. R. and Orginos, K. N.",
+	eprint = "hep-lat/9509012",
+	journal = "Nucl. Phys.",
+	pages = "353--374",
+	slaccitation = "%\%CITATION = HEP-LAT 9509012;\%\%",
+	title = "{Chronological inversion method for the Dirac matrix in hybrid Monte Carlo}",
+	volume = "B484",
+	year = "1997"
+}
+
+@article{Bunk:1995uv,
+	author = "Bunk, B. and others",
+	eprint = "hep-lat/9411016",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "49--55",
+	slaccitation = "%\%CITATION = HEP-LAT 9411016;\%\%",
+	title = "{A New simulation algorithm for lattice {QCD} with dynamical quarks}",
+	volume = "42",
+	year = "1995"
+}
+
+@article{Bunk:1998rm,
+	archiveprefix = "arXiv",
+	author = "Bunk, B. and Elser, Stephan and Frezzotti, R. and Jansen, K.",
+	doi = "10.1016/S0010-4655(99)00198-8",
+	eprint = "hep-lat/9805026",
+	journal = "Comput. Phys. Commun.",
+	pages = "95--109",
+	slaccitation = "%\%CITATION = HEP-LAT/9805026;\%\%",
+	title = "{Ordering monomial factors of polynomials in the product representation}",
+	volume = "118",
+	year = "1999"
+}
+
+@article{Bunk:1998rm,
+	author = "Bunk, B. and Elser, S. and Frezzotti, R. and Jansen, K.",
+	eprint = "hep-lat/9805026",
+	journal = "Comput. Phys. Commun.",
+	pages = "95--109",
+	slaccitation = "%\%CITATION = HEP-LAT 9805026;\%\%",
+	title = "{Ordering monomial factors of polynomials in the product representation}",
+	volume = "118",
+	year = "1999"
+}
+
+@article{Burrage:1998a,
+	author = "Burrage, K. and Erhel, J.",
+	journal = "Num. Lin. Alg. with Appl.",
+	pages = "101--121",
+	title = "{On the performance of various adaptive preconditioned GMRES strategies}",
+	volume = "5",
+	year = "1998"
+}
+
+@article{Campbell:1987nv,
+	author = "Campbell, N. A. and Huntley, A. and Michael, C.",
+	journal = "Nucl. Phys.",
+	pages = "51",
+	slaccitation = "%\%CITATION = NUPHA,B306,51;\%\%",
+	title = "{Heavy quark potentials and hybrid mesons from SU(3) lattice gauge theory}",
+	volume = "B306",
+	year = "1988"
+}
+
+@article{Capitani:2005jp,
+	author = "Capitani, S. and others",
+	eprint = "hep-lat/0511013",
+	journal = "Phys. Lett.",
+	pages = "520--526",
+	slaccitation = "%\%CITATION = HEP-LAT 0511013;\%\%",
+	title = "{Parton distribution functions with twisted mass fermions}",
+	volume = "B639",
+	year = "2006"
+}
+
+@article{Chen:2003im,
+	author = "Chen, Y. and others",
+	eprint = "hep-lat/0304005",
+	journal = "Phys. Rev.",
+	pages = "034502",
+	slaccitation = "%\%CITATION = HEP-LAT 0304005;\%\%",
+	title = "{Chiral logarithms in quenched {QCD}}",
+	volume = "D70",
+	year = "2004"
+}
+
+@book{Cheng:2000ct,
+	author = "Cheng, T. P. and Li, L. F.",
+	edition = "",
+	pages = "306",
+	publisher = "Oxford, UK: Clarendon",
+	title = "{Gauge theory of elementary particle physics: Problems and solutions}",
+	year = "2000"
+}
+
+@article{Chetyrkin:1990kr,
+	author = "Chetyrkin, K. G. and K{\"u}hn, Johann H.",
+	journal = "Phys. Lett.",
+	pages = "359--364",
+	slaccitation = "%\%CITATION = PHLTA,B248,359;\%\%",
+	title = "{Mass corrections to the Z decay rate}",
+	volume = "B248",
+	year = "1990"
+}
+
+@article{Chetyrkin:1996cf,
+	author = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.",
+	eprint = "hep-ph/9606230",
+	journal = "Nucl. Phys.",
+	pages = "213--240",
+	slaccitation = "%\%CITATION = HEP-PH/9606230;\%\%",
+	title = "{Three-loop polarization function and O(alpha(s)**2) corrections to the production of heavy quarks}",
+	volume = "B482",
+	year = "1996"
+}
+
+@article{Chetyrkin:1997mb,
+	author = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.",
+	eprint = "hep-ph/9705254",
+	journal = "Nucl. Phys.",
+	pages = "40--64",
+	slaccitation = "%\%CITATION = HEP-PH/9705254;\%\%",
+	title = "{Heavy quark current correlators to O(alpha(s)**2)}",
+	volume = "B505",
+	year = "1997"
+}
+
+@article{Chetyrkin:1998ix,
+	author = "Chetyrkin, K. G. and Harlander, R. and Steinhauser, M.",
+	eprint = "hep-ph/9801432",
+	journal = "Phys. Rev.",
+	pages = "014012",
+	slaccitation = "%\%CITATION = HEP-PH/9801432;\%\%",
+	title = "{Singlet polarization functions at O(alpha(s)**2)}",
+	volume = "D58",
+	year = "1998"
+}
+
+@article{Chetyrkin:2000zk,
+	author = "Chetyrkin, K. G. and Harlander, R. V. and K{\"u}hn, Johann H.",
+	eprint = "hep-ph/0005139",
+	journal = "Nucl. Phys.",
+	pages = "56--72",
+	slaccitation = "%\%CITATION = HEP-PH/0005139;\%\%",
+	title = "{Quartic mass corrections to R(had) at O(alpha(s)**3)}",
+	volume = "B586",
+	year = "2000"
+}
+
+@article{Chetyrkin:2006xg,
+	author = "Chetyrkin, K. G. and K{\"u}hn, J. H. and Sturm, C.",
+	eprint = "hep-ph/0604234",
+	journal = "Eur. Phys. J.",
+	pages = "107--110",
+	slaccitation = "%\%CITATION = HEP-PH/0604234;\%\%",
+	title = "{Four-loop moments of the heavy quark vacuum polarization function in perturbative QCD}",
+	volume = "C48",
+	year = "2006"
+}
+
+@article{Chiarappa:2004ry,
+	archiveprefix = "arXiv",
+	author = "Chiarappa, T. and others",
+	doi = "10.1016/j.nuclphysbps.2004.11.281",
+	eprint = "hep-lat/0409107",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "853--855",
+	slaccitation = "%\%CITATION = HEP-LAT/0409107;\%\%",
+	title = "{Comparing iterative methods for overlap and twisted mass fermions}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{Chiarappa:2006ae,
+	archiveprefix = "arXiv",
+	author = "Chiarappa, T. and others",
+	doi = "10.1140/epjc/s10052-006-0204-4",
+	eprint = "hep-lat/0606011",
+	journal = "Eur. Phys. J.",
+	pages = "373--383",
+	slaccitation = "%\%CITATION = HEP-LAT/0606011;\%\%",
+	title = "{Numerical simulation of {QCD} with u, d, s and c quarks in the twisted-mass {W}ilson formulation}",
+	volume = "C50",
+	year = "2007"
+}
+
+@article{Chiarappa:2006hz,
+	archiveprefix = "arXiv",
+	author = "Chiarappa, T. and others",
+	eprint = "hep-lat/0609023",
+	journal = "Comput. Sci. Disc.",
+	pages = "015001",
+	slaccitation = "%\%CITATION = HEP-LAT/0609023;\%\%",
+	title = "{Iterative methods for overlap and twisted mass fermions}",
+	volume = "01",
+	year = "2008"
+}
+
+@article{Cichy:2008gk,
+	archiveprefix = "arXiv",
+	author = "Cichy, K. and {Gonzalez Lopez}, J. and Jansen, K. and Kujawa, A. and Shindler, A.",
+	doi = "10.1016/j.nuclphysb.2008.03.004",
+	eprint = "0802.3637",
+	journal = "Nucl. Phys.",
+	pages = "94--108",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0802.3637;\%\%",
+	title = "{Twisted Mass, Overlap and Creutz Fermions: Cut-off Effects at Tree-level of Perturbation Theory}",
+	volume = "B800",
+	year = "2008"
+}
+
+@article{Clark:2004cq,
+	author = "Clark, M. A. and Kennedy, A. D.",
+	eprint = "hep-lat/0409134",
+	slaccitation = "%\%CITATION = HEP-LAT 0409134;\%\%",
+	title = "{Accelerating fermionic molecular dynamics}",
+	year = "2004"
+}
+
+@article{Clark:2005sq,
+	author = "Clark, M. A. and de Forcrand, Ph. and Kennedy, A. D.",
+	eprint = "hep-lat/0510004",
+	journal = "PoS",
+	pages = "115",
+	slaccitation = "%\%CITATION = HEP-LAT 0510004;\%\%",
+	title = "{Algorithm shootout: R versus RHMC}",
+	volume = "LAT2005",
+	year = "2005"
+}
+
+@article{Clark:2006fx,
+	archiveprefix = "arXiv",
+	author = "Clark, M. A. and Kennedy, A. D.",
+	doi = "10.1103/PhysRevLett.98.051601",
+	eprint = "hep-lat/0608015",
+	journal = "Phys. Rev. Lett.",
+	pages = "051601",
+	slaccitation = "%\%CITATION = HEP-LAT/0608015;\%\%",
+	title = "{Accelerating Dynamical Fermion Computations using the Rational Hybrid Monte Carlo (RHMC) Algorithm with Multiple Pseudofermion Fields}",
+	volume = "98",
+	year = "2007"
+}
+
+@article{Clark:2006wp,
+	archiveprefix = "arXiv",
+	author = "Clark, M. A. and Kennedy, A. D.",
+	doi = "10.1103/PhysRevD.75.011502",
+	eprint = "hep-lat/0610047",
+	journal = "Phys. Rev.",
+	pages = "011502",
+	slaccitation = "%\%CITATION = HEP-LAT/0610047;\%\%",
+	title = "{Accelerating Staggered Fermion Dynamics with the Rational Hybrid Monte Carlo (RHMC) Algorithm}",
+	volume = "D75",
+	year = "2007"
+}
+
+@article{Colangelo:2001df,
+	archiveprefix = "arXiv",
+	author = "Colangelo, G. and Gasser, J. and Leutwyler, H.",
+	doi = "10.1016/S0550-3213(01)00147-X",
+	eprint = "hep-ph/0103088",
+	journal = "Nucl. Phys.",
+	pages = "125--179",
+	slaccitation = "%\%CITATION = HEP-PH/0103088;\%\%",
+	title = "{pi pi scattering}",
+	volume = "B603",
+	year = "2001"
+}
+
+@article{Colangelo:2003hf,
+	author = "Colangelo, Gilberto and D{\"u}rr, Stephan",
+	eprint = "hep-lat/0311023",
+	journal = "Eur. Phys. J.",
+	pages = "543--553",
+	slaccitation = "%\%CITATION = HEP-LAT/0311023;\%\%",
+	title = "{The pion mass in finite volume}",
+	volume = "C33",
+	year = "2004"
+}
+
+@article{Colangelo:2005gd,
+	author = "Colangelo, Gilberto and D{\"u}rr, Stephan and Haefeli, Christoph",
+	eprint = "hep-lat/0503014",
+	journal = "Nucl. Phys.",
+	pages = "136--174",
+	slaccitation = "%\%CITATION = HEP-LAT 0503014;\%\%",
+	title = "{Finite volume effects for meson masses and decay constants}",
+	volume = "B721",
+	year = "2005"
+}
+
+@article{Colangelo:2006mp,
+	archiveprefix = "arXiv",
+	author = "Colangelo, Gilberto and Haefeli, Christoph",
+	doi = "10.1016/j.nuclphysb.2006.03.010",
+	eprint = "hep-lat/0602017",
+	journal = "Nucl. Phys.",
+	pages = "14--33",
+	slaccitation = "%\%CITATION = HEP-LAT/0602017;\%\%",
+	title = "{Finite volume effects for the pion mass at two loops}",
+	volume = "B744",
+	year = "2006"
+}
+
+@book{Collins:1994ab,
+	author = "Collins, J.C.",
+	edition = "",
+	publisher = "Cambridge University Press",
+	series = "{Cambridge Monographs on Mathematical Physics}",
+	title = "{Renormalisation}",
+	year = "1994"
+}
+
+@article{Creutz:1984fj,
+	author = "Creutz, M. and Gocksch, A. and Ogilvie, M. and Okawa, M.",
+	journal = "Phys. Rev. Lett.",
+	pages = "875",
+	slaccitation = "%\%CITATION = PRLTA,53,875;\%\%",
+	title = "{Microcanonical renormalization group}",
+	volume = "53",
+	year = "1984"
+}
+
+@article{Creutz:1989wt,
+	author = "Creutz, M. and Gocksch, A.",
+	note = "BNL-42601",
+	title = "{Higher order hybrid monte carlo algorithms}"
+}
+
+@article{Creutz:1996bg,
+	author = "Creutz, Michael",
+	eprint = "hep-lat/9608024",
+	slaccitation = "%\%CITATION = HEP-LAT 9608024;\%\%",
+	title = "{Wilson fermions at finite temperature}",
+	year = "1996"
+}
+
+@article{Creutz:1998ee,
+	author = "Creutz, M.",
+	eprint = "hep-lat/9806037",
+	journal = "Phys. Rev. Lett.",
+	pages = "3555--3558",
+	slaccitation = "%\%CITATION = HEP-LAT 9806037;\%\%",
+	title = "{Evaluating Grassmann integrals}",
+	volume = "81",
+	year = "1998"
+}
+
+@article{Cundy:2005pi,
+	author = "Cundy, N. and others",
+	eprint = "hep-lat/0502007",
+	slaccitation = "%\%CITATION = HEP-LAT 0502007;\%\%",
+	title = "{Numerical Methods for the {QCD} Overlap Operator IV: Hybrid Monte Carlo}",
+	year = "2005"
+}
+
+@article{David:1984ys,
+	author = "David, F. and Hamber, H. W.",
+	journal = "Nucl. Phys.",
+	pages = "381",
+	slaccitation = "%\%CITATION = NUPHA,B248,381;\%\%",
+	title = "{Chiral condensate with {Wilson} fermions}",
+	volume = "B248",
+	year = "1984"
+}
+
+@article{Davies:2008sw,
+	archiveprefix = "arXiv",
+	author = "Davies, C. T. H. and others",
+	collaboration = "HPQCD",
+	eprint = "0807.1687",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0807.1687;\%\%",
+	title = "{Update: Accurate Determinations of $\alpha_s$ from Realistic Lattice QCD}",
+	year = "2008"
+}
+
+@article{DeGrand:1990dk,
+	author = "DeGrand, T. A. and Rossi, P.",
+	journal = "Comput. Phys. Commun.",
+	pages = "211--214",
+	slaccitation = "%\%CITATION = CPHCB,60,211;\%\%",
+	title = "{Conditioning techniques for dynamical fermions}",
+	volume = "60",
+	year = "1990"
+}
+
+@article{DeGrand:1990ip,
+	author = "DeGrand, T. A.",
+	journal = "Phys. Rev.",
+	pages = "2296--2300",
+	slaccitation = "%\%CITATION = PHRVA,D43,2296;\%\%",
+	title = "{Resonance masses from Monte Carlo simulations (with emphasis on the rho meson)}",
+	volume = "D43",
+	year = "1991"
+}
+
+@article{DeGrand:2002vu,
+	author = "DeGrand, Thomas and Hasenfratz, Anna and Kovacs, Tamas G.",
+	eprint = "hep-lat/0211006",
+	journal = "Phys. Rev.",
+	pages = "054501",
+	slaccitation = "%\%CITATION = HEP-LAT 0211006;\%\%",
+	title = "{Improving the chiral properties of lattice fermions}",
+	volume = "D67",
+	year = "2003"
+}
+
+@article{DeTar:2007ni,
+	archiveprefix = "arXiv",
+	author = "DeTar, Carleton and Levkova, L.",
+	eprint = "0710.1322",
+	journal = "PoS",
+	pages = "116",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0710.1322;\%\%",
+	title = "{Effects of the disconnected flavor singlet corrections on the hyperfine splitting in charmonium}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{DelDebbio:2006cn,
+	author = "{Del Debbio}, L. and Giusti, L. and L{\"u}scher, M. and Petronzio, R. and Tantalo, N.",
+	eprint = "hep-lat/0610059",
+	journal = "JHEP",
+	pages = "056",
+	slaccitation = "%\%CITATION = HEP-LAT 0610059;\%\%",
+	title = "{QCD with light Wilson quarks on fine lattices. I: First experiences and physics results}",
+	volume = "02",
+	year = "2007"
+}
+
+@article{DellaMorte:2000yp,
+	author = "{Della Morte}, M. and Frezzotti, R. and Heitger, J. and Sint, S.",
+	eprint = "hep-lat/0010091",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "617--621",
+	slaccitation = "%\%CITATION = HEP-LAT 0010091;\%\%",
+	title = "{Non-perturbative scaling tests of twisted mass {QCD}}",
+	volume = "94",
+	year = "2001"
+}
+
+@article{DellaMorte:2001tu,
+	author = "{Della Morte}, M. and Frezzotti, R. and Heitger, J.",
+	eprint = "hep-lat/0110166",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "260--262",
+	slaccitation = "%\%CITATION = HEP-LAT 0110166;\%\%",
+	title = "{Quenched twisted mass {QCD} at small quark masses and in large volume}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{DellaMorte:2001ys,
+	author = "{Della Morte}, M. and Frezzotti, R. and Heitger, J. and Sint, S.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0108019",
+	journal = "JHEP",
+	pages = "041",
+	slaccitation = "%\%CITATION = HEP-LAT 0108019;\%\%",
+	title = "{Cutoff effects in twisted mass lattice {QCD}}",
+	volume = "10",
+	year = "2001"
+}
+
+@article{DellaMorte:2003jj,
+	author = "{Della Morte}, M. and others",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0307008",
+	journal = "Comput. Phys. Commun.",
+	pages = "62--72",
+	slaccitation = "%\%CITATION = HEP-LAT 0307008;\%\%",
+	title = "{Simulating the Schroedinger functional with two pseudo-fermions}",
+	volume = "156",
+	year = "2003"
+}
+
+@article{DellaMorte:2003mn,
+	author = "{Della Morte}, M. and others",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0307021",
+	journal = "Phys. Lett.",
+	pages = "93--98",
+	slaccitation = "%\%CITATION = HEP-LAT 0307021;\%\%",
+	title = "{Lattice HQET with exponentially improved statistical precision}",
+	volume = "B581",
+	year = "2004"
+}
+
+@article{DellaMorte:2003mw,
+	author = "{Della Morte}, M. and others",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0309080",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "346--348",
+	slaccitation = "%\%CITATION = HEP-LAT 0309080;\%\%",
+	title = "{Static quarks with improved statistical precision}",
+	volume = "129",
+	year = "2004"
+}
+
+@article{DellaMorte:2005yc,
+	author = "{Della Morte}, M. and Shindler, A. and Sommer, R.",
+	eprint = "hep-lat/0506008",
+	slaccitation = "%\%CITATION = HEP-LAT 0506008;\%\%",
+	title = "{On lattice actions for static quarks}",
+	year = "2005"
+}
+
+@article{Dimopoulos:2006dm,
+	author = "Dimopoulos, P. and others",
+	collaboration = "ALPHA",
+	eprint = "hep-ph/0601002",
+	journal = "Nucl. Phys.",
+	pages = "69--108",
+	slaccitation = "%\%CITATION = HEP-PH 0601002;\%\%",
+	title = "{A precise determination of B(K) in quenched QCD}",
+	volume = "B749",
+	year = "2006"
+}
+
+@article{Dimopoulos:2007fn,
+	archiveprefix = "arXiv",
+	author = "Dimopoulos, P. and others",
+	eprint = "0710.0975",
+	journal = "PoS",
+	pages = "241",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.0975;\%\%",
+	title = "{Renormalisation of quark bilinears with Nf=2 Wilson fermions and tree-level improved gauge action}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{Dimopoulos:2007qy,
+	archiveprefix = "arXiv",
+	author = "Dimopoulos, Petros and Frezzotti, Roberto and Herdoiza, Gregorio and Urbach, Carsten and Wenger, Urs",
+	collaboration = "ETM",
+	eprint = "0710.2498",
+	journal = "PoS",
+	pages = "102",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.2498;\%\%",
+	title = "{Scaling and low energy constants in lattice QCD with N\_f=2 maximally twisted Wilson quarks}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{Dimopoulos:2008sy,
+	archiveprefix = "arXiv",
+	author = "Dimopoulos, Petros and others",
+	collaboration = "ETM",
+	eprint = "0810.2873",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0810.2873;\%\%",
+	title = "{Scaling and chiral extrapolation of pion mass and decay constant with maximally twisted mass QCD}",
+	year = "2008"
+}
+
+@article{Dong:2001fm,
+	author = "Dong, S. J. and others",
+	eprint = "hep-lat/0108020",
+	journal = "Phys. Rev.",
+	pages = "054507",
+	slaccitation = "%\%CITATION = HEP-LAT 0108020;\%\%",
+	title = "{Chiral properties of pseudoscalar mesons on a quenched 20**4 lattice with overlap fermions}",
+	volume = "D65",
+	year = "2002"
+}
+
+@article{Duane:1987de,
+	author = "Duane, S. and Kennedy, A. D. and Pendleton, B. J. and Roweth, D.",
+	journal = "Phys. Lett.",
+	pages = "216--222",
+	slaccitation = "%\%CITATION = PHLTA,B195,216;\%\%",
+	title = "{{H}ybrid monte carlo}",
+	volume = "B195",
+	year = "1987"
+}
+
+@article{Edwards:1996vs,
+	author = "Edwards, R. G. and Horvath, I. and Kennedy, A. D.",
+	eprint = "hep-lat/9606004",
+	journal = "Nucl. Phys.",
+	pages = "375--402",
+	slaccitation = "%\%CITATION = HEP-LAT 9606004;\%\%",
+	title = "{Instabilities and non-reversibility of molecular dynamics trajectories}",
+	volume = "B484",
+	year = "1997"
+}
+
+@article{Eichten:1989zv,
+	author = "Eichten, E. and Hill, B.",
+	journal = "Phys. Lett.",
+	pages = "511",
+	slaccitation = "%\%CITATION = PHLTA,B234,511;\%\%",
+	title = "{An effective field theory for the calculation of matrix elements involving heavy quarks}",
+	volume = "B234",
+	year = "1990"
+}
+
+@article{Farchioni:2002vn,
+	author = "Farchioni, F. and Gebert, C. and Montvay, I. and Scorzato, L.",
+	eprint = "hep-lat/0206008",
+	journal = "Eur. Phys. J.",
+	pages = "237--251",
+	slaccitation = "%\%CITATION = HEP-LAT 0206008;\%\%",
+	title = "{Numerical simulation tests with light dynamical quarks}",
+	volume = "C26",
+	year = "2002"
+}
+
+@article{Farchioni:2004fs,
+	author = "Farchioni, F. and others",
+	eprint = "hep-lat/0410031",
+	journal = "Eur. Phys. J.",
+	pages = "73--87",
+	slaccitation = "%\%CITATION = HEP-LAT 0410031;\%\%",
+	title = "{The phase structure of lattice {QCD} with {Wilson} quarks and renormalization group improved gluons}",
+	volume = "C42",
+	year = "2005"
+}
+
+@article{Farchioni:2004ma,
+	author = "Farchioni, F. and others",
+	eprint = "hep-lat/0409098",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "240--245",
+	slaccitation = "%\%CITATION = HEP-LAT 0409098;\%\%",
+	title = "{Exploring the phase structure of lattice {{QCD}} with twisted mass quarks}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{Farchioni:2004us,
+	author = "Farchioni, F. and others",
+	eprint = "hep-lat/0406039",
+	journal = "Eur. Phys. J.",
+	pages = "421--433",
+	slaccitation = "%\%CITATION = HEP-LAT 0406039;\%\%",
+	title = "{Twisted mass quarks and the phase structure of lattice {QCD}}",
+	volume = "C39",
+	year = "2005"
+}
+
+@article{Farchioni:2005ec,
+	author = "Farchioni, Federico and others",
+	eprint = "hep-lat/0509131",
+	journal = "PoS",
+	pages = "072",
+	slaccitation = "%\%CITATION = HEP-LAT 0509131;\%\%",
+	title = "{Dynamical twisted mass fermions}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{Farchioni:2005hf,
+	author = "Farchioni, F. and others",
+	eprint = "hep-lat/0509036",
+	journal = "PoS",
+	pages = "033",
+	slaccitation = "%\%CITATION = HEP-LAT 0509036;\%\%",
+	title = "{Twisted mass fermions: Neutral pion masses from disconnected contributions}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{Farchioni:2005tu,
+	author = "Farchioni, F. and others",
+	eprint = "hep-lat/0506025",
+	journal = "Phys. Lett.",
+	pages = "324--333",
+	slaccitation = "%\%CITATION = HEP-LAT 0506025;\%\%",
+	title = "{Lattice spacing dependence of the first order phase transition for dynamical twisted mass fermions}",
+	volume = "B624",
+	year = "2005"
+}
+
+@article{Feldmann:1999uf,
+	author = "Feldmann, Thorsten",
+	eprint = "hep-ph/9907491",
+	journal = "Int. J. Mod. Phys.",
+	pages = "159--207",
+	slaccitation = "%\%CITATION = HEP-PH/9907491;\%\%",
+	title = "{Quark structure of pseudoscalar mesons}",
+	volume = "A15",
+	year = "2000"
+}
+
+@article{Feynman:1948aa,
+	author = "Feynman, R. P.",
+	journal = "Rev. Mod. Phys.",
+	pages = "367--387",
+	slaccitation = "%\%CITATION = RMPHA,20,367;\%\%",
+	title = "{Space-time approach to non-relativistic quantum mechanics}",
+	volume = "20",
+	year = "1948"
+}
+
+@article{Fischer:1996th,
+	author = "Fischer, S. and others",
+	eprint = "hep-lat/9602019",
+	journal = "Comput. Phys. Commun.",
+	pages = "20--34",
+	slaccitation = "%\%CITATION = HEP-LAT 9602019;\%\%",
+	title = "{A Parallel SSOR Preconditioner for Lattice {QCD}}",
+	volume = "98",
+	year = "1996"
+}
+
+@article{Fokkema:1998aa,
+	author = "Fokkema, D.~R. and Sleijpen, G.~L.~G. and Van~der~Vorst, H.~A.",
+	journal = "SIAM J. Sci. Comput.",
+	pages = "94--125",
+	title = "{{J}acobi-{D}avidson style {QR} and {QZ} algorithms for the reduction of matrix pencils}",
+	volume = "20",
+	year = "1998"
+}
+
+@article{Foster:1998vw,
+	author = "Foster, M. and Michael, C.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/9810021",
+	journal = "Phys. Rev.",
+	pages = "074503",
+	slaccitation = "%\%CITATION = HEP-LAT 9810021;\%\%",
+	title = "{Quark mass dependence of hadron masses from lattice {QCD}}",
+	volume = "D59",
+	year = "1999"
+}
+
+@article{Freund,
+	author = "Freund, R.W.",
+	journal = "in Numerical Linear Algebra, L.\ Reichel, A.\ Ruttan and R.S.\ Varga (eds.)",
+	pages = "p. 101",
+	year = "1993"
+}
+
+@article{Frezzotti:1997ym,
+	author = "Frezzotti, R. and Jansen, K.",
+	eprint = "hep-lat/9702016",
+	journal = "Phys. Lett.",
+	pages = "328--334",
+	slaccitation = "%\%CITATION = HEP-LAT 9702016;\%\%",
+	title = "{A polynomial hybrid Monte Carlo algorithm}",
+	volume = "B402",
+	year = "1997"
+}
+
+@article{Frezzotti:1998eu,
+	author = "Frezzotti, R. and Jansen, K.",
+	eprint = "hep-lat/9808011",
+	journal = "Nucl. Phys.",
+	pages = "395--431",
+	slaccitation = "%\%CITATION = HEP-LAT 9808011;\%\%",
+	title = "{The {PHMC} algorithm for simulations of dynamical fermions. {I}: Description and properties}",
+	volume = "B555",
+	year = "1999"
+}
+
+@article{Frezzotti:1998yp,
+	author = "Frezzotti, R. and Jansen, K.",
+	eprint = "hep-lat/9808038",
+	journal = "Nucl. Phys.",
+	pages = "432--453",
+	slaccitation = "%\%CITATION = HEP-LAT 9808038;\%\%",
+	title = "{The {PHMC} algorithm for simulations of dynamical fermions. {II}: Performance analysis}",
+	volume = "B555",
+	year = "1999"
+}
+
+@article{Frezzotti:1999vv,
+	author = "Frezzotti, R. and Grassi, P. A. and Sint, S. and Weisz, P.",
+	eprint = "hep-lat/9909003",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "941--946",
+	slaccitation = "%\%CITATION = HEP-LAT 9909003;\%\%",
+	title = "{A local formulation of lattice {QCD} without unphysical fermion zero modes}",
+	volume = "83",
+	year = "2000"
+}
+
+@article{Frezzotti:2000nk,
+	author = "Frezzotti, R. and Grassi, P. A. and Sint, S. and Weisz, P.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0101001",
+	journal = "JHEP",
+	pages = "058",
+	slaccitation = "%\%CITATION = HEP-LAT 0101001;\%\%",
+	title = "{Lattice {QCD} with a chirally twisted mass term}",
+	volume = "08",
+	year = "2001"
+}
+
+@article{Frezzotti:2001du,
+	author = "Frezzotti, R. and Sint, S.",
+	eprint = "hep-lat/0110140",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "814--816",
+	slaccitation = "%\%CITATION = HEP-LAT 0110140;\%\%",
+	title = "{Some remarks on {O(a)} improved twisted mass {QCD}}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Frezzotti:2001ea,
+	author = "Frezzotti, R. and Sint, S. and Weisz, P.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0104014",
+	journal = "JHEP",
+	pages = "048",
+	slaccitation = "%\%CITATION = HEP-LAT 0104014;\%\%",
+	title = "{{O(a)} improved twisted mass lattice {QCD}}",
+	volume = "07",
+	year = "2001"
+}
+
+@article{Frezzotti:2003ni,
+	author = "Frezzotti, R. and Rossi, G. C.",
+	eprint = "hep-lat/0306014",
+	journal = "JHEP",
+	pages = "007",
+	slaccitation = "%\%CITATION = HEP-LAT 0306014;\%\%",
+	title = "{Chirally improving {Wilson} fermions. {I}: {O(a)} improvement}",
+	volume = "08",
+	year = "2004"
+}
+
+@article{Frezzotti:2003xj,
+	author = "Frezzotti, R. and Rossi, G. C.",
+	eprint = "hep-lat/0311008",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "193--202",
+	slaccitation = "%\%CITATION = HEP-LAT 0311008;\%\%",
+	title = "{Twisted-mass lattice {QCD} with mass non-degenerate quarks}",
+	volume = "128",
+	year = "2004"
+}
+
+@article{Frezzotti:2004wz,
+	author = "Frezzotti, R. and Rossi, G. C.",
+	eprint = "hep-lat/0407002",
+	journal = "JHEP",
+	pages = "070",
+	slaccitation = "%\%CITATION = HEP-LAT 0407002;\%\%",
+	title = "{Chirally improving {Wilson} fermions. {II}: Four-quark operators}",
+	volume = "10",
+	year = "2004"
+}
+
+@article{Frezzotti:2005gi,
+	author = "Frezzotti, R. and Martinelli, G. and Papinutto, M. and Rossi, G. C.",
+	eprint = "hep-lat/0503034",
+	journal = "JHEP",
+	pages = "038",
+	slaccitation = "%\%CITATION = HEP-LAT 0503034;\%\%",
+	title = "{Reducing cutoff effects in maximally twisted lattice {QCD} close to the chiral limit}",
+	volume = "04",
+	year = "2006"
+}
+
+@article{Frezzotti:2007qv,
+	archiveprefix = "arXiv",
+	author = "Frezzotti, R. and Rossi, G.",
+	eprint = "0710.2492",
+	journal = "PoS",
+	pages = "277",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.2492;\%\%",
+	title = "{O(a^2) cutoff effects in Wilson fermion simulations}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{Frezzotti:2008dr,
+	archiveprefix = "arXiv",
+	author = "Frezzotti, R. and Lubicz, V. and Simula, S.",
+	collaboration = "ETM",
+	eprint = "0812.4042",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0812.4042;\%\%",
+	title = "{Electromagnetic form factor of the pion from twisted-mass lattice {QCD} at {Nf}=2}",
+	year = "2008"
+}
+
+@article{Fritzsch:1973pi,
+	author = "Fritzsch, H. and Gell-Mann, M. and Leutwyler, H.",
+	journal = "Phys. Lett.",
+	pages = "365--368",
+	slaccitation = "%\%CITATION = PHLTA,B47,365;\%\%",
+	title = "{Advantages of the color octet gluon picture}",
+	volume = "B47",
+	year = "1973"
+}
+
+@article{Frommer:1994vn,
+	author = "Frommer, A. and Hannemann, V. and Nockel, B. and Lippert, T. and Schilling, K.",
+	eprint = "hep-lat/9404013",
+	journal = "Int. J. Mod. Phys.",
+	pages = "1073--1088",
+	slaccitation = "%\%CITATION = HEP-LAT 9404013;\%\%",
+	title = "{Accelerating {Wilson} fermion matrix inversions by means of the stabilized biconjugate gradient algorithm}",
+	volume = "C5",
+	year = "1994"
+}
+
+@article{Frommer:1995ik,
+	author = "Frommer, Andreas and Nockel, Bertold and Gusken, Stephan and Lippert, Thomas and Schilling, Klaus",
+	eprint = "hep-lat/9504020",
+	journal = "Int. J. Mod. Phys.",
+	pages = "627--638",
+	slaccitation = "%\%CITATION = HEP-LAT 9504020;\%\%",
+	title = "{Many masses on one stroke: Economic computation of quark propagators}",
+	volume = "C6",
+	year = "1995"
+}
+
+@article{Frommer:2013fsa,
+	archiveprefix = "arXiv",
+	author = "Frommer, Andreas and Kahl, Karsten and Krieg, Stefan and Leder, Bj{\"o}rn and Rottmann, Matthias",
+	doi = "10.1137/130919507",
+	eprint = "1303.1377",
+	journal = "SIAM J. Sci. Comput.",
+	pages = "A1581--A1608",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:1303.1377;\%\%",
+	title = "{Adaptive Aggregation Based Domain Decomposition Multigrid for the Lattice Wilson Dirac Operator}",
+	volume = "36",
+	year = "2014"
+}
+
+@article{Alexandrou:2016izb,
+	archiveprefix = "arXiv",
+	author = "Alexandrou, Constantia and Bacchio, Simone and Finkenrath, Jacob and Frommer, Andreas and Kahl, Karsten and Rottmann, Matthias",
+	eprint = "1610.02370",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:1610.02370;\%\%",
+	title = "{Adaptive Aggregation-based Domain Decomposition Multigrid for Twisted Mass Fermions}",
+	year = "2016"
+}
+
+@article{Furman:1994ky,
+	author = "Furman, V. and Shamir, Y.",
+	eprint = "hep-lat/9405004",
+	journal = "Nucl. Phys.",
+	pages = "54--78",
+	slaccitation = "%\%CITATION = HEP-LAT 9405004;\%\%",
+	title = "{Axial symmetries in lattice QCD with Kaplan fermions}",
+	volume = "B439",
+	year = "1995"
+}
+
+@article{Garden:1999fg,
+	author = "Garden, J. and Heitger, J. and Sommer, R. and Wittig, H.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/9906013",
+	journal = "Nucl. Phys.",
+	pages = "237--256",
+	slaccitation = "%\%CITATION = HEP-LAT 9906013;\%\%",
+	title = "{Precision computation of the strange quark's mass in quenched {QCD}}",
+	volume = "B571",
+	year = "2000"
+}
+
+@article{Garron:2003cb,
+	author = "Garron, N. and Giusti, L. and Hoelbling, C. and Lellouch, L. and Rebbi, C.",
+	eprint = "hep-ph/0306295",
+	journal = "Phys. Rev. Lett.",
+	pages = "042001",
+	slaccitation = "%\%CITATION = HEP-PH 0306295;\%\%",
+	title = "{B(K) from quenched {QCD} with exact chiral symmetry}",
+	volume = "92",
+	year = "2004"
+}
+
+@article{Gasser:1982ap,
+	author = "Gasser, J. and Leutwyler, H.",
+	journal = "Phys. Rept.",
+	pages = "77--169",
+	slaccitation = "%\%CITATION = PRPLC,87,77;\%\%",
+	title = "{Quark masses}",
+	volume = "87",
+	year = "1982"
+}
+
+@article{Gasser:1983yg,
+	author = "Gasser, J. and Leutwyler, H.",
+	journal = "Ann. Phys.",
+	pages = "142",
+	slaccitation = "%\%CITATION = APNYA,158,142;\%\%",
+	title = "{Chiral perturbation theory to one loop}",
+	volume = "158",
+	year = "1984"
+}
+
+@article{Gasser:1985gg,
+	author = "Gasser, J. and Leutwyler, H.",
+	journal = "Nucl. Phys.",
+	pages = "465",
+	slaccitation = "%\%CITATION = NUPHA,B250,465;\%\%",
+	title = "{Chiral perturbation theory: expansions in the mass of the strange quark}",
+	volume = "B250",
+	year = "1985"
+}
+
+@article{Gasser:1986vb,
+	author = "Gasser, J. and Leutwyler, H.",
+	journal = "Phys. Lett.",
+	pages = "83",
+	slaccitation = "%\%CITATION = PHLTA,B184,83;\%\%",
+	title = "{LIGHT QUARKS AT LOW TEMPERATURES}",
+	volume = "B184",
+	year = "1987"
+}
+
+@article{Gattringer:2003qx,
+	author = "Gattringer, C. and others",
+	collaboration = "BGR",
+	eprint = "hep-lat/0307013",
+	journal = "Nucl. Phys.",
+	pages = "3--51",
+	slaccitation = "%\%CITATION = HEP-LAT 0307013;\%\%",
+	title = "{Quenched spectroscopy with fixed-point and chirally improved fermions}",
+	volume = "B677",
+	year = "2004"
+}
+
+@article{Gell-Mann:1964nj,
+	author = "Gell-Mann, M.",
+	journal = "Phys. Lett.",
+	pages = "214--215",
+	slaccitation = "%\%CITATION = PHLTA,8,214;\%\%",
+	title = "{A Schematic model of baryons and mesons}",
+	volume = "8",
+	year = "1964"
+}
+
+@article{Gell-Mann:1968rz,
+	author = "Gell-Mann, M. and Oakes, R. J. and Renner, B.",
+	journal = "Phys. Rev.",
+	pages = "2195--2199",
+	slaccitation = "%\%CITATION = PHRVA,175,2195;\%\%",
+	title = "{Behavior of current divergences under SU(3) x SU(3)}",
+	volume = "175",
+	year = "1968"
+}
+
+@phdthesis{Geus:2002,
+	author = "Geus, R.",
+	optaddress = "",
+	optannote = "",
+	optkey = "DISS. ETH NO. 14734",
+	optmonth = "",
+	optnote = "",
+	opttype = "",
+	school = "Swiss Federal Institute Of Technology Z{\"u}rich",
+	title = "{The Jacobi-Davidson algorithm for solving large sparse symmetric eigenvalue problems with application to the design of accelerator cavities}",
+	year = "2002"
+}
+
+@article{Gimenez:1998ue,
+	author = "Gimenez, V. and Giusti, L. and Rapuano, F. and Talevi, M.",
+	eprint = "hep-lat/9806006",
+	journal = "Nucl. Phys.",
+	pages = "429--445",
+	slaccitation = "%\%CITATION = HEP-LAT 9806006;\%\%",
+	title = "{Non-perturbative renormalization of quark bilinears}",
+	volume = "B531",
+	year = "1998"
+}
+
+@article{Gimenez:2005nt,
+	author = "Gimenez, V. and Lubicz, V. and Mescia, F. and Porretti, V. and Reyes, J.",
+	eprint = "hep-lat/0503001",
+	journal = "Eur. Phys. J.",
+	pages = "535--544",
+	slaccitation = "%\%CITATION = HEP-LAT/0503001;\%\%",
+	title = "{Operator product expansion and quark condensate from lattice QCD in coordinate space}",
+	volume = "C41",
+	year = "2005"
+}
+
+@article{Ginsparg:1981bj,
+	author = "Ginsparg, P. H. and {Wilson}, K. G.",
+	journal = "Phys. Rev.",
+	pages = "2649",
+	slaccitation = "%\%CITATION = PHRVA,D25,2649;\%\%",
+	title = "{A remnant of chiral symmetry on the lattice}",
+	volume = "D25",
+	year = "1982"
+}
+
+@article{Giusti:1998wy,
+	author = "Giusti, L. and Rapuano, F. and Talevi, M. and Vladikas, A.",
+	eprint = "hep-lat/9807014",
+	journal = "Nucl. Phys.",
+	pages = "249--277",
+	slaccitation = "%\%CITATION = HEP-LAT 9807014;\%\%",
+	title = "{The QCD chiral condensate from the lattice}",
+	volume = "B538",
+	year = "1999"
+}
+
+@article{Giusti:2001pk,
+	author = "Giusti, L. and Hoelbling, C. and Rebbi, C.",
+	eprint = "hep-lat/0108007",
+	journal = "Phys. Rev.",
+	note = "Erratum-ibid.D65:079903,2002",
+	pages = "114508",
+	slaccitation = "%\%CITATION = HEP-LAT 0108007;\%\%",
+	title = "{Light quark masses with overlap fermions in quenched {QCD}}",
+	volume = "D64",
+	year = "2001"
+}
+
+@article{Giusti:2002sm,
+	author = "Giusti, L. and Hoelbling, C. and L{\"u}scher, M. and Wittig, H.",
+	eprint = "hep-lat/0212012",
+	journal = "Comput. Phys. Commun.",
+	pages = "31--51",
+	slaccitation = "%\%CITATION = HEP-LAT 0212012;\%\%",
+	title = "{Numerical techniques for lattice QCD in the epsilon-regime}",
+	volume = "153",
+	year = "2003"
+}
+
+@article{Giusti:2007hk,
+	author = "Giusti, Leonardo",
+	eprint = "hep-lat/0702014",
+	journal = "PoS",
+	pages = "",
+	slaccitation = "%\%CITATION = HEP-LAT/0702014;\%\%",
+	title = "{Light dynamical fermions on the lattice: Toward the chiral regime of QCD}",
+	volume = "LAT2006",
+	year = "2007"
+}
+
+@article{Glassner:1996gz,
+	author = "Gl{\"a}ssner, U. and others",
+	eprint = "hep-lat/9605008",
+	slaccitation = "%\%CITATION = HEP-LAT 9605008;\%\%",
+	title = "{How to compute {G}reen's functions for entire mass trajectories within {K}rylov solvers}",
+	year = "1996"
+}
+
+@article{Gockeler:1998fn,
+	author = "G{\"o}ckeler, M. and others",
+	eprint = "hep-lat/9707021",
+	journal = "Phys. Rev.",
+	pages = "5562--5580",
+	slaccitation = "%\%CITATION = HEP-LAT 9707021;\%\%",
+	title = "{Scaling of non-perturbatively {O(a)} improved {Wilson} fermions: Hadron spectrum, quark masses and decay constants}",
+	volume = "D57",
+	year = "1998"
+}
+
+@article{Gorishnii:1990vf,
+	author = "Gorishnii, S. G. and Kataev, A. L. and Larin, S. A.",
+	journal = "Phys. Lett.",
+	pages = "144--150",
+	slaccitation = "%\%CITATION = PHLTA,B259,144;\%\%",
+	title = "{The O (alpha-s**3) corrections to sigma-tot (e+ e- $\to$ hadrons) and Gamma (tau- $\to$ tau-neutrino + hadrons) in QCD}",
+	volume = "B259",
+	year = "1991"
+}
+
+@article{Greenberg:1964pe,
+	author = "Greenberg, O. W.",
+	journal = "Phys. Rev. Lett.",
+	pages = "598--602",
+	slaccitation = "%\%CITATION = PRLTA,13,598;\%\%",
+	title = "{Spin and unitary spin independence in a paraquark model of baryons and mesons}",
+	volume = "13",
+	year = "1964"
+}
+
+@article{Gregory:2007ce,
+	archiveprefix = "arXiv",
+	author = "Gregory, Eric B. and Irving, Alan and Richards, Chris M. and McNeile, Craig and Hart, Alistair",
+	eprint = "0710.1725",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0710.1725;\%\%",
+	title = "{Pseudoscalar Flavor-Singlet Physics with Staggered Fermions}",
+	year = "2007"
+}
+
+@article{Gross:1973id,
+	author = "Gross, D. J. and Wilczek, F.",
+	journal = "Phys. Rev. Lett.",
+	pages = "1343--1346",
+	slaccitation = "%\%CITATION = PRLTA,30,1343;\%\%",
+	title = "{Ultraviolet behavior of non-Abelian gauge theories}",
+	volume = "30",
+	year = "1973"
+}
+
+@article{Gross:1973ju,
+	author = "Gross, D. J. and Wilczek, F.",
+	journal = "Phys. Rev.",
+	pages = "3633--3652",
+	slaccitation = "%\%CITATION = PHRVA,D8,3633;\%\%",
+	title = "{Asymptotically free gauge theories. 1}",
+	volume = "D8",
+	year = "1973"
+}
+
+@article{Gross:1974jv,
+	author = "Gross, D. J. and Neveu, A.",
+	journal = "Phys. Rev.",
+	pages = "3235",
+	slaccitation = "%\%CITATION = PHRVA,D10,3235;\%\%",
+	title = "{Dynamical symmetry breaking in asymptotically free field theories}",
+	volume = "D10",
+	year = "1974"
+}
+
+@article{Guagnelli:1998ud,
+	author = "Guagnelli, M. and Sommer, R. and Wittig, H.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/9806005",
+	journal = "Nucl. Phys.",
+	pages = "389--402",
+	slaccitation = "%\%CITATION = HEP-LAT 9806005;\%\%",
+	title = "{Precision computation of a low-energy reference scale in quenched lattice {QCD}}",
+	volume = "B535",
+	year = "1998"
+}
+
+@article{Guagnelli:2004ga,
+	author = "Guagnelli, M. and others",
+	collaboration = "Zeuthen-Rome (ZeRo)",
+	eprint = "hep-lat/0405027",
+	journal = "Eur. Phys. J.",
+	pages = "69--80",
+	slaccitation = "%\%CITATION = HEP-LAT 0405027;\%\%",
+	title = "{Non-perturbative pion matrix element of a twist-2 operator from the lattice}",
+	volume = "C40",
+	year = "2005"
+}
+
+@article{Guagnelli:2004ww,
+	author = "Guagnelli, M. and others",
+	collaboration = "Zeuthen-Rome (ZeRo)",
+	eprint = "hep-lat/0403009",
+	journal = "Phys. Lett.",
+	pages = "216--221",
+	slaccitation = "%\%CITATION = HEP-LAT 0403009;\%\%",
+	title = "{Finite size effects of a pion matrix element}",
+	volume = "B597",
+	year = "2004"
+}
+
+@article{Guagnelli:2005zc,
+	author = "Guagnelli, M. and Heitger, J. and Pena, C. and Sint, S. and Vladikas, A.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0505002",
+	journal = "JHEP",
+	pages = "088",
+	slaccitation = "%\%CITATION = HEP-LAT 0505002;\%\%",
+	title = "{Non-perturbative renormalization of left-left four-fermion operators in quenched lattice QCD}",
+	volume = "03",
+	year = "2006"
+}
+
+@article{Gupta:1988js,
+	author = "Gupta, R. and Kilcup, G. W. and Sharpe, S. R.",
+	journal = "Phys. Rev.",
+	pages = "1278",
+	slaccitation = "%\%CITATION = PHRVA,D38,1278;\%\%",
+	title = "{Tuning the hybrid monte carlo algorithm}",
+	volume = "D38",
+	year = "1988"
+}
+
+@article{Gupta:1989kx,
+	author = "Gupta, R. and others",
+	journal = "Phys. Rev.",
+	pages = "2072",
+	slaccitation = "%\%CITATION = PHRVA,D40,2072;\%\%",
+	title = "{{QCD} with dynamical {Wilson} fermions}",
+	volume = "D40",
+	year = "1989"
+}
+
+@article{Gupta:1990ka,
+	author = "Gupta, S. and Irback, A. and Karsch, F. and Petersson, B.",
+	journal = "Phys. Lett.",
+	pages = "437--443",
+	slaccitation = "%\%CITATION = PHLTA,B242,437;\%\%",
+	title = "{The acceptance probability in the hybrid monte carlo method}",
+	volume = "B242",
+	year = "1990"
+}
+
+@article{Gupta:1991sn,
+	author = "Gupta, R. and others",
+	journal = "Phys. Rev.",
+	pages = "3272--3292",
+	slaccitation = "%\%CITATION = PHRVA,D44,3272;\%\%",
+	title = "{{QCD} with dynamical {Wilson} fermions. 2}",
+	volume = "D44",
+	year = "1991"
+}
+
+@unpublished{Gupta:1997nd,
+	author = "Gupta, R.",
+	eprint = "hep-lat/9807028",
+	note = "Lectures given at Les Houches Summer School in Theoretical Physics, Session 68",
+	slaccitation = "%\%CITATION = HEP-LAT 9807028;\%\%",
+	title = "{Introduction to lattice {QCD}}",
+	year = "1997"
+}
+
+@article{Han:1965pf,
+	author = "Han, M. Y. and Nambu, Yoichiro",
+	journal = "Phys. Rev.",
+	pages = "B1006--B1010",
+	slaccitation = "%\%CITATION = PHRVA,139,B1006;\%\%",
+	title = "{Three-triplet model with double SU(3) symmetry}",
+	volume = "139",
+	year = "1965"
+}
+
+@article{Hasenbusch:2001ne,
+	author = "Hasenbusch, M.",
+	eprint = "hep-lat/0107019",
+	journal = "Phys. Lett.",
+	pages = "177--182",
+	slaccitation = "%\%CITATION = HEP-LAT 0107019;\%\%",
+	title = "{Speeding up the {H}ybrid-{M}onte-{C}arlo algorithm for dynamical fermions}",
+	volume = "B519",
+	year = "2001"
+}
+
+@article{Hasenbusch:2002ai,
+	author = "Hasenbusch, M. and Jansen, K.",
+	eprint = "hep-lat/0211042",
+	journal = "Nucl. Phys.",
+	pages = "299--320",
+	slaccitation = "%\%CITATION = HEP-LAT 0211042;\%\%",
+	title = "{Speeding up lattice {QCD} simulations with clover-improved {Wilson} fermions}",
+	volume = "B659",
+	year = "2003"
+}
+
+@article{Hasenbusch:2003vg,
+	archiveprefix = "arXiv",
+	author = "Hasenbusch, Martin",
+	doi = "10.1016/S0920-5632(03)02504-0",
+	eprint = "hep-lat/0310029",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "27--33",
+	slaccitation = "%\%CITATION = HEP-LAT/0310029;\%\%",
+	title = "{Full QCD algorithms towards the chiral limit}",
+	volume = "129",
+	year = "2004"
+}
+
+@article{Hasenfratz:1998jp,
+	author = "Hasenfratz, P.",
+	eprint = "hep-lat/9802007",
+	journal = "Nucl. Phys.",
+	pages = "401--409",
+	slaccitation = "%\%CITATION = HEP-LAT 9802007;\%\%",
+	title = "{Lattice {QCD} without tuning, mixing and current renormalization}",
+	volume = "B525",
+	year = "1998"
+}
+
+@article{Hasenfratz:1998ri,
+	author = "Hasenfratz, P. and Laliena, V. and Niedermayer, F.",
+	eprint = "hep-lat/9801021",
+	journal = "Phys. Lett.",
+	pages = "125--131",
+	slaccitation = "%\%CITATION = HEP-LAT 9801021;\%\%",
+	title = "{The index theorem in {QCD} with a finite cut-off}",
+	volume = "B427",
+	year = "1998"
+}
+
+@article{Hasenfratz:2001hp,
+	author = "Hasenfratz, A. and Knechtli, F.",
+	eprint = "hep-lat/0103029",
+	journal = "Phys. Rev.",
+	pages = "034504",
+	slaccitation = "%\%CITATION = HEP-LAT 0103029;\%\%",
+	title = "{Flavor symmetry and the static potential with hypercubic blocking}",
+	volume = "D64",
+	year = "2001"
+}
+
+@article{Hasenfratz:2001tw,
+	author = "Hasenfratz, A. and Hoffmann, R. and Knechtli, F.",
+	eprint = "hep-lat/0110168",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "418--420",
+	slaccitation = "%\%CITATION = HEP-LAT 0110168;\%\%",
+	title = "{The static potential with hypercubic blocking}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Hashimoto:2008xg,
+	archiveprefix = "arXiv",
+	author = "Hashimoto, Koichi and Izubuchi, Taku",
+	eprint = "0803.0186",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0803.0186;\%\%",
+	title = "{eta' meson from two flavor dynamical domain wall fermions}",
+	year = "2008"
+}
+
+@article{Heitger:2000ay,
+	author = "Heitger, J. and Sommer, R. and Wittig, H.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0006026",
+	journal = "Nucl. Phys.",
+	note = "and references therein",
+	pages = "377--399",
+	slaccitation = "%\%CITATION = HEP-LAT 0006026;\%\%",
+	title = "{Effective chiral Lagrangians and lattice {{QCD}}}",
+	volume = "B588",
+	year = "2000"
+}
+
+@article{Hernandez:1998et,
+	author = "Hernandez, P. and Jansen, K. and L{\"u}scher, M.",
+	eprint = "hep-lat/9808010",
+	journal = "Nucl. Phys.",
+	pages = "363--378",
+	slaccitation = "%\%CITATION = HEP-LAT 9808010;\%\%",
+	title = "{Locality properties of Neuberger's lattice Dirac operator}",
+	volume = "B552",
+	year = "1999"
+}
+
+@article{Hernandez:2000sb,
+	author = "Hernandez, P. and Jansen, K. and Lellouch, L.",
+	eprint = "hep-lat/0001008",
+	slaccitation = "%\%CITATION = HEP-LAT 0001008;\%\%",
+	title = "{A numerical treatment of Neuberger's lattice Dirac operator}",
+	year = "2000"
+}
+
+@article{Hernandez:2001hq,
+	author = "Hernandez, P. and Jansen, K. and Lellouch, L. and Wittig, H.",
+	eprint = "hep-lat/0110199",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "766--771",
+	slaccitation = "%\%CITATION = HEP-LAT 0110199;\%\%",
+	title = "{Scalar condensate and light quark masses from overlap fermions}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Hernandez:2001yn,
+	author = "Hernandez, P. and Jansen, K. and Lellouch, L. and Wittig, H.",
+	eprint = "hep-lat/0106011",
+	journal = "JHEP",
+	pages = "018",
+	slaccitation = "%\%CITATION = HEP-LAT 0106011;\%\%",
+	title = "{Non-perturbative renormalization of the quark condensate in {Ginsparg}-{Wilson} regularizations}",
+	volume = "07",
+	year = "2001"
+}
+
+@article{Horsley:2004mx,
+	author = "Horsley, R. and Perlt, H. and Rakow, P. E. L. and Schierholz, G. and Schiller, A.",
+	collaboration = "QCDSF",
+	eprint = "hep-lat/0404007",
+	journal = "Nucl. Phys.",
+	pages = "3--35",
+	slaccitation = "%\%CITATION = HEP-LAT 0404007;\%\%",
+	title = "{One-loop renormalisation of quark bilinears for overlap fermions with improved gauge actions}",
+	volume = "B693",
+	year = "2004"
+}
+
+@article{Ilgenfritz:2003gw,
+	author = "Ilgenfritz, E.-M. and Kerler, W. and M{\"u}ller-Preu{\ss}ker, M. and Sternbeck, A. and St{\"u}ben, H.",
+	eprint = "hep-lat/0309057",
+	journal = "Phys. Rev.",
+	pages = "074511",
+	slaccitation = "%\%CITATION = HEP-LAT 0309057;\%\%",
+	title = "{A numerical reinvestigation of the {Aoki} phase with {N(f)} = 2 {Wilson} fermions at zero temperature}",
+	volume = "D69",
+	year = "2004"
+}
+
+@article{Ilgenfritz:2006tz,
+	author = "Ilgenfritz, E. -M. and others",
+	eprint = "hep-lat/0610112",
+	slaccitation = "%\%CITATION = HEP-LAT 0610112;\%\%",
+	title = "{Twisted mass QCD thermodynamics: First results on apeNEXT}",
+	year = "2006"
+}
+
+@article{Iwasaki:1983ck,
+	author = "Iwasaki, Y.",
+	note = "UTHEP-118",
+	title = "{Renormalization group analysis of lattice theories and improved lattice action. 2. four-dimensional nonabelian SU(N) gauge model}"
+}
+
+@article{Iwasaki:1985we,
+	author = "Iwasaki, Y.",
+	journal = "Nucl. Phys.",
+	pages = "141--156",
+	slaccitation = "%\%CITATION = NUPHA,B258,141;\%\%",
+	title = "{Renormalization group analysis of lattice theories and improved lattice action: two-dimensional nonlinear O(N) sigma model}",
+	volume = "B258",
+	year = "1985"
+}
+
+@article{Iwasaki:1992hn,
+	author = "Iwasaki, Y. and Kanaya, K. and Sakai, S. and Yoshie, T.",
+	eprint = "hep-lat/9211035",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "327--330",
+	slaccitation = "%\%CITATION = HEP-LAT 9211035;\%\%",
+	title = "{Quark confinement in multi-flavor quantum chromodynamics}",
+	volume = "30",
+	year = "1993"
+}
+
+@article{Izubuchi:1998hy,
+	author = "Izubuchi, T. and Noaki, J. and Ukawa, A.",
+	eprint = "hep-lat/9805019",
+	journal = "Phys. Rev.",
+	pages = "114507",
+	slaccitation = "%\%CITATION = HEP-LAT 9805019;\%\%",
+	title = "{Two-dimensional lattice Gross-Neveu model with {Wilson} fermion action at finite temperature and chemical potential}",
+	volume = "D58",
+	year = "1998"
+}
+
+@article{Jacobs:1983ph,
+	author = "Jacobs, L.",
+	journal = "Phys. Rev. Lett.",
+	pages = "172",
+	slaccitation = "%\%CITATION = PRLTA,51,172;\%\%",
+	title = "{Undoubling chirally symmetric lattice fermions}",
+	volume = "51",
+	year = "1983"
+}
+
+@article{Jagels:1994a,
+	author = "Jagels, C. F. and Reichel, L.",
+	journal = "Numer. Linear Algebra Appl.",
+	pages = "555--570",
+	title = "{A fast minimal residual algorithm for shifted unitary matrices}",
+	volume = "1(6)",
+	year = "1994"
+}
+
+@article{Jagels:1994aa,
+	author = "Jagels, C. F. and Reichel, L.",
+	journal = "Numerical Linear Algebra with Applications",
+	pages = "555--570",
+	title = "{A Fast Minimal Residual Algorithm for Shifted Unitary Matrices}",
+	volume = "1(6)",
+	year = "1994"
+}
+
+@article{Jansen:1994ym,
+	author = "Jansen, K.",
+	eprint = "hep-lat/9410018",
+	journal = "Phys. Rept.",
+	pages = "1--54",
+	slaccitation = "%\%CITATION = HEP-LAT 9410018;\%\%",
+	title = "{Domain wall fermions and chiral gauge theories}",
+	volume = "273",
+	year = "1996"
+}
+
+@article{Jansen:1995ck,
+	author = "Jansen, Karl and others",
+	eprint = "hep-lat/9512009",
+	journal = "Phys. Lett.",
+	pages = "275--282",
+	slaccitation = "%\%CITATION = HEP-LAT 9512009;\%\%",
+	title = "{Non-perturbative renormalization of lattice QCD at all scales}",
+	volume = "B372",
+	year = "1996"
+}
+
+@article{Jansen:1996cq,
+	author = "Jansen, K. and Liu, C.",
+	eprint = "hep-lat/9607057",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "974--976",
+	slaccitation = "%\%CITATION = HEP-LAT 9607057;\%\%",
+	title = "{Study of Liapunov exponents and the reversibility of molecular dynamics algorithms}",
+	volume = "53",
+	year = "1997"
+}
+
+@article{Jansen:1996xp,
+	author = "Jansen, K.",
+	eprint = "hep-lat/9607051",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "127--133",
+	slaccitation = "%\%CITATION = HEP-LAT 9607051;\%\%",
+	title = "{Recent developments in fermion simulation algorithms}",
+	volume = "53",
+	year = "1997"
+}
+
+@article{Jansen:1997yt,
+	author = "Jansen, K. and Liu, C.",
+	eprint = "hep-lat/9603008",
+	journal = "Comput. Phys. Commun.",
+	pages = "221--234",
+	slaccitation = "%\%CITATION = HEP-LAT 9603008;\%\%",
+	title = "{Implementation of Symanzik's improvement program for simulations of dynamical {Wilson} fermions in lattice {QCD}}",
+	volume = "99",
+	year = "1997"
+}
+
+@article{Jansen:1998mx,
+	author = "Jansen, K. and Sommer, R.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/9803017",
+	journal = "Nucl. Phys.",
+	pages = "185--203",
+	slaccitation = "%\%CITATION = HEP-LAT 9803017;\%\%",
+	title = "{O(alpha) improvement of lattice {QCD} with two flavors of {Wilson} quarks}",
+	volume = "B530",
+	year = "1998"
+}
+
+@article{Jansen:2003ir,
+	author = "Jansen, K. and Shindler, A. and Urbach, C. and Wetzorke, I.",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0312013",
+	journal = "Phys. Lett.",
+	pages = "432--438",
+	slaccitation = "%\%CITATION = HEP-LAT 0312013;\%\%",
+	title = "{Scaling test for {Wilson} twisted mass {QCD}}",
+	volume = "B586",
+	year = "2004"
+}
+
+@article{Jansen:2003jq,
+	author = "Jansen, K. and Nagai, K.-I.",
+	eprint = "hep-lat/0305009",
+	journal = "JHEP",
+	pages = "038",
+	slaccitation = "%\%CITATION = HEP-LAT 0305009;\%\%",
+	title = "{Reducing residual-mass effects for domain-wall fermions}",
+	volume = "12",
+	year = "2003"
+}
+
+@article{Jansen:2003nt,
+	author = "Jansen, K.",
+	eprint = "hep-lat/0311039",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "3--16",
+	slaccitation = "%\%CITATION = HEP-LAT 0311039;\%\%",
+	title = "{Actions for dynamical fermion simulations: Are we ready to go?}",
+	volume = "129",
+	year = "2004"
+}
+
+@article{Jansen:2005cg,
+	author = "Jansen, K. and others",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0507032",
+	journal = "Phys. Lett.",
+	pages = "334--341",
+	slaccitation = "%\%CITATION = HEP-LAT 0507032;\%\%",
+	title = "{Flavour breaking effects of {Wilson} twisted mass fermions}",
+	volume = "B624",
+	year = "2005"
+}
+
+@unpublished{Jansen:2005chi,
+	author = "Jansen, K. and others",
+	collaboration = "\xlf",
+	note = "in preparation",
+	optannote = "",
+	optkey = "",
+	optmonth = "",
+	title = "{}",
+	year = "2005"
+}
+
+@article{Jansen:2005gf,
+	author = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach, C. and Wetzorke, I.",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0503031",
+	journal = "Phys. Lett.",
+	pages = "184--191",
+	slaccitation = "%\%CITATION = HEP-LAT 0503031;\%\%",
+	title = "{Light quarks with twisted mass fermions}",
+	volume = "B619",
+	year = "2005"
+}
+
+@article{Jansen:2005kk,
+	author = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach, C. and Wetzorke, I.",
+	collaboration = "\xlf",
+	eprint = "hep-lat/0507010",
+	journal = "JHEP",
+	pages = "071",
+	slaccitation = "%\%CITATION = HEP-LAT 0507010;\%\%",
+	title = "{Quenched scaling of {Wilson} twisted mass fermions}",
+	volume = "09",
+	year = "2005"
+}
+
+@article{Jansen:2005yp,
+	author = "Jansen, Karl and Shindler, Andrea and Urbach, Carsten and Wenger, Urs",
+	eprint = "hep-lat/0510064",
+	journal = "PoS",
+	pages = "118",
+	slaccitation = "%\%CITATION = HEP-LAT 0510064;\%\%",
+	title = "{{HMC} algorithm with multiple time scale integration and mass preconditioning}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{Jansen:2006ks,
+	author = "Jansen, Karl",
+	eprint = "hep-lat/0609012",
+	slaccitation = "%\%CITATION = HEP-LAT 0609012;\%\%",
+	title = "{Status report on ILDG activities}",
+	year = "2006"
+}
+
+@article{Jansen:2006rf,
+	author = "Jansen, Karl and Urbach, Carsten",
+	collaboration = "ETM",
+	eprint = "hep-lat/0610015",
+	slaccitation = "%\%CITATION = HEP-LAT 0610015;\%\%",
+	title = "{First results with two light flavours of quarks with maximally twisted mass}",
+	year = "2006"
+}
+
+@article{Jansen:2008wv,
+	archiveprefix = "arXiv",
+	author = "Jansen, K. and Michael, C. and Urbach, C.",
+	collaboration = "ETM",
+	eprint = "0804.3871",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0804.3871;\%\%",
+	title = "{The eta' meson from lattice {QCD}}",
+	year = "2008"
+}
+
+@article{Jansen:2008zz,
+	author = "Jansen, K. and Michael, C. and Urbach, C.",
+	doi = "10.1140/epjc/s10052-008-0764-6",
+	journal = "Eur. Phys. J.",
+	pages = "261--269",
+	slaccitation = "%\%CITATION = EPHJA,C58,261;\%\%",
+	title = "{The eta-prime meson from lattice QCD}",
+	volume = "C58",
+	year = "2008"
+}
+
+@unpublished{Jegerlehner:1996pm,
+	author = "Jegerlehner, Beat",
+	eprint = "hep-lat/9612014",
+	note = "unpublished",
+	slaccitation = "%\%CITATION = HEP-LAT 9612014;\%\%",
+	title = "{Krylov space solvers for shifted linear systems}",
+	year = "1996"
+}
+
+@article{Jegerlehner:1997rn,
+	author = "Jegerlehner, B.",
+	eprint = "hep-lat/9708029",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "958--960",
+	slaccitation = "%\%CITATION = HEP-LAT 9708029;\%\%",
+	title = "{Multiple mass solvers}",
+	volume = "63",
+	year = "1998"
+}
+
+@article{Jegerlehner:2003qp,
+	author = "Jegerlehner, F.",
+	eprint = "hep-ph/0310234",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "325--334",
+	slaccitation = "%\%CITATION = HEP-PH 0310234;\%\%",
+	title = "{Theoretical precision in estimates of the hadronic contributions to (g-2)mu and alpha(QED)(M(Z))}",
+	volume = "126",
+	year = "2004"
+}
+
+@article{Jenkins:1990jv,
+	author = "Jenkins, Elizabeth Ellen and Manohar, Aneesh V.",
+	journal = "Phys. Lett.",
+	pages = "558--562",
+	slaccitation = "%\%CITATION = PHLTA,B255,558;\%\%",
+	title = "{Baryon chiral perturbation theory using a heavy fermion Lagrangian}",
+	volume = "B255",
+	year = "1991"
+}
+
+@article{Kaiser:1998ds,
+	author = "Kaiser, Roland and Leutwyler, H.",
+	eprint = "hep-ph/9806336",
+	slaccitation = "%\%CITATION = HEP-PH/9806336;\%\%",
+	title = "{Pseudoscalar decay constants at large N(c)}",
+	year = "1998"
+}
+
+@article{Kalkreuter:1995mm,
+	author = "Kalkreuter, Thomas and Simma, Hubert",
+	eprint = "hep-lat/9507023",
+	journal = "Comput. Phys. Commun.",
+	pages = "33--47",
+	slaccitation = "%\%CITATION = HEP-LAT 9507023;\%\%",
+	title = "{An Accelerated conjugate gradient algorithm to compute low lying eigenvalues: A Study for the Dirac operator in SU(2) lattice QCD}",
+	volume = "93",
+	year = "1996"
+}
+
+@article{Kalkreuter:1996mm,
+	author = "Kalkreuter, T. and Simma, H.",
+	eprint = "hep-lat/9507023",
+	journal = "Comput. Phys. Commun.",
+	pages = "33--47",
+	slaccitation = "%\%CITATION = HEP-LAT 9507023;\%\%",
+	title = "{An Accelerated conjugate gradient algorithm to compute low lying eigenvalues: A Study for the Dirac operator in SU(2) lattice {QCD}}",
+	volume = "93",
+	year = "1996"
+}
+
+@article{Kamleh:2005wg,
+	author = "Kamleh, W. and Peardon, M. J.",
+	collaboration = "TrinLat",
+	journal = "PoS",
+	pages = "106",
+	slaccitation = "%\%CITATION = POSCI,LAT2005,106;\%\%",
+	title = "{Polynomial filtering for HMC in lattice QCD}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{Kaplan:1992bt,
+	author = "Kaplan, D. B.",
+	eprint = "hep-lat/9206013",
+	journal = "Phys. Lett.",
+	pages = "342--347",
+	slaccitation = "%\%CITATION = HEP-LAT 9206013;\%\%",
+	title = "{A Method for simulating chiral fermions on the lattice}",
+	volume = "B288",
+	year = "1992"
+}
+
+@article{Karsten:1980wd,
+	author = "Karsten, L. H. and Smit, J.",
+	journal = "Nucl. Phys.",
+	pages = "103",
+	slaccitation = "%\%CITATION = NUPHA,B183,103;\%\%",
+	title = "{Lattice fermions: species doubling, chiral invariance, and the triangle anomaly}",
+	volume = "B183",
+	year = "1981"
+}
+
+@article{Kennedy:1990bv,
+	author = "Kennedy, A. D. and Pendleton, B.",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "118--121",
+	slaccitation = "%\%CITATION = NUPHZ,20,118;\%\%",
+	title = "{Acceptances and autocorrelations in hybrid Monte Carlo}",
+	volume = "20",
+	year = "1991"
+}
+
+@article{Knechtli:1998gf,
+	author = "Knechtli, F. and Sommer, R.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/9807022",
+	journal = "Phys. Lett.",
+	pages = "345--352",
+	slaccitation = "%\%CITATION = HEP-LAT 9807022;\%\%",
+	title = "{String breaking in SU(2) gauge theory with scalar matter fields}",
+	volume = "B440",
+	year = "1998"
+}
+
+@article{Knechtli:2000df,
+	author = "Knechtli, F. and Sommer, R.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0005021",
+	journal = "Nucl. Phys.",
+	pages = "309--328",
+	slaccitation = "%\%CITATION = HEP-LAT 0005021;\%\%",
+	title = "{String breaking as a mixing phenomenon in the SU(2) Higgs model}",
+	volume = "B590",
+	year = "2000"
+}
+
+@article{Lacock:1994qx,
+	author = "Lacock, P. and McKerrell, A. and Michael, C. and Stopher, I. M. and Stephenson, P. W.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/9412079",
+	journal = "Phys. Rev.",
+	pages = "6403--6410",
+	slaccitation = "%\%CITATION = HEP-LAT 9412079;\%\%",
+	title = "{Efficient hadronic operators in lattice gauge theory}",
+	volume = "D51",
+	year = "1995"
+}
+
+@article{Lepage:1992xa,
+	author = "Lepage, G. Peter and Mackenzie, Paul B.",
+	eprint = "hep-lat/9209022",
+	journal = "Phys. Rev.",
+	pages = "2250--2264",
+	slaccitation = "%\%CITATION = HEP-LAT 9209022;\%\%",
+	title = "{On the viability of lattice perturbation theory}",
+	volume = "D48",
+	year = "1993"
+}
+
+@article{Lepage:2001ym,
+	archiveprefix = "arXiv",
+	author = "Lepage, G. P. and others",
+	doi = "10.1016/S0920-5632(01)01638-3",
+	eprint = "hep-lat/0110175",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "12--20",
+	slaccitation = "%\%CITATION = HEP-LAT/0110175;\%\%",
+	title = "{Constrained curve fitting}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Lesk:2002gd,
+	author = "Lesk, V. I. and others",
+	collaboration = "CP-PACS",
+	eprint = "hep-lat/0211040",
+	journal = "Phys. Rev.",
+	pages = "074503",
+	slaccitation = "%\%CITATION = HEP-LAT/0211040;\%\%",
+	title = "{Flavor singlet meson mass in the continuum limit in two-flavor lattice QCD}",
+	volume = "D67",
+	year = "2003"
+}
+
+@article{Leutwyler:1997yr,
+	author = "Leutwyler, H.",
+	eprint = "hep-ph/9709408",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "223--231",
+	slaccitation = "%\%CITATION = HEP-PH/9709408;\%\%",
+	title = "{On the 1/N-expansion in chiral perturbation theory}",
+	volume = "64",
+	year = "1998"
+}
+
+@article{Leutwyler:2006qq,
+	author = "Leutwyler, H.",
+	eprint = "hep-ph/0612112",
+	slaccitation = "%\%CITATION = HEP-PH 0612112;\%\%",
+	title = "{pi pi scattering}",
+	year = "2006"
+}
+
+@article{Liu:1997fs,
+	author = "Liu, C. and Jaster, A. and Jansen, K.",
+	eprint = "hep-lat/9708017",
+	journal = "Nucl. Phys.",
+	pages = "603--617",
+	slaccitation = "%\%CITATION = HEP-LAT 9708017;\%\%",
+	title = "{Liapunov exponents and the reversibility of molecular dynamics algorithms}",
+	volume = "B524",
+	year = "1998"
+}
+
+@article{Luscher:1985dn,
+	author = "L{\"u}scher, M.",
+	doi = "10.1007/BF01211589",
+	journal = "Commun. Math. Phys.",
+	pages = "177",
+	slaccitation = "%\%CITATION = CMPHA,104,177;\%\%",
+	title = "{Volume Dependence of the Energy Spectrum in Massive Quantum Field Theories. 1. Stable Particle States}",
+	volume = "104",
+	year = "1986"
+}
+
+@article{Luscher:1990ck,
+	author = "L{\"u}scher, M. and Wolff, U.",
+	journal = "Nucl. Phys.",
+	pages = "222--252",
+	slaccitation = "%\%CITATION = NUPHA,B339,222;\%\%",
+	title = "{How to calculate the elastic scattering matrix in two-dimensional quantum field theories by numerical simulation}",
+	volume = "B339",
+	year = "1990"
+}
+
+@article{Luscher:1993dy,
+	archiveprefix = "arXiv",
+	author = "L{\"u}scher, Martin",
+	doi = "10.1016/0010-4655(94)90232-1",
+	eprint = "hep-lat/9309020",
+	journal = "Comput. Phys. Commun.",
+	pages = "100--110",
+	slaccitation = "%\%CITATION = HEP-LAT/9309020;\%\%",
+	title = "{A Portable high quality random number generator for lattice field theory simulations}",
+	volume = 79,
+	year = 1994
+}
+
+@article{Luscher:1993xx,
+	archiveprefix = "arXiv",
+	author = "L{\"u}scher, Martin",
+	doi = "10.1016/0550-3213(94)90533-9",
+	eprint = "hep-lat/9311007",
+	journal = "Nucl. Phys.",
+	pages = "637--648",
+	slaccitation = "%\%CITATION = HEP-LAT/9311007;\%\%",
+	title = "{A New approach to the problem of dynamical quarks in numerical simulations of lattice {QCD}}",
+	volume = "B418",
+	year = "1994"
+}
+
+@article{Luscher:1993xx,
+	author = "L{\"u}scher, M.",
+	eprint = "hep-lat/9311007",
+	journal = "Nucl. Phys.",
+	pages = "637--648",
+	slaccitation = "%\%CITATION = HEP-LAT 9311007;\%\%",
+	title = "{A New approach to the problem of dynamical quarks in numerical simulations of lattice {QCD}}",
+	volume = "B418",
+	year = "1994"
+}
+
+@article{Luscher:1996sc,
+	author = "L{\"u}scher, M. and Sint, S. and Sommer, R. and Weisz, P.",
+	eprint = "hep-lat/9605038",
+	journal = "Nucl. Phys.",
+	pages = "365--400",
+	slaccitation = "%\%CITATION = HEP-LAT 9605038;\%\%",
+	title = "{Chiral symmetry and {O(a)} improvement in lattice {QCD}}",
+	volume = "B478",
+	year = "1996"
+}
+
+@article{Luscher:1996ug,
+	author = "L{\"u}scher, M. and Sint, S. and Sommer, R. and Weisz, P. and Wolff, U.",
+	eprint = "hep-lat/9609035",
+	journal = "Nucl. Phys.",
+	pages = "323--343",
+	slaccitation = "%\%CITATION = HEP-LAT 9609035;\%\%",
+	title = "{Non-perturbative {O(a)} improvement of lattice {QCD}}",
+	volume = "B491",
+	year = "1997"
+}
+
+@article{Luscher:1998pq,
+	author = "L{\"u}scher, M.",
+	eprint = "hep-lat/9802011",
+	journal = "Phys. Lett.",
+	pages = "342--345",
+	slaccitation = "%\%CITATION = HEP-LAT 9802011;\%\%",
+	title = "{Exact chiral symmetry on the lattice and the {Ginsparg}-{Wilson} relation}",
+	volume = "B428",
+	year = "1998"
+}
+
+@article{Luscher:2001tx,
+	archiveprefix = "arXiv",
+	author = "L{\"u}scher, Martin",
+	doi = "10.1016/S0920-5632(01)01639-5",
+	eprint = "hep-lat/0110007",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "21--28",
+	slaccitation = "%\%CITATION = HEP-LAT/0110007;\%\%",
+	title = "{Lattice QCD on PCs?}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Luscher:2003qa,
+	author = "L{\"u}scher, M.",
+	eprint = "hep-lat/0310048",
+	journal = "Comput. Phys. Commun.",
+	pages = "209--220",
+	slaccitation = "%\%CITATION = HEP-LAT 0310048;\%\%",
+	title = "{Solution of the {D}irac equation in lattice {QCD} using a domain decomposition method}",
+	volume = "156",
+	year = "2004"
+}
+
+@article{Luscher:2004rx,
+	author = "L{\"u}scher, M.",
+	eprint = "hep-lat/0409106",
+	journal = "Comput. Phys. Commun.",
+	pages = "199",
+	slaccitation = "%\%CITATION = HEP-LAT 0409106;\%\%",
+	title = "{Schwarz-preconditioned {HMC} algorithm for two-flavour lattice {QCD}}",
+	volume = "165",
+	year = "2005"
+}
+
+@article{Luscher:2005mv,
+	author = "L{\"u}scher, Martin",
+	eprint = "hep-lat/0509152",
+	howpublished = "Talk presented at International Symposium on Lattice Field Theory (Lattice 2005)",
+	journal = "\href{http://pos.sissa.it/archive/conferences/020/008/LAT2005\_002.pdf}{PoS(LAT2005)002}",
+	slaccitation = "%\%CITATION = HEP-LAT 0509152;\%\%",
+	title = "{Lattice {QCD} with light {W}ilson quarks}",
+	year = "2005"
+}
+
+@article{Luscher:2007es,
+	archiveprefix = "arXiv",
+	author = "L{\"u}scher, Martin",
+	doi = "10.1088/1126-6708/2007/12/011",
+	eprint = "0710.5417",
+	journal = "JHEP",
+	pages = "011",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.5417;\%\%",
+	title = "{Deflation acceleration of lattice {QCD} simulations}",
+	volume = "12",
+	year = "2007"
+}
+
+@article{Luscher:ranluxweb,
+	author = "L{\"u}scher, M.",
+	eprint = "http://luscher.web.cern.ch/luscher/ranlux/",
+	title = "{Ranlux random number generator}"
+}
+
+@article{Luscher:sse,
+	author = "L{\"u}scher, M.",
+	eprint = "http://luscher.web.cern.ch/luscher/QCDpbm/",
+	title = "{Lattice QCD parallel benchmark programs}"
+}
+
+@article{Madras:1988ei,
+	author = "Madras, N. and Sokal, A. D.",
+	journal = "J. Statist. Phys.",
+	pages = "109--186",
+	slaccitation = "%\%CITATION = JSTPB,50,109;\%\%",
+	title = "{The Pivot algorithm: a highly efficient Monte Carlo method for selfavoiding walk}",
+	volume = "50",
+	year = "1988"
+}
+
+@article{Martinelli:1982mw,
+	author = "Martinelli, G. and Zhang, Yi-Cheng",
+	journal = "Phys. Lett.",
+	pages = "433",
+	slaccitation = "%\%CITATION = PHLTA,B123,433;\%\%",
+	title = "{THE CONNECTION BETWEEN LOCAL OPERATORS ON THE LATTICE AND IN THE CONTINUUM AND ITS RELATION TO MESON DECAY CONSTANTS}",
+	volume = "B123",
+	year = "1983"
+}
+
+@article{Martinelli:1994ty,
+	archiveprefix = "arXiv",
+	author = "Martinelli, G. and Pittori, C. and Sachrajda, Christopher T. and Testa, M. and Vladikas, A.",
+	doi = "10.1016/0550-3213(95)00126-D",
+	eprint = "hep-lat/9411010",
+	journal = "Nucl. Phys.",
+	pages = "81--108",
+	slaccitation = "%\%CITATION = HEP-LAT/9411010;\%\%",
+	title = "{A General method for nonperturbative renormalization of lattice operators}",
+	volume = "B445",
+	year = "1995"
+}
+
+@article{McNeile:2000hf,
+	author = "McNeile, C. and Michael, C.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0006020",
+	journal = "Phys. Lett.",
+	pages = "123--129",
+	slaccitation = "%\%CITATION = HEP-LAT 0006020;\%\%",
+	title = "{The eta and eta' mesons in {QCD}}",
+	volume = "B491",
+	year = "2000"
+}
+
+@article{McNeile:2000xx,
+	author = "McNeile, Craig and Michael, Chris",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0010019",
+	journal = "Phys. Rev.",
+	pages = "114503",
+	slaccitation = "%\%CITATION = HEP-LAT0010019;\%\%",
+	title = "{Mixing of scalar glueballs and flavour-singlet scalar mesons}",
+	volume = "D63",
+	year = "2001"
+}
+
+@article{McNeile:2001cr,
+	author = "McNeile, C. and Michael, C. and Sharkey, K. J.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0107003",
+	journal = "Phys. Rev.",
+	pages = "014508",
+	slaccitation = "%\%CITATION = HEP-LAT 0107003;\%\%",
+	title = "{The flavor singlet mesons in {QCD}}",
+	volume = "D65",
+	year = "2002"
+}
+
+@article{McNeile:2002fh,
+	author = "McNeile, C. and Michael, C.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0212020",
+	journal = "Phys. Lett.",
+	pages = "177--184",
+	slaccitation = "%\%CITATION = HEP-LAT 0212020;\%\%",
+	title = "{Hadronic decay of a vector meson from the lattice}",
+	volume = "B556",
+	year = "2003"
+}
+
+@article{McNeile:2006bz,
+	author = "McNeile, C. and Michael, C.",
+	collaboration = "UKQCD",
+	eprint = "hep-lat/0603007",
+	journal = "Phys. Rev.",
+	pages = "074506",
+	slaccitation = "%\%CITATION = HEP-LAT 0603007;\%\%",
+	title = "{Decay width of light quark hybrid meson from the lattice}",
+	volume = "D73",
+	year = "2006"
+}
+
+@article{Meyer:2006ty,
+	archiveprefix = "arXiv",
+	author = "Meyer, Harvey B. and others",
+	doi = "10.1016/j.cpc.2006.08.002",
+	eprint = "hep-lat/0606004",
+	journal = "Comput. Phys. Commun.",
+	pages = "91--97",
+	slaccitation = "%\%CITATION = HEP-LAT/0606004;\%\%",
+	title = "{Exploring the HMC trajectory-length dependence of autocorrelation times in lattice QCD}",
+	volume = "176",
+	year = "2007"
+}
+
+@article{Michael:1982gb,
+	author = "Michael, C. and Teasdale, I.",
+	journal = "Nucl. Phys.",
+	pages = "433",
+	slaccitation = "%\%CITATION = NUPHA,B215,433;\%\%",
+	title = "{EXTRACTING GLUEBALL MASSES FROM LATTICE QCD}",
+	volume = "B215",
+	year = "1983"
+}
+
+@article{Michael:1989mf,
+	author = "Michael, C.",
+	journal = "Nucl. Phys.",
+	pages = "515",
+	slaccitation = "%\%CITATION = NUPHA,B327,515;\%\%",
+	title = "{Particle decay in lattice gauge theory}",
+	volume = "B327",
+	year = "1989"
+}
+
+@article{Michael:1991nc,
+	author = "Michael, C.",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "417--419",
+	slaccitation = "%\%CITATION = NUPHZ,26,417;\%\%",
+	title = "{Hadronic forces from the lattice}",
+	volume = "26",
+	year = "1992"
+}
+
+@article{Michael:1993yj,
+	archiveprefix = "arXiv",
+	author = "Michael, Christopher",
+	doi = "10.1103/PhysRevD.49.2616",
+	eprint = "hep-lat/9310026",
+	journal = "Phys. Rev.",
+	pages = "2616--2619",
+	slaccitation = "%\%CITATION = HEP-LAT/9310026;\%\%",
+	title = "{Fitting correlated data}",
+	volume = "D49",
+	year = "1994"
+}
+
+@article{Michael:1994sz,
+	archiveprefix = "arXiv",
+	author = "Michael, Christopher and McKerrell, A.",
+	doi = "10.1103/PhysRevD.51.3745",
+	eprint = "hep-lat/9412087",
+	journal = "Phys. Rev.",
+	pages = "3745--3750",
+	slaccitation = "%\%CITATION = HEP-LAT/9412087;\%\%",
+	title = "{Fitting correlated hadron mass spectrum data}",
+	volume = "D51",
+	year = "1995"
+}
+
+@article{Michael:2007vn,
+	archiveprefix = "arXiv",
+	author = "Michael, C. and Urbach, C.",
+	collaboration = "ETM",
+	eprint = "0709.4564",
+	journal = "",
+	pages = "",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = ARXIV:0709.4564;\%\%",
+	title = "{Neutral mesons and disconnected diagrams in Twisted Mass QCD}",
+	volume = "",
+	year = "2007"
+}
+
+@book{Montvay:1994cy,
+	author = "Montvay, I. and M{\"u}nster, G.",
+	publisher = "Cambridge University Press",
+	series = "{Cambridge Monographs on Mathematical Physics}",
+	title = "{Quantum fields on a lattice}",
+	year = "1994"
+}
+
+@article{Montvay:1995ea,
+	author = "Montvay, I.",
+	eprint = "hep-lat/9510042",
+	journal = "Nucl. Phys.",
+	pages = "259--284",
+	slaccitation = "%\%CITATION = HEP-LAT 9510042;\%\%",
+	title = "{An Algorithm for Gluinos on the Lattice}",
+	volume = "B466",
+	year = "1996"
+}
+
+@article{Montvay:2005tj,
+	author = "Montvay, I. and Scholz, E.",
+	eprint = "hep-lat/0506006",
+	journal = "Phys. Lett.",
+	pages = "73--79",
+	slaccitation = "%\%CITATION = HEP-LAT 0506006;\%\%",
+	title = "{Updating algorithms with multi-step stochastic correction}",
+	volume = "B623",
+	year = "2005"
+}
+
+@article{Morgan:2002a,
+	author = "Morgan, R. B.",
+	journal = "SIAM J. Sci. Comput.",
+	pages = "20",
+	title = "{GMRES with Deflated Restarting}",
+	volume = "24",
+	year = "2002"
+}
+
+@article{Morningstar:2003gk,
+	archiveprefix = "arXiv",
+	author = "Morningstar, Colin and Peardon, Mike J.",
+	doi = "10.1103/PhysRevD.69.054501",
+	eprint = "hep-lat/0311018",
+	journal = "Phys. Rev.",
+	pages = "054501",
+	slaccitation = "%\%CITATION = HEP-LAT/0311018;\%\%",
+	title = "{Analytic smearing of SU(3) link variables in lattice QCD}",
+	volume = "D69",
+	year = "2004"
+}
+
+@article{Munster:2004am,
+	author = "M{\"u}nster, G.",
+	eprint = "hep-lat/0407006",
+	journal = "JHEP",
+	pages = "035",
+	slaccitation = "%\%CITATION = HEP-LAT 0407006;\%\%",
+	title = "{On the phase structure of twisted mass lattice {QCD}}",
+	volume = "09",
+	year = "2004"
+}
+
+@article{Munster:2004wt,
+	author = "M{\"u}nster, Gernot and Schmidt, Christian and Scholz, Enno E.",
+	eprint = "hep-lat/0409066",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "320--322",
+	slaccitation = "%\%CITATION = HEP-LAT 0409066;\%\%",
+	title = "{Chiral perturbation theory for twisted mass {QCD}}",
+	volume = "140",
+	year = "2005"
+}
+
+@article{Nagai:2005mi,
+	author = "Nagai, Kei-ichi and Jansen, Karl",
+	eprint = "hep-lat/0510076",
+	journal = "Phys. Lett.",
+	pages = "325--330",
+	slaccitation = "%\%CITATION = HEP-LAT 0510076;\%\%",
+	title = "{Two-dimensional lattice Gross-Neveu model with Wilson twisted mass fermions}",
+	volume = "B633",
+	year = "2006"
+}
+
+@unpublished{Nagai:priv,
+	author = "Nagai, K",
+	note = "private communication",
+	optannote = "",
+	optkey = "",
+	optmonth = "",
+	optyear = "",
+	title = "{Two-dimensional Gross-Neveu model with {Wilson} twisted mass fermions}"
+}
+
+@article{Necco:2001xg,
+	author = "Necco, S. and Sommer, R.",
+	eprint = "hep-lat/0108008",
+	journal = "Nucl. Phys.",
+	pages = "328--346",
+	slaccitation = "%\%CITATION = HEP-LAT 0108008;\%\%",
+	title = "{The {N(f)} = 0 heavy quark potential from short to intermediate distances}",
+	volume = "B622",
+	year = "2002"
+}
+
+@article{Necco:2003vh,
+	author = "Necco, Silvia",
+	eprint = "hep-lat/0309017",
+	journal = "Nucl. Phys.",
+	pages = "137--167",
+	slaccitation = "%\%CITATION = HEP-LAT 0309017;\%\%",
+	volume = "B683",
+	year = "2004"
+}
+
+@article{Neff:2001zr,
+	author = "Neff, H. and Eicker, N. and Lippert, T. and Negele, J. W. and Schilling, K.",
+	eprint = "hep-lat/0106016",
+	journal = "Phys. Rev.",
+	pages = "114509",
+	slaccitation = "%\%CITATION = HEP-LAT/0106016;\%\%",
+	title = "{On the low fermionic eigenmode dominance in {QCD} on the lattice}",
+	volume = "D64",
+	year = "2001"
+}
+
+@article{Neuberger:1997fp,
+	author = "Neuberger, H.",
+	eprint = "hep-lat/9707022",
+	journal = "Phys. Lett.",
+	pages = "141--144",
+	slaccitation = "%\%CITATION = HEP-LAT 9707022;\%\%",
+	title = "{Exactly massless quarks on the lattice}",
+	volume = "B417",
+	year = "1998"
+}
+
+@article{Neuberger:1998wv,
+	author = "Neuberger, H.",
+	eprint = "hep-lat/9801031",
+	journal = "Phys. Lett.",
+	pages = "353--355",
+	slaccitation = "%\%CITATION = HEP-LAT 9801031;\%\%",
+	title = "{More about exactly massless quarks on the lattice}",
+	volume = "B427",
+	year = "1998"
+}
+
+@article{Niedermayer:1998bi,
+	author = "Niedermayer, F.",
+	eprint = "hep-lat/9810026",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "105--119",
+	slaccitation = "%\%CITATION = HEP-LAT 9810026;\%\%",
+	title = "{Exact chiral symmetry, topological charge and related topics}",
+	volume = "73",
+	year = "1999"
+}
+
+@article{Nielsen:1980rz,
+	author = "Nielsen, H. B. and Ninomiya, M.",
+	journal = "Nucl. Phys.",
+	pages = "20",
+	slaccitation = "%\%CITATION = NUPHA,B185,20;\%\%",
+	title = "{Absence of neutrinos on a lattice. 1. proof by homotopy theory}",
+	volume = "B185",
+	year = "1981"
+}
+
+@article{Nielsen:1981hk,
+	author = "Nielsen, H. B. and Ninomiya, M.",
+	journal = "Phys. Lett.",
+	pages = "219",
+	slaccitation = "%\%CITATION = PHLTA,B105,219;\%\%",
+	title = "{No go theorem for regularizing chiral fermions}",
+	volume = "B105",
+	year = "1981"
+}
+
+@article{Nielsen:1981xu,
+	author = "Nielsen, H. B. and Ninomiya, M.",
+	journal = "Nucl. Phys.",
+	pages = "173",
+	slaccitation = "%\%CITATION = NUPHA,B193,173;\%\%",
+	title = "{Absence of neutrinos on a lattice. 2. intuitive topological proof}",
+	volume = "B193",
+	year = "1981"
+}
+
+@article{Noaki:1998zc,
+	author = "Noaki, J. and Izubuchi, T. and Ukawa, A.",
+	eprint = "hep-lat/9809071",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "483--485",
+	slaccitation = "%\%CITATION = HEP-LAT 9809071;\%\%",
+	title = "{Two-dimensional Gross-Neveu model with {Wilson} fermion action at finite temperature and density}",
+	volume = "73",
+	year = "1999"
+}
+
+@article{Orginos:2001xa,
+	author = "Orginos, K.",
+	collaboration = "RBC",
+	eprint = "hep-lat/0110074",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "721--723",
+	slaccitation = "%\%CITATION = HEP-LAT 0110074;\%\%",
+	title = "{Chiral properties of domain wall fermions with improved gauge actions}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Orth:2005kq,
+	author = "Orth, B. and Lippert, T. and Schilling, K.",
+	eprint = "hep-lat/0503016",
+	journal = "Phys. Rev.",
+	pages = "014503",
+	slaccitation = "%\%CITATION = HEP-LAT 0503016;\%\%",
+	title = "{Finite-size effects in lattice {QCD} with dynamical {Wilson} fermions}",
+	volume = "D72",
+	year = "2005"
+}
+
+@article{Osterwalder:1973dx,
+	author = "Osterwalder, K. and Schrader, R.",
+	journal = "Commun. Math. Phys.",
+	pages = "83--112",
+	slaccitation = "%\%CITATION = CMPHA,31,83;\%\%",
+	title = "{Axioms for euclidean Green's functions}",
+	volume = "31",
+	year = "1973"
+}
+
+@article{Osterwalder:1975tc,
+	author = "Osterwalder, K. and Schrader, R.",
+	journal = "Commun. Math. Phys.",
+	pages = "281",
+	slaccitation = "%\%CITATION = CMPHA,42,281;\%\%",
+	title = "{Axioms for euclidean Green's functions. 2}",
+	volume = "42",
+	year = "1975"
+}
+
+@article{Osterwalder:1977pc,
+	author = "Osterwalder, K. and Seiler, E.",
+	journal = "Ann. Phys.",
+	pages = "440",
+	slaccitation = "%\%CITATION = APNYA,110,440;\%\%",
+	title = "{Gauge field theories on the lattice}",
+	volume = "110",
+	year = "1978"
+}
+
+@article{PDBook,
+	author = "Eidelman, S. and others",
+	journal = "{Physics Letters B}",
+	pages = "1+",
+	title = "{Review of Particle Physics}",
+	url = "http://pdg.lbl.gov",
+	volume = "592",
+	year = "2004"
+}
+
+@article{Peardon:2002wb,
+	author = "Peardon, M. J. and Sexton, J.",
+	collaboration = "TrinLat",
+	eprint = "hep-lat/0209037",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "985--987",
+	slaccitation = "%\%CITATION = HEP-LAT 0209037;\%\%",
+	title = "{Multiple molecular dynamics time-scales in hybrid Monte Carlo fermion simulations}",
+	volume = "119",
+	year = "2003"
+}
+
+@book{Peskin:1995ev,
+	author = "Peskin, M. E. and Schroeder, D. V.",
+	optaddress = "Boulder, Colorado",
+	optannote = "",
+	optedition = "",
+	optkey = "",
+	optmonth = "",
+	optnote = "",
+	optnumber = "",
+	optseries = "Advanced Book Program",
+	optvolume = "",
+	publisher = "Westview Press",
+	title = "{An Introduction to quantum field theory}",
+	year = "1995"
+}
+
+@article{Politzer:1973fx,
+	author = "Politzer, H. D.",
+	journal = "Phys. Rev. Lett.",
+	pages = "1346--1349",
+	slaccitation = "%\%CITATION = PRLTA,30,1346;\%\%",
+	title = "{Reliable perturbative results for strong interactions?}",
+	volume = "30",
+	year = "1973"
+}
+
+@article{Politzer:1974fr,
+	author = "Politzer, H. D.",
+	journal = "Phys. Rept.",
+	pages = "129--180",
+	slaccitation = "%\%CITATION = PRPLC,14,129;\%\%",
+	title = "{Asymptotic freedom: an approach to strong interactions}",
+	volume = "14",
+	year = "1974"
+}
+
+@manual{R:2005,
+	address = "Vienna, Austria",
+	author = "{R Development Core Team}",
+	note = "{ISBN} 3-900051-07-0",
+	organization = "R Foundation for Statistical Computing",
+	title = "{R: A language and environment for statistical computing}",
+	url = "http://www.R-project.org",
+	year = "2005"
+}
+
+@book{Rothe:1992wy,
+	author = "Rothe, H.J.",
+	edition = "",
+	pages = "528",
+	publisher = "World Scientific, Singapore",
+	title = "{Lattice gauge theories}",
+	year = "1992"
+}
+
+@article{Rupak:2002sm,
+	author = "Rupak, G. and Shoresh, N.",
+	eprint = "hep-lat/0201019",
+	journal = "Phys. Rev.",
+	pages = "054503",
+	slaccitation = "%\%CITATION = HEP-LAT 0201019;\%\%",
+	title = "{Chiral perturbation theory for the {Wilson} lattice action}",
+	volume = "D66",
+	year = "2002"
+}
+
+@article{Saad:1993a,
+	author = "Saad, Y.",
+	journal = "SIAM J. Sci. Comput.",
+	pages = "461--469",
+	title = "{A flexible inner-outer preconditioned GMRES algorithm}",
+	volume = "14 (2)",
+	year = "1993"
+}
+
+@article{Sachrajda:2004mi,
+	archiveprefix = "arXiv",
+	author = "Sachrajda, C. T. and Villadoro, G.",
+	doi = "10.1016/j.physletb.2005.01.033",
+	eprint = "hep-lat/0411033",
+	journal = "Phys. Lett.",
+	pages = "73--85",
+	slaccitation = "%\%CITATION = HEP-LAT/0411033;\%\%",
+	title = "{Twisted boundary conditions in lattice simulations}",
+	volume = "B609",
+	year = "2005"
+}
+
+@article{Scorzato:2004da,
+	author = "Scorzato, L.",
+	eprint = "hep-lat/0407023",
+	journal = "Eur. Phys. J.",
+	pages = "445--455",
+	slaccitation = "%\%CITATION = HEP-LAT 0407023;\%\%",
+	title = "{Pion mass splitting and phase structure in twisted mass {QCD}}",
+	volume = "C37",
+	year = "2004"
+}
+
+@article{Scorzato:2005rb,
+	author = "Scorzato, L. and others",
+	eprint = "hep-lat/0511036",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "283--290",
+	slaccitation = "%\%CITATION = HEP-LAT 0511036;\%\%",
+	title = "{N(f) = 2 lattice {QCD} and chiral perturbation theory}",
+	volume = "153",
+	year = "2006"
+}
+
+@article{Sexton:1992nu,
+	author = "Sexton, J. C. and Weingarten, D. H.",
+	journal = "Nucl. Phys.",
+	pages = "665--678",
+	slaccitation = "%\%CITATION = NUPHA,B380,665;\%\%",
+	title = "{Hamiltonian evolution for the hybrid monte carlo algorithm}",
+	volume = "B380",
+	year = "1992"
+}
+
+@article{Sharpe:1998xm,
+	author = "Sharpe, S. R. and Singleton, R. Jr.",
+	eprint = "hep-lat/9804028",
+	journal = "Phys. Rev.",
+	pages = "074501",
+	slaccitation = "%\%CITATION = HEP-LAT 9804028;\%\%",
+	title = "{Spontaneous flavor and parity breaking with {Wilson} fermions}",
+	volume = "D58",
+	year = "1998"
+}
+
+@article{Sharpe:2004ny,
+	author = "Sharpe, S. R. and Wu, Jackson M. S.",
+	eprint = "hep-lat/0411021",
+	journal = "Phys. Rev.",
+	pages = "074501",
+	slaccitation = "%\%CITATION = HEP-LAT 0411021;\%\%",
+	title = "{Twisted mass chiral perturbation theory at next-to-leading order}",
+	volume = "D71",
+	year = "2005"
+}
+
+@article{Sharpe:2004ps,
+	author = "Sharpe, S. R. and Wu, J. M. S.",
+	eprint = "hep-lat/0407025",
+	journal = "Phys. Rev.",
+	pages = "094029",
+	slaccitation = "%\%CITATION = HEP-LAT 0407025;\%\%",
+	title = "{The phase diagram of twisted mass lattice {QCD}}",
+	volume = "D70",
+	year = "2004"
+}
+
+@article{Sharpe:2005rq,
+	author = "Sharpe, Stephen R.",
+	eprint = "hep-lat/0509009",
+	journal = "Phys. Rev.",
+	pages = "074510",
+	slaccitation = "%\%CITATION = HEP-LAT 0509009;\%\%",
+	title = "{Observations on discretization errors in twisted-mass lattice QCD}",
+	volume = "D72",
+	year = "2005"
+}
+
+@article{Sheikholeslami:1985ij,
+	author = "Sheikholeslami, B. and Wohlert, R.",
+	journal = "Nucl. Phys.",
+	pages = "572",
+	slaccitation = "%\%CITATION = NUPHA,B259,572;\%\%",
+	title = "{Improved continuum limit lattice action for qcd with {Wilson} fermions}",
+	volume = "B259",
+	year = "1985"
+}
+
+@article{Shindler:2005vj,
+	author = "Shindler, Andrea",
+	eprint = "hep-lat/0511002",
+	journal = "PoS",
+	pages = "014",
+	slaccitation = "%\%CITATION = HEP-LAT 0511002;\%\%",
+	title = "{Twisted mass lattice {QCD}: Recent developments and results}",
+	volume = "LAT2005",
+	year = "2006"
+}
+
+@article{Shindler:2006tm,
+	author = "Shindler, A.",
+	collaboration = "ETM",
+	eprint = "hep-ph/0611264",
+	slaccitation = "%\%CITATION = HEP-PH 0611264;\%\%",
+	title = "{Lattice QCD with light twisted quarks: First results}",
+	year = "2006"
+}
+
+@article{Shindler:2007vp,
+	archiveprefix = "arXiv",
+	author = "Shindler, A.",
+	doi = "10.1016/j.physrep.2008.03.001",
+	eprint = "0707.4093",
+	journal = "Phys. Rept.",
+	pages = "37--110",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0707.4093;\%\%",
+	title = "{Twisted mass lattice QCD}",
+	volume = "461",
+	year = "2008"
+}
+
+@article{Sleijpen:1996aa,
+	author = "Sleijpen, G. L. G. and van der Vorst, H. A.",
+	journal = "SIAM Journal on Matrix Analysis and Applications",
+	pages = "401--425",
+	title = "{A Jacobi-Davidson iteration method for linear eigenvalue problems}",
+	volume = "17",
+	year = "1996"
+}
+
+@article{Sommer:1993ce,
+	author = "Sommer, R.",
+	eprint = "hep-lat/9310022",
+	journal = "Nucl. Phys.",
+	pages = "839--854",
+	slaccitation = "%\%CITATION = HEP-LAT 9310022;\%\%",
+	title = "{A New way to set the energy scale in lattice gauge theories and its applications to the static force and alpha-s in SU(2) Yang-Mills theory}",
+	volume = "B411",
+	year = "1994"
+}
+
+@article{Sonneveld:1989cgs,
+	address = "Philadelphia, PA, USA",
+	author = "Sonneveld, Peter",
+	issn = "0196-5204",
+	journal = "SIAM J. Sci. Stat. Comput.",
+	number = "1",
+	pages = "36--52",
+	publisher = "Society for Industrial and Applied Mathematics",
+	title = "{CGS, a fast Lanczos-type solver for nonsymmetric linear systems}",
+	volume = "10",
+	year = "1989"
+}
+
+@article{Sternbeck:2003gy,
+	author = "Sternbeck, A. and Ilgenfritz, E.-M. and Kerler, W. and M{\"u}ller-Preu{\ss}ker, M. and St{\"u}ben, H.",
+	eprint = "hep-lat/0309059",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "898--900",
+	slaccitation = "%\%CITATION = HEP-LAT 0309059;\%\%",
+	title = "{The {Aoki} phase for {N(f)} = 2 {Wilson} fermions revisited}",
+	volume = "129",
+	year = "2004"
+}
+
+@article{Sternbeck:2005tk,
+	author = "Sternbeck, A. and Ilgenfritz, E. -M. and Mueller-Preussker, M. and Schiller, A.",
+	eprint = "hep-lat/0506007",
+	journal = "Phys. Rev.",
+	pages = "014507",
+	slaccitation = "%\%CITATION = HEP-LAT/0506007;\%\%",
+	title = "{Going infrared in SU(3) Landau gauge gluodynamics}",
+	volume = "D72",
+	year = "2005"
+}
+
+@article{Symanzik:1983dc,
+	author = "Symanzik, K.",
+	journal = "Nucl. Phys.",
+	pages = "187",
+	slaccitation = "%\%CITATION = NUPHA,B226,187;\%\%",
+	title = "{Continuum limit and improved action in lattice theories. 1. principles and phi**4 theory}",
+	volume = "B226",
+	year = "1983"
+}
+
+@conference{Symanzik:1981hc,
+	author = "Symanzik, K.",
+	booktitle = "{Mathematical problems in theoretical physics}",
+	editor = "Schrader, R. and others",
+	journal = "Lecture Notes in Physics",
+	note = "Presented at 6th Int. Conf. on Mathematical Physics, Berlin, West Germany",
+	pages = "47--58",
+	title = "{Some topics in quantum field theory}",
+	volume = "153",
+	year = "1981"
+}
+
+@article{Symanzik:1983gh,
+	author = "Symanzik, K.",
+	journal = "Nucl. Phys.",
+	pages = "205",
+	slaccitation = "%\%CITATION = NUPHA,B226,205;\%\%",
+	title = "{Continuum limit and improved action in lattice theories. 2. O(N) nonlinear sigma model in perturbation theory}",
+	volume = "B226",
+	year = "1983"
+}
+
+@article{Takaishi:1996xj,
+	author = "Takaishi, T.",
+	journal = "Phys. Rev.",
+	pages = "1050--1053",
+	slaccitation = "%\%CITATION = PHRVA,D54,1050;\%\%",
+	title = "{Heavy quark potential and effective actions on blocked configurations}",
+	volume = "D54",
+	year = "1996"
+}
+
+@article{Takaishi:2005tz,
+	archiveprefix = "arXiv",
+	author = "Takaishi, Tetsuya and de Forcrand, Philippe",
+	doi = "10.1103/PhysRevE.73.036706",
+	eprint = "hep-lat/0505020",
+	journal = "Phys. Rev.",
+	pages = "036706",
+	slaccitation = "%\%CITATION = HEP-LAT/0505020;\%\%",
+	title = "{Testing and tuning new symplectic integrators for hybrid Monte Carlo algorithm in lattice QCD}",
+	volume = "E73",
+	year = "2006"
+}
+
+@article{Takeda:2004xh,
+	author = "Takeda, S. and others",
+	eprint = "hep-lat/0408010",
+	journal = "Phys. Rev.",
+	pages = "074510",
+	slaccitation = "%\%CITATION = HEP-LAT 0408010;\%\%",
+	title = "{A scaling study of the step scaling function in SU(3) gauge theory with improved gauge actions}",
+	volume = "D70",
+	year = "2004"
+}
+
+@article{Ukawa:2002pc,
+	author = "Ukawa, A.",
+	collaboration = "CP-PACS and JL{QCD}",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "195--196",
+	slaccitation = "%\%CITATION = NUPHZ,106,195;\%\%",
+	title = "{Computational cost of full {QCD} simulations experienced by {CP-PACS and JLQCD Collaborations}}",
+	volume = "106",
+	year = "2002"
+}
+
+@article{Urbach:2005ji,
+	author = "Urbach, C. and Jansen, K. and Shindler, A. and Wenger, U.",
+	eprint = "hep-lat/0506011",
+	journal = "Comput. Phys. Commun.",
+	pages = "87--98",
+	slaccitation = "%\%CITATION = HEP-LAT 0506011;\%\%",
+	title = "{{HMC} algorithm with multiple time scale integration and mass preconditioning}",
+	volume = "174",
+	year = "2006"
+}
+
+@article{Urbach:2007rt,
+	archiveprefix = "arXiv",
+	author = "Urbach, Carsten",
+	collaboration = "ETM",
+	eprint = "0710.1517",
+	journal = "PoS",
+	pages = "022",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.1517;\%\%",
+	title = "{Lattice QCD with two light Wilson quarks and maximally twisted mass}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{WalkerLoud:2005bt,
+	archiveprefix = "arXiv",
+	author = "Walker-Loud, Andre and Wu, Jackson M. S.",
+	doi = "10.1103/PhysRevD.72.014506",
+	eprint = "hep-lat/0504001",
+	journal = "Phys. Rev.",
+	pages = "014506",
+	slaccitation = "%\%CITATION = HEP-LAT/0504001;\%\%",
+	title = "{Nucleon and Delta masses in twisted mass chiral perturbation theory}",
+	volume = "D72",
+	year = "2005"
+}
+
+@article{Weinberg:1973un,
+	author = "Weinberg, S.",
+	journal = "Phys. Rev. Lett.",
+	pages = "494--497",
+	slaccitation = "%\%CITATION = PRLTA,31,494;\%\%",
+	title = "{Nonabelian gauge theories of the strong interactions}",
+	volume = "31",
+	year = "1973"
+}
+
+@article{Weinberg:1978kz,
+	author = "Weinberg, S.",
+	journal = "Physica",
+	pages = "327",
+	slaccitation = "%\%CITATION = PHYSA,A96,327;\%\%",
+	title = "{Phenomenological Lagrangians}",
+	volume = "A96",
+	year = "1979"
+}
+
+@book{Weinberg:1995mt,
+	author = "Weinberg, S.",
+	pages = "609",
+	publisher = "Cambridge University Press",
+	title = "{The Quantum theory of fields. Vol. 1: Foundations}",
+	year = "1995"
+}
+
+@article{Weisz:1982zw,
+	author = "Weisz, P.",
+	journal = "Nucl. Phys.",
+	pages = "1",
+	slaccitation = "%\%CITATION = NUPHA,B212,1;\%\%",
+	title = "{Continuum limit improved lattice action for pure {Yang-Mills} theory. 1}",
+	volume = "B212",
+	year = "1983"
+}
+
+@article{Weisz:1983bn,
+	author = "Weisz, P. and Wohlert, R.",
+	journal = "Nucl. Phys.",
+	pages = 397,
+	slaccitation = "%\%CITATION = NUPHA,B236,397;\%\%",
+	title = "{Continuum limit improved lattice action for pure {Yang-Mills} theory. 2}",
+	volume = "B236",
+	year = 1984
+}
+
+@article{Wennekers:2005wa,
+	author = "Wennekers, J. and Wittig, H.",
+	eprint = "hep-lat/0507026",
+	slaccitation = "%\%CITATION = HEP-LAT 0507026;\%\%",
+	title = "{On the renormalized scalar density in quenched QCD}",
+	year = "2005"
+}
+
+@article{Weyl:1918ib,
+	author = "Weyl, H.",
+	journal = "Sitzungsber. Preuss. Akad. Wiss. Berlin (Math. Phys. )",
+	pages = "465",
+	slaccitation = "%\%CITATION = SPWPA,1918,465;\%\%",
+	title = "{Gravitation und Elektrizit{\"a}t}",
+	volume = "1918",
+	year = "1918"
+}
+
+@article{Weyl:1929fm,
+	author = "Weyl, H.",
+	journal = "Z. Phys.",
+	pages = "330--352",
+	slaccitation = "%\%CITATION = ZEPYA,56,330;\%\%",
+	title = "{Electron and gravitation}",
+	volume = "56",
+	year = "1929"
+}
+
+@article{Wilson:1974sk,
+	author = "Wilson, K. G.",
+	journal = "Phys. Rev.",
+	pages = "2445--2459",
+	slaccitation = "%\%CITATION = PHRVA,D10,2445;\%\%",
+	title = "{Confinement of quarks}",
+	volume = "D10",
+	year = "1974"
+}
+
+@article{Wilson:1974sk,
+	author = "Wilson, K. G.",
+	journal = "Phys. Rev.",
+	pages = "2445--2459",
+	slaccitation = "%\%CITATION = PHRVA,D10,2445;\%\%",
+	title = "{Confinement of quarks}",
+	volume = "D10",
+	year = "1974"
+}
+
+@article{Wilson:1975mb,
+	author = "Wilson, K. G.",
+	journal = "Rev. Mod. Phys.",
+	pages = "773",
+	slaccitation = "%\%CITATION = RMPHA,47,773;\%\%",
+	title = "{The renormalization group: Critical phenomena and the kondo problem}",
+	volume = "47",
+	year = "1975"
+}
+
+@article{Wilson:1975mb,
+	author = "Wilson, K. G.",
+	journal = "Rev. Mod. Phys.",
+	pages = "773",
+	slaccitation = "%\%CITATION = RMPHA,47,773;\%\%",
+	title = "{The renormalization group: Critical phenomena and the kondo problem}",
+	volume = "47",
+	year = "1975"
+}
+
+@article{Wolff:2003sm,
+	author = "Wolff, U.",
+	collaboration = "ALPHA",
+	eprint = "hep-lat/0306017",
+	journal = "Comput. Phys. Commun.",
+	pages = "143--153",
+	slaccitation = "%\%CITATION = HEP-LAT 0306017;\%\%",
+	title = "{Monte Carlo errors with less errors}",
+	volume = "156",
+	year = "2004"
+}
+
+@article{Yang:1954ek,
+	author = "Yang, C.-N. and Mills, R. L.",
+	journal = "Phys. Rev.",
+	pages = "191--195",
+	slaccitation = "%\%CITATION = PHRVA,96,191;\%\%",
+	title = "{Conservation of isotopic spin and isotopic gauge invariance}",
+	volume = "96",
+	year = "1954"
+}
+
+@article{Yoshie:2008aw,
+	archiveprefix = "arXiv",
+	author = "Yoshie, Tomoteru",
+	eprint = "0812.0849",
+	journal = "PoS",
+	pages = "019",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0812.0849;\%\%",
+	title = "{Making use of the International Lattice Data Grid}",
+	volume = "LATTICE2008",
+	year = "2008"
+}
+
+@article{Zweig:1964jf,
+	author = "Zweig, G.",
+	note = "CERN-TH-412",
+	title = "{An SU(3) model for strong interaction symmetry and its breaking. 2}"
+}
+
+@article{cln:web,
+	eprint = "http://www.ginac.de/CLN/"
+}
+
+@article{deForcrand:1995bs,
+	author = "de Forcrand, P.",
+	eprint = "hep-lat/9509082",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "228--235",
+	slaccitation = "%\%CITATION = HEP-LAT 9509082;\%\%",
+	title = "{Progress on lattice {QCD} algorithms}",
+	volume = "47",
+	year = "1996"
+}
+
+@article{deForcrand:1996bx,
+	author = "de Forcrand, P. and others",
+	collaboration = "{QCD}-TARO",
+	eprint = "hep-lat/9608094",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "938--941",
+	slaccitation = "%\%CITATION = HEP-LAT 9608094;\%\%",
+	title = "{Search for effective lattice action of pure {QCD}}",
+	volume = "53",
+	year = "1997"
+}
+
+@article{deForcrand:1996ck,
+	author = "de Forcrand, P. and Takaishi, T.",
+	eprint = "hep-lat/9608093",
+	journal = "Nucl. Phys. Proc. Suppl.",
+	pages = "968--970",
+	slaccitation = "%\%CITATION = HEP-LAT 9608093;\%\%",
+	title = "{Fast fermion Monte Carlo}",
+	volume = "53",
+	year = "1997"
+}
+
+@article{etmc:asqr,
+	archiveprefix = "arXiv",
+	author = "Frezzotti, R. and others",
+	eprint = "0710.2492",
+	journal = "PoS",
+	pages = "277",
+	primaryclass = "hep-lat",
+	slaccitation = "%\%CITATION = 0710.2492;\%\%",
+	title = "{$O(a^2)$ cutoff effects in Wilson fermion simulations}",
+	volume = "LAT2007",
+	year = "2007"
+}
+
+@article{ildg:web,
+	eprint = "http://cssm.sasr.edu.au/ildg/"
+}
+
+@book{kleinert:1,
+	author = "Kleinert, H.",
+	edition = "2nd Edition",
+	publisher = "World Scientific, Singapore",
+	title = "{Path integrals in quantum mechanics, statistics and polymer physics}",
+	year = "1995"
+}
+
+@article{lapack:web,
+	eprint = "http://www.netlib.org/lapack/"
+}
+
+@article{lime:web,
+	author = "USQCD",
+	eprint = "http://usqcd.jlab.org/usqcd-docs/c-lime/",
+	title = "{c-lime library}"
+}
+
+@article{hmc:web,
+	eprint = "http://www.carsten-urbach.eu/",
+	title = "{tmLQCD}"
+}
+
+@book{meister:1999,
+	author = "Meister, Andreas",
+	optaddress = "",
+	optannote = "",
+	optedition = "",
+	optkey = "",
+	optmonth = "",
+	optnote = "",
+	optnumber = "",
+	optseries = "",
+	optvolume = "",
+	publisher = "vieweg",
+	title = "{Numerik linearer Gleichungssysteme}",
+	year = "1999"
+}
+
+@manual{minuit,
+	note = "\\seal.web.cern.ch/seal/snapshot/work-packages/mathlibs/minuit/home.html",
+	title = "{MINUIT home page}"
+}
+
+@article{mpi:web,
+	eprint = "http://www-unix.mcs.anl.gov/mpi/",
+	title = "{The message passing interface standard}"
+}
+
+@phdthesis{orth:2004phd,
+	author = "Orth, B.",
+	optaddress = "",
+	optannote = "",
+	optkey = "",
+	optmonth = "",
+	optnote = "",
+	opttype = "",
+	school = "Bergische Universit{\"a}t Wuppertal",
+	title = "{Finite size effects in lattice {QCD} with dynamical {Wilson} fermions}",
+	year = "2004"
+}
+
+@phdthesis{pleiter:phd,
+	author = "Pleiter, D.",
+	school = "Freie {U}niversit{\"a}t {B}erlin",
+	title = "{XXX}",
+	year = "2001"
+}
+
+@book{press:1992,
+	address = "Cambridge, UK",
+	author = "Press, William and Teukolsky, Saul and Vetterling, William and Flannery, Brian",
+	citeulike-article-id = "767703",
+	edition = "2nd",
+	keywords = "bibtex-import",
+	posted-at = "2006-07-21 00:26:35",
+	priority = "0",
+	publisher = "Cambridge University Press",
+	title = "{Numerical Recipes in C}",
+	year = "1992"
+}
+
+@manual{root,
+	note = "root.cern.ch/",
+	title = "{The ROOT system home page}"
+}
+
+@book{saad:2003a,
+	author = "Saad, Y.",
+	edition = "2nd",
+	publisher = "SIAM",
+	title = "{Iterative Methods for sparse linear systems}",
+	year = "2003"
+}
+
+@article{scidac,
+	eprint = "http://www.scidac.gov/"
+}
+
+@mastersthesis{urbach:2002aa,
+	author = "Urbach, C.",
+	school = "Freie Universit{\"a}t Berlin, Fachbereich Physik",
+	title = "{Untersuchung der {R}eversibilit{\"a}tsverletzung im {H}ybrid {M}onte {C}arlo {A}lgorithmus}",
+	year = "2002"
+}
+
+@inbook{Joo2013,
+	abstract = "Lattice Quantum Chromodynamics (LQCD) is currently the only known model independent, non perturbative computational method for calculations in the theory of the strong interactions, and is of importance in studies of nuclear and high energy physics. LQCD codes use large fractions of supercomputing cycles worldwide and are often amongst the first to be ported to new high performance computing architectures. The recently released Intel Xeon Phi architecture from Intel Corporation features parallelism at the level of many x86-based cores, multiple threads per core, and vector processing units. In this contribution, we describe our experiences with optimizing a key LQCD kernel for the Xeon Phi architecture. On a single node, using single precision, our Dslash kernel sustains a performance of up to 320 GFLOPS, while our Conjugate Gradients solver sustains up to 237 GFLOPS. Furthermore we demonstrate a fully 'native' multi-node LQCD implementation running entirely on KNC nodes with minimum involvement of the host CPU. Our multi-node implementation of the solver has been strong scaled to 3.9 TFLOPS on 32 KNCs.",
+	address = "Berlin, Heidelberg",
+	author = "Jo{\'o}, B{\'a}lint and Kalamkar, Dhiraj D. and Vaidyanathan, Karthikeyan and Smelyanskiy, Mikhail and Pamnany, Kiran and Lee, Victor W. and Dubey, Pradeep and Watson, William",
+	booktitle = "{Supercomputing: 28th International Supercomputing Conference, ISC 2013, Leipzig, Germany, June 16-20, 2013. Proceedings}",
+	doi = "10.1007/978-3-642-38750-0_4",
+	editor = "Kunkel, Julian Martin and Ludwig, Thomas and Meuer, Hans Werner",
+	isbn = "978-3-642-38750-0",
+	pages = "40--54",
+	publisher = "Springer Berlin Heidelberg",
+	title = "{Lattice QCD on Intel\textregistered{} Xeon Phi\texttrademark{} Coprocessors}",
+	url = "https://doi.org/10.1007/978-3-642-38750-0_4",
+	year = "2013"
+}
+
diff --git a/doc/cgmms.tex b/doc/cgmms.tex
new file mode 100644
index 000000000..0b150cd32
--- /dev/null
+++ b/doc/cgmms.tex
@@ -0,0 +1,80 @@
+\subsection{CGMMS}
+
+The multi-shift CG implementation in tmLQCD is referred to as \emph{CGMMS} since it was originally developed to solve a multi-system of equations of the form
+\begin{equation}
+  ( A + \mathbb{I}\mu_k^2 ) x^k = b \, ,
+\end{equation}
+where $A$ can be $\Qp\Qm$ or $\Mp\Mm$ and the squared shifts $\mu_k^2$ can be naturally interpreted as different twisted quark masses (in the case of $\Mp\Mm$, appropriate factors of $\gamma^5$ must be inserted as required). Explicitly, for a single twisted mass $\mu$ and solution vector $x$, one has
+\begin{equation}
+  \begin{split}
+  ( \Mw + i \mu \gamma^5 )( \Mw^\dagger - i \mu \gamma^5 ) x & = b \\
+  ( \Mw \Mw^\dagger + \cancel{i\mu\gamma^5 \Mw^\dagger - i \mu \Mw \gamma^5} + \mu^2 ) x & = b \\
+  ( \Mw \Mw^\dagger + \mu^2 ) x & = b \, ,
+  \end{split}
+\end{equation}
+where in the last line $\gamma_5$-hermiticity of $\Mw$ was used.
+With the clover term, $T$, in the operator, the calculation goes through in the same way, with the result
+\begin{equation}
+  \begin{split}
+    & ( \Mw \Mw^\dagger + \Mw T + T \Mw^\dagger + \cancel{i\mu\gamma^5 \Mw^\dagger - i \mu \Mw \gamma^5} + i\mu\gamma_5 T - i\mu T \gamma_5 + T^2 + \mu^2 ) x = b \\
+    & ( \Msw \Msw^\dagger + \mu^2 ) x = b \, ,
+  \end{split}
+\end{equation}
+where $\Msw = \Mw + T$.
+
+The algorithm is listed below in Algorithm \ref{alg:cgm} (see also Ref.~\cite{Chiarappa:2006hz}
+and references therein), with the identification $\sigma_k = \mu_k^2$.
+Note that in line 6 below, $\alpha_{n-1}(1+\sigma_k\alpha_n)$ is correct, in contrast to Ref.~\cite{Chiarappa:2006hz}.
+
+\begin{algorithm}
+  \caption{CGMMS algorithm}
+  \label{alg:cgm}
+  \begin{algorithmic}[1]
+    \vspace{.2cm}
+    \STATE $n=0, x_0^k = 0, r_0 = p_0 = p_0^k = b, k_\mathrm{max},
+    \delta, \epsilon$
+    \STATE  $\biggl.\biggr.\alpha_{-1} = \zeta_{-1}^k = \zeta_0^k = 1, \beta_0^k = \beta_0 = 0$
+    \REPEAT
+    \STATE $\alpha_n = (r_n, r_n) / (p_n, A p_n)$
+    \FOR{$k = 1$ to $k_\mathrm{max}$}
+    \STATE $\biggl.\biggr.\zeta_{n+1}^k = (\zeta^k_n  \alpha_{n-1}) / 
+      (\alpha_n \beta_n(1 - \zeta_n^k / \zeta^k_{n-1}) + \alpha_{n-1}
+      (1+\sigma_k\alpha_n))$
+    \STATE $\alpha^k_n = (\alpha_n \zeta_{n+1}^k)/ \zeta_n^k$
+    \STATE $\biggl.\biggr.x_{n+1}^k = x_n^k + \alpha_n^k p_n^k$
+    \IF{$\|\alpha^{k_\mathrm{max}} p^{k_\mathrm{max}}\| < \delta$}
+    \STATE $k_\mathrm{max} = k_\mathrm{max} -1$
+    \ENDIF
+    \ENDFOR
+    \STATE $x_{n+1} = x_n + \alpha_n p_n$
+    \STATE $\biggl.\biggr.r_{n+1} = r_n - \alpha_n Ap_n$
+    \STATE $\beta_{n+1} = (r_{n+1}, r_{n+1}) / (r_n, r_n)$
+    \STATE $\beta_{n+1}^k = \frac{\beta_{n+1} \zeta_{n+1}^k \alpha_n^k}{\zeta_{n}^k\alpha_n}$
+    \STATE $\biggl.\biggr.p_{n+1}^k = \zeta_{n+1}^k r_{n+1} + \beta_{n+1}^k p_n^k$
+    \STATE $n=n+1$
+    \UNTIL{$\|r_n\|<\epsilon$}
+  \end{algorithmic}
+\end{algorithm}
+
+The implementations in \texttt{solver/cg\_mms\_tm.c} and \texttt{solver/cg\_mms\_tm\_nd.c} use a slightly different approach in that the lowest shift is included in the operator $A$, such that the higher shifts are $\sigma_k-\sigma_0\, \forall k > 0$.
+
+It should be noted that $\sqrt{\sigma_k}$ are passed to \texttt{cg\_mms\_tm} and the solver internally squares these and shifts them by $\sigma_0$.
+
+\subsubsection{Single flavour Wilson (clover) fermions in the rational approximation}
+
+For details about the rational approximation in tmLQCD, see Section~\ref{subsec:rationalhmc}.
+
+\textbf{QPhiX interface:} For the HMC with a single flavour of Wilson (clover) fermions (\texttt{RAT} or \texttt{CLOVERRAT} monomials), the function \texttt{solve\_mshift\_oneflavour} of \texttt{solver/monomial\_solve.c} provides a wrapper for tmLQCD or external multi-shift solvers.
+
+Note that it passes the shifts as expected by tmLQCD's \texttt{cg\_mms\_tm}, which means that they need to be squared.
+For the QPhiX normalisation, the QPhiX solver interface also divides them by $4\kappa^2$.
+The shifts are taken as is and not shifted by $\sigma_0$.
+
+\subsubsection{Two flavour Wilson twisted mass (clover) fermions in the rational approximation}
+
+\textbf{QPhiX interface:} For the HMC with non-degenerate twisted mass (clover) doublets (\texttt{NDRAT} and \texttt{NDCLOVERRAT} monomials), exactly the same approach is used.
+
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: "main"
+%%% End: 
diff --git a/doc/command.tex b/doc/command.tex
index cf495acf0..9551b0a9d 100644
--- a/doc/command.tex
+++ b/doc/command.tex
@@ -46,6 +46,11 @@
 \newcommand{\Qpm}{Q_{\pm}}
 \newcommand{\Qp}{Q_{+}}
 \newcommand{\Qm}{Q_{-}}
+\newcommand{\Mp}{M_{+}}
+\newcommand{\Mm}{M_{-}}
+\newcommand{\Dw}{D_\mathrm{w}}
+\newcommand{\Mw}{M_\mathrm{w}}
+\newcommand{\Msw}{M_\mathrm{sw}}
 \newcommand{\Wp}{W_{+}}
 \newcommand{\Wm}{W_{-}}
 \newcommand{\Qnd}{Q_{\textrm{ND}}}
diff --git a/doc/eo_pre.tex b/doc/eo_pre.tex
index 7b8b9c2d9..79233449e 100644
--- a/doc/eo_pre.tex
+++ b/doc/eo_pre.tex
@@ -23,7 +23,7 @@ \subsection{HMC Update}
 \end{equation}
 For convenience we define
 $\tilde\mu=2\kappa\mu$. Using the matrix $M$ one can define the
-hermitian (two flavor) operator:
+hermitian (two flavour) operator:
 \begin{equation}
   \label{eq:eo1}
   Q\equiv \gamma_5 M = \begin{pmatrix}
@@ -221,7 +221,7 @@ \subsubsection{Mass non-degenerate flavour doublet}
     \frac{M_{oe}(1+i\bar\mu\gamma_5)M_{eo}}{1+\bar\mu^2-\bar\epsilon^2}\\
   \end{pmatrix}
 \end{equation*}
-with the previous definitions of $M_{eo}$ etc. The inplementation for
+with the previous definitions of $M_{eo}$ etc. The implementation for
 the HMC is very similar to the mass degenerate case. $\hat Q^h$ has
 again a hermitian conjugate given by
 \[
@@ -247,7 +247,7 @@ \subsubsection{Combining Clover and Twisted mass term} \label{sec:clover_twist}
 with the clover term $T$. For convenience we define
 $\tilde\mu\equiv2\kappa\mu$ and $\tilde c_{SW} = 2\kappa
 c_{SW}$. Using the matrix $M$ one can define the 
-(two flavor) operator:
+(two flavour) operator:
 \begin{equation}
   \label{eq:eosw1}
   Q\equiv \gamma_5 M = \begin{pmatrix}
@@ -314,7 +314,7 @@ \subsubsection{Combining Clover and Twisted mass term} \label{sec:clover_twist}
     i\tilde\mu\gamma_5)]\, .\\
   \end{split}
 \end{equation}
-Note that for $\tilde\mu=0$  $\det(1+T_{ee})$ is real. For
+Note that for $\tilde\mu=0$, $\det(1+T_{ee})$ is real. For
 $\tilde\mu\neq0$ however, $\det(1+T_{ee}+i\tilde\mu\gamma_5)$ is the
 complex conjugate of $\det(1+T_{ee}-i\tilde\mu\gamma_5)$ as the
 product of the two must be real. The latter can be seen from
@@ -395,7 +395,7 @@ \subsubsection{Combining Clover and Twisted mass term} \label{sec:clover_twist}
 with norm one.
 
 The additional bit in the action $S_{\det}$ needs to be treated
-seperately. The variation of this part is
+separately. The variation of this part is
 \begin{equation}
   \label{eq:eosw11}
   \delta S_{\det} = -\tr \left\{ \left[(1+i\tilde\mu\gamma_5 + T_{ee})^{-1}  +
@@ -410,7 +410,7 @@ \subsubsection{Combining Clover and Twisted mass term} \label{sec:clover_twist}
 \begin{equation}
   \label{eq:Tee}
   1+T_{a\alpha,b\beta} = 1 + \frac{i}{2} c_\mathrm{sw}
-  \kappa\sigma_{\mu\nu}^{\alpha\beta}F_{\mu\nu}^{\alpha\beta}(x)
+  \kappa\sigma_{\mu\nu}^{\alpha\beta}F_{\mu\nu}^{ab}(x)
 \end{equation}  
 once for all $x$. This is implemented in {\ttfamily clover\_leaf.c} in
 the routine {\ttfamily sw\_term}. The twisted mass term is not
@@ -418,7 +418,7 @@ \subsubsection{Combining Clover and Twisted mass term} \label{sec:clover_twist}
 plus and minus $\mu$, respectively. It is easier to add the twisted
 mass term in later on. 
 
-The term in eq.~(\ref{eq:Tee}) correpsonds to a $12\times12$ matrix
+The term in eq.~(\ref{eq:Tee}) corresponds to a $12\times12$ matrix
 in colour and spin which reduces to two complex $6\times6$ matrices
 per site because it is block-diagonal in spin (one matrix for the two
 upper spin components, one for the two lower ones). 
@@ -430,7 +430,7 @@ \subsubsection{Combining Clover and Twisted mass term} \label{sec:clover_twist}
   sw[x][0][0]} is the upper diagonal $3\times3$ matrix, {\ttfamily
   sw[x][1][0]} the upper off-diagnoal $3\times3$ matrix and {\ttfamily
   sw[x][2][0]} the lower diagonal matrix. The lower off-diagonal
-matrix would be the inverse of {\ttfamily sw[x][1][0]}. The second
+matrix would be the hermitian conjugate of {\ttfamily sw[x][1][0]}. The second
 $6\times6$ matrix is stored following the same conventions.
 
 For computing $S_\mathrm{det}$, we take into account the structure
@@ -462,7 +462,7 @@ \subsubsection{Combining Clover and Twisted mass term} \label{sec:clover_twist}
 When it comes to computing the inverse of $1\pm i \mu\gamma_5 +
 T_{ee}$, the dependence on the sign of $\mu$ is unavoidable. However,
 it is only needed for even (odd) sites, so we can use an array
-{\ttfamily sw\_inv[VOLUME][3][2]} of type {\ttfamily su3} to store
+{\ttfamily sw\_inv[VOLUME][4][2]} of type {\ttfamily su3} to store
 e.g. $+\mu$ at even and $-\mu$ at odd sites.
 
 For evaluating the force for $S_\mathrm{det}$ in the function
@@ -604,7 +604,7 @@ \subsubsection{Combining Clover and Nondegenerate Twisted mass term}
 proportionality of $D$ to the identity matrix was used.
 
 The implementation {\ttfamily sw\_trace\_nd} in {\ttfamily clover\_det.c} populates
-a temporary $6\times6$ array from the {\ttfamily sw} array, squares it and
+a temporary $6\times6$ array from the {\ttfamily sw} array, squares it 
 and adds $\bar{\mu}^2 - \bar{\epsilon}^2$ to the diagonal. Using $\det(\gamma_5) = 1$, 
 the contribution to the effective action is then:
 \begin{equation}
@@ -741,7 +741,7 @@ \subsubsection{Inverting $M$ on $\phi_o$}
 
 In case inverting the full matrix $M$ is much faster than inverting
 the even/odd preconditioned matrix -- as might be the case with
-deflation, one may use for symmetric even/odd preconditioining
+deflation, one may use for symmetric even/odd preconditioning
 \begin{equation}
   (\hat M^\pm)^{-1}\phi_o\ =\ P_{l\to o}\ (M_\pm)^{-1}\ P_{o\to l}\
   M^\pm_{oo}\ \phi_o
@@ -771,6 +771,14 @@ \subsubsection{Inverting $M$ on $\phi_o$}
   A_{oo}\quad &=\quad (1- M_{oo}^{-1} M_{oe} M_{ee}^{-1} M_{eo})^{-1}\ M_{oo}^{-1} \\
 \end{split}
 \]
+In practice the projectors $P_{l\to o}$ and $P_{o\to l}$ are trivially
+implemented by inverting the full matrix on a spinor with all even
+sites set to zero and the odd sites to $\phi_o$.
+
+Using this allows one to exploit, on the one hand, the speed-up due to
+even/odd preconditioning in the HMC and, on the other hand, the
+speed-up due to a deflated solver.
+
 \endinput
 
 %%% Local Variables: 
diff --git a/doc/input.tex b/doc/input.tex
index e46ac605f..17048a322 100644
--- a/doc/input.tex
+++ b/doc/input.tex
@@ -227,11 +227,16 @@ \subsection{Input parameter for main program}
   It has only effect, if every source is in a separate file
   (i.e. SourceInfo.splitted is set, which is the default).
 
+\item {\ttfamily SourceFilename} and {\ttfamily PropagatorFilename}:\\
+  This sets the base filename for sources and propagators, respectively.
+  The default is {\ttfamily source} for both.
+
 \item {\ttfamily NoSamples}:\\
   in case of stochastic source the number of samples.
 
 \item {\ttfamily SourceType}:\\
-  lets you chose the source type: {\ttfamily Volume, Point, TimeSlice}
+  lets you choose the source type: {\ttfamily Volume, Point, TimeSlice,
+  PionTimeSlice, GenPionTimeSlice}
   are possible here.
 
 \item {\ttfamily ComputeEVs}:\\
diff --git a/doc/main.tex b/doc/main.tex
index 7685f7d48..097db1794 100644
--- a/doc/main.tex
+++ b/doc/main.tex
@@ -1,4 +1,5 @@
-\documentclass[a4paper,12pt,dvips]{article}
+\documentclass[a4paper,12pt]{article}
+\usepackage{graphicx}
 %amsmath
 \usepackage{amssymb}
 \usepackage{amsmath}
@@ -15,6 +16,16 @@
 \usepackage{pifont}
 \usepackage{algorithm}
 \usepackage{algorithmic}
+\usepackage{fancyvrb}
+\usepackage{cancel}
+\usepackage{framed}
+
+\usepackage[a4paper, total={16cm, 25cm}]{geometry}
+\usepackage{fancyvrb}
+
+\makeatletter
+\newcommand\footnoteref[1]{\protected@xdef\@thefnmark{\ref{#1}}\@footnotemark}
+\makeatother
 
 %\setlength{\parindent}{0pt}
 % Absatzabstand
@@ -69,6 +80,11 @@ \section{Implementation}
 \section{File Formats and IO}
 \myinput{prop_format.tex}
 
+\section{Interfaces to external QCD libraries}
+\myinput{quda.tex}
+\myinput{DDalphaAMG.tex}
+\myinput{qphix.tex}
+
 \clearpage
 \bibliographystyle{h-physrev5}
 \bibliography{bibliography}
@@ -87,6 +103,9 @@ \section{File Formats and IO}
 
   \section{Deflation}
   \myinput{deflation}
+  
+  \section{Solvers}
+  \myinput{solvers}
 \end{appendix}
 
 
diff --git a/doc/qphix.tex b/doc/qphix.tex
new file mode 100644
index 000000000..284e827e7
--- /dev/null
+++ b/doc/qphix.tex
@@ -0,0 +1,209 @@
+%author: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
+%date: 10/2017
+
+\subsection{QPhiX: Optimised kernels and solvers for Intel Processors}\label{subsec:qphix}
+
+
+The QPhiX \cite{Joo2013} interface provides a library of MPI- and OpenMP-parallel linear operators and solvers for Wilson-type lattice fermions as well as a code-generator for the kernels employed by these operators.
+QPhiX has been extended to include all the operators relevant for tmLQCD, including the non-degenerate operator with and without the clover term.
+
+\subsubsection{Installation}
+If not already installed, you have to install QPhiX first.
+At the time of writing, the version with support for all twisted mass operators is available in the
+
+\begin{itemize}
+\item{\texttt{devel} branch of \url{https://github.com/JeffersonLab/qphix}.}
+\end{itemize}
+
+It depends on QMP (\url{https://github.com/usqcd-software/qmp}), which is built and installed through the usual \texttt{configure, make, make install} mechanism.
+
+QPhiX is built using CMake and requires the availability of python 3, as well as the jinja2 library (\url{https://jinja.pocoo.org}).
+The latter can easily be installed via the pip package installer:
+\begin{framed}
+\begin{Verbatim}
+pip install --user jinja2
+\end{Verbatim}
+\end{framed}
+
+\textbf{QPhiX AVX2 Compilation}: 
+In order to compile QPhiX using GCC on an AVX2 machine, CMake is called in this way:
+\begin{framed}
+\begin{Verbatim}[fontsize=\small]
+CXX=mpicxx \
+CXXFLAGS="-mavx2 -mtune=core-avx2 -march=core-avx2 -std=c++11 -O3 -fopenmp" \
+cmake -Disa=avx2  \
+      -DQMP_DIR=${QMP_INSTALL_DIR} \
+      -Dparallel_arch=parscalar \
+      -Dhost_cxx=g++ \
+      -Dhost_cxxflags="-std=c++11 -O3" \
+      -Dtwisted_mass=TRUE \
+      -Dtm_clover=TRUE \
+      -Dclover=TRUE \
+      -Dtesting=FALSE  \
+      -DCMAKE_INSTALL_PREFIX=${QPHIX_INSTALL_DIR} ${QPHIX_SRC_DIR}
+\end{Verbatim}
+\end{framed}
+where \texttt{QMP\_INSTALL\_DIR}, \texttt{QPHIX\_INSTALL\_DIR} and \texttt{QPHIX\_SRC\_DIR} should be replaced with the QMP installation directory, the target installation directory for QPhiX and the QPhiX source directory respectively.
+
+In the command above:
+\begin{itemize}
+  \item{\texttt{-Dtesting=FALSE} disables the building of all tests, which would additionally require QDP++ to be available}
+  \item{\texttt{-Dhost\_cxx} and \texttt{-Dhost\_cxxflags} define the compiler used for building the code generator executables. This can be any compiler and \texttt{g++} works just fine for this purpose.}
+\end{itemize}
+
+\textbf{QPhiX AVX512 Compilation}: 
+On a KNL-based machine like Marconi-KNL instead, the Intel compiler and Intel MPI library should be used:
+\begin{framed}
+\begin{Verbatim}
+CXX=mpiicpc \
+CXXFLAGS="-xKNL -std=c++11 -O3 -qopenmp" \
+CFLAGS="-xKNL -O3 -std=c99 -qopenmp" \
+cmake -Disa=avx512  \
+      -DQMP_DIR=${QMP_INSTALL_DIR} \
+      -Dparallel_arch=parscalar \
+      -Dhost_cxx=g++ \
+      -Dhost_cxxflags="-std=c++11 -O3" \
+      -Dtwisted_mass=TRUE \
+      -Dtm_clover=TRUE \
+      -Dclover=TRUE \
+      -Dtesting=FALSE  \
+      -DCMAKE_INSTALL_PREFIX=${QPHIX_INSTALL_DIR} ${QPHIX_SRC_DIR}
+\end{Verbatim}
+\end{framed}
+
+Note that for Skylake, the correct code for targeting vectorisation is \texttt{SKYLAKE-AVX512}.
+
+\textbf{tmLQCD AVX512 Compilation}: Once QPhiX is built and installed, tmLQCD can be configured as follows on a KNL AVX512 machine, for example:
+\begin{framed}
+\begin{Verbatim}[fontsize=\small]
+$ cd ${TMLQCD_SRC_DIR}
+$ autoconf
+$ cd ${TMLQCD_BUILD_DIR}
+$ ${TMLQCD_SRC_DIR}/configure \
+  --host=x86_64-linux-gnu \
+  --with-limedir=${LIME_INSTALL_DIR} \
+  --with-lemondir=${LEMON_INSTALL_DIR} \
+  --with-mpidimension=4 --enable-omp --enable-mpi \
+  --disable-sse2 --disable-sse3 \
+  --with-lapack="-Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_lp64.a
+                 ${MKLROOT}/lib/intel64/libmkl_core.a
+                 ${MKLROOT}/lib/intel64/libmkl_intel_thread.a
+                 -Wl,--end-group -lpthread -lm -ldl" \
+  --disable-halfspinor --enable-gaugecopy \
+  --enable-alignment=64 \
+  --enable-qphix-soalen=4 \
+  --with-qphixdir=${QPHIX_INSTALL_DIR} \
+  --with-qmpdir=${QMP_INSTALL_DIR} \
+  CC=mpiicc CXX=mpiicpc F77=ifort \
+  CFLAGS="-O3 -std=c99 -qopenmp -xKNL" \
+  CXXFLAGS="-O3 -std=c++11 -qopenmp -xKNL" \
+  LDFLAGS="-qopenmp"
+\end{Verbatim}
+\end{framed}
+\textbf{IMPORTANT:} On AVX512 machines, for some reason, the half-spinor tmLQCD operators do not work.
+This is likely related to MPI and alignment, but we were unable to resolve it at the time of writing.
+As a result, \texttt{--disable-halfspinor} is passed when building on these architectures.
+
+\texttt{--enable-qphix-soalen=4} sets the QPhiX \emph{structure of array} (SoA) length, which defines the size of the innermost direction in the blocked data structures in QPhiX.
+\emph{Half} the \emph{local} lattice extent in $X$ direction, $L_x/2$, has to be divisible by this number.
+Setting this equal to the double-precision SIMD length on a given architecture means that a full double-precision SIMD vector can be loaded in a single instruction, whereas values below the SIMD vector length will result in multiple load and store instructions; all computations are always carried out on full vectors.
+
+For now, the same SoA length is used for all supported arithmetic precisions as this facilitates thinking about possible parallelisation strategies.
+On AVX512 machines, setting this to $8$ is optimal whereas $4$ is recommended for AVX2.
+
+Note that compiling for KNL requires cross-compilation (if not on a KNL build node), but it seems to be sufficient to specify \texttt{--host=x86\_64-linux-gnu} for all test programs to compile correctly during the configuration stage. 
+
+The QPhiX interface can be combined with DD$\alpha$AMG without problems, but building together with the QUDA interface is only possible using GCC or clang, since QUDA is not compatible with the Intel compiler.
+On the QPhiX side, this will result in a potentially significant reduction of performance.
+
+\subsubsection{Usage}
+
+\noindent\textbf{QPhiX global parameters}: The blocking and threading parameters for QPhiX are passed by adding the following section to the tmLQCD input file:
+\begin{framed}
+\begin{Verbatim}
+BeginExternalInverter QPHIX
+  # physical cores per MPI task
+  NCores = 34
+  # block sizes (see qphix papers for details)
+  By = 8
+  Bz = 8
+  # split the processing of time slices into this many
+  # independent blocks
+  MinCt = 1
+  # (hyper-)thread geometry per core
+  # ompnumthreads = NCores * Sy * Sz
+  # if only a single thread per core is launched
+  # these should both be left as '1'
+  Sy = 1
+  Sz = 2
+  # paddings in XY and XYZ blocks
+  PadXY = 1
+  PadXYZ = 0
+EndExternalInverter    
+\end{Verbatim}
+\end{framed}
+
+\begin{itemize}
+  \item{\texttt{NCores}: number of physical cores per MPI task. On KNL, it might even make sense to specify twice the number of physical cores since each core contains two vector processing units (VPUs). Another possibility would be to specify the number of tiles per MPI task and consider cores and VPUs through \texttt{Sz} and \texttt{Sy} below. The only case that has been tested for performance is to set this equal to the number of physical cores per MPI task.}
+  \item{\texttt{By, Bz}: the QPhiX data structures are organised into blocks which can be efficiently loaded into CPU caches. \texttt{By} and \texttt{Bz} define the size of these blocks in the $Y$ and $Z$ lattice dimensions. The local lattice extent in the given dimension should be divisible by the respective block extent. Generally, $4$ or $8$ are good values and the larger of the two may be preferable.}
+  \item{\texttt{MinCt}: Processing of time slices is split into MinCt blocks. This is useful for dual-socket systems when running with a single MPI task per node. In this case, this should be set to $2$ which will allow the kernels to run in a NUMA-friendly fashion. The local $T$ dimension must be divisible by this number. On KNL, this should be set to $1$. Note that in all cases tested so far, running with $2$ MPI tasks per node on dual-socket systems was superior.}
+  \item{\texttt{Sy, Sz}: thread blocking parameters. When multiple threads share resources (this is the case for cores and hyperthreads on KNL, for example), these parameters make it possible to consider this in the volume-traversal loops implemented in QPhiX. On KNL, the only setting which has been tested for performance is to set this equal to $2$, given that \texttt{NCores} has been set to the number of physical cores. \texttt{Sz} then splits the local $Z$ direction among two hyperthreads.}
+  \item{\texttt{PadXY(Z)}: Adds padding to the QPhiX data structures which may result in higher overall performance. The only values tested on KNL are \texttt{PadXY=1} and \texttt{PadXYZ=0}.}
+\end{itemize}
+
+\noindent\textbf{IMPORTANT}: The global setting \texttt{OmpNumThreads} should be set to \texttt{NCores * Sy * Sz}, otherwise the QPhiX interface will abort execution.
+
+\noindent\textbf{QPhiX operator / monomial parameters}: QPhiX solvers are available in operators for inversions and monomials for performing HMC with the same parameters.
+For a clover determinant, using QPhiX solvers instead of tmLQCD-native ones would be achieved as follows:
+\begin{framed}
+\begin{Verbatim}
+BeginMonomial CLOVERDET
+  Timescale = 1
+  kappa = 0.1394267
+  2KappaMu = 0.00069713350
+  CSW = 1.69
+  rho = 0.238419657
+  MaxSolverIterations = 5000
+  AcceptancePrecision =  1.e-21
+  ForcePrecision = 1.e-16
+  Name = cloverdetlight
+  Solver = mixedcg
+  UseExternalInverter = qphix
+  UseCompression = 12
+  UseSloppyPrecision = single
+EndMonomial  
+\end{Verbatim}
+\end{framed}
+\begin{itemize}
+  \item{\texttt{Solver}: specify the solver type (see below for the solvers supported by the QPhiX interface).} 
+  \item{\texttt{UseExternalInverter}: the external inverter \texttt{qphix} should be used for this monomial.}
+  \item{\texttt{UseCompression}: gauge compression should be used (\texttt{12}). This improves performance by increasing the flop/byte ratio. Twisted boundary conditions are fully supported in all directions.}
+  \item{\texttt{UseSloppyPrecision}: for a solver using just a single arithmetic precision (like basic \texttt{cg} or \texttt{bicgstab}), this sets the arithmetic precision employed. For a mixed-precision solver such as \texttt{mixedcg}, this sets the arithmetic precision of the inner solver.}
+\end{itemize}
+
+\noindent\textbf{Supported solvers}: The QPhiX interface provides support for the solvers:
+\begin{itemize}
+\item{\texttt{cg}}
+\item{\texttt{mixedcg}}
+\item{\texttt{bicgstab}}
+\item{\texttt{mixedbicgstab}}
+\item{\texttt{cgmms} (single-flavour rational \emph{monomials} only)}
+\item{\texttt{cgmmsnd} (two-flavour non-degenerate rational \emph{monomials} only)}
+\end{itemize}
+Note that as usual, \texttt{bicgstab} and \texttt{mixedbicgstab} do not converge for twisted mass fermions at maximal twist.
+
+Note also that if the solver is any of \texttt{cg}, \texttt{bicgstab}, \texttt{cgmms} or \texttt{cgmmsnd} and \texttt{UseSloppyPrecision = single} is set, the selected solver will run in single precision arithmetic.
+Only \texttt{mixedcg} and \texttt{mixedbicgstab} are mixed precision solvers which use the sloppy precision as the precision of the inner solver.
+
+\subsubsection{Notes about QPhiX performance}
+
+\begin{itemize}
+  \item{\textbf{MPI Task and Thread pinning:} QPhiX performs best when MPI tasks are pinned to the resources assigned to them and when threads are bound to individual cores or hyperthreads. This is conveniently achieved for Intel MPI by taking control of resource pinning from the job scheduler, setting \texttt{I\_MPI\_PIN=1} and \texttt{I\_MPI\_PIN\_DOMAIN=N}, where \texttt{N} should be set to a pinning domain appropriate for the chosen parallelisation. In order to then distribute application threads in an optimal fashion across the cores that have been assigned to a given MPI task in this way, setting \texttt{KMP\_AFFINITY="balanced,granularity=fine"} is recommended.}
+  \begin{itemize}
+    \item{Generally, the size of the pinning domain is the number of hyperthreads per core supported by the CPU in question, times the number of cores that a given MPI task should run on. If hyperthreading is disabled on the machine in question, it is simply the number of cores that each MPI task should be allocated.}
+  \end{itemize}
+  \item{\textbf{Halo packing overheads:} In QPhiX, communication in the Y and especially in the X dimension incurs halo packing overheads. These are usually greater than the gain from having a more balanced surface to bulk ratio. It is thus recommended to do as little MPI parallelisation as possible in the X dimension and similarly limit it in the Y dimension, although the latter is less performance-critical.}
+  \item{\textbf{OmniPath networking performance:} On machines based on Intel Knights Landing or Skylake processors with OmniPath networks, best single node performance is generally reached with a single MPI task per node (KNL) or a single MPI task per socket (Skylake). However, until computing centres have implemented the recommendations of Ref.~\cite{Boyle:2017xcy}, more than one or two MPI tasks per node are required to saturate network bandwidth on these machines. Generally, $4$ to $8$ MPI tasks per node seem to work well.}
+\end{itemize}
+
+
diff --git a/doc/quda.tex b/doc/quda.tex
new file mode 100644
index 000000000..069e1183e
--- /dev/null
+++ b/doc/quda.tex
@@ -0,0 +1,209 @@
+%author: Mario Schroeck <mario.schroeck@roma3.infn.it>
+%author: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
+%date: 04/2015
+%date: 06/2017, 12/2017, 06/2018
+
+\subsection{QUDA: A library for QCD on GPUs}\label{subsec:quda}
+
+
+The QUDA \cite{Clark:2009wm, Babich:2011np, Strelchenko:2013vaa} interface is complementary to tmLQCD's own CUDA kernels for computations on the GPU by Florian Burger.
+So far it is exclusively used for inversions.
+
+\subsubsection{Design goals of the interface}
+The QUDA interface has been designed with the following goals in mind, sorted by priority:
+\begin{enumerate}
+	\item \emph{Safety.} Naturally, highest priority is given to the correctness of the output of the interface. 
+	This is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines.
+	\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseExternalInverter} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} are supported.\footnote{{\ttfamily DBCLOVER} is supported by the interface but not by QUDA as of version 0.7.0.}
+	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). In the file {\ttfamily operators.c}, the QUDA library is initialized when an operator is initialized which has set {\ttfamily UseExternalInverter = quda}. There, the actual call to the inverter is conditionally replaced with a call to the QUDA interface.
+	\item \emph{Performance.} The higher priority of the previous items results in small performance detriments. In particular:
+	\begin{itemize}
+		\item tmLQCD's $\theta$-boundary conditions are not compatible with QUDA's 8 and 12 parameter reconstruction of the gauge fields (as of QUDA-0.7.0). Therefore reconstruction/compression is deactivated by default, although it may be activated via the input file, see below.
+		\item The gaugefield is transferred each time to the GPU before the inversion starts in order to ensure not to miss any modifications of the gaugefield.
+	\end{itemize}
+\end{enumerate}
+
+
+\subsubsection{Installation}
+If not already installed, you have to install QUDA first. Download the most recent version from \url{http://lattice.github.io/quda/}. Note that QUDA version $\geq 0.7.0$ is required (chiral gamma basis).
+
+QUDA can be installed without any dependencies, consider, e.g., the following minimal configuration:
+
+\begin{verbatim}
+cmake \
+  -DQUDA_DIRAC_STAGGERED=OFF \
+  -DQUDA_DIRAC_DOMAIN_WALL=OFF \
+  -DQUDA_DIRAC_WILSON=ON \
+  -DQUDA_DIRAC_CLOVER=ON \
+  -DQUDA_DIRAC_TWISTED_MASS=ON \
+  -DQUDA_DIRAC_TWISTED_CLOVER=ON \
+  -DQUDA_DIRAC_NDEG_TWISTED_MASS=ON \
+  -DQUDA_DYNAMIC_CLOVER=ON \
+  -DQUDA_MPI=ON \
+  -DQUDA_INTERFACE_MILC=OFF \
+  -DQUDA_INTERFACE_QDP=ON \
+  -DQUDA_MULTIGRID=ON \
+  -DQUDA_GPU_ARCH=sm_37 \
+  ${path_to_quda}
+\end{verbatim}
+where {\ttfamily \$CUDADIR} and {\ttfamily \$MPI\_PATH} have to be set appropriately.
+{\ttfamily \$QUDADIR} is your choice for the installation directory of QUDA.
+Note that for Wilson clover quarks, you should set \texttt{-DQUDA\_DYNAMIC\_CLOVER=OFF}, whereas the opposite is strictly necessary for twisted mass clover quarks, which means that you will require two QUDA and tmLQCD builds for the time being if you intend to work with both actions.
+Note also that if you want to use QUDA in a scalar build of tmLQCD, you should remove the lines {\ttfamily --enable-multi-gpu} and {\ttfamily --with-mpi=\$MPI\_PATH} in the configuration (and probably you want to replace the MPI compilers).
+In order to profit from QUDA's autotuning functionality, set the environment variable {\ttfamily QUDA\_RESOURCE\_PATH} to a directory of your choice.
+Every time that you update your QUDA installation or change some of the many QUDA environment variables, the files in the directory will have to be deleted or a new directory chosen.
+It is convenient to base the directory dynamically on the head git commit of your QUDA source tree as well as the value of the {\ttfamily QUDA\_ENABLE\_GDR} environment variable.
+There may be other environment variables which make one set of auto-tuning results incompatible with another.
+
+Once QUDA is installed, a minimal configuration of tmLQCD could look like, e.g.,
+\begin{verbatim}
+./configure CC=mpicc \
+--prefix=$TMLQCDDIR \
+--with-limedir=$LIMEDIR \
+--with-lapack=<linker-flags> \
+--enable-mpi \
+--with-mpidimension=4 \
+CXX=mpiCC \
+--with-qudadir=$QUDADIR \
+--with-cudadir=${CUDADIR}/lib
+\end{verbatim}
+Note that a {\ttfamily C++} compiler is required for linking against the QUDA library, therefore set {\ttfamily CXX} appropriately. {\ttfamily \${QUDADIR}} is where you installed QUDA in the previous step and {\ttfamily \${CUDADIR}} is required again for linking.
+
+\subsubsection{Usage}
+Any main program that reads and handles the operator declaration from an input file can easily be set up to use the QUDA inverter by setting the {\ttfamily UseExternalInverter} flag to {\ttfamily quda}. For example, in the input file for the {\ttfamily invert} executable, add the flag to the operator declaration as
+\begin{verbatim}
+BeginOperator TMWILSON
+  2kappaMu = 0.05
+  kappa = 0.177
+  UseEvenOdd = yes
+  Solver = CG
+  SolverPrecision = 1e-14
+  MaxSolverIterations = 1000
+  UseExternalInverter = quda
+EndOperator
+\end{verbatim}
+and the operator of interest will be inverted using QUDA. The initialization of QUDA is done automatically within the operator initialization,  the QUDA library should be finalized by a call to {\ttfamily \_endQuda()} just before finalizing MPI. When you use the QUDA interface for work that is being published, don't forget to cite \cite{Clark:2009wm, Babich:2011np, Strelchenko:2013vaa}.
+
+\subsubsection{General settings}
+Some properties of the QUDA interface can be configured via the {\ttfamily ExternalInverter} section.
+\begin{verbatim}
+BeginExternalInverter QUDA
+  FermionBC = [theta, pbc, apbc]
+EndExternalInverter
+\end{verbatim}
+
+The option {\ttfamily FermionBC} shown above forces twisted ({\ttfamily theta}), periodic ({\ttfamily pbc}) or antiperiodic ({\ttfamily apbc}) temporal quark field boundary conditions.
+This setting exists because at the time of writing (2017.12.28), there seems to be a bug or incompatibility in QUDA which causes (anti-)periodic boundary conditions with gauge compression to produce incorrect propagators.
+
+\subsubsection{QUDA-MG interface}
+The interface has support for the QUDA Multigrid (MG) solver implementation and allows a number of parameters to be adjusted in order to tune the MG setup.
+The defaults for these parameters follow the recommendations of \url{https://github.com/lattice/quda/wiki/Multigrid-Solver}, which also provides useful hints for further tuning.
+Although some of the parameters can be set on a per-level basis, the interface currently only exposes a single setting for all levels, where appropriate.
+The K-cycle is used by default and there is currently no user-exposed option for changing this.
+
+The MG-preconditioned GCR solver is selected as follows:
+\begin{verbatim}
+BeginOperator TMWILSON
+  2kappaMu = 0.05
+  kappa = 0.177
+  UseEvenOdd = yes
+  Solver = mg
+  SolverPrecision = 1e-18
+  MaxSolverIterations = 200
+  UseExternalInverter = quda
+  UseSloppyPrecision = single
+EndOperator
+\end{verbatim}
+
+The MG setup can be tuned using the following parameters in the \texttt{BeginExternalInverter QUDA} section:
+\begin{itemize}
+  \item{ \texttt{MGNumberOfLevels}: number of levels to be used in the MG, $3$ is usually ideal but $2$ can be similarly efficient depending on the quark mass (positive integer, default $3$) }
+  \item{ \texttt{MGSetupSolver}: solver used for generating null vectors. \texttt{CG} or \texttt{BiCGstab} (default \texttt{CG}). Usage of \texttt{BiCGstab} may be recommended for Wilson or clover Wilson quarks. }
+  \item{ \texttt{MGSetupSolverTolerance}: relative target residual (unsquared!) during setup phase. (positive float, default $1\cdot10^{-6}$) }
+  \item{ \texttt{MGSetupMaxSolverIterations}: maximum number of iterations during setup phase. (positive integer, default $1000$) }
+  \item{ \texttt{MGCoarseSolverTolerance}: unsquared relative target residual on the coarse grids. (positive float, default $0.25$) }
+  \item{ \texttt{MGNumberOfVectors}: number of null vectors to compute on a per-level basis. (possible values $\left[ 24, 32 \right]$, default $24$)}
+  \item{ \texttt{MGCoarseMaxSolverIterations}: maximum number of iterations on coarse grids. (positive integer, default $75$) }
+  \item{ \texttt{MGEnableSizeThreeBlocks}: By default, QUDA has limited support for size $3$ aggregates. If set to \emph{yes}, the automatic blocking algorithm will attempt to use them for lattice extents divisible by $3$ when the local lattice extent at a given level is smaller than $16$ aggregate sites. This requires you to instantiate the necessary block sizes in QUDA (see comments below). (boolean \emph{yes} or \emph{no}, default \emph{no}) }
+  \item{ \texttt{MGBlockSizes[X,Y,Z,T]}: aggregate sizes on each level. When these are set for a given lattice dimension, the automatic blocking algorithm for that dimension is overridden and the specified blockings are forced. When the required aggregate sizes are not instantiated in QUDA, the setup phase will fail with an informative error message. (comma-separated list of integers, for a three level solver, for example, this needs to be specified for the first and second level)} 
+  \item{ \texttt{MGSmootherTolerance}: unsquared relative target residual of the smoother on all levels. (positive float, default $0.25$) }
+  \item{ \texttt{MGSmootherPreIterations}: number of smoothing steps before coarse grid correction. (zero or positive integer, default $0$)}
+  \item{ \texttt{MGSmootherPostIterations}: number of smoothing steps after prolongation. (zero or positive integer, default $4$)}
+  \item{ \texttt{MGOverUnderRelaxationFactor}: Over- or under-relaxation factor. (positive float, default $0.85$)}
+  \item{ \texttt{MGCoarseMuFactor}: Scaling factor for twisted mass on a per-level basis, accelerates convergence and reduces condition number of coarse grid. From experience it seems that it's reasonable to set this $>1.0$ only on the coarsest level, but it might also help on intermediate levels. If running with twisted mass, this should always be set and tuned for maximum efficiency. (positive float, usually $ > 1.0$, default $8.0$ from the second level upwards).}
+  \item{ \texttt{MGRunVerify}: Check GPU coarse operators against CPU coarse operators and verify Galerkin projectors during setup phase. This is usually fast enough to always be performed, although sometimes it seems to fail even though the setup works fine. (\emph{yes} or \emph{no}, default \emph{yes}) } 
+\end{itemize}
+
+If no blocking is specified manually, the aggregation parameters are set automatically as follows:
+\begin{itemize}
+  \item{ A default block size of $4$ is attempted if the MPI-partitioned fine or aggregate lattice extent is larger than or equal to $16$ lattice sites. }
+  \item{ If the number of aggregate lattice sites in a given direction is even and smaller than $16$, a block size of $2$ is used. }
+  \item{ The option \texttt{MGEnableSizeThreeBlocks} can be set to \texttt{yes}. Then, for levels coarser than the fine grid, extents smaller than $16$ and divisible by $3$, a block size of $3$ will be used. This will almost certainly require the addition of instantiations of block sizes to QUDA in the restrictor and transfer operator. (\texttt{lib/restrictor.cu} and \texttt{lib/transfer\_util.cu}) }
+  \item{ In all other cases, aggregation is disabled for this direction and level. This includes, for instance, extents divisible by primes other than $2$ or $3$. }
+\end{itemize}
+
+
+Note that at the time of writing (2017.12.28), only double-single mixed-precision is supported for the MG-preconditioned GCR solver and the solve will abort if a double-half precision solve is attempted.
+
+A typical MG setup might look like this for twisted mass clover quarks: 
+
+\begin{verbatim}
+BeginExternalInverter QUDA
+  MGNumberOfLevels = 3
+  MGSetupSolver = cg
+  MGSetupSolverTolerance = 1e-6
+  MGSetupMaxSolverIterations = 1000
+  MGCoarseSolverTolerance = 0.25
+  MGCoarseMaxSolverIterations = 75
+  MGSmootherTolerance = 0.25
+  MGSmootherPreIterations = 2
+  MGSmootherPostIterations = 4
+  MGOverUnderRelaxationFactor = 0.85
+  MGCoarseMuFactor = 1.0, 1.0, 12.0
+  MGNumberOfVectors = 24, 24, 32
+  MGRunVerify = yes
+  MGEnableSizeThreeBlocks = no
+EndExternalInverter
+\end{verbatim}
+
+Alternatively, a blocking can be specified manually:
+
+\begin{verbatim}
+BeginExternalInverter QUDA
+  MGNumberOfLevels = 3
+  MGBlockSizesX = 4, 3
+  MGBlockSizesY = 4, 3
+  MGBlockSizesZ = 6, 4
+  MGBlockSizesT = 6, 4
+  MGSetupSolver = cg
+  MGSetupSolverTolerance = 1e-6
+  MGSetupMaxSolverIterations = 1000
+  MGCoarseSolverTolerance = 0.25
+  MGCoarseMaxSolverIterations = 75
+  MGSmootherTolerance = 0.25
+  MGSmootherPreIterations = 2
+  MGSmootherPostIterations = 4
+  MGOverUnderRelaxationFactor = 0.85
+  MGCoarseMuFactor = 1.0, 1.0, 12.0
+  MGRunVerify = yes
+  MGEnableSizeThreeBlocks = no
+EndExternalInverter
+\end{verbatim}
+
+\subsubsection{More advanced settings}
+To achieve higher performance you may choose single (default) or even half precision as sloppy precision for the inner solver of the mixed precision inverter with reliable updates. After {\ttfamily BeginOperator} and before {\ttfamily EndOperator} set {\ttfamily UseSloppyPrecision = double|single|half}.
+The MG-preconditioned GCR solver only works in double-single mixed precision, but the null vectors are stored in half precision as recommended by Kate Clark.
+
+To activate compression of the gauge fields (in order to save bandwidth and thus to achieve higher performance), set {\ttfamily UseCompression = 8|12|18} within {\ttfamily BeginOperator} and {\ttfamily EndOperator}. 
+The default is 18 which corresponds to no compression. 
+Note that if you use compression, trivial (anti)periodic boundary conditions will be applied to the gauge fields, instead of the default $\theta$-boundary conditions. 
+As a consequence, the residual check on tmLQCD side will fail. 
+Moreover, compression is not applicable when using general $\theta$-boundary conditions in the spatial directions. 
+If trying to do so, compression will be de-activated automatically and the user gets informed via the standard output.
+The \texttt{FermionBC} setting can be used to force particular temporal boundary conditions to be applied to the gauge field in the Dirac operator.
+
+\subsubsection{Functionality}
+The QUDA interface can currently be used to invert {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} within a 4D multi-GPU (MPI) parallel environment with CG, BICGSTAB or MG-preconditioned GCR. QUDA uses even-odd preconditioning, if wanted ({\ttfamily UseEvenOdd = yes}), and the interface is set up to use a mixed precision solver by default. For more details on the QUDA settings check the function {\ttfamily \_initQuda()} in {\ttfamily quda\_interface.c}.
+
+
+
diff --git a/doc/rational.tex b/doc/rational.tex
index 032015b55..945abbb87 100644
--- a/doc/rational.tex
+++ b/doc/rational.tex
@@ -1,4 +1,4 @@
-\subsection{Rational HMC}
+\subsection{Rational HMC} \label{subsec:rationalhmc}
 
 For the heavy doublet one may alternatively use a rational
 approximation 
@@ -117,13 +117,13 @@ \subsubsection{Correction Monomial}
 The rational approximation has a finite precision. In the HMC one can
 account for this effect by estimating
 \[
-1 - |\hat Q_h| R\,,
+1 - |\hat Q_h| \mathcal{R}\,,
 \]
 which can be done in different ways:
 \begin{itemize}
 \item we include an additional monomial for
   \[
-  \det (|\hat Q_h| R)
+  \det (|\hat Q_h| \mathcal{R})
   \]
   in the Hamiltonian. If the rational apprximation is precise enough,
   it is sufficient to only include this in the heatbath and acceptance
@@ -139,7 +139,7 @@ \subsubsection{Correction Monomial}
   \]
   The series
   \[
-  B = (1+Z)^{1/4} =  1 + \frac{1}{4} Z - \frac{3}{32} Z^2 + \frac{7}{122} Z^3 + ...
+  B = (1+Z)^{1/4} =  1 + \frac{1}{4} Z - \frac{3}{32} Z^2 + \frac{7}{128} Z^3 + ...
   \]
   is rapidly converging and can usually be truncated after the $Z^2$
   or latest $Z^3$ term, see
@@ -156,48 +156,58 @@ \subsubsection{Correction Monomial}
   which, again expanding in $Z$ is obtained by
   \[
   \phi^\dagger (1+Z)^{-1/2} \phi = \phi^\dagger (1 - \frac{1}{2}Z +
-  \frac{3}{8}Z^3 + ...) \phi\, .
+  \frac{3}{8}Z^2 - \frac{5}{16}Z^3 + ...) \phi\, .
   \]
   Also here the series can be truncated after the first few terms.
+  Since the correction monomial is not used in the force computation of MD,
+  its final purpose for the HMC is to compute the energy difference
+  \[
+  dH_{corr} = R^\dagger \left(1-(1+Z_{old})^{1/4}(1+Z_{new})^{-1/2}(1+Z_{old})^{1/4}\right)R\,.
+  \]
+  Considering $\mathcal{O}(Z_{old}) = \mathcal{O}(Z_{new}) = \mathcal{O}(Z)$ and
+  using the previous series expansions, we obtain
+  \begin{align*}
+  -dH_{corr} & =  R^\dagger \left( \frac{1}{2} Z_{old} - \frac{1}{2} Z_{new} \right)R\\
+  & + R^\dagger \left( - \frac{1}{8} Z_{old}^2 - \frac{1}{8} \left\{Z_{old} , Z_{new} \right\}  + \frac{3}{8} Z_{new}^2 \right)R\\
+  & + R^\dagger \left( \frac{1}{16} Z_{old}^3 + \frac{3}{64} \left\{ Z_{old}^2 , Z_{new} \right\} - \frac{1}{32} Z_{old} Z_{new} Z_{old} + \frac{3}{32} \left\{ Z_{old} , Z_{new}^2 \right\} - \frac{5}{16} Z_{new}^3 \right)R\\
+  & + \mathcal{O}(Z^4).
+  \end{align*}
+  The coefficients in front of the terms $R^\dagger Z_{old}^n R$ are given by the series of
+  \[
+  (1+Z_{old})^{1/2} -1 =  \frac{1}{2} Z_{old} - \frac{1}{8} Z_{old}^2 + \frac{1}{16} Z_{old}^3 + ...
+  \]
+  For this reason, when computing $\phi = B(Z_{old})\cdot R$, we use the stopping criterion
+  \[
+  c_n R^\dagger Z_{old}^n R < \text{tolerance}
+  \]
+  where $c_n$ are the coefficients from the series of $(1+Z_{old})^{1/2}$.
+  Since $Z$ is hermitian, we can compute in advance the next order correction of the series evaluating
+  \[
+  c_n (Z_{old} R)^\dagger\cdot (Z_{old}^{n-1} R) < \text{tolerance}\,;
+  \]
+  in this way we save an application of $Z_{old}$ in the evaluation of $\phi = B(Z_{old})\cdot R$.
+  
+  Exploiting the hermiticity of $Z$, we can also save applications of it in the computation of
+  \[
+  dH_{corr} = R^\dagger R - \phi^\dagger\left((1+Z_{new})^{-1/2}\right)\phi\,,
+  \]
+  which is done in the acceptance step. Indeed defining $\chi_i = Z_{new}^i\phi$, $dH_{corr}$ reads as
+  \[
+  dH_{corr} = R^\dagger R - \phi^\dagger\phi + \frac{1}{2}\phi^\dagger\chi_1 - \frac{3}{8}\chi_1^\dagger\chi_1 + \frac{5}{16} \chi_1^\dagger\chi_2 - ...\,,
+  \]
+  that requires $n$ applications of $Z_{new}$ for computing $dH_{corr}$ up to $\mathcal{O}(Z_{new}^{2n})$.
+  Here we use the stopping criterion
+  \[
+  c_n \phi^\dagger Z_{new}^{n} \phi < \text{tolerance}\,,
+  \]
+  where $c_n$ are the coefficients from the series of $(1+Z_{new})^{-1/2}$.
+
 \item the second possibility is to include this correction as a
   reweighting factor.
 \item the third is to use a more precise rational approximation for
   the heatbath and acceptance steps.
 \end{itemize}
 
-\subsubsection{CGMMS Solver}
-
-\begin{algorithm}
-  \caption{CGMMS algorithm}
-  \label{alg:cgm}
-  \begin{algorithmic}[1]
-    \vspace{.2cm}
-    \STATE $n=0, x_0^k = 0, r_0 = p_0 = p_0^k = b, k_\mathrm{max},
-    \delta, \epsilon$
-    \STATE  $\biggl.\biggr.\alpha_{-1} = \zeta_{-1}^k = \zeta_0^k = 1, \beta_0^k = \beta_0 = 0$
-    \REPEAT
-    \STATE $\alpha_n = (r_n, r_n) / (p_n, A p_n)$
-    \FOR{$k = 1$ to $k_\mathrm{max}$}
-    \STATE $\biggl.\biggr.\zeta_{n+1}^k = (\zeta^k_n  \alpha_{n-1}) / 
-      (\alpha_n \beta_n(1 - \zeta_n^k / \zeta^k_{n-1}) + \alpha_{n-1}
-      (1-\sigma_k\alpha_n))$
-    \STATE $\alpha^k_n = (\alpha_n \zeta_{n+1}^k)/ \zeta_n^k$
-    \STATE $\biggl.\biggr.x_{n+1}^k = x_n^k + \alpha_n^k p_n^k$
-    \IF{$\|\alpha^{k_\mathrm{max}} p^{k_\mathrm{max}}\| < \delta$}
-    \STATE $k_\mathrm{max} = k_\mathrm{max} -1$
-    \ENDIF
-    \ENDFOR
-    \STATE $x_{n+1} = x_n + \alpha_n p_n$
-    \STATE $\biggl.\biggr.r_{n+1} = r_n - \alpha_n Ap_n$
-    \STATE $\beta_{n+1} = (r_{n+1}, r_{n+1}) / (r_n, r_n)$
-    \STATE $\beta_{n+1}^k = \frac{\beta_{n+1} \zeta_{n+1}^k \alpha_n^k}{\zeta_{n}^k\alpha_n}$
-    \STATE $\biggl.\biggr.p_{n+1}^k = \zeta_{n+1}^k r_{n+1} + \beta_{n+1}^k p_n^k$
-    \STATE $n=n+1$
-    \UNTIL{$\|r_n\|<\epsilon$}
-  \end{algorithmic}
-\end{algorithm}
-
-
 For evaluating the rational approximation $\mathcal{R}$ applied to a
 spinor field $\psi$ a multi-mass or multi-shift solver (see
 algorithm~\ref{alg:cgm}) can be used, see Ref.~\cite{Chiarappa:2006hz}
diff --git a/doc/solvers.tex b/doc/solvers.tex
new file mode 100644
index 000000000..35d1424da
--- /dev/null
+++ b/doc/solvers.tex
@@ -0,0 +1,4 @@
+In this section, we give details of some of the solvers which are implemented in tmLQCD.
+In particular, we clarify some of the conventions used and how these map over to the external library interfaces.
+
+\myinput{cgmms}
\ No newline at end of file
diff --git a/expo.c b/expo.c
index dcac9a983..caf8ba7b6 100644
--- a/expo.c
+++ b/expo.c
@@ -34,7 +34,7 @@
  ************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #ifdef SSE
 # undef SSE
@@ -52,48 +52,132 @@
 #include "su3.h"
 #include "su3adj.h"
 #include "expo.h"
+#include "float.h"
+#include "global.h"
 
-void exposu3(su3* const vr, const su3adj* const p) {
-  int i;
-  su3 ALIGN v,v2;
-  double ALIGN fac,r;
-  double ALIGN a,b;
-  _Complex double ALIGN a0,a1,a2,a1p;
+static double imag_det(const su3adj* p) {
+  /* Cubic polynomial in p->d1..d8 (exposu3 uses its negative as "-1i * det(X)"); tos3=2/sqrt(3), os3=1/sqrt(3). */
+  const double tos3 = 2.0 / sqrt(3.0);
+  const double o3 = 1.0 / 3.0;
+  const double os3 = 1.0 / sqrt(3.0);
+  double d;
+  d = tos3 * p->d8 * (o3 * p->d8 * p->d8 - p->d3 * p->d3) + 2 * (p->d2 * p->d4 * p->d7 - p->d1 * p->d4 * p->d6 - p->d2 * p->d5 * p->d6 - p->d1 * p->d5 * p->d7);
+  d += (os3 * p->d8 - p->d3) * (p->d4 * p->d4 + p->d5 * p->d5) + (os3 * p->d8 + p->d3) * (p->d6 * p->d6 + p->d7 * p->d7) - tos3 * p->d8 * (p->d1 * p->d1 + p->d2 * p->d2);
+  return d;
+}
+
+static void mul_su3alg(su3adj* p,double d) { /* scale all eight components of *p by d, in place */
+  p->d1 *= d;
+  p->d2 *= d;
+  p->d3 *= d;
+  p->d4 *= d;
+  p->d5 *= d;
+  p->d6 *= d;
+  p->d7 *= d;
+  p->d8 *= d;
+}
 
-  /* it writes 'p=vec(h_{j,mu})' in matrix form 'v' */  
+void init_exposu3(void) { /* fills the global table g_exposu3_c[] that exposu3 reads */
+  int k;
+  double fctr = 1.0;
+  g_exposu3_no_c = 0;
+  /* count terms until 1/n! drops to DBL_EPSILON, then add 7 and round up to an even count */
+  while (fctr>DBL_EPSILON) {
+    g_exposu3_no_c++;
+    fctr/=(double)(g_exposu3_no_c);
+  }
+  g_exposu3_no_c += 7;
+  g_exposu3_no_c += (g_exposu3_no_c%2);
+  /* g_exposu3_c[k] = 1/k! for k = 0..g_exposu3_no_c; report a failed allocation instead of dereferencing NULL */
+  g_exposu3_c=malloc((g_exposu3_no_c+1)*sizeof(*g_exposu3_c));
+  if (g_exposu3_c == NULL) fatal_error("Could not allocate memory for g_exposu3_c", "init_exposu3");
+  g_exposu3_c[0]=1.0;
+  for (k=0; k < g_exposu3_no_c; k++)
+    g_exposu3_c[k+1]=g_exposu3_c[k]/(double)(k+1);
+}
+
+void exposu3(su3* const vr, const su3adj* const p) {
+  int n,m,mm;
+  su3 ALIGN v,v2,vt;
+  su3adj pa;
+  double ALIGN d,tc;
+  _Complex double t;
+  _Complex double ALIGN p0,p1,p2;
+  _Complex double ALIGN q0,q1,q2;
+  
   _make_su3(v,*p);
+  _su3_times_su3(v2,v,v);
+  tc = -2.0*(v2.c00 +v2.c11+v2.c22);
+  
+  pa.d1=(*p).d1;
+  pa.d2=(*p).d2;
+  pa.d3=(*p).d3;
+  pa.d4=(*p).d4;
+  pa.d5=(*p).d5;
+  pa.d6=(*p).d6;
+  pa.d7=(*p).d7;
+  pa.d8=(*p).d8;
+  
+  mm=0;
+  while (tc>1.0) {
+    mul_su3alg(&pa,0.5);
+    tc*=0.5;
+    mm+=1;
+  }
+  
+  /* it writes 'p=vec(h_{j,mu})' in matrix form 'v'  */
+  _make_su3(v,pa);
   /* calculates v^2 */
   _su3_times_su3(v2,v,v);
-  /* */
-  a = 0.5 * (creal(v2.c00) + creal(v2.c11) + creal(v2.c22));
-  /* 1/3 imaginary part of tr v*v2 */
-  b = 0.33333333333333333 * cimag(v.c00 * v2.c00 + v.c01 * v2.c10 + v.c02 * v2.c20 +
-                                  v.c10 * v2.c01 + v.c11 * v2.c11 + v.c12 * v2.c21 +
-                                  v.c20 * v2.c02 + v.c21 * v2.c12 + v.c22 * v2.c22  );
-  a0  = 0.16059043836821615e-9;
-  a1  = 0.11470745597729725e-10;
-  a2  = 0.76471637318198165e-12;
-  fac = 0.20876756987868099e-8;      /*  1/12! */
-  r   = 12.0;
-  for(i = 3; i <= 15; ++i)
-  {
-    a1p = a0 + a * a2;
-    a0 = fac + b * I * a2;
-    a2 = a1;
-    a1 = a1p;
-    fac *= r;
-    r -= 1.0;
+  /* t= -tr(X^2)/2*/
+  t = -0.5*(v2.c00 +v2.c11+v2.c22);
+  /* d= -1i * det(X)*/
+  d=-imag_det(&pa);
+ /*  printf(" d= %.16f and t=%.16f + 1i %.16f \n",d,creal(t),cimag(t));*/
+  
+  if(fabs(d)>(1.000001*(1.000002-fabs(t))))
+    printf("The norm of X is larger than 1 and N = %d \n", g_exposu3_no_c);
+  
+  
+  p0=g_exposu3_c[g_exposu3_no_c];
+  p1=0.0;
+  p2=0.0;
+  
+  for (n=(g_exposu3_no_c-1);n>=0;n--) {
+    q0=p0;
+    q1=p1;
+    q2=p2;
+    
+    p0=g_exposu3_c[n]-I*d*q2;
+    p1=q0-t*q2;
+    p2=q1;
   }
+   
   /* vr = a0 + a1*v + a2*v2 */
-  vr->c00 = a0 + a1 * v.c00 + a2 * v2.c00;
-  vr->c01 =      a1 * v.c01 + a2 * v2.c01;
-  vr->c02 =      a1 * v.c02 + a2 * v2.c02;
-  vr->c10 =      a1 * v.c10 + a2 * v2.c10;
-  vr->c11 = a0 + a1 * v.c11 + a2 * v2.c11;
-  vr->c12 =      a1 * v.c12 + a2 * v2.c12;
-  vr->c20 =      a1 * v.c20 + a2 * v2.c20;
-  vr->c21 =      a1 * v.c21 + a2 * v2.c21;
-  vr->c22 = a0 + a1 * v.c22 + a2 * v2.c22;
+  vt.c00 = p0 + p1 * v.c00 + p2 * v2.c00;
+  vt.c01 =      p1 * v.c01 + p2 * v2.c01;
+  vt.c02 =      p1 * v.c02 + p2 * v2.c02;
+  vt.c10 =      p1 * v.c10 + p2 * v2.c10;
+  vt.c11 = p0 + p1 * v.c11 + p2 * v2.c11;
+  vt.c12 =      p1 * v.c12 + p2 * v2.c12;
+  vt.c20 =      p1 * v.c20 + p2 * v2.c20;
+  vt.c21 =      p1 * v.c21 + p2 * v2.c21;
+  vt.c22 = p0 + p1 * v.c22 + p2 * v2.c22;
+  
+  for(m=0;m<mm;m++) {
+    _su3_times_su3(v2,vt,vt);
+    vt=v2;
+  }
+  
+  vr->c00=vt.c00;
+  vr->c01=vt.c01; 
+  vr->c02=vt.c02; 
+  vr->c10=vt.c10;
+  vr->c11=vt.c11;
+  vr->c12=vt.c12;
+  vr->c20=vt.c20;
+  vr->c21=vt.c21;
+  vr->c22=vt.c22;
 }
 
 void exposu3_check(su3* const vr, const su3adj* const p, int im) {
@@ -135,6 +219,12 @@ void restoresu3(su3* const vr, const su3* const u) {
   vr->c20 = conj(vr->c01 * vr->c12 - vr->c02 * vr->c11);
   vr->c21 = conj(vr->c02 * vr->c10 - vr->c00 * vr->c12);
   vr->c22 = conj(vr->c00 * vr->c11 - vr->c01 * vr->c10);
+
+  /* compute  row 2 as the conjugate of the cross-product of 3 and 1 */
+  vr->c10 = conj(vr->c21 * vr->c02 - vr->c22 * vr->c01);
+  vr->c11 = conj(vr->c22 * vr->c00 - vr->c20 * vr->c02);
+  vr->c12 = conj(vr->c20 * vr->c01 - vr->c21 * vr->c00);
+
 }
 
 void restoresu3_in_place(su3* const u) {
@@ -156,6 +246,12 @@ void restoresu3_in_place(su3* const u) {
   u->c20 = conj(u->c01 * u->c12 - u->c02 * u->c11);
   u->c21 = conj(u->c02 * u->c10 - u->c00 * u->c12);
   u->c22 = conj(u->c00 * u->c11 - u->c01 * u->c10);
+
+  /* compute  row 2 as the conjugate of the cross-product of 3 and 1 */
+  u->c10 = conj(u->c21 * u->c02 - u->c22 * u->c01);
+  u->c11 = conj(u->c22 * u->c00 - u->c20 * u->c02);
+  u->c12 = conj(u->c20 * u->c01 - u->c21 * u->c00);
+
 }
                                 
 /* Exponentiates a hermitian 3x3 matrix Q */
diff --git a/expo.h b/expo.h
index dd0c3657f..8e5c1eef3 100644
--- a/expo.h
+++ b/expo.h
@@ -19,10 +19,11 @@
 #ifndef _EXPO_H
 #define _EXPO_H
 
-extern void exposu3(su3* const vr, const su3adj* const p);
-extern void exposu3_check(su3* const vr, const su3adj* const p, int im);
-extern void restoresu3(su3* const vr, const su3* const u);
-extern void restoresu3_in_place(su3* const u);
-extern void exposu3_in_place(su3* const u);
+void init_exposu3();
+void exposu3(su3* const vr, const su3adj* const p);
+void exposu3_check(su3* const vr, const su3adj* const p, int im);
+void restoresu3(su3* const vr, const su3* const u);
+void restoresu3_in_place(su3* const u);
+void exposu3_in_place(su3* const u);
 
 #endif
diff --git a/fatal_error.c b/fatal_error.c
index f7a153b6f..9c3b32f90 100644
--- a/fatal_error.c
+++ b/fatal_error.c
@@ -19,13 +19,13 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <tmlqcd_config.h>
 #endif
 
 #include <stdio.h>
 #include <global.h>
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 
@@ -38,7 +38,7 @@ void fatal_error(char const *error, char const *function)
     fprintf(stderr, "FATAL ERROR\n");
     if (function != NULL)
     {
-#ifdef MPI
+#ifdef TM_USE_MPI
       fprintf(stderr, "  Within %s (reported by node %d):\n", function, g_proc_id);
 #else
       fprintf(stderr, "  Within %s:\n", function);
@@ -48,7 +48,7 @@ void fatal_error(char const *error, char const *function)
     fflush(stderr);
   }
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Abort(MPI_COMM_WORLD, 1);
   MPI_Finalize();
 #endif
diff --git a/gamma.c b/gamma.c
index eb0fcb90b..71e130ebb 100644
--- a/gamma.c
+++ b/gamma.c
@@ -27,7 +27,7 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -36,14 +36,14 @@
 #include "su3.h"
 #include "su3spinor.h"
 #include "gamma.h"
-#ifdef OMP
+#ifdef TM_USE_OMP
 #include <omp.h>
 #endif
 
 /* (*Q) = gammaXY*(*P) */
 
 void gamma0( const int Q,  const int P, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for (int ix = 0; ix < V; ix++){
@@ -51,7 +51,7 @@ void gamma0( const int Q,  const int P, const int V){
   }
 }
 void gamma1( const int Q,  const int P, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for (int ix=0;ix<V;ix++){
@@ -59,7 +59,7 @@ void gamma1( const int Q,  const int P, const int V){
   }
 }
 void gamma2( const int Q,  const int P, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for (int ix=0;ix<V;ix++){
@@ -67,7 +67,7 @@ void gamma2( const int Q,  const int P, const int V){
   }
 }
 void gamma3( const int Q,  const int P, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for (int ix=0;ix<V;ix++){
@@ -75,13 +75,13 @@ void gamma3( const int Q,  const int P, const int V){
   }
 }
 void gamma5(spinor * const l, spinor * const k, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
   int ix;
   spinor *r,*s;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix = 0; ix < V; ix++){
@@ -92,12 +92,12 @@ void gamma5(spinor * const l, spinor * const k, const int V){
     _vector_minus_assign((*r).s2,(*s).s2);
     _vector_minus_assign((*r).s3,(*s).s3);
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /*OpenMP closing brace */
 #endif
 }
 void gamma5new(spinor * const Q, spinor * const P, const int V){ 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for (int ix=0;ix<V;ix++){ 
@@ -105,7 +105,7 @@ void gamma5new(spinor * const Q, spinor * const P, const int V){
   } 
 }
 void gamma50( const int Q,  const int P, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
     for (int ix=0;ix<V;ix++){
@@ -113,7 +113,7 @@ void gamma50( const int Q,  const int P, const int V){
   }
 }
 void gamma51( const int Q,  const int P, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for (int ix=0;ix<V;ix++){
@@ -121,7 +121,7 @@ void gamma51( const int Q,  const int P, const int V){
   }
 }
 void gamma52( const int Q,  const int P, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for (int ix=0;ix<V;ix++){
@@ -129,7 +129,7 @@ void gamma52( const int Q,  const int P, const int V){
   }
 }
 void gamma53( const int Q,  const int P, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for (int ix=0;ix<V;ix++){
@@ -138,7 +138,7 @@ void gamma53( const int Q,  const int P, const int V){
 }
 
 void P_plus(spinor * const Q, spinor * const P, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for (int ix = 0; ix < V; ix++){
@@ -147,7 +147,7 @@ void P_plus(spinor * const Q, spinor * const P, const int V){
 }
 
 void P_minus(spinor * const Q, spinor * const P, const int V){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for (int ix = 0; ix < V; ix++){
@@ -157,7 +157,7 @@ void P_minus(spinor * const Q, spinor * const P, const int V){
 
 void Proj(spinor * const Q, spinor * const P, const int V, const int flag){
   if(flag == 0){ 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
     for (int ix = 0; ix < V; ix++){
@@ -165,7 +165,7 @@ void Proj(spinor * const Q, spinor * const P, const int V, const int flag){
     }
   }
   else if(flag == 1){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
     for (int ix = 0; ix < V; ix++){
diff --git a/gen_sources.c b/gen_sources.c
index 0085a4ef6..91a39189b 100644
--- a/gen_sources.c
+++ b/gen_sources.c
@@ -28,7 +28,7 @@
 
 #include "lime.h"
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -36,10 +36,10 @@
 #include <time.h>
 #include <sys/time.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -98,11 +98,11 @@ int main(int argc,char *argv[]) {
   L=0;
   T=0;
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Init(&argc, &argv);
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   /* FIXME: in principle this should not be set like this as it could result
     in thread oversubscription when more than one process is run locally
     unfortunately, there does not seem to be a standard way to determine
@@ -206,10 +206,16 @@ int main(int argc,char *argv[]) {
     for(is = 0; is < 4; is ++) {
       for(ic = 0; ic < 3; ic++) {
 	if(!filenameflag && !appendflag) {
-	  sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d.%.2d", filename, nstore, sample, t0, 3*is+ic); 
+          if(T_global > 99) {
+            sprintf(spinorfilename, "%s.%.4d.%.4d.%.3d.%.2d", filename, nstore, sample, t0, 3*is+ic);
+          }
+          else {
+            sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d.%.2d", filename, nstore, sample, t0, 3*is+ic);
+          }
 	}
 	else if(!filenameflag && appendflag) {
-	  sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d", filename, nstore, sample, t0); 
+          if(T_global > 99) sprintf(spinorfilename, "%s.%.4d.%.4d.%.3d", filename, nstore, sample, t0); 
+	  else sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d", filename, nstore, sample, t0); 
 	}
 	else{
 	  sprintf(spinorfilename, "%s.%.2d", filename, 3*is+ic); 
@@ -233,7 +239,8 @@ int main(int argc,char *argv[]) {
   else {
     if(!ext_sourceflag) {
       if(!filenameflag) {
-	sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d", filename, nstore, sample, t0); 
+	if(T_global > 99) sprintf(spinorfilename, "%s.%.4d.%.4d.%.3d", filename, nstore, sample, t0); 
+        else sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d", filename, nstore, sample, t0); 
       }
       else {
 	sprintf(spinorfilename, "%s", filename); 
@@ -241,7 +248,7 @@ int main(int argc,char *argv[]) {
       printf("Generating source %s!\n", spinorfilename);
       fflush(stdout);
       source_generation_pion_only(g_spinor_field[0], g_spinor_field[1], 
-				  t0, sample, nstore);
+				  t0, sample, nstore, random_seed);
       
       co = scalar_prod(g_spinor_field[1], g_spinor_field[1], VOLUME/2, 1);
       write_source_type(0, spinorfilename);
@@ -249,7 +256,8 @@ int main(int argc,char *argv[]) {
     }
     else {
       if(!filenameflag) {
-        sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d.inverted", filename, nstore, sample, t0);
+        if(T_global > 99) sprintf(spinorfilename, "%s.%.4d.%.4d.%.3d.inverted", filename, nstore, sample, t0);
+        else sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d.inverted", filename, nstore, sample, t0);
       }
       else {
         sprintf(spinorfilename, "%s.inverted", filename);
@@ -259,9 +267,10 @@ int main(int argc,char *argv[]) {
       printf("Generating ext. pion source %s!\n", spinorfilename);
       extended_pion_source(g_spinor_field[2], g_spinor_field[3],
 			   g_spinor_field[0], g_spinor_field[1],
-			   t0, 0., 0., 0.);
+			   t0, (g_nproc_t*T)/2, 0., 0., 0.);
       if(!filenameflag) {
-	sprintf(spinorfilename, "g%s.%.4d.%.4d.%.2d", filename, nstore, sample, t0); 
+	if(T_global > 99) sprintf(spinorfilename, "g%s.%.4d.%.4d.%.3d", filename, nstore, sample, t0); 
+        else sprintf(spinorfilename, "g%s.%.4d.%.4d.%.2d", filename, nstore, sample, t0); 
       }
       else {
 	sprintf(spinorfilename, "g%s", filename); 
@@ -271,7 +280,7 @@ int main(int argc,char *argv[]) {
     }
   }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Finalize();
 #endif
   free_geometry_indices();
diff --git a/geometry_eo.c b/geometry_eo.c
index 6a3297e61..cd985bd93 100644
--- a/geometry_eo.c
+++ b/geometry_eo.c
@@ -32,7 +32,7 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -751,7 +751,7 @@ void geometry(){
   int startvaluey = 0;
   int startvaluez = 0;
   int * xeven;
-#if defined MPI
+#if defined TM_USE_MPI
   int isp, *ones, *oneS, *oneL;
   int lsliceS, lsliceL, check_struct_zt;
 #endif
@@ -885,7 +885,11 @@ void geometry(){
   }
 
   for(j=0; j<4; j++){  // NEW GIUPDNEO
-    for(ix=0;ix< (VOLUME+RAND);ix++){
+    for(ix = 0; ix < (VOLUME)/2; ix++){
+      g_iup_eo[ix][j]=g_lexic2eosub[g_iup[g_eo2lexic[ix]][j]];
+      g_idn_eo[ix][j]=g_lexic2eosub[g_idn[g_eo2lexic[ix]][j]];
+    }
+    for(ix = (VOLUME+RAND)/2; ix < VOLUME+RAND/2; ix++){
       g_iup_eo[ix][j]=g_lexic2eosub[g_iup[g_eo2lexic[ix]][j]];
       g_idn_eo[ix][j]=g_lexic2eosub[g_idn[g_eo2lexic[ix]][j]];
     }
@@ -1455,8 +1459,9 @@ void geometry(){
     }
 #endif
   }
-
-  Hopping_Matrix_Indices();
+  if(!lowmem_flag){
+    Hopping_Matrix_Indices();
+  }
 
   free(xeven);
 }
diff --git a/get_rectangle_staples.c b/get_rectangle_staples.c
index f39a1a85e..b3d48f7d1 100644
--- a/get_rectangle_staples.c
+++ b/get_rectangle_staples.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -27,7 +27,10 @@
 #include "get_rectangle_staples.h"
 
 void get_rectangle_staples(su3 * const v, const int x, const int mu) {
+  get_rectangle_staples_general(v,x,mu,g_gauge_field);
+}
 
+void get_rectangle_staples_general(su3 * const v, const int x, const int mu, const su3** const gf) {
   su3 ALIGN tmp1, tmp2;
   int y, z, nu;
   su3 * a, * b, * c, * d, * e;
@@ -47,18 +50,18 @@ void get_rectangle_staples(su3 * const v, const int x, const int mu) {
        * b| |e
        * a| |d
        */
-      a = &g_gauge_field[x][nu];
+      a = &gf[x][nu];
       y = g_iup[x][nu];
-      b = &g_gauge_field[y][nu];
+      b = &gf[y][nu];
       _su3_times_su3(tmp1, *a, *b);
       z = g_iup[y][nu];
-      c = &g_gauge_field[z][mu];
+      c = &gf[z][mu];
       _su3_times_su3(tmp2, tmp1, *c);
 
       y = g_iup[x][mu];
-      d = &g_gauge_field[y][nu];
+      d = &gf[y][nu];
       z = g_iup[y][nu];
-      e = &g_gauge_field[z][nu];
+      e = &gf[z][nu];
       _su3_times_su3(tmp1, *d, *e);
       _su3_times_su3d_acc((*v), tmp2, tmp1);
 
@@ -71,16 +74,16 @@ void get_rectangle_staples(su3 * const v, const int x, const int mu) {
        */
       y = g_idn[x][nu];
       z = g_idn[y][nu];
-      d = &g_gauge_field[z][nu];
-      a = &g_gauge_field[z][mu];
+      d = &gf[z][nu];
+      a = &gf[z][mu];
       _su3d_times_su3(tmp1, *d, *a);
-      e = &g_gauge_field[y][nu];
+      e = &gf[y][nu];
       _su3d_times_su3(tmp2, *e, tmp1);
 
       y = g_iup[z][mu];
-      b = &g_gauge_field[y][nu];
+      b = &gf[y][nu];
       z = g_iup[y][nu];
-      c = &g_gauge_field[z][nu];
+      c = &gf[z][nu];
       _su3_times_su3(tmp1, *b, *c);
       _su3_times_su3_acc((*v), tmp2, tmp1);
 
@@ -92,18 +95,18 @@ void get_rectangle_staples(su3 * const v, const int x, const int mu) {
        * a| _|e
        *    d
        */
-      a = &g_gauge_field[x][nu];
+      a = &gf[x][nu];
       y = g_iup[x][nu];
-      b = &g_gauge_field[y][mu];
+      b = &gf[y][mu];
       _su3_times_su3(tmp1, *a, *b);
       z = g_iup[y][mu];
-      c = &g_gauge_field[z][mu];
+      c = &gf[z][mu];
       _su3_times_su3(tmp2, tmp1, *c);
 
       y = g_iup[x][mu];
-      d = &g_gauge_field[y][mu];
+      d = &gf[y][mu];
       z = g_iup[y][mu];
-      e = &g_gauge_field[z][nu];
+      e = &gf[z][nu];
       _su3_times_su3(tmp1, *d, *e);
       _su3_times_su3d_acc((*v), tmp2, tmp1);
 
@@ -116,17 +119,17 @@ void get_rectangle_staples(su3 * const v, const int x, const int mu) {
        *   ab
        */
       y = g_idn[x][nu];
-      d = &g_gauge_field[y][nu];
-      a = &g_gauge_field[y][mu];
+      d = &gf[y][nu];
+      a = &gf[y][mu];
       _su3d_times_su3(tmp1, *d, *a);
       z = g_iup[y][mu];
-      b = &g_gauge_field[z][mu];
+      b = &gf[z][mu];
       _su3_times_su3(tmp2, tmp1, *b);
 
       y = g_iup[z][mu];
-      c = &g_gauge_field[y][nu];
+      c = &gf[y][nu];
       z = g_iup[x][mu];
-      e = &g_gauge_field[z][mu];
+      e = &gf[z][mu];
       _su3_times_su3d(tmp1, *c, *e);
       _su3_times_su3_acc((*v), tmp2, tmp1);
 
@@ -140,16 +143,16 @@ void get_rectangle_staples(su3 * const v, const int x, const int mu) {
        */
       y = g_idn[x][mu];
       z = g_idn[y][nu];
-      d = &g_gauge_field[z][nu];
-      a = &g_gauge_field[z][mu];
+      d = &gf[z][nu];
+      a = &gf[z][mu];
       _su3d_times_su3(tmp1, *d, *a);
-      e = &g_gauge_field[y][mu];
+      e = &gf[y][mu];
       _su3d_times_su3(tmp2, *e, tmp1);
 
       y = g_idn[x][nu];
-      b = &g_gauge_field[y][mu];
+      b = &gf[y][mu];
       z = g_iup[y][mu];
-      c = &g_gauge_field[z][nu];
+      c = &gf[z][nu];
       _su3_times_su3(tmp1, *b, *c);
       _su3_times_su3_acc((*v), tmp2, tmp1); 
 
@@ -162,17 +165,17 @@ void get_rectangle_staples(su3 * const v, const int x, const int mu) {
        *  d
        */
       y = g_idn[x][mu];
-      d = &g_gauge_field[y][mu];
+      d = &gf[y][mu];
       z = g_iup[y][nu];
-      a = &g_gauge_field[y][nu];
+      a = &gf[y][nu];
       _su3d_times_su3(tmp1, *d, *a);
-      b = &g_gauge_field[z][mu];
+      b = &gf[z][mu];
       _su3_times_su3(tmp2, tmp1, *b);
 
       y = g_iup[x][mu];
-      e = &g_gauge_field[y][nu];
+      e = &gf[y][nu];
       z = g_iup[x][nu];
-      c = &g_gauge_field[z][mu];
+      c = &gf[z][mu];
       _su3_times_su3d(tmp1, *c, *e);
       _su3_times_su3_acc((*v), tmp2, tmp1);
     }
diff --git a/get_rectangle_staples.h b/get_rectangle_staples.h
index 53c81b78f..fa718c205 100644
--- a/get_rectangle_staples.h
+++ b/get_rectangle_staples.h
@@ -20,5 +20,6 @@
 #define _GET_RECTANGLE_STAPLES_H
 
 void get_rectangle_staples(su3 * const v, const int x, const int mu);
+void get_rectangle_staples_general(su3 * const v, const int x, const int mu, const su3** const gf);
 
 #endif
diff --git a/get_staples.c b/get_staples.c
index 49d32c176..bf7f976c4 100644
--- a/get_staples.c
+++ b/get_staples.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/getopt.c b/getopt.c
index 835828c65..ba131a07f 100644
--- a/getopt.c
+++ b/getopt.c
@@ -46,7 +46,7 @@
 #endif
 
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
 
 #if !defined __STDC__ || !__STDC__
diff --git a/gettime.c b/gettime.c
index ce3e3a334..54cfe692d 100644
--- a/gettime.c
+++ b/gettime.c
@@ -20,20 +20,20 @@
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #ifdef HAVE_CLOCK_GETTIME
 #  ifndef _POSIX_C_SOURCE
 #    define _POSIX_C_SOURCE 199309L
 #  endif
 #  include <sys/time.h>
-#  include <bits/time.h>
+//#  include <bits/time.h>
 #endif
 #include <time.h>
 #if (defined BGL && !defined BGP)
 #  include <rts.h>
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 
@@ -46,7 +46,7 @@ double gettime(void) {
   const double clockspeed=1.0e-6/700.0;
   t = rts_get_timebase() * clockspeed;
 
-#elif defined MPI
+#elif defined TM_USE_MPI
 
   t = MPI_Wtime();
 
diff --git a/global.h b/global.h
index b9e57a035..2be0b9961 100644
--- a/global.h
+++ b/global.h
@@ -1,4 +1,5 @@
 /***********************************************************************
+ *
  *
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
  *
@@ -31,11 +32,11 @@
  *
  ***************************************************************/
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  include <mpi.h>
 #endif
 #ifdef FIXEDVOLUME
@@ -63,29 +64,52 @@
 # include "bgl.h"
 #endif
 
+#ifdef TM_USE_BSM
 // for Frezzotti-Rossi model Dirac operator
-EXTERN double eta_BSM, rho_BSM, m0_BSM;
+EXTERN double eta_BSM, rho_BSM, m0_BSM, c5phi_BSM, r_BSM, mu03_BSM, mu01_BSM, csw_BSM, r0_BSM;
+EXTERN int propagatorsonthefly_BSM;
+EXTERN int smearedcorrelator_BSM;
+EXTERN int densitydensity_BSM;
+EXTERN int densitydensity_s0s0_BSM;
+EXTERN int densitydensity_sxsx_BSM;
+EXTERN int diraccurrentdensity_BSM;
+EXTERN int wilsoncurrentdensitypr1_BSM;
+EXTERN int wilsoncurrentdensitypr2_BSM;
+EXTERN int wilsoncurrentdensitypl1_BSM;
+EXTERN int wilsoncurrentdensitypl2_BSM;
+EXTERN int vectorcurrentcurrent_BSM;
+EXTERN int axialcurrentcurrent_BSM;
+EXTERN int vectordensitydensity_BSM;
+EXTERN int vectorcurrentdensity_BSM;
+EXTERN int axialcurrentdensity_BSM;
+EXTERN int pdensityvectordensity_BSM;
+EXTERN int giancarlo_BSM;
+EXTERN int timesmearcorrelator_BSM;
+
 #define TUP 0
 #define XUP 1
 #define YUP 2
 #define ZUP 3
+
 #define ZDOWN 4
 #define YDOWN 5
 #define XDOWN 6
 #define TDOWN 7
 #define NODIR 8
-EXTERN scalar ** g_smearedscalar;
-
-EXTERN int DUM_DERI, DUM_SOLVER, DUM_MATRIX;
+#endif
+EXTERN int DUM_DERI, DUM_MATRIX;
 EXTERN int NO_OF_SPINORFIELDS;
+EXTERN int NO_OF_SPINORFIELDS_32;
 
 EXTERN int DUM_BI_DERI, DUM_BI_SOLVER, DUM_BI_MATRIX;
 EXTERN int NO_OF_BISPINORFIELDS;
 
 EXTERN int g_update_gauge_copy;
+EXTERN int g_update_gauge_copy_32;
 EXTERN int g_relative_precision_flag;
 EXTERN int g_debug_level;
 EXTERN int g_disable_IO_checks;
+EXTERN int g_disable_src_IO_checks;
 
 EXTERN int T_global;
 #ifndef FIXEDVOLUME
@@ -111,13 +135,16 @@ EXTERN int ** g_iup_eo; /* NEW GIUPDNEO */
 EXTERN int ** g_idn_eo;
 EXTERN int ** g_coord;
 EXTERN int * g_hi;
+#if defined TM_USE_BSM
 EXTERN int * g_bsm_2hop_lookup;
+#endif
 
 
 EXTERN int * g_field_z_ipt_even;
 EXTERN int * g_field_z_ipt_odd;
 
 EXTERN spinor ** g_spinor_field;
+EXTERN spinor32 ** g_spinor_field32;
 
 EXTERN bispinor ** g_bispinor_field;
 EXTERN spinor * g_tbuff;
@@ -184,13 +211,29 @@ EXTERN int g_running_phmc;
 /* End IF PHMC  */
 
 EXTERN su3 ** g_gauge_field;
+EXTERN su3_32 ** g_gauge_field_32;
+#ifdef TM_USE_BSM 
+EXTERN su3 ** g_smeared_gauge_field;
+#endif
 #ifdef _USE_HALFSPINOR
 EXTERN su3 *** g_gauge_field_copy;
+EXTERN su3_32 *** g_gauge_field_copy_32;
+#ifdef TM_USE_BSM
+EXTERN su3 *** g_smeared_gauge_field_copy;
+#endif
 #elif (defined _USE_TSPLITPAR )
 EXTERN su3 ** g_gauge_field_copyt;
 EXTERN su3 ** g_gauge_field_copys;
+#ifdef TM_USE_BSM
+EXTERN su3 ** g_smeared_gauge_field_copyt;
+EXTERN su3 ** g_smeared_gauge_field_copys;
+#endif
 #else
 EXTERN su3 ** g_gauge_field_copy;
+EXTERN su3_32 ** g_gauge_field_copy_32;
+#ifdef TM_USE_BSM
+EXTERN su3 ** g_smeared_gauge_field_copy;
+#endif
 #endif
 
 /*for temporalgauge in GPU part*/
@@ -199,13 +242,15 @@ EXTERN su3 ** g_tempgauge_field;
 EXTERN su3adj ** moment;
 EXTERN su3adj ** df0;
 EXTERN su3adj ** ddummy;
-
+#ifdef TM_USE_BSM
 /* scalar field (BSM toy model) */
 EXTERN scalar ** g_scalar_field;
+EXTERN scalar ** g_smeared_scalar_field;
+#endif
 
 EXTERN int count00,count01,count10,count11,count20,count21;
-EXTERN double g_kappa, g_c_sw, g_ka_csw_8, g_beta;
-EXTERN double g_mu, g_mu1, g_mu2, g_mu3;
+EXTERN double g_kappa, g_c_sw, g_beta;
+EXTERN double g_mu, g_mu1, g_mu2, g_mu3, g_shift;
 EXTERN double g_rgi_C0, g_rgi_C1;
 
 /* Parameters for non-degenrate case */
@@ -222,6 +267,10 @@ EXTERN int g_mpi_z_rank;
 EXTERN int g_mpi_ST_rank;
 EXTERN int g_nb_list[8];
 
+/* Variables for exposu3 */
+EXTERN int g_exposu3_no_c;
+EXTERN double * g_exposu3_c;
+
 /* OpenMP Kahan accumulation arrays */
 EXTERN _Complex double *g_omp_acc_cp;
 EXTERN double* g_omp_acc_re;
@@ -230,8 +279,21 @@ EXTERN double* g_omp_acc_re;
 EXTERN int g_dflgcr_flag;
 EXTERN int g_N_s;
 EXTERN int * index_block_eo;
-
-#ifdef MPI
+EXTERN int Msap_precon;
+EXTERN int NiterMsap;
+EXTERN int NcycleMsap;
+EXTERN int NiterMsap_dflgen;
+EXTERN int NcycleMsap_dflgen;
+EXTERN int NsmoothMsap_dflgen;
+EXTERN int usePL;
+EXTERN int little_solver;
+EXTERN int little_evenodd;
+EXTERN int little_gmres_m_parameter;
+EXTERN double little_solver_low_prec;
+EXTERN double little_solver_high_prec;
+EXTERN int little_solver_max_iter;
+
+#ifdef TM_USE_MPI
 EXTERN MPI_Status status;
 EXTERN MPI_Request req1,req2,req3,req4;
 EXTERN MPI_Comm g_cart_grid;
@@ -248,7 +310,11 @@ EXTERN int g_nb_z_up, g_nb_z_dn;
 
 #endif
 
-#ifdef OMP
+EXTERN int subprocess_flag;
+EXTERN int lowmem_flag;
+EXTERN int g_external_id;
+
+#ifdef TM_USE_OMP
 EXTERN int omp_num_threads;
 #endif
 
@@ -275,3 +341,14 @@ void fatal_error(char const *error, char const *function);
 
 #endif
 
+/*
+ * Generic macro for swapping two values or pointers x and y; both must be lvalues (their address is taken).
+ * We use memcpy because it is optimal when the amount to copy is known at compile time.
+ * "sizeof(x) == sizeof(y) ? (signed)sizeof(x) : -1" is a compile-time check that x and y have the same size (a mismatch gives a negative array length).
+ */
+#define SWAP(x,y) do \
+{ unsigned char swap_temp[sizeof(x) == sizeof(y) ? (signed)sizeof(x) : -1]; \
+  memcpy(swap_temp,&y,sizeof(x)); \
+  memcpy(&y,&x,       sizeof(x)); \
+  memcpy(&x,swap_temp,sizeof(x)); \
+} while(0)
diff --git a/hmc_tm.c b/hmc_tm.c
index a15b7eaf1..dcb565b88 100644
--- a/hmc_tm.c
+++ b/hmc_tm.c
@@ -26,7 +26,7 @@
  *******************************************************************************/
 #include "lime.h"
 #if HAVE_CONFIG_H
-#include<config.h>
+#include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -36,10 +36,10 @@
 #include <string.h>
 #include <signal.h>
 #include <unistd.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -52,7 +52,7 @@
 #include "start.h"
 #include "measure_gauge_action.h"
 #include "measure_rectangles.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include "xchange/xchange.h"
 #endif
 #include "read_input.h"
@@ -67,7 +67,10 @@
 #include "monomial/monomial.h"
 #include "integrator.h"
 #include "sighandler.h"
-#include "measurements.h"
+#include "meas/measurements.h"
+#ifdef DDalphaAMG
+#include "DDalphaAMG_interface.h"
+#endif
 
 extern int nstore;
 
@@ -100,8 +103,6 @@ int main(int argc,char *argv[]) {
   /* Do we want to perform reversibility checks */
   /* See also return_check_flag in read_input.h */
   int return_check = 0;
-  /* For getopt */
-  int c;
 
   paramsXlfInfo *xlfInfo;
 
@@ -125,36 +126,13 @@ int main(int argc,char *argv[]) {
   verbose = 1;
   g_use_clover_flag = 0;
 
-#ifdef MPI
-
-#  ifdef OMP
-  int mpi_thread_provided;
-  MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
-#  else
-  MPI_Init(&argc, &argv);
-#  endif
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
-#else
-  g_proc_id = 0;
-#endif
-
   process_args(argc,argv,&input_filename,&filename);
   set_default_filenames(&input_filename,&filename);
 
-  /* Read the input file */
-  if( (status = read_input(input_filename)) != 0) {
-    fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename);
-    exit(-1);
-  }
-
-#ifdef OMP
-  init_openmp();
-#endif
+  init_parallel_and_read_input(argc, argv, input_filename);
 
   DUM_DERI = 4;
-  DUM_SOLVER = DUM_DERI+1;
-  DUM_MATRIX = DUM_SOLVER+6;
+  DUM_MATRIX = DUM_DERI+7;
   if(g_running_phmc) {
     NO_OF_SPINORFIELDS = DUM_MATRIX+8;
   }
@@ -166,7 +144,10 @@ int main(int argc,char *argv[]) {
 
   DUM_BI_MATRIX = DUM_BI_SOLVER+6;
   NO_OF_BISPINORFIELDS = DUM_BI_MATRIX+6;
-
+  
+  // 6 extra fields (corresponding to DUM_MATRIX+0..5) for deg. and ND matrix mult.
+  NO_OF_SPINORFIELDS_32 = 6;
+  
   tmlqcd_mpi_init(argc, argv);
 
   if(nstore == -1) {
@@ -183,7 +164,7 @@ int main(int argc,char *argv[]) {
     }
   }
   
-#ifndef MPI
+#ifndef TM_USE_MPI
   g_dbw2rand = 0;
 #endif
   
@@ -192,12 +173,16 @@ int main(int argc,char *argv[]) {
   
 #ifdef _GAUGE_COPY
   status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
+  status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
+  status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 0);   
 #endif
   /* need temporary gauge field for gauge reread checks and in update_tm */
   status += init_gauge_tmp(VOLUME);
 
+  status += init_gauge_fg(VOLUME);
+
   if (status != 0) {
     fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
     exit(0);
@@ -209,9 +194,11 @@ int main(int argc,char *argv[]) {
   }
   if(even_odd_flag) {
     j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS);
+    j += init_spinor_field_32(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS_32);      
   }
   else {
     j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
+    j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32);    
   }
   if (j != 0) {
     fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
@@ -281,9 +268,14 @@ int main(int argc,char *argv[]) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
     exit(-1);
   }
-  if(g_sloppy_precision_flag == 1) {
-    init_dirac_halfspinor32();
-  }
+
+  j = init_dirac_halfspinor32();
+  if (j != 0)
+  {
+    fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
+    exit(-1);
+  } 
+  
 #  if (defined _PERSISTENT)
   init_xchange_halffield();
 #  endif
@@ -320,10 +312,14 @@ int main(int argc,char *argv[]) {
   }
 
   /*For parallelization: exchange the gaugefield */
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_gauge(g_gauge_field);
 #endif
-
+    
+  /*Convert to a 32 bit gauge field, after xchange*/
+  convert_32_gauge_field(g_gauge_field_32, g_gauge_field, VOLUMEPLUSRAND + g_dbw2rand);
+  
+    
   if(even_odd_flag) {
     j = init_monomials(VOLUMEPLUSRAND/2, even_odd_flag);
   }
@@ -375,7 +371,7 @@ int main(int argc,char *argv[]) {
   if(g_proc_id == 0) {
     gettimeofday(&t1,NULL);
     countfile = fopen("history_hmc_tm", "a");
-    fprintf(countfile, "!!! Timestamp %ld, Nsave = %d, g_mu = %e, g_mu1 = %e, g_mu_2 = %e, g_mu3 = %e, beta = %f, kappa = %f, C1 = %f, ",
+    fprintf(countfile, "!!! Timestamp %ld, Nsave = %d, g_mu = %.12f, g_mu1 = %.12f, g_mu_2 = %.12f, g_mu3 = %.12f, beta = %.12f, kappa = %.12f, C1 = %f, ",
             t1.tv_sec, Nsave, g_mu, g_mu1, g_mu2, g_mu3, g_beta, g_kappa, g_rgi_C1);
     for(j = 0; j < Integrator.no_timescales; j++) {
       fprintf(countfile, "n_int[%d] = %d ", j, Integrator.no_mnls_per_ts[j]);
@@ -455,9 +451,10 @@ int main(int argc,char *argv[]) {
             } else {
               if(g_proc_id==0) {
                 if(read_attempt+1 < 2) {
-                  fprintf(stdout, "# Reread attempt %d out of %d failed, trying again in %d seconds!\n",read_attempt+1,2,2);
-                } else {
-                  fprintf(stdout, "$ Reread attept %d out of %d failed, write will be reattempted!\n",read_attempt+1,2,2);
+                  fprintf(stdout, "# Reread attempt %d out of %d failed, trying again in %d seconds!\n", read_attempt+1, 2, 2);
+                } 
+		else {
+                  fprintf(stdout, "# Reread attempt %d out of %d failed, write will be reattempted!\n", read_attempt+1, 2);
                 }
               }
               sleep(2);
@@ -482,7 +479,7 @@ int main(int argc,char *argv[]) {
             fprintf(stdout, "# Will attempt to write again in %d seconds.\n", io_timeout);
           
           sleep(io_timeout);
-#ifdef MPI
+#ifdef TM_USE_MPI
           MPI_Barrier(MPI_COMM_WORLD);
 #endif
         }
@@ -501,12 +498,22 @@ int main(int argc,char *argv[]) {
     }
 
     /* online measurements */
+#ifdef DDalphaAMG
+    // When the configuration is rejected, we have to update it in the MG and redo the setup.
+    int mg_update = accept ? 0:1;
+#endif
     for(imeas = 0; imeas < no_measurements; imeas++){
       meas = &measurement_list[imeas];
       if(trajectory_counter%meas->freq == 0){
         if (g_proc_id == 0) {
           fprintf(stdout, "#\n# Beginning online measurement.\n");
         }
+#ifdef DDalphaAMG
+        if( mg_update ) {
+          mg_update = 0;
+          MG_reset();
+        }
+#endif
         meas->measurefunc(trajectory_counter, imeas, even_odd_flag);
       }
     }
@@ -519,7 +526,7 @@ int main(int argc,char *argv[]) {
       verbose = 0;
     }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Barrier(MPI_COMM_WORLD);
 #endif
     if(ix == 0 && g_proc_id == 0) {
@@ -540,13 +547,15 @@ int main(int argc,char *argv[]) {
     fclose(parameterfile);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   free_omp_accumulators();
 #endif
   free_gauge_tmp();
   free_gauge_field();
+  free_gauge_field_32();  
   free_geometry_indices();
   free_spinor_field();
+  free_spinor_field_32();  
   free_moment_field();
   free_monomials();
   if(g_running_phmc) {
@@ -555,7 +564,9 @@ int main(int argc,char *argv[]) {
   }
   free(input_filename);
   free(filename);
-#ifdef MPI
+  free(SourceInfo.basename);
+  free(PropInfo.basename);
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
 #endif
diff --git a/hopping_test.c b/hopping_test.c
index 9547e5f8f..753ab66ce 100644
--- a/hopping_test.c
+++ b/hopping_test.c
@@ -24,7 +24,7 @@
 *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -35,7 +35,7 @@
 #if (defined BGL && !defined BGP)
 #  include <rts.h>
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 # ifdef HAVE_LIBLEMON
 #  include <io/params.h>
@@ -89,7 +89,7 @@ double bgl_wtime() {
   return ( rts_get_timebase() * clockspeed );
 }
 #else
-# ifdef MPI
+# ifdef TM_USE_MPI
 double bgl_wtime() { return(MPI_Wtime()); }
 # else
 double bgl_wtime() { return(0); }
@@ -106,10 +106,9 @@ int main(int argc,char *argv[])
   double delta, deltamax;
   spinor rsp;
   int status = 0;
-#ifdef MPI
+#ifdef TM_USE_MPI
   DUM_DERI = 6;
-  DUM_SOLVER = DUM_DERI+2;
-  DUM_MATRIX = DUM_SOLVER+6;
+  DUM_MATRIX = DUM_DERI+8;
   NO_OF_SPINORFIELDS = DUM_MATRIX+2;
 
   MPI_Init(&argc, &argv);
@@ -158,7 +157,7 @@ int main(int argc,char *argv[])
 #ifdef _INDEX_INDEP_GEOM
     printf("# the code was compiled with index independent geometry\n");
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  ifdef _NON_BLOCKING
     printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 #  endif
@@ -240,7 +239,7 @@ int main(int argc,char *argv[])
     exit(1);
   }
 
-#if (defined MPI && !(defined _USE_SHMEM))
+#if (defined TM_USE_MPI && !(defined _USE_SHMEM))
   check_xchange(); 
 #endif
 
@@ -259,7 +258,7 @@ int main(int argc,char *argv[])
   }
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   /*For parallelization: exchange the gaugefield */
   xchange_gauge(g_gauge_field);
 #endif
@@ -277,7 +276,7 @@ int main(int argc,char *argv[])
     }	else if (read_source_flag == 1) { /* yes */
       /* even first, odd second */
       read_spinorfield_cm_single(g_spinor_field[0],g_spinor_field[1],SourceInfo.basename,-1,0); 
-# if (!defined MPI)
+# if (!defined TM_USE_MPI)
       if (write_cp_flag == 1) {
 	strcat(SourceInfo.basename,".2");
 	read_spinorfield_cm_single(g_spinor_field[2],g_spinor_field[3],SourceInfo.basename,-1,0); 
@@ -359,7 +358,7 @@ int main(int argc,char *argv[])
       fflush(stdout);
     }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Barrier(MPI_COMM_WORLD);
     MPI_Finalize();
 #endif
diff --git a/include/tmLQCD.h b/include/tmLQCD.h
index 2396300d1..4894b4bf2 100755
--- a/include/tmLQCD.h
+++ b/include/tmLQCD.h
@@ -27,6 +27,12 @@
 #ifndef _TMLQCD_H
 #define _TMLQCD_H
 
+#include "tmlqcd_config.h"
+
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -39,18 +45,28 @@ extern "C"
   typedef struct {
     unsigned int nproc, nproc_t, nproc_x, nproc_y, nproc_z, cart_id, proc_id, time_rank, omp_num_threads;
     unsigned int proc_coords[4];
+#ifdef TM_USE_MPI
+    MPI_Comm cart_grid;
+#else
+    int cart_grid;
+#endif
   } tmLQCD_mpi_params;
 
-  int tmLQCD_invert_init(int argc, char *argv[], const int verbose);
+  int tmLQCD_invert_init(int argc, char *argv[], const int verbose, const int external_id);
   int tmLQCD_read_gauge(const int nconfig);
   int tmLQCD_invert(double * const propagator, double * const source,
 		    const int op_id, const int write_prop);
   int tmLQCD_finalise();
 
-  int tmLQCD_get_gauge_field_pointer(double * gf);
+  int tmLQCD_get_gauge_field_pointer(double ** gf);
   int tmLQCD_get_mpi_params(tmLQCD_mpi_params * params);
   int tmLQCD_get_lat_params(tmLQCD_lat_params * params);
-  
+
+#ifdef TM_USE_QUDA
+  int invert_quda_direct(double * const propgator, double * const source,
+                    const int op_id);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/config.h.in b/include/tmlqcd_config.h.in
similarity index 87%
rename from config.h.in
rename to include/tmlqcd_config.h.in
index eaba625ae..3a6e7c010 100644
--- a/config.h.in
+++ b/include/tmlqcd_config.h.in
@@ -1,10 +1,14 @@
-/* config.h.in.  Generated from configure.in by autoheader.  */
+/* Hand-written tmlqcd_config.h.in. */
+
 #ifndef _CONFIG_H
 #define _CONFIG_H
 
 /* We are on a CRAY */
 #undef CRAY
 
+/* We will use BSM type operators with additional gauge field */
+#undef TM_USE_BSM
+
 /* lapack available */
 #undef HAVE_LAPACK
 
@@ -18,10 +22,10 @@
 #undef HAVE_CLOCK_GETTIME
 
 /* Compile with MPI support */
-#undef MPI
+#undef TM_USE_MPI
 
 /* Compile with OpenMP support */
-#undef OMP
+#undef TM_USE_OMP
 
 /* Compile with FFTW support */
 #undef HAVE_FFTW
@@ -89,6 +93,12 @@
 /* Alignment compiler hint macro */
 #undef ALIGN
 
+/* Alignment for 32bit arrays -- necessary for SSE and automated vectorization */
+#undef ALIGN_BASE32
+
+/* Alignment of 32bit fields, compiler hint macro */
+#undef ALIGN32
+
 /* Compile with SSE2 support */
 #undef SSE2
 
@@ -190,5 +200,20 @@
 /* Define if we want to compute the LapH eigenvectors */
 #undef WITHLAPH
 
+/* Define to 1 if you have the `quda' library (-lquda). */
+#undef HAVE_LIBQUDA
+
+/* Using QUDA GPU */
+#undef TM_USE_QUDA
+
+/* Using DDalphaAMG */
+#undef DDalphaAMG
+
+/* Using QPHIX */
+#undef TM_USE_QPHIX
+
+/* Structure-of-Arrays (SoA) length to use with QPhiX */
+#undef QPHIX_SOALEN
+
 #endif
 
diff --git a/init/Makefile.in b/init/Makefile.in
index 56ec90931..be98493fc 100644
--- a/init/Makefile.in
+++ b/init/Makefile.in
@@ -30,12 +30,10 @@ LDADD =
 COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS}
 
 LIBRARIES = libinit
-libinit_TARGETS = init_moment_field init_gauge_tmp init_gauge_field \
+libinit_TARGETS = init_moment_field init_gauge_tmp init_gauge_fg init_gauge_field \
 	init_geometry_indices init_spinor_field init_dirac_halfspinor \
 	init_chi_spinor_field init_bispinor_field init_jacobi_field \
-	init_scalar_field \
-	init_omp_accumulators init_openmp \
-	init_bsm_2hop_lookup
+	init_omp_accumulators init_openmp init_parallel init_scalar_field init_bsm_2hop_lookup
 
 libinit_STARGETS = 
 
@@ -63,10 +61,10 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-${libinit_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${libinit_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${OPTARGS} -c $<
 
-${libinit_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${libinit_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${SOPTARGS} -c $<
 
 # rule to make libinit
diff --git a/init/init.h b/init/init.h
index 51ebf52dc..68a08b1cd 100644
--- a/init/init.h
+++ b/init/init.h
@@ -25,15 +25,19 @@
 #include "init/init_dirac_halfspinor.h"
 #include "init/init_gauge_field.h"
 #include "init/init_gauge_tmp.h"
+#include "init/init_gauge_fg.h"
 #include "init/init_geometry_indices.h"
+#ifdef TM_USE_BSM
 #include "init/init_scalar_field.h"
+#endif
+#include "init/init_parallel.h"
 #ifdef WITHLAP
 #  include "init/init_jacobi_field.h"
 #endif
 #include "init/init_moment_field.h"
 #include "init/init_spinor_field.h"
 #include "init/init_stout_smear_vars.h"
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 # include "init/init_omp_accumulators.h"
 # include "init/init_openmp.h"
diff --git a/init/init_bispinor_field.c b/init/init_bispinor_field.c
index 518903b7c..10e360f9d 100644
--- a/init/init_bispinor_field.c
+++ b/init/init_bispinor_field.c
@@ -19,7 +19,7 @@
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/init/init_bsm_2hop_lookup.c b/init/init_bsm_2hop_lookup.c
index bca4b3bac..76c036a9d 100644
--- a/init/init_bsm_2hop_lookup.c
+++ b/init/init_bsm_2hop_lookup.c
@@ -18,8 +18,10 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
+
+#ifdef TM_USE_BSM
 #include <stdlib.h>
 #include <stdio.h>
 #include <errno.h>
@@ -87,3 +89,4 @@ void free_bsm_2hop_lookup() {
   if((void*)g_bsm_2hop_lookup != NULL)
     free(g_bsm_2hop_lookup);
 }
+#endif
diff --git a/init/init_bsm_2hop_lookup.h b/init/init_bsm_2hop_lookup.h
index a5bdfbeff..b987aa782 100644
--- a/init/init_bsm_2hop_lookup.h
+++ b/init/init_bsm_2hop_lookup.h
@@ -24,7 +24,6 @@
  * must be called after geometry()! (geometry_eo.h)
  *
  *********************************************************************************/
-
 #ifndef _INIT_BSM_2HOP_LOOKUP_H
 #define _INIT_BSM_2HOP_LOOKUP_H
 
diff --git a/init/init_chi_spinor_field.c b/init/init_chi_spinor_field.c
index 6a63ba4fb..87c8124f6 100644
--- a/init/init_chi_spinor_field.c
+++ b/init/init_chi_spinor_field.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/init/init_dirac_halfspinor.c b/init/init_dirac_halfspinor.c
index f90fe884d..7652598c4 100644
--- a/init/init_dirac_halfspinor.c
+++ b/init/init_dirac_halfspinor.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -29,6 +29,7 @@
 #include "global.h"
 #include "su3.h"
 #include "init_dirac_halfspinor.h"
+#include "fatal_error.h"
 
 #ifdef BGQ
 #  define SPI_ALIGN_BASE 0x7f
@@ -71,7 +72,7 @@ int init_dirac_halfspinor() {
 
   HalfSpinor = (halfspinor*)(((unsigned long int)(HalfSpinor_)+ALIGN_BASE+1)&~ALIGN_BASE);
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   if((void*)(sendBuffer_ = (halfspinor*)calloc(RAND/2+8, sizeof(halfspinor))) == NULL) {
     printf ("malloc errno : %d\n",errno); 
     errno = 0;
@@ -95,57 +96,55 @@ int init_dirac_halfspinor() {
       y = (j-t*(LX*LY*LZ)-x*(LY*LZ))/(LZ);
       z = (j-t*(LX*LY*LZ)-x*(LY*LZ) - y*LZ);
       for(int mu = 0; mu < 4; mu++) {
-	NBPointer[ieo][8*i + 2*mu + 0] = &HalfSpinor[ 8*g_lexic2eosub[ g_idn[j][mu] ] + 2*mu + 0];
-	NBPointer[ieo][8*i + 2*mu + 1] = &HalfSpinor[ 8*g_lexic2eosub[ g_iup[j][mu] ] + 2*mu + 1];
+        NBPointer[ieo][8*i + 2*mu + 0] = &HalfSpinor[ 8*g_lexic2eosub[ g_idn[j][mu] ] + 2*mu + 0];
+        NBPointer[ieo][8*i + 2*mu + 1] = &HalfSpinor[ 8*g_lexic2eosub[ g_iup[j][mu] ] + 2*mu + 1];
       }
 #if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(t == 0) {
-	k = (g_lexic2eosub[g_idn[j][0]] - VOLUME/2);
-	NBPointer[ieo][8*i] = &sendBuffer[ k ];
+        k = (g_lexic2eosub[g_idn[j][0]] - VOLUME/2);
+        NBPointer[ieo][8*i] = &sendBuffer[ k ];
       }
       if(t == T-1) {
-	k = (g_lexic2eosub[g_iup[j][0]] - VOLUME/2);
-	NBPointer[ieo][8*i + 1] = &sendBuffer[ k ];
+        k = (g_lexic2eosub[g_iup[j][0]] - VOLUME/2);
+        NBPointer[ieo][8*i + 1] = &sendBuffer[ k ];
       }
 #endif
 #if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(x == 0) {
-	k = (g_lexic2eosub[g_idn[j][1]] - VOLUME/2);
-	NBPointer[ieo][8*i + 2] = &sendBuffer[ k ];
+        k = (g_lexic2eosub[g_idn[j][1]] - VOLUME/2);
+        NBPointer[ieo][8*i + 2] = &sendBuffer[ k ];
       }
       if(x == LX-1) {
-	k = (g_lexic2eosub[g_iup[j][1]] - VOLUME/2);
-	NBPointer[ieo][8*i + 3] = &sendBuffer[ k ];
+        k = (g_lexic2eosub[g_iup[j][1]] - VOLUME/2);
+        NBPointer[ieo][8*i + 3] = &sendBuffer[ k ];
       }
 #endif
 #if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(y == 0) {
-	k = (g_lexic2eosub[g_idn[j][2]] - VOLUME/2);
-	NBPointer[ieo][8*i + 4] = &sendBuffer[ k ];
+        k = (g_lexic2eosub[g_idn[j][2]] - VOLUME/2);
+        NBPointer[ieo][8*i + 4] = &sendBuffer[ k ];
       }
       if(y == LY-1) {
-	k = (g_lexic2eosub[g_iup[j][2]] - VOLUME/2);
-	NBPointer[ieo][8*i + 5] = &sendBuffer[ k ];
+        k = (g_lexic2eosub[g_iup[j][2]] - VOLUME/2);
+        NBPointer[ieo][8*i + 5] = &sendBuffer[ k ];
       }
 #endif
 #if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
       if(z == 0) {
-	k = (g_lexic2eosub[g_idn[j][3]] - VOLUME/2);
-	NBPointer[ieo][8*i + 6] = &sendBuffer[ k ];
+        k = (g_lexic2eosub[g_idn[j][3]] - VOLUME/2);
+        NBPointer[ieo][8*i + 6] = &sendBuffer[ k ];
       }
       if(z == LZ-1) {
-	k = (g_lexic2eosub[g_iup[j][3]] - VOLUME/2);
-	NBPointer[ieo][8*i + 7] = &sendBuffer[ k ];
+        k = (g_lexic2eosub[g_iup[j][3]] - VOLUME/2);
+        NBPointer[ieo][8*i + 7] = &sendBuffer[ k ];
       }
 #endif
     }
     for(int i = VOLUME/2; i < (VOLUME+RAND)/2; i++) {
       for(int mu = 0; mu < 8; mu++) {
-	NBPointer[ieo][8*i + mu] = NBPointer[ieo][0];
+        NBPointer[ieo][8*i + mu] = NBPointer[ieo][0];
       }
     }
-#ifdef MPI
-#endif
   }
   for(int ieo = 2; ieo < 4; ieo++) {
     for(int i = 0; i < VOLUME/2; i++) {
@@ -156,48 +155,48 @@ int init_dirac_halfspinor() {
       y = (j-t*(LX*LY*LZ)-x*(LY*LZ))/(LZ);
       z = (j-t*(LX*LY*LZ)-x*(LY*LZ) - y*LZ);
       for(int mu = 0; mu < 8; mu++) {
-	NBPointer[ieo][8*i + mu] = &HalfSpinor[8*i + mu];
+        NBPointer[ieo][8*i + mu] = &HalfSpinor[8*i + mu];
       }
 #if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(t == T-1) {
-	NBPointer[ieo][8*i]     = &recvBuffer[ (g_lexic2eosub[ g_iup[j][0] ] - VOLUME/2)];
+        NBPointer[ieo][8*i]     = &recvBuffer[ (g_lexic2eosub[ g_iup[j][0] ] - VOLUME/2)];
       }
       if(t == 0) {
-	NBPointer[ieo][8*i + 1] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][0] ] - VOLUME/2)];
+        NBPointer[ieo][8*i + 1] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][0] ] - VOLUME/2)];
       }
 #endif
 #if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(x == LX-1) { 
-	NBPointer[ieo][8*i + 2] = &recvBuffer[ (g_lexic2eosub[ g_iup[j][1] ] - VOLUME/2)];
+        NBPointer[ieo][8*i + 2] = &recvBuffer[ (g_lexic2eosub[ g_iup[j][1] ] - VOLUME/2)];
       }
       if(x == 0) {
-	NBPointer[ieo][8*i + 3] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][1] ] - VOLUME/2)];
+        NBPointer[ieo][8*i + 3] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][1] ] - VOLUME/2)];
       }
 #endif
 #if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(y == LY-1) {
-	NBPointer[ieo][8*i + 4] = &recvBuffer[ (g_lexic2eosub[ g_iup[j][2] ] - VOLUME/2)];
+        NBPointer[ieo][8*i + 4] = &recvBuffer[ (g_lexic2eosub[ g_iup[j][2] ] - VOLUME/2)];
       }
       if(y == 0) {
-	NBPointer[ieo][8*i + 5] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][2] ] - VOLUME/2)];
+        NBPointer[ieo][8*i + 5] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][2] ] - VOLUME/2)];
       }
 #endif
 #if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
       if(z == LZ-1) {
-	NBPointer[ieo][8*i + 6] = &recvBuffer[ (g_lexic2eosub[ g_iup[j][3] ] - VOLUME/2)];
+        NBPointer[ieo][8*i + 6] = &recvBuffer[ (g_lexic2eosub[ g_iup[j][3] ] - VOLUME/2)];
       }
       if(z == 0) {
-	NBPointer[ieo][8*i + 7] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][3] ] - VOLUME/2)];
+        NBPointer[ieo][8*i + 7] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][3] ] - VOLUME/2)];
       }
 #endif
     }
     for(int i = VOLUME/2; i < (VOLUME+RAND)/2; i++) {
       for(int mu = 0; mu < 8; mu++) {
-	NBPointer[ieo][8*i + mu] = NBPointer[ieo][0];
+        NBPointer[ieo][8*i + mu] = NBPointer[ieo][0];
       }
     }
   }
-#if (defined SPI && defined MPI)
+#if (defined SPI && defined TM_USE_MPI)
   // here comes the SPI initialisation
   uint64_t messageSizes[NUM_DIRS];
   uint64_t roffsets[NUM_DIRS], soffsets[NUM_DIRS];
@@ -256,12 +255,12 @@ int init_dirac_halfspinor() {
 
   // Setup the FIFO handles
   rc = msg_InjFifoInit ( &injFifoHandle,
-			 0,                      /* startingSubgroupId */
-			 0,                      /* startingFifoId     */
-			 spi_num_dirs,           /* numFifos   */
-			 INJ_MEMORY_FIFO_SIZE+1, /* fifoSize */
-			 NULL                    /* Use default attributes */
-			 );
+       0,                      /* startingSubgroupId */
+       0,                      /* startingFifoId     */
+       spi_num_dirs,           /* numFifos   */
+       INJ_MEMORY_FIFO_SIZE+1, /* fifoSize */
+       NULL                    /* Use default attributes */
+       );
   if(rc != 0) {
     fprintf(stderr, "msg_InjFifoInit failed with rc=%d\n",rc);
     exit(1);
@@ -300,8 +299,8 @@ int init_dirac_halfspinor() {
   for (unsigned int j = 0; j < spi_num_dirs; j++) {
     descCount[ j ] =
       msg_InjFifoInject ( injFifoHandle,
-			  j,
-			  &SPIDescriptors[j]);
+        j,
+        &SPIDescriptors[j]);
   }
   // wait for receive completion
   while ( recvCounter > 0 );
@@ -320,16 +319,16 @@ int init_dirac_halfspinor() {
     if(i == 7) k = g_nb_z_dn;
     for(int mu = 0; mu < messageSizes[i]/sizeof(halfspinor); mu++) {
       if(k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c0) ||
-	 k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c1) ||
-	 k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c2) ||
-	 k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s1.c0) ||
-	 k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s1.c1) ||
-	 k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s1.c2)) {
-	if(g_cart_id == 0) {
-	  printf("SPI exchange doesn't work for dir %d: %d != %d at point %d\n", 
-		 i, k ,(int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c0), mu);
-	}
-	j++;
+   k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c1) ||
+   k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c2) ||
+   k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s1.c0) ||
+   k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s1.c1) ||
+   k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s1.c2)) {
+  if(g_cart_id == 0) {
+    printf("SPI exchange doesn't work for dir %d: %d != %d at point %d\n", 
+     i, k ,(int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c0), mu);
+  }
+  j++;
       }
     }
   }
@@ -347,8 +346,9 @@ int init_dirac_halfspinor() {
 
 int init_dirac_halfspinor32() {
   int j=0, k;
-  int x, y, z, t, mu;
   
+  int x, y, z, t, mu;
+
   NBPointer32 = (halfspinor32***) calloc(4,sizeof(halfspinor32**));
   NBPointer32_ = (halfspinor32**) calloc(16,(VOLUME+RAND)*sizeof(halfspinor32*));
   NBPointer32[0] = NBPointer32_;
@@ -356,7 +356,7 @@ int init_dirac_halfspinor32() {
   NBPointer32[2] = NBPointer32_ + (16*(VOLUME+RAND)/2);
   NBPointer32[3] = NBPointer32_ + (24*(VOLUME+RAND)/2);
 
-  if((void*)(HalfSpinor32_ = (halfspinor32*)calloc(8*(VOLUME+RAND)+1, sizeof(halfspinor32))) == NULL) {
+  if((void*)(HalfSpinor32_ = (halfspinor32*)calloc(4*(VOLUME)+1, sizeof(halfspinor32))) == NULL) {
     printf ("malloc errno : %d\n",errno); 
     errno = 0;
     return(-1);
@@ -364,7 +364,7 @@ int init_dirac_halfspinor32() {
 
   HalfSpinor32 = (halfspinor32*)(((unsigned long int)(HalfSpinor32_)+ALIGN_BASE)&~ALIGN_BASE);
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   //re-use memory from 64Bit version
   sendBuffer32 = (halfspinor32*)sendBuffer;
   recvBuffer32 = (halfspinor32*)recvBuffer;
@@ -379,50 +379,55 @@ int init_dirac_halfspinor32() {
       y = (j-t*(LX*LY*LZ)-x*(LY*LZ))/(LZ);
       z = (j-t*(LX*LY*LZ)-x*(LY*LZ) - y*LZ);
       for(mu = 0; mu < 4; mu++) {
-	NBPointer32[ieo][8*i + 2*mu + 0] = &HalfSpinor32[ 8*g_lexic2eosub[ g_idn[j][mu] ] + 2*mu + 0];
-	NBPointer32[ieo][8*i + 2*mu + 1] = &HalfSpinor32[ 8*g_lexic2eosub[ g_iup[j][mu] ] + 2*mu + 1];
+        NBPointer32[ieo][8*i + 2*mu + 0] = &HalfSpinor32[ 8*g_lexic2eosub[ g_idn[j][mu] ] + 2*mu + 0];
+        NBPointer32[ieo][8*i + 2*mu + 1] = &HalfSpinor32[ 8*g_lexic2eosub[ g_iup[j][mu] ] + 2*mu + 1];
       }
 #if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(t == 0) {
-	k = (g_lexic2eosub[g_idn[j][0]] - VOLUME/2);
-	NBPointer32[ieo][8*i] = &sendBuffer32[ k ];
+        k = (g_lexic2eosub[g_idn[j][0]] - VOLUME/2);
+        NBPointer32[ieo][8*i] = &sendBuffer32[ k ];
       }
       if(t == T-1) {
-	k = (g_lexic2eosub[g_iup[j][0]] - VOLUME/2);
-	NBPointer32[ieo][8*i + 1] = &sendBuffer32[ k ];
+        k = (g_lexic2eosub[g_iup[j][0]] - VOLUME/2);
+        NBPointer32[ieo][8*i + 1] = &sendBuffer32[ k ];
       }
 #endif
 #if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(x == 0) {
-	k = (g_lexic2eosub[g_idn[j][1]] - VOLUME/2);
-	NBPointer32[ieo][8*i + 2] = &sendBuffer32[ k ];
+        k = (g_lexic2eosub[g_idn[j][1]] - VOLUME/2);
+        NBPointer32[ieo][8*i + 2] = &sendBuffer32[ k ];
       }
       if(x == LX-1) {
-	k = (g_lexic2eosub[g_iup[j][1]] - VOLUME/2);
-	NBPointer32[ieo][8*i + 3] = &sendBuffer32[ k ];
+        k = (g_lexic2eosub[g_iup[j][1]] - VOLUME/2);
+        NBPointer32[ieo][8*i + 3] = &sendBuffer32[ k ];
       }
 #endif
 #if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(y == 0) {
-	k = (g_lexic2eosub[g_idn[j][2]] - VOLUME/2);
-	NBPointer32[ieo][8*i + 4] = &sendBuffer32[ k ];
+        k = (g_lexic2eosub[g_idn[j][2]] - VOLUME/2);
+        NBPointer32[ieo][8*i + 4] = &sendBuffer32[ k ];
       }
       if(y == LY-1) {
-	k = (g_lexic2eosub[g_iup[j][2]] - VOLUME/2);
-	NBPointer32[ieo][8*i + 5] = &sendBuffer32[ k ];
+        k = (g_lexic2eosub[g_iup[j][2]] - VOLUME/2);
+        NBPointer32[ieo][8*i + 5] = &sendBuffer32[ k ];
       }
 #endif
 #if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
       if(z == 0) {
-	k = (g_lexic2eosub[g_idn[j][3]] - VOLUME/2);
-	NBPointer32[ieo][8*i + 6] = &sendBuffer32[ k ];
+        k = (g_lexic2eosub[g_idn[j][3]] - VOLUME/2);
+        NBPointer32[ieo][8*i + 6] = &sendBuffer32[ k ];
       }
       if(z == LZ-1) {
-	k = (g_lexic2eosub[g_iup[j][3]] - VOLUME/2);
-	NBPointer32[ieo][8*i + 7] = &sendBuffer32[ k ];
+        k = (g_lexic2eosub[g_iup[j][3]] - VOLUME/2);
+        NBPointer32[ieo][8*i + 7] = &sendBuffer32[ k ];
       }
 #endif
     }
+    for(int i = VOLUME/2; i < (VOLUME+RAND)/2; i++) {
+      for(int mu = 0; mu < 8; mu++) {
+        NBPointer32[ieo][8*i + mu] = NBPointer32[ieo][0];
+      }
+    }
   }
   for(int ieo = 2; ieo < 4; ieo++) {
     for(int i = 0; i < VOLUME/2; i++) {
@@ -433,43 +438,48 @@ int init_dirac_halfspinor32() {
       y = (j-t*(LX*LY*LZ)-x*(LY*LZ))/(LZ);
       z = (j-t*(LX*LY*LZ)-x*(LY*LZ) - y*LZ);
       for(mu = 0; mu < 8; mu++) {
-	NBPointer32[ieo][8*i + mu] = &HalfSpinor32[8*i + mu];
+        NBPointer32[ieo][8*i + mu] = &HalfSpinor32[8*i + mu];
       }
 #if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(t == T-1) {
-	NBPointer32[ieo][8*i]     = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][0] ] - VOLUME/2)];
+        NBPointer32[ieo][8*i]     = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][0] ] - VOLUME/2)];
       }
       if(t == 0) {
-	NBPointer32[ieo][8*i + 1] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][0] ] - VOLUME/2)];
+        NBPointer32[ieo][8*i + 1] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][0] ] - VOLUME/2)];
       }
 #endif
 #if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(x == LX-1) { 
-	NBPointer32[ieo][8*i + 2] = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][1] ] - VOLUME/2)];
+        NBPointer32[ieo][8*i + 2] = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][1] ] - VOLUME/2)];
       }
       if(x == 0) {
-	NBPointer32[ieo][8*i + 3] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][1] ] - VOLUME/2)];
+        NBPointer32[ieo][8*i + 3] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][1] ] - VOLUME/2)];
       }
 #endif
 #if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || (defined PARALLELXYZT))
       if(y == LY-1) {
-	NBPointer32[ieo][8*i + 4] = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][2] ] - VOLUME/2)];
+        NBPointer32[ieo][8*i + 4] = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][2] ] - VOLUME/2)];
       }
       if(y == 0) {
-	NBPointer32[ieo][8*i + 5] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][2] ] - VOLUME/2)];
+        NBPointer32[ieo][8*i + 5] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][2] ] - VOLUME/2)];
       }
 #endif
 #if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
       if(z == LZ-1) {
-	NBPointer32[ieo][8*i + 6] = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][3] ] - VOLUME/2)];
+        NBPointer32[ieo][8*i + 6] = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][3] ] - VOLUME/2)];
       }
       if(z == 0) {
-	NBPointer32[ieo][8*i + 7] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][3] ] - VOLUME/2)];
+        NBPointer32[ieo][8*i + 7] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][3] ] - VOLUME/2)];
       }
 #endif
     }
+    for(int i = VOLUME/2; i < (VOLUME+RAND)/2; i++) {
+      for(int mu = 0; mu < 8; mu++) {
+        NBPointer32[ieo][8*i + mu] = NBPointer32[ieo][0];
+      }
+    }
   }
-#if (defined SPI && defined MPI)
+#if (defined SPI && defined TM_USE_MPI)
   // here comes the SPI initialisation
   uint64_t messageSizes[NUM_DIRS];
   uint64_t roffsets[NUM_DIRS], soffsets[NUM_DIRS];
@@ -504,12 +514,12 @@ int init_dirac_halfspinor32() {
 
   // test communication
   for(unsigned int i = 0; i < RAND/2; i++) {
-    sendBuffer32[i].s0.c0 = (double)g_cart_id;
-    sendBuffer32[i].s0.c1 = (double)g_cart_id;
-    sendBuffer32[i].s0.c2 = (double)g_cart_id;
-    sendBuffer32[i].s1.c0 = (double)g_cart_id;
-    sendBuffer32[i].s1.c1 = (double)g_cart_id;
-    sendBuffer32[i].s1.c2 = (double)g_cart_id;
+    sendBuffer32[i].s0.c0 = (float)g_cart_id;
+    sendBuffer32[i].s0.c1 = (float)g_cart_id;
+    sendBuffer32[i].s0.c2 = (float)g_cart_id;
+    sendBuffer32[i].s1.c0 = (float)g_cart_id;
+    sendBuffer32[i].s1.c1 = (float)g_cart_id;
+    sendBuffer32[i].s1.c2 = (float)g_cart_id;
   }
 
   // Initialize the barrier, resetting the hardware.
@@ -518,16 +528,18 @@ int init_dirac_halfspinor32() {
     printf("MUSPI_GIBarrierInit returned rc = %d\n", rc);
     exit(__LINE__);
   }
-  // reset the recv counter 
+  // reset the recv counter, note the division by 2, totalMessageSize has been set in init_dirac_halfspinor 
+  // which must be called first!
   recvCounter = totalMessageSize/2;
   global_barrier(); // make sure everybody is set recv counter
   
+  // could do communication with multiple threads
   //#pragma omp for nowait
   for (unsigned int j = 0; j < spi_num_dirs; j++) {
     descCount[ j ] =
       msg_InjFifoInject ( injFifoHandle,
-			  j,
-			  &SPIDescriptors32[j]);
+        j,
+        &SPIDescriptors32[j]);
   }
   // wait for receive completion
   while ( recvCounter > 0 );
@@ -546,16 +558,16 @@ int init_dirac_halfspinor32() {
     if(i == 7) k = g_nb_z_dn;
     for(int mu = 0; mu < messageSizes[i]/sizeof(halfspinor32); mu++) {
       if(k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c0) ||
-	 k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c1) ||
-	 k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c2) ||
-	 k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s1.c0) ||
-	 k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s1.c1) ||
-	 k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s1.c2)) {
-	if(g_cart_id == 0) {
-	  printf("32 Bit SPI exchange doesn't work for dir %d: %d != %d at point %d\n", 
-		 i, k ,(int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c0), mu);
-	}
-	j++;
+   k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c1) ||
+   k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c2) ||
+   k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s1.c0) ||
+   k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s1.c1) ||
+   k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s1.c2)) {
+  if(g_cart_id == 0) {
+    printf("32 Bit SPI exchange doesn't work for dir %d: %d != %d at point %d\n", 
+     i, k ,(int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c0), mu);
+  }
+  j++;
       }
     }
   }
diff --git a/init/init_gauge_fg.c b/init/init_gauge_fg.c
new file mode 100644
index 000000000..f87566fee
--- /dev/null
+++ b/init/init_gauge_fg.c
@@ -0,0 +1,61 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include "global.h"
+#include "su3.h"
+#include "sse.h"
+#include "init_gauge_fg.h"
+
+su3 * gauge_fg_ = NULL;   /* raw calloc'ed storage; allocated in init_gauge_fg(), freed in free_gauge_fg() */
+su3 ** gauge_fg = NULL;   /* per-site pointer table into gauge_fg_ (exported via init_gauge_fg.h) */
+
+int init_gauge_fg(const int V) { /* returns 0 on success, 1 if an allocation fails */
+  /* gauge_fg gets V per-site pointers, each to 4 consecutive su3 in the zeroed block gauge_fg_ */
+  if((void*)(gauge_fg = (su3**)calloc(V, sizeof(su3*))) == NULL) {
+    fprintf(stderr, "malloc errno : %d\n", errno);
+    errno = 0;
+    return(1);
+  }
+  if((void*)(gauge_fg_ = (su3*)calloc(4*V+1, sizeof(su3))) == NULL) {
+    fprintf(stderr, "malloc errno : %d\n", errno);
+    free(gauge_fg); gauge_fg = NULL; /* do not leak the pointer table or leave it dangling */
+    errno = 0;
+    return(1);
+  }
+#if (defined SSE || defined SSE2 || defined SSE3)
+  gauge_fg[0] = (su3*)(((unsigned long int)(gauge_fg_)+ALIGN_BASE)&~ALIGN_BASE); /* round up */
+#else
+  gauge_fg[0] = gauge_fg_;
+#endif
+  for(int i = 1; i < V; i++){
+    gauge_fg[i] = gauge_fg[i-1]+4; /* 4 su3 per site */
+  }
+  return(0);
+}
+
+void free_gauge_fg() { /* release both buffers of init_gauge_fg(); safe to call more than once */
+  free(gauge_fg_); gauge_fg_ = NULL;
+  free(gauge_fg);  gauge_fg = NULL;
+}
diff --git a/init/init_gauge_fg.h b/init/init_gauge_fg.h
new file mode 100644
index 000000000..3c606bafb
--- /dev/null
+++ b/init/init_gauge_fg.h
@@ -0,0 +1,27 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+#ifndef _INIT_GAUGE_FG_H
+#define _INIT_GAUGE_FG_H
+
+extern su3 ** gauge_fg;
+
+int init_gauge_fg(const int V);
+void free_gauge_fg();
+
+#endif
diff --git a/init/init_gauge_field.c b/init/init_gauge_field.c
index 04c406839..77e8e284c 100644
--- a/init/init_gauge_field.c
+++ b/init/init_gauge_field.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -27,13 +27,26 @@
 #include "su3.h"
 #include "sse.h"
 #include "init_gauge_field.h"
+#include "expo.h"
 
 su3 * gauge_field = NULL;
+su3_32 * gauge_field_32 = NULL;
+#ifdef TM_USE_BSM
+su3 * smeared_gauge_field = NULL;
+#endif
 #ifdef _USE_TSPLITPAR
 su3 * gauge_field_copyt = NULL;
 su3 * gauge_field_copys = NULL;
+#ifdef TM_USE_BSM
+su3 * smeared_gauge_field_copyt = NULL;
+su3 * smeared_gauge_field_copys = NULL;
+#endif
 #else
 su3 * gauge_field_copy = NULL;
+su3_32 * gauge_field_copy_32 = NULL;
+#ifdef TM_USE_BSM
+su3 * smeared_gauge_field_copy = NULL;
+#endif
 #endif
 
 int init_gauge_field(const int V, const int back) {
@@ -42,31 +55,64 @@ int init_gauge_field(const int V, const int back) {
 #ifdef _USE_TSPLITPAR
   g_gauge_field_copyt = NULL;
   g_gauge_field_copys = NULL;
+#ifdef TM_USE_BSM
+  g_smeared_gauge_field_copyt = NULL;
+  g_smeared_gauge_field_copys = NULL;
+#endif
 #else
   g_gauge_field_copy = NULL;
+#ifdef TM_USE_BSM
+  g_smeared_gauge_field_copy = NULL;
 #endif
+#endif
+
+  if (g_exposu3_no_c == 0) init_exposu3();
 
   if((void*)(g_gauge_field = (su3**)calloc(V, sizeof(su3*))) == NULL) {
     printf ("malloc errno : %d\n",errno); 
     errno = 0;
     return(1);
   }
+#ifdef TM_USE_BSM
+  if((void*)(g_smeared_gauge_field = (su3**)calloc(V, sizeof(su3*))) == NULL) { 
+    printf ("malloc errno : %d\n",errno); 
+    errno = 0;
+    return(1);
+  }
+#endif
   if((void*)(gauge_field = (su3*)calloc(4*V+1, sizeof(su3))) == NULL) {
     printf ("malloc errno : %d\n",errno); 
     errno = 0;
     return(2);
   }
+#ifdef TM_USE_BSM
+  if((void*)(smeared_gauge_field = (su3*)calloc(4*V+1, sizeof(su3))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(2);
+  }
+#endif
 #if (defined SSE || defined SSE2 || defined SSE3)
   g_gauge_field[0] = (su3*)(((unsigned long int)(gauge_field)+ALIGN_BASE)&~ALIGN_BASE);
+#ifdef TM_USE_BSM
+  g_smeared_gauge_field[0] = (su3*)(((unsigned long int)(smeared_gauge_field)+ALIGN_BASE)&~ALIGN_BASE);
+#endif
 #else
   g_gauge_field[0] = gauge_field;
+#ifdef TM_USE_BSM
+  g_smeared_gauge_field[0] = smeared_gauge_field;
+#endif
+
 #endif
   for(i = 1; i < V; i++){
     g_gauge_field[i] = g_gauge_field[i-1]+4;
+#ifdef TM_USE_BSM
+    g_smeared_gauge_field[i] = g_smeared_gauge_field[i-1]+4;
+#endif
   }
 
 #  if defined _USE_HALFSPINOR
-  if(back == 1) {
+  if(back == 1 && !lowmem_flag) {
     /*
       g_gauge_field_copy[ieo][PM][sites/2][mu]
     */
@@ -75,22 +121,52 @@ int init_gauge_field(const int V, const int back) {
       errno = 0;
       return(3);
     }
+#ifdef TM_USE_BSM
+    if((void*)(g_smeared_gauge_field_copy = (su3***)calloc(2, sizeof(su3**))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+#endif
     if((void*)(g_gauge_field_copy[0] = (su3**)calloc(VOLUME, sizeof(su3*))) == NULL) {
       printf ("malloc errno : %d\n",errno); 
       errno = 0;
       return(3);
     }
+#ifdef TM_USE_BSM
+    if((void*)(g_smeared_gauge_field_copy[0] = (su3**)calloc(VOLUME, sizeof(su3*))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+#endif
     g_gauge_field_copy[1] = g_gauge_field_copy[0] + (VOLUME)/2;
+#ifdef TM_USE_BSM 
+    g_smeared_gauge_field_copy[1] = g_smeared_gauge_field_copy[0] + (VOLUME)/2;
+#endif
     if((void*)(gauge_field_copy = (su3*)calloc(4*(VOLUME)+1, sizeof(su3))) == NULL) {
       printf ("malloc errno : %d\n",errno); 
       errno = 0;
       return(4);
     }
+#ifdef TM_USE_BSM
+    if((void*)(smeared_gauge_field_copy = (su3*)calloc(4*(VOLUME)+1, sizeof(su3))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(4);
+    }
+#endif
 #    if (defined SSE || defined SSE2 || defined SSE3)
     g_gauge_field_copy[0][0] = (su3*)(((unsigned long int)(gauge_field_copy)+ALIGN_BASE)&~ALIGN_BASE);
-#    else
+#ifdef TM_USE_BSM
+    g_smeared_gauge_field_copy[0][0] = (su3*)(((unsigned long int)(smeared_gauge_field_copy)+ALIGN_BASE)&~ALIGN_BASE);
+#endif
+#else
     g_gauge_field_copy[0][0] = gauge_field_copy;
-#    endif
+#ifdef TM_USE_BSM
+    g_smeared_gauge_field_copy[0][0] = smeared_gauge_field_copy;
+#endif
+#endif
     for(i = 1; i < (VOLUME)/2; i++) {
       g_gauge_field_copy[0][i] = g_gauge_field_copy[0][i-1]+4;
     }
@@ -98,9 +174,18 @@ int init_gauge_field(const int V, const int back) {
     for(i = 1; i < (VOLUME)/2; i++) {
       g_gauge_field_copy[1][i] = g_gauge_field_copy[1][i-1]+4;
     }
+#ifdef TM_USE_BSM
+  for(i = 1; i < (VOLUME)/2; i++) {
+      g_smeared_gauge_field_copy[0][i] = g_smeared_gauge_field_copy[0][i-1]+4;
+    }
+    g_smeared_gauge_field_copy[1][0] = g_smeared_gauge_field_copy[0][0] + 2*VOLUME;
+    for(i = 1; i < (VOLUME)/2; i++) {
+      g_smeared_gauge_field_copy[1][i] = g_smeared_gauge_field_copy[1][i-1]+4;
+    }
+#endif
   }
 #  elif defined _USE_TSPLITPAR
-  if(back == 1) {
+  if(back == 1 && !lowmem_flag) {
     if((void*)(g_gauge_field_copyt = (su3**)calloc((VOLUME+RAND), sizeof(su3*))) == NULL) {
       printf ("malloc errno : %d\n",errno); 
       errno = 0;
@@ -121,51 +206,254 @@ int init_gauge_field(const int V, const int back) {
       errno = 0;
       return(4);
     }
-#    if (defined SSE || defined SSE2 || defined SSE3)
+#ifdef TM_USE_BSM /* smeared counterparts of the t/s-split copies: two pointer tables, two raw buffers */
+    if((void*)(g_smeared_gauge_field_copyt = (su3**)calloc((VOLUME+RAND), sizeof(su3*))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+    if((void*)(g_smeared_gauge_field_copys = (su3**)calloc((VOLUME+RAND), sizeof(su3*))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    } /* raw storage goes into the su3* buffers below, never into the g_ tables allocated above */
+    if((void*)(smeared_gauge_field_copyt = (su3*)calloc(2*(VOLUME+RAND)+1, sizeof(su3))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(4);
+    }
+    if((void*)(smeared_gauge_field_copys = (su3*)calloc(6*(VOLUME+RAND)+1, sizeof(su3))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(4);
+    }
+#endif
+#if (defined SSE || defined SSE2 || defined SSE3)
     g_gauge_field_copyt[0] = (su3*)(((unsigned long int)(gauge_field_copyt)+ALIGN_BASE)&~ALIGN_BASE);
     g_gauge_field_copys[0] = (su3*)(((unsigned long int)(gauge_field_copys)+ALIGN_BASE)&~ALIGN_BASE);
-#    else
+#ifdef TM_USE_BSM
+    g_smeared_gauge_field_copyt[0] = (su3*)(((unsigned long int)(smeared_gauge_field_copyt)+ALIGN_BASE)&~ALIGN_BASE);
+    g_smeared_gauge_field_copys[0] = (su3*)(((unsigned long int)(smeared_gauge_field_copys)+ALIGN_BASE)&~ALIGN_BASE);
+#endif
+#else
     g_gauge_field_copyt[0] = gauge_field_copyt;
     g_gauge_field_copys[0] = gauge_field_copys;
+#ifdef TM_USE_BSM
+    g_smeared_gauge_field_copyt[0] = smeared_gauge_field_copyt;
+    g_smeared_gauge_field_copys[0] = smeared_gauge_field_copys;
+#endif
 #    endif
     for(i = 1; i < (VOLUME+RAND); i++) {
       g_gauge_field_copyt[i] = g_gauge_field_copyt[i-1]+2;
       g_gauge_field_copys[i] = g_gauge_field_copys[i-1]+6;
     }
+#ifdef TM_USE_BSM
+    for(i = 1; i < (VOLUME+RAND); i++) {
+      g_smeared_gauge_field_copyt[i] = g_smeared_gauge_field_copyt[i-1]+2;
+      g_smeared_gauge_field_copys[i] = g_smeared_gauge_field_copys[i-1]+6;
+    }
+#endif
   }
 #  else  /* than _USE_HALFSPINOR or _USE_TSPLITPAR */
-  if(back == 1) {
+  if(back == 1 && !lowmem_flag) {
     if((void*)(g_gauge_field_copy = (su3**)calloc((VOLUME+RAND), sizeof(su3*))) == NULL) {
       printf ("malloc errno : %d\n",errno); 
       errno = 0;
       return(3);
     }
+#ifdef TM_USE_BSM
+    if((void*)(g_smeared_gauge_field_copy = (su3**)calloc((VOLUME+RAND), sizeof(su3*))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+#endif
     if((void*)(gauge_field_copy = (su3*)calloc(8*(VOLUME+RAND)+1, sizeof(su3))) == NULL) {
       printf ("malloc errno : %d\n",errno); 
       errno = 0;
       return(4);
     }
+#ifdef TM_USE_BSM
+    if((void*)(smeared_gauge_field_copy = (su3*)calloc(8*(VOLUME+RAND)+1, sizeof(su3))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(4);
+    }
+#endif
+
 #  if (defined SSE || defined SSE2 || defined SSE3)
     g_gauge_field_copy[0] = (su3*)(((unsigned long int)(gauge_field_copy)+ALIGN_BASE)&~ALIGN_BASE);
+#ifdef TM_USE_BSM
+    g_smeared_gauge_field_copy[0] = (su3*)(((unsigned long int)(smeared_gauge_field_copy)+ALIGN_BASE)&~ALIGN_BASE);
+#endif
 #  else
     g_gauge_field_copy[0] = gauge_field_copy;
+#ifdef TM_USE_BSM
+    g_smeared_gauge_field_copy[0] = smeared_gauge_field_copy;
+#endif
 #  endif
     for(i = 1; i < (VOLUME+RAND); i++) {
       g_gauge_field_copy[i] = g_gauge_field_copy[i-1]+8;
     }
+#ifdef TM_USE_BSM
+    for(i = 1; i < (VOLUME+RAND); i++) {
+      g_smeared_gauge_field_copy[i] = g_smeared_gauge_field_copy[i-1]+8;
+    }
+#endif
   }
-#  endif
+#endif
   g_update_gauge_copy = 1;
   return(0);
 }
 
 void free_gauge_field() {
   free(gauge_field);
+#ifdef TM_USE_BSM
+  free(smeared_gauge_field);
+#endif
   free(g_gauge_field);
+#ifdef TM_USE_BSM
+  free(g_smeared_gauge_field);
+#endif
+  if(!lowmem_flag){ /* init_gauge_field() only allocates the copies when lowmem_flag is unset */
 #  if defined _USE_TSPLITPAR
-  free(gauge_field_copys);
-  free(gauge_field_copyt);
+    free(gauge_field_copys);
+    free(gauge_field_copyt);
+#   ifdef TM_USE_BSM /* was USE_BSM, which is never defined, so these two buffers were leaked */
+    free(smeared_gauge_field_copys);
+    free(smeared_gauge_field_copyt);
+#endif
 #  else
-  free(gauge_field_copy);
-#  endif
+#   ifdef TM_USE_BSM
+     free(smeared_gauge_field_copy);
+#endif
+    free(gauge_field_copy);
+#endif
+  }
 }
+
+
+
+int init_gauge_field_32(const int V, const int back) { /* set up the 32-bit gauge field buffers */
+  if(!lowmem_flag){ /* with lowmem_flag set nothing is allocated and 0 is returned */
+    int i=0;
+    /* returns 0 on success; 1..4 tell which calloc below failed (earlier buffers stay allocated) */
+    g_gauge_field_copy_32 = NULL;
+
+    /* g_gauge_field_32: V per-site pointers; gauge_field_32: zeroed storage for 4*V+1 links */
+    if((void*)(g_gauge_field_32 = (su3_32**)calloc(V, sizeof(su3_32*))) == NULL) {
+      printf ("malloc errno : %d\n",errno); 
+      errno = 0;
+      return(1);
+    }
+    if((void*)(gauge_field_32 = (su3_32*)calloc(4*V+1, sizeof(su3_32))) == NULL) {
+      printf ("malloc errno : %d\n",errno); 
+      errno = 0;
+      return(2);
+    }
+
+    /* align unconditionally (the double-precision init_gauge_field aligns only in SSE builds) */
+    g_gauge_field_32[0] = (su3_32*)(((unsigned long int)(gauge_field_32)+ALIGN_BASE32)&~ALIGN_BASE32);
+    /* site i owns the 4 consecutive links starting at g_gauge_field_32[i] */
+    for(i = 1; i < V; i++){
+      g_gauge_field_32[i] = g_gauge_field_32[i-1]+4;
+    }
+
+#    if defined _USE_HALFSPINOR
+    if(back == 1) {
+      /*
+        g_gauge_field_copy[ieo][PM][sites/2][mu]
+        (as built below: 2 tables of VOLUME/2 site pointers, 4 links per site) */
+      if((void*)(g_gauge_field_copy_32 = (su3_32***)calloc(2, sizeof(su3_32**))) == NULL) {
+        printf ("malloc errno : %d\n",errno); 
+        errno = 0;
+        return(3);
+      }
+      if((void*)(g_gauge_field_copy_32[0] = (su3_32**)calloc(VOLUME, sizeof(su3_32*))) == NULL) {
+        printf ("malloc errno : %d\n",errno); 
+        errno = 0;
+        return(3);
+      }
+      g_gauge_field_copy_32[1] = g_gauge_field_copy_32[0] + (VOLUME)/2;
+      if((void*)(gauge_field_copy_32 = (su3_32*)calloc(4*(VOLUME)+1, sizeof(su3_32))) == NULL) {
+        printf ("malloc errno : %d\n",errno); 
+        errno = 0;
+        return(4);
+      }
+      /* doing alignment no matter what */
+      g_gauge_field_copy_32[0][0] = (su3_32*)(((unsigned long int)(gauge_field_copy_32)+ALIGN_BASE32)&~ALIGN_BASE32);
+      /* both halves share one buffer: half 0 starts at the aligned base, half 1 2*VOLUME links later */
+      for(i = 1; i < (VOLUME)/2; i++) {
+        g_gauge_field_copy_32[0][i] = g_gauge_field_copy_32[0][i-1]+4;
+      }
+      g_gauge_field_copy_32[1][0] = g_gauge_field_copy_32[0][0] + 2*VOLUME; 
+      for(i = 1; i < (VOLUME)/2; i++) {
+        g_gauge_field_copy_32[1][i] = g_gauge_field_copy_32[1][i-1]+4;
+      }
+    }
+#    else  /* i.e. no _USE_HALFSPINOR */
+    if(back == 1) {
+      if((void*)(g_gauge_field_copy_32 = (su3_32**)calloc((VOLUME+RAND), sizeof(su3_32*))) == NULL) {
+        printf ("malloc errno : %d\n",errno); 
+        errno = 0;
+        return(3);
+      }
+      if((void*)(gauge_field_copy_32 = (su3_32*)calloc(8*(VOLUME+RAND)+1, sizeof(su3_32))) == NULL) {
+        printf ("malloc errno : %d\n",errno); 
+        errno = 0;
+        return(4);
+      }
+
+      /* doing alignment no matter what */
+      g_gauge_field_copy_32[0] = (su3_32*)(((unsigned long int)(gauge_field_copy_32)+ALIGN_BASE32)&~ALIGN_BASE32);
+      /* 8 links per entry, VOLUME+RAND entries */
+      for(i = 1; i < (VOLUME+RAND); i++) {
+        g_gauge_field_copy_32[i] = g_gauge_field_copy_32[i-1]+8;
+      }
+    }
+#    endif
+    g_update_gauge_copy_32 = 1; /* NOTE(review): presumably marks the 32-bit copy as stale; confirm */
+  }
+  return(0);
+}
+
+void free_gauge_field_32() {
+  /* init_gauge_field_32() allocates nothing when lowmem_flag is set, so there is nothing to free */
+  if(lowmem_flag) return;
+  free(gauge_field_copy_32);
+  free(g_gauge_field_32);
+  free(gauge_field_32);
+}
+
+
+void convert_32_gauge_field( su3_32** gf32, su3** gf, int V){
+  /* Copy the gauge field gf into gf32: for each of the V sites and 4 directions, every one
+   * of the nine entries c00..c22 of the link is converted to _Complex float. */
+  for(int site = 0; site < V; site++) {
+    for(int dir = 0; dir < 4; dir++) {
+      const su3 * const src = &gf[site][dir];
+      su3_32 * const dst = &gf32[site][dir];
+
+      dst->c00 = (_Complex float) src->c00;
+      dst->c01 = (_Complex float) src->c01;
+      dst->c02 = (_Complex float) src->c02;
+
+      dst->c10 = (_Complex float) src->c10;
+      dst->c11 = (_Complex float) src->c11;
+      dst->c12 = (_Complex float) src->c12;
+
+      dst->c20 = (_Complex float) src->c20;
+      dst->c21 = (_Complex float) src->c21;
+      dst->c22 = (_Complex float) src->c22;
+    }
+  }
+#if defined _USE_HALFSPINOR
+  /* empty: no additional step is performed for _USE_HALFSPINOR builds */
+#endif
+}
+
+
+
+
+
+
diff --git a/init/init_gauge_field.h b/init/init_gauge_field.h
index c60fc699f..8245dfca2 100644
--- a/init/init_gauge_field.h
+++ b/init/init_gauge_field.h
@@ -21,5 +21,9 @@
 
 int init_gauge_field(const int V, const int back);
 void free_gauge_field();
+int init_gauge_field_32(const int V, const int back);
+void free_gauge_field_32();
+
+void convert_32_gauge_field( su3_32** gf32, su3** gf, int V);
 
 #endif
diff --git a/init/init_gauge_tmp.c b/init/init_gauge_tmp.c
index 73a935ea7..8e26fe84e 100644
--- a/init/init_gauge_tmp.c
+++ b/init/init_gauge_tmp.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/init/init_geometry_indices.c b/init/init_geometry_indices.c
index 0edc6437c..bc8872081 100644
--- a/init/init_geometry_indices.c
+++ b/init/init_geometry_indices.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -148,8 +148,10 @@ int init_geometry_indices(const int V) {
   for(i = 1; i < (T+4); i++){
     g_ipt[i] = g_ipt[i-1]+(LX+4);
   }
-  g_hi = (int*)calloc(16*(VOLUME+RAND)+2,sizeof(int));
-  if((void*) g_hi == NULL) return(40);
+  if(!lowmem_flag){
+    g_hi = (int*)calloc(16*(VOLUME+RAND)+2,sizeof(int));
+    if((void*) g_hi == NULL) return(40);
+  }
 
 #ifdef WITHLAPH
   g_idn3d = (int**)calloc(SPACEVOLUME, sizeof(int*));
@@ -172,7 +174,9 @@ void free_geometry_indices() {
   free(ipt_);
   free(ipt__);
   free(g_ipt);
-  free(g_hi);
+  if(!lowmem_flag){
+    free(g_hi);
+  }
   free(g_idn);
   free(g_iup);
   free(g_eo2lexic);
diff --git a/init/init_jacobi_field.c b/init/init_jacobi_field.c
index 52b4f1010..db3487eb8 100644
--- a/init/init_jacobi_field.c
+++ b/init/init_jacobi_field.c
@@ -80,7 +80,7 @@ double v[6];
      s->c1 = v[2] + v[3] * I;
      s->c2 = v[4] + v[5] * I;
  }
-#ifdef MPI
+#ifdef TM_USE_MPI
  xchange_jacobi(k);
 #endif
 }
@@ -96,7 +96,7 @@ double v[6];
      s=k+ix;
      *s=unif_su3_vector();
    }
-#ifdef MPI
+#ifdef TM_USE_MPI
  xchange_jacobi(k);
 #endif
 }
diff --git a/init/init_moment_field.c b/init/init_moment_field.c
index 876db4f9f..f0db1a156 100644
--- a/init/init_moment_field.c
+++ b/init/init_moment_field.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
- # include<config.h>
+ # include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/init/init_omp_accumulators.c b/init/init_omp_accumulators.c
index 47d733a46..9013200a2 100644
--- a/init/init_omp_accumulators.c
+++ b/init/init_omp_accumulators.c
@@ -18,9 +18,9 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 #include <omp.h>
 #endif
 #include <stdlib.h>
diff --git a/init/init_openmp.c b/init/init_openmp.c
index 2712010e7..59f67334f 100644
--- a/init/init_openmp.c
+++ b/init/init_openmp.c
@@ -18,9 +18,9 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 #include <omp.h>
 #include "init_omp_accumulators.h"
 #endif
@@ -30,7 +30,7 @@
 #include "global.h"
 
 void init_openmp(void) {
-#ifdef OMP  
+#ifdef TM_USE_OMP  
   if(omp_num_threads > 0) 
   {
      omp_set_num_threads(omp_num_threads);
diff --git a/init/init_parallel.c b/init/init_parallel.c
new file mode 100644
index 000000000..0d1a5453f
--- /dev/null
+++ b/init/init_parallel.c
@@ -0,0 +1,79 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017  Bartosz Kostrzewa, Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include <tmlqcd_config.h>
+#endif
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+#include <omp.h>
+#include "init/init_openmp.h"
+#endif
+#ifdef TM_USE_QPHIX
+#include "qphix/qphix_config.h"
+#endif
+#ifdef QPHIX_QMP_COMMS
+#include <qmp.h>
+#endif
+
+#include "init_parallel.h"
+#include "global.h"
+#include "read_input.h"
+
+void init_parallel_and_read_input(int argc, char *argv[], char input_filename[]) {
+  // Step 1: bring up message passing; QMP is used when enabled, plain MPI otherwise.
+#if defined(QPHIX_QMP_COMMS)
+  QMP_thread_level_t qmp_level_provided;
+  if (QMP_SUCCESS != QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &qmp_level_provided)) {
+    QMP_error("Failed to initialize QMP\n");
+    abort();
+  }
+  if (QMP_is_primary_node()) printf("QMP IS INITIALIZED\n");
+#elif defined(TM_USE_MPI)
+#if defined(TM_USE_OMP)
+  // with OpenMP, MPI_THREAD_SERIALIZED is requested; the level actually granted is not inspected
+  int granted_level;
+  MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &granted_level);
+#else
+  MPI_Init(&argc, &argv);
+#endif
+#endif  // QPHIX_QMP_COMMS / TM_USE_MPI
+
+  // Step 2: record this process' rank in g_proc_id; builds without MPI/QMP are always rank 0.
+#if !defined(TM_USE_MPI) && !defined(QPHIX_QMP_COMMS)
+  g_proc_id = 0;
+#else
+  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
+#endif
+
+  // Step 3: read the input file; any non-zero status from read_input() terminates the program.
+  if (read_input(input_filename) != 0) {
+    fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename);
+    exit(-1);
+  }
+
+  // Step 4: OpenMP is set up last (presumably because its settings come from the input; confirm).
+#if defined(TM_USE_OMP)
+  init_openmp();
+#endif
+}
diff --git a/init/init_parallel.h b/init/init_parallel.h
new file mode 100644
index 000000000..2ed88deae
--- /dev/null
+++ b/init/init_parallel.h
@@ -0,0 +1,27 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017  Bartosz Kostrzewa, Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *******************************************************************************/
+
+#ifndef _INIT_PARALLEL_H
+#define _INIT_PARALLEL_H
+
+void init_parallel_and_read_input(int argc, char *argv[], char input_filename[]);
+
+#endif
diff --git a/init/init_scalar_field.c b/init/init_scalar_field.c
index f1ba15401..5e6d95a46 100644
--- a/init/init_scalar_field.c
+++ b/init/init_scalar_field.c
@@ -21,9 +21,8 @@
  *
  ***********************************************************************/
 
-
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -32,9 +31,11 @@
 #include "su3.h"
 #include "sse.h"
 
-
+#ifdef TM_USE_BSM
 scalar * sca = NULL;
 
+scalar * scasmeared = NULL;
+
 int init_scalar_field(const int V, const int nr) {
   int i = 0;
 
@@ -58,10 +59,33 @@ int init_scalar_field(const int V, const int nr) {
     g_scalar_field[i] = g_scalar_field[i-1]+V;
   }
 
+  if((void*)(scasmeared = (scalar*)calloc(nr*V+1, sizeof(scalar))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(1);
+  }
+  if((void*)(g_smeared_scalar_field = malloc(nr*sizeof(scalar*))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(2);
+  }
+#if ( defined SSE || defined SSE2 || defined SSE3)
+  g_smeared_scalar_field[0] = (scalar*)(((unsigned long int)(scasmeared)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+  g_smeared_scalar_field[0] = scasmeared;
+#endif
+
+  for(i = 1; i < nr; i++){
+    g_smeared_scalar_field[i] = g_smeared_scalar_field[i-1]+V;
+  }
+
+
+
   return(0);
 }
 
 void free_scalar_field() {
   free(sca);
+  free(scasmeared); /* storage behind g_smeared_scalar_field; NOTE(review): the pointer table itself is not freed here */
 }
-
+#endif
diff --git a/init/init_spinor_field.c b/init/init_spinor_field.c
index a2c8eefcf..0eba87b89 100644
--- a/init/init_spinor_field.c
+++ b/init/init_spinor_field.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -80,6 +80,58 @@ void free_spinor_field() {
 }
 
 
+/* Backing store of the 32-bit spinor fields; released in free_spinor_field_32(). */
+spinor32 * sp32 = NULL;
+
+/* Allocates storage for nr 32-bit spinor fields of V sites each, plus one spare
+ * element (presumably slack for the alignment below), and points
+ * g_spinor_field32[0..nr-1] at consecutive V-sized slices of it.
+ * Returns 0 on success, 1 if the field storage and 2 if the pointer table
+ * could not be allocated. */
+int init_spinor_field_32(const int V, const int nr) {
+#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+  sp32 = (spinor32*)shmalloc((nr*V+1)*sizeof(spinor32));
+#else
+  sp32 = (spinor32*)calloc(nr*V+1, sizeof(spinor32));
+#endif
+  if(sp32 == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(1);
+  }
+
+  g_spinor_field32 = (spinor32**)malloc(nr*sizeof(spinor32*));
+  if(g_spinor_field32 == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(2);
+  }
+
+  /* the first field is aligned with the ALIGN_BASE32 mask in every build, SSE or not */
+  g_spinor_field32[0] = (spinor32*)(((unsigned long int)(sp32)+ALIGN_BASE32)&~ALIGN_BASE32);
+  for(int k = 1; k < nr; k++) {
+    g_spinor_field32[k] = g_spinor_field32[k-1]+V;
+  }
+  return(0);
+}
+
+/* Releases the 32-bit spinor storage obtained in init_spinor_field_32().
+ * The shmem/libc switch mirrors the allocator selected there, so sp32 is
+ * always handed back to the allocator it came from.
+ * NOTE(review): the pointer table g_spinor_field32 is not freed here and
+ * sp32 is not reset to NULL afterwards -- confirm this is only ever called
+ * once at shutdown. */
+void free_spinor_field_32() {
+#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+  shfree(sp32);
+#else
+  free(sp32);
+#endif
+}
+
+
+
+
 /** 
  * costumized spinor allocation routines
  */
diff --git a/init/init_spinor_field.h b/init/init_spinor_field.h
index 87cf22fbd..5aad71c1a 100644
--- a/init/init_spinor_field.h
+++ b/init/init_spinor_field.h
@@ -20,12 +20,14 @@
 #define _INIT_SPINOR_FIELD_H
 
 int init_spinor_field(const int V, const int nr);
+int init_spinor_field_32(const int V, const int nr);
 int init_csg_field(const int V);
 
 int allocate_spinor_field_array(spinor ***spinors,spinor **sp,const int V, const int nr);
 void free_spinor_field_array(spinor** sp);
 
 void free_spinor_field();
+void free_spinor_field_32();
 int init_timslice_buffer_field(const int t_slice);
 
 #endif
diff --git a/init/init_stout_smear_vars.c b/init/init_stout_smear_vars.c
index 3bbb986a9..1e433f00b 100644
--- a/init/init_stout_smear_vars.c
+++ b/init/init_stout_smear_vars.c
@@ -20,7 +20,7 @@
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -28,6 +28,7 @@
 #include "global.h"
 #include "su3.h"
 #include "sse.h"
+#include "expo.h"
 #include "init_stout_smear_vars.h"
 
 su3 * gauge_field_saved;
@@ -91,6 +92,8 @@ int init_stout_smear_vars(const int V, const int stout_no_iter)
   k = 0;
   mu = 0;
 
+  if (g_exposu3_no_c == 0) init_exposu3();
+
   /*
    *  this is the field where we store the smeared force matrices \Sigma^{(k)}_\mu(x)
    *  eqtn (44) hep-lat/0311018
diff --git a/integrator.c b/integrator.c
index b2a709a91..94a536d8c 100644
--- a/integrator.c
+++ b/integrator.c
@@ -13,13 +13,13 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -28,6 +28,7 @@
 #include "global.h"
 #include "monomial/monomial.h"
 #include "update_momenta.h"
+#include "update_momenta_fg.h"
 #include "update_gauge.h"
 #include "hamiltonian_field.h"
 #include "integrator.h"
@@ -40,13 +41,15 @@ static const double omf4_vartheta = 0.08398315262876693;
 static const double omf4_lamb = 0.6822365335719091;
 
 /* second order minimal norm integration scheme */
-void integrate_2mn(const double tau, const int S, const int halfstep);
+void integrate_2mn(const double tau, const int S, const int halfstep, const double tau2);
 /* second order minimal norm integration scheme in velocity version */
-void integrate_2mnp(const double tau, const int S, const int halfstep);
+void integrate_2mnp(const double tau, const int S, const int halfstep, const double tau2);
+/* fourth order force gradient integration scheme */
+void integrate_2mnfg(const double tau, const int S, const int halfstep, const double tau2);
 /* Leap Frog integration scheme */
-void integrate_leap_frog(const double tau, const int S, const int halfstep);
+void integrate_leap_frog(const double tau, const int S, const int halfstep, const double tau2);
 /* fourth order OMF scheme */
-void integrate_omf4(const double tau, const int S, const int halfstep);
+void integrate_omf4(const double tau, const int S, const int halfstep,const double tau2 );
 /* half step function */
 void dohalfstep(const double tau, const int S);
 
@@ -77,6 +80,9 @@ int init_integrator() {
       else if(Integrator.type[i] == OMF4) {
 	Integrator.integrate[i] = &integrate_omf4;
       }
+      else if(Integrator.type[i] == MN2FG) {
+	Integrator.integrate[i] = &integrate_2mnfg;
+      }
     }
   }
 
@@ -120,16 +126,17 @@ void integrator_unset_fields() {
   return;
 }
 
-void integrate_omf4(const double tau, const int S, const int halfstep) {
+void integrate_omf4(const double tau, const int S, const int halfstep, const double tau2) {
   int i,j=0;
   integrator * itgr = &Integrator;
-  double eps;
+  double eps,eps2;
 
   if(S == itgr->no_timescales-1) {
     dohalfstep(tau, S);
   }
-  eps = tau/((double)itgr->n_int[S]);
-
+  eps  = tau/((double)itgr->n_int[S]);
+  eps2 = tau2/((double)itgr->n_int[S]);
+  
   if(S == 0) {
 
     for(j = 1; j < itgr->n_int[0]; j++) {
@@ -154,36 +161,36 @@ void integrate_omf4(const double tau, const int S, const int halfstep) {
     update_momenta(itgr->mnls_per_ts[0], omf4_lamb*eps, itgr->no_mnls_per_ts[0], &itgr->hf);
     update_gauge(omf4_rho*eps, &itgr->hf);
     if(halfstep != 1) {
-      update_momenta(itgr->mnls_per_ts[0], 2*omf4_vartheta*eps, itgr->no_mnls_per_ts[0], &itgr->hf);
+      update_momenta(itgr->mnls_per_ts[0], omf4_vartheta*(eps+eps2), itgr->no_mnls_per_ts[0], &itgr->hf);
     }
   }
   else {
     for(i = 1; i < itgr->n_int[S]; i++){
-      itgr->integrate[S-1](omf4_rho*eps, S-1, 0);
+      itgr->integrate[S-1](omf4_rho*eps, S-1, 0, omf4_theta*eps);
       update_momenta(itgr->mnls_per_ts[S], omf4_lamb*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
-      itgr->integrate[S-1](omf4_theta*eps, S-1, 0);
+      itgr->integrate[S-1](omf4_theta*eps, S-1, 0, (1-2.*(omf4_theta+omf4_rho))*eps);
       update_momenta(itgr->mnls_per_ts[S], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
-      itgr->integrate[S-1]((1-2.*(omf4_theta+omf4_rho))*eps, S-1, 0);
+      itgr->integrate[S-1]((1-2.*(omf4_theta+omf4_rho))*eps, S-1, 0, omf4_theta*eps);
       update_momenta(itgr->mnls_per_ts[S], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
-      itgr->integrate[S-1](omf4_theta*eps, S-1, 0);
+      itgr->integrate[S-1](omf4_theta*eps, S-1, 0, omf4_rho*eps);
       update_momenta(itgr->mnls_per_ts[S], omf4_lamb*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
-      itgr->integrate[S-1](omf4_rho*eps, S-1, 0);
+      itgr->integrate[S-1](omf4_rho*eps, S-1, 0, omf4_rho*eps);
       update_momenta(itgr->mnls_per_ts[S], 2*omf4_vartheta*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
     }
-    itgr->integrate[S-1](omf4_rho*eps, S-1, 0);
+    itgr->integrate[S-1](omf4_rho*eps, S-1, 0, omf4_theta*eps);
     update_momenta(itgr->mnls_per_ts[S], omf4_lamb*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
-    itgr->integrate[S-1](omf4_theta*eps, S-1, 0);
+    itgr->integrate[S-1](omf4_theta*eps, S-1, 0, (1-2.*(omf4_theta+omf4_rho))*eps);
     update_momenta(itgr->mnls_per_ts[S], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
-    itgr->integrate[S-1]((1-2.*(omf4_theta+omf4_rho))*eps, S-1, 0);
+    itgr->integrate[S-1]((1-2.*(omf4_theta+omf4_rho))*eps, S-1, 0, omf4_theta*eps);
     update_momenta(itgr->mnls_per_ts[S], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
-    itgr->integrate[S-1](omf4_theta*eps, S-1, 0);
+    itgr->integrate[S-1](omf4_theta*eps, S-1, 0, omf4_rho*eps);
     update_momenta(itgr->mnls_per_ts[S], omf4_lamb*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
     if(S == itgr->no_timescales-1) {
-      itgr->integrate[S-1](omf4_rho*eps, S-1, 1);
+      itgr->integrate[S-1](omf4_rho*eps, S-1, 1, omf4_rho*eps);
     }
-    else itgr->integrate[S-1](omf4_rho*eps, S-1, halfstep);
+    else itgr->integrate[S-1](omf4_rho*eps, S-1, halfstep, omf4_rho*eps2);
     if(halfstep != 1 && S != itgr->no_timescales-1) {
-      update_momenta(itgr->mnls_per_ts[S], 2*omf4_vartheta*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
+      update_momenta(itgr->mnls_per_ts[S], omf4_vartheta*(eps+eps2), itgr->no_mnls_per_ts[S], &itgr->hf);
     }
   }
 
@@ -195,17 +202,17 @@ void integrate_omf4(const double tau, const int S, const int halfstep) {
 
 /* the following are only needed locally */
 
-void integrate_2mn(const double tau, const int S, const int halfstep) {
+void integrate_2mn(const double tau, const int S, const int halfstep, const double tau2) {
   int i,j=0;
   integrator * itgr = &Integrator;
-  double eps,
-    oneminus2lambda = (1.-2.*itgr->lambda[S]);
+  double eps,eps2, oneminus2lambda = (1.-2.*itgr->lambda[S]);
 
   if(S == itgr->no_timescales-1) {
     dohalfstep(tau, S);
   }
   
-  eps = tau/((double)itgr->n_int[S]);
+  eps  = tau/((double)itgr->n_int[S]);
+  eps2 = tau2/((double)itgr->n_int[S]);
   if(S == 0) {
 
     for(j = 1; j < itgr->n_int[0]; j++) {
@@ -218,24 +225,24 @@ void integrate_2mn(const double tau, const int S, const int halfstep) {
     update_momenta(itgr->mnls_per_ts[0], oneminus2lambda*eps, itgr->no_mnls_per_ts[0], &itgr->hf);
     update_gauge(0.5*eps, &itgr->hf);
     if(halfstep != 1) {
-      update_momenta(itgr->mnls_per_ts[0], 2*itgr->lambda[0]*eps, itgr->no_mnls_per_ts[0], &itgr->hf);
+      update_momenta(itgr->mnls_per_ts[0], itgr->lambda[0]*(eps+eps2), itgr->no_mnls_per_ts[0], &itgr->hf);
     }
   }
   else {
     for(i = 1; i < itgr->n_int[S]; i++){
-      itgr->integrate[S-1](eps/2., S-1, 0);
+      itgr->integrate[S-1](eps/2., S-1, 0, eps/2);
       update_momenta(itgr->mnls_per_ts[S], oneminus2lambda*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
-      itgr->integrate[S-1](eps/2., S-1, 0);
+      itgr->integrate[S-1](eps/2., S-1, 0, eps/2);
       update_momenta(itgr->mnls_per_ts[S], 2*itgr->lambda[S]*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
     }
-    itgr->integrate[S-1](eps/2., S-1, 0);
+    itgr->integrate[S-1](eps/2., S-1, 0, eps/2);
     update_momenta(itgr->mnls_per_ts[S], oneminus2lambda*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
     if(S == itgr->no_timescales-1) {
-      itgr->integrate[S-1](eps/2., S-1, 1);
+      itgr->integrate[S-1](eps/2., S-1, 1, eps/2.);
     }
-    else itgr->integrate[S-1](eps/2., S-1, halfstep);
+    else itgr->integrate[S-1](eps/2., S-1, halfstep, eps2/2.);
     if(halfstep != 1 && S != itgr->no_timescales-1) {
-      update_momenta(itgr->mnls_per_ts[S], 2*itgr->lambda[S]*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
+      update_momenta(itgr->mnls_per_ts[S], itgr->lambda[S]*(eps+eps2), itgr->no_mnls_per_ts[S], &itgr->hf);
     }
   }
 
@@ -244,10 +251,11 @@ void integrate_2mn(const double tau, const int S, const int halfstep) {
   }
 }
 
-void integrate_2mnp(const double tau, const int S, const int halfstep) {
+void integrate_2mnp(const double tau, const int S, const int halfstep, const double tau2) {
   int i;
   integrator * itgr = &Integrator;
-  double eps = tau/((double)itgr->n_int[S]);
+  double eps  = tau/((double)itgr->n_int[S]);
+  double eps2 = tau2/((double)itgr->n_int[S]); // dummy stepsize
   double oneminus2lambda = (1.-2.*itgr->lambda[S]);
   
   if(S == 0) {
@@ -265,50 +273,100 @@ void integrate_2mnp(const double tau, const int S, const int halfstep) {
   }
   else {
     for(i = 0; i < itgr->n_int[S]; i++) {
-      integrate_2mnp(itgr->lambda[S]*eps, S-1, halfstep);
+      integrate_2mnp(itgr->lambda[S]*eps, S-1, halfstep,oneminus2lambda*eps);
       update_momenta(itgr->mnls_per_ts[S], 0.5*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
 
-      integrate_2mnp(oneminus2lambda*eps, S-1, halfstep);
+      integrate_2mnp(oneminus2lambda*eps, S-1, halfstep,itgr->lambda[S]*eps);
       update_momenta(itgr->mnls_per_ts[S], 0.5*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
 
-      integrate_2mnp(itgr->lambda[S]*eps, S-1, halfstep);
+      integrate_2mnp(itgr->lambda[S]*eps, S-1, halfstep,itgr->lambda[S]*eps);
     }
   }
 }
 
+/* Fourth order force gradient (2MNFG) scheme on timescale S; lambda MUST be equal to 1/6.
+ * NOTE(review): tau2 appears to be the step of the integrate call that follows this
+ * one, letting the last momentum update cover both (eps+eps2) -- confirm with callers. */
+void integrate_2mnfg(const double tau, const int S, const int halfstep, const double tau2) {
+  integrator * itgr = &Integrator;
+  const double oneminus2lambda = (1.-2.*itgr->lambda[S]);
+  double eps, eps2, half, fg_step;
+  int n;
+
+  if(S == itgr->no_timescales-1) dohalfstep(tau, S);
+  eps  = tau/((double)itgr->n_int[S]);
+  eps2 = tau2/((double)itgr->n_int[S]);
+  half = eps/2.;
+  fg_step = oneminus2lambda*eps;
+  if(S == 0) {
+    for(n = 1; n < itgr->n_int[0]; n++) {
+      update_gauge(half, &itgr->hf);
+      update_momenta_fg(itgr->mnls_per_ts[0], fg_step, itgr->no_mnls_per_ts[0], &itgr->hf, eps);
+      update_gauge(half, &itgr->hf);
+      update_momenta(itgr->mnls_per_ts[0], 2.*itgr->lambda[0]*eps, itgr->no_mnls_per_ts[0], &itgr->hf);
+    }
+    update_gauge(half, &itgr->hf);
+    update_momenta_fg(itgr->mnls_per_ts[0], fg_step, itgr->no_mnls_per_ts[0], &itgr->hf, eps);
+    update_gauge(half, &itgr->hf);
+    if(halfstep != 1) {
+      update_momenta(itgr->mnls_per_ts[0], itgr->lambda[0]*(eps+eps2), itgr->no_mnls_per_ts[0], &itgr->hf);
+    }
+  }
+  else {
+    for(n = 1; n < itgr->n_int[S]; n++) {
+      itgr->integrate[S-1](half, S-1, 0, half);
+      update_momenta_fg(itgr->mnls_per_ts[S], fg_step, itgr->no_mnls_per_ts[S], &itgr->hf, eps);
+      itgr->integrate[S-1](half, S-1, 0, half);
+      update_momenta(itgr->mnls_per_ts[S], 2*itgr->lambda[S]*eps, itgr->no_mnls_per_ts[S], &itgr->hf);
+    }
+    itgr->integrate[S-1](half, S-1, 0, half);
+    update_momenta_fg(itgr->mnls_per_ts[S], fg_step, itgr->no_mnls_per_ts[S], &itgr->hf, eps);
+    if(S == itgr->no_timescales-1) {
+      itgr->integrate[S-1](half, S-1, 1, half);
+    }
+    else {
+      itgr->integrate[S-1](half, S-1, halfstep, eps2/2.);
+      if(halfstep != 1) {
+        update_momenta(itgr->mnls_per_ts[S], itgr->lambda[S]*(eps+eps2), itgr->no_mnls_per_ts[S], &itgr->hf);
+      }
+    }
+  }
+  if(S == itgr->no_timescales-1) dohalfstep(tau, S);
+}
 
-void integrate_leap_frog(const double tau, const int S, const int halfstep) {
+void integrate_leap_frog(const double tau, const int S, const int halfstep,const double tau2) {
   int i;
   integrator * itgr = &Integrator;
-  double eps, eps0;
+  double eps, eps0, eps2;
 
   if(S == itgr->no_timescales-1) {
     dohalfstep(tau, S);
   }
 
   eps = tau/((double)itgr->n_int[S]);
+  eps2 = tau2/((double)itgr->n_int[S]);
   if(S == 0) {
-    eps0 = tau/((double)itgr->n_int[0]);
+    eps0 = tau/((double)itgr->n_int[0]); // same value as eps here, since S == 0
     for(i = 1; i < itgr->n_int[0]; i++) {
       update_gauge(eps0, &itgr->hf);
       update_momenta(itgr->mnls_per_ts[0], eps0, itgr->no_mnls_per_ts[0], &itgr->hf);
     }
     update_gauge(eps0, &itgr->hf);
     if(halfstep != 1) {
-      update_momenta(itgr->mnls_per_ts[0], eps0, itgr->no_mnls_per_ts[0], &itgr->hf);
+      update_momenta(itgr->mnls_per_ts[0], 0.5*(eps0+eps2), itgr->no_mnls_per_ts[0], &itgr->hf);
     }
   }
   else {
     for(i = 1; i < itgr->n_int[S]; i++){
-      itgr->integrate[S-1](eps, S-1, 0);
+      itgr->integrate[S-1](eps, S-1, 0, eps);
       update_momenta(itgr->mnls_per_ts[S], eps, itgr->no_mnls_per_ts[S], &itgr->hf);
     }
     if(S == itgr->no_timescales-1) {
-      itgr->integrate[S-1](eps, S-1, 1);
+      itgr->integrate[S-1](eps, S-1, 1, eps);
     }
-    else itgr->integrate[S-1](eps, S-1, halfstep);
+    else itgr->integrate[S-1](eps, S-1, halfstep, eps2);
     if(halfstep != 1 && S != itgr->no_timescales-1) {
-      update_momenta(itgr->mnls_per_ts[S], eps, itgr->no_mnls_per_ts[S], &itgr->hf);
+      update_momenta(itgr->mnls_per_ts[S], 0.5*(eps+eps2), itgr->no_mnls_per_ts[S], &itgr->hf);
     }
   }
 
@@ -326,7 +384,7 @@ void dohalfstep(const double tau, const int S) {
       update_momenta(itgr->mnls_per_ts[i], 0.5*eps, itgr->no_mnls_per_ts[i], &itgr->hf);
       eps /= ((double)itgr->n_int[i-1]);
     }
-    else if(itgr->type[i] == MN2){
+    else if((itgr->type[i] == MN2) || (itgr->type[i] == MN2FG)){
       update_momenta(itgr->mnls_per_ts[i], itgr->lambda[i]*eps, itgr->no_mnls_per_ts[i], &itgr->hf);
       eps /= ((double)itgr->n_int[i-1])*2;
     }
@@ -338,7 +396,7 @@ void dohalfstep(const double tau, const int S) {
   if(itgr->type[0] == LEAPFROG) {
     update_momenta(itgr->mnls_per_ts[0], 0.5*eps, itgr->no_mnls_per_ts[0], &itgr->hf);
   }
-  else if(itgr->type[0] == MN2) {
+  else if((itgr->type[0] == MN2)||(itgr->type[0] == MN2FG)) {
     update_momenta(itgr->mnls_per_ts[0], itgr->lambda[0]*eps, itgr->no_mnls_per_ts[0], &itgr->hf);
   }
   else if(itgr->type[0] == OMF4) {
diff --git a/integrator.h b/integrator.h
index f8f48c908..c09685a8b 100644
--- a/integrator.h
+++ b/integrator.h
@@ -31,8 +31,9 @@
 #define MN2 6
 #define MN2p 7
 #define OMF4 8
+#define MN2FG 9
 
-typedef void (*integratefk)(const double, const int, const int);
+typedef void (*integratefk)(const double, const int, const int, const double);
 
 typedef struct {
   /* gauge, momenta and derivative fields to be used during integration */
diff --git a/invert.c b/invert.c
index b54060a2c..49c0bf8a8 100644
--- a/invert.c
+++ b/invert.c
@@ -26,7 +26,7 @@
 
 #include"lime.h"
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -34,10 +34,10 @@
 #include <time.h>
 #include <string.h>
 #include <signal.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -46,13 +46,24 @@
 #include "linalg_eo.h"
 #include "geometry_eo.h"
 #include "start.h"
+#include "gettime.h"
 /*#include "eigenvalues.h"*/
 #include "measure_gauge_action.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include "xchange/xchange.h"
 #endif
 #include <io/utils.h>
+#ifdef TM_USE_BSM
+#include "operator/D_psi_BSM3.h"
+#include "operator/D_psi_BSM.h"
+#include "operator/D_psi_BSM2b.h"
+#include "operator/D_psi_BSM2f.h"
+#include "operator/D_psi_BSM2m.h"
 #include <io/scalar.h>
+#include "buffers/utils_nonblocking.h"
+#include "buffers/utils_nogauge.h"
+#endif
+#include "source_generation.h"
 #include "read_input.h"
 #include "mpi_init.h"
 #include "sighandler.h"
@@ -71,7 +82,6 @@
 #include "block.h"
 #include "operator.h"
 #include "sighandler.h"
-#include "solver/dfl_projector.h"
 #include "solver/generate_dfl_subspace.h"
 #include "prepare_source.h"
 #include <io/params.h>
@@ -79,10 +89,25 @@
 #include <io/spinor.h>
 #include <io/utils.h>
 #include "solver/dirac_operator_eigenvectors.h"
+#include "source_generation.h"
 #include "P_M_eta.h"
 #include "operator/tm_operators.h"
 #include "operator/Dov_psi.h"
 #include "solver/spectral_proj.h"
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
+#ifdef TM_USE_QPHIX
+#  include "qphix_interface.h"
+#endif
+#ifdef DDalphaAMG
+#  include "DDalphaAMG_interface.h"
+#endif
+#include "meas/measurements.h"
+#include "source_generation.h"
+#include "expo.h"
+
+#define CONF_FILENAME_LENGTH 500
 
 extern int nstore;
 int check_geometry();
@@ -90,6 +115,7 @@ int check_geometry();
 static void usage();
 static void process_args(int argc, char *argv[], char ** input_filename, char ** filename);
 static void set_default_filenames(char ** input_filename, char ** filename);
+static void invert_compute_modenumber();
 
 int main(int argc, char *argv[])
 {
@@ -97,13 +123,14 @@ int main(int argc, char *argv[])
   int j, i, ix = 0, isample = 0, op_id = 0;
   char datafilename[206];
   char parameterfilename[206];
-  char conf_filename[50];
+#ifdef TM_USE_BSM
   char scalar_filename[50];
+#endif
+  char conf_filename[CONF_FILENAME_LENGTH];
   char * input_filename = NULL;
   char * filename = NULL;
   double plaquette_energy;
   struct stout_parameters params_smear;
-  spinor **s, *s_;
 
 #ifdef _KOJAK_INST
 #pragma pomp inst init
@@ -116,41 +143,19 @@ int main(int argc, char *argv[])
 
   DUM_DERI = 8;
   DUM_MATRIX = DUM_DERI + 5;
-#if ((defined BGL && defined XLC) || defined _USE_TSPLITPAR)
-  NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
-#else
-  NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
-#endif
+  NO_OF_SPINORFIELDS = DUM_MATRIX + 4;
+
+  //4 extra fields (corresponding to DUM_MATRIX+0..5) for deg. and ND matrix mult.  
+  NO_OF_SPINORFIELDS_32 = 6;
 
   verbose = 0;
   g_use_clover_flag = 0;
 
-#ifdef MPI
-
-#  ifdef OMP
-  int mpi_thread_provided;
-  MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
-#  else
-  MPI_Init(&argc, &argv);
-#  endif
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
-#else
-  g_proc_id = 0;
-#endif
 
   process_args(argc,argv,&input_filename,&filename);
   set_default_filenames(&input_filename, &filename);
 
-  /* Read the input file */
-  if( (j = read_input(input_filename)) != 0) {
-    fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename);
-    exit(-1);
-  }
-
-#ifdef OMP
-  init_openmp();
-#endif
+  init_parallel_and_read_input(argc, argv, input_filename);
 
   /* this DBW2 stuff is not needed for the inversion ! */
   if (g_dflgcr_flag == 1) {
@@ -171,21 +176,23 @@ int main(int argc, char *argv[])
 
   /* starts the single and double precision random number */
   /* generator                                            */
-  start_ranlux(rlxd_level, random_seed);
+  start_ranlux(rlxd_level, random_seed^nstore);
 
   /* we need to make sure that we don't have even_odd_flag = 1 */
   /* if any of the operators doesn't use it                    */
   /* in this way even/odd can still be used by other operators */
   for(j = 0; j < no_operators; j++) if(!operator_list[j].even_odd_flag) even_odd_flag = 0;
 
-#ifndef MPI
+#ifndef TM_USE_MPI
   g_dbw2rand = 0;
 #endif
 
 #ifdef _GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
+  j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
+  j += init_gauge_field_32(VOLUMEPLUSRAND, 0);  
 #endif
   if (j != 0) {
     fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
@@ -210,22 +217,24 @@ int main(int argc, char *argv[])
   }
   if (even_odd_flag) {
     j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS);
+    j += init_spinor_field_32(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS_32);   
   }
   else {
     j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
+    j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32);   
   }
   if (j != 0) {
     fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
     exit(-1);
   }
 
-  if(have_bsm_op) {
-    j = init_bispinor_field(VOLUMEPLUSRAND, 6);
-    if ( j!= 0) {
-      fprintf(stderr, "Not enough memory for bispinor fields! Aborting...\n");
-      exit(0);
-    }
+#ifdef TM_USE_BSM
+  j = init_bispinor_field(VOLUMEPLUSRAND, 6);
+  if ( j!= 0) {
+    fprintf(stderr, "Not enough memory for bispinor fields! Aborting...\n");
+    exit(0);
   }
+#endif
 
   if (g_running_phmc) {
     j = init_chi_spinor_field(VOLUMEPLUSRAND / 2, 20);
@@ -259,6 +268,15 @@ int main(int argc, char *argv[])
 
   init_operators();
 
+  /* list and initialize measurements*/
+  if(g_proc_id == 0) {
+    printf("\n");
+    for(int j = 0; j < no_measurements; j++) {
+      printf("# measurement id %d, type = %d\n", j, measurement_list[j].type);
+    }
+  }
+  init_measurements();  
+
   /* this could be maybe moved to init_operators */
 #ifdef _USE_HALFSPINOR
   j = init_dirac_halfspinor();
@@ -266,13 +284,12 @@ int main(int argc, char *argv[])
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
     exit(-1);
   }
-  if (g_sloppy_precision_flag == 1) {
-    j = init_dirac_halfspinor32();
-    if (j != 0)
-    {
-      fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
-      exit(-1);
-    }
+  /* for mixed precision solvers, the 32 bit halfspinor field must always be there */
+  j = init_dirac_halfspinor32();
+  if (j != 0)
+  {
+    fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
+    exit(-1);
   }
 #  if (defined _PERSISTENT)
   if (even_odd_flag)
@@ -281,7 +298,16 @@ int main(int argc, char *argv[])
 #endif
 
   for (j = 0; j < Nmeas; j++) {
-    sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore);
+    int n_written = snprintf(conf_filename, CONF_FILENAME_LENGTH, "%s.%.4d", gauge_input_filename, nstore);
+    if( n_written < 0 || n_written >= CONF_FILENAME_LENGTH ){
+      char error_message[500];
+      snprintf(error_message,
+               500,
+               "Encoding error or gauge configuration filename "
+               "longer than %d characters! See invert.c CONF_FILENAME_LENGTH\n", 
+               CONF_FILENAME_LENGTH);
+      fatal_error(error_message, "invert.c");
+    }
     if (g_cart_id == 0) {
       printf("#\n# Trying to read gauge field from file %s in %s precision.\n",
             conf_filename, (gauge_precision_read_flag == 32 ? "single" : "double"));
@@ -292,15 +318,15 @@ int main(int argc, char *argv[])
       exit(-2);
     }
 
-
     if (g_cart_id == 0) {
       printf("# Finished reading gauge field.\n");
       fflush(stdout);
     }
-#ifdef MPI
+#ifdef TM_USE_MPI
     xchange_gauge(g_gauge_field);
 #endif
-
+    /*Convert to a 32 bit gauge field, after xchange*/
+    convert_32_gauge_field(g_gauge_field_32, g_gauge_field, VOLUMEPLUSRAND);
     /*compute the energy of the gauge field*/
     plaquette_energy = measure_plaquette( (const su3**) g_gauge_field);
 
@@ -323,6 +349,16 @@ int main(int argc, char *argv[])
       }
     }
 
+    /* if any measurements are defined in the input file, do them here */
+    measurement * meas;
+    for(int imeas = 0; imeas < no_measurements; imeas++){
+      meas = &measurement_list[imeas];
+      if (g_proc_id == 0) {
+        fprintf(stdout, "#\n# Beginning online measurement.\n");
+      }
+      meas->measurefunc(nstore, imeas, even_odd_flag);
+    }
+
     if (reweighting_flag == 1) {
       reweighting_factor(reweighting_samples, nstore);
     }
@@ -333,89 +369,22 @@ int main(int argc, char *argv[])
                   0, compute_evs, nstore, even_odd_flag);
     }
     if (phmc_compute_evs != 0) {
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Finalize();
 #endif
       return(0);
     }
 
     /* Compute the mode number or topological susceptibility using spectral projectors, if wanted*/
-
     if(compute_modenumber != 0 || compute_topsus !=0){
-      
-      s_ = calloc(no_sources_z2*VOLUMEPLUSRAND+1, sizeof(spinor));
-      s  = calloc(no_sources_z2, sizeof(spinor*));
-      if(s_ == NULL) { 
-	printf("Not enough memory in %s: %d",__FILE__,__LINE__); exit(42); 
-      }
-      if(s == NULL) { 
-	printf("Not enough memory in %s: %d",__FILE__,__LINE__); exit(42); 
-      }
-      
-      
-      for(i = 0; i < no_sources_z2; i++) {
-#if (defined SSE3 || defined SSE2 || defined SSE)
-        s[i] = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE)+i*VOLUMEPLUSRAND;
-#else
-        s[i] = s_+i*VOLUMEPLUSRAND;
-#endif
-	
-        random_spinor_field_lexic(s[i], reproduce_randomnumber_flag,RN_Z2);
-	
-/* 	what is this here needed for?? */
-/*         spinor *aux_,*aux; */
-/* #if ( defined SSE || defined SSE2 || defined SSE3 ) */
-/*         aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); */
-/*         aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); */
-/* #else */
-/*         aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); */
-/*         aux = aux_; */
-/* #endif */
-	
-        if(g_proc_id == 0) {
-          printf("source %d \n", i);
-        }
-	
-        if(compute_modenumber != 0){
-          mode_number(s[i], mstarsq);
-        }
-	
-        if(compute_topsus !=0) {
-          top_sus(s[i], mstarsq);
-        }
-      }
-      free(s);
-      free(s_);
+      invert_compute_modenumber(); 
     }
 
-
-    /* move to operators as well */
-    if (g_dflgcr_flag == 1) {
-      /* set up deflation blocks */
+    //  set up blocks if Deflation is used 
+    if (g_dflgcr_flag) 
       init_blocks(nblocks_t, nblocks_x, nblocks_y, nblocks_z);
-
-      /* the can stay here for now, but later we probably need */
-      /* something like init_dfl_solver called somewhere else  */
-      /* create set of approximate lowest eigenvectors ("global deflation subspace") */
-
-      /*       g_mu = 0.; */
-      /*       boundary(0.125); */
-      generate_dfl_subspace(g_N_s, VOLUME, reproduce_randomnumber_flag);
-      /*       boundary(g_kappa); */
-      /*       g_mu = g_mu1; */
-
-      /* Compute little Dirac operators */
-      /*       alt_block_compute_little_D(); */
-      if (g_debug_level > 0) {
-        check_projectors(reproduce_randomnumber_flag);
-        check_local_D(reproduce_randomnumber_flag);
-      }
-      if (g_debug_level > 1) {
-        check_little_D_inversion(reproduce_randomnumber_flag);
-      }
-
-    }
-    if(SourceInfo.type == 1) {
+    
+    if(SourceInfo.type == SRC_TYPE_VOL || SourceInfo.type == SRC_TYPE_PION_TS || SourceInfo.type == SRC_TYPE_GEN_PION_TS) {
       index_start = 0;
       index_end = 1;
     }
@@ -423,7 +392,7 @@ int main(int argc, char *argv[])
     g_precWS=NULL;
     if(use_preconditioning == 1){
       /* todo load fftw wisdom */
-#if (defined HAVE_FFTW ) && !( defined MPI)
+#if (defined HAVE_FFTW ) && !( defined TM_USE_MPI)
       loadFFTWWisdom(g_spinor_field[0],g_spinor_field[1],T,LX);
 #else
       use_preconditioning=0;
@@ -436,7 +405,20 @@ int main(int argc, char *argv[])
     for(op_id = 0; op_id < no_operators; op_id++) {
       boundary(operator_list[op_id].kappa);
       g_kappa = operator_list[op_id].kappa; 
-      g_mu = 0.;
+#if defined TM_USE_BSM
+      if (operator_list[op_id].type == BSM2f){
+        init_D_psi_BSM2f();
+      }
+      else if (operator_list[op_id].type == BSM3){
+        init_D_psi_BSM3();
+      }
+#endif
+      g_mu = operator_list[op_id].mu;
+      g_c_sw = operator_list[op_id].c_sw;
+      // DFLGCR and DFLFGMRES
+      if(operator_list[op_id].solver == DFLGCR || operator_list[op_id].solver == DFLFGMRES) {
+        generate_dfl_subspace(g_N_s, VOLUME, reproduce_randomnumber_flag);
+      }
 
       if(use_preconditioning==1 && PRECWSOPERATORSELECT[operator_list[op_id].solver]!=PRECWS_NO ){
         printf("# Using preconditioning with treelevel preconditioning operator: %s \n",
@@ -455,20 +437,20 @@ int main(int argc, char *argv[])
         }
       }
      
-      /* set scalar field counter to InitialScalarCounter */
-      int iscalar = nscalar; 
       /* support multiple inversions for the BSM operator, one for each scalar field */
+#ifdef TM_USE_BSM
       for(int i_pergauge = 0; i_pergauge < operator_list[op_id].npergauge; ++i_pergauge){
+        /* set scalar field counter to InitialScalarCounter */
+        int iscalar= nscalar+j*operator_list[op_id].nscalarstep*operator_list[op_id].npergauge + i_pergauge*operator_list[op_id].nscalarstep;
         // generate or read the scalar field for the BSM operator
         if(operator_list[op_id].type == BSM || operator_list[op_id].type == BSM2b || operator_list[op_id].type == BSM2m || operator_list[op_id].type == BSM2f ){
           /* used by op_write_prop to generate an appropriate output filename */
-          operator_list[op_id].n = i_pergauge;
+          operator_list[op_id].n = iscalar;
           // read scalar field
           if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) {
             for( int s = 0; s < 4; s++) { ranlxd(g_scalar_field[s], VOLUME); }
           } else {
             snprintf(scalar_filename, 50, "%s.%d", scalar_input_filename, iscalar);
-            ++iscalar;
     	      if (g_cart_id == 0) {
     	        printf("#\n# Trying to read scalar field from file %s in %s precision.\n",
     		             scalar_filename, (scalar_precision_read_flag == 32 ? "single" : "double"));
@@ -500,15 +482,16 @@ int main(int argc, char *argv[])
             /* we use g_spinor_field[0-7] for sources and props for the moment */
             /* 0-3 in case of 1 flavour  */
             /* 0-7 in case of 2 flavours */
-            prepare_source(nstore, isample, ix, op_id, read_source_flag, source_location);
+            prepare_source(nstore, isample, ix, op_id, read_source_flag, source_location, random_seed);
             //randmize initial guess for eigcg if needed-----experimental
             if( (operator_list[op_id].solver == INCREIGCG) && (operator_list[op_id].solver_params.eigcg_rand_guess_opt) ){ //randomize the initial guess
                 gaussian_volume_source( operator_list[op_id].prop0, operator_list[op_id].prop1,isample,ix,0); //need to check this
             } 
-            operator_list[op_id].inverter(op_id, index_start, 1);
+            operator_list[op_id].inverter(op_id, index_start, operator_list[op_id].write_prop_flag );
           }
         }
       }
+#endif
 
 
       if(use_preconditioning==1 && operator_list[op_id].precWS!=NULL ){
@@ -520,29 +503,46 @@ int main(int argc, char *argv[])
       if(operator_list[op_id].type == OVERLAP){
         free_Dov_WS();
       }
+#if defined TM_USE_BSM
+      if (operator_list[op_id].type == BSM2f){
+        free_D_psi_BSM2f();
+      }
+      if (operator_list[op_id].type == BSM3){
+        free_D_psi_BSM3();
+      }
+#endif
 
-    }
-    nstore += Nsave;
+    
   }
+  nstore += Nsave;
 
-#ifdef OMP
+ }
+#ifdef TM_USE_OMP
   free_omp_accumulators();
 #endif
   free_blocks();
   free_dfl_subspace();
   free_gauge_field();
+#if defined TM_USE_BSM
   free_scalar_field();
+#endif
+  free_gauge_field_32();
   free_geometry_indices();
   free_spinor_field();
+  free_spinor_field_32();  
   free_moment_field();
   free_chi_spinor_field();
   free(filename);
   free(input_filename);
-#ifdef MPI
+  free(SourceInfo.basename);
+  free(PropInfo.basename);
+#ifdef TM_USE_QUDA
+  _endQuda();
+#endif
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
 #endif
-  free_D_psi_BSM2f();
   return(0);
 #ifdef _KOJAK_INST
 #pragma pomp inst end(main)
@@ -607,3 +607,32 @@ static void set_default_filenames(char ** input_filename, char ** filename) {
   } 
 }
 
+/* Draw no_sources_z2 random sources (RN_Z2) and hand each one to mode_number()
+ * and/or top_sus(), as selected by compute_modenumber / compute_topsus.
+ * Terminates the program with exit(42) if the work space cannot be allocated. */
+static void invert_compute_modenumber() {
+  if(no_sources_z2 == 0) return; /* nothing to do; calloc(0, ..) may legitimately return NULL */
+  /* size_t arithmetic avoids int overflow of the product; the extra element presumably leaves room for the alignment below */
+  spinor * s_ = calloc((size_t)no_sources_z2*VOLUMEPLUSRAND+1, sizeof(spinor));
+  spinor ** s  = calloc(no_sources_z2, sizeof(spinor*));
+  if(s_ == NULL || s == NULL) { 
+    printf("Not enough memory in %s: %d\n",__FILE__,__LINE__); exit(42); 
+  }
+  for(int i = 0; i < no_sources_z2; i++) {
+    /* i-th source: start of the raw buffer rounded up with the ALIGN_BASE mask, plus i full fields */
+    s[i] = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE)+(size_t)i*VOLUMEPLUSRAND;
+    random_spinor_field_lexic(s[i], reproduce_randomnumber_flag,RN_Z2);
+    if(g_proc_id == 0) {
+      printf("source %d \n", i);
+    }
+    if(compute_modenumber != 0){
+      mode_number(s[i], mstarsq);
+    }
+    if(compute_topsus !=0) {
+      top_sus(s[i], mstarsq);
+    }
+  }
+  free(s);
+  free(s_);
+}
+
diff --git a/invert_clover_eo.c b/invert_clover_eo.c
index f318acb1d..ff9990cf3 100644
--- a/invert_clover_eo.c
+++ b/invert_clover_eo.c
@@ -32,7 +32,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include"global.h"
@@ -41,68 +41,171 @@
 #include"operator/tm_operators.h"
 #include"operator/Hopping_Matrix.h"
 #include"operator/clovertm_operators.h"
+#include"operator/clovertm_operators_32.h"
 #include"operator/D_psi.h"
 #include"gamma.h"
+#include"read_input.h"
 #include"solver/solver.h"
+#include"solver/solver_params.h"
 #include"invert_clover_eo.h"
 #include "solver/dirac_operator_eigenvectors.h"
-
+#include "solver/dfl_projector.h"
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
+#ifdef DDalphaAMG
+#  include "DDalphaAMG_interface.h"
+#endif
+#ifdef TM_USE_QPHIX
+#  include "qphix_interface.h"
+#endif
 
 int invert_clover_eo(spinor * const Even_new, spinor * const Odd_new, 
-		     spinor * const Even, spinor * const Odd,
-		     const double precision, const int max_iter,
-		     const int solver_flag, const int rel_prec,solver_params_t solver_params,
-		     su3 *** gf, matrix_mult Qsq, matrix_mult Qm) {
+                     spinor * const Even, spinor * const Odd,
+                     const double precision, const int max_iter,
+                     const int solver_flag, const int rel_prec, const int even_odd_flag,
+		     solver_params_t solver_params,
+                     su3 *** gf, matrix_mult Qsq, matrix_mult Qm,
+                     const ExternalInverter external_inverter, const SloppyPrecision sloppy, const CompressionType compression) {
   int iter;
 
-  if(g_proc_id == 0 && g_debug_level > 0) {
-    printf("# Using even/odd preconditioning!\n"); fflush(stdout);
-  }
-
-  assign_mul_one_sw_pm_imu_inv(EE, Even_new, Even, +g_mu);
+  if(even_odd_flag) {  
+    if(g_proc_id == 0 && g_debug_level > 0) {
+      printf("# Using even/odd preconditioning!\n"); fflush(stdout);
+    }
     
-  Hopping_Matrix(OE, g_spinor_field[DUM_DERI], Even_new); 
-  /* The sign is plus, since in Hopping_Matrix */
-  /* the minus is missing                      */
-  assign_mul_add_r(g_spinor_field[DUM_DERI], +1., Odd, VOLUME/2);
-  /* Do the inversion with the preconditioned  */
-  /* matrix to get the odd sites               */
-
-  /* Here we invert the hermitean operator squared */
-  gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
-  if(g_proc_id == 0) {
-    //printf("# Using CG!\n"); 
-    printf("# mu = %f, kappa = %f, csw = %f\n", 
-	   g_mu/2./g_kappa, g_kappa, g_c_sw);
-    fflush(stdout);
-  }
-  
-  if(solver_flag == CG){
-    if(g_proc_id == 0) {printf("# Using CG!\n"); fflush(stdout);}
-    iter = cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, 
-		 precision, rel_prec, 
-		 VOLUME/2, Qsq);
-    Qm(Odd_new, Odd_new);
-    }else if(solver_flag == INCREIGCG){
+#ifdef TM_USE_QUDA
+    if( external_inverter==QUDA_INVERTER ) {
+      return invert_eo_quda(Even_new, Odd_new, Even, Odd,
+                            precision, max_iter,
+                            solver_flag, rel_prec,
+                            even_odd_flag, solver_params,
+                            sloppy, compression);
+    }
+#endif
+    
+#ifdef DDalphaAMG
+     if ( solver_flag == MG )
+    {
+      return MG_solver_eo(Even_new, Odd_new, Even, Odd, precision, max_iter,
+                          rel_prec, VOLUME/2, gf[0], &Msw_full);
+    }
+#endif
 
-       if(g_proc_id == 0) {printf("# Using Incremental Eig-CG!\n"); fflush(stdout);}
-       iter = incr_eigcg(VOLUME/2,solver_params.eigcg_nrhs, solver_params.eigcg_nrhs1, Odd_new, g_spinor_field[DUM_DERI], solver_params.eigcg_ldh, Qsq,
- 		    	            solver_params.eigcg_tolsq1, solver_params.eigcg_tolsq, solver_params.eigcg_restolsq , solver_params.eigcg_rand_guess_opt, 
-                                    rel_prec, max_iter, solver_params.eigcg_nev, solver_params.eigcg_vmax);
-       Qm(Odd_new, Odd_new);
+    if(g_proc_id == 0) {
+      printf("# mu = %.12f, kappa = %.12f, csw = %.12f\n", 
+             g_mu/2./g_kappa, g_kappa, g_c_sw);
+      fflush(stdout);
+    }
 
-   }else{
-    if(g_proc_id == 0) {printf("# This solver is not available for this operator. Exisiting!\n"); fflush(stdout);}
-    return 0;
+    assign_mul_one_sw_pm_imu_inv(EE, Even_new, Even, +g_mu);
+    
+    Hopping_Matrix(OE, g_spinor_field[DUM_DERI], Even_new); 
+    /* The sign is plus, since in Hopping_Matrix */
+    /* the minus is missing                      */
+    assign_mul_add_r(g_spinor_field[DUM_DERI], +1., Odd, VOLUME/2);
+    /* Do the inversion with the preconditioned  */
+    /* matrix to get the odd sites               */
+    
+    /* Here we invert the hermitean operator squared */
+#ifdef TM_USE_QPHIX
+    if( external_inverter==QPHIX_INVERTER ) {
+      // QPhiX inverts M(mu)M(mu)^dag or M(mu), no gamma_5 multiplication required
+      iter = invert_eo_qphix_oneflavour(Odd_new, g_spinor_field[DUM_DERI],
+                                        max_iter, precision,
+                                        solver_flag, rel_prec,
+                                        solver_params,
+                                        sloppy,
+                                        compression);
+      // for solver_params.solution_type == TM_SOLUTION_M (the default)
+      // QPhiX applies M(mu)^dag internally for normal equation solves, no call to tmLQCD operator required
+    } else
+#endif    
+    if(solver_flag == CG) {
+      if(g_proc_id == 0) {printf("# Using CG!\n"); fflush(stdout);}
+      gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
+      iter = cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, 
+                    precision, rel_prec, 
+                    VOLUME/2, Qsq);
+      Qm(Odd_new, Odd_new);
+    }
+    else if(solver_flag == INCREIGCG){
+      gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
+      if(g_proc_id == 0) {printf("# Using Incremental Eig-CG!\n"); fflush(stdout);}
+      iter = incr_eigcg(VOLUME/2,solver_params.eigcg_nrhs, solver_params.eigcg_nrhs1, Odd_new, g_spinor_field[DUM_DERI], solver_params.eigcg_ldh, Qsq,
+                        solver_params.eigcg_tolsq1, solver_params.eigcg_tolsq, solver_params.eigcg_restolsq , solver_params.eigcg_rand_guess_opt, 
+                        rel_prec, max_iter, solver_params.eigcg_nev, solver_params.eigcg_vmax);
+      Qm(Odd_new, Odd_new);
+    }
+    else if(solver_flag == MIXEDCG){
+      gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
+      iter = mixed_cg_her(Odd_new, g_spinor_field[DUM_DERI], solver_params, 
+			  max_iter, precision, rel_prec, 
+                          VOLUME/2, &Qsw_pm_psi, &Qsw_pm_psi_32);
+      Qm(Odd_new, Odd_new);
+    }
+    else if(solver_flag == RGMIXEDCG){
+      gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
+      iter = rg_mixed_cg_her(Odd_new, g_spinor_field[DUM_DERI], solver_params, max_iter, precision, rel_prec,
+			                     VOLUME/2, &Qsw_pm_psi, &Qsw_pm_psi_32);
+      Qm(Odd_new, Odd_new);
+    }
+    else{
+      if(g_proc_id == 0) {printf("# This solver is not available for this operator. Exiting!\n"); fflush(stdout);}
+      return 0;
+    }
+    
+    /* Reconstruct the even sites                */
+    Hopping_Matrix(EO, g_spinor_field[DUM_DERI], Odd_new);
+    clover_inv(g_spinor_field[DUM_DERI], +1, g_mu);
+    /* The sign is plus, since in Hopping_Matrix */
+    /* the minus is missing                      */
+    assign_add_mul_r(Even_new, g_spinor_field[DUM_DERI], +1., VOLUME/2);
   }
 
+  else {
+    if(g_proc_id == 0) {
+      printf("# Not using even/odd preconditioning!\n"); fflush(stdout);
+    }
+#ifdef TM_USE_QUDA
+    if( external_inverter==QUDA_INVERTER ) {
+      return invert_eo_quda(Even_new, Odd_new, Even, Odd,
+                            precision, max_iter,
+                            solver_flag, rel_prec,
+                            even_odd_flag, solver_params,
+                            sloppy, compression);
+    }
+#endif
+    convert_eo_to_lexic(g_spinor_field[DUM_DERI], Even, Odd);
 
-  /* Reconstruct the even sites                */
-  Hopping_Matrix(EO, g_spinor_field[DUM_DERI], Odd_new);
-  clover_inv(g_spinor_field[DUM_DERI], +1, g_mu);
-  /* The sign is plus, since in Hopping_Matrix */
-  /* the minus is missing                      */
-  assign_add_mul_r(Even_new, g_spinor_field[DUM_DERI], +1., VOLUME/2);
-
+    /* Full-lattice solve: the solution must end up in g_spinor_field[DUM_DERI+1], which convert_lexic_to_eo reads below. */
+    if(solver_flag == DFLGCR) {
+      if(g_proc_id == 0) {printf("# Using deflated GCR solver! m = %d\n", gmres_m_parameter); fflush(stdout);}
+      iter = gcr(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, 
+                 max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 2, &D_psi);
+    }
+    else if (solver_flag == DFLFGMRES) {
+      if(g_proc_id == 0) {printf("# Using deflated FGMRES solver! m = %d\n", gmres_m_parameter); fflush(stdout);}
+      iter = fgmres(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, 
+                    max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 2, &D_psi);
+    }
+    else if(solver_flag == CG){
+      if(g_proc_id == 0) {printf("# Using CG!\n"); fflush(stdout);}
+      gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME);
+      iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, rel_prec, VOLUME, Qsq);
+      Qm(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);
+    }
+#ifdef DDalphaAMG
+    else if ( solver_flag == MG ) {
+      return MG_solver_eo(Even_new, Odd_new, Even, Odd, precision, max_iter, rel_prec, VOLUME/2, gf[0], &Msw_full);
+    }
+#endif
+    else { /* no solver ran: iter is unset and DUM_DERI+1 holds no solution, so stop here as the even/odd branch does */
+      if(g_proc_id == 0) {printf("# This solver is not available for this operator. Exiting!\n"); fflush(stdout);}
+      return 0;
+    }
+    convert_lexic_to_eo(Even_new, Odd_new, g_spinor_field[DUM_DERI+1]);
+  }
   return(iter);
 }
+
diff --git a/invert_clover_eo.h b/invert_clover_eo.h
index e0d7cd996..adecc037e 100644
--- a/invert_clover_eo.h
+++ b/invert_clover_eo.h
@@ -1,13 +1,18 @@
 #ifndef _INVERT_CLOVER_EO_H
 #define _INVERT_CLOVER_EO_H
 
+#include "global.h"
 #include "su3.h"
+#include "misc_types.h"
 #include "solver/matrix_mult_typedef.h"
 #include "solver/solver_params.h"
+
 int invert_clover_eo(spinor * const Even_new, spinor * const Odd_new, 
-		     spinor * const Even, spinor * const Odd,
-		     const double precision, const int max_iter,
-		     const int solver_flag, const int rel_prec,solver_params_t solver_params,
-		     su3 *** gf, matrix_mult Qsq, matrix_mult Qm);
+                     spinor * const Even, spinor * const Odd,
+                     const double precision, const int max_iter,
+                     const int solver_flag, const int rel_prec,
+		     const int even_odd_flag, solver_params_t solver_params,
+                     su3 *** gf, matrix_mult Qsq, matrix_mult Qm,
+                     const ExternalInverter external_inverter, const SloppyPrecision sloppy, const CompressionType compression);
 
 #endif
diff --git a/invert_doublet_eo.c b/invert_doublet_eo.c
index 4a6419abe..55b4f2d9d 100644
--- a/invert_doublet_eo.c
+++ b/invert_doublet_eo.c
@@ -32,7 +32,7 @@
  ****************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include"global.h"
@@ -45,8 +45,17 @@
 #include"read_input.h"
 #include"xchange/xchange.h"
 #include"operator/tm_operators_nd.h"
+#include"operator/tm_operators_nd_32.h"
 #include"invert_doublet_eo.h"
-
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
+#ifdef DDalphaAMG
+#  include "DDalphaAMG_interface.h"
+#endif
+#ifdef TM_USE_QPHIX
+#include "qphix_interface.h"
+#endif
 
 #ifdef HAVE_GPU
 #  include"GPU/cudadefs.h"
@@ -59,73 +68,42 @@ extern su3* g_trafo;
 #  endif
 #endif
 
-
 int invert_doublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s, 
-		      spinor * const Even_new_c, spinor * const Odd_new_c, 
-		      spinor * const Even_s, spinor * const Odd_s,
-		      spinor * const Even_c, spinor * const Odd_c,
-		      const double precision, const int max_iter,
-		      const int solver_flag, const int rel_prec) {
+                      spinor * const Even_new_c, spinor * const Odd_new_c,
+                      spinor * const Even_s, spinor * const Odd_s,
+                      spinor * const Even_c, spinor * const Odd_c,
+                      const double precision, const int max_iter,
+                      const int solver_flag, const int rel_prec, 
+                      solver_params_t solver_params, const ExternalInverter external_inverter, 
+                      const SloppyPrecision sloppy, const CompressionType compression) {
 
   int iter = 0;
-  
+
+#ifdef TM_USE_QUDA
+  if( external_inverter==QUDA_INVERTER ) {
+    return invert_doublet_eo_quda( Even_new_s, Odd_new_s, Even_new_c, Odd_new_c,
+                                   Even_s, Odd_s, Even_c, Odd_c,
+                                   precision, max_iter,
+                                   solver_flag, rel_prec, 1,
+                                   sloppy, compression );
+  }
+#endif
+
+#ifdef DDalphaAMG
+  if( solver_flag==MG ) {
+    return MG_solver_nd_eo( Even_new_s, Odd_new_s, Even_new_c, Odd_new_c,
+                            Even_s, Odd_s, Even_c, Odd_c,
+                            precision, max_iter, rel_prec,
+                            VOLUME/2, g_gauge_field, M_full_ndpsi );
+  }
+#endif
   
 #ifdef HAVE_GPU
 #  ifdef TEMPORALGAUGE
-  
-  /* initialize temporal gauge here */
-  int retval;
-  double dret1, dret2;
-  double plaquette1 = 0.0;
-  double plaquette2 = 0.0;
-  
   if (usegpu_flag) {
-    
-    /* need VOLUME here (not N=VOLUME/2)*/
-    if ((retval = init_temporalgauge_trafo(VOLUME, g_gauge_field)) != 0 ) {				// initializes the transformation matrices
-      if (g_proc_id == 0) printf("Error while gauge fixing to temporal gauge. Aborting...\n");   	//	g_tempgauge_field as a copy of g_gauge_field
-      exit(200);
-    }
-    
-    /* do trafo */
-    plaquette1 = measure_plaquette(g_gauge_field);
-    apply_gtrafo(g_gauge_field, g_trafo);								// transformation of the gauge field
-    plaquette2 = measure_plaquette(g_gauge_field);
-    if (g_proc_id == 0) printf("\tPlaquette before gauge fixing: %.16e\n", plaquette1/6./VOLUME);
-    if (g_proc_id == 0) printf("\tPlaquette after gauge fixing:  %.16e\n", plaquette2/6./VOLUME);
-    
-    /* do trafo to odd_s part of source */
-    dret1 = square_norm(Odd_s, VOLUME/2 , 1);
-    apply_gtrafo_spinor_odd(Odd_s, g_trafo);								// odd spinor transformation, strange
-    dret2 = square_norm(Odd_s, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
-    
-    /* do trafo to odd_c part of source */
-    dret1 = square_norm(Odd_c, VOLUME/2 , 1);
-    apply_gtrafo_spinor_odd(Odd_c, g_trafo);								// odd spinor transformation, charm
-    dret2 = square_norm(Odd_c, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);       
-    
-    /* do trafo to even_s part of source */
-    dret1 = square_norm(Even_s, VOLUME/2 , 1);
-    apply_gtrafo_spinor_even(Even_s, g_trafo);							// even spinor transformation, strange
-    dret2 = square_norm(Even_s, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
-    
-    /* do trafo to even_c part of source */
-    dret1 = square_norm(Even_c, VOLUME/2 , 1);
-    apply_gtrafo_spinor_even(Even_c, g_trafo);							// even spinor transformation, charm
-    dret2 = square_norm(Even_c, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
-    
-#    ifdef MPI
-    xchange_gauge(g_gauge_field);
-#    endif
-    
+    gtrafo_eo_nd(Even_s, Odd_s, Even_c, Odd_c, 
+                 (spinor*const)NULL, (spinor*const)NULL, (spinor*const)NULL, (spinor*const)NULL, 
+                 GTRAFO_APPLY);    
   } 
 #  endif  
 #endif /* HAVE_GPU*/
@@ -134,8 +112,8 @@ int invert_doublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s,
   /* here comes the inversion using even/odd preconditioning */
   if(g_proc_id == 0) {printf("# Using even/odd preconditioning!\n"); fflush(stdout);}
   M_ee_inv_ndpsi(Even_new_s, Even_new_c, 
-		 Even_s, Even_c,
-		 g_mubar, g_epsbar);
+                 Even_s, Even_c,
+                 g_mubar, g_epsbar);
   Hopping_Matrix(OE, g_spinor_field[DUM_DERI], Even_new_s);
   Hopping_Matrix(OE, g_spinor_field[DUM_DERI+1], Even_new_c);
   
@@ -153,44 +131,58 @@ int invert_doublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s,
     printf("# Using CG for TMWILSON flavour doublet!\n"); 
     fflush(stdout);
   }
-  gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
-  gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], VOLUME/2);
-  
+  if ( external_inverter == NO_EXT_INV ){
+    gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
+    gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], VOLUME/2);
   
 #ifdef HAVE_GPU
-  if (usegpu_flag) {	// GPU, mixed precision solver
-#  if defined(MPI) && defined(PARALLELT)
-    iter = mixedsolve_eo_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
-			    max_iter, precision, rel_prec);
-#  elif !defined(MPI) && !defined(PARALLELT)
-    iter = mixedsolve_eo_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
-			    max_iter, precision, rel_prec);
-#  else
-    printf("MPI and/or PARALLELT are not appropriately set for the GPU implementation. Aborting...\n");
-    exit(-1);
-#  endif
-  }
-  else {		// CPU, conjugate gradient
-    iter = cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
-		     max_iter, precision, rel_prec, 
-		     VOLUME/2, &Qtm_pm_ndpsi);
-  }
-#else			// CPU, conjugate gradient
-  iter = cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
-		   max_iter, precision, rel_prec, 
-		   VOLUME/2, &Qtm_pm_ndpsi);
+    if (usegpu_flag) {    // GPU, mixed precision solver
+#    if ( defined TM_USE_MPI  && defined PARALLELT )
+      iter = mixedsolve_eo_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
+                              max_iter, precision, rel_prec);
+#    elif ( !defined TM_USE_MPI  && !defined PARALLELT )
+      iter = mixedsolve_eo_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
+                              max_iter, precision, rel_prec);
+#    else
+      printf("MPI and/or PARALLELT are not appropriately set for the GPU implementation. Aborting...\n");
+      exit(-1);
+#    endif
+    }
+    else {                // CPU, conjugate gradient
+      iter = cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
+                       max_iter, precision, rel_prec, 
+                       VOLUME/2, &Qtm_pm_ndpsi);
+    }
+#else                   // CPU, conjugate gradient
+    if(solver_flag == RGMIXEDCG){
+      iter = rg_mixed_cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
+                                solver_params, max_iter, precision, rel_prec, VOLUME/2,
+                                &Qtm_pm_ndpsi, &Qtm_pm_ndpsi_32);
+    } 
+    else {
+      iter = cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
+                       max_iter, precision, rel_prec, VOLUME/2, &Qtm_pm_ndpsi);
+    }
 #endif
-  
-  
-  Qtm_dagger_ndpsi(Odd_new_s, Odd_new_c,
-		   Odd_new_s, Odd_new_c);
+    Qtm_dagger_ndpsi(Odd_new_s, Odd_new_c,
+                     Odd_new_s, Odd_new_c);
+  } // if(NO_EXT_INV)
+#ifdef TM_USE_QPHIX
+  else if (external_inverter == QPHIX_INVERTER ) {
+    // using QPhiX, we invert M M^dagger y = b, so we don't need gamma_5 multiplications
+    iter = invert_eo_qphix_twoflavour(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
+                                      max_iter, precision, solver_flag, rel_prec,
+                                      solver_params, sloppy, compression);
+    // and it multiplies y internally by M^dagger, returning M^{-1} b as required
+  }
+#endif // TM_USE_QPHIX
 
   /* Reconstruct the even sites                */
   Hopping_Matrix(EO, g_spinor_field[DUM_DERI], Odd_new_s);
   Hopping_Matrix(EO, g_spinor_field[DUM_DERI+1], Odd_new_c);
   M_ee_inv_ndpsi(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI+3],
-		 g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
-		 g_mubar, g_epsbar);
+                 g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
+                 g_mubar, g_epsbar);
   
   /* The sign is plus, since in Hopping_Matrix */
   /* the minus is missing                      */
@@ -201,83 +193,9 @@ int invert_doublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s,
 #ifdef HAVE_GPU  
   /* return from temporal gauge again */
 #  ifdef TEMPORALGAUGE
-  
   if (usegpu_flag) { 
-    
-    /* undo trafo */
-    /* apply_inv_gtrafo(g_gauge_field, g_trafo);*/
-    /* copy back the saved original field located in g_tempgauge_field -> update necessary*/
-    plaquette1 = measure_plaquette(g_gauge_field);
-    copy_gauge_field(g_gauge_field, g_tempgauge_field);
-    g_update_gauge_copy = 1;
-    plaquette2 = measure_plaquette(g_gauge_field);
-    if (g_proc_id == 0) printf("\tPlaquette before inverse gauge fixing: %.16e\n", plaquette1/6./VOLUME);
-    if (g_proc_id == 0) printf("\tPlaquette after inverse gauge fixing:  %.16e\n", plaquette2/6./VOLUME);
-    
-    /* undo trafo to source Even_s */
-    dret1 = square_norm(Even_s, VOLUME/2 , 1);
-    apply_inv_gtrafo_spinor_even(Even_s, g_trafo);
-    dret2 = square_norm(Even_s, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
-    
-    
-    /* undo trafo to source Even_c */
-    dret1 = square_norm(Even_c, VOLUME/2 , 1);
-    apply_inv_gtrafo_spinor_even(Even_c, g_trafo);
-    dret2 = square_norm(Even_c, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1);
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2); 
-    
-    /* undo trafo to source Odd_s */
-    dret1 = square_norm(Odd_s, VOLUME/2 , 1);
-    apply_inv_gtrafo_spinor_odd(Odd_s, g_trafo);
-    dret2 = square_norm(Odd_s, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
-    
-    /* undo trafo to source Odd_c */
-    dret1 = square_norm(Odd_c, VOLUME/2 , 1);
-    apply_inv_gtrafo_spinor_odd(Odd_c, g_trafo);
-    dret2 = square_norm(Odd_c, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2); 
-    
-    
-    // Even_new_s
-    dret1 = square_norm(Even_new_s, VOLUME/2 , 1);
-    apply_inv_gtrafo_spinor_even(Even_new_s, g_trafo);
-    dret2 = square_norm(Even_new_s, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
-    
-    // Even_new_c
-    dret1 = square_norm(Even_new_c, VOLUME/2 , 1);
-    apply_inv_gtrafo_spinor_even(Even_new_c, g_trafo);
-    dret2 = square_norm(Even_new_c, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
-    
-    // Odd_new_s
-    dret1 = square_norm(Odd_new_s, VOLUME/2 , 1);
-    apply_inv_gtrafo_spinor_odd(Odd_new_s, g_trafo);
-    dret2 = square_norm(Odd_new_s, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
-    
-    // Odd_new_c
-    dret1 = square_norm(Odd_new_c, VOLUME/2 , 1);
-    apply_inv_gtrafo_spinor_odd(Odd_new_c, g_trafo);
-    dret2 = square_norm(Odd_new_c, VOLUME/2, 1);
-    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); 
-    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2); 
-    
-    finalize_temporalgauge();
-    
-#    ifdef MPI
-    xchange_gauge(g_gauge_field);
-#    endif
-    
+    gtrafo_eo_nd(Even_s, Odd_s, Even_c, Odd_c, Even_new_s, Odd_new_s, Even_new_c, Odd_new_c,
+                 GTRAFO_REVERT);
   }
 #  endif
 #endif
@@ -286,52 +204,86 @@ int invert_doublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s,
 
 
 int invert_cloverdoublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s, 
-			    spinor * const Even_new_c, spinor * const Odd_new_c, 
-			    spinor * const Even_s, spinor * const Odd_s,
-			    spinor * const Even_c, spinor * const Odd_c,
-			    const double precision, const int max_iter,
-			    const int solver_flag, const int rel_prec) {
+                      spinor * const Even_new_c, spinor * const Odd_new_c,
+                      spinor * const Even_s, spinor * const Odd_s,
+                      spinor * const Even_c, spinor * const Odd_c,
+                      const double precision, const int max_iter,
+                      const int solver_flag, const int rel_prec, solver_params_t solver_params,
+                      const ExternalInverter external_inverter, const SloppyPrecision sloppy, const CompressionType compression) {
   
   int iter = 0;
-  
+
+#ifdef TM_USE_QUDA
+  if( external_inverter==QUDA_INVERTER ) {
+    return invert_doublet_eo_quda( Even_new_s, Odd_new_s, Even_new_c, Odd_new_c,
+                                   Even_s, Odd_s, Even_c, Odd_c,
+                                   precision, max_iter,
+                                   solver_flag, rel_prec, 1,
+                                   sloppy, compression );
+  }
+#endif
+
+#ifdef DDalphaAMG
+  if( solver_flag==MG ) {
+    return MG_solver_nd_eo( Even_new_s, Odd_new_s, Even_new_c, Odd_new_c,
+                            Even_s, Odd_s, Even_c, Odd_c,
+                            precision, max_iter, rel_prec,
+                            VOLUME/2, g_gauge_field, Msw_full_ndpsi );
+  }
+#endif
   
   /* here comes the inversion using even/odd preconditioning */
   if(g_proc_id == 0) {printf("# Using even/odd preconditioning!\n"); fflush(stdout);}
   Msw_ee_inv_ndpsi(Even_new_s, Even_new_c, 
-		   Even_s, Even_c);
+                  Even_s, Even_c);
   Hopping_Matrix(OE, g_spinor_field[DUM_DERI], Even_new_s);
   Hopping_Matrix(OE, g_spinor_field[DUM_DERI+1], Even_new_c);
   
   /* The sign is plus, since in Hopping_Matrix */
   /* the minus is missing                      */
   assign_mul_add_r(g_spinor_field[DUM_DERI], +1., Odd_s, VOLUME/2);
-  assign_mul_add_r(g_spinor_field[DUM_DERI+1], +1., Odd_c, VOLUME/2);
-  
-  /* Do the inversion with the preconditioned  */
-  /* matrix to get the odd sites               */
-  
-  /* Here we invert the hermitean operator squared */
-  
-  if(g_proc_id == 0) {
-    printf("# Using CG for TMWILSON flavour doublet!\n"); 
-    fflush(stdout);
+  assign_mul_add_r(g_spinor_field[DUM_DERI+1], +1., Odd_c, VOLUME/2);  
+  if( external_inverter == NO_EXT_INV ){    
+    /* Do the inversion with the preconditioned  */
+    /* matrix to get the odd sites               */
+    
+    /* Here we invert the hermitean operator squared */
+    
+    if(g_proc_id == 0) {
+      printf("# Using CG for TMWILSON flavour doublet!\n"); 
+      fflush(stdout);
+    }
+    gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
+    gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], VOLUME/2);
+    
+    if(solver_flag == RGMIXEDCG){
+      iter = rg_mixed_cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
+                                solver_params, max_iter, precision, rel_prec, VOLUME/2,
+                                &Qsw_pm_ndpsi, &Qsw_pm_ndpsi_32);
+    } else {
+      iter = cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
+                                  max_iter, precision, rel_prec, 
+                                  VOLUME/2, &Qsw_pm_ndpsi);
+    }
+    
+    Qsw_dagger_ndpsi(Odd_new_s, Odd_new_c,
+                    Odd_new_s, Odd_new_c);
+  } // if(NO_EXT_INV)
+#ifdef TM_USE_QPHIX
+  else if (external_inverter == QPHIX_INVERTER ) {
+    // using QPhiX, we invert M M^dagger y = b, so we don't need gamma_5 multiplications
+    iter = invert_eo_qphix_twoflavour(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
+                                      max_iter, precision, solver_flag, rel_prec,
+                                      solver_params, sloppy, compression);
+    // and it multiplies y internally by M^dagger, returning M^{-1} b as required
   }
-  gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
-  gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], VOLUME/2);
-  
-  iter = cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1],
-		   max_iter, precision, rel_prec, 
-		   VOLUME/2, &Qsw_pm_ndpsi);
-  
-  
-  Qsw_dagger_ndpsi(Odd_new_s, Odd_new_c,
-		   Odd_new_s, Odd_new_c);
+#endif // TM_USE_QPHIX
   
   /* Reconstruct the even sites                */
   Hopping_Matrix(EO, g_spinor_field[DUM_DERI], Odd_new_s);
   Hopping_Matrix(EO, g_spinor_field[DUM_DERI+1], Odd_new_c);
   Msw_ee_inv_ndpsi(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI+3],
-		   g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1]);
+                   g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1]);
   
   /* The sign is plus, since in Hopping_Matrix */
   /* the minus is missing                      */
diff --git a/invert_doublet_eo.h b/invert_doublet_eo.h
index d6835f3ff..ef30b307c 100644
--- a/invert_doublet_eo.h
+++ b/invert_doublet_eo.h
@@ -30,12 +30,17 @@
 #ifndef _INVERT_DOUBLET_EO_H
 #define _INVERT_DOUBLET_EO_H
 
+#include "global.h"
+#include "misc_types.h"
+#include "solver/solver_params.h"
+
 int invert_doublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s, 
-		      spinor * const Even_new_c, spinor * const Odd_new_c, 
-		      spinor * const Even_s, spinor * const Odd_s,
-		      spinor * const Even_c, spinor * const Odd_c,
-		      const double precision, const int max_iter,
-		      const int solver_flag, const int rel_prec);
+                      spinor * const Even_new_c, spinor * const Odd_new_c,
+                      spinor * const Even_s, spinor * const Odd_s,
+                      spinor * const Even_c, spinor * const Odd_c,
+                      const double precision, const int max_iter,
+                      const int solver_flag, const int rel_prec, solver_params_t solver_params,
+                      const ExternalInverter external_inverter, const SloppyPrecision sloppy, const CompressionType compression);
 
 
 /* This is the full matrix multiplication */
@@ -47,9 +52,10 @@ int invert_doublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s,
 /* 		      spinor * const Even, spinor * const Odd); */
 
 int invert_cloverdoublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s, 
-			    spinor * const Even_new_c, spinor * const Odd_new_c, 
-			    spinor * const Even_s, spinor * const Odd_s,
-			    spinor * const Even_c, spinor * const Odd_c,
-			    const double precision, const int max_iter,
-			    const int solver_flag, const int rel_prec);
+                      spinor * const Even_new_c, spinor * const Odd_new_c,
+                      spinor * const Even_s, spinor * const Odd_s,
+                      spinor * const Even_c, spinor * const Odd_c,
+                      const double precision, const int max_iter,
+                      const int solver_flag, const int rel_prec, solver_params_t solver_params,
+                      const ExternalInverter external_inverter, const SloppyPrecision sloppy, const CompressionType compression);
 #endif
diff --git a/invert_eo.c b/invert_eo.c
index 43b6981a1..a0a7c1c06 100644
--- a/invert_eo.c
+++ b/invert_eo.c
@@ -1,5 +1,5 @@
 /***********************************************************************
- * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
  *
  * This file is part of tmLQCD.
  *
@@ -32,7 +32,7 @@
  ****************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include"global.h"
@@ -40,6 +40,7 @@
 #include"operator/tm_operators.h"
 #include"operator/Hopping_Matrix.h"
 #include"operator/D_psi.h"
+#include"operator/tm_operators_32.h"
 #include"gamma.h"
 #include"solver/solver.h"
 #include"read_input.h"
@@ -48,11 +49,20 @@
 #include"solver/dfl_projector.h"
 #include"invert_eo.h"
 #include "solver/dirac_operator_eigenvectors.h"
-
 /* FIXME temporary includes and declarations until IO and interface for invert and CGMMS are generelized */
 #include "init/init_spinor_field.h"
 #include <io/params.h>
 #include <io/spinor.h>
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
+#ifdef TM_USE_QPHIX
+#  include "qphix_interface.h"
+#endif
+#ifdef DDalphaAMG
+#  include "DDalphaAMG_interface.h"
+#endif
+
 static double cgmms_reached_prec = 0.0; 
 static void cgmms_write_props(spinor ** const P, double const * const extra_masses, const int no_extra_masses, const int id, const int iteration);
 
@@ -62,23 +72,40 @@ static void cgmms_write_props(spinor ** const P, double const * const extra_mass
 #include"measure_gauge_action.h"
 
 extern int mixed_solve (spinor * const P, spinor * const Q, const int max_iter, 
-			double eps, const int rel_prec,const int N);
+                        double eps, const int rel_prec,const int N);
 extern  int mixed_solve_eo (spinor * const P, spinor * const Q, const int max_iter, 
-			    double eps, const int rel_prec, const int N);
+                            double eps, const int rel_prec, const int N);
 #ifdef TEMPORALGAUGE
 extern su3* g_trafo;
 #endif
 #endif
 
 int invert_eo(spinor * const Even_new, spinor * const Odd_new, 
-	      spinor * const Even, spinor * const Odd,
-	      const double precision, const int max_iter,
-	      const int solver_flag, const int rel_prec,
-	      const int sub_evs_flag, const int even_odd_flag,
-        const int no_extra_masses, double * const extra_masses, solver_params_t solver_params,
-        const int id )  {
+              spinor * const Even, spinor * const Odd,
+              const double precision, const int max_iter,
+              const int solver_flag, const int rel_prec,
+              const int sub_evs_flag, const int even_odd_flag,
+              const int no_extra_masses, double * const extra_masses, solver_params_t solver_params, const int id,
+              const ExternalInverter external_inverter, const SloppyPrecision sloppy, const CompressionType compression )  {
 
   int iter = 0;
+
+#ifdef TM_USE_QUDA
+  if( external_inverter==QUDA_INVERTER ) {
+    return invert_eo_quda(Even_new, Odd_new, Even, Odd,
+                          precision, max_iter,
+                          solver_flag, rel_prec,
+                          even_odd_flag, solver_params,
+                          sloppy, compression);
+  }
+#endif
+
+#ifdef DDalphaAMG
+  if ( solver_flag==MG )
+    return MG_solver_eo(Even_new, Odd_new, Even, Odd, precision, max_iter,
+			rel_prec, VOLUME/2, g_gauge_field, &M_full);
+#endif
+
   /* here comes the inversion using even/odd preconditioning */
   if(even_odd_flag) {
     if(g_proc_id == 0) {printf("# Using even/odd preconditioning!\n"); fflush(stdout);}
@@ -94,8 +121,8 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
     
       /* need VOLUME here (not N=VOLUME/2)*/
       if((retval=init_temporalgauge_trafo(VOLUME, g_gauge_field)) !=0){
-	if(g_proc_id == 0) printf("Error while gauge fixing to temporal gauge. Aborting...\n");   
-	exit(200);
+        if(g_proc_id == 0) printf("Error while gauge fixing to temporal gauge. Aborting...\n");   
+        exit(200);
       }
       plaquette = measure_plaquette(g_gauge_field);
       if(g_proc_id == 0) printf("Plaquette before gauge fixing: %.16e\n", plaquette/6./VOLUME);
@@ -124,18 +151,37 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
  
     assign_mul_one_pm_imu_inv(Even_new, Even, +1., VOLUME/2);
     
-    Hopping_Matrix(OE, g_spinor_field[DUM_DERI], Even_new); 
+    Hopping_Matrix(OE, g_spinor_field[DUM_DERI], Even_new);
     /* The sign is plus, since in Hopping_Matrix */
     /* the minus is missing                      */
     assign_mul_add_r(g_spinor_field[DUM_DERI], +1., Odd, VOLUME/2);
     /* Do the inversion with the preconditioned  */
     /* matrix to get the odd sites               */
     
+#ifdef TM_USE_QPHIX
+    if( external_inverter==QPHIX_INVERTER ) {
+      // QPhiX inverts M(mu)M(mu)^dag or M(mu), no gamma_5 source multiplication required
+      iter = invert_eo_qphix_oneflavour(Odd_new, g_spinor_field[DUM_DERI],
+                                        max_iter, precision,
+                                        solver_flag, rel_prec,
+                                        solver_params,
+                                        sloppy,
+                                        compression);
+      // for solver_params.solution_type == TM_SOLUTION_M (the default)
+      // QPhiX applies M(mu)^dag internally for normal equation solves, no call to tmLQCD operator required
+    } else
+#endif
     if(solver_flag == BICGSTAB) {
       if(g_proc_id == 0) {printf("# Using BiCGstab!\n"); fflush(stdout);}
       mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2); 
       iter = bicgstab_complex(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, &Mtm_plus_sym_psi);
     }
+    else if(solver_flag == BICG) {
+      if(g_proc_id == 0) {printf("# Using BiCG!\n"); fflush(stdout);}
+      mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2);
+      iter = bicg_complex(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, &Mtm_plus_sym_psi, &Mtm_plus_sym_dagg_psi);
+    }
+
     else if(solver_flag == GMRES) {
       if(g_proc_id == 0) {printf("# Using GMRES! m = %d\n", gmres_m_parameter); fflush(stdout);}
       mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2);
@@ -146,9 +192,15 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
       mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2);
       iter = gcr(Odd_new, g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME/2, 0, &Mtm_plus_sym_psi);
     }
+    else if (solver_flag == MCR) {
+      if(g_proc_id == 0) {printf("# Using MCR! m = %d\n", gmres_m_parameter); fflush(stdout);}
+      gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
+      iter = mcr(Odd_new, g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME/2, 0, &Qtm_pm_psi);
+      Qtm_minus_psi(Odd_new, Odd_new);
+    }
     else if(solver_flag == GMRESDR) {
       if(g_proc_id == 0) {printf("# Using GMRES-DR! m = %d, NrEv = %d\n", 
-         gmres_m_parameter, gmresdr_nr_ev); fflush(stdout);}
+                                 gmres_m_parameter, gmresdr_nr_ev); fflush(stdout);}
       mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2);
       iter = gmres_dr(Odd_new, g_spinor_field[DUM_DERI], gmres_m_parameter, gmresdr_nr_ev, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME/2, &Mtm_plus_sym_psi);
     }
@@ -171,42 +223,50 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
       Qtm_minus_psi(Odd_new, Odd_new);
     }
     else if(solver_flag == INCREIGCG) {
-       /* Here we invert the hermitean operator squared */
-       gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);  
-       if(g_proc_id == 0) {printf("# Using Incremental Eig-CG!\n"); fflush(stdout);}
-       iter = incr_eigcg(VOLUME/2,solver_params.eigcg_nrhs,solver_params.eigcg_nrhs1, Odd_new, g_spinor_field[DUM_DERI], solver_params.eigcg_ldh, &Qtm_pm_psi,
- 			 solver_params.eigcg_tolsq1, solver_params.eigcg_tolsq, solver_params.eigcg_restolsq , solver_params.eigcg_rand_guess_opt,
-                         rel_prec, max_iter, solver_params.eigcg_nev, solver_params.eigcg_vmax);
-       Qtm_minus_psi(Odd_new, Odd_new);
-     }
+      /* Here we invert the hermitean operator squared */
+      gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);  
+      if(g_proc_id == 0) {printf("# Using Incremental Eig-CG!\n"); fflush(stdout);}
+      iter = incr_eigcg(VOLUME/2,solver_params.eigcg_nrhs,solver_params.eigcg_nrhs1, Odd_new, g_spinor_field[DUM_DERI], solver_params.eigcg_ldh, &Qtm_pm_psi,
+                        solver_params.eigcg_tolsq1, solver_params.eigcg_tolsq, solver_params.eigcg_restolsq , solver_params.eigcg_rand_guess_opt,
+                        rel_prec, max_iter, solver_params.eigcg_nev, solver_params.eigcg_vmax);
+      Qtm_minus_psi(Odd_new, Odd_new);
+    }
     else if(solver_flag == MIXEDCG) {
       /* Here we invert the hermitean operator squared */
       gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
       if(g_proc_id == 0) {printf("# Using Mixed Precision CG!\n"); fflush(stdout);}
-      iter = mixed_cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, 
-			  VOLUME/2, &Qtm_pm_psi);
+      iter = mixed_cg_her(Odd_new, g_spinor_field[DUM_DERI], solver_params, max_iter, precision, rel_prec, 
+                          VOLUME/2, &Qtm_pm_psi, &Qtm_pm_psi_32);
+      Qtm_minus_psi(Odd_new, Odd_new);
+    }
+    else if(solver_flag == RGMIXEDCG) {
+      /* Here we invert the hermitean operator squared */
+      gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
+      if(g_proc_id == 0) {printf("# Using Mixed Precision CG!\n"); fflush(stdout);}
+      iter = rg_mixed_cg_her(Odd_new, g_spinor_field[DUM_DERI], solver_params, max_iter, precision, rel_prec,
+                             VOLUME/2, &Qtm_pm_psi, &Qtm_pm_psi_32);
       Qtm_minus_psi(Odd_new, Odd_new);
     }
     else if(solver_flag == CG) {
       /* Here we invert the hermitean operator squared */
       gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
       if(g_proc_id == 0) {
-	printf("# Using CG!\n"); 
-	printf("# mu = %f, kappa = %f\n", g_mu/2./g_kappa, g_kappa);
-	fflush(stdout);
+        printf("# Using CG!\n"); 
+        printf("# mu = %.12f, kappa = %.12f\n", g_mu/2./g_kappa, g_kappa);
+        fflush(stdout);
       }
 #ifdef HAVE_GPU
       if(usegpu_flag){
-	if(g_proc_id == 0) printf("Using GPU for inversion\n");
-	iter = mixed_solve_eo(Odd_new, g_spinor_field[DUM_DERI], max_iter,   precision, rel_prec, VOLUME/2);
+        if(g_proc_id == 0) printf("Using GPU for inversion\n");
+        iter = mixed_solve_eo(Odd_new, g_spinor_field[DUM_DERI], max_iter,   precision, rel_prec, VOLUME/2);
       }
-      else{
+      else {
         iter = cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, &Qtm_pm_psi);
         Qtm_minus_psi(Odd_new, Odd_new);
       }
 #else        
       iter = cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, 
-		    VOLUME/2, &Qtm_pm_psi);
+                    VOLUME/2, &Qtm_pm_psi);
       Qtm_minus_psi(Odd_new, Odd_new);
 #endif /*HAVE_GPU*/
     }
@@ -224,7 +284,7 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
       gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2);
 #ifdef HAVE_GPU
       if(g_proc_id == 0) {printf("Using GPU for inversion\n");
-	fflush(stdout);}
+        fflush(stdout);}
       iter = mixed_solve_eo(Odd_new, g_spinor_field[DUM_DERI], max_iter,   precision, rel_prec, VOLUME/2);
 #else
       iter = cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, &Qtm_pm_psi);
@@ -317,6 +377,19 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
         iter = bicgstab_complex(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME, &D_psi);
       }
     }
+    else if(solver_flag == BICG) {
+      if(g_proc_id == 0) {printf("# Using BiCG!\n"); fflush(stdout);}
+      if(use_preconditioning==1 && g_precWS!=NULL){
+        //if(g_proc_id == 0) {printf("# Using preconditioning (which one?)!\n");}
+        //iter = bicg_complex(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME, &D_psi_prec, &D_dagg_psi_prec);
+        
+        if(g_proc_id == 0) {printf("# Not using preconditioning (which one?)!\n");}
+        iter = bicg_complex(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME, &D_psi, &D_dagg_psi);
+      } else {
+        if(g_proc_id == 0) {printf("# Not using preconditioning (which one?)!\n");}
+        iter = bicg_complex(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME, &D_psi, &D_dagg_psi);
+      }
+    }
     else if(solver_flag == CGS) {
       if(g_proc_id == 0) {printf("# Using CGS!\n"); fflush(stdout);}
 
@@ -341,40 +414,51 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
         iter = gmres(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &D_psi);
       }
     }
-    else if(solver_flag == FGMRES) {
+    else if(solver_flag == MIXEDCG) {
+      if(g_proc_id == 0) {printf("# Using MIXEDCG!\n"); fflush(stdout);}
+      gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME);
+      iter = mixed_cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], solver_params, max_iter, 
+                          precision, rel_prec, VOLUME, &Q_pm_psi, &Q_pm_psi_32);
+      Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);
+    } else if(solver_flag == RGMIXEDCG) {
+      if(g_proc_id == 0) {printf("# Using MIXEDCG!\n"); fflush(stdout);}
+      gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME);
+      iter = rg_mixed_cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], solver_params, max_iter, 
+                             precision, rel_prec, VOLUME, &Q_pm_psi, &Q_pm_psi_32);
+      Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);
+    } else if(solver_flag == FGMRES) {
       if(g_proc_id == 0) {printf("# Using FGMRES! m = %d\n", gmres_m_parameter); fflush(stdout);}
-      iter = fgmres(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &D_psi); 
-      /*       gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME); */
-      /*       iter = fgmres(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, &Q_pm_psi);  */
-      /*       Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); */
+      iter = fgmres(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, Msap_precon, &D_psi); 
     }
     else if(solver_flag == GCR) {
       if(g_proc_id == 0) {printf("# Using GCR! m = %d\n", gmres_m_parameter); fflush(stdout);}
-      iter = gcr(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &D_psi); 
-      /*       gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME); */
-      /*       iter = gcr(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, &Q_pm_psi); */
-      /*       Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); */
-    }
-    else if(solver_flag == DFLGCR || solver_flag == DFLFGMRES) {
-      if(g_proc_id == 0) {printf("# Using deflated solver! m = %d\n", gmres_m_parameter); fflush(stdout);}
-      /* apply P_L to source           */
-      project_left(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI]);
-      if(g_proc_id == 0) printf("# Applied P_L to source\n");
-      /* invert P_L D on source -> chi */
-      if(solver_flag == DFLGCR) {
-        iter = gcr(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], gmres_m_parameter, 
-                   max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &project_left_D);
-      }
-      else {
-        iter = fgmres(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], gmres_m_parameter, 
-                      max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &project_left_D);
-      }
-      /* apply P_R to chi              */
-      project_right(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI+1]);
-      if(g_proc_id == 0) printf("# Applied P_R to solution\n");
-      /* reconstruct solution          */
-      project(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);
-      add(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], VOLUME);
+      iter = gcr(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, Msap_precon, &D_psi); 
+    }
+    else if(solver_flag == MCR) {
+      if(g_proc_id == 0) {printf("# Using mCR! m = %d\n", gmres_m_parameter); fflush(stdout);}
+      /* iter = mcr(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 0, &D_psi); */
+      gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME);
+      iter = mcr(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 0, &Q_plus_psi);
+    }
+    else if(solver_flag == CR) {
+      if(g_proc_id == 0) {printf("# Using CR and iQ!\n"); fflush(stdout);}
+      gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME);
+      iter = cr(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 0, &Q_plus_psi);
+      /* Solve DdaggD */
+      /* gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME);
+         iter = cr(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 0, &Q_pm_psi);
+         Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);
+      */  
+    }
+    else if (solver_flag == DFLGCR) {
+      if(g_proc_id == 0) {printf("# Using deflated GCR solver! m = %d\n", gmres_m_parameter); fflush(stdout);}
+      iter = gcr(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, 
+                 max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 2, &D_psi);
+    }
+    else if (solver_flag == DFLFGMRES) {
+      if(g_proc_id == 0) {printf("# Using deflated FGMRES solver! m = %d\n", gmres_m_parameter); fflush(stdout);}
+      iter = fgmres(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, 
+                    max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 2, &D_psi);
     }
     else if (solver_flag == CGMMS) {
       /* FIXME temporary workaround for the multiple masses interface */
@@ -383,7 +467,7 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
       for(int i = 0; i < no_extra_masses; ++i)
         shifts[i+1] = extra_masses[i];
       g_mu = 0;
-      solver_pm_t solver_params;
+      solver_params_t solver_params;
       solver_params.shifts = shifts;
       solver_params.no_shifts = no_extra_masses+1;
       solver_params.rel_prec = rel_prec;
@@ -401,7 +485,7 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
       if(g_proc_id == 0) {printf("# Using multi mass CG!\n"); fflush(stdout);}
       
       gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME);
-      iter = cg_mms_tm(P, g_spinor_field[DUM_DERI+1],&solver_params,&cgmms_reached_prec);
+      iter = cg_mms_tm(P, g_spinor_field[DUM_DERI+1],&solver_params);
       g_mu = shifts[0];
       Q_minus_psi(g_spinor_field[DUM_DERI+1], P[0]);
       
@@ -411,56 +495,63 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
       free(P);
       free(shifts);
     }
+    else if(solver_flag == PCG) {
+      if(g_proc_id == 0) {printf("# Using PCG!\n"); fflush(stdout);}
+      gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME);
+      iter = pcg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, 
+                     rel_prec, VOLUME, &Q_pm_psi);
+      Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);
+    }
     else {
       if(g_proc_id == 0) {printf("# Using CG!\n"); fflush(stdout);}
 #ifdef HAVE_GPU 
       if(usegpu_flag){
-	if(g_proc_id == 0) printf("# Using GPU for inversion\n");
-	iter = mixed_solve(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME);
+        if(g_proc_id == 0) printf("# Using GPU for inversion\n");
+        iter = mixed_solve(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME);
       }
       else{
-	gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME);
-	iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, 
-		      rel_prec, VOLUME, &Q_pm_psi);
-	Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);
+        gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME);
+        iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, 
+                      rel_prec, VOLUME, &Q_pm_psi);
+        Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);
       }
 #else
       gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME);
 
       if(use_preconditioning==1 && g_precWS!=NULL){
-	spinorPrecWS *ws=(spinorPrecWS*)g_precWS;
-	static _Complex double alpha = 0.0;
-	if(g_proc_id==0) {printf("# Using preconditioning (which one?)!\n");}
+        spinorPrecWS *ws=(spinorPrecWS*)g_precWS;
+        static _Complex double alpha = 0.0;
+        if(g_proc_id==0) {printf("# Using preconditioning (which one?)!\n");}
 
-	if(g_prec_sequence_d_dagger_d[2] != 0.0){
-	  alpha = g_prec_sequence_d_dagger_d[2];
-	  spinorPrecondition(g_spinor_field[DUM_DERI+1],g_spinor_field[DUM_DERI+1],ws,T,L,alpha,0,1);
-	}
+        if(g_prec_sequence_d_dagger_d[2] != 0.0){
+          alpha = g_prec_sequence_d_dagger_d[2];
+          spinorPrecondition(g_spinor_field[DUM_DERI+1],g_spinor_field[DUM_DERI+1],ws,T,L,alpha,0,1);
+        }
 
-	iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, 
-		    rel_prec, VOLUME, &Q_pm_psi_prec);
+        iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, 
+                      rel_prec, VOLUME, &Q_pm_psi_prec);
 
-	if(g_prec_sequence_d_dagger_d[0] != 0.0){
-	  alpha = g_prec_sequence_d_dagger_d[0];
-	  spinorPrecondition(g_spinor_field[DUM_DERI],g_spinor_field[DUM_DERI],ws,T,L,alpha,0,1);
-	}
+        if(g_prec_sequence_d_dagger_d[0] != 0.0){
+          alpha = g_prec_sequence_d_dagger_d[0];
+          spinorPrecondition(g_spinor_field[DUM_DERI],g_spinor_field[DUM_DERI],ws,T,L,alpha,0,1);
+        }
 
       } else {
-	if(g_proc_id==0) {printf("# Not using preconditioning!\n");}
-	iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, 
-		      rel_prec, VOLUME, &Q_pm_psi);
+        if(g_proc_id==0) {printf("# Not using preconditioning!\n");}
+        iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, 
+                      rel_prec, VOLUME, &Q_pm_psi);
       }
 
 
       Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);
 
       if(use_preconditioning==1 && g_precWS!=NULL){
-	spinorPrecWS *ws=(spinorPrecWS*)g_precWS;
-	static _Complex double alpha = 0.0;
-	if(g_prec_sequence_d_dagger_d[1] != 0.0){
-	  alpha = g_prec_sequence_d_dagger_d[1];
-	  spinorPrecondition(g_spinor_field[DUM_DERI+1],g_spinor_field[DUM_DERI+1],ws,T,L,alpha,0,1);
-	}
+        spinorPrecWS *ws=(spinorPrecWS*)g_precWS;
+        static _Complex double alpha = 0.0;
+        if(g_prec_sequence_d_dagger_d[1] != 0.0){
+          alpha = g_prec_sequence_d_dagger_d[1];
+          spinorPrecondition(g_spinor_field[DUM_DERI+1],g_spinor_field[DUM_DERI+1],ws,T,L,alpha,0,1);
+        }
       }
 #endif
     }
@@ -470,8 +561,8 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
 }
 
 /* FIXME temporary solution for the writing of CGMMS propagators until the input/output interface for
- invert_eo has been generalized
- NOTE that no_shifts = no_extra_masses+1 */
+   invert_eo has been generalized
+   NOTE that no_shifts = no_extra_masses+1 */
 static void cgmms_write_props(spinor ** const P, double const * const shifts, const int no_shifts, const int id, const int iteration) {
   int append = 0;
   char filename[300];
@@ -486,14 +577,15 @@ static void cgmms_write_props(spinor ** const P, double const * const shifts, co
 
   /* save all the results of (Q^dagger Q)^(-1) \gamma_5 \phi */
   for(int im = 0; im < no_shifts; im++) {
-    if(SourceInfo.type != 1) {
+    if(SourceInfo.type != SRC_TYPE_VOL) {
       if (PropInfo.splitted) {
-        sprintf(filename, "%s.%.2d.%.4d.%.2d.%.2d.cgmms.%.2d.inverted", SourceInfo.basename, id, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, im);
+        if(T_global > 99) sprintf(filename, "%s.%.2d.%.4d.%.3d.%.2d.cgmms.%.2d.inverted", PropInfo.basename, id, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, im);
+        else sprintf(filename, "%s.%.2d.%.4d.%.2d.%.2d.cgmms.%.2d.inverted", PropInfo.basename, id, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, im);
       } else {
-        sprintf(filename, "%s.%.2d.%.4d.%.2d.cgmms.%.2d.inverted", SourceInfo.basename, id, SourceInfo.nstore, SourceInfo.t, im);
+        sprintf(filename, "%s.%.2d.%.4d.%.2d.cgmms.%.2d.inverted", PropInfo.basename, id, SourceInfo.nstore, SourceInfo.t, im);
       }
     } else {
-      sprintf(filename, "%s.%.2d.%.4d.%.5d.cgmms.%.2d.0", SourceInfo.basename, id, SourceInfo.nstore, SourceInfo.sample, im);
+      sprintf(filename, "%s.%.2d.%.4d.%.5d.cgmms.%.2d.0", PropInfo.basename, id, SourceInfo.nstore, SourceInfo.sample, im);
     }
     
     if(g_kappa != 0) {
diff --git a/invert_eo.h b/invert_eo.h
index c8f8a87e4..522f4dd47 100644
--- a/invert_eo.h
+++ b/invert_eo.h
@@ -26,13 +26,16 @@
 
 #ifndef _INVERT_EO_H
 #define _INVERT_EO_H
+#include "global.h"
+#include "misc_types.h"
 #include "solver/solver_params.h"
+
 int invert_eo(spinor * const Even_new, spinor * const Odd_new, 
-	      spinor * const Even, spinor * const Odd,
-	      const double precision, const int iter_max,
-	      const int solver_flag, const int rel_prec,
-	      const int sub_evs_flag, const int even_odd_flag,
-        const int no_extra_masses, double * const extra_masses, solver_params_t solver_params, 
-        const int id );
+              spinor * const Even, spinor * const Odd,
+              const double precision, const int iter_max,
+              const int solver_flag, const int rel_prec,
+              const int sub_evs_flag, const int even_odd_flag,
+              const int no_extra_masses, double * const extra_masses, solver_params_t solver_params, const int id,
+              const ExternalInverter external_inverter, const SloppyPrecision sloppy, const CompressionType compression );
 
 #endif
diff --git a/invert_overlap.c b/invert_overlap.c
index c5e308f8b..35c01af62 100644
--- a/invert_overlap.c
+++ b/invert_overlap.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include "global.h"
diff --git a/io/DML_crc32.c b/io/DML_crc32.c
index b576e4cf1..1b04e3174 100644
--- a/io/DML_crc32.c
+++ b/io/DML_crc32.c
@@ -41,7 +41,7 @@
 */
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include"dml.h"
diff --git a/io/Makefile.in b/io/Makefile.in
index 486b5c8d3..568b252ca 100644
--- a/io/Makefile.in
+++ b/io/Makefile.in
@@ -73,7 +73,8 @@ libio_TARGETS = utils_engineering \
 		eospinor_write \
 		eospinor_read \
 		io_cm \
-		deri_write_stdout spinor_write_stdout sw_write_stdout
+		deri_write_stdout spinor_write_stdout sw_write_stdout \
+		gauge_write_luscher_binary
 
 libio_OBJECTS = $(addsuffix .o, ${libio_TARGETS})
 
@@ -99,7 +100,7 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) -c $<
 
 
diff --git a/io/deri_write_stdout.c b/io/deri_write_stdout.c
index 89fa970d3..cf281d709 100644
--- a/io/deri_write_stdout.c
+++ b/io/deri_write_stdout.c
@@ -18,12 +18,12 @@
 ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include "global.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "su3adj.h"
@@ -45,7 +45,7 @@ void deri_write_stdout(su3adj** const df) {
 	for(int z = 0; z < g_nproc_z*LZ; z++) {
 	  Z = z - g_proc_coords[3]*LZ;
 	  coords[3] = z / LZ;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	  MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	  if(g_cart_id == id) {
@@ -66,7 +66,7 @@ void deri_write_stdout(su3adj** const df) {
 	      fflush(stdout);
 	    }
 	  }
-#ifdef MPI
+#ifdef TM_USE_MPI
 	  MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	}
diff --git a/io/dml.c b/io/dml.c
index 34c2a5182..567a569fa 100644
--- a/io/dml.c
+++ b/io/dml.c
@@ -4,9 +4,9 @@
 */
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -26,7 +26,7 @@ void DML_checksum_init(DML_Checksum *checksum){
 }
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 int DML_global_xor(uint32_t *x) {
   unsigned long work = (unsigned long)*x;
   unsigned long dest;
diff --git a/io/eospinor.ih b/io/eospinor.ih
index 8e9bd1ccb..1d41d56c6 100644
--- a/io/eospinor.ih
+++ b/io/eospinor.ih
@@ -19,7 +19,7 @@
 
 #include <lime.h>
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
@@ -28,7 +28,7 @@
 #include <time.h>
 #include <sys/time.h>
 #include <sys/types.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include <unistd.h>
diff --git a/io/eospinor_read.c b/io/eospinor_read.c
index ab82fb9c2..4065f6239 100644
--- a/io/eospinor_read.c
+++ b/io/eospinor_read.c
@@ -25,7 +25,7 @@ int read_eospinor(spinor * const s, char * filename) {
   n_uint64_t bytes;
   char * header_type;
   LimeReader * limereader;
-#ifdef MPI
+#ifdef TM_USE_MPI
   int position;
 #endif
   spinor tmp[1];
@@ -73,7 +73,7 @@ int read_eospinor(spinor * const s, char * filename) {
   for(x = 0; x < LX; x++) {
     for(y = 0; y < LY; y++) {
       for(z = 0; z < LZ; z++) {
-#if (defined MPI)
+#if (defined TM_USE_MPI)
 	limeReaderSeek(limereader, (n_uint64_t)
 		       (g_proc_coords[0]*T+
 			(((g_proc_coords[1]*LX+x)*g_nproc_y*LY+g_proc_coords[2]*LY+y)*g_nproc_z*LZ
@@ -90,7 +90,7 @@ int read_eospinor(spinor * const s, char * filename) {
 	    be_to_cpu_assign(s + i, tmp, sizeof(spinor)/8);
 	    if(status < 0 && status != LIME_EOR) {
 	      fprintf(stderr, "LIME read error occured with status = %d while reading file %s!\n Aborting...\n", status, filename);
-#ifdef MPI
+#ifdef TM_USE_MPI
 	      MPI_Abort(MPI_COMM_WORLD, 1);
 	      MPI_Finalize();
 #endif
diff --git a/io/eospinor_write.c b/io/eospinor_write.c
index dcbe21edd..a609c1382 100644
--- a/io/eospinor_write.c
+++ b/io/eospinor_write.c
@@ -40,17 +40,17 @@ int write_eospinor(spinor * const s, char * filename,
   int coords[4];
   char message[500];
   n_uint64_t bytes;
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Status mpistatus;
 #endif
 
   if(g_cart_id == 0){  
     if(g_kappa > 0. || g_kappa < 0.) {
-      sprintf(message,"\n eigenvalue = %e\n prec = %e\n conf nr = %d\n beta = %f, kappa = %f, mu = %f, c2_rec = %f\n hmcversion = %s", 
+      sprintf(message,"\n eigenvalue = %e\n prec = %e\n conf nr = %d\n beta = %.12f, kappa = %.12f, mu = %.12f, c2_rec = %f\n hmcversion = %s", 
 	      evalue, prec, nstore, g_beta, g_kappa, g_mu/2./g_kappa, g_rgi_C1, PACKAGE_VERSION);
     }
     else {
-      sprintf(message,"\n eigenvalue = %e\n prec = %e\n conf nr = %d\n beta = %f, kappa = %f, 2*kappa*mu = %f, c2_rec = %f\n hmcversion = %s", 
+      sprintf(message,"\n eigenvalue = %e\n prec = %e\n conf nr = %d\n beta = %.12f, kappa = %.12f, 2*kappa*mu = %.12f, c2_rec = %f\n hmcversion = %s", 
 	      evalue, prec, nstore, g_beta, g_kappa, g_mu, g_rgi_C1, PACKAGE_VERSION);
     }
     bytes = strlen( message );
@@ -62,7 +62,7 @@ int write_eospinor(spinor * const s, char * filename,
     limewriter = limeCreateWriter( ofs );
     if(limewriter == (LimeWriter*)NULL) {
       fprintf(stderr, "LIME error in file %s for writing!\n Aboring...\n", filename);
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Abort(MPI_COMM_WORLD, 1);
       MPI_Finalize();
 #endif
@@ -73,7 +73,7 @@ int write_eospinor(spinor * const s, char * filename,
     status = limeWriteRecordHeader( limeheader, limewriter);
     if(status < 0 ) {
       fprintf(stderr, "LIME write header (xlf-info) error %d\n", status);
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Abort(MPI_COMM_WORLD, 1);
       MPI_Finalize();
 #endif
@@ -88,7 +88,7 @@ int write_eospinor(spinor * const s, char * filename,
     status = limeWriteRecordHeader( limeheader, limewriter);
     if(status < 0 ) {
       fprintf(stderr, "LIME write header (eospinor-binary-data) error %d\n", status);
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Abort(MPI_COMM_WORLD, 1);
       MPI_Finalize();
 #endif
@@ -110,7 +110,7 @@ int write_eospinor(spinor * const s, char * filename,
 	for(t0 = 0; t0 < T*g_nproc_t; t0++){
 	  t = t0 - T*g_proc_coords[0];
 	  coords[0] = t0 / T;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	  MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	  i = g_lexic2eosub[ g_ipt[t][X][Y][Z] ];
@@ -121,7 +121,7 @@ int write_eospinor(spinor * const s, char * filename,
 		be_to_cpu_assign(tmp, s + i , sizeof(spinor)/8);
 		status = limeWriteRecordData((void*)tmp, &bytes, limewriter);
 	      }
-#ifdef MPI
+#ifdef TM_USE_MPI
 	      else {
 		MPI_Recv(tmp, sizeof(spinor)/8, MPI_DOUBLE, id, tag, g_cart_grid, &mpistatus);
 		status = limeWriteRecordData((void*)tmp, &bytes, limewriter);
@@ -129,14 +129,14 @@ int write_eospinor(spinor * const s, char * filename,
 #endif
 	      if(status < 0 ) {
 		fprintf(stderr, "LIME write error %d\n", status);
-#ifdef MPI
+#ifdef TM_USE_MPI
 		MPI_Abort(MPI_COMM_WORLD, 1);
 		MPI_Finalize();
 #endif
 		exit(500);
 	      }
 	    }
-#ifdef MPI
+#ifdef TM_USE_MPI
 	    else {
 	      if(g_cart_id == id) {
 		be_to_cpu_assign(tmp, s + i, sizeof(spinor)/8);
@@ -147,7 +147,7 @@ int write_eospinor(spinor * const s, char * filename,
 	    tag++;
 	  }
 	}
-#ifdef MPI
+#ifdef TM_USE_MPI
  	MPI_Barrier(g_cart_grid); 
 #endif
 	tag=0;
diff --git a/io/gauge.h b/io/gauge.h
index 89206cbe8..e73b00ac6 100644
--- a/io/gauge.h
+++ b/io/gauge.h
@@ -21,7 +21,7 @@
 #define _GAUGE_H
 
 #if HAVE_CONFIG_H
-#include<config.h>
+#include<tmlqcd_config.h>
 #endif
 
 #include <io/selector.h>
diff --git a/io/gauge.ih b/io/gauge.ih
index 6d4b80168..44e76718a 100644
--- a/io/gauge.ih
+++ b/io/gauge.ih
@@ -19,7 +19,7 @@
 
 #include <lime.h>
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
@@ -28,7 +28,7 @@
 #include <time.h>
 #include <sys/time.h>
 #include <sys/types.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include <unistd.h>
@@ -36,6 +36,7 @@
 #include <errno.h>
 #include "global.h"
 #include "su3.h"
+#include "gettime.h"
 
 #include <io/utils.h>
 #include <io/gauge.h>
diff --git a/io/gauge_read.c b/io/gauge_read.c
index c8d344fcf..ca5ba1b47 100644
--- a/io/gauge_read.c
+++ b/io/gauge_read.c
@@ -19,6 +19,9 @@
  ***********************************************************************/
 
 #include "gauge.ih"
+#ifdef DDalphaAMG
+# include "DDalphaAMG_interface.h"
+#endif
 
 extern int gauge_precision_read_flag;
 paramsGaugeInfo GaugeInfo = { 0., 0, {0,0}, NULL, NULL};
@@ -182,5 +185,10 @@ int read_gauge_field(char * filename, su3 ** const gf) {
 
   g_update_gauge_copy = 1;
 
+#ifdef DDalphaAMG
+  if(gf==g_gauge_field)
+    MG_reset();
+#endif
+
   return(0);
 }
diff --git a/io/gauge_read_binary.c b/io/gauge_read_binary.c
index ebb8713b5..fbc708d92 100644
--- a/io/gauge_read_binary.c
+++ b/io/gauge_read_binary.c
@@ -61,14 +61,18 @@ int read_binary_gauge_data(LemonReader * lemonreader, DML_Checksum * checksum, p
   }
 
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
     tick = MPI_Wtime();
+#endif
   }
 
   status = lemonReadLatticeParallelMapped(lemonreader, filebuffer, bytes, latticeSize, scidacMapping);
 
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
+#endif
     tock = MPI_Wtime();
   }
 
@@ -129,19 +133,17 @@ int read_binary_gauge_data(LimeReader * limereader, DML_Checksum * checksum, par
   n_uint64_t bytes;
   su3 tmp[4];
   float tmp2[72];
-#ifdef MPI
   double tick = 0, tock = 0;
-#endif
   char measure[64];
   DML_SiteRank rank;
   DML_checksum_init(checksum);
 
-#ifdef MPI
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
-    tick = MPI_Wtime();
-  }
 #endif
+    tick = gettime();
+  }
 
   bytes = limeReaderBytes(limereader); /* datalength of ildg-binary-data record in bytes */
   if (bytes != (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * 4 * (n_uint64_t)sizeof(su3) / (input->prec==64 ? 1 : 2)) {
@@ -157,7 +159,7 @@ int read_binary_gauge_data(LimeReader * limereader, DML_Checksum * checksum, par
   for(t = 0; t < T; t++) {
     for(z = 0; z < LZ; z++) {
       for(y = 0; y < LY; y++) {
-#ifdef MPI
+#ifdef TM_USE_MPI
         limeReaderSeek(limereader,(n_uint64_t)
                        (((n_uint64_t) g_proc_coords[1]*LX) +
                         ((n_uint64_t) (((g_proc_coords[0]*T+t)*g_nproc_z*LZ+g_proc_coords[3]*LZ+z)*g_nproc_y*LY
@@ -178,7 +180,7 @@ int read_binary_gauge_data(LimeReader * limereader, DML_Checksum * checksum, par
           }
           if(status < 0 && status != LIME_EOR) {
             fprintf(stderr, "LIME read error occurred with status = %d while reading in gauge_read_binary.c!\n", status);
-#ifdef MPI
+#ifdef TM_USE_MPI
               MPI_Abort(MPI_COMM_WORLD, 1);
               MPI_Finalize();
 #endif
@@ -201,10 +203,11 @@ int read_binary_gauge_data(LimeReader * limereader, DML_Checksum * checksum, par
     }
   }
 
-#ifdef MPI
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
-    tock = MPI_Wtime();
+#endif
+    tock = gettime();
 
     if (g_cart_id == 0) {
       engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b");
@@ -217,7 +220,7 @@ int read_binary_gauge_data(LimeReader * limereader, DML_Checksum * checksum, par
       fprintf(stdout, " (%s per MPI process).\n", measure);
     }
   }
-
+#ifdef TM_USE_MPI
   DML_checksum_combine(checksum);
 #endif
   return(0);
diff --git a/io/gauge_write.c b/io/gauge_write.c
index f911770bf..3c31e3f2a 100644
--- a/io/gauge_write.c
+++ b/io/gauge_write.c
@@ -49,7 +49,7 @@ int write_gauge_field(char * filename, const int prec, paramsXlfInfo const *xlfI
     fprintf(stdout, "#   Calculated            : A = %#010x B = %#010x.\n", checksum.suma, checksum.sumb);
     fflush(stdout);
   }
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Barrier(MPI_COMM_WORLD);
 #endif /* MPI */
 
diff --git a/io/gauge_write_binary.c b/io/gauge_write_binary.c
index 3a7941313..8b9058533 100644
--- a/io/gauge_write_binary.c
+++ b/io/gauge_write_binary.c
@@ -45,11 +45,12 @@ int write_binary_gauge_data(LemonWriter * lemonwriter, const int prec, DML_Check
     errno = 0;
     return 1;
   }
-
+#ifdef TM_USE_MPI
   if (g_debug_level > 0) {
     MPI_Barrier(g_cart_grid);
     tick = MPI_Wtime();
   }
+#endif
 
   tG = g_proc_coords[0]*T;
   zG = g_proc_coords[3]*LZ;
@@ -85,7 +86,9 @@ int write_binary_gauge_data(LemonWriter * lemonwriter, const int prec, DML_Check
   }
 
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
+#endif
     tock = MPI_Wtime();
 
     if (g_cart_id == 0) {
@@ -131,20 +134,21 @@ int write_binary_gauge_data(LimeWriter * limewriter, const int prec, DML_Checksu
   int coords[4];
   n_uint64_t bytes;
   DML_SiteRank rank;
-#ifdef MPI
   double tick = 0, tock = 0;
   char measure[64];
+#ifdef TM_USE_MPI
   MPI_Status mpi_status;
 #endif
 
   DML_checksum_init(checksum);
 
-#ifdef MPI
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
-    tick = MPI_Wtime();
-  }
 #endif
+    tick = gettime();
+  }
+
   if(prec == 32) bytes = (n_uint64_t)2*sizeof(su3);
   else bytes = (n_uint64_t)4*sizeof(su3);
   for(t0 = 0; t0 < T*g_nproc_t; t0++) {
@@ -160,7 +164,7 @@ int write_binary_gauge_data(LimeWriter * limewriter, const int prec, DML_Checksu
         for(x = 0; x < LX*g_nproc_x; x++) {
           X = x - g_proc_coords[1]*LX;
           coords[1] = x / LX;
-#ifdef MPI
+#ifdef TM_USE_MPI
           MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
           if(g_cart_id == 0) {
@@ -183,7 +187,7 @@ int write_binary_gauge_data(LimeWriter * limewriter, const int prec, DML_Checksu
                 status = limeWriteRecordData((void*)&tmp, &bytes, limewriter);
               }
             }
-#ifdef MPI
+#ifdef TM_USE_MPI
             else {
               if(prec == 32) {
                 MPI_Recv(tmp2, 4*sizeof(su3)/8, MPI_FLOAT, id, tag, g_cart_grid, &mpi_status);
@@ -201,14 +205,14 @@ int write_binary_gauge_data(LimeWriter * limewriter, const int prec, DML_Checksu
               fprintf(stderr, "LIME write error occurred with status = %d, while writing in gauge_write_binary.c!\n", status);
               fprintf(stderr, "x %d, y %d, z %d, t %d (%d,%d,%d,%d)\n",x,y,z,tt,X,Y,Z,tt);
               fprintf(stderr, "id = %d, bytes = %lu, size = %d\n", g_cart_id, bytes,  (int)(4*sizeof(su3)/8));
-#ifdef MPI
+#ifdef TM_USE_MPI
               MPI_Abort(MPI_COMM_WORLD, 1);
               MPI_Finalize();
 #endif
               exit(500);
             }
           }
-#ifdef MPI
+#ifdef TM_USE_MPI
           else {
             if(g_cart_id == id){
               memcpy(&tmp3[0], &g_gauge_field[ g_ipt[tt][X][Y][Z] ][1], sizeof(su3));
@@ -228,17 +232,19 @@ int write_binary_gauge_data(LimeWriter * limewriter, const int prec, DML_Checksu
 #endif
           tag++;
         }
-#ifdef MPI
+#ifdef TM_USE_MPI
         MPI_Barrier(g_cart_grid);
 #endif
       }
     }
   }
 
-#ifdef MPI
+
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
-    tock = MPI_Wtime();
+#endif
+    tock = gettime();
 
     if (g_cart_id == 0) {
       engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b");
@@ -251,7 +257,6 @@ int write_binary_gauge_data(LimeWriter * limewriter, const int prec, DML_Checksu
       fprintf(stdout, " (%s per MPI process).\n", measure);
     }
   }
-#endif
 
   return(0);
 }
diff --git a/io/gauge_write_luscher_binary.c b/io/gauge_write_luscher_binary.c
new file mode 100644
index 000000000..ef36e5cdb
--- /dev/null
+++ b/io/gauge_write_luscher_binary.c
@@ -0,0 +1,84 @@
+/***********************************************************************
+* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+*
+* This file is part of tmLQCD.
+*
+* tmLQCD is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+*
+* tmLQCD is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+
+#include "gauge.ih"
+#include <stdio.h>
+
+/* Serial writer for a gauge field in the Luescher DD-HMC binary layout.
+ *
+ * Written in order: T, L, L, L as int and plaq as double, then for every odd
+ * site (t+x+y+z odd; z runs fastest, t slowest) and each direction mu the
+ * forward link gf[ix][mu] followed by the backward link gf[g_idn[ix][mu]][mu],
+ * 18 doubles per link, all as raw in-memory values.
+ * NOTE(review): the header stores L three times while the loops run to LX, LY
+ * and LZ -- confirm these always agree in builds without MPI.
+ *
+ * Not parallelised: with TM_USE_MPI nothing is written and 0 is returned.
+ * Otherwise returns 0 on success and 1 if the file cannot be opened or any
+ * write or the final close fails.
+ */
+int write_luscher_gauge_binary(const double plaq, char* filename, su3 ** const gf) {
+#ifdef TM_USE_MPI
+  fprintf(stdout, "Luescher DD-HMC format for gauges not implemented for MPI! Not writing anything!\n");
+  return(0);
+#else
+  FILE * ofs;
+  int ix;
+  int failed = 0;
+
+  /* "wb": the payload is raw binary and must not pass through text-mode translation */
+  ofs = fopen(filename, "wb");
+  if(ofs == (FILE *)NULL) {
+    fprintf(stderr, "Could not open file %s for writing!\n", filename);
+    return(1);
+  }
+
+  failed |= (fwrite(&T, sizeof(int), 1, ofs) != 1);
+  failed |= (fwrite(&L, sizeof(int), 1, ofs) != 1);
+  failed |= (fwrite(&L, sizeof(int), 1, ofs) != 1);
+  failed |= (fwrite(&L, sizeof(int), 1, ofs) != 1);
+  failed |= (fwrite(&plaq, sizeof(double), 1, ofs) != 1);
+
+  for(int t = 0; t < T; t++) {
+    for(int x = 0; x < LX; x++) {
+      for(int y = 0; y < LY; y++) {
+        for(int z = 0; z < LZ; z++) {
+          /* only odd sites are stored */
+          if((t + x + y + z)%2 != 1) continue;
+          ix = g_ipt[t][x][y][z];
+          for(int mu = 0; mu < 4; mu++) {
+            /* forward direction */
+            failed |= (fwrite(&gf[ix][mu], sizeof(double), 18, ofs) != 18);
+            /* backward direction */
+            failed |= (fwrite(&gf[ g_idn[ix][mu] ][mu], sizeof(double), 18, ofs) != 18);
+          }
+        }
+      }
+    }
+  }
+
+  /* fclose flushes buffered data, so its result is part of the write status */
+  if(fclose(ofs) != 0) failed = 1;
+  if(failed) {
+    fprintf(stderr, "Error while writing gauge field to file %s!\n", filename);
+    return(1);
+  }
+  return(0);
+#endif
+}
diff --git a/io/gauge_write_luscher_binary.h b/io/gauge_write_luscher_binary.h
new file mode 100644
index 000000000..3b47fefc0
--- /dev/null
+++ b/io/gauge_write_luscher_binary.h
@@ -0,0 +1,30 @@
+/***********************************************************************
+* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+*
+* This file is part of tmLQCD.
+*
+* tmLQCD is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+*
+* tmLQCD is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+#ifndef _GAUGE_WRITE_LUSCHER_BINARY_H
+#define _GAUGE_WRITE_LUSCHER_BINARY_H
+
+#include "gauge.ih"
+#include <stdio.h>
+#include <unistd.h>
+
+// not parallelised: built with TM_USE_MPI this only prints a notice and writes nothing
+
+int write_luscher_gauge_binary(const double plaq, char* filename, su3 ** const gf);
+
+#endif
diff --git a/io/io_cm.c b/io/io_cm.c
index 277845d5d..6e397b3f2 100644
--- a/io/io_cm.c
+++ b/io/io_cm.c
@@ -23,7 +23,7 @@ int read_spinorfield_cm_single(spinor * const s, spinor * const r, char * filena
   for(x = 0; x < LX; x++) {
     for(y = 0; y < LY; y++) {
       for(z = 0; z < LZ; z++) {
-#if (defined MPI)
+#if (defined TM_USE_MPI)
 	fseek(ifs,
 	      (g_proc_coords[0]*T+
 	       (((g_proc_coords[1]*LX+x)*g_nproc_y*LY+g_proc_coords[2]*LY+y)*g_nproc_z*LZ
@@ -98,7 +98,7 @@ int read_spinorfield_cm_swap_single(spinor * const s, spinor * const r, char * f
   ifs = fopen(filename, "r");
   if(ifs == (FILE *)NULL) {
     fprintf(stderr, "Could not open file %s\n Aborting...\n", filename);
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Abort(MPI_COMM_WORLD, 1);
     MPI_Finalize();
 #endif
@@ -108,7 +108,7 @@ int read_spinorfield_cm_swap_single(spinor * const s, spinor * const r, char * f
   for(x = 0; x < LX; x++) {
     for(y = 0; y < LY; y++) {
       for(z = 0; z < LZ; z++) {
-#if (defined MPI)
+#if (defined TM_USE_MPI)
         fseek(ifs,
               (g_proc_coords[0]*T+
                (((g_proc_coords[1]*LX+x)*g_nproc_y*LY+g_proc_coords[2]*LY+y)*g_nproc_z*LZ
@@ -158,7 +158,7 @@ int write_spinorfield_cm_single(spinor * const s, spinor * const r, char * filen
   spinor * p = NULL;
   float tmp[24];
   int coords[4];
-#ifdef MPI
+#ifdef TM_USE_MPI
   int  tag = 0;
   MPI_Status status;
 #endif
@@ -180,7 +180,7 @@ int write_spinorfield_cm_single(spinor * const s, spinor * const r, char * filen
 	for(t0 = 0; t0 < T*g_nproc_t; t0++) {
 	  t = t0 - T*g_proc_coords[0];
 	  coords[0] = t0 / T;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	  MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	  if(g_cart_id == id) {
@@ -197,7 +197,7 @@ int write_spinorfield_cm_single(spinor * const s, spinor * const r, char * filen
 	    if(g_cart_id == id) {
 	      double2single_cm(tmp, p + i);
 	    }
-#ifdef MPI
+#ifdef TM_USE_MPI
 	    else {
 	      MPI_Recv(tmp, sizeof(spinor)/8, MPI_FLOAT, id, tag, g_cart_grid, &status);
 	    }
@@ -205,7 +205,7 @@ int write_spinorfield_cm_single(spinor * const s, spinor * const r, char * filen
 	    fwrite(tmp, sizeof(float), 24, ofs);
 	    //	    printf("%e,%e\n",tmp[0],tmp[5]);fflush(stdout);
 	  }
-#ifdef MPI
+#ifdef TM_USE_MPI
 	  else {
 	    if(g_cart_id == id) {
 	      double2single_cm(tmp, p + i);
@@ -215,7 +215,7 @@ int write_spinorfield_cm_single(spinor * const s, spinor * const r, char * filen
 	  tag++;
 #endif
 	}
-#ifdef MPI
+#ifdef TM_USE_MPI
 	MPI_Barrier(g_cart_grid); 
 	tag=0;
 #endif
diff --git a/io/io_cm.h b/io/io_cm.h
index 8cd9cbc4d..d1c1f82be 100644
--- a/io/io_cm.h
+++ b/io/io_cm.h
@@ -2,7 +2,7 @@
 #define _IO_CM_H
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
@@ -11,7 +11,7 @@
 #include <time.h>
 #include <sys/time.h>
 #include <sys/types.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include <unistd.h>
diff --git a/io/params.h b/io/params.h
index c910f11d0..6ea177e95 100644
--- a/io/params.h
+++ b/io/params.h
@@ -134,6 +134,14 @@ typedef struct {
   char * basename;
 } paramsSourceInfo;
 
+typedef enum SRC_TYPE { /* NOTE(review): names suggest point, volume and time-slice sources -- confirm with the code that consumes them */
+  SRC_TYPE_POINT = 0, /* only explicit value; the rest count up from it */
+  SRC_TYPE_VOL, /* 1 */
+  SRC_TYPE_TS, /* 2 */
+  SRC_TYPE_PION_TS, /* 3 */
+  SRC_TYPE_GEN_PION_TS /* 4 */
+} SRC_TYPE;
+
 /* defined in gauge_read.c */
 extern paramsGaugeInfo GaugeInfo;
 /* defined in spinor_read.c */
diff --git a/io/params.ih b/io/params.ih
index 027010c28..99e7d934d 100644
--- a/io/params.ih
+++ b/io/params.ih
@@ -1,6 +1,6 @@
 #include <lime.h>
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <string.h>
diff --git a/io/scalar.h b/io/scalar.h
index cf44602d9..09eb7c1eb 100644
--- a/io/scalar.h
+++ b/io/scalar.h
@@ -16,7 +16,7 @@
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
-
+#ifdef TM_USE_BSM
 #ifndef _SCALARIO_H
 #define _SCALARIO_H
 
@@ -25,6 +25,8 @@
 
 int read_scalar_field(char * filename, scalar ** const sf);
 int read_scalar_field_parallel(char * filename, scalar ** const sf);
-void smear_scalar_fields_correlator( scalar ** const sf, scalar ** smearedfield );
-
+void smear_scalar_fields_correlator( scalar ** smearedfield, scalar ** const sf, int timeaverage ); /* parameter order as defined in scalar_read.c: smearedfield first, then sf */
+void smear_scalar_fields( scalar ** smearedfield, scalar ** const sf ); /* parameter order as defined in scalar_read.c: smearedfield first, then sf */
+int unit_scalar_field( scalar **sf);
+#endif
 #endif
diff --git a/io/scalar_read.c b/io/scalar_read.c
index 6e1083bad..2b633add8 100644
--- a/io/scalar_read.c
+++ b/io/scalar_read.c
@@ -17,11 +17,16 @@
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
-
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
 #include <errno.h>
 #include "global.h"
 #include "scalar.h"
+
+#if defined TM_USE_MPI
 #include "buffers/utils_nonblocking.h"
+#endif
 
 extern int scalar_precision_read_flag;
 // TODO consider that input scalar field could be in single prec.
@@ -81,12 +86,12 @@ int read_scalar_field_parallel( char * filename, scalar ** const sf){
           if ( nread != count ) { printf("Error in reading the scalar fields, exiting ...\n"); exit(1); }
   
       }
-      MPI_Barrier(MPI_COMM_WORLD);
-  
-      MPI_Bcast(buffer, count,  scalar_precision_read_flag==64 ? MPI_DOUBLE : MPI_FLOAT ,0, MPI_COMM_WORLD );
-
+#if defined TM_USE_MPI
+      MPI_Barrier(g_cart_grid);
+      MPI_Bcast(buffer, count,  scalar_precision_read_flag==64 ? MPI_DOUBLE : MPI_FLOAT ,0, g_cart_grid );
+#endif
       int ix, j;
-      for (ix=0; ix< VOLUME; ++ix)
+      for (ix=0; ix< VOLUME; ++ix){
          if ( g_coord[ix][0] == t ){
             int ind = LY*N_PROC_Y*LZ*N_PROC_Z*g_coord[ix][1] + LZ*N_PROC_Z*g_coord[ix][2] + g_coord[ix][3];
             for (j=0; j<4; ++j){
@@ -97,14 +102,26 @@ int read_scalar_field_parallel( char * filename, scalar ** const sf){
 
             }
          }
- //     if (g_proc_id == 1) printf("Buffer coordinate %e\n",((double*)buffer)[0]);
-      MPI_Barrier(MPI_COMM_WORLD);
+      }
+#if defined TM_USE_MPI
+      MPI_Barrier(g_cart_grid);
+#endif
            
   }
   free(buffer);
   return(0);
 }
-void smear_scalar_fields( scalar ** const sf, scalar ** smearedfield ) {
+int unit_scalar_field( scalar **sf){ /* fills sf[0..3][0..VOLUME-1]: component 0 becomes 1, components 1-3 become 0 */
+   int i;
+   for (i=0; i<VOLUME; ++i){ /* entries at index VOLUME and above are not touched */
+      sf[0][i]=1.;
+      sf[1][i]=0.;
+      sf[2][i]=0.;
+      sf[3][i]=0.;
+   }
+   return (0); /* always 0; there is no failure path */
+}
+void smear_scalar_fields( scalar ** smearedfield, scalar ** const sf ) {
 
    int ix;
    int in;
@@ -116,11 +133,13 @@ void smear_scalar_fields( scalar ** const sf, scalar ** smearedfield ) {
    scalar *nearen= (scalar *)malloc(sizeof(scalar)*VOLUMEPLUSRAND );
 
    int neit, neix, neiy, neiz;
+#if defined TM_USE_MPI
    MPI_Status  statuses[8];
    MPI_Request *request;
    request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
 
    int count=0;
+#endif
 
 // hypercubic smearing 
 
@@ -139,27 +158,35 @@ void smear_scalar_fields( scalar ** const sf, scalar ** smearedfield ) {
             for (neiy=0; neiy<2; ++neiy)
                for (neiz=0; neiz<2; ++neiz){
 
+#if defined TM_USE_MPI
                   count=0;
                   generic_exchange_direction_nonblocking( sf[in], sizeof(scalar), neit ? TDOWN : TUP, request, &count );
                   MPI_Waitall( count, request, statuses);
+#endif
                   for (ix=0; ix<VOLUME; ++ix)
                      tmps1[ix]= sf[in][    neit ? g_idn[ix][TUP] : g_iup[ix][TUP] ];
 
+#if defined TM_USE_MPI
                   count=0;
                   generic_exchange_direction_nonblocking(  tmps1, sizeof(scalar), neix ? XDOWN : XUP, request, &count );
                   MPI_Waitall( count, request, statuses);
+#endif
                   for (ix=0; ix<VOLUME; ++ix)
                      tmps2[ix]= tmps1[ neix ? g_idn[ix][XUP] : g_iup[ix][XUP] ];
 
+#if defined TM_USE_MPI
                   count=0;
                   generic_exchange_direction_nonblocking(  tmps2, sizeof(scalar), neiy ? YDOWN : YUP, request, &count );
                   MPI_Waitall( count, request, statuses);
+#endif
                   for (ix=0; ix<VOLUME; ++ix)
                      tmps1[ix]= tmps2[ neiy ? g_idn[ix][YUP] : g_iup[ix][YUP] ];
 
+#if defined TM_USE_MPI
                   count=0;
                   generic_exchange_direction_nonblocking(  tmps1, sizeof(scalar), neix ? ZDOWN : ZUP, request, &count );
                   MPI_Waitall( count, request, statuses);
+#endif
                   for (ix=0; ix<VOLUME; ++ix)
                      tmps2[ix]= tmps1[ neiz ? g_idn[ix][ZUP] : g_iup[ix][ZUP] ];
 
@@ -169,65 +196,81 @@ void smear_scalar_fields( scalar ** const sf, scalar ** smearedfield ) {
       for (ix =0; ix<VOLUME; ++ix){
          hyperc[ix]/=17.0;
       }
+#if defined TM_USE_MPI
       count=0;
       generic_exchange_direction_nonblocking( sf[in], sizeof(scalar), TDOWN, request, &count );
       MPI_Waitall( count, request, statuses);
+#endif
       for (ix=0; ix<VOLUME; ++ix)
          tmps1[ix]= sf[in][g_idn[ix][TUP]];
       for (ix=0; ix<VOLUME; ++ix)
          nearen[ix]+= tmps1[ix];
       
+#if defined TM_USE_MPI
       count=0;
       generic_exchange_direction_nonblocking( sf[in], sizeof(scalar), TUP ,  request, &count );
-      MPI_Waitall( count, request, statuses);      
+      MPI_Waitall( count, request, statuses);     
+#endif 
       for (ix=0; ix<VOLUME; ++ix)
          tmps1[ix]= sf[in][g_iup[ix][TUP]];
       for (ix=0; ix<VOLUME; ++ix)
          nearen[ix]+= tmps1[ix];
       
+#if defined TM_USE_MPI
       count=0;
       generic_exchange_direction_nonblocking( sf[in], sizeof(scalar), XDOWN, request, &count );
-      MPI_Waitall( count, request, statuses);      
+      MPI_Waitall( count, request, statuses);   
+#endif   
       for (ix=0; ix<VOLUME; ++ix)
          tmps1[ix]= sf[in][g_idn[ix][XUP]];
       for (ix=0; ix<VOLUME; ++ix)
          nearen[ix]+= tmps1[ix];
 
+#if defined TM_USE_MPI
       count=0;
       generic_exchange_direction_nonblocking( sf[in], sizeof(scalar), XUP , request, &count );
       MPI_Waitall( count, request, statuses);
+#endif
       for (ix=0; ix<VOLUME; ++ix)
          tmps1[ix]= sf[in][g_iup[ix][XUP]];
       for (ix=0; ix<VOLUME; ++ix)
          nearen[ix]+= tmps1[ix];
 
+#if defined TM_USE_MPI
       count=0;    
       generic_exchange_direction_nonblocking( sf[in], sizeof(scalar), YDOWN, request, &count );
-      MPI_Waitall( count, request, statuses);      
+      MPI_Waitall( count, request, statuses);
+#endif      
       for (ix=0; ix<VOLUME; ++ix)
          tmps1[ix]= sf[in][g_idn[ix][YUP]];
       for (ix=0; ix<VOLUME; ++ix)
          nearen[ix]+= tmps1[ix];
     
+#if defined TM_USE_MPI
       count=0;
       generic_exchange_direction_nonblocking( sf[in], sizeof(scalar), YUP, request, &count );
       MPI_Waitall( count, request, statuses);
+#endif
       for (ix=0; ix<VOLUME; ++ix)
          tmps1[ix]= sf[in][g_iup[ix][YUP]];
       for (ix=0; ix<VOLUME; ++ix)
          nearen[ix]+= tmps1[ix];
 
+#if defined TM_USE_MPI
       count=0;
       generic_exchange_direction_nonblocking( sf[in], sizeof(scalar), ZDOWN, request, &count );
-      MPI_Waitall( count, request, statuses);      
+      MPI_Waitall( count, request, statuses);  
+#endif    
       for (ix=0; ix<VOLUME; ++ix)
          tmps1[ix]= sf[in][g_idn[ix][ZUP]];
       for (ix=0; ix<VOLUME; ++ix)
          nearen[ix]+= tmps1[ix];
 
+#if defined TM_USE_MPI
       count=0;
       generic_exchange_direction_nonblocking( sf[in], sizeof(scalar), ZUP, request, &count );
       MPI_Waitall( count, request, statuses);
+#endif
       for (ix=0; ix<VOLUME; ++ix)
          tmps1[ix]= sf[in][g_iup[ix][ZUP]];
       for (ix=0; ix<VOLUME; ++ix)
@@ -247,41 +290,401 @@ void smear_scalar_fields( scalar ** const sf, scalar ** smearedfield ) {
 
    free(hyperc);
    free(nearen);
+#if defined TM_USE_MPI
    free(request);
+#endif
 }
-void smear_scalar_fields_correlator( scalar ** const sf, scalar ** smearedfield ) {
+void smear_scalar_fields_correlator( scalar **smearedfield, scalar ** const sf, int timeaverage) {
 
    int x0,y0,z0,t0;
-   double timeslicesum[4];
+   double **timeslicesum;
+   double **timeslicesumnew;
+#if defined TM_USE_MPI
    double mpi_res;
+#endif
    int j;
+   double upneighbour[4];
+   double upsecneighbour[4];
+   double dnneighbour[4];
+   double dnsecneighbour[4];
    for (j = 0; j<4 ; ++j ){
       for (x0=0; x0<VOLUME; ++x0){
          smearedfield[j][x0]=0.0;
       }
    }
-   for (t0=0; t0<T; ++t0){
-      for (j=0; j<4; ++j){
-         timeslicesum[j]=0.0;
-         mpi_res=0.;
-//summing over the local volume
-
-         for (x0; x0<LX; x0++)
-            for (y0=0; y0<LY; ++y0)
-               for (z0=0; z0<LZ; ++z0){
-                  timeslicesum[j]+=sf[j][((t0*LX + x0)*LY + y0)*LZ + z0];
-               }
-#if defined MPI
-         MPI_Reduce(&timeslicesum[j], &mpi_res, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices);
+   timeslicesum=(double **)malloc(sizeof(double *)*T);
+   for (j=0; j<T; ++j)
+      timeslicesum[j]= (double *)malloc(sizeof(double)*4);
+   timeslicesumnew=(double **)malloc(sizeof(double *)*T);
+   for (j=0; j<T; ++j)
+      timeslicesumnew[j] = (double *)malloc(sizeof(double)*4);
+   for (j=0; j<T; ++j){
+      timeslicesum[j][0]=0.;
+      timeslicesum[j][1]=0.;
+      timeslicesum[j][2]=0.;
+      timeslicesum[j][3]=0.;
+      timeslicesumnew[j][0]=0.;
+      timeslicesumnew[j][1]=0.;
+      timeslicesumnew[j][2]=0.;
+      timeslicesumnew[j][3]=0.;
+   }
+   for (j=0; j<VOLUME; ++j){
+          /* get (t,x,y,z) from j */
+      t0 = j/(LX*LY*LZ);
+      x0 = (j-t0*(LX*LY*LZ))/(LY*LZ);
+      y0 = (j-t0*(LX*LY*LZ)-x0*(LY*LZ))/(LZ);
+      z0 = (j-t0*(LX*LY*LZ)-x0*(LY*LZ) - y0*LZ);
+      timeslicesum[t0][0]+=sf[0][j];
+      timeslicesum[t0][1]+=sf[1][j];
+      timeslicesum[t0][2]+=sf[2][j];
+      timeslicesum[t0][3]+=sf[3][j];
+   }
+#if defined TM_USE_MPI
+   for (j=0; j<T; ++j){
+      MPI_Allreduce(&timeslicesum[j][0], &mpi_res, 1, MPI_DOUBLE, MPI_SUM, g_mpi_time_slices);
+      timeslicesum[j][0]=mpi_res;
+      MPI_Allreduce(&timeslicesum[j][1], &mpi_res, 1, MPI_DOUBLE, MPI_SUM, g_mpi_time_slices);
+      timeslicesum[j][1]=mpi_res;
+      MPI_Allreduce(&timeslicesum[j][2], &mpi_res, 1, MPI_DOUBLE, MPI_SUM, g_mpi_time_slices);
+      timeslicesum[j][2]=mpi_res;
+      MPI_Allreduce(&timeslicesum[j][3], &mpi_res, 1, MPI_DOUBLE, MPI_SUM, g_mpi_time_slices);
+      timeslicesum[j][3]=mpi_res;
+   }
 #endif
-         mpi_res/=(double)VOLUME*N_PROC_X*N_PROC_Y*N_PROC_Z;
-         for (x0; x0<LX; x0++)
-            for (y0=0; y0<LY; ++y0)
-               for (z0=0; z0<LZ; ++z0){
-                  smearedfield[j][((t0*LX + x0)*LY + y0)*LZ + z0]=mpi_res;
+/*   if (g_cart_id == 0){
+     for (j=0; j<T; ++j)
+       printf("%03d\t%10.10e\n", j, timeslicesum[j][0]);
+   }*/
+/*   for (j=0; j<T; ++j){
+      timeslicesum[j][0]/=(double)LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+      timeslicesum[j][1]/=(double)LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+      timeslicesum[j][2]/=(double)LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+      timeslicesum[j][3]/=(double)LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+   }
+*/
+   if (timeaverage == 0){
+     for (j=0; j<T; ++j){
+       timeslicesumnew[j][0]= timeslicesum[j][0] ;
+       timeslicesumnew[j][1]= timeslicesum[j][1] ;
+       timeslicesumnew[j][2]= timeslicesum[j][2] ;
+       timeslicesumnew[j][3]= timeslicesum[j][3] ;
+     }
+   }
+   if (timeaverage == 1){
+#if defined TM_USE_MPI
+     MPI_Status status[108];
+     int cntr=0;
+     MPI_Request request[108];
+
+     if (g_nproc_t > 1){
+       for (j=0; j<4;++j){
+
+         cntr=0;
+
+         MPI_Isend(&timeslicesum[0  ][j], 1, MPI_DOUBLE, g_nb_t_dn, 87, g_cart_grid, &request[cntr]);
+         MPI_Irecv(&upneighbour[j], 1, MPI_DOUBLE, g_nb_t_up, 87,g_cart_grid, &request[cntr+1]);
+
+         cntr=cntr+2;
+         MPI_Waitall(cntr, request, status);
+
+         cntr=0;
 
+         MPI_Isend(&timeslicesum[T-1][j], 1, MPI_DOUBLE, g_nb_t_up, 88, g_cart_grid, &request[cntr]);
+         MPI_Irecv(&dnneighbour[j], 1, MPI_DOUBLE, g_nb_t_dn, 88,g_cart_grid, &request[cntr+1]);
+
+         cntr=cntr+2;
+         MPI_Waitall(cntr, request, status);
+
+       }
+       if ( T > 1){
+         timeslicesumnew[0][0]= dnneighbour[0] + timeslicesum[0][0] + timeslicesum[1][0];
+         timeslicesumnew[0][1]= dnneighbour[1] + timeslicesum[0][1] + timeslicesum[1][1];
+         timeslicesumnew[0][2]= dnneighbour[2] + timeslicesum[0][2] + timeslicesum[1][2];
+         timeslicesumnew[0][3]= dnneighbour[3] + timeslicesum[0][3] + timeslicesum[1][3];
+
+         timeslicesumnew[T-1][0]= timeslicesum[T-2][0] + timeslicesum[T-1][0] + upneighbour[0];
+         timeslicesumnew[T-1][1]= timeslicesum[T-2][1] + timeslicesum[T-1][1] + upneighbour[1];
+         timeslicesumnew[T-1][2]= timeslicesum[T-2][2] + timeslicesum[T-1][2] + upneighbour[2];
+         timeslicesumnew[T-1][3]= timeslicesum[T-2][3] + timeslicesum[T-1][3] + upneighbour[3];
+
+         for (j=1;j<T-1;++j){
+           timeslicesumnew[j][0]= timeslicesum[j-1][0] + timeslicesum[j][0] + timeslicesum[j+1][0];
+           timeslicesumnew[j][1]= timeslicesum[j-1][1] + timeslicesum[j][1] + timeslicesum[j+1][1];
+           timeslicesumnew[j][2]= timeslicesum[j-1][2] + timeslicesum[j][2] + timeslicesum[j+1][2];
+           timeslicesumnew[j][3]= timeslicesum[j-1][3] + timeslicesum[j][3] + timeslicesum[j+1][3];
          }
+       }
+       else{
+
+         timeslicesumnew[0][0]= dnneighbour[0] + timeslicesum[0][0] + upneighbour[0];
+         timeslicesumnew[0][1]= dnneighbour[1] + timeslicesum[0][1] + upneighbour[1];
+         timeslicesumnew[0][2]= dnneighbour[2] + timeslicesum[0][2] + upneighbour[2];
+         timeslicesumnew[0][3]= dnneighbour[3] + timeslicesum[0][3] + upneighbour[3];
+
+       }
+     
+     }
+     else{
+       for (j=0;j<T;++j){
+         timeslicesumnew[j][0]= timeslicesum[(j-1+T)%T][0] + timeslicesum[j][0] + timeslicesum[(j+1)%T][0];
+         timeslicesumnew[j][1]= timeslicesum[(j-1+T)%T][1] + timeslicesum[j][1] + timeslicesum[(j+1)%T][1];
+         timeslicesumnew[j][2]= timeslicesum[(j-1+T)%T][2] + timeslicesum[j][2] + timeslicesum[(j+1)%T][2];
+         timeslicesumnew[j][3]= timeslicesum[(j-1+T)%T][3] + timeslicesum[j][3] + timeslicesum[(j+1)%T][3];
+       }
+     }
+#else
+     for (j=0;j<T;++j){
+       timeslicesumnew[j][0]= timeslicesum[(j-1+T)%T][0] + timeslicesum[j][0] + timeslicesum[(j+1)%T][0];
+       timeslicesumnew[j][1]= timeslicesum[(j-1+T)%T][1] + timeslicesum[j][1] + timeslicesum[(j+1)%T][1];
+       timeslicesumnew[j][2]= timeslicesum[(j-1+T)%T][2] + timeslicesum[j][2] + timeslicesum[(j+1)%T][2];
+       timeslicesumnew[j][3]= timeslicesum[(j-1+T)%T][3] + timeslicesum[j][3] + timeslicesum[(j+1)%T][3];
+     }
+#endif
+   }
+   else if (timeaverage == 2){
+#if defined TM_USE_MPI
+     MPI_Status status[108];
+     int cntr=0;
+     MPI_Request request[108];
+
+     if (g_nproc_t > 1){
+
+       for (j=0; j<4;++j){
+
+         cntr=0;
+
+         MPI_Isend(&timeslicesum[0  ][j], 1, MPI_DOUBLE, g_nb_t_dn, 87, g_cart_grid, &request[cntr]);
+         MPI_Irecv(&upneighbour[j], 1, MPI_DOUBLE, g_nb_t_up, 87,g_cart_grid, &request[cntr+1]);
+
+         cntr=cntr+2;
+         MPI_Waitall(cntr, request, status);
+
+         cntr=0;
+
+         MPI_Isend(&timeslicesum[T-1][j], 1, MPI_DOUBLE, g_nb_t_up, 88, g_cart_grid, &request[cntr]);
+         MPI_Irecv(&dnneighbour[j], 1, MPI_DOUBLE, g_nb_t_dn, 88,g_cart_grid, &request[cntr+1]);
+
+         cntr=cntr+2;
+         MPI_Waitall(cntr, request, status);
+
+       }
+       if (T > 1){
+
+         for (j=0; j<4;++j){
+
+           cntr=0;
+
+           MPI_Isend(&timeslicesum[1  ][j], 1, MPI_DOUBLE, g_nb_t_dn, 87, g_cart_grid, &request[cntr]);
+           MPI_Irecv(&upsecneighbour[j], 1, MPI_DOUBLE, g_nb_t_up, 87,g_cart_grid, &request[cntr+1]);
+
+           cntr=cntr+2;
+           MPI_Waitall(cntr, request, status);
+
+           cntr=0;
+
+           MPI_Isend(&timeslicesum[T-2][j], 1, MPI_DOUBLE, g_nb_t_up, 88, g_cart_grid, &request[cntr]);
+           MPI_Irecv(&dnsecneighbour[j], 1, MPI_DOUBLE, g_nb_t_dn, 88,g_cart_grid, &request[cntr+1]);
+
+           cntr=cntr+2;
+           MPI_Waitall(cntr, request, status);
+
+         }
+         if ( T > 3){
+           timeslicesumnew[0][0]= dnsecneighbour[0] + dnneighbour[0] + timeslicesum[0][0] + timeslicesum[1][0] + timeslicesum[2][0];
+           timeslicesumnew[0][1]= dnsecneighbour[1] + dnneighbour[1] + timeslicesum[0][1] + timeslicesum[1][1] + timeslicesum[2][1];
+           timeslicesumnew[0][2]= dnsecneighbour[2] + dnneighbour[2] + timeslicesum[0][2] + timeslicesum[1][2] + timeslicesum[2][2];
+           timeslicesumnew[0][3]= dnsecneighbour[3] + dnneighbour[3] + timeslicesum[0][3] + timeslicesum[1][3] + timeslicesum[2][3];
+
+           timeslicesumnew[1][0]= dnneighbour[0] + timeslicesum[0][0] + timeslicesum[1][0] + timeslicesum[2][0] + timeslicesum[3][0];
+           timeslicesumnew[1][1]= dnneighbour[1] + timeslicesum[0][1] + timeslicesum[1][1] + timeslicesum[2][1] + timeslicesum[3][1];
+           timeslicesumnew[1][2]= dnneighbour[2] + timeslicesum[0][2] + timeslicesum[1][2] + timeslicesum[2][2] + timeslicesum[3][2];
+           timeslicesumnew[1][3]= dnneighbour[3] + timeslicesum[0][3] + timeslicesum[1][3] + timeslicesum[2][3] + timeslicesum[3][3];
+
+           for (j=2; j<T-2; ++j){
+             timeslicesumnew[j][0]= timeslicesum[j-2][0] + timeslicesum[j-1][0] + timeslicesum[j][0] + timeslicesum[j+1][0] + timeslicesum[j+2][0];
+             timeslicesumnew[j][1]= timeslicesum[j-2][1] + timeslicesum[j-1][1] + timeslicesum[j][1] + timeslicesum[j+1][1] + timeslicesum[j+2][1];
+             timeslicesumnew[j][2]= timeslicesum[j-2][2] + timeslicesum[j-1][2] + timeslicesum[j][2] + timeslicesum[j+1][2] + timeslicesum[j+2][2];
+             timeslicesumnew[j][3]= timeslicesum[j-2][3] + timeslicesum[j-1][3] + timeslicesum[j][3] + timeslicesum[j+1][3] + timeslicesum[j+2][3];
+           }
+
+           timeslicesumnew[T-1][0]= upsecneighbour[0] + upneighbour[0] + timeslicesum[T-1][0] + timeslicesum[T-2][0] + timeslicesum[T-3][0];
+           timeslicesumnew[T-1][1]= upsecneighbour[1] + upneighbour[1] + timeslicesum[T-1][1] + timeslicesum[T-2][1] + timeslicesum[T-3][1];
+           timeslicesumnew[T-1][2]= upsecneighbour[2] + upneighbour[2] + timeslicesum[T-1][2] + timeslicesum[T-2][2] + timeslicesum[T-3][2];
+           timeslicesumnew[T-1][3]= upsecneighbour[3] + upneighbour[3] + timeslicesum[T-1][3] + timeslicesum[T-2][3] + timeslicesum[T-3][3];
+
+           timeslicesumnew[T-2][0]= upneighbour[0] + timeslicesum[T-1][0] + timeslicesum[T-2][0] + timeslicesum[T-3][0] + timeslicesum[T-4][0];
+           timeslicesumnew[T-2][1]= upneighbour[1] + timeslicesum[T-1][1] + timeslicesum[T-2][1] + timeslicesum[T-3][1] + timeslicesum[T-4][1];
+           timeslicesumnew[T-2][2]= upneighbour[2] + timeslicesum[T-1][2] + timeslicesum[T-2][2] + timeslicesum[T-3][2] + timeslicesum[T-4][2];
+           timeslicesumnew[T-2][3]= upneighbour[3] + timeslicesum[T-1][3] + timeslicesum[T-2][3] + timeslicesum[T-3][3] + timeslicesum[T-4][3];
+
+         }
+         else{
+
+           timeslicesumnew[0][0]= dnsecneighbour[0] + dnneighbour[0] + timeslicesum[0][0] + timeslicesum[1][0] + upneighbour[0];
+           timeslicesumnew[0][1]= dnsecneighbour[1] + dnneighbour[1] + timeslicesum[0][1] + timeslicesum[1][1] + upneighbour[1];
+           timeslicesumnew[0][2]= dnsecneighbour[2] + dnneighbour[2] + timeslicesum[0][2] + timeslicesum[1][2] + upneighbour[2];
+           timeslicesumnew[0][3]= dnsecneighbour[3] + dnneighbour[3] + timeslicesum[0][3] + timeslicesum[1][3] + upneighbour[3];
+
+           timeslicesumnew[1][0]= dnneighbour[0] + timeslicesum[0][0] + timeslicesum[1][0] + upneighbour[0] + upsecneighbour[0];
+           timeslicesumnew[1][1]= dnneighbour[1] + timeslicesum[0][1] + timeslicesum[1][1] + upneighbour[1] + upsecneighbour[1];
+           timeslicesumnew[1][2]= dnneighbour[2] + timeslicesum[0][2] + timeslicesum[1][2] + upneighbour[2] + upsecneighbour[2];
+           timeslicesumnew[1][3]= dnneighbour[3] + timeslicesum[0][3] + timeslicesum[1][3] + upneighbour[3] + upsecneighbour[3];
+
+         }
+       }
+       else{
+         for (j=0; j<4;++j){
+
+           cntr=0;
+
+           MPI_Isend(&dnneighbour[j], 1, MPI_DOUBLE, g_nb_t_up, 87, g_cart_grid, &request[cntr]);
+           MPI_Irecv(&dnsecneighbour[j], 1, MPI_DOUBLE, g_nb_t_dn, 87,g_cart_grid, &request[cntr+1]);
+
+           cntr=cntr+2;
+           MPI_Waitall(cntr, request, status);
+
+           cntr=0;
+
+           MPI_Isend(&upneighbour[j], 1, MPI_DOUBLE, g_nb_t_dn, 88, g_cart_grid, &request[cntr]);
+           MPI_Irecv(&upsecneighbour[j], 1, MPI_DOUBLE, g_nb_t_up, 88,g_cart_grid, &request[cntr+1]);
+
+           cntr=cntr+2;
+           MPI_Waitall(cntr, request, status);
+
+         }
+
+         timeslicesumnew[0][0]= dnsecneighbour[0] + dnneighbour[0] + timeslicesum[0][0] + upneighbour[0] + upsecneighbour[0];
+         timeslicesumnew[0][1]= dnsecneighbour[1] + dnneighbour[1] + timeslicesum[0][1] + upneighbour[1] + upsecneighbour[1];
+         timeslicesumnew[0][2]= dnsecneighbour[2] + dnneighbour[2] + timeslicesum[0][2] + upneighbour[2] + upsecneighbour[2];
+         timeslicesumnew[0][3]= dnsecneighbour[3] + dnneighbour[3] + timeslicesum[0][3] + upneighbour[3] + upsecneighbour[3];
+
+       }
+
+     }
+     else{
+
+       for (j=0;j<T;++j){
+         timeslicesumnew[j][0]= timeslicesum[(j-2+T)%T][0] + timeslicesum[(j-1+T)%T][0] + timeslicesum[j][0] + timeslicesum[(j+1)%T][0] + timeslicesum[(j+2)%T][0];
+         timeslicesumnew[j][1]= timeslicesum[(j-2+T)%T][1] + timeslicesum[(j-1+T)%T][1] + timeslicesum[j][1] + timeslicesum[(j+1)%T][1] + timeslicesum[(j+2)%T][1];
+         timeslicesumnew[j][2]= timeslicesum[(j-2+T)%T][2] + timeslicesum[(j-1+T)%T][2] + timeslicesum[j][2] + timeslicesum[(j+1)%T][2] + timeslicesum[(j+2)%T][2];
+         timeslicesumnew[j][3]= timeslicesum[(j-2+T)%T][3] + timeslicesum[(j-1+T)%T][3] + timeslicesum[j][3] + timeslicesum[(j+1)%T][3] + timeslicesum[(j+2)%T][3];
+       }
+
+     }
+#else
+     for (j=0;j<T;++j){
+       timeslicesumnew[j][0]= timeslicesum[(j-2+T)%T][0] + timeslicesum[(j-1+T)%T][0] + timeslicesum[j][0] + timeslicesum[(j+1)%T][0] + timeslicesum[(j+2)%T][0];
+       timeslicesumnew[j][1]= timeslicesum[(j-2+T)%T][1] + timeslicesum[(j-1+T)%T][1] + timeslicesum[j][1] + timeslicesum[(j+1)%T][1] + timeslicesum[(j+2)%T][1];
+       timeslicesumnew[j][2]= timeslicesum[(j-2+T)%T][2] + timeslicesum[(j-1+T)%T][2] + timeslicesum[j][2] + timeslicesum[(j+1)%T][2] + timeslicesum[(j+2)%T][2];
+       timeslicesumnew[j][3]= timeslicesum[(j-2+T)%T][3] + timeslicesum[(j-1+T)%T][3] + timeslicesum[j][3] + timeslicesum[(j+1)%T][3] + timeslicesum[(j+2)%T][3];
+     }
+#endif
+   }
+
+/*   if (g_cart_id == 1){
+     for (j=0; j<T; ++j){
+       printf("AFTER\t%03d\t%10.10e\n", j, timeslicesumnew[j][0]);
+       fflush(stdout);
+     }
+     exit(1);
+   }
+*/
+
+   if (timeaverage == 0){
+
+     for (j=0; j<T; ++j){
+
+       timeslicesum[j][0]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesum[j][1]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesum[j][2]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesum[j][3]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+
+       timeslicesumnew[j][0]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesumnew[j][1]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesumnew[j][2]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesumnew[j][3]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+     }
+
+   }
+   else if (timeaverage == 1) {
+
+     for (j=0; j<T; ++j){
+
+       timeslicesum[j][0]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesum[j][1]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesum[j][2]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesum[j][3]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+
+       timeslicesumnew[j][0]/=(double)3.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesumnew[j][1]/=(double)3.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesumnew[j][2]/=(double)3.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesumnew[j][3]/=(double)3.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+     }
+
+   }
+   else if (timeaverage == 2) {
+     for (j=0; j<T; ++j){
+
+       timeslicesum[j][0]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesum[j][1]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesum[j][2]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesum[j][3]/=(double)1.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+
+       timeslicesumnew[j][0]/=(double)5.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesumnew[j][1]/=(double)5.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesumnew[j][2]/=(double)5.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+       timeslicesumnew[j][3]/=(double)5.*LX*LY*LZ*N_PROC_X*N_PROC_Y*N_PROC_Z;
+     }
+
+   }
+      
+   for (j=0; j<VOLUME; ++j){
+          /* get (t,x,y,z) from j */
+      t0 = j/(LX*LY*LZ);
+      x0 = (j-t0*(LX*LY*LZ))/(LY*LZ);
+      y0 = (j-t0*(LX*LY*LZ)-x0*(LY*LZ))/(LZ);
+      z0 = (j-t0*(LX*LY*LZ)-x0*(LY*LZ) - y0*LZ);
+
+      if (( timeaverage == 1 ) && ( ( g_coord[j][TUP] == 0 )  ||   ( g_coord[j][TUP] == ( T_global -1 ) ) )){
+
+         smearedfield[0][j]=timeslicesum[t0][0];
+         smearedfield[1][j]=timeslicesum[t0][1];
+         smearedfield[2][j]=timeslicesum[t0][2];
+         smearedfield[3][j]=timeslicesum[t0][3];
+
+      }
+
+      else if (( timeaverage == 2 ) && ( ( g_coord[j][TUP] == 0 ) || ( g_coord[j][TUP] == 1 ) || ( g_coord[j][TUP] == 2 ) ||( g_coord[j][TUP] == ( T_global -3 ) ) ||  ( g_coord[j][TUP] == ( T_global -2 ) ) ||  ( g_coord[j][TUP] == ( T_global -1 ) ) )){
+
+         smearedfield[0][j]=timeslicesum[t0][0];
+         smearedfield[1][j]=timeslicesum[t0][1];
+         smearedfield[2][j]=timeslicesum[t0][2];
+         smearedfield[3][j]=timeslicesum[t0][3];
+
+      }
+      else{
+
+         smearedfield[0][j]=timeslicesumnew[t0][0];
+         smearedfield[1][j]=timeslicesumnew[t0][1];
+         smearedfield[2][j]=timeslicesumnew[t0][2];
+         smearedfield[3][j]=timeslicesumnew[t0][3];
+
       }
    }
-}
+/*
+   if (g_cart_id == 0){
+     for (j=0; j<T; ++j){
+       printf("AFTER\t%03d\t%10.10e\n", j, timeslicesumnew[j][0]);
+       fflush(stdout);
+     }
+     exit(1);
+   }
+*/
+   for (j=0; j<T; ++j){
+     free(timeslicesumnew[j]);
+     free(timeslicesum[j]);
+   }
+   free(timeslicesumnew);
+   free(timeslicesum);
 
+}
diff --git a/io/spinor.ih b/io/spinor.ih
index 56f36fcb5..49fee6dda 100644
--- a/io/spinor.ih
+++ b/io/spinor.ih
@@ -19,7 +19,7 @@
 
 #include <lime.h>
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
@@ -28,7 +28,7 @@
 #include <time.h>
 #include <sys/time.h>
 #include <sys/types.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include <unistd.h>
@@ -36,6 +36,7 @@
 #include <errno.h>
 
 #include <global.h>
+#include <gettime.h>
 
 #include <io/gauge.h>
 #include <io/spinor.h>
diff --git a/io/spinor_read.c b/io/spinor_read.c
index e9d030bd5..0e261931b 100644
--- a/io/spinor_read.c
+++ b/io/spinor_read.c
@@ -19,15 +19,20 @@
 
 #include "spinor.ih"
 #include "default_input_values.h"
+#include "global.h"
 
 paramsPropInfo PropInfo = {_default_propagator_splitted, _default_source_format_flag, _default_prop_precision_flag, NULL};
 paramsSourceInfo SourceInfo = {0, _default_propagator_splitted, _default_source_format_flag, _default_prop_precision_flag, 0, 0, 0, 0, 0, 0, 0, 1, NULL};
 
 int read_spinor(spinor * const s, spinor * const r, char * filename, const int position_) {
-  int status = 0, getpos = 0, bytes = 0, prec = 0, prop_type, position = position_, rstat=0;
-  char *header_type = NULL;
-  READER *reader = NULL;
+  int status = 0, getpos = 0, prec = 0, prop_type, position = position_, rstat=0;
+  uint64_t bytes = 0;
+  char * header_type = NULL;
+  READER * reader = NULL;
+  DML_Checksum checksum_read;
   DML_Checksum checksum;
+  char * checksum_string = NULL;
+  int DML_read_flag = 0;
   construct_reader(&reader, filename);
   /* determine the propagator type */
   prop_type = parse_propagator_type(reader);
@@ -75,16 +80,16 @@ int read_spinor(spinor * const s, spinor * const r, char * filename, const int p
 
   bytes = ReaderBytes(reader);
 
-  if ((int)bytes == LX * g_nproc_x * LY * g_nproc_y * LZ * g_nproc_z * T * g_nproc_t * sizeof(spinor)) {
+  if (bytes == (uint64_t)LX * g_nproc_x * LY * g_nproc_y * LZ * g_nproc_z * T * g_nproc_t * sizeof(spinor)) { /* widen first: the int product of the global extents can overflow */
     prec = 64;
   }
   else {
-    if ((int)bytes == LX * g_nproc_x * LY * g_nproc_y * LZ * g_nproc_z * T * g_nproc_t * sizeof(spinor) / 2) {
+    if (bytes == (uint64_t)LX * g_nproc_x * LY * g_nproc_y * LZ * g_nproc_z * T * g_nproc_t * sizeof(spinor) / 2) { /* single precision record: half the size */
       prec = 32;
     }
     else {
       fprintf(stderr, "Length of scidac-binary-data record in %s does not match input parameters.\n", filename);
-      fprintf(stderr, "Found %d bytes.\n", bytes);
+      fprintf(stderr, "Found %llu bytes.\n", (unsigned long long)bytes); /* uint64_t is not unsigned long on every platform */
       return(-6);
     }
   }
@@ -106,10 +111,38 @@ int read_spinor(spinor * const s, spinor * const r, char * filename, const int p
     }
   }
 
-  if (g_cart_id == 0 && g_debug_level >= 0) {
-    printf("# Scidac checksums for DiracFermion field %s position %d:\n", filename, position);
-    printf("#   Calculated            : A = %#x B = %#x.\n", checksum.suma, checksum.sumb);
-    printf("# No Scidac checksum was read from headers, unable to check integrity of file.\n");
+  if( g_disable_src_IO_checks != 1 ){
+    // Look for a scidac-checksum record following the binary data just read;
+    // stop at the first checksum record or as soon as another binary-data record starts.
+    while ((status = ReaderNextRecord(reader)) != LIME_EOF) {
+      if (status != LIME_SUCCESS) {
+        fprintf(stderr, "ReaderNextRecord returned status %d.\n", status);
+        break;
+      }
+      header_type = ReaderType(reader);
+      if (strcmp("scidac-checksum", header_type) == 0) {
+        read_message(reader, &checksum_string);
+        DML_read_flag = parse_checksum_xml(checksum_string, &checksum_read);
+        free(checksum_string);
+        break;
+      }
+      if (strcmp("scidac-binary-data", header_type) == 0 || strcmp("ildg-binary-data", header_type) == 0) {
+        break;
+      }
+    }
+
+    if (!DML_read_flag) {
+      fprintf(stderr, "LIME record with name: \"scidac-checksum\", in spinor file %s either missing or malformed.\n", filename);
+      fprintf(stderr, "Unable to verify integrity of spinor field data.\n");
+      destruct_reader(reader); /* release the reader on this early exit, as the normal exit path does */
+      return(-1);
+    }
+
+    if (g_cart_id == 0 && g_debug_level >= 0) {
+      printf("# Scidac checksums for DiracFermion field %s position %d:\n", filename, position);
+      printf("#   Calculated            : A = %#010x B = %#010x.\n", checksum.suma, checksum.sumb);
+      printf("#   Read from LIME headers: A = %#010x B = %#010x.\n", checksum_read.suma, checksum_read.sumb);
+    }
   }
 
   destruct_reader(reader);
diff --git a/io/spinor_read_binary.c b/io/spinor_read_binary.c
index a16cfc352..1d494977d 100644
--- a/io/spinor_read_binary.c
+++ b/io/spinor_read_binary.c
@@ -145,7 +145,7 @@ int read_binary_spinor_data(spinor * const s, spinor * const r, LimeReader * lim
   for(t = 0; t < T; t++) {
     for(z = 0; z < LZ; z++) {
       for(y = 0; y < LY; y++) {
-#if (defined MPI)
+#if (defined TM_USE_MPI)
         limeReaderSeek(limereader,(n_uint64_t)
                        (g_proc_coords[1]*LX +
                         (((g_proc_coords[0]*T+t)*g_nproc_z*LZ+g_proc_coords[3]*LZ+z)*g_nproc_y*LY
@@ -177,7 +177,7 @@ int read_binary_spinor_data(spinor * const s, spinor * const r, LimeReader * lim
           }
           if(status < 0 && status != LIME_EOR) {
             fprintf(stderr, "LIME read error occurred with status = %d while reading in spinor_read_binary.c!\n", status);
-#ifdef MPI
+#ifdef TM_USE_MPI
             MPI_Abort(MPI_COMM_WORLD, 1);
             MPI_Finalize();
 #endif
@@ -187,7 +187,7 @@ int read_binary_spinor_data(spinor * const s, spinor * const r, LimeReader * lim
       }
     }
   }
-#ifdef MPI
+#ifdef TM_USE_MPI
   DML_checksum_combine(checksum);
 #endif
   return(0);
@@ -318,7 +318,7 @@ int read_binary_spinor_data_l(spinor * const s, LimeReader * limereader, DML_Che
   for(t = 0; t < T; t++) {
     for(z = 0; z < LZ; z++) {
       for(y = 0; y < LY; y++) {
-#if (defined MPI)
+#if (defined TM_USE_MPI)
         limeReaderSeek(limereader,(n_uint64_t)
                        (g_proc_coords[1]*LX +
                         (((g_proc_coords[0]*T+t)*g_nproc_z*LZ+g_proc_coords[3]*LZ+z)*g_nproc_y*LY
@@ -342,7 +342,7 @@ int read_binary_spinor_data_l(spinor * const s, LimeReader * limereader, DML_Che
           }
           if(status < 0 && status != LIME_EOR) {
             fprintf(stderr, "LIME read error occurred with status = %d while reading in spinor_read_binary.c!\n", status);
-#ifdef MPI
+#ifdef TM_USE_MPI
             MPI_Abort(MPI_COMM_WORLD, 1);
             MPI_Finalize();
 #endif
@@ -352,7 +352,7 @@ int read_binary_spinor_data_l(spinor * const s, LimeReader * limereader, DML_Che
       }
     }
   }
-#ifdef MPI
+#ifdef TM_USE_MPI
   DML_checksum_combine(checksum);
 #endif
   return(0);
diff --git a/io/spinor_write_binary.c b/io/spinor_write_binary.c
index 640422297..5b07c8c19 100644
--- a/io/spinor_write_binary.c
+++ b/io/spinor_write_binary.c
@@ -126,19 +126,19 @@ int write_binary_spinor_data(spinor * const s, spinor * const r, LimeWriter * li
   int coords[4];
   n_uint64_t bytes;
   DML_SiteRank rank;
-#ifdef MPI
   double tick = 0, tock = 0;
   char measure[64];
+#ifdef TM_USE_MPI
   MPI_Status mstatus;
 #endif
   DML_checksum_init(checksum);
 
-#ifdef MPI
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
-    tick = MPI_Wtime();
-  }
 #endif
+    tick = gettime();
+  }
 
   if(prec == 32) bytes = (n_uint64_t)sizeof(spinor)/2;
   else bytes = (n_uint64_t)sizeof(spinor);
@@ -154,7 +154,7 @@ int write_binary_spinor_data(spinor * const s, spinor * const r, LimeWriter * li
         for(x = 0; x < LX*g_nproc_x; x++) {
           X = x - g_proc_coords[1]*LX;
           coords[1] = x / LX;
-#ifdef MPI
+#ifdef TM_USE_MPI
           MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
           if(g_cart_id == id) {
@@ -183,7 +183,7 @@ int write_binary_spinor_data(spinor * const s, spinor * const r, LimeWriter * li
                 status = limeWriteRecordData((void*)tmp, &bytes, limewriter);
               }
             }
-#ifdef MPI
+#ifdef TM_USE_MPI
             else{
               if(prec == 32) {
                 MPI_Recv((void*)tmp2, sizeof(spinor)/8, MPI_FLOAT, id, tag, g_cart_grid, &mstatus);
@@ -199,14 +199,14 @@ int write_binary_spinor_data(spinor * const s, spinor * const r, LimeWriter * li
 #endif
             if(status < 0 ) {
               fprintf(stderr, "LIME write error occurred with status = %d, while in write_binary_spinor_data (spinor_write_binary.c)!\n", status);
-#ifdef MPI
+#ifdef TM_USE_MPI
               MPI_Abort(MPI_COMM_WORLD, 1);
               MPI_Finalize();
 #endif
               exit(500);
             }
           }
-#ifdef MPI
+#ifdef TM_USE_MPI
           else{
             if(g_cart_id == id){
               if(prec == 32) {
@@ -222,17 +222,18 @@ int write_binary_spinor_data(spinor * const s, spinor * const r, LimeWriter * li
 #endif
           tag++;
         }
-#ifdef MPI
+#ifdef TM_USE_MPI
         MPI_Barrier(g_cart_grid);
 #endif
         tag=0;
       }
     }
   }
-#ifdef MPI
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
-    tock = MPI_Wtime();
+#endif
+    tock = gettime();
 
     if (g_cart_id == 0) {
       engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b");
@@ -246,7 +247,6 @@ int write_binary_spinor_data(spinor * const s, spinor * const r, LimeWriter * li
       fflush(stdout);
     }
   }
-#endif
   return(0);
 }
 #endif /* HAVE_LIBLEMON */
@@ -353,19 +353,19 @@ int write_binary_spinor_data_l(spinor * const s, LimeWriter * limewriter, DML_Ch
   int coords[4];
   n_uint64_t bytes;
   DML_SiteRank rank;
-#ifdef MPI
   double tick = 0, tock = 0;
   char measure[64];
+#ifdef TM_USE_MPI
   MPI_Status mstatus;
 #endif
   DML_checksum_init(checksum);
 
-#ifdef MPI
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
-    tick = MPI_Wtime();
-  }
 #endif
+    tick = gettime();
+  }
 
   if(prec == 32) bytes = (n_uint64_t)sizeof(spinor)/2;
   else bytes = (n_uint64_t)sizeof(spinor);
@@ -381,7 +381,7 @@ int write_binary_spinor_data_l(spinor * const s, LimeWriter * limewriter, DML_Ch
         for(x = 0; x < LX*g_nproc_x; x++) {
           X = x - g_proc_coords[1]*LX;
           coords[1] = x / LX;
-#ifdef MPI
+#ifdef TM_USE_MPI
           MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
           if(g_cart_id == id) {
@@ -403,7 +403,7 @@ int write_binary_spinor_data_l(spinor * const s, LimeWriter * limewriter, DML_Ch
                 status = limeWriteRecordData((void*)tmp, &bytes, limewriter);
               }
             }
-#ifdef MPI
+#ifdef TM_USE_MPI
             else{
               if(prec == 32) {
                 MPI_Recv((void*)tmp2, sizeof(spinor)/8, MPI_FLOAT, id, tag, g_cart_grid, &mstatus);
@@ -419,7 +419,7 @@ int write_binary_spinor_data_l(spinor * const s, LimeWriter * limewriter, DML_Ch
 #endif
             if(status < 0 ) {
               fprintf(stderr, "LIME write error occurred with status = %d, while in write_binary_spinor_data_l (spinor_write_binary.c)!\n", status);
-#ifdef MPI
+#ifdef TM_USE_MPI
               MPI_Abort(MPI_COMM_WORLD, 1);
               MPI_Finalize();
 #endif
@@ -427,7 +427,7 @@ int write_binary_spinor_data_l(spinor * const s, LimeWriter * limewriter, DML_Ch
             }
 
           }
-#ifdef MPI
+#ifdef TM_USE_MPI
           else{
             if(g_cart_id == id){
               if(prec == 32) {
@@ -443,17 +443,18 @@ int write_binary_spinor_data_l(spinor * const s, LimeWriter * limewriter, DML_Ch
 #endif
           tag++;
         }
-#ifdef MPI
+#ifdef TM_USE_MPI
         MPI_Barrier(g_cart_grid);
 #endif
         tag=0;
       }
     }
   }
-#ifdef MPI
   if (g_debug_level > 0) {
+#ifdef TM_USE_MPI
     MPI_Barrier(g_cart_grid);
-    tock = MPI_Wtime();
+#endif
+    tock = gettime();
 
     if (g_cart_id == 0) {
       engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b");
@@ -467,7 +468,6 @@ int write_binary_spinor_data_l(spinor * const s, LimeWriter * limewriter, DML_Ch
       fflush(stdout);
     }
   }
-#endif
   return(0);
 }
 #endif /* HAVE_LIBLEMON */
diff --git a/io/spinor_write_stdout.c b/io/spinor_write_stdout.c
index 5d4a2e253..28b128902 100644
--- a/io/spinor_write_stdout.c
+++ b/io/spinor_write_stdout.c
@@ -18,12 +18,12 @@
 ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include "global.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "su3.h"
@@ -46,7 +46,7 @@ void spinor_write_stdout(spinor * const s) {
 	for(int z = 0; z < g_nproc_z*LZ; z++) {
 	  Z = z - g_proc_coords[3]*LZ;
 	  coords[3] = z / LZ;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	  MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	  if((t+x+y+z)%2 == 0 && g_cart_id == id) {
@@ -59,7 +59,7 @@ void spinor_write_stdout(spinor * const s) {
 		   creal(s[ix].s0.c0), cimag(s[ix].s0.c0));
 	    fflush(stdout);
 	  }
-#ifdef MPI
+#ifdef TM_USE_MPI
 	  MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	}
diff --git a/io/sw_write_stdout.c b/io/sw_write_stdout.c
index 2d28b4ccc..652badcc1 100644
--- a/io/sw_write_stdout.c
+++ b/io/sw_write_stdout.c
@@ -18,12 +18,12 @@
 ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include "global.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "su3.h"
@@ -45,7 +45,7 @@ void sw_write_stdout(su3 ** u) {
 	for(int z = 0; z < g_nproc_z*LZ; z++) {
 	  Z = z - g_proc_coords[3]*LZ;
 	  coords[3] = z / LZ;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	  MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	  if(g_cart_id == id) {
@@ -66,7 +66,7 @@ void sw_write_stdout(su3 ** u) {
 	      fflush(stdout);
 	    }
 	  }
-#ifdef MPI
+#ifdef TM_USE_MPI
 	  MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	}
diff --git a/io/utils.h b/io/utils.h
index dbaf0ac7b..a9d550423 100644
--- a/io/utils.h
+++ b/io/utils.h
@@ -21,7 +21,7 @@
 #define _UTILS_H
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
@@ -35,14 +35,6 @@
 #include <io/dml.h>
 
 
-#ifndef isnan
-# define isnan(x)						 \
-  (sizeof (x) == sizeof (long double) ? isnan_ld (x)		 \
-   : sizeof (x) == sizeof (double) ? isnan_d (x)		 \
-   : isnan_f (x))
-
-#endif
-
 /* These are factory functions, since the constructors for c-lime and lemon are different
    and they need different ways of opening files. Moving this to utility functions unclutters
    the main code, since we don't need additional #ifdefs anymore.
diff --git a/io/utils.ih b/io/utils.ih
index 0164fe148..054c0bcca 100644
--- a/io/utils.ih
+++ b/io/utils.ih
@@ -19,7 +19,7 @@
 
 #include <lime.h>
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
@@ -29,7 +29,7 @@
 #include <endian.h>
 #include <sys/time.h>
 #include <sys/types.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include <unistd.h>
diff --git a/io/utils_close_reader_record.c b/io/utils_close_reader_record.c
index ca07d780f..3005e65fd 100644
--- a/io/utils_close_reader_record.c
+++ b/io/utils_close_reader_record.c
@@ -23,7 +23,7 @@ void close_reader_record(READER *reader)
 {
   if (reader != NULL)
     ReaderCloseRecord(reader);
-  #ifdef MPI
+  #ifdef TM_USE_MPI
   MPI_Barrier(g_cart_grid);
   #endif
 }
diff --git a/io/utils_kill_with_error.c b/io/utils_kill_with_error.c
index f6c394181..fa50cbb56 100644
--- a/io/utils_kill_with_error.c
+++ b/io/utils_kill_with_error.c
@@ -15,7 +15,7 @@ void kill_with_error(LIME_FILE *fh, int const rank, char const *error)
     fclose(fh);
 #endif /* HAVE_LIBLEMON */
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Abort(MPI_COMM_WORLD, 1);
   MPI_Finalize();
 #endif
diff --git a/io/utils_parse_propagator_type.c b/io/utils_parse_propagator_type.c
index 9b91c6550..74406a1d9 100644
--- a/io/utils_parse_propagator_type.c
+++ b/io/utils_parse_propagator_type.c
@@ -82,10 +82,10 @@ int parse_propagator_type(READER * reader) {
       close_reader_record(reader);
       break;
     }
-    if ((sourcetypefound || proptypefound) == 0) {
-      fprintf(stderr, "Unable to find either source-type or propagator-type record.\nWARNING: Continuing in blind faith.\n");
-    }
     close_reader_record(reader);
   }
+  if ((sourcetypefound || proptypefound) == 0) {
+    fprintf(stderr, "Unable to find either source-type or propagator-type record.\nWARNING: Continuing in blind faith.\n");
+  }
   return(prop_type);
 }
diff --git a/io/utils_read_message.c b/io/utils_read_message.c
index 2b0b4c670..3f4953522 100644
--- a/io/utils_read_message.c
+++ b/io/utils_read_message.c
@@ -44,7 +44,7 @@ int read_message(READER * reader, char **buffer) {
   }
 
   status = ReaderReadData(*buffer, &bytesRead, reader);
-#if MPI
+#if TM_USE_MPI
   MPI_Barrier(g_cart_grid);
 #endif
 
diff --git a/io/utils_write_first_message.c b/io/utils_write_first_message.c
index 11cea0520..272bac230 100644
--- a/io/utils_write_first_message.c
+++ b/io/utils_write_first_message.c
@@ -93,7 +93,7 @@ int write_first_messages(FILE * parameterfile, char const * const executable, ch
          "# the code is compiled for persistent MPI calls (halfspinor only)\n");
 #  endif
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  ifdef _NON_BLOCKING
   printf("# the code is compiled for non-blocking MPI calls (spinor and gauge)\n");
   fprintf(parameterfile,
@@ -105,13 +105,13 @@ int write_first_messages(FILE * parameterfile, char const * const executable, ch
 	  "# the code is compiled with MPI IO / Lemon\n");
 #  endif
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
   printf("# the code is compiled with openMP support\n");
   fprintf(parameterfile,
           "# the code is compiled with openMP support\n");
 #endif
   if( bc_flag == 0 ) {
-    printf("# Periodic boundary conditions are used\n");
+    printf("# Non-Schroedinger (anti-periodic, periodic or twisted) boundary conditions are used\n");
     fprintf(parameterfile, "# Periodic boundary conditions are used\n");
   }
   if( bc_flag == 1 ) {
@@ -130,10 +130,10 @@ int write_first_messages(FILE * parameterfile, char const * const executable, ch
     printf("# Even/odd preconditioning is not used\n");
     fprintf(parameterfile, "# Even/odd preconditioning is not used\n");
   }
-  printf("# beta = %f , kappa= %f\n", g_beta, g_kappa);
+  printf("# beta = %.12f , kappa= %.12f\n", g_beta, g_kappa);
   printf("# boundary conditions for fermion fields (t,x,y,z) * pi: %f %f %f %f \n",X0,X1,X2,X3);
   if( strcmp(executable,"hmc") == 0 ) {
-    printf("# mu = %f\n", g_mu/2./g_kappa);
+    printf("# mu = %.12f\n", g_mu/2./g_kappa);
     printf("# g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1);
     printf("# Using %s precision for the inversions!\n", 
 	   g_relative_precision_flag ? "relative" : "absolute");
@@ -141,20 +141,20 @@ int write_first_messages(FILE * parameterfile, char const * const executable, ch
   fprintf(parameterfile, "# The lattice size is %d x %d x %d x %d\n", (int)(g_nproc_t*T), (int)(g_nproc_x*LX), 
 	  (int)(g_nproc_y*LY), (int)(g_nproc_z*LZ));
   fprintf(parameterfile, "# The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY), (int)(LZ));
-  fprintf(parameterfile, "# g_beta = %f , g_kappa= %f, g_kappa*csw/8= %f \n",g_beta,g_kappa,g_ka_csw_8);
+  fprintf(parameterfile, "# g_beta = %.12f , g_kappa= %.12f, c_sw = %.12f \n",g_beta,g_kappa,g_c_sw);
   fprintf(parameterfile, "# boundary conditions for fermion fields (t,x,y,z) * pi: %f %f %f %f \n",X0,X1,X2,X3);
   if( strcmp(executable,"hmc") == 0 ) {
     fprintf(parameterfile, "# Nmeas=%d, Nsave=%d \n",
 	    Nmeas,Nsave);
-    fprintf(parameterfile, "# mu = %f\n", g_mu/2./g_kappa);
+    fprintf(parameterfile, "# mu = %.12f\n", g_mu/2./g_kappa);
     fprintf(parameterfile, "# g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1);
     fprintf(parameterfile, "# Using %s precision for the inversions!\n", 
 	    g_relative_precision_flag ? "relative" : "absolute");
   }
   if( strcmp(executable,"invert") == 0 ) {
-    printf("# beta = %f, mu = %f, kappa = %f\n", g_beta, g_mu/2./g_kappa, g_kappa);
+    printf("# beta = %.12f, mu = %.12f, kappa = %.12f\n", g_beta, g_mu/2./g_kappa, g_kappa);
     fprintf(parameterfile,
-	    "# beta = %f, mu = %f, kappa = %f\n", g_beta, g_mu/2./g_kappa, g_kappa);
+	    "# beta = %.12f, mu = %.12f, kappa = %.12f\n", g_beta, g_mu/2./g_kappa, g_kappa);
   }
   fflush(stdout); fflush(parameterfile);
   return(0);
diff --git a/io/utils_write_inverter_info.c b/io/utils_write_inverter_info.c
index 82589789d..a37315a4f 100644
--- a/io/utils_write_inverter_info.c
+++ b/io/utils_write_inverter_info.c
@@ -31,7 +31,7 @@ void write_inverter_info(WRITER * writer, paramsInverterInfo const *info)
                      "multiple mass solver\n"
                      "epssq = %e\n"
                      "noiter = %d\n"
-                     "kappa = %f, inverted mu = %f, lowest mu = %f\n"
+                     "kappa = %.12f, inverted mu = %.12f, lowest mu = %.12f\n"
                      "inverter version = %s\n"
                      "date = %s",
                      info->inverter,
@@ -45,7 +45,7 @@ void write_inverter_info(WRITER * writer, paramsInverterInfo const *info)
       sprintf(message, "solver = %s\n"
                        "epssq = %e\n"
                        "noiter = %d\n"
-                       "kappa = %f, mu = %f\n"
+                       "kappa = %.12f, mu = %.12f\n"
                        "inverter version = %s\n"
                        "date = %s",
                        info->inverter,
@@ -56,7 +56,7 @@ void write_inverter_info(WRITER * writer, paramsInverterInfo const *info)
       sprintf(message, "solver = %s\n"
                        "epssq = %e\n"
                        "noiter = %d\n"
-                       "kappa = %f, mubar = %f, epsbar=%f\n"
+                       "kappa = %.12f, mubar = %.12f, epsbar=%.12f\n"
                        "inverter version = %s\n"
                        "date = %s",
                        info->inverter,
diff --git a/io/utils_write_xlf.c b/io/utils_write_xlf.c
index 2d5546bd1..3651f1106 100644
--- a/io/utils_write_xlf.c
+++ b/io/utils_write_xlf.c
@@ -31,11 +31,11 @@ void write_xlf_info(WRITER * writer, paramsXlfInfo const *info)
   if (info->kappa != 0.0) {
     sprintf(message, "plaquette = %14.12f\n"
                      " trajectory nr = %d\n"
-                     " beta = %f, kappa = %f, mu = %f, c2_rec = %f\n"
+                     " beta = %.12f, kappa = %.12f, mu = %.12f, c2_rec = %f\n"
                      " time = %ld\n"
                      " hmcversion = %s\n"
-                     " mubar = %f\n"
-                     " epsilonbar = %f\n"
+                     " mubar = %.12f\n"
+                     " epsilonbar = %.12f\n"
                      " date = %s",
                      info->plaq, info->counter, info->beta, info->kappa,
                      info->mu, info->c2_rec, info->time, info->package_version,
@@ -44,9 +44,9 @@ void write_xlf_info(WRITER * writer, paramsXlfInfo const *info)
   else {
     sprintf(message, "plaquette = %e\n"
                      " trajectory nr = %d\n"
-                     " beta = %f\n"
-                     " kappa = %f\n"
-                     " 2*kappa*mu = %f\n"
+                     " beta = %.12f\n"
+                     " kappa = %.12f\n"
+                     " 2*kappa*mu = %.12f\n"
                      " c2_rec = %f\n"
                      " date = %s",
                      info->plaq, info->counter, info->beta, info->kappa,
diff --git a/io/utils_write_xlf_xml.c b/io/utils_write_xlf_xml.c
index d536d646b..4fa67dfd2 100644
--- a/io/utils_write_xlf_xml.c
+++ b/io/utils_write_xlf_xml.c
@@ -33,14 +33,14 @@ void write_xlf_info_xml(WRITER * writer, paramsXlfInfo const *info)
         "<xlf-info>\n"
         "  <plaquette>%14.12f</plaquette>\n"
         "  <trajectory>%d</trajectory>\n"
-        "  <beta>%f</beta>\n"
-        "  <kappa>%f</kappa>\n"
-        "  <mu>%f</mu>\n"
+        "  <beta>%.12f</beta>\n"
+        "  <kappa>%.12f</kappa>\n"
+        "  <mu>%.12f</mu>\n"
         "  <c2_rec>%f</c2_rec>\n"
         "  <time>%ld</time>\n"
         "  <hmcversion>%s</hmcversion>\n"
-        "  <mubar>%f</mubar>\n"
-        "  <epsilonbar>%f</epsilonbar>\n"
+        "  <mubar>%.12f</mubar>\n"
+        "  <epsilonbar>%.12f</epsilonbar>\n"
         "  <date>%s</date>\n"
         "</xlf-info>", info->plaq, info->counter, info->beta, info->kappa,
                        info->mu, info->c2_rec, info->time, info->package_version,
@@ -52,9 +52,9 @@ void write_xlf_info_xml(WRITER * writer, paramsXlfInfo const *info)
         "<xlf-info>\n"
         "  <plaquette>%e</plaquette>\n"
         "  <trajectory>%d</trajectory>\n"
-        "  <beta>%f</beta>\n"
-        "  <kappa>%f</kappa>\n"
-        "  <2kappamu>%f</2kappamu>\n"
+        "  <beta>%.12f</beta>\n"
+        "  <kappa>%.12f</kappa>\n"
+        "  <2kappamu>%.12f</2kappamu>\n"
         "  <c2_rec>%f</c2_rec>\n"
         "  <date>%s</date>\n"
         "</xlf-info>", info->plaq, info->counter, info->beta, info->kappa,
diff --git a/jacobi.c b/jacobi.c
index b43b5eb23..721d542c3 100644
--- a/jacobi.c
+++ b/jacobi.c
@@ -23,7 +23,7 @@
  *
  *******************************************************************************/
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -31,7 +31,7 @@
 #include <math.h>
 #include <errno.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -45,7 +45,7 @@ void Jacobi(su3_vector * const l, su3_vector * const k,int t)
   int ix,mu,tcoord,coord;
   su3_vector lt;
         
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_jacobi(k);
 #endif
 
@@ -66,7 +66,7 @@ void Jacobi(su3_vector * const l, su3_vector * const k,int t)
 	  l[ix].c2 -= lt.c2;
 	}
     }
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_jacobi(l);
 #endif
 }
diff --git a/kahan_summation.h b/kahan_summation.h
new file mode 100644
index 000000000..825054b41
--- /dev/null
+++ b/kahan_summation.h
@@ -0,0 +1,101 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2018 Bartosz Kostrzewa 
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ************************************************************************/
+
+#ifndef KAHAN_SUMMATION_H
+#define KAHAN_SUMMATION_H
+
+typedef struct kahan_re_t {
+  double kc;
+  double ks;
+  double ts;
+  double tr;
+  double tt;
+} kahan_re_t;
+
+typedef struct kahan_cplx_t {
+  complex double kc;
+  complex double ks;
+  complex double ts;
+  complex double tr;
+  complex double tt;
+} kahan_cplx_t;
+
+static inline void kahan_sum_re_step(const double in, kahan_re_t * const acc){
+  acc->tr = in + acc->kc;
+  acc->ts = acc->tr + acc->ks;
+  acc->tt = acc->ts - acc->ks;
+  acc->ks = acc->ts;
+  acc->kc = acc->tr - acc->tt;
+}
+
+static inline double kahan_sum_re_final(const kahan_re_t * acc){
+  return( acc->kc + acc->ks );
+}
+
+static inline void kahan_sum_cplx_step(const complex double in, kahan_cplx_t * const acc){
+  acc->tr = in + acc->kc;
+  acc->ts = acc->tr + acc->ks;
+  acc->tt = acc->ts - acc->ks;
+  acc->ks = acc->ts;
+  acc->kc = acc->tr - acc->tt;
+}
+
+static inline complex double kahan_sum_cplx_final(const kahan_cplx_t * const acc){
+  return( acc->kc + acc->ks );
+}
+
+static inline kahan_re_t new_kahan_re(){
+  kahan_re_t ret;
+  ret.kc = 0.0;
+  ret.ks = 0.0;
+  ret.ts = 0.0;
+  ret.tr = 0.0;
+  ret.tt = 0.0;
+  return(ret);
+}
+
+static inline kahan_cplx_t new_kahan_cplx(){
+  kahan_cplx_t ret;
+  ret.kc = 0.0;
+  ret.ks = 0.0;
+  ret.ts = 0.0;
+  ret.tr = 0.0;
+  ret.tt = 0.0;
+  return(ret);
+}
+
+static inline void reset_kahan_re(kahan_re_t * const in){
+  in->kc = 0.0;
+  in->ks = 0.0;
+  in->ts = 0.0;
+  in->tr = 0.0;
+  in->tt = 0.0;
+}
+
+static inline void reset_kahan_cplx(kahan_cplx_t * const in){
+  in->kc = 0.0;
+  in->ks = 0.0;
+  in->ts = 0.0;
+  in->tr = 0.0;
+  in->tt = 0.0;
+}
+
+#endif
diff --git a/linalg/Makefile.in b/linalg/Makefile.in
index d7e749859..0c953e542 100644
--- a/linalg/Makefile.in
+++ b/linalg/Makefile.in
@@ -33,17 +33,21 @@ LIBRARIES = liblinalg
 liblinalg_TARGETS = assign_add_mul_r_add_mul \
 	assign_mul_bra_add_mul_ket_add_r \
 	scalar_prod_r scalar_prod_i \
-	square_and_prod_r assign_mul_bra_add_mul_r mul_r \
-	diff_and_square_norm assign \
-	scalar_prod mul_diff_r mul_diff_mul assign_add_mul add \
+	square_and_prod_r assign_mul_bra_add_mul_r mul_r mul_r_32 \
+	diff_and_square_norm square_and_minmax assign \
+	scalar_prod mul_diff_r mul_diff_mul assign_add_mul assign_mul_add add \
 	assign_diff_mul mul_add_mul mul assign_add_mul_add_mul \
 	assign_mul_bra_add_mul_ket_add assign_mul_add_mul_add_mul_add_mul_r \
 	mul_diff_mul_r assign_add_mul_add_mul_r \
-        comp_decomp \
-	convert_eo_to_lexic assign_mul_add_mul_r mul_add_mul_r \
-	assign_mul_add_mul_add_mul_r mattimesvec \
+        comp_decomp assign_mul_add_mul \
+	convert_eo_to_lexic assign_mul_add_mul_r assign_mul_add_mul_r_32 \
+	mul_add_mul_r assign_mul_add_mul_add_mul_r mattimesvec \
 	scalar_prod_su3spinor \
-	assign_mul_add_r_and_square
+	assign_mul_add_r_and_square \
+	addto_32 scalar_prod_r_32 assign_mul_add_r_32 assign_add_mul_r_32 \
+	square_norm_32 assign_to_32 diff_32 \
+	convert_odd_to_lexic convert_even_to_lexic set_even_to_zero \
+	mul_gamma5 mul_r_gamma5
 
 liblinalg_STARGETS = diff assign_add_mul_r assign_mul_add_r square_norm
 
@@ -71,10 +75,10 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-${liblinalg_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${liblinalg_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${OPTARGS} -c $<
 
-${liblinalg_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${liblinalg_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${SOPTARGS} -c $<
 
 # rule to make liblinalg
diff --git a/linalg/add.c b/linalg/add.c
index 0866d4187..8b1d546d4 100644
--- a/linalg/add.c
+++ b/linalg/add.c
@@ -23,9 +23,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -37,7 +37,7 @@
 #if (defined BGQ && defined XLC)
 
 void add(spinor * const Q,const spinor * const R,const spinor * const S, const int N) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -56,7 +56,7 @@ void add(spinor * const Q,const spinor * const R,const spinor * const S, const i
   __prefetch_by_load(R);
   __prefetch_by_stream(1, Q);
 
-#ifndef OMP
+#ifndef TM_USE_OMP
 #pragma unroll(2)
 #else
 #pragma omp for
@@ -94,7 +94,7 @@ void add(spinor * const Q,const spinor * const R,const spinor * const S, const i
     vec_st(z5, 0, q+20);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /*OpenMP parallel closing brace */
 #endif
   return;
@@ -104,7 +104,7 @@ void add(spinor * const Q,const spinor * const R,const spinor * const S, const i
 
 /* Q output, R input, S input */
 void add(spinor * const Q,const spinor * const R,const spinor * const S, const int N){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -112,7 +112,7 @@ void add(spinor * const Q,const spinor * const R,const spinor * const S, const i
   int ix;
   spinor *q,*r,*s;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix = 0; ix < N; ix++){
@@ -137,7 +137,7 @@ void add(spinor * const Q,const spinor * const R,const spinor * const S, const i
     q->s3.c2 = r->s3.c2 + s->s3.c2;
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
diff --git a/linalg/addto_32.c b/linalg/addto_32.c
new file mode 100644
index 000000000..6c1c1bd6d
--- /dev/null
+++ b/linalg/addto_32.c
@@ -0,0 +1,55 @@
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "su3.h"
+#include "addto_32.h"
+
+
+
+/* Q input and output (accumulated in place), R input */
+void addto_32(spinor * const Q, const spinor32 * const R, const int N)
+{
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+
+  int ix;
+  spinor *q;
+  spinor32 * r;
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (ix = 0; ix < N; ix++){
+    q=(spinor *) Q + ix;
+    r=(spinor32 *) R + ix;
+
+    
+    q->s0.c0 += r->s0.c0;
+    q->s0.c1 += r->s0.c1;
+    q->s0.c2 += r->s0.c2;
+    
+    q->s1.c0 += r->s1.c0;
+    q->s1.c1 += r->s1.c1;
+    q->s1.c2 += r->s1.c2;
+    
+    q->s2.c0 += r->s2.c0;
+    q->s2.c1 += r->s2.c1;
+    q->s2.c2 += r->s2.c2;
+    
+    q->s3.c0 += r->s3.c0;
+    q->s3.c1 += r->s3.c1;
+    q->s3.c2 += r->s3.c2;
+  }
+
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+
+}
diff --git a/linalg/addto_32.h b/linalg/addto_32.h
new file mode 100644
index 000000000..9afbc350c
--- /dev/null
+++ b/linalg/addto_32.h
@@ -0,0 +1,10 @@
+#ifndef _ADDTO_32_H
+#define _ADDTO_32_H
+
+#include "su3.h"
+
+/* Makes the sum (*Q) = (*Q) + (*R) */
+void addto_32(spinor * const Q, const spinor32 * const R, const int N);
+
+
+#endif
\ No newline at end of file
diff --git a/linalg/assign.c b/linalg/assign.c
index 6abb7f47b..927732a11 100644
--- a/linalg/assign.c
+++ b/linalg/assign.c
@@ -26,7 +26,7 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -45,6 +45,214 @@ void assign(spinor * const R, spinor * const S, const int N)
   return;
 }
 
+void bispinor_assign(bispinor * const R, bispinor * const S, const int N)
+{
+  memcpy(R, S, N*sizeof(bispinor));
+  return;
+}
+
+//copy a complex double array S of N elements into a bispinor array R of N/24 elements
+/* S input, R output        */
+/* S and R must not overlap */
+void assign_complex_to_bispinor(bispinor * const R, _Complex double * const S, const int N)
+{
+
+  int k; //spinor index
+  bispinor *r;
+  _Complex double *s;
+
+  k=0;
+  for(int ix=0; ix<N ; ix +=24)
+  {
+     s=S + ix;
+     r=R + k;
+
+     (r->sp_up).s0.c0 = *s;
+     (r->sp_up).s0.c1 = *(s+1);
+     (r->sp_up).s0.c2 = *(s+2);
+
+     (r->sp_up).s1.c0 = *(s+3);
+     (r->sp_up).s1.c1 = *(s+4);
+     (r->sp_up).s1.c2 = *(s+5);
+
+     (r->sp_up).s2.c0 = *(s+6);
+     (r->sp_up).s2.c1 = *(s+7);
+     (r->sp_up).s2.c2 = *(s+8);
+
+
+     (r->sp_up).s3.c0 = *(s+9);
+     (r->sp_up).s3.c1 = *(s+10);
+     (r->sp_up).s3.c2 = *(s+11);
+     s=S + ix + 12;
+     
+     (r->sp_dn).s0.c0 = *s;
+     (r->sp_dn).s0.c1 = *(s+1);
+     (r->sp_dn).s0.c2 = *(s+2);
+
+     (r->sp_dn).s1.c0 = *(s+3);
+     (r->sp_dn).s1.c1 = *(s+4);
+     (r->sp_dn).s1.c2 = *(s+5);
+
+     (r->sp_dn).s2.c0 = *(s+6);
+     (r->sp_dn).s2.c1 = *(s+7);
+     (r->sp_dn).s2.c2 = *(s+8);
+
+
+     (r->sp_dn).s3.c0 = *(s+9);
+     (r->sp_dn).s3.c1 = *(s+10);
+     (r->sp_dn).s3.c2 = *(s+11);
+
+     k++;
+  }
+
+  return;
+}
+
+
+
+//copy a bispinor array S of N/24 elements into a complex double array R of N elements
+/* S input, R output        */
+/* S and R must not overlap */
+void assign_bispinor_to_complex(_Complex double * const R, bispinor * const S, const int N)
+{
+
+  int k; //complex double index
+  _Complex double *r;
+  bispinor *s;
+  int n=N/24;
+  k=0;
+  for(int ix=0; ix<n ; ix++)
+  {
+     s=S+ix;
+     r=R+k;
+
+     *r     =  (s->sp_up).s0.c0;
+     *(r+1) =  (s->sp_up).s0.c1;
+     *(r+2) =  (s->sp_up).s0.c2;
+
+     *(r+3) =  (s->sp_up).s1.c0;
+     *(r+4) =  (s->sp_up).s1.c1;
+     *(r+5) =  (s->sp_up).s1.c2;
+
+     *(r+6) =  (s->sp_up).s2.c0;
+     *(r+7) =  (s->sp_up).s2.c1;
+     *(r+8) =  (s->sp_up).s2.c2;
+
+     *(r+9)  =  (s->sp_up).s3.c0;
+     *(r+10) =  (s->sp_up).s3.c1;
+     *(r+11) =  (s->sp_up).s3.c2;
+
+     k +=12;
+
+     r=R+k;
+
+     *r     =  (s->sp_dn).s0.c0;
+     *(r+1) =  (s->sp_dn).s0.c1;
+     *(r+2) =  (s->sp_dn).s0.c2;
+
+     *(r+3) =  (s->sp_dn).s1.c0;
+     *(r+4) =  (s->sp_dn).s1.c1;
+     *(r+5) =  (s->sp_dn).s1.c2;
+
+     *(r+6) =  (s->sp_dn).s2.c0;
+     *(r+7) =  (s->sp_dn).s2.c1;
+     *(r+8) =  (s->sp_dn).s2.c2;
+
+     *(r+9)  =  (s->sp_dn).s3.c0;
+     *(r+10) =  (s->sp_dn).s3.c1;
+     *(r+11) =  (s->sp_dn).s3.c2;
+    
+     k+=12;
+
+
+  }
+
+  return;
+}          
+
+/* S and R must not overlap */
+void assign_complex_to_spinor(spinor * const R, _Complex double * const S, const int N)
+{
+
+  int k; //spinor index
+  spinor *r;
+  _Complex double *s;
+
+  k=0;
+  for(int ix=0; ix<N ; ix +=12)
+  {
+     s=S+ix;
+     r=R+k;
+
+     (r->s0).c0 = *s;
+     (r->s0).c1 = *(s+1);
+     (r->s0).c2 = *(s+2);
+             
+     (r->s1).c0 = *(s+3);
+     (r->s1).c1 = *(s+4);
+     (r->s1).c2 = *(s+5);
+
+     (r->s2).c0 = *(s+6);
+     (r->s2).c1 = *(s+7);
+     (r->s2).c2 = *(s+8);
+
+
+     (r->s3).c0 = *(s+9);
+     (r->s3).c1 = *(s+10);
+     (r->s3).c2 = *(s+11);
+
+     k++;
+  }
+  
+  return;
+}
+
+
+
+//copy a spinor array S of N/12 elements into a complex double array R of N elements
+/* S input, R output        */
+/* S and R must not overlap */
+void assign_spinor_to_complex(_Complex double * const R, spinor * const S, const int N)
+{
+
+  int k; //complex double index
+  _Complex double *r;
+  spinor *s;
+
+  k=0;
+  int n=N/12;
+  for(int ix=0; ix<n ; ix++)
+  {
+     s=S+ix;
+     r=R+k;
+
+     *r     =  (s->s0).c0;
+     *(r+1) =  (s->s0).c1;
+     *(r+2) =  (s->s0).c2;
+
+     *(r+3) =  (s->s1).c0;
+     *(r+4) =  (s->s1).c1;
+     *(r+5) =  (s->s1).c2;
+
+     *(r+6) =  (s->s2).c0;
+     *(r+7) =  (s->s2).c1;
+     *(r+8) =  (s->s2).c2;
+
+     *(r+9)  =  (s->s3).c0;
+     *(r+10) =  (s->s3).c1;
+     *(r+11) =  (s->s3).c2;
+
+     k +=12;
+  }
+
+  return;
+}
+
+void assign_32(spinor32 * const R, spinor32 * const S, const int N)
+{
+  memcpy(R, S, N*sizeof(spinor32));
+  return;
+}
 
 #ifdef WITHLAPH
 void assign_su3vect(su3_vector * const R, su3_vector * const S, const int N)
diff --git a/linalg/assign.h b/linalg/assign.h
index f7ad648a5..67fa15d66 100644
--- a/linalg/assign.h
+++ b/linalg/assign.h
@@ -23,7 +23,14 @@
 #include "su3.h"
 
 /* Assign (*R) = (*S) */
+void bispinor_assign( bispinor * const R, bispinor * const S, const int N);
 void assign(spinor * const R, spinor * const S, const int N);
+void assign_32(spinor32 * const R, spinor32 * const S, const int N);
 void assign_su3vect(su3_vector * const R, su3_vector * const S, const int N);
+void assign_complex_to_bispinor(bispinor * const R, _Complex double * const S, const int N); //N is the size of S
+void assign_bispinor_to_complex(_Complex double * const R, bispinor * const S, const int N); //N is the size of R (S holds N/24 bispinors)
+
+void assign_complex_to_spinor(spinor * const R, _Complex double * const S, const int N); //N is the size of S
+void assign_spinor_to_complex(_Complex double * const R, spinor * const S, const int N); //N is the size of R (S holds N/12 spinors)
 
 #endif
diff --git a/linalg/assign_add_mul.c b/linalg/assign_add_mul.c
index 127092fa0..3ee38305e 100644
--- a/linalg/assign_add_mul.c
+++ b/linalg/assign_add_mul.c
@@ -26,9 +26,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -38,43 +38,23 @@
 #include "assign_add_mul.h"
 
 
-void assign_add_mul(spinor * const R, spinor * const S, const _Complex double c, const int N)
-{
-#ifdef OMP
-#pragma omp parallel
-  {
-#endif
-  spinor *r,*s;
-
-#ifdef OMP
-#pragma omp for
-#endif
-  for (int ix=0; ix<N; ix++)
-  {
-    r=(spinor *) R + ix;
-    s=(spinor *) S + ix;
+#define _C_TYPE _Complex double
+#define _PSWITCH(s) s
+#define _PTSWITCH(s) s
 
-    r->s0.c0 += c * s->s0.c0;
-    r->s0.c1 += c * s->s0.c1;
-    r->s0.c2 += c * s->s0.c2;
+#include"assign_add_mul_body.c"
 
-    r->s1.c0 += c * s->s1.c0;
-    r->s1.c1 += c * s->s1.c1;
-    r->s1.c2 += c * s->s1.c2;
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
 
-    r->s2.c0 += c * s->s2.c0;
-    r->s2.c1 += c * s->s2.c1;
-    r->s2.c2 += c * s->s2.c2;
-
-    r->s3.c0 += c * s->s3.c0;
-    r->s3.c1 += c * s->s3.c1;
-    r->s3.c2 += c * s->s3.c2;
-  }
-
-#ifdef OMP
-  } /* OpenMP closing brace */
-#endif
+#define _C_TYPE _Complex float
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
 
-}
+#include"assign_add_mul_body.c"
 
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
 
diff --git a/linalg/assign_add_mul.h b/linalg/assign_add_mul.h
index 89d67ee32..0d5167a9f 100644
--- a/linalg/assign_add_mul.h
+++ b/linalg/assign_add_mul.h
@@ -24,5 +24,9 @@
 
 /*   (*P) = (*P) + c(*Q)        c is a complex constant   */
 void assign_add_mul(spinor * const P, spinor * const Q, const _Complex double c, const int N);
+void assign_add_mul_32(spinor32 * const P, spinor32 * const Q, const _Complex float c, const int N);
+void assign_add_mul_ts(spinor * const P, spinor * const Q, const _Complex double c, const int N);
+void assign_add_mul_ts_32(spinor32 * const P, spinor32 * const Q, const _Complex float c, const int N);
+
 
 #endif
diff --git a/linalg/assign_add_mul_add_mul.c b/linalg/assign_add_mul_add_mul.c
index 34368eaf7..b1b70e985 100644
--- a/linalg/assign_add_mul_add_mul.c
+++ b/linalg/assign_add_mul_add_mul.c
@@ -26,9 +26,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -41,14 +41,14 @@
 /* S,U input, R inoutput, c1,c2 input */
 void assign_add_mul_add_mul(spinor * const R, spinor * const S, spinor * const U, const _Complex double c1, const _Complex double c2, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   spinor *r, *s, *u;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -74,7 +74,7 @@ void assign_add_mul_add_mul(spinor * const R, spinor * const S, spinor * const U
     r->s3.c2 += c1 * s->s3.c2 + c2 * u->s3.c2;
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   
diff --git a/linalg/assign_add_mul_add_mul_r.c b/linalg/assign_add_mul_add_mul_r.c
index 7d372cd37..9201087df 100644
--- a/linalg/assign_add_mul_add_mul_r.c
+++ b/linalg/assign_add_mul_add_mul_r.c
@@ -26,7 +26,7 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/linalg/assign_add_mul_body.c b/linalg/assign_add_mul_body.c
new file mode 100644
index 000000000..d4828b3ad
--- /dev/null
+++ b/linalg/assign_add_mul_body.c
@@ -0,0 +1,88 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+void _PSWITCH(assign_add_mul)(_PTSWITCH(spinor) * const R, _PTSWITCH(spinor) * const S, 
+                              const _C_TYPE c, const int N)
+{
+  /* In-place update R[i] += c * S[i] for i in [0, N), applied to each of the
+   * twelve components s{0..3}.c{0..2} of an element.
+   *   R : read and written         S : only read
+   *   c : common factor (_C_TYPE)  N : number of elements processed
+   * With TM_USE_OMP the iterations are shared among the threads of a parallel
+   * region opened here; without it this is a plain serial loop. */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#pragma omp for
+#endif
+    for (int site = 0; site < N; ++site)
+      {
+        _PTSWITCH(spinor) * const dst = R + site;
+        _PTSWITCH(spinor) * const src = S + site;
+
+        dst->s0.c0 += c * src->s0.c0;
+        dst->s0.c1 += c * src->s0.c1;
+        dst->s0.c2 += c * src->s0.c2;
+
+        dst->s1.c0 += c * src->s1.c0;
+        dst->s1.c1 += c * src->s1.c1;
+        dst->s1.c2 += c * src->s1.c2;
+
+        dst->s2.c0 += c * src->s2.c0;
+        dst->s2.c1 += c * src->s2.c1;
+        dst->s2.c2 += c * src->s2.c2;
+
+        dst->s3.c0 += c * src->s3.c0;
+        dst->s3.c1 += c * src->s3.c1;
+        dst->s3.c2 += c * src->s3.c2;
+      }
+#ifdef TM_USE_OMP
+  } /* end of the parallel region */
+#endif
+}
+
+void _PSWITCH(assign_add_mul_ts)(_PTSWITCH(spinor) * const R, _PTSWITCH(spinor) * const S, 
+                                 const _C_TYPE c, const int N)
+{
+  /* R[i] += c * S[i] for i in [0, N), over the twelve components
+   * s{0..3}.c{0..2} of each element. R is updated in place, S is only read.
+   * This variant contains no OpenMP directive: the whole index range is
+   * walked by whichever thread makes the call. */
+  for (int site = 0; site < N; ++site)
+    {
+      _PTSWITCH(spinor) * const dst = R + site;
+      _PTSWITCH(spinor) * const src = S + site;
+
+      dst->s0.c0 += c * src->s0.c0;
+      dst->s0.c1 += c * src->s0.c1;
+      dst->s0.c2 += c * src->s0.c2;
+
+      dst->s1.c0 += c * src->s1.c0;
+      dst->s1.c1 += c * src->s1.c1;
+      dst->s1.c2 += c * src->s1.c2;
+
+      dst->s2.c0 += c * src->s2.c0;
+      dst->s2.c1 += c * src->s2.c1;
+      dst->s2.c2 += c * src->s2.c2;
+
+      dst->s3.c0 += c * src->s3.c0;
+      dst->s3.c1 += c * src->s3.c1;
+      dst->s3.c2 += c * src->s3.c2;
+    }
+}
diff --git a/linalg/assign_add_mul_r.c b/linalg/assign_add_mul_r.c
index d81ca94ea..18532ce5d 100644
--- a/linalg/assign_add_mul_r.c
+++ b/linalg/assign_add_mul_r.c
@@ -21,14 +21,14 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include "su3.h"
 #include "assign_add_mul_r.h"
@@ -41,7 +41,7 @@
 
 void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -52,16 +52,16 @@ void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const
 			:
 			:
 			"m" (c));
-#ifndef OMP
+#ifndef TM_USE_OMP
   s=&P[0].s0;
   r=&Q[0].s0;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix = 0;ix < 4*N; ix++) {
-#ifdef OMP
+#ifdef TM_USE_OMP
     s=&P[0].s0+ix;
     r=&Q[0].s0+ix;
 #endif
@@ -78,11 +78,11 @@ void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const
 			  :
 			  :);
     _sse_store(*s);
-#ifndef OMP
+#ifndef TM_USE_OMP
     s++; r++;
 #endif
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
@@ -90,7 +90,7 @@ void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const
 #elif (defined BGQ && defined XLC)
 
 void assign_add_mul_r(spinor * const R, spinor * const S, const double c, const int N) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -108,7 +108,7 @@ void assign_add_mul_r(spinor * const R, spinor * const S, const double c, const
   __alignx(32, S);
   __alignx(32, R);
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #else
 #pragma unroll(2)
@@ -143,7 +143,7 @@ void assign_add_mul_r(spinor * const R, spinor * const S, const double c, const
     vec_st(z4, 0, r+16);
     vec_st(z5, 0, r+20);
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
@@ -345,7 +345,7 @@ void assign_add_mul_r(spinor * const R, spinor * const S, const double c, const
 
 void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -353,7 +353,7 @@ void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const
   register spinor *q;
 
   /* Change due to even-odd preconditioning : VOLUME   to VOLUME/2 */   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -376,7 +376,7 @@ void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const
     p->s3.c1 += c * q->s3.c1;
     p->s3.c2 += c * q->s3.c2;
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
@@ -385,14 +385,14 @@ void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const
 #ifdef WITHLAPH
 void assign_add_mul_r_su3vect(su3_vector * const P, su3_vector * const Q, const double c, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   su3_vector *p,*q;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix) 
@@ -404,7 +404,7 @@ void assign_add_mul_r_su3vect(su3_vector * const P, su3_vector * const Q, const
     p->c1 += c * q->c1;
     p->c2 += c * q->c2;
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/assign_add_mul_r_32.c b/linalg/assign_add_mul_r_32.c
new file mode 100644
index 000000000..8a2254f1d
--- /dev/null
+++ b/linalg/assign_add_mul_r_32.c
@@ -0,0 +1,141 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+/*******************************************************************************
+ *
+ * File assign_add_mul_r_32.c
+ *
+ *   void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N)
+ *     (*R) = (*R) + c(*S)        c is a real constant
+ *
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "su3.h"
+#include "assign_add_mul_r_32.h"
+
+#if (defined BGQ && defined XLC)
+/* R[i] += c * S[i] for i in [0, N) with a real factor c (BGQ + XLC build). Each iteration
+ * runs six vec_ld/vec_madd/vec_st groups at float offsets 0,4,...,20 of one spinor32.
+ * Orphaned: under TM_USE_OMP the "omp for" binds to the caller's parallel region. */
+void assign_add_mul_r_32_orphaned(spinor32 * const R, spinor32 * const S, const float c, const int N) {
+  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
+  vector4double z0, z1, z2, z3, z4, z5, k;
+  float ALIGN32 _c = c;
+  __prefetch_by_load(S);
+  __prefetch_by_load(R);
+
+  k = vec_splats((double)_c);
+  /* Alignment hints are given for the base pointers only: the per-site pointers hold
+   * no value before the loop, so hinting them would read indeterminate data. */
+  __alignx(16, S);
+  __alignx(16, R);
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#else
+#pragma unroll(2)
+#endif
+  for(int i = 0; i < N; i++) {
+    float * const s = (float*)((spinor32 *) S + i);
+    float * const r = (float*)((spinor32 *) R + i);
+    __prefetch_by_load(S + i + 1);
+    __prefetch_by_stream(1, R + i + 1);
+    x0 = vec_ld(0, r);
+    x1 = vec_ld(0, r+4);
+    x2 = vec_ld(0, r+8);
+    x3 = vec_ld(0, r+12);
+    x4 = vec_ld(0, r+16);
+    x5 = vec_ld(0, r+20);
+    y0 = vec_ld(0, s);
+    y1 = vec_ld(0, s+4);
+    y2 = vec_ld(0, s+8);
+    y3 = vec_ld(0, s+12);
+    y4 = vec_ld(0, s+16);
+    y5 = vec_ld(0, s+20);
+    z0 = vec_madd(k, y0, x0);
+    z1 = vec_madd(k, y1, x1);
+    z2 = vec_madd(k, y2, x2);
+    z3 = vec_madd(k, y3, x3);
+    z4 = vec_madd(k, y4, x4);
+    z5 = vec_madd(k, y5, x5);
+    vec_st(z0, 0, r);
+    vec_st(z1, 0, r+4);
+    vec_st(z2, 0, r+8);
+    vec_st(z3, 0, r+12);
+    vec_st(z4, 0, r+16);
+    vec_st(z5, 0, r+20);
+  }
+}
+
+#else
+
+/* Portable kernel for R[i] += c * S[i], i in [0, N), with a real factor c,
+ * over the twelve components s{0..3}.c{0..2} of each spinor32.
+ * Orphaned worksharing: under TM_USE_OMP the "omp for" below binds to the
+ * parallel region of the caller; this function opens none itself. */
+void assign_add_mul_r_32_orphaned(spinor32 * const R, spinor32 * const S, const float c, const int N)
+{
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site) {
+    spinor32 * const dst = R + site;
+    spinor32 * const src = S + site;
+
+    dst->s0.c0 += c * src->s0.c0;
+    dst->s0.c1 += c * src->s0.c1;
+    dst->s0.c2 += c * src->s0.c2;
+
+    dst->s1.c0 += c * src->s1.c0;
+    dst->s1.c1 += c * src->s1.c1;
+    dst->s1.c2 += c * src->s1.c2;
+
+    dst->s2.c0 += c * src->s2.c0;
+    dst->s2.c1 += c * src->s2.c1;
+    dst->s2.c2 += c * src->s2.c2;
+
+    dst->s3.c0 += c * src->s3.c0;
+    dst->s3.c1 += c * src->s3.c1;
+    dst->s3.c2 += c * src->s3.c2;
+  }
+}
+
+#endif
+
+/* Threaded entry point for R[i] += c * S[i], i in [0, N), with a real factor c.
+ * Under TM_USE_OMP it opens the parallel region that the orphaned "omp for"
+ * inside assign_add_mul_r_32_orphaned binds to; without it, a direct call. */
+void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N)
+{
+#ifdef TM_USE_OMP
+#pragma omp parallel
+#endif
+  {
+    assign_add_mul_r_32_orphaned(R, S, c, N);
+  }
+}
+
diff --git a/linalg/assign_add_mul_r_32.h b/linalg/assign_add_mul_r_32.h
new file mode 100644
index 000000000..c3bec9ecf
--- /dev/null
+++ b/linalg/assign_add_mul_r_32.h
@@ -0,0 +1,9 @@
+#ifndef _ASSIGN_ADD_MUL_R_32_H
+#define _ASSIGN_ADD_MUL_R_32_H
+
+#include "su3.h"
+
+/*   (*R) = (*R) + c(*S) for N spinor32 elements;   c is a real constant   */
+void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N);
+
+#endif
diff --git a/linalg/assign_add_mul_r_add_mul.c b/linalg/assign_add_mul_r_add_mul.c
index bff23eddd..61dddf3c3 100644
--- a/linalg/assign_add_mul_r_add_mul.c
+++ b/linalg/assign_add_mul_r_add_mul.c
@@ -18,15 +18,15 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "su3.h"
@@ -86,14 +86,14 @@ void assign_add_mul_r_add_mul(spinor * const R, spinor * const S, spinor * const
 /* j, k input, l output */
 void assign_add_mul_r_add_mul(spinor * const R, spinor * const S, spinor * const U,
 			      const double c1,const double c2, const int N) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
    spinor *r,*s,*t;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
    for (int ix = 0; ix < N; ++ix)
@@ -119,7 +119,7 @@ void assign_add_mul_r_add_mul(spinor * const R, spinor * const S, spinor * const
      r->s3.c2 += c1 * s->s3.c2 + c2 * t->s3.c2;
    }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
diff --git a/linalg/assign_diff_mul.c b/linalg/assign_diff_mul.c
index 48b77167e..21b8c46f9 100644
--- a/linalg/assign_diff_mul.c
+++ b/linalg/assign_diff_mul.c
@@ -18,9 +18,9 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -30,13 +30,13 @@
 /* R=R-c*S */
 void assign_diff_mul(spinor * const R, spinor * const S, const _Complex double c, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
   spinor *r, *s;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -60,7 +60,7 @@ void assign_diff_mul(spinor * const R, spinor * const S, const _Complex double c
     r->s3.c1 -= c * s->s3.c1;
     r->s3.c2 -= c * s->s3.c2;
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/assign_mul_add.c b/linalg/assign_mul_add.c
new file mode 100644
index 000000000..99352cb28
--- /dev/null
+++ b/linalg/assign_mul_add.c
@@ -0,0 +1,71 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "su3.h"
+#include "assign_mul_add.h"
+
+/* R[i] = c * R[i] + S[i] for i in [0, N), with a complex factor c, over the
+ * twelve components s{0..3}.c{0..2} of each spinor.
+ *   R : scaled and overwritten in place
+ *   S : added, only read
+ * With TM_USE_OMP the iterations are shared among the threads of a parallel
+ * region opened here. */
+void assign_mul_add(spinor * const R, const _Complex double c, spinor * const S, const int N)
+{
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site)
+  {
+    spinor * const acc = R + site;
+    spinor * const add = S + site;
+
+    acc->s0.c0 = c * acc->s0.c0 + add->s0.c0;
+    acc->s0.c1 = c * acc->s0.c1 + add->s0.c1;
+    acc->s0.c2 = c * acc->s0.c2 + add->s0.c2;
+
+    acc->s1.c0 = c * acc->s1.c0 + add->s1.c0;
+    acc->s1.c1 = c * acc->s1.c1 + add->s1.c1;
+    acc->s1.c2 = c * acc->s1.c2 + add->s1.c2;
+
+    acc->s2.c0 = c * acc->s2.c0 + add->s2.c0;
+    acc->s2.c1 = c * acc->s2.c1 + add->s2.c1;
+    acc->s2.c2 = c * acc->s2.c2 + add->s2.c2;
+
+    acc->s3.c0 = c * acc->s3.c0 + add->s3.c0;
+    acc->s3.c1 = c * acc->s3.c1 + add->s3.c1;
+    acc->s3.c2 = c * acc->s3.c2 + add->s3.c2;
+  }
+#ifdef TM_USE_OMP
+  } /* end of the parallel region */
+#endif
+}
+
+
diff --git a/linalg/assign_mul_add.h b/linalg/assign_mul_add.h
new file mode 100644
index 000000000..4571c80e4
--- /dev/null
+++ b/linalg/assign_mul_add.h
@@ -0,0 +1,28 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _ASSIGN_MUL_ADD_H
+#define _ASSIGN_MUL_ADD_H
+
+#include "su3.h"
+
+/*   (*P) = c(*P) + (*Q) for N spinors; c is a complex constant; P is updated in place, Q is only read   */
+void assign_mul_add(spinor * const P, const _Complex double c, spinor * const Q, const int N);
+
+#endif
diff --git a/linalg/assign_mul_add_mul.c b/linalg/assign_mul_add_mul.c
new file mode 100644
index 000000000..16a8ce3f9
--- /dev/null
+++ b/linalg/assign_mul_add_mul.c
@@ -0,0 +1,70 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+#include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "su3.h"
+#include "assign_mul_add_mul.h"
+
+/* R[i] = c1 * R[i] + c2 * S[i] for i in [0, N); c1 and c2 are complex
+ * constants. The update covers the twelve components s{0..3}.c{0..2} of each
+ * spinor. R is overwritten in place, S is only read.
+ * With TM_USE_OMP the iterations are shared among the threads of a parallel
+ * region opened here. */
+void assign_mul_add_mul(spinor * const R, const _Complex double c1, spinor * const S, const _Complex double c2, const int N){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site) {
+    spinor * const out = R + site;
+    spinor * const in = S + site;
+
+    out->s0.c0 = c1 * out->s0.c0 + c2 * in->s0.c0;
+    out->s0.c1 = c1 * out->s0.c1 + c2 * in->s0.c1;
+    out->s0.c2 = c1 * out->s0.c2 + c2 * in->s0.c2;
+
+    out->s1.c0 = c1 * out->s1.c0 + c2 * in->s1.c0;
+    out->s1.c1 = c1 * out->s1.c1 + c2 * in->s1.c1;
+    out->s1.c2 = c1 * out->s1.c2 + c2 * in->s1.c2;
+
+    out->s2.c0 = c1 * out->s2.c0 + c2 * in->s2.c0;
+    out->s2.c1 = c1 * out->s2.c1 + c2 * in->s2.c1;
+    out->s2.c2 = c1 * out->s2.c2 + c2 * in->s2.c2;
+
+    out->s3.c0 = c1 * out->s3.c0 + c2 * in->s3.c0;
+    out->s3.c1 = c1 * out->s3.c1 + c2 * in->s3.c1;
+    out->s3.c2 = c1 * out->s3.c2 + c2 * in->s3.c2;
+  }
+#ifdef TM_USE_OMP
+  } /* end of the parallel region */
+#endif
+}
+
diff --git a/linalg/assign_mul_add_mul.h b/linalg/assign_mul_add_mul.h
new file mode 100644
index 000000000..59e9194c1
--- /dev/null
+++ b/linalg/assign_mul_add_mul.h
@@ -0,0 +1,28 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _ASSIGN_MUL_ADD_MUL_H
+#define _ASSIGN_MUL_ADD_MUL_H
+
+#include "su3.h"
+
+/* Makes (*R)=c1*(*R)+c2*(*S) for N spinors; c1 and c2 are complex constants; R is updated in place, S is only read */
+void assign_mul_add_mul(spinor * const R, const _Complex double c1, spinor * const S, const _Complex double c2, const int N);
+ 
+#endif
diff --git a/linalg/assign_mul_add_mul_add_mul_add_mul_r.c b/linalg/assign_mul_add_mul_add_mul_add_mul_r.c
index 72e123597..82e75ea29 100644
--- a/linalg/assign_mul_add_mul_add_mul_add_mul_r.c
+++ b/linalg/assign_mul_add_mul_add_mul_add_mul_r.c
@@ -27,9 +27,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -44,14 +44,14 @@
 void assign_mul_add_mul_add_mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U, spinor * const V,
 					  const double c1, const double c2, const double c3, const double c4, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   spinor *r, *s, *u, *v;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -77,7 +77,7 @@ void assign_mul_add_mul_add_mul_add_mul_r(spinor * const R, spinor * const S, sp
     r->s3.c1 = c1 * r->s3.c1 + c2 * s->s3.c1 + c3 * u->s3.c1 + c4 * v->s3.c1;
     r->s3.c2 = c1 * r->s3.c2 + c2 * s->s3.c2 + c3 * u->s3.c2 + c4 * v->s3.c2;
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/assign_mul_add_mul_add_mul_r.c b/linalg/assign_mul_add_mul_add_mul_r.c
index 6ef2d6006..eea427b5f 100644
--- a/linalg/assign_mul_add_mul_add_mul_r.c
+++ b/linalg/assign_mul_add_mul_add_mul_r.c
@@ -27,9 +27,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -43,14 +43,14 @@ void assign_mul_add_mul_add_mul_r(spinor * const R, spinor * const S, spinor * c
 				  const double c1,const double c2,const double c3,
 				  const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   spinor *r,*s,*u;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -76,7 +76,7 @@ void assign_mul_add_mul_add_mul_r(spinor * const R, spinor * const S, spinor * c
     r->s3.c2 = c1 * r->s3.c2 + c2 * s->s3.c2 + c3 * u->s3.c2;
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/assign_mul_add_mul_r.c b/linalg/assign_mul_add_mul_r.c
index d72e38a68..b6826798a 100644
--- a/linalg/assign_mul_add_mul_r.c
+++ b/linalg/assign_mul_add_mul_r.c
@@ -21,9 +21,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -37,14 +37,14 @@
 void assign_mul_add_mul_r(spinor * const R,spinor * const S, 
 			  const double c1, const double c2,
 			  const int N) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   spinor *r,*s;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix){
@@ -67,7 +67,7 @@ void assign_mul_add_mul_r(spinor * const R,spinor * const S,
     r->s3.c1 = c1 * r->s3.c1 + c2 * s->s3.c1;
     r->s3.c2 = c1 * r->s3.c2 + c2 * s->s3.c2;
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/assign_mul_add_mul_r_32.c b/linalg/assign_mul_add_mul_r_32.c
new file mode 100644
index 000000000..71f902aa0
--- /dev/null
+++ b/linalg/assign_mul_add_mul_r_32.c
@@ -0,0 +1,78 @@
+/***********************************************************************
+ * Copyright (C) 2015 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+/*******************************************************************************
+ * Makes (*R)=c1*(*R)+c2*(*S) , c1 and c2 are real constants 
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "su3.h"
+#include "assign_mul_add_mul_r_32.h"
+
+
+/* R[i] = c1 * R[i] + c2 * S[i] for i in [0, N); c1 and c2 are real constants.
+ *   S      : input, only read
+ *   R      : input and output, overwritten in place
+ *   c1, c2 : input
+ * The update covers the twelve components s{0..3}.c{0..2} of each spinor32.
+ * With TM_USE_OMP the iterations are shared among the threads of a parallel region. */
+void assign_mul_add_mul_r_32(spinor32 * const R, spinor32 * const S, 
+			  const float c1, const float c2,
+			  const int N) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site) {
+    spinor32 * const out = R + site;
+    spinor32 * const in = S + site;
+
+    out->s0.c0 = c1 * out->s0.c0 + c2 * in->s0.c0;
+    out->s0.c1 = c1 * out->s0.c1 + c2 * in->s0.c1;
+    out->s0.c2 = c1 * out->s0.c2 + c2 * in->s0.c2;
+
+    out->s1.c0 = c1 * out->s1.c0 + c2 * in->s1.c0;
+    out->s1.c1 = c1 * out->s1.c1 + c2 * in->s1.c1;
+    out->s1.c2 = c1 * out->s1.c2 + c2 * in->s1.c2;
+
+    out->s2.c0 = c1 * out->s2.c0 + c2 * in->s2.c0;
+    out->s2.c1 = c1 * out->s2.c1 + c2 * in->s2.c1;
+    out->s2.c2 = c1 * out->s2.c2 + c2 * in->s2.c2;
+
+    out->s3.c0 = c1 * out->s3.c0 + c2 * in->s3.c0;
+    out->s3.c1 = c1 * out->s3.c1 + c2 * in->s3.c1;
+    out->s3.c2 = c1 * out->s3.c2 + c2 * in->s3.c2;
+  }
+#ifdef TM_USE_OMP
+  } /* end of the parallel region */
+#endif
+}
+
+
+
+
+
diff --git a/linalg/assign_mul_add_mul_r_32.h b/linalg/assign_mul_add_mul_r_32.h
new file mode 100644
index 000000000..7c2527937
--- /dev/null
+++ b/linalg/assign_mul_add_mul_r_32.h
@@ -0,0 +1,30 @@
+/***********************************************************************
+ * Copyright (C) 2015 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _ASSIGN_MUL_ADD_MUL_R_32_H
+#define _ASSIGN_MUL_ADD_MUL_R_32_H
+
+#include "su3.h"
+
+/* Makes (*R)=c1*(*R)+c2*(*S) for N spinor32 elements; c1 and c2 are real constants; R is updated in place, S is only read */
+void assign_mul_add_mul_r_32(spinor32 * const R,spinor32 * const S, 
+			  const float c1, const float c2,
+			  const int N);
+
+#endif
diff --git a/linalg/assign_mul_add_r.c b/linalg/assign_mul_add_r.c
index dd1c528f5..1782eb1dc 100644
--- a/linalg/assign_mul_add_r.c
+++ b/linalg/assign_mul_add_r.c
@@ -18,11 +18,11 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <complex.h>
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "su3.h"
@@ -34,7 +34,7 @@
 
 /* k input , l output*/
 void assign_mul_add_r(spinor * const R, const double c, const spinor * const S, const int N) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -45,14 +45,14 @@ void assign_mul_add_r(spinor * const R, const double c, const spinor * const S,
 			:
 			:
 			"m" (c));
-#ifndef OMP
+#ifndef TM_USE_OMP
   s=&S[0].s0;
   r=&R[0].s0;
 #else
 #pragma omp for
 #endif
   for (ix=0;ix<4*N;ix++) {
-#ifdef OMP
+#ifdef TM_USE_OMP
   s=&S[0].s0+ix;
   r=&R[0].s0+ix;
 #endif
@@ -69,12 +69,12 @@ void assign_mul_add_r(spinor * const R, const double c, const spinor * const S,
 			  :
 			  :);
     _sse_store(*r);
-#ifndef OMP
+#ifndef TM_USE_OMP
     s++; r++;
 #endif
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif  
 }
@@ -82,7 +82,7 @@ void assign_mul_add_r(spinor * const R, const double c, const spinor * const S,
 #elif (defined BGQ && defined XLC)
 
 void assign_mul_add_r(spinor * const R, const double c, const spinor * const S, const int N) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -101,7 +101,7 @@ void assign_mul_add_r(spinor * const R, const double c, const spinor * const S,
   __alignx(32, S);
   __alignx(32, R);
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #else
 #pragma unroll(4)
@@ -136,7 +136,7 @@ void assign_mul_add_r(spinor * const R, const double c, const spinor * const S,
     vec_st(z4, 0, r+16);
     vec_st(z5, 0, r+20);
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif  
   return;
@@ -339,7 +339,7 @@ void assign_mul_add_r(spinor * const R, const double c, const spinor * const S,
 
 void assign_mul_add_r(spinor * const R, const double c, const spinor * const S, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -347,7 +347,7 @@ void assign_mul_add_r(spinor * const R, const double c, const spinor * const S,
   const spinor *s;
   
   /* Change due to even-odd preconditioning : VOLUME   to VOLUME/2 */   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -371,7 +371,7 @@ void assign_mul_add_r(spinor * const R, const double c, const spinor * const S,
     r->s3.c1 = c * r->s3.c1 + s->s3.c1;
     r->s3.c2 = c * r->s3.c2 + s->s3.c2;   
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
@@ -381,13 +381,13 @@ void assign_mul_add_r(spinor * const R, const double c, const spinor * const S,
 #ifdef WITHLAPH
 void assign_mul_add_r_su3vect(su3_vector * const R, const double c, su3_vector * const S, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
   su3_vector *r,*s;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix) 
@@ -398,7 +398,7 @@ void assign_mul_add_r_su3vect(su3_vector * const R, const double c, su3_vector *
     r->c1 = c * r->c1 + s->c1;
     r->c2 = c * r->c2 + s->c2;    
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/assign_mul_add_r_32.c b/linalg/assign_mul_add_r_32.c
new file mode 100644
index 000000000..9e3f3fcbf
--- /dev/null
+++ b/linalg/assign_mul_add_r_32.c
@@ -0,0 +1,121 @@
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <complex.h>
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include "su3.h"
+#include "assign_mul_add_r_32.h"
+
+
+/* R: input and output;  c, S: input */
+/*   (*R) = c*(*R) + (*S)        c is a real constant   */
+
+#if (defined BGQ && defined XLC)
+
+void assign_mul_add_r_32(spinor32 * const R, const float c, const spinor32 * const S, const int N) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* BGQ/XLC variant of (*R) = c*(*R) + (*S) over N spinor32 elements; the locals below are declared inside the (optional) parallel region */
+  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
+  vector4double z0, z1, z2, z3, z4, z5, k;
+  float *s, *r;
+  float ALIGN32 _c;
+  _c = c;
+  __prefetch_by_load(S);
+  __prefetch_by_load(R);
+  /* k: the scalar c (converted to double) splatted into a vector by vec_splats */
+  k = vec_splats((double)_c);
+  __alignx(16, s); /* NOTE(review): s and r have not been assigned yet at this point -- confirm these two hints are intended */
+  __alignx(16, r);
+  __alignx(16, S);
+  __alignx(16, R);
+  /* with OpenMP the loop is a worksharing loop; without it an unroll(4) hint is given instead */
+#ifdef TM_USE_OMP
+#pragma omp for
+#else
+#pragma unroll(4)
+#endif
+  for(int i = 0; i < N; i++) {
+    s=(float*)((spinor32 *) S + i); /* view element i of S as a flat float array (the cast also drops const) */
+    r=(float*)((spinor32 *) R + i); /* view element i of R as a flat float array */
+    __prefetch_by_load(S + i + 1); /* prefetch hints for the element after the current one */
+    __prefetch_by_stream(1, R + i + 1);
+    x0 = vec_ld(0, r); /* x0..x5: R[i], loaded in six pieces at float offsets 0,4,...,20 */
+    x1 = vec_ld(0, r+4);
+    x2 = vec_ld(0, r+8);
+    x3 = vec_ld(0, r+12);
+    x4 = vec_ld(0, r+16);
+    x5 = vec_ld(0, r+20);
+    y0 = vec_ld(0, s); /* y0..y5: S[i], loaded at the same offsets */
+    y1 = vec_ld(0, s+4);
+    y2 = vec_ld(0, s+8);
+    y3 = vec_ld(0, s+12);
+    y4 = vec_ld(0, s+16);
+    y5 = vec_ld(0, s+20);
+    z0 = vec_madd(k, x0, y0); /* z = k*x + y (presumably a fused multiply-add -- see the XL C intrinsic reference) */
+    z1 = vec_madd(k, x1, y1);
+    z2 = vec_madd(k, x2, y2);
+    z3 = vec_madd(k, x3, y3);
+    z4 = vec_madd(k, x4, y4);
+    z5 = vec_madd(k, x5, y5);
+    vec_st(z0, 0, r); /* write the six result pieces back over R[i] */
+    vec_st(z1, 0, r+4);
+    vec_st(z2, 0, r+8);
+    vec_st(z3, 0, r+12);
+    vec_st(z4, 0, r+16);
+    vec_st(z5, 0, r+20);
+  }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif  
+  return;
+}
+
+#else
+
+/* Generic C implementation (compiled when the BGQ/XLC variant is not):
+ *   (*R) = c*(*R) + (*S)  for N spinor32 elements, c a real constant.
+ * R is read and overwritten in place, S is only read. */
+void assign_mul_add_r_32(spinor32 * const R, const float c, const spinor32 * const S, const int N)
+{
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* every iteration touches only element `site`, so with OpenMP the loop is shared among threads */
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site) {
+    spinor32 * const out = R + site;
+    const spinor32 * const in = S + site;
+
+    /* components of s0 */
+    out->s0.c0 = c * out->s0.c0 + in->s0.c0;
+    out->s0.c1 = c * out->s0.c1 + in->s0.c1;
+    out->s0.c2 = c * out->s0.c2 + in->s0.c2;
+    /* components of s1 */
+    out->s1.c0 = c * out->s1.c0 + in->s1.c0;
+    out->s1.c1 = c * out->s1.c1 + in->s1.c1;
+    out->s1.c2 = c * out->s1.c2 + in->s1.c2;
+    /* components of s2 */
+    out->s2.c0 = c * out->s2.c0 + in->s2.c0;
+    out->s2.c1 = c * out->s2.c1 + in->s2.c1;
+    out->s2.c2 = c * out->s2.c2 + in->s2.c2;
+    /* components of s3 */
+    out->s3.c0 = c * out->s3.c0 + in->s3.c0;
+    out->s3.c1 = c * out->s3.c1 + in->s3.c1;
+    out->s3.c2 = c * out->s3.c2 + in->s3.c2;
+  }
+#ifdef TM_USE_OMP
+  } /* end of the OpenMP parallel region */
+#endif
+}
+
+
+#endif
diff --git a/linalg/assign_mul_add_r_32.h b/linalg/assign_mul_add_r_32.h
new file mode 100644
index 000000000..6a7038cfb
--- /dev/null
+++ b/linalg/assign_mul_add_r_32.h
@@ -0,0 +1,8 @@
+#ifndef ASSIGN_MUL_ADD_R_32_H
+#define ASSIGN_MUL_ADD_R_32_H
+
+#include "su3.h"
+/* (*R) = c*(*R) + (*S) for N spinor32 elements; c is a real constant */
+void assign_mul_add_r_32(spinor32 * const R, const float c, const spinor32 * const S, const int N);
+
+#endif
\ No newline at end of file
diff --git a/linalg/assign_mul_add_r_and_square.c b/linalg/assign_mul_add_r_and_square.c
index a8921af05..c28981fe1 100644
--- a/linalg/assign_mul_add_r_and_square.c
+++ b/linalg/assign_mul_add_r_and_square.c
@@ -18,14 +18,14 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include<mpi.h>
 #endif
 #include <stdlib.h>
 #include <complex.h>
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 # include <global.h>
 #endif
@@ -38,11 +38,11 @@
 double assign_mul_add_r_and_square(spinor * const R, const double c, spinor * const S, 
 				   const int N, const int parallel) {
   double ALIGN res = 0.0;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
@@ -53,7 +53,7 @@ double assign_mul_add_r_and_square(spinor * const R, const double c, spinor * co
   double *s, *r;
   double ALIGN _c = c;
   double ALIGN ds = 0.0;
-#ifndef OMP
+#ifndef TM_USE_OMP
   __prefetch_by_load(S);
   __prefetch_by_load(R);
 #endif
@@ -71,7 +71,7 @@ double assign_mul_add_r_and_square(spinor * const R, const double c, spinor * co
   r5 = vec_splats(0.);
 
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for 
 #endif
   for(int i = 0; i < N; i++) {
@@ -117,7 +117,7 @@ double assign_mul_add_r_and_square(spinor * const R, const double c, spinor * co
   y1 = vec_add(x2, y0);
   ds = y1[0] + y1[1] + y1[2] + y1[3];
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = ds;
   } /* OpenMP closing brace */
 
@@ -128,7 +128,7 @@ double assign_mul_add_r_and_square(spinor * const R, const double c, spinor * co
   res = ds;
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   if(parallel) {
     MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     return(mres);
@@ -145,11 +145,11 @@ double assign_mul_add_r_and_square(spinor * const R, const double c, spinor * co
 double assign_mul_add_r_and_square(spinor * const R, const double c, const spinor * const S, 
 				   const int N, const int parallel) {
   double ALIGN res = 0.0;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
@@ -159,7 +159,7 @@ double assign_mul_add_r_and_square(spinor * const R, const double c, const spino
   double ALIGN ds = 0.0;
 
   /* Change due to even-odd preconditioning : VOLUME   to VOLUME/2 */   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for 
 #endif
   for (int ix = 0; ix < N; ++ix) {
@@ -195,7 +195,7 @@ double assign_mul_add_r_and_square(spinor * const R, const double c, const spino
     ds += creal(r->s3.c2)*creal(r->s3.c2) + cimag(r->s3.c2)*cimag(r->s3.c2);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = ds;
   } /* OpenMP closing brace */
 
@@ -206,7 +206,7 @@ double assign_mul_add_r_and_square(spinor * const R, const double c, const spino
   res = ds;
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   if(parallel) {
     MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     return(mres);
diff --git a/linalg/assign_mul_bra_add_mul_ket_add.c b/linalg/assign_mul_bra_add_mul_ket_add.c
index ad1a2e6ca..077f0a92a 100644
--- a/linalg/assign_mul_bra_add_mul_ket_add.c
+++ b/linalg/assign_mul_bra_add_mul_ket_add.c
@@ -27,9 +27,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -43,14 +43,14 @@
 /* R inoutput, S input, U input, c1 input, c2 input */
 void assign_mul_bra_add_mul_ket_add(spinor * const R, spinor * const S,spinor * const U,
 				    const _Complex double c1, const _Complex double c2, const int N) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   spinor *r, *s, *u;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -76,7 +76,7 @@ void assign_mul_bra_add_mul_ket_add(spinor * const R, spinor * const S,spinor *
     r->s3.c1 = u->s3.c1 + c2 * (r->s3.c1 + c1 * s->s3.c1);
     r->s3.c2 = u->s3.c2 + c2 * (r->s3.c2 + c1 * s->s3.c2);
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/assign_mul_bra_add_mul_ket_add_r.c b/linalg/assign_mul_bra_add_mul_ket_add_r.c
index 0b5fe6712..c88bcb782 100644
--- a/linalg/assign_mul_bra_add_mul_ket_add_r.c
+++ b/linalg/assign_mul_bra_add_mul_ket_add_r.c
@@ -27,9 +27,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -40,7 +40,7 @@
 
 void assign_mul_bra_add_mul_ket_add_r(spinor * const R, spinor * const S, spinor * const U, 
 				      const double c1, const double c2, const int N) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -48,7 +48,7 @@ void assign_mul_bra_add_mul_ket_add_r(spinor * const R, spinor * const S, spinor
    int ix;
    spinor *r,*s,*u;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
    for (ix = 0; ix < N; ix++)
@@ -74,7 +74,7 @@ void assign_mul_bra_add_mul_ket_add_r(spinor * const R, spinor * const S, spinor
      r->s3.c2 = c2 * (r->s3.c2 + c1 * s->s3.c2) + u->s3.c2;
    }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
diff --git a/linalg/assign_mul_bra_add_mul_r.c b/linalg/assign_mul_bra_add_mul_r.c
index bf37101ca..88cb0fdf7 100644
--- a/linalg/assign_mul_bra_add_mul_r.c
+++ b/linalg/assign_mul_bra_add_mul_r.c
@@ -26,9 +26,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -39,7 +39,7 @@
 
 /*  R output, S input, c0 input, c input */
 void assign_mul_bra_add_mul_r(spinor * const R,const double c0, const double c,spinor * const S, const int N){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -52,7 +52,7 @@ void assign_mul_bra_add_mul_r(spinor * const R,const double c0, const double c,s
   fact0=c0;
 
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix = 0;ix < N; ++ix)
@@ -76,7 +76,7 @@ void assign_mul_bra_add_mul_r(spinor * const R,const double c0, const double c,s
     r->s3.c1 = fact0 * (r->s3.c1 + fact * s->s3.c1);
     r->s3.c2 = fact0 * (r->s3.c2 + fact * s->s3.c2);
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/assign_to_32.c b/linalg/assign_to_32.c
new file mode 100644
index 000000000..9e171789d
--- /dev/null
+++ b/linalg/assign_to_32.c
@@ -0,0 +1,132 @@
+/***********************************************************************
+ * Copyright (C) 2014 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <complex.h>
+#include <math.h>
+#include <string.h>
+#include "su3.h"
+#include "assign_to_32.h"
+
+
+/* Copy N spinors from the spinor field S into the spinor32 field R, one
+ * component at a time; each assignment converts the component to the
+ * element type of spinor32.
+ * S is only read, R is only written; S and R must not overlap. */
+void assign_to_32(spinor32 * const R, spinor * const S, const int N)
+{
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+
+  /* element `site` of R depends only on element `site` of S */
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site) {
+    spinor32 * const dst = R + site;
+    const spinor * const src = S + site;
+
+    dst->s0.c0 = src->s0.c0;
+    dst->s0.c1 = src->s0.c1;
+    dst->s0.c2 = src->s0.c2;
+
+    dst->s1.c0 = src->s1.c0;
+    dst->s1.c1 = src->s1.c1;
+    dst->s1.c2 = src->s1.c2;
+
+    dst->s2.c0 = src->s2.c0;
+    dst->s2.c1 = src->s2.c1;
+    dst->s2.c2 = src->s2.c2;
+
+    dst->s3.c0 = src->s3.c0;
+    dst->s3.c1 = src->s3.c1;
+    dst->s3.c2 = src->s3.c2;
+  }
+
+#ifdef TM_USE_OMP
+  } /* end of the OpenMP parallel region */
+#endif
+
+  return;
+}
+
+
+
+
+
+/* Copy N spinors from the spinor32 field S into the spinor field R, one
+ * component at a time; each assignment converts the component to the
+ * element type of spinor.
+ * S is only read, R is only written; S and R must not overlap. */
+void assign_to_64(spinor * const R, spinor32 * const S, const int N)
+{
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+
+  /* element `site` of R depends only on element `site` of S */
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site) {
+    spinor * const dst = R + site;
+    const spinor32 * const src = S + site;
+
+    dst->s0.c0 = src->s0.c0;
+    dst->s0.c1 = src->s0.c1;
+    dst->s0.c2 = src->s0.c2;
+
+    dst->s1.c0 = src->s1.c0;
+    dst->s1.c1 = src->s1.c1;
+    dst->s1.c2 = src->s1.c2;
+
+    dst->s2.c0 = src->s2.c0;
+    dst->s2.c1 = src->s2.c1;
+    dst->s2.c2 = src->s2.c2;
+
+    dst->s3.c0 = src->s3.c0;
+    dst->s3.c1 = src->s3.c1;
+    dst->s3.c2 = src->s3.c2;
+  }
+
+#ifdef TM_USE_OMP
+  } /* end of the OpenMP parallel region */
+#endif
+
+  return;
+}
+
+
+
+
+
+
+
+
+
diff --git a/linalg/assign_to_32.h b/linalg/assign_to_32.h
new file mode 100644
index 000000000..87bcdafb0
--- /dev/null
+++ b/linalg/assign_to_32.h
@@ -0,0 +1,27 @@
+/***********************************************************************
+ * Copyright (C) 2015 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _ASSIGN_TO_32_H
+#define _ASSIGN_TO_32_H
+
+#include "su3.h"
+void assign_to_32(spinor32 * const R, spinor * const S, const int N); /* R (spinor32) <- S (spinor), N elements */
+void assign_to_64(spinor * const R, spinor32 * const S, const int N); /* R (spinor) <- S (spinor32), N elements */
+#endif
+
diff --git a/linalg/blas.h b/linalg/blas.h
index 7d1083854..2ebfc9268 100644
--- a/linalg/blas.h
+++ b/linalg/blas.h
@@ -28,6 +28,7 @@
 #include"fortran.h"
 #define zgemm ZGEMM
 #define zgemv ZGEMV
+#define cgemv CGEMV
 #define ddot DDOT
 #define zdotc ZDOTC
 #define daxpy DAXPY
@@ -50,6 +51,7 @@ extern void _FT(dcopy)();
 extern void _FT(dscal)();
 extern void _FT(dgemv)();
 extern void _FT(zgemv)();
+extern void _FT(cgemv)();
 extern void _FT(dgemm)();
 extern void _FT(zgemm)();
 #else
@@ -75,11 +77,16 @@ extern void _FT(dscal)(int* n, double* a, double x[], int* incx);
 
 /* BLAS-2 subroutines */
 extern void _FT(dgemv)(char* trans, int* m, int* n, double* alpha,
-        double a[], int* lda, double x[], int* incx, double* beta,
-        double y[], int* incy, int len_trans);
+		       double a[], int* lda, double x[], int* incx, double* beta,
+		       double y[], int* incy, int len_trans);
+
 extern void _FT(zgemv)(char* trans, int* m, int* n, _Complex double* alpha,
-        _Complex double a[], int* lda, _Complex double x[], int* incx, _Complex double* beta,
-        _Complex double y[], int* incy, int len_trans);
+		       _Complex double a[], int* lda, _Complex double x[], int* incx, _Complex double* beta,
+		       _Complex double y[], int* incy, int len_trans);
+/* cgemv: same argument list as zgemv above, with _Complex float in place of _Complex double */
+extern void _FT(cgemv)(char* trans, int* m, int* n, _Complex float* alpha,
+		       _Complex float a[], int* lda, _Complex float x[], int* incx, _Complex float* beta,
+		       _Complex float y[], int* incy, int len_trans);
 
 /* BLAS-3 subroutines */
 extern void _FT(dgemm)(char* transa, char* transb, int* m, int* n, int* k,
diff --git a/linalg/comp_decomp.c b/linalg/comp_decomp.c
index 6709aa766..491cd162d 100644
--- a/linalg/comp_decomp.c
+++ b/linalg/comp_decomp.c
@@ -34,9 +34,9 @@
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -50,14 +50,14 @@
 /* S and P inputs, R output */
 void compact(bispinor * const R, spinor * const S, spinor * const P)
 { 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
   spinor *r,*s;
   spinor *u,*t;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < VOLUME; ix++){
@@ -102,7 +102,7 @@ void compact(bispinor * const R, spinor * const S, spinor * const P)
     u->s3.c2 = t->s3.c2;
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   
@@ -132,7 +132,7 @@ void compact(bispinor * const R, spinor * const S, spinor * const P)
 
 /* R input , S and P outputs */
 void decompact(spinor * const S, spinor * const P, bispinor * const R){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -140,7 +140,7 @@ void decompact(spinor * const S, spinor * const P, bispinor * const R){
   spinor *r,*s;
   spinor *u,*t;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < VOLUME; ix++)
@@ -185,7 +185,7 @@ void decompact(spinor * const S, spinor * const P, bispinor * const R){
     u->s3.c2 = t->s3.c2;
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
diff --git a/linalg/convert_eo_to_lexic.c b/linalg/convert_eo_to_lexic.c
index 47b53f03d..dff5a7210 100644
--- a/linalg/convert_eo_to_lexic.c
+++ b/linalg/convert_eo_to_lexic.c
@@ -18,23 +18,23 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
 #include "su3.h"
 #include "convert_eo_to_lexic.h"
 
-void convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * const r) {
-#ifdef OMP
+void convert_eo_to_lexic(spinor * const P, const spinor * const s, const spinor * const r) {
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -42,7 +42,7 @@ void convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * const r) {
   int x, y, z, t, i, ix;
   spinor * p = NULL;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(x = 0; x < LX; x++) {
@@ -64,7 +64,7 @@ void convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * const r) {
     }
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /*OpenMP closing brace */
 #endif
 
@@ -76,8 +76,8 @@ void convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * const r) {
  *      s: new spinor even 
  *      r: new spinor odd 
  */
-void convert_lexic_to_eo(spinor * const s, spinor * const r, spinor * const P) {
-#ifdef OMP
+void convert_lexic_to_eo(spinor * const s, spinor * const r, const spinor * const P) {
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -85,7 +85,7 @@ void convert_lexic_to_eo(spinor * const s, spinor * const r, spinor * const P) {
   int x, y, z, t, i, ix;
   spinor * p = NULL;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(x = 0; x < LX; x++) {
@@ -107,7 +107,7 @@ void convert_lexic_to_eo(spinor * const s, spinor * const r, spinor * const P) {
     }
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
diff --git a/linalg/convert_eo_to_lexic.h b/linalg/convert_eo_to_lexic.h
index 2944bb1df..e1b834454 100644
--- a/linalg/convert_eo_to_lexic.h
+++ b/linalg/convert_eo_to_lexic.h
@@ -20,7 +20,7 @@
 #ifndef _CONVERT_EO_TO_LEXIC_H
 #define _CONVERT_EO_TO_LEXIC_H
 
-void convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * const r);
-void convert_lexic_to_eo(spinor * const s, spinor * const r, spinor * const P);
+void convert_eo_to_lexic(spinor * const P, const spinor * const s, const spinor * const r);
+void convert_lexic_to_eo(spinor * const s, spinor * const r, const spinor * const P);
 
 #endif
diff --git a/linalg/convert_even_to_lexic.c b/linalg/convert_even_to_lexic.c
new file mode 100644
index 000000000..19bf6a87c
--- /dev/null
+++ b/linalg/convert_even_to_lexic.c
@@ -0,0 +1,108 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include "global.h"
+#include "su3.h"
+#include "convert_even_to_lexic.h"
+
+/* Copy the spinors of the even-site field r to their positions in the full-volume field P. */
+void convert_even_to_lexic(spinor * const P, spinor * const r) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* part of the site parity that comes from g_proc_coords; it is the same for every site */
+  const int proc_shift = g_proc_coords[3]*LZ + g_proc_coords[2]*LY
+                         + g_proc_coords[0]*T + g_proc_coords[1]*LX;
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int x = 0; x < LX; x++) {
+    for(int y = 0; y < LY; y++) {
+      for(int z = 0; z < LZ; z++) {
+        for(int t = 0; t < T; t++) {
+          const int lex = g_ipt[t][x][y][z];      /* index into P */
+          const int sub = g_lexic2eosub[lex];     /* index into r */
+          const int parity = (t + x + y + z + proc_shift) % 2;
+          if(parity == 0) {
+            memcpy(P + lex, r + sub, sizeof(spinor));
+          }
+        }
+      }
+    }
+  }
+
+#ifdef TM_USE_OMP
+  } /* end of the OpenMP parallel region */
+#endif
+
+  return;
+}
+
+/*
+ *      P: spinor field with full volume (read only at even sites)
+ *      r: even-site spinor field that receives the copies
+ */
+void convert_lexic_to_even(spinor * const r, spinor * const P) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+
+  /* part of the site parity that comes from g_proc_coords; it is the same for every site */
+  const int proc_shift = g_proc_coords[3]*LZ + g_proc_coords[2]*LY
+                         + g_proc_coords[0]*T + g_proc_coords[1]*LX;
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int x = 0; x < LX; x++) {
+    for(int y = 0; y < LY; y++) {
+      for(int z = 0; z < LZ; z++) {
+        for(int t = 0; t < T; t++) {
+          const int lex = g_ipt[t][x][y][z];      /* index into P */
+          const int sub = g_lexic2eosub[lex];     /* index into r */
+          const int parity = (t + x + y + z + proc_shift) % 2;
+          if(parity == 0) {
+            memcpy(r + sub, P + lex, sizeof(spinor));
+          }
+        }
+      }
+    }
+  }
+
+#ifdef TM_USE_OMP
+  } /* end of the OpenMP parallel region */
+#endif
+
+  return;
+}
diff --git a/linalg/convert_even_to_lexic.h b/linalg/convert_even_to_lexic.h
new file mode 100644
index 000000000..04eb066c0
--- /dev/null
+++ b/linalg/convert_even_to_lexic.h
@@ -0,0 +1,26 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _CONVERT_EVEN_TO_LEXIC_H
+#define _CONVERT_EVEN_TO_LEXIC_H
+#include "su3.h" /* declares spinor, so that this header can be included on its own */
+void convert_even_to_lexic(spinor * const P, spinor * const r); /* even-site field r -> full-volume field P */
+void convert_lexic_to_even(spinor * const r, spinor * const P); /* full-volume field P -> even-site field r */
+
+#endif
diff --git a/linalg/convert_odd_to_lexic.c b/linalg/convert_odd_to_lexic.c
new file mode 100644
index 000000000..bce97f403
--- /dev/null
+++ b/linalg/convert_odd_to_lexic.c
@@ -0,0 +1,108 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include "global.h"
+#include "su3.h"
+#include "convert_odd_to_lexic.h"
+
+/* Copy the spinors of the odd-site field r to their positions in the full-volume field P. */
+void convert_odd_to_lexic(spinor * const P, spinor * const r) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* part of the site parity that comes from g_proc_coords; it is the same for every site */
+  const int proc_shift = g_proc_coords[3]*LZ + g_proc_coords[2]*LY
+                         + g_proc_coords[0]*T + g_proc_coords[1]*LX;
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int x = 0; x < LX; x++) {
+    for(int y = 0; y < LY; y++) {
+      for(int z = 0; z < LZ; z++) {
+        for(int t = 0; t < T; t++) {
+          const int lex = g_ipt[t][x][y][z];      /* index into P */
+          const int sub = g_lexic2eosub[lex];     /* index into r */
+          const int parity = (t + x + y + z + proc_shift) % 2;
+          if(parity != 0) {
+            memcpy(P + lex, r + sub, sizeof(spinor));
+          }
+        }
+      }
+    }
+  }
+
+#ifdef TM_USE_OMP
+  } /* end of the OpenMP parallel region */
+#endif
+
+  return;
+}
+
+/*
+ *      P: spinor field with full volume (read only at odd sites)
+ *      r: odd-site spinor field that receives the copies
+ */
+void convert_lexic_to_odd(spinor * const r, spinor * const P) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+
+  /* part of the site parity that comes from g_proc_coords; it is the same for every site */
+  const int proc_shift = g_proc_coords[3]*LZ + g_proc_coords[2]*LY
+                         + g_proc_coords[0]*T + g_proc_coords[1]*LX;
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int x = 0; x < LX; x++) {
+    for(int y = 0; y < LY; y++) {
+      for(int z = 0; z < LZ; z++) {
+        for(int t = 0; t < T; t++) {
+          const int lex = g_ipt[t][x][y][z];      /* index into P */
+          const int sub = g_lexic2eosub[lex];     /* index into r */
+          const int parity = (t + x + y + z + proc_shift) % 2;
+          if(parity != 0) {
+            memcpy(r + sub, P + lex, sizeof(spinor));
+          }
+        }
+      }
+    }
+  }
+
+#ifdef TM_USE_OMP
+  } /* end of the OpenMP parallel region */
+#endif
+
+  return;
+}
diff --git a/linalg/convert_odd_to_lexic.h b/linalg/convert_odd_to_lexic.h
new file mode 100644
index 000000000..d42ce9671
--- /dev/null
+++ b/linalg/convert_odd_to_lexic.h
@@ -0,0 +1,26 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _CONVERT_ODD_TO_LEXIC_H
+#define _CONVERT_ODD_TO_LEXIC_H
+#include "su3.h" /* declares spinor, so that this header can be included on its own */
+void convert_odd_to_lexic(spinor * const P, spinor * const r); /* odd-site field r -> full-volume field P */
+void convert_lexic_to_odd(spinor * const r, spinor * const P); /* full-volume field P -> odd-site field r */
+
+#endif
diff --git a/linalg/diff.c b/linalg/diff.c
index f133c9a8e..a32be9f1a 100644
--- a/linalg/diff.c
+++ b/linalg/diff.c
@@ -23,9 +23,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -202,7 +202,7 @@ void diff(spinor * const Q,spinor * const R,spinor * const S, const int N)
 #elif (defined BGQ && defined XLC)
 
 void diff(spinor * const Q,const spinor * const R,const spinor * const S, const int N) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -220,7 +220,7 @@ void diff(spinor * const Q,const spinor * const R,const spinor * const S, const
   __prefetch_by_load(R);
   __prefetch_by_load(Q);
 
-#ifndef OMP
+#ifndef TM_USE_OMP
 #pragma unroll(2)
 #else
 #pragma omp for
@@ -258,7 +258,7 @@ void diff(spinor * const Q,const spinor * const R,const spinor * const S, const
     vec_st(z5, 0, q+20);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP parallel closing brace */
 #endif
 
@@ -269,7 +269,7 @@ void diff(spinor * const Q,const spinor * const R,const spinor * const S, const
 
 void diff(spinor * const Q, const spinor * const R, const spinor * const S, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -278,7 +278,7 @@ void diff(spinor * const Q, const spinor * const R, const spinor * const S, cons
    const spinor *r,*s;
 
 /* Change due to even-odd preconditioning : VOLUME   to VOLUME/2 */   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
    for (int ix = 0; ix < N; ix++)
@@ -303,24 +303,55 @@ void diff(spinor * const Q, const spinor * const R, const spinor * const S, cons
      q->s3.c1 = r->s3.c1 - s->s3.c1;
      q->s3.c2 = r->s3.c2 - s->s3.c2;
    }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
 
 #endif
 
+/* Q = R - S for N spinors, component by component.
+ * Unlike diff() this variant is a plain loop without any OpenMP directives
+ * (presumably meant to be called from inside a parallel region -- confirm with callers). */
+void diff_ts(spinor * const Q, const spinor * const R, const spinor * const S, const int N)
+{
+  for (int ix = 0; ix < N; ix++) {
+    spinor * const q = Q + ix;
+    const spinor * const r = R + ix; /* no cast needed: keeps the const qualifier of R */
+    const spinor * const s = S + ix; /* no cast needed: keeps the const qualifier of S */
+
+    /* components of s0 */
+    q->s0.c0 = r->s0.c0 - s->s0.c0;
+    q->s0.c1 = r->s0.c1 - s->s0.c1;
+    q->s0.c2 = r->s0.c2 - s->s0.c2;
+    /* components of s1 */
+    q->s1.c0 = r->s1.c0 - s->s1.c0;
+    q->s1.c1 = r->s1.c1 - s->s1.c1;
+    q->s1.c2 = r->s1.c2 - s->s1.c2;
+    /* components of s2 */
+    q->s2.c0 = r->s2.c0 - s->s2.c0;
+    q->s2.c1 = r->s2.c1 - s->s2.c1;
+    q->s2.c2 = r->s2.c2 - s->s2.c2;
+    /* components of s3 */
+    q->s3.c0 = r->s3.c0 - s->s3.c0;
+    q->s3.c1 = r->s3.c1 - s->s3.c1;
+    q->s3.c2 = r->s3.c2 - s->s3.c2;
+  }
+}
+
+
+
 #ifdef WITHLAPH
 void diff_su3vect(su3_vector * const Q,su3_vector * const R,su3_vector * const S, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   su3_vector *q,*r,*s;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix) 
@@ -333,7 +364,7 @@ void diff_su3vect(su3_vector * const Q,su3_vector * const R,su3_vector * const S
     q->c1 = r->c1 - s->c1;
     q->c2 = r->c2 - s->c2;
   } 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/diff.h b/linalg/diff.h
index c9f604c83..a95afa732 100644
--- a/linalg/diff.h
+++ b/linalg/diff.h
@@ -24,6 +24,7 @@
 
 /* Makes the difference (*Q) = (*R) - (*S) */
 void diff(spinor * const Q, const spinor * const R, const spinor * const S, const int N);
+void diff_ts(spinor * const Q, const spinor * const R, const spinor * const S, const int N); /* (*Q) = (*R) - (*S), implemented without OpenMP directives */
 void diff_su3vect(su3_vector * const Q, su3_vector * const R, su3_vector * const S, const int N);
 
 
diff --git a/linalg/diff_32.c b/linalg/diff_32.c
new file mode 100644
index 000000000..ef02781ba
--- /dev/null
+++ b/linalg/diff_32.c
@@ -0,0 +1,107 @@
+/***********************************************************************
+ * Copyright (C) 2015 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+/*******************************************************************************
+ *
+ *   void diff_32(spinor32 * const Q, const spinor32 * const R, const spinor32 * const S, const int N)
+ *     Makes the difference (*Q) = (*R) - (*S)
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <complex.h>
+#include "su3.h"
+#include "diff_32.h"
+
+
+void diff_32(spinor32 * const Q, const spinor32 * const R, const spinor32 * const S, const int N)
+{
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+   /* spinor32 difference Q[i] = R[i] - S[i] for i in [0, N).
+      With TM_USE_OMP the loop is shared among the threads of the region. */
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+   for (int site = 0; site < N; ++site)
+   {
+     spinor32 * const out = Q + site;
+     const spinor32 * const lhs = R + site;
+     const spinor32 * const rhs = S + site;
+
+     /* s0 */
+     out->s0.c0 = lhs->s0.c0 - rhs->s0.c0;
+     out->s0.c1 = lhs->s0.c1 - rhs->s0.c1;
+     out->s0.c2 = lhs->s0.c2 - rhs->s0.c2;
+     /* s1 */
+     out->s1.c0 = lhs->s1.c0 - rhs->s1.c0;
+     out->s1.c1 = lhs->s1.c1 - rhs->s1.c1;
+     out->s1.c2 = lhs->s1.c2 - rhs->s1.c2;
+     /* s2 */
+     out->s2.c0 = lhs->s2.c0 - rhs->s2.c0;
+     out->s2.c1 = lhs->s2.c1 - rhs->s2.c1;
+     out->s2.c2 = lhs->s2.c2 - rhs->s2.c2;
+     /* s3 */
+     out->s3.c0 = lhs->s3.c0 - rhs->s3.c0;
+     out->s3.c1 = lhs->s3.c1 - rhs->s3.c1;
+     out->s3.c2 = lhs->s3.c2 - rhs->s3.c2;
+   }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+}
+
+void diff_ts_32(spinor32 * const Q, const spinor32 * const R, const spinor32 * const S, const int N)
+{
+  /* Same element-wise difference as diff_32, Q[i] = R[i] - S[i] for i in
+     [0, N), written as a plain loop without any OpenMP directives. */
+
+  for (int site = 0; site < N; ++site)
+  {
+    spinor32 * const out = Q + site;
+    const spinor32 * const lhs = R + site;
+    const spinor32 * const rhs = S + site;
+
+    /* s0 */
+    out->s0.c0 = lhs->s0.c0 - rhs->s0.c0;
+    out->s0.c1 = lhs->s0.c1 - rhs->s0.c1;
+    out->s0.c2 = lhs->s0.c2 - rhs->s0.c2;
+    /* s1 */
+    out->s1.c0 = lhs->s1.c0 - rhs->s1.c0;
+    out->s1.c1 = lhs->s1.c1 - rhs->s1.c1;
+    out->s1.c2 = lhs->s1.c2 - rhs->s1.c2;
+    /* s2 */
+    out->s2.c0 = lhs->s2.c0 - rhs->s2.c0;
+    out->s2.c1 = lhs->s2.c1 - rhs->s2.c1;
+    out->s2.c2 = lhs->s2.c2 - rhs->s2.c2;
+    /* s3 */
+    out->s3.c0 = lhs->s3.c0 - rhs->s3.c0;
+    out->s3.c1 = lhs->s3.c1 - rhs->s3.c1;
+    out->s3.c2 = lhs->s3.c2 - rhs->s3.c2;
+  }
+}
diff --git a/linalg/diff_32.h b/linalg/diff_32.h
new file mode 100644
index 000000000..6973fef46
--- /dev/null
+++ b/linalg/diff_32.h
@@ -0,0 +1,30 @@
+/***********************************************************************
+ * Copyright (C) 2015 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _DIFF_32_H
+#define _DIFF_32_H
+
+#include "su3.h"
+
+/* Makes the difference (*Q) = (*R) - (*S) */
+void diff_32(spinor32 * const Q, const spinor32 * const R, const spinor32 * const S, const int N);
+void diff_ts_32(spinor32 * const Q, const spinor32 * const R, const spinor32 * const S, const int N);
+
+
+#endif
diff --git a/linalg/diff_and_square_norm.c b/linalg/diff_and_square_norm.c
index 9b8ef1818..bfeaf0cf6 100644
--- a/linalg/diff_and_square_norm.c
+++ b/linalg/diff_and_square_norm.c
@@ -18,10 +18,10 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "su3.h"
@@ -72,7 +72,7 @@ double diff_and_square_norm(spinor * const Q, spinor * const R, const int N) {
     kc = tr-tt;
   }
   kc = ks+kc;
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   return ks;
 #else
diff --git a/linalg/fortran.h b/linalg/fortran.h
index 95b8ccaf8..2bef056a8 100644
--- a/linalg/fortran.h
+++ b/linalg/fortran.h
@@ -25,4 +25,11 @@
 #define _FT(s) s ## _
 #endif
 
+#if (defined NOARPACKUNDERSCORE)
+#define _AFT(s) s /* NOARPACKUNDERSCORE set: symbol name is used unchanged */
+#else
+#define _AFT(s) s ## _ /* default: paste a trailing underscore onto the symbol name */
+#endif
+
+
 #endif
diff --git a/linalg/mattimesvec.c b/linalg/mattimesvec.c
index 1dd866640..2f8113475 100644
--- a/linalg/mattimesvec.c
+++ b/linalg/mattimesvec.c
@@ -18,10 +18,10 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include "complex.h"
diff --git a/linalg/mul.c b/linalg/mul.c
index 1d31fbd5c..dd0377bc6 100644
--- a/linalg/mul.c
+++ b/linalg/mul.c
@@ -26,9 +26,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -38,14 +38,14 @@
 #include "mul.h"
 
 void mul(spinor * const R, const _Complex double c, spinor * const S, const int N){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   spinor *r,*s;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -69,7 +69,7 @@ void mul(spinor * const R, const _Complex double c, spinor * const S, const int
     r->s3.c1 = c * s->s3.c1;
     r->s3.c2 = c * s->s3.c2;
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/mul_add_mul.c b/linalg/mul_add_mul.c
index 28d3d620f..6089aa0f2 100644
--- a/linalg/mul_add_mul.c
+++ b/linalg/mul_add_mul.c
@@ -18,9 +18,9 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -33,14 +33,14 @@
 /* Makes (*R)=c1*(*S)+c2*(*U) , c1 and c2 are complex constants */
 void mul_add_mul(spinor * const R,spinor * const S,spinor * const U,const _Complex double c1,const _Complex double c2, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   spinor *r, *s, *u;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -65,7 +65,7 @@ void mul_add_mul(spinor * const R,spinor * const S,spinor * const U,const _Compl
     r->s3.c1 = c1 * s->s3.c1 + c2 * u->s3.c1;
     r->s3.c2 = c1 * s->s3.c2 + c2 * u->s3.c2;
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/mul_add_mul_r.c b/linalg/mul_add_mul_r.c
index f2b93fc2e..c2a1b0246 100644
--- a/linalg/mul_add_mul_r.c
+++ b/linalg/mul_add_mul_r.c
@@ -21,9 +21,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -36,7 +36,7 @@
 /* S,U input, R inoutput, c1,c2 input */
 void mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U,
 		   const double c1,const double c2, const int N) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -44,7 +44,7 @@ void mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U,
   int ix;
   spinor *r,*s,*u;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix=0; ix < N; ix++){
@@ -68,7 +68,7 @@ void mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U,
     r->s3.c1 = c1 * s->s3.c1 + c2 * u->s3.c1;
     r->s3.c2 = c1 * s->s3.c2 + c2 * u->s3.c2;
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/mul_diff_mul.c b/linalg/mul_diff_mul.c
index 444ceb0ec..4c10666a9 100644
--- a/linalg/mul_diff_mul.c
+++ b/linalg/mul_diff_mul.c
@@ -18,9 +18,9 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -32,7 +32,7 @@
 
 /* Makes (*R)=c1*(*S)-c2*(*U) , c1 and c2 are complex constants */
 void mul_diff_mul(spinor * const R,spinor * const S,spinor * const U,const _Complex double c1,const _Complex double c2, const int N){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -40,7 +40,7 @@ void mul_diff_mul(spinor * const R,spinor * const S,spinor * const U,const _Comp
   int ix;
   spinor *r,*s,*u;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix=0;ix<N;ix++){
@@ -65,7 +65,7 @@ void mul_diff_mul(spinor * const R,spinor * const S,spinor * const U,const _Comp
     r->s3.c2 = c1 * s->s3.c2 - c2 * u->s3.c2;
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/mul_diff_mul_r.c b/linalg/mul_diff_mul_r.c
index 78876ec1a..ce76766a2 100644
--- a/linalg/mul_diff_mul_r.c
+++ b/linalg/mul_diff_mul_r.c
@@ -18,9 +18,9 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -34,14 +34,14 @@
 void mul_diff_mul_r(spinor * const R, spinor * const S,spinor * const U,
 		    const double c1, const double c2, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   spinor *r,*s,*u;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -67,7 +67,7 @@ void mul_diff_mul_r(spinor * const R, spinor * const S,spinor * const U,
     r->s3.c2 = c1 * s->s3.c2 - c2 * u->s3.c2;
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/mul_diff_r.c b/linalg/mul_diff_r.c
index 99f6de7fa..715e18e01 100644
--- a/linalg/mul_diff_r.c
+++ b/linalg/mul_diff_r.c
@@ -21,9 +21,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -36,14 +36,14 @@
 /* S,U input, R inoutput, c1 input */
 void mul_diff_r(spinor * const R,spinor * const S,spinor * const U, const double c1, const int N)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   spinor *r,*s,*u;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix)
@@ -69,7 +69,7 @@ void mul_diff_r(spinor * const R,spinor * const S,spinor * const U, const double
     r->s3.c2 = c1 * s->s3.c2 - u->s3.c2;
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/linalg/mul_gamma5.c b/linalg/mul_gamma5.c
new file mode 100644
index 000000000..8338e565b
--- /dev/null
+++ b/linalg/mul_gamma5.c
@@ -0,0 +1,67 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+/*******************************************************************************
+ *
+ * File mul_gamma5.c
+ *
+ *   void mul_gamma5(spinor * const R, const int N){
+ *     Makes (*R) = gamma5*(*R)   with gamma5 = [1 0 ; 0 -1]
+ *       
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "su3.h"
+#include "mul_gamma5.h"
+#include "mul_r.h"
+void mul_gamma5(spinor * const R, const int N){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* In-place sign flip of the s2 and s3 parts of every spinor in R[0..N-1];
+     s0 and s1 are left untouched (gamma5 = [1 0 ; 0 -1]). */
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site){
+    spinor * const psi = R + site;
+
+    /* s2 */
+    psi->s2.c0 = -1.0*psi->s2.c0;
+    psi->s2.c1 = -1.0*psi->s2.c1;
+    psi->s2.c2 = -1.0*psi->s2.c2;
+    /* s3 */
+    psi->s3.c0 = -1.0*psi->s3.c0;
+    psi->s3.c1 = -1.0*psi->s3.c1;
+    psi->s3.c2 = -1.0*psi->s3.c2;
+  }
+#ifdef TM_USE_OMP
+  } /*OpenMP closing brace */
+#endif
+
+}
diff --git a/linalg/mul_gamma5.h b/linalg/mul_gamma5.h
new file mode 100644
index 000000000..a67561cb1
--- /dev/null
+++ b/linalg/mul_gamma5.h
@@ -0,0 +1,28 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _MUL_GAMMA5_H
+#define _MUL_GAMMA5_H
+
+#include "su3.h"
+
+/*   multiply the spinor R with gamma5  (*R) = gamma5*(*R)  with gamma5 = [1 0 ; 0 -1]*/
+void mul_gamma5(spinor * const R, const int N);
+
+#endif
diff --git a/linalg/mul_r.c b/linalg/mul_r.c
index 09f267ead..85dfe5115 100644
--- a/linalg/mul_r.c
+++ b/linalg/mul_r.c
@@ -26,9 +26,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -38,7 +38,7 @@
 #include "mul_r.h"
 
 void mul_r(spinor * const R, const double c, spinor * const S, const int N){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -46,7 +46,7 @@ void mul_r(spinor * const R, const double c, spinor * const S, const int N){
   int ix;
   spinor *r,*s;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix = 0; ix < N; ix++){
@@ -69,7 +69,7 @@ void mul_r(spinor * const R, const double c, spinor * const S, const int N){
     r->s3.c1 = c * s->s3.c1;
     r->s3.c2 = c * s->s3.c2;
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /*OpenMP closing brace */
 #endif
 
diff --git a/linalg/mul_r_32.c b/linalg/mul_r_32.c
new file mode 100644
index 000000000..aba0c5a1b
--- /dev/null
+++ b/linalg/mul_r_32.c
@@ -0,0 +1,78 @@
+/***********************************************************************
+ * Copyright (C) 2015 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+/*******************************************************************************
+ *
+ * File mul_r_32.c
+ *
+ *   void mul_r_32(spinor32 * const R, const float c, spinor32 * const S, const int N){
+ *     Makes (*R) = c*(*S)        c is a real constant
+ *       
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "su3.h"
+#include "mul_r_32.h"
+
+void mul_r_32_orphaned(spinor32 * const R, const float c, spinor32 * const S, const int N){
+  /* R[i] = c * S[i] for i in [0, N).  The "omp for" below is not enclosed in
+     a parallel construct here; mul_r_32() supplies one around its call. */
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site){
+    spinor32 * const dst = R + site;
+    const spinor32 * const src = S + site;
+
+    /* s0 */
+    dst->s0.c0 = c * src->s0.c0;
+    dst->s0.c1 = c * src->s0.c1;
+    dst->s0.c2 = c * src->s0.c2;
+    /* s1 */
+    dst->s1.c0 = c * src->s1.c0;
+    dst->s1.c1 = c * src->s1.c1;
+    dst->s1.c2 = c * src->s1.c2;
+    /* s2 */
+    dst->s2.c0 = c * src->s2.c0;
+    dst->s2.c1 = c * src->s2.c1;
+    dst->s2.c2 = c * src->s2.c2;
+    /* s3 */
+    dst->s3.c0 = c * src->s3.c0;
+    dst->s3.c1 = c * src->s3.c1;
+    dst->s3.c2 = c * src->s3.c2;
+  }
+}
+
+void mul_r_32(spinor32 * const R, const float c, spinor32 * const S, const int N){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  mul_r_32_orphaned(R,c,S,N); /* R = c*S; with TM_USE_OMP its "omp for" binds to the parallel region opened above */
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+}
diff --git a/linalg/mul_r_32.h b/linalg/mul_r_32.h
new file mode 100644
index 000000000..3e95761d7
--- /dev/null
+++ b/linalg/mul_r_32.h
@@ -0,0 +1,29 @@
+/***********************************************************************
+ * Copyright (C) 2015 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _MUL_R_32_H
+#define _MUL_R_32_H
+
+#include "su3.h"
+
+/*   Makes (*R) = c*(*S)   c is a real constant*/
+void mul_r_32(spinor32 * const R, const float c, spinor32 * const S, const int N);
+void mul_r_32_orphaned(spinor32 * const R, const float c, spinor32 * const S, const int N);
+
+#endif
diff --git a/linalg/mul_r_gamma5.c b/linalg/mul_r_gamma5.c
new file mode 100644
index 000000000..00f3ae228
--- /dev/null
+++ b/linalg/mul_r_gamma5.c
@@ -0,0 +1,68 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *               2017                               Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "su3.h"
+#include "mul_r_gamma5.h"
+
+void mul_r_gamma5(spinor * const R, const double c, const int N){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* In place: s0 and s1 of every spinor in R[0..N-1] are scaled by c,
+     s2 and s3 by -c, i.e. (*R) = c*gamma5*(*R). */
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site){
+    spinor * const psi = R + site;
+
+    /* s0 */
+    psi->s0.c0 = c * psi->s0.c0;
+    psi->s0.c1 = c * psi->s0.c1;
+    psi->s0.c2 = c * psi->s0.c2;
+    /* s1 */
+    psi->s1.c0 = c * psi->s1.c0;
+    psi->s1.c1 = c * psi->s1.c1;
+    psi->s1.c2 = c * psi->s1.c2;
+    /* s2 */
+    psi->s2.c0 = -c * psi->s2.c0;
+    psi->s2.c1 = -c * psi->s2.c1;
+    psi->s2.c2 = -c * psi->s2.c2;
+    /* s3 */
+    psi->s3.c0 = -c * psi->s3.c0;
+    psi->s3.c1 = -c * psi->s3.c1;
+    psi->s3.c2 = -c * psi->s3.c2;
+  }
+#ifdef TM_USE_OMP
+  } /*OpenMP closing brace */
+#endif
+
+}
diff --git a/linalg/mul_r_gamma5.h b/linalg/mul_r_gamma5.h
new file mode 100644
index 000000000..c1b603ff1
--- /dev/null
+++ b/linalg/mul_r_gamma5.h
@@ -0,0 +1,29 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *               2017                               Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef MUL_R_GAMMA5_H
+#define MUL_R_GAMMA5_H
+
+#include "su3.h"
+
+/*   Makes (*R) = c*\gamma5 (*R)   c is a real constant*/
+void mul_r_gamma5(spinor * const R, const double c, const int N);
+
+#endif
diff --git a/linalg/scalar_prod.c b/linalg/scalar_prod.c
index 47c5537b8..66855601f 100644
--- a/linalg/scalar_prod.c
+++ b/linalg/scalar_prod.c
@@ -18,13 +18,13 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 # include <global.h>
 #endif
@@ -32,73 +32,26 @@
 #include "scalar_prod.h"
 
 /*  <S,R>=S^* times R */
-_Complex double scalar_prod(const spinor * const S, const spinor * const R, const int N, const int parallel) {
-  _Complex double ALIGN res = 0.0;
-#ifdef MPI
-  _Complex double ALIGN mres;
-#endif
+#define _C_TYPE _Complex double
+#define _PSWITCH(s) s
+#define _PTSWITCH(s) s
 
-#ifdef OMP
-#pragma omp parallel
-  {
-  int thread_num = omp_get_thread_num();
-#endif
+#include"scalar_prod_body.c"
 
-  _Complex double ALIGN ds,tr,ts,tt,ks,kc;
-  const spinor *s,*r;
-
-  ks = 0.0;
-  kc = 0.0;
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
 
-#if (defined BGL && defined XLC)
-  __alignx(16, S);
-  __alignx(16, R);
-#endif
+#define _C_TYPE _Complex float
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
 
-#ifdef OMP
-#pragma omp for
-#endif
-  for (int ix = 0; ix < N; ix++)
-  {
-    s= S + ix;
-    r= R + ix;
-    
-    ds = r->s0.c0 * conj(s->s0.c0) + r->s0.c1 * conj(s->s0.c1) + r->s0.c2 * conj(s->s0.c2) +
-         r->s1.c0 * conj(s->s1.c0) + r->s1.c1 * conj(s->s1.c1) + r->s1.c2 * conj(s->s1.c2) +
-	 r->s2.c0 * conj(s->s2.c0) + r->s2.c1 * conj(s->s2.c1) + r->s2.c2 * conj(s->s2.c2) + 
-         r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2);
+#include"scalar_prod_body.c"
 
-    /* Kahan Summation */
-    tr=ds+kc;
-    ts=tr+ks;
-    tt=ts-ks;
-    ks=ts;
-    kc=tr-tt;
-  }
-  kc=ks+kc;
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
 
-#ifdef OMP
-  g_omp_acc_cp[thread_num] = kc;
-
-  } /* OpenMP closing brace */
-
-  /* having left the parallel section, we can now sum up the Kahan
-     corrected sums from each thread into kc */
-  for(int i = 0; i < omp_num_threads; ++i)
-    res += g_omp_acc_cp[i];
-#else
-  res=kc;
-#endif
-
-#ifdef MPI
-  if(parallel == 1)
-  {
-    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
-    return(mres);
-  }
-#endif
-  return(res);
-}
 
 #ifdef WITHLAPH
 _Complex double scalar_prod_su3vect(su3_vector * const S, su3_vector * const R, const int N, const int parallel)
@@ -106,7 +59,7 @@ _Complex double scalar_prod_su3vect(su3_vector * const S, su3_vector * const R,
   double ALIGN ks, ds, tr, ts, tt;
   su3_vector *s, *r;
   _Complex double c;
-#ifdef MPI
+#ifdef TM_USE_MPI
   _Complex double d;
 #endif
 
@@ -130,7 +83,7 @@ _Complex double scalar_prod_su3vect(su3_vector * const S, su3_vector * const R,
     }
   c = ks + c;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   if(parallel == 1)
   {
     d = c;
diff --git a/linalg/scalar_prod.h b/linalg/scalar_prod.h
index fa220332d..d8e1325aa 100644
--- a/linalg/scalar_prod.h
+++ b/linalg/scalar_prod.h
@@ -23,6 +23,10 @@
 #include "su3.h"
 /*  <S,R>=SxR^* */
 _Complex double scalar_prod(const spinor * const S, const spinor * const R, const int N, const int parallel);
+_Complex double scalar_prod_32(const spinor32 * const S, const spinor32 * const R, const int N, const int parallel);
+_Complex double scalar_prod_ts(const spinor * const S, const spinor * const R, const int N, const int parallel);
+_Complex double scalar_prod_ts_32(const spinor32 * const S, const spinor32 * const R, const int N, const int parallel);
+
 _Complex double scalar_prod_su3vect(su3_vector * const S,su3_vector * const R, const int N, const int parallel);
 
 #endif
diff --git a/linalg/scalar_prod_body.c b/linalg/scalar_prod_body.c
new file mode 100644
index 000000000..1eaa57d5d
--- /dev/null
+++ b/linalg/scalar_prod_body.c
@@ -0,0 +1,137 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+_Complex double _PSWITCH(scalar_prod)(const _PTSWITCH(spinor) * const S, const _PTSWITCH(spinor) * const R, 
+                                      const int N, const int parallel) {
+  _Complex double ALIGN res = 0.0;
+#ifdef TM_USE_MPI
+  _Complex double ALIGN mres;
+#endif
+  /* Returns the sum over ix < N and over all s0..s3, c0..c2 entries of r * conj(s). */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+    int thread_num = omp_get_thread_num();
+#endif
+    /* accumulators: private to each thread under OpenMP, plain locals otherwise */
+    _Complex double ALIGN ds,tr,ts,tt,ks,kc;
+    const _PTSWITCH(spinor) *s,*r;
+
+    ks = 0.0;
+    kc = 0.0;
+
+#if (defined BGL && defined XLC)
+    __alignx(16, S);
+    __alignx(16, R);
+#endif
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+    for (int ix = 0; ix < N; ix++)
+      {
+        s= S + ix;
+        r= R + ix;
+        /* ds: contribution of spinor ix to the sum */
+        ds = r->s0.c0 * conj(s->s0.c0) + r->s0.c1 * conj(s->s0.c1) + r->s0.c2 * conj(s->s0.c2) +
+          r->s1.c0 * conj(s->s1.c0) + r->s1.c1 * conj(s->s1.c1) + r->s1.c2 * conj(s->s1.c2) +
+          r->s2.c0 * conj(s->s2.c0) + r->s2.c1 * conj(s->s2.c1) + r->s2.c2 * conj(s->s2.c2) + 
+          r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2);
+
+        /* Kahan summation: ks is the running sum, kc the compensation term */
+        tr=ds+kc;
+        ts=tr+ks;
+        tt=ts-ks;
+        ks=ts;
+        kc=tr-tt;
+      }
+    kc=ks+kc;
+    /* kc now holds the compensated (partial) sum */
+#ifdef TM_USE_OMP
+    g_omp_acc_cp[thread_num] = kc;
+
+  } /* OpenMP closing brace */
+
+  /* having left the parallel section, we can now sum up the Kahan
+     corrected sums from each thread into res */
+  for(int i = 0; i < omp_num_threads; ++i)
+    res += g_omp_acc_cp[i];
+#else
+  res=kc;
+#endif
+  /* with parallel == 1 the local result is additionally summed over MPI_COMM_WORLD */
+#ifdef TM_USE_MPI
+  if(parallel == 1)
+    {
+      MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+      return(mres);
+    }
+#endif
+  return(res);
+}
+
+// "threadsafe" variant: same sum as scalar_prod, but with no OpenMP directives and no use of g_omp_acc_cp
+_Complex double _PSWITCH(scalar_prod_ts)(const _PTSWITCH(spinor) * const S, const _PTSWITCH(spinor) * const R, 
+                                         const int N, const int parallel) {
+  _Complex double ALIGN res = 0.0;
+#ifdef TM_USE_MPI
+  _Complex double ALIGN mres;
+#endif
+  /* all accumulators are function-local */
+  _Complex double ALIGN ds,tr,ts,tt,ks,kc;
+  const _PTSWITCH(spinor) *s,*r;
+
+  ks = 0.0;
+  kc = 0.0;
+
+#if (defined BGL && defined XLC)
+  __alignx(16, S);
+  __alignx(16, R);
+#endif
+
+  for (int ix = 0; ix < N; ix++)
+    {
+      s= S + ix;
+      r= R + ix;
+      /* ds: contribution of spinor ix, i.e. the sum of r * conj(s) over its twelve entries */
+      ds = r->s0.c0 * conj(s->s0.c0) + r->s0.c1 * conj(s->s0.c1) + r->s0.c2 * conj(s->s0.c2) +
+        r->s1.c0 * conj(s->s1.c0) + r->s1.c1 * conj(s->s1.c1) + r->s1.c2 * conj(s->s1.c2) +
+        r->s2.c0 * conj(s->s2.c0) + r->s2.c1 * conj(s->s2.c1) + r->s2.c2 * conj(s->s2.c2) + 
+        r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2);
+
+      /* Kahan summation: ks is the running sum, kc the compensation term */
+      tr=ds+kc;
+      ts=tr+ks;
+      tt=ts-ks;
+      ks=ts;
+      kc=tr-tt;
+    }
+  kc=ks+kc;
+
+  res = kc;
+  /* with parallel == 1 the local result is additionally summed over MPI_COMM_WORLD */
+#ifdef TM_USE_MPI
+  if(parallel == 1)
+    {
+      MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+      return(mres);
+    }
+#endif
+  return(res);
+}
diff --git a/linalg/scalar_prod_i.c b/linalg/scalar_prod_i.c
index fde1ea2c2..e183404d8 100644
--- a/linalg/scalar_prod_i.c
+++ b/linalg/scalar_prod_i.c
@@ -26,12 +26,12 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include <complex.h>
@@ -70,7 +70,7 @@ double scalar_prod_i(spinor * const S,spinor * const R, const int N, const int p
   }
   kc=ks+kc;
   
-#if defined MPI
+#if defined TM_USE_MPI
   if(parallel == 1) {
     MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     kc = ks;
diff --git a/linalg/scalar_prod_r.c b/linalg/scalar_prod_r.c
index 6a47493e1..4ae112880 100644
--- a/linalg/scalar_prod_r.c
+++ b/linalg/scalar_prod_r.c
@@ -24,12 +24,12 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 # include <global.h>
 #endif
@@ -44,11 +44,11 @@
 
 double scalar_prod_r(const spinor * const S, const spinor * const R, const int N, const int parallel) {
   double ALIGN res = 0.0;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
@@ -69,7 +69,7 @@ double scalar_prod_r(const spinor * const S, const spinor * const R, const int N
   ks = vec_splats(0.0);
   kc = vec_splats(0.0);
 
-#ifndef OMP
+#ifndef TM_USE_OMP
 #pragma unroll(2)
 #else
 #pragma omp for
@@ -111,7 +111,7 @@ double scalar_prod_r(const spinor * const S, const spinor * const R, const int N
   }
   buffer = vec_add(kc, ks);
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = buffer[0] + buffer[1] + buffer[2] + buffer[3];
   } /* OpenMP parallel closing brace */
   for( int i = 0; i < omp_num_threads; ++i)
@@ -120,7 +120,7 @@ double scalar_prod_r(const spinor * const S, const spinor * const R, const int N
   res = buffer[0] + buffer[1] + buffer[2] + buffer[3]; 
 #endif
 
-#if defined MPI
+#if defined TM_USE_MPI
   if(parallel) {
     MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     return(mres);
@@ -135,11 +135,11 @@ double scalar_prod_r(const spinor * const S, const spinor * const R, const int N
 double scalar_prod_r(const spinor * const S, const spinor * const R, const int N, const int parallel)
 {
   double ALIGN res = 0.0;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
@@ -155,7 +155,7 @@ double scalar_prod_r(const spinor * const S, const spinor * const R, const int N
   __alignx(16, R);
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < N; ++ix) {
@@ -175,7 +175,7 @@ double scalar_prod_r(const spinor * const S, const spinor * const R, const int N
   }
   kc=ks+kc;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = kc;
 
   } /* OpenMP closing brace */
@@ -186,7 +186,7 @@ double scalar_prod_r(const spinor * const S, const spinor * const R, const int N
   res = kc;
 #endif
 
-#if defined MPI
+#if defined TM_USE_MPI
   if(parallel)
   {
     MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
@@ -224,7 +224,7 @@ double scalar_prod_r_su3vect(su3_vector * const S,su3_vector * const R, const in
     kc = tr-tt;
   }
   kc = ks + kc;
-#if defined MPI
+#if defined TM_USE_MPI
   if(parallel)
   {
     MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
diff --git a/linalg/scalar_prod_r_32.c b/linalg/scalar_prod_r_32.c
new file mode 100644
index 000000000..e2b56d44d
--- /dev/null
+++ b/linalg/scalar_prod_r_32.c
@@ -0,0 +1,168 @@
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+# include <global.h>
+#endif
+#include "su3.h"
+#include "scalar_prod_r_32.h"
+
+/*  R input, S input */
+
+#include <complex.h>
+#if (defined BGQ && defined XLC)
+
+float scalar_prod_r_32(const spinor32 * const S, const spinor32 * const R, const int N, const int parallel) {
+  float ALIGN32 res = 0.0;
+#ifdef TM_USE_MPI
+  float ALIGN32 mres;
+#endif
+  /* BGQ + XLC variant: sums, over N spinors, the products of corresponding float entries of S and R. */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+  int thread_num = omp_get_thread_num();
+#endif
+  vector4double ks, kc, ds, tr, ts, tt; /* ks/kc: running sum and compensation; the rest are temporaries */
+  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; /* data loaded from S (x*) and R (y*) */
+  vector4double z0, z1, z2, z3, z4, z5; /* products x*  y* */
+  float *s, *r; /* float views of the current spinor of S and R */
+  vector4double buffer;
+  __alignx(16, s); /* NOTE(review): s and r are still unassigned at this point -- confirm these hints are intended */
+  __alignx(16, r);
+  __alignx(16, S);
+  __alignx(16, R);
+
+  __prefetch_by_load(S);
+  __prefetch_by_load(R);
+  /* start both accumulators from zero */
+  ks = vec_splats(0.0);
+  kc = vec_splats(0.0);
+
+#ifndef TM_USE_OMP
+#pragma unroll(2)
+#else
+#pragma omp for
+#endif
+  for (int ix = 0; ix < N; ++ix) {
+    s=(float*)((spinor32 *) S + ix);
+    r=(float*)((spinor32 *) R + ix);
+    __prefetch_by_load(S + ix + 1); /* touches the spinor after the current one */
+    __prefetch_by_load(R + ix + 1);
+    x0 = vec_ld(0, s); /* six loads at float offsets 0,4,...,20 -- assumes spinor32 holds 24 floats (TODO confirm) */
+    x1 = vec_ld(0, s+4);
+    x2 = vec_ld(0, s+8);
+    x3 = vec_ld(0, s+12);
+    x4 = vec_ld(0, s+16);
+    x5 = vec_ld(0, s+20);
+    y0 = vec_ld(0, r);
+    y1 = vec_ld(0, r+4);
+    y2 = vec_ld(0, r+8);
+    y3 = vec_ld(0, r+12);
+    y4 = vec_ld(0, r+16);
+    y5 = vec_ld(0, r+20);
+    z0 = vec_mul(x0, y0);
+    z1 = vec_mul(x1, y1);
+    z2 = vec_mul(x2, y2);
+    z3 = vec_mul(x3, y3);
+    z4 = vec_mul(x4, y4);
+    z5 = vec_mul(x5, y5);
+    x0 = vec_add(z0, z1); /* x0..x3 are reused as scratch for the pairwise sum of the six products */
+    x1 = vec_add(z2, z3);
+    x2 = vec_add(z4, z5);
+    x3 = vec_add(x0, x1);
+    ds = vec_add(x2, x3);
+    /* compensated (Kahan-style) update of ks with carry kc; the statement order matters */
+    tr = vec_add(ds, kc);
+    ts = vec_add(tr, ks);
+    tt = vec_sub(ts, ks);
+    ks = ts;
+    kc = vec_sub(tr, tt);
+  }
+  buffer = vec_add(kc, ks); /* fold the compensation into the sum */
+
+#ifdef TM_USE_OMP
+  g_omp_acc_re[thread_num] = buffer[0] + buffer[1] + buffer[2] + buffer[3]; /* this thread's partial result */
+  } /* OpenMP parallel closing brace */
+  for( int i = 0; i < omp_num_threads; ++i)
+    res += g_omp_acc_re[i];
+#else
+  res = buffer[0] + buffer[1] + buffer[2] + buffer[3]; 
+#endif
+  /* with MPI and parallel != 0 the local result is summed over MPI_COMM_WORLD */
+#if defined TM_USE_MPI
+  if(parallel) {
+    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
+    return(mres);
+  }
+#endif
+
+  return (res);
+}
+
+#else
+
+float scalar_prod_r_32(const spinor32 * const S, const spinor32 * const R, const int N, const int parallel)
+{
+  /* Real part of sum_x R(x) * conj(S(x)) over N sites, accumulated in float variables. */
+  float ALIGN32 local_sum = 0.0;
+#ifdef TM_USE_MPI
+  float ALIGN32 global_sum;
+#endif
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+  int my_thread = omp_get_thread_num();
+#endif
+  /* acc/comp: running sum and its rounding compensation (one set per thread under OpenMP) */
+  float ALIGN32 comp, acc, term, shifted, next, gained;
+
+  acc = 0.0;
+  comp = 0.0;
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site) {
+    const spinor32 * const lhs = &S[site];
+    const spinor32 * const rhs = &R[site];
+
+    term = creal(rhs->s0.c0 * conj(lhs->s0.c0)) + creal(rhs->s0.c1 * conj(lhs->s0.c1)) + creal(rhs->s0.c2 * conj(lhs->s0.c2)) +
+      creal(rhs->s1.c0 * conj(lhs->s1.c0)) + creal(rhs->s1.c1 * conj(lhs->s1.c1)) + creal(rhs->s1.c2 * conj(lhs->s1.c2)) +
+      creal(rhs->s2.c0 * conj(lhs->s2.c0)) + creal(rhs->s2.c1 * conj(lhs->s2.c1)) + creal(rhs->s2.c2 * conj(lhs->s2.c2)) +
+      creal(rhs->s3.c0 * conj(lhs->s3.c0)) + creal(rhs->s3.c1 * conj(lhs->s3.c1)) + creal(rhs->s3.c2 * conj(lhs->s3.c2));
+
+    shifted = term + comp;
+    next = shifted + acc;
+    gained = next - acc;
+    acc = next;
+    comp = shifted - gained;
+  }
+  comp = acc + comp;
+
+#ifdef TM_USE_OMP
+  g_omp_acc_re[my_thread] = comp;
+  } /* end of the OpenMP parallel region */
+
+  /* add up the per-thread partial sums */
+  for (int t = 0; t < omp_num_threads; ++t) {
+    local_sum += g_omp_acc_re[t];
+  }
+#else
+  local_sum = comp;
+#endif
+
+#if defined TM_USE_MPI
+  if (parallel) {
+    MPI_Allreduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
+    return global_sum;
+  }
+#endif
+  return local_sum;
+}
+
+#endif
diff --git a/linalg/scalar_prod_r_32.h b/linalg/scalar_prod_r_32.h
new file mode 100644
index 000000000..f27c22b73
--- /dev/null
+++ b/linalg/scalar_prod_r_32.h
@@ -0,0 +1,9 @@
+#ifndef _SCALAR_PROD_R_32_H
+#define _SCALAR_PROD_R_32_H
+
+#include "su3.h"
+
+/* Returns the real part of the scalar product (*R,*S) over N spinors; with MPI and parallel != 0 the sum over all ranks */
+float scalar_prod_r_32(const spinor32 * const S, const spinor32 * const R, const int N, const int parallel);
+
+#endif
\ No newline at end of file
diff --git a/linalg/scalar_prod_su3spinor.c b/linalg/scalar_prod_su3spinor.c
index 8ca2a61c4..fa037bed5 100644
--- a/linalg/scalar_prod_su3spinor.c
+++ b/linalg/scalar_prod_su3spinor.c
@@ -18,10 +18,10 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include "su3.h"
@@ -33,7 +33,7 @@ complex_spinor scalar_prod_su3spinor(su3_vector * const S, spinor * const R, con
   static _Complex double ks, kc, ds, tr, ts, tt;
   su3_vector *s, *r;
   complex_spinor c;
-#ifdef MPI
+#ifdef TM_USE_MPI
   complex_spinor d;
 #endif
 
@@ -117,7 +117,7 @@ complex_spinor scalar_prod_su3spinor(su3_vector * const S, spinor * const R, con
   kc = ks + kc;
   c.sc3 = kc;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   if(parallel == 1) {
     d = c;
     MPI_Allreduce(&d, &c, 4, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); //???
diff --git a/linalg/set_even_to_zero.c b/linalg/set_even_to_zero.c
new file mode 100644
index 000000000..bce4ded32
--- /dev/null
+++ b/linalg/set_even_to_zero.c
@@ -0,0 +1,86 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include "global.h"
+#include "su3.h"
+#include "set_even_to_zero.h"
+
+void set_even_to_zero(spinor * const P) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* Zeroes all twelve components (s0..s3, c0..c2) of P at every local site for which
+   * t+x+y+z plus the process offsets g_proc_coords[k]*L_k is even. The loop variables
+   * are declared inside the OpenMP region so that each thread works on its own copy. */
+  int x, y, z, t, ix;
+  spinor * p = NULL;
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(x = 0; x < LX; x++) {
+    for(y = 0; y < LY; y++) {
+      for(z = 0; z < LZ; z++) {
+        for(t = 0; t < T; t++) {
+          /* NOTE(review): P is addressed with the g_ipt index ix; no even/odd sub-index
+           * is used here -- confirm that this is the intended field layout. */
+          ix = g_ipt[t][x][y][z];
+          if((t+x+y+z+g_proc_coords[3]*LZ+g_proc_coords[2]*LY
+              + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) {
+            p = P+ix;
+
+            p->s0.c0 = 0.0;
+            p->s0.c1 = 0.0;
+            p->s0.c2 = 0.0;
+
+            p->s1.c0 = 0.0;
+            p->s1.c1 = 0.0;
+            p->s1.c2 = 0.0;
+
+            p->s2.c0 = 0.0;
+            p->s2.c1 = 0.0;
+            p->s2.c2 = 0.0;
+
+            p->s3.c0 = 0.0;
+            p->s3.c1 = 0.0;
+            p->s3.c2 = 0.0;
+          }
+        }
+      }
+    }
+  }
+
+#ifdef TM_USE_OMP
+  } /*OpenMP closing brace */
+#endif
+
+  return;
+}
diff --git a/linalg/set_even_to_zero.h b/linalg/set_even_to_zero.h
new file mode 100644
index 000000000..807ce5967
--- /dev/null
+++ b/linalg/set_even_to_zero.h
@@ -0,0 +1,25 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _SET_EVEN_TO_ZERO_H
+#define _SET_EVEN_TO_ZERO_H
+
+void set_even_to_zero(spinor * const P); /* zeroes P on all sites whose coordinate sum (incl. process offsets) is even */
+
+#endif
diff --git a/linalg/square_and_minmax.c b/linalg/square_and_minmax.c
new file mode 100644
index 000000000..1bff61c2c
--- /dev/null
+++ b/linalg/square_and_minmax.c
@@ -0,0 +1,424 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
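+ * File square_and_minmax.c
+ *
+ *   void square_and_minmax(double * const sum, double * const min, double * const max, const spinor * const P, const int N)
+ *     Computes the square norm of *P together with the minimal and maximal local (per-site) norm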
+ *
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+# include "global.h"
+#endif
+#include <complex.h>
+#include "su3.h"
+#include "su3adj.h"
+#include "su3spinor.h"
+#include "square_and_minmax.h"
+
+void square_and_minmax(double * const sum, double * const min, double * const max, const spinor * const P, const int N)
+{
+  /* Stores in *sum the square norm of P over N sites and in *min / *max the extreme per-site norms. */
+  double ALIGN ks,kc,ds,tr,ts,tt;
+  const spinor *s;
+  
+  ks=0.0;
+  kc=0.0;
+  *max = 0.0;
+  *min = -1; /* negative sentinel meaning "nothing recorded yet"; it is left at -1 if N <= 0 */
+
+#if (defined BGL && defined XLC)
+  /* alignment hint for the input field */
+  __alignx(16, P);
+#endif
+  
+  for (int ix = 0; ix < N; ix++)
+  {
+    s = P + ix;
+    /* per-site norm; the conversion of the complex sum to double keeps its real part */
+    ds=s->s0.c0 * conj(s->s0.c0) + s->s0.c1 * conj(s->s0.c1) + s->s0.c2 * conj(s->s0.c2) +  
+      s->s1.c0 * conj(s->s1.c0) + s->s1.c1 * conj(s->s1.c1) + s->s1.c2 * conj(s->s1.c2) +  
+      s->s2.c0 * conj(s->s2.c0) + s->s2.c1 * conj(s->s2.c1) + s->s2.c2 * conj(s->s2.c2) +
+      s->s3.c0 * conj(s->s3.c0) + s->s3.c1 * conj(s->s3.c1) + s->s3.c2 * conj(s->s3.c2);
+    /* compensated (Kahan) summation of the per-site values */
+    tr=ds + kc;
+    ts=tr + ks;
+    tt=ts-ks;
+    ks=ts;
+    kc=tr-tt;
+
+    if(ds > *max) *max = ds;
+    if(ds < *min || *min < 0) *min = ds;
+  }
+  kc=ks + kc;
+  *sum=kc;
+
+#if defined TM_USE_MPI
+  /* global results: sum, minimum and maximum over all ranks of MPI_COMM_WORLD */
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min = kc;
+
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max = kc;
+
+#endif
+
+  return;
+}
+
+void square_and_minmax_rel(double * const sum, double * const min, double * const max, const spinor * const P, const spinor * const Q, const int N)
+{
+  /* For each of the N sites forms |P(x)|^2 / |Q(x)|^2; stores the sum in *sum, the extremes in *min / *max. */
+  double ALIGN ks,kc,ds,dr,tr,ts,tt;
+  const spinor *s, *r;
+  
+  ks=0.0;
+  kc=0.0;
+  *max = 0.0;
+  *min = -1; /* negative sentinel meaning "nothing recorded yet"; it is left at -1 if N <= 0 */
+
+#if (defined BGL && defined XLC)
+  __alignx(16, P);
+  __alignx(16, Q);
+#endif
+  
+  for (int ix = 0; ix < N; ix++)
+  {
+    s = P + ix;
+    r = Q + ix;
+    /* per-site norms of P (ds) and Q (dr); the conversion to double keeps the real part */
+    ds=s->s0.c0 * conj(s->s0.c0) + s->s0.c1 * conj(s->s0.c1) + s->s0.c2 * conj(s->s0.c2) +  
+      s->s1.c0 * conj(s->s1.c0) + s->s1.c1 * conj(s->s1.c1) + s->s1.c2 * conj(s->s1.c2) +  
+      s->s2.c0 * conj(s->s2.c0) + s->s2.c1 * conj(s->s2.c1) + s->s2.c2 * conj(s->s2.c2) +
+      s->s3.c0 * conj(s->s3.c0) + s->s3.c1 * conj(s->s3.c1) + s->s3.c2 * conj(s->s3.c2);
+
+    dr=r->s0.c0 * conj(r->s0.c0) + r->s0.c1 * conj(r->s0.c1) + r->s0.c2 * conj(r->s0.c2) +  
+      r->s1.c0 * conj(r->s1.c0) + r->s1.c1 * conj(r->s1.c1) + r->s1.c2 * conj(r->s1.c2) +  
+      r->s2.c0 * conj(r->s2.c0) + r->s2.c1 * conj(r->s2.c1) + r->s2.c2 * conj(r->s2.c2) +
+      r->s3.c0 * conj(r->s3.c0) + r->s3.c1 * conj(r->s3.c1) + r->s3.c2 * conj(r->s3.c2);
+    
+    ds = ds/dr; /* no guard: a site with dr == 0 yields inf or NaN */
+    /* compensated (Kahan) summation of the per-site ratios */
+    tr=ds + kc;
+    ts=tr + ks;
+    tt=ts-ks;
+    ks=ts;
+    kc=tr-tt;
+
+    if(ds > *max) *max = ds;
+    if(ds < *min || *min < 0) *min = ds;
+  }
+  kc=ks + kc;
+  *sum=kc;
+
+#if defined TM_USE_MPI
+  /* global results: sum, minimum and maximum over all ranks of MPI_COMM_WORLD */
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min = kc;
+
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max = kc;
+
+#endif
+
+  return;
+}
+
+void square_and_minmax_abs(double * const sum, double * const min, double * const max,  double * const min_abs, double * const max_abs, const spinor * const P, const int N)
+{
+  /* Stores in *sum the square norm of P over N sites and in *min / *max the extreme per-site norms. */
+  double ALIGN ks,kc,ds,dds,tr,ts,tt;
+  const spinor *s;
+  /* *min_abs / *max_abs receive the extreme squared modulus of a single component (12 per site). */
+  ks=0.0;
+  kc=0.0;
+  *max = 0.0;
+  *min = -1; /* negative sentinels mean "nothing recorded yet" */
+  *max_abs = 0.0;
+  *min_abs = -1;
+
+#if (defined BGL && defined XLC)
+  /* alignment hint for the input field */
+  __alignx(16, P);
+#endif
+  
+  for (int ix = 0; ix < N; ix++)
+  {
+    s = P + ix;
+    /* maximum and minimum are tested independently, so one value may update both */
+    dds=s->s0.c0 * conj(s->s0.c0);
+    ds=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s0.c1 * conj(s->s0.c1);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s0.c2 * conj(s->s0.c2);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c0 * conj(s->s1.c0);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c1 * conj(s->s1.c1);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c2 * conj(s->s1.c2);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c0 * conj(s->s2.c0);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c1 * conj(s->s2.c1);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c2 * conj(s->s2.c2);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c0 * conj(s->s3.c0);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c1 * conj(s->s3.c1);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c2 * conj(s->s3.c2);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    tr=ds + kc;
+    ts=tr + ks;
+    tt=ts-ks;
+    ks=ts;
+    kc=tr-tt;
+
+    if(ds > *max) *max = ds;
+    if(ds < *min || *min < 0) *min = ds;
+  }
+  kc=ks + kc;
+  *sum=kc;
+
+#if defined TM_USE_MPI
+  /* global results: sum and the four extremes over all ranks of MPI_COMM_WORLD */
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min = kc;
+
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max = kc;
+
+  MPI_Allreduce(min_abs, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min_abs = kc;
+
+  MPI_Allreduce(max_abs, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max_abs = kc;
+
+#endif
+
+  return;
+}
+
+void square_and_minmax_rel_abs(double * const sum, double * const min, double * const max, double * const min_abs, double * const max_abs, const spinor * const P, const spinor * const Q, const int N)
+{
+  /* Per site: ratio |P(x)|^2 / |Q(x)|^2 -> *sum, *min, *max; per component: ratio of squared moduli -> *min_abs, *max_abs. */
+  double ALIGN ks,kc,ds,dds,dr,ddr,tr,ts,tt;
+  const spinor *s, *r;
+  
+  ks=0.0;
+  kc=0.0;
+  *max = 0.0;
+  *min = -1; /* negative sentinels mean "nothing recorded yet" */
+  *max_abs = 0.0;
+  *min_abs = -1;
+  /* alignment hints for both input fields */
+#if (defined BGL && defined XLC)
+  __alignx(16, P);
+  __alignx(16, Q);
+#endif
+  
+  for (int ix = 0; ix < N; ix++)
+  {
+    s = P + ix;
+    r = Q + ix;
+    /* maximum and minimum are tested independently, so one value may update both */
+    dds=s->s0.c0 * conj(s->s0.c0);
+    ddr=r->s0.c0 * conj(r->s0.c0);
+    ds=dds;
+    dr=ddr;
+    dds/=ddr; /* no guard here or below: ddr == 0 yields inf or NaN */
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s0.c1 * conj(s->s0.c1);
+    ddr=r->s0.c1 * conj(r->s0.c1);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s0.c2 * conj(s->s0.c2);
+    ddr=r->s0.c2 * conj(r->s0.c2);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c0 * conj(s->s1.c0);
+    ddr=r->s1.c0 * conj(r->s1.c0);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c1 * conj(s->s1.c1);
+    ddr=r->s1.c1 * conj(r->s1.c1);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c2 * conj(s->s1.c2);
+    ddr=r->s1.c2 * conj(r->s1.c2);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c0 * conj(s->s2.c0);
+    ddr=r->s2.c0 * conj(r->s2.c0);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c1 * conj(s->s2.c1);
+    ddr=r->s2.c1 * conj(r->s2.c1);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c2 * conj(s->s2.c2);
+    ddr=r->s2.c2 * conj(r->s2.c2);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c0 * conj(s->s3.c0);
+    ddr=r->s3.c0 * conj(r->s3.c0);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c1 * conj(s->s3.c1);
+    ddr=r->s3.c1 * conj(r->s3.c1);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c2 * conj(s->s3.c2);
+    ddr=r->s3.c2 * conj(r->s3.c2);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+    
+    ds = ds/dr;
+    /* compensated (Kahan) summation of the per-site ratios */
+    tr=ds + kc;
+    ts=tr + ks;
+    tt=ts-ks;
+    ks=ts;
+    kc=tr-tt;
+
+    if(ds > *max) *max = ds;
+    if(ds < *min || *min < 0) *min = ds;
+  }
+  kc=ks + kc;
+  *sum=kc;
+
+#if defined TM_USE_MPI
+  /* global results: sum and the four extremes over all ranks of MPI_COMM_WORLD */
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min = kc;
+
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max = kc;
+
+  MPI_Allreduce(min_abs, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min_abs = kc;
+
+  MPI_Allreduce(max_abs, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max_abs = kc;
+
+#endif
+
+  return;
+}
diff --git a/linalg/square_and_minmax.h b/linalg/square_and_minmax.h
new file mode 100644
index 000000000..7be9ef725
--- /dev/null
+++ b/linalg/square_and_minmax.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _SQUARE_AND_MAX_H
+#define _SQUARE_AND_MAX_H
+
+#include "su3.h"
+
+/* void square_and_minmax(sum, min, max, P, N)
+ *     Stores the square norm of *P in *sum and the minimal/maximal per-site norm in *min / *max */
+
+/* void square_and_minmax_rel(sum, min, max, P, Q, N)
+ *     Same for the per-site ratio of the norms of *P and *Q (its sum, minimum and maximum) */
+/* The _abs variants additionally store the minimal/maximal single-component value in *min_abs / *max_abs */
+void square_and_minmax(double * const sum, double * const min, double * const max, const spinor * const P, const int N);
+void square_and_minmax_rel(double * const sum, double * const min, double * const max, const spinor * const P,  const spinor * const Q, const int N);
+void square_and_minmax_abs(double * const sum, double * const min, double * const max, double * const min_abs, double * const max_abs, const spinor * const P, const int N);
+void square_and_minmax_rel_abs(double * const sum, double * const min, double * const max, double * const min_abs, double * const max_abs, const spinor * const P,  const spinor * const Q, const int N);
+
+
+
+#endif
+
+
+
diff --git a/linalg/square_and_prod_r.c b/linalg/square_and_prod_r.c
index b6482c802..7aa29454e 100644
--- a/linalg/square_and_prod_r.c
+++ b/linalg/square_and_prod_r.c
@@ -27,12 +27,12 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "su3.h"
@@ -86,7 +86,7 @@ void square_and_prod_r(double * const x1, double * const x2, spinor * const S, s
   xkc=xks + xkc;
   *x1=xkc;
 
-#if defined MPI
+#if defined TM_USE_MPI
 
   MPI_Allreduce(&xkc, x1, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 
@@ -94,7 +94,7 @@ void square_and_prod_r(double * const x1, double * const x2, spinor * const S, s
   kc=ks + kc;
   *x2=kc;
 
-#if defined MPI
+#if defined TM_USE_MPI
 
     MPI_Allreduce(&kc, x2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 
diff --git a/linalg/square_norm.c b/linalg/square_norm.c
index 02f50e74a..9a3f92d0c 100644
--- a/linalg/square_norm.c
+++ b/linalg/square_norm.c
@@ -24,15 +24,15 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 # include "global.h"
 #endif
@@ -163,7 +163,7 @@ double square_norm(spinor * const P, const int N, const int parallel) {
   y00 = __fpadd(y00, y04);
   y00 = __fpadd(y00, y08);
   res = __creal(y00)+__cimag(y00);
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   if(parallel) {
     MPI_Allreduce(&res, &res2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     return res2;
@@ -176,11 +176,11 @@ double square_norm(spinor * const P, const int N, const int parallel) {
 
 double square_norm(spinor * const P, const int N, const int parallel) {
   double ALIGN res = 0.0;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
     int thread_num = omp_get_thread_num();
@@ -192,7 +192,7 @@ double square_norm(spinor * const P, const int N, const int parallel) {
   ks = vec_splats(0.);
   kc = vec_splats(0.);
 
-#ifndef OMP
+#ifndef TM_USE_OMP
 #pragma unroll(4)
 #else
 #pragma omp for
@@ -227,7 +227,7 @@ double square_norm(spinor * const P, const int N, const int parallel) {
   }
   buffer = vec_add(kc,ks);
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = buffer[0] + buffer[1] + buffer[2] + buffer[3];
   } /* OpenMP closing brace */
 
@@ -237,7 +237,7 @@ double square_norm(spinor * const P, const int N, const int parallel) {
   res = buffer[0] + buffer[1] + buffer[2] + buffer[3];
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   if(parallel) {
     MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     return mres;
@@ -253,11 +253,11 @@ double square_norm(spinor * const P, const int N, const int parallel) {
 double square_norm(const spinor * const P, const int N, const int parallel)
 {
   double ALIGN res = 0.0;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
     int thread_num = omp_get_thread_num();
@@ -269,7 +269,7 @@ double square_norm(const spinor * const P, const int N, const int parallel)
   ks = 0.0;
   kc = 0.0;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif    
   for (int ix  =  0; ix < N; ix++) {
@@ -296,7 +296,7 @@ double square_norm(const spinor * const P, const int N, const int parallel)
   }
   kc=ks+kc;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = kc;
 
   } /* OpenMP closing brace */
@@ -309,7 +309,7 @@ double square_norm(const spinor * const P, const int N, const int parallel)
   res = kc;
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   if(parallel) {
     MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     return mres;
@@ -321,6 +321,67 @@ double square_norm(const spinor * const P, const int N, const int parallel)
 
 #endif
 
+// threadsafe version
+
+double square_norm_ts(const spinor * const P, const int N, const int parallel)
+{
+  /* Square norm of P over N sites; with MPI and parallel != 0 the sum over MPI_COMM_WORLD. */
+  double ALIGN norm_sq = 0.0;
+#ifdef TM_USE_MPI
+  double ALIGN global_norm_sq;
+#endif
+#ifdef TM_USE_OMP2
+#pragma omp parallel reduction(+:norm_sq)
+  {
+#endif
+  /* NOTE(review): guarded by TM_USE_OMP2, not TM_USE_OMP -- presumably to keep this path serial; confirm */
+  double ALIGN acc, comp, term, shifted, next, gained; /* acc/comp: running sum and its compensation */
+
+  acc = 0.0;
+  comp = 0.0;
+
+#ifdef TM_USE_OMP2
+#pragma omp for
+#endif
+  for (int site = 0; site < N; ++site) {
+    const spinor * const cur = &P[site];
+
+    term = conj(cur->s0.c0) * cur->s0.c0 +
+           conj(cur->s0.c1) * cur->s0.c1 +
+           conj(cur->s0.c2) * cur->s0.c2 +
+           conj(cur->s1.c0) * cur->s1.c0 +
+           conj(cur->s1.c1) * cur->s1.c1 +
+           conj(cur->s1.c2) * cur->s1.c2 +
+           conj(cur->s2.c0) * cur->s2.c0 +
+           conj(cur->s2.c1) * cur->s2.c1 +
+           conj(cur->s2.c2) * cur->s2.c2 +
+           conj(cur->s3.c0) * cur->s3.c0 +
+           conj(cur->s3.c1) * cur->s3.c1 +
+           conj(cur->s3.c2) * cur->s3.c2;
+
+    shifted = term + comp;
+    next = shifted + acc;
+    gained = next - acc;
+    acc = next;
+    comp = shifted - gained;
+  }
+  norm_sq = acc + comp;
+
+#ifdef TM_USE_OMP2
+  } /* end of the OpenMP parallel region */
+#endif
+
+#ifdef TM_USE_MPI
+  if (parallel) {
+    MPI_Allreduce(&norm_sq, &global_norm_sq, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    return global_norm_sq;
+  }
+#endif
+
+  return norm_sq;
+}
+
+
 #ifdef WITHLAPH
 double square_norm_su3vect(su3_vector * const P, const int N, const int parallel) 
 {
@@ -346,7 +407,7 @@ double square_norm_su3vect(su3_vector * const P, const int N, const int parallel
       kc = tr-tt;
     }
   kc = ks + kc;
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   if(parallel) {
     MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     return ks;
diff --git a/linalg/square_norm.h b/linalg/square_norm.h
index 29aa88ba8..00de4960c 100644
--- a/linalg/square_norm.h
+++ b/linalg/square_norm.h
@@ -26,6 +26,7 @@
  *     Returns the square norm of *P */
 
 double square_norm(const spinor * const P, const int N, const int parallel);
+double square_norm_ts(const spinor * const P, const int N, const int parallel);
 double square_norm_su3vect(su3_vector * const P, const int N, const int parallel);
 
 
diff --git a/linalg/square_norm_32.c b/linalg/square_norm_32.c
new file mode 100644
index 000000000..5d0c51f5c
--- /dev/null
+++ b/linalg/square_norm_32.c
@@ -0,0 +1,223 @@
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+# include "global.h"
+#endif
+#include <complex.h>
+#include "su3.h"
+#include "square_norm_32.h"
+
+#if (defined BGQ && defined XLC)
+
+float square_norm_32(const spinor32 * const P, const int N, const int parallel) {
+  float ALIGN32 res = 0.0;
+#ifdef TM_USE_MPI
+  float ALIGN32 mres;
+#endif
+  /* Returns the sum of squares of all floats in the N spinors at P; MPI-summed if parallel != 0. */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+    int thread_num = omp_get_thread_num();
+#endif
+  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
+  vector4double ds,tt,tr,ts,kc,ks,buffer;
+  float *s ALIGN32;
+  /* P is const-qualified to match the prototype in square_norm_32.h; it is only read here */
+  ks = vec_splats(0.);
+  kc = vec_splats(0.);
+  /* each iteration squares six 4-wide loads (24 floats) and adds them to a lane-wise sum */
+#ifndef TM_USE_OMP
+#pragma unroll(4)
+#else
+#pragma omp for
+#endif
+  for(int i = 0; i < N; i++) {
+    s = (float*)((spinor32*) P+i); /* const is cast away; s is only used for the loads below */
+    __prefetch_by_load(P+i+1);
+    x0 = vec_ld(0, s);
+    x1 = vec_ld(0, s+4);
+    x2 = vec_ld(0, s+8);
+    x3 = vec_ld(0, s+12);
+    x4 = vec_ld(0, s+16);
+    x5 = vec_ld(0, s+20);
+    y0 = vec_mul(x0, x0);
+    y1 = vec_mul(x1, x1);
+    y2 = vec_mul(x2, x2);
+    y3 = vec_mul(x3, x3);
+    y4 = vec_mul(x4, x4);
+    y5 = vec_mul(x5, x5);
+    /* pairwise tree-add of the six squared vectors */
+    x0 = vec_add(y0, y1);
+    x1 = vec_add(y2, y3);
+    x2 = vec_add(y4, y5);
+    x3 = vec_add(x0, x1);
+    ds = vec_add(x2, x3);
+    /* compensated (Kahan-style) update: ks is the running sum, kc the carried correction */
+    tr = vec_add(ds, kc);
+    ts = vec_add(tr, ks);
+    tt = vec_sub(ts, ks);
+    ks = ts;
+    kc = vec_sub(tr, tt);
+  }
+  buffer = vec_add(kc,ks);
+  /* collapse the four vector lanes into one scalar */
+#ifdef TM_USE_OMP
+  g_omp_acc_re[thread_num] = buffer[0] + buffer[1] + buffer[2] + buffer[3];
+  } /* OpenMP closing brace */
+  /* outside the parallel region: serial sum of the per-thread partial results */
+  for(int i = 0; i < omp_num_threads; ++i)
+    res += g_omp_acc_re[i];
+#else
+  res = buffer[0] + buffer[1] + buffer[2] + buffer[3];
+#endif
+
+#  ifdef TM_USE_MPI
+  if(parallel) {
+    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
+    return mres;
+  }
+#  endif
+
+  return res;
+}
+
+
+#else 
+float square_norm_32(const spinor32 * const P, const int N, const int parallel)
+{
+  float ALIGN32 res = 0.0;
+#ifdef TM_USE_MPI
+  float ALIGN32 mres;
+#endif
+  /* Returns the sum of conj(c)*c over the 12 components (s0..s3 x c0..c2) of the N spinors at P. */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+    int thread_num = omp_get_thread_num();
+    g_omp_acc_re[thread_num] = 0.0;
+#endif
+  float ALIGN32 ks,kc,ds,tr,ts,tt;
+  const spinor32 *s;
+  /* compensated-sum state (per thread under OpenMP): ks = running sum, kc = carried correction */
+  ks = 0.0;
+  kc = 0.0;
+  
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif    
+  for (int ix  =  0; ix < N; ix++) {
+    s = P + ix;
+    /* ds is a real float, so only the real part of the complex sum below is kept */
+    ds = conj(s->s0.c0) * s->s0.c0 +
+         conj(s->s0.c1) * s->s0.c1 +
+         conj(s->s0.c2) * s->s0.c2 +
+         conj(s->s1.c0) * s->s1.c0 +
+         conj(s->s1.c1) * s->s1.c1 +
+         conj(s->s1.c2) * s->s1.c2 +
+         conj(s->s2.c0) * s->s2.c0 +
+         conj(s->s2.c1) * s->s2.c1 +
+         conj(s->s2.c2) * s->s2.c2 +
+         conj(s->s3.c0) * s->s3.c0 +
+         conj(s->s3.c1) * s->s3.c1 +
+         conj(s->s3.c2) * s->s3.c2;
+    /* compensated (Kahan) update; the order of these five statements matters */
+    tr = ds + kc;
+    ts = tr + ks;
+    tt = ts-ks;
+    ks = ts;
+    kc = tr-tt;
+  }
+  kc=ks+kc;
+
+#ifdef TM_USE_OMP
+  g_omp_acc_re[thread_num] = kc;
+
+  } /* OpenMP closing brace */
+
+  /* having left the parallel section, we can now sum up the Kahan
+     corrected sums from each thread into res */
+  for(int i = 0; i < omp_num_threads; ++i)
+    res += g_omp_acc_re[i];
+#else
+  res = kc;
+#endif
+  /* with MPI and parallel != 0, the local result is summed over MPI_COMM_WORLD */
+#  ifdef TM_USE_MPI
+  if(parallel) {
+    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
+    return mres;
+  }
+#endif
+
+  return res;
+}
+
+#endif
+
+// threadsafe version: per-thread partial sums are combined by an OpenMP reduction on res,
+// not through the shared g_omp_acc_re array that square_norm_32 writes to
+float square_norm_ts_32(const spinor32 * const P, const int N, const int parallel)
+{
+  float ALIGN32 res = 0.0;
+#ifdef TM_USE_MPI
+  float ALIGN32 mres;
+#endif
+  /* NOTE(review): OpenMP is gated on TM_USE_OMP2 here, not TM_USE_OMP -- confirm the build defines it */
+#ifdef TM_USE_OMP2
+#pragma omp parallel reduction(+:res)
+  {
+#endif
+  float ALIGN32 ks,kc,ds,tr,ts,tt;
+  const spinor32 *s;
+  /* compensated-sum state: ks = running sum, kc = carried correction */
+  ks = 0.0;
+  kc = 0.0;
+  
+#ifdef TM_USE_OMP2
+#pragma omp for
+#endif    
+  for (int ix  =  0; ix < N; ix++) {
+    s = P + ix;
+    /* ds is a real float, so only the real part of the complex sum below is kept */
+    ds = conj(s->s0.c0) * s->s0.c0 +
+         conj(s->s0.c1) * s->s0.c1 +
+         conj(s->s0.c2) * s->s0.c2 +
+         conj(s->s1.c0) * s->s1.c0 +
+         conj(s->s1.c1) * s->s1.c1 +
+         conj(s->s1.c2) * s->s1.c2 +
+         conj(s->s2.c0) * s->s2.c0 +
+         conj(s->s2.c1) * s->s2.c1 +
+         conj(s->s2.c2) * s->s2.c2 +
+         conj(s->s3.c0) * s->s3.c0 +
+         conj(s->s3.c1) * s->s3.c1 +
+         conj(s->s3.c2) * s->s3.c2;
+    /* compensated (Kahan) update; the order of these five statements matters */
+    tr = ds + kc;
+    ts = tr + ks;
+    tt = ts-ks;
+    ks = ts;
+    kc = tr-tt;
+  }
+  res=ks+kc; /* under TM_USE_OMP2 this sets the thread-private res; the reduction adds them */
+#ifdef TM_USE_OMP2
+  } /* OpenMP closing brace */
+#endif
+  /* with MPI and parallel != 0, the local result is summed over MPI_COMM_WORLD */
+#  ifdef TM_USE_MPI
+  if(parallel) {
+    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
+    return mres;
+  }
+#endif
+
+  return res;
+}
diff --git a/linalg/square_norm_32.h b/linalg/square_norm_32.h
new file mode 100644
index 000000000..b6ce1a386
--- /dev/null
+++ b/linalg/square_norm_32.h
@@ -0,0 +1,11 @@
+#ifndef _SQUARE_NORM_32_H
+#define _SQUARE_NORM_32_H
+
+#include "su3.h"
+
+/* float square_norm_32(const spinor32 * const P, const int N, const int parallel)
+ *     Returns the square norm of the N spinor32 entries starting at P */
+
+float square_norm_32(const spinor32 * const P, const int N, const int parallel);
+float square_norm_ts_32(const spinor32 * const P, const int N, const int parallel);
+#endif
diff --git a/linalg_eo.h b/linalg_eo.h
index b33fc1340..2bba98c4f 100644
--- a/linalg_eo.h
+++ b/linalg_eo.h
@@ -21,31 +21,44 @@
 #define _LINALG_EO_H
 
 #include "linalg/diff.h"
+#include "linalg/diff_32.h"
 #include "linalg/mul_r.h"
+#include "linalg/mul_r_32.h"
 #include "linalg/square_norm.h"
+#include "linalg/square_norm_32.h"
 #include "linalg/scalar_prod_r.h"
+#include "linalg/scalar_prod_r_32.h"
 #include "linalg/scalar_prod_i.h"
 #include "linalg/square_and_prod_r.h"
+#include "linalg/square_and_minmax.h"
 #include "linalg/assign_add_mul_r.h"
+#include "linalg/assign_add_mul_r_32.h"
 #include "linalg/assign_mul_bra_add_mul_r.h"
 #include "linalg/assign_add_mul_r_add_mul.h"
 #include "linalg/assign_mul_bra_add_mul_ket_add_r.h"
 #include "linalg/assign_mul_add_mul_add_mul_add_mul_r.h"
 #include "linalg/diff_and_square_norm.h"
 #include "linalg/assign.h"
+#include "linalg/assign_to_32.h"
 /* #include "linalg/deri_linalg.h" */
 #include "linalg/assign_mul_add_r.h"
+#include "linalg/assign_mul_add_r_32.h"
 #include "linalg/assign_mul_add_r_and_square.h"
 #include "linalg/scalar_prod.h"
 #include "linalg/mul_diff_mul.h"
 #include "linalg/assign_add_mul.h"
+#include "linalg/assign_mul_add.h"
 #include "linalg/assign_diff_mul.h"
 #include "linalg/mul_add_mul.h"
 #include "linalg/mul.h"
 #include "linalg/assign_add_mul_add_mul.h"
 #include "linalg/assign_mul_bra_add_mul_ket_add.h"
 #include "linalg/add.h"
+#include "linalg/addto_32.h"
+#include "linalg/assign_to_32.h"
 #include "linalg/assign_mul_add_mul_r.h"
+#include "linalg/assign_mul_add_mul.h"
+#include "linalg/assign_mul_add_mul_r_32.h"
 #include "linalg/assign_mul_add_mul_add_mul_r.h"
 #include "linalg/mul_add_mul_r.h"
 
@@ -54,5 +67,10 @@
 #include "linalg/mattimesvec.h"
 
 #include "linalg/convert_eo_to_lexic.h"
+#include "linalg/convert_even_to_lexic.h"
+#include "linalg/convert_odd_to_lexic.h"
+#include "linalg/set_even_to_zero.h"
+#include "linalg/mul_gamma5.h"
+
 
 #endif
diff --git a/little_D.c b/little_D.c
index 30775e9c5..7cda9385f 100644
--- a/little_D.c
+++ b/little_D.c
@@ -19,13 +19,13 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -34,7 +34,8 @@
 #include "linalg/blas.h"
 #include "solver/gcr4complex.h"
 #include "solver/generate_dfl_subspace.h"
-#include "block.h"
+#include "xchange/little_field_gather.h"
+#include "gamma.h"
 #include "linalg_eo.h"
 #include "little_D.h"
 
@@ -42,8 +43,6 @@
 /* assume we have a little field w                       */
 /* which has length 9*nb_blocks*N_s                      */
 /* with usual order in space                             */
-/* nb_blocks = 2 currently fixed                         */
-/* and blocks devide z-direction by 2                    */
 /*                                                       */
 /* block[0], block[1], block[0], block[1], block[0]  ... */
 /* local             , +t                , -t        ... */
@@ -57,6 +56,7 @@ int dfl_subspace_updated = 1;
 /* some lapack related stuff */
 static int ONE = 1;
 static _Complex double CONE, CZERO, CMONE;
+static _Complex float CONE_32, CZERO_32, CMONE_32;
 
 enum{
   NONE = 0,
@@ -99,7 +99,7 @@ void invert_little_D_spinor(spinor *r, spinor *s){
     }
   }
 
-  i = gcr4complex(w, v, 10, 100, 1e-31, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_D);
+  i = gcr4complex(w, v, 10, 100, little_solver_high_prec, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, 0, &little_D);
   if(g_proc_id == 0 && g_debug_level > 0) {
     printf("lgcr: %d iterations in invert_little_D_spinor\n", i);
   }
@@ -146,12 +146,12 @@ void invert_little_D_eo_spinor(spinor *r, spinor *s){
     for(i=0;i<nb_blocks;i++) {
       v[j + i*g_N_s] = scalar_prod(block_list[i].basis[j], psi[i], VOLUME/nb_blocks, 0);
       if (block_list[i].evenodd==0) {
-      v_eo[j+i_e*g_N_s] = v[j+i*g_N_s];
-      i_e++;
+	v_eo[j+i_e*g_N_s] = v[j+i*g_N_s];
+	i_e++;
       }
       if (block_list[i].evenodd==1) {
-      v_eo[j+nb_blocks*g_N_s/2+i_o*g_N_s] = v[j+i*g_N_s];
-      i_o++; 
+	v_eo[j+nb_blocks*g_N_s/2+i_o*g_N_s] = v[j+i*g_N_s];
+	i_o++; 
       }
     }
   }
@@ -160,26 +160,26 @@ void invert_little_D_eo_spinor(spinor *r, spinor *s){
   little_D_hop(1,v_o,v_e);
   little_Dhat_rhs(1,v_o,-1,v_eo);
   
-  iter = gcr4complex(w_eo, v_o, 10, 100, 1e-31, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_D_sym);
+  iter = gcr4complex(w_eo, v_o, 10, 100, 1e-31, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, 0, &little_D_sym);
 
-      little_D_hop(0,ctmp2, w_eo);   
-      little_D_ee_inv(w_eo,ctmp2);
-      little_Dhat_rhs(0,w_eo, -1., v_e);
+  little_D_hop(0,ctmp2, w_eo);   
+  little_D_ee_inv(w_eo,ctmp2);
+  little_Dhat_rhs(0,w_eo, -1., v_e);
             
-      for (j = 0; j < g_N_s; j++) {
-        i_o=0;
-        i_e=0;
-        for(i = 0; i < nb_blocks; i++) {
-         if (block_list[i].evenodd==0) {
-            w[j + i*g_N_s] = w_eo[j + i_e*g_N_s];
-            i_e++;
-          }
-          if (block_list[i].evenodd==1) {
-            w[j + i*g_N_s] = w_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s];
-            i_o++;
-          }
-        }
+  for (j = 0; j < g_N_s; j++) {
+    i_o=0;
+    i_e=0;
+    for(i = 0; i < nb_blocks; i++) {
+      if (block_list[i].evenodd==0) {
+	w[j + i*g_N_s] = w_eo[j + i_e*g_N_s];
+	i_e++;
       }
+      if (block_list[i].evenodd==1) {
+	w[j + i*g_N_s] = w_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s];
+	i_o++;
+      }
+    }
+  }
 
   if(g_proc_id == 0 && g_debug_level > 0) {
     printf("lgcr: %d iterations in invert_little_D_eo_spinor\n", iter);
@@ -228,55 +228,8 @@ void apply_little_D_spinor(spinor *r, spinor *s){
     for(i = 0; i < nb_blocks; i++) v[j + i*g_N_s] = scalar_prod(block_list[i].basis[j], psi[i], VOLUME/nb_blocks, 0);
   }
 
-  if (g_debug_level > 2){
-    if (!g_cart_id) {
-      for (j = 0; j < nb_blocks* g_N_s; ++j) {
-        printf("LITTLE_D for 0: v[%u] = %1.5e + %1.5e i\n", j, creal(v[j]), cimag(v[j]));
-      }
-    }
-#ifdef MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
-  }
-
-  if (g_debug_level > 4) {
-    for (k = 1; k < 16; ++k) {
-      if (g_cart_id == k) {
-        for (j = 0; j < nb_blocks* g_N_s; ++j) {
-          printf("LITTLE_D for %u: v[%u] = %1.5e + %1.5e i\n", k, j, creal(v[j]), cimag(v[j]));
-        }
-      }
-#ifdef MPI
-      MPI_Barrier(MPI_COMM_WORLD);
-#endif
-    }
-  }
-
   little_D(w, v);
 
-  if (g_debug_level > 2){
-    if (!g_cart_id){
-      for (j = 0; j < nb_blocks * g_N_s; ++j) {
-        printf("LITTLE_D for 0: w[%u] = %1.5e + %1.5e i\n", j, creal(w[j]), cimag(w[j]));
-      }
-    }
-#ifdef MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
-  }
-
-  if (g_debug_level > 4) {
-    for (k = 1; k < 16; ++k) {
-      if (g_cart_id == k) {
-        for (j = 0; j < nb_blocks* g_N_s; ++j) {
-          printf("LITTLE_D for %u: w[%u] = %1.5e + %1.5e i\n", k, j, creal(w[j]), cimag(w[j]));
-        }
-      }
-#ifdef MPI
-      MPI_Barrier(MPI_COMM_WORLD);
-#endif
-    }
-  }
   for(i = 0; i < nb_blocks; i++) {
     mul(psi[i], w[i*g_N_s], block_list[i].basis[0], VOLUME/nb_blocks);
   }
@@ -294,482 +247,48 @@ void apply_little_D_spinor(spinor *r, spinor *s){
 }
 
 
-void alt_little_field_gather(_Complex double * w) {
-#ifdef MPI
-  MPI_Status status;
-  int size = 25 * g_N_s * sizeof(_Complex double);
-  _Complex double *buf = malloc(size);
-  MPI_Buffer_attach((void*)buf, size);
-
-  /* LOWER BLOCK */
-
-  /* Send t up */
-  MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_UP, g_cart_grid);
-  MPI_Recv(w + 4 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_UP, g_cart_grid, &status);
-
-  /* Send t down */
-  MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_DN, g_cart_grid);
-  MPI_Recv(w + 2 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_DN, g_cart_grid, &status);
-
-  /* Send x up */
-  MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_UP, g_cart_grid);
-  MPI_Recv(w + 8 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_UP, g_cart_grid, &status);
-
-  /* Send x down */
-  MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_DN, g_cart_grid);
-  MPI_Recv(w + 6 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_DN, g_cart_grid, &status);
-
-  /* Send y up */
-  MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_UP, g_cart_grid);
-  MPI_Recv(w + 12 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_UP, g_cart_grid, &status);
-
-  /* Send y down */
-  MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_DN, g_cart_grid);
-  MPI_Recv(w + 10 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_DN, g_cart_grid, &status);
-
-  /* Send z up */
-  memcpy(w + 17 * g_N_s, w, g_N_s * sizeof(_Complex double));
-
-  /* Send z down */
-  MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_DN, g_cart_grid);
-  MPI_Recv(w + 15 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_DN, g_cart_grid, &status);
-
-  /* END LOWER BLOCK */
-
-  MPI_Barrier(MPI_COMM_WORLD);
-
-  /* UPPER BLOCK */
-
-  /* Send t up */
-  MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_UP, g_cart_grid);
-  MPI_Recv(w + 5 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_UP, g_cart_grid, &status);
-
-  /* Send t down */
-  MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_DN, g_cart_grid);
-  MPI_Recv(w + 3 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_DN, g_cart_grid, &status);
-
-  /* Send x up */
-  MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_UP, g_cart_grid);
-  MPI_Recv(w + 9 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_UP, g_cart_grid, &status);
-
-  /* Send x down */
-  MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_DN, g_cart_grid);
-  MPI_Recv(w + 7 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_DN, g_cart_grid, &status);
-
-  /* Send y up */
-  MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_UP, g_cart_grid);
-  MPI_Recv(w + 13 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_UP, g_cart_grid, &status);
-
-  /* Send y down */
-  MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_DN, g_cart_grid);
-  MPI_Recv(w + 11 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_DN, g_cart_grid, &status);
-
-  /* Send z up */
-  MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_UP, g_cart_grid);
-  MPI_Recv(w + 16 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_UP, g_cart_grid, &status);
-
-  /* Send z down */
-  memcpy(w + 14 * g_N_s, w + g_N_s, g_N_s * sizeof(_Complex double));
-
-  MPI_Barrier(MPI_COMM_WORLD);
-  MPI_Buffer_detach((void*)buf, &size);
-
-  free(buf);
-#endif
-  return;
-}
-
-#ifdef MPI
-MPI_Request lrequests[16];
-MPI_Status lstatus[16];
-int waitcount = 0;
-#endif
-
-
-void little_field_gather(_Complex double * w) {
-#ifdef MPI
-  int err, bt, bx, by, bz, pm, ib;
-  _Complex double *wt, *wx, *wy, *wz;
-  _Complex double *wt_buf, *wx_buf, *wy_buf, *wz_buf, *w_buf, *w_source, *w_dest;
-  /************************************************************************/
-  /* This routine has been extended for multi_dimensional blocking        */
-  /* by Claude Tadonki (claude.tadonki@u-psud.fr) from PetaQCD project    */
-  /* June 2010                                                            */
-  /************************************************************************/
-
-  w_buf = calloc(8 * nb_blocks * g_N_s, sizeof(_Complex double)); // +-t +-x +-y +-z
-
-  wt = w + ( 0*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction t starts
-  wx = w + ( 1*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction x starts
-  wy = w + ( 2*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction y starts
-  wz = w + ( 3*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction z starts
-
-  wt_buf = w_buf + ( 0*(2*nb_blocks)) * g_N_s; // Were data in the direction t starts
-  wx_buf = w_buf + ( 1*(2*nb_blocks)) * g_N_s; // Were data in the direction x starts
-  wy_buf = w_buf + ( 2*(2*nb_blocks)) * g_N_s; // Were data in the direction y starts
-  wz_buf = w_buf + ( 3*(2*nb_blocks)) * g_N_s; // Were data in the direction z starts
-
-  /* We first exchange the fields regardless of block considerations                   */
-  /* The data need to be received in an intermediate buffer because of later shuffling */
-
-  /* Send t up */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_UP, g_cart_grid, &lrequests[0]);
-  MPI_Irecv(wt_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_UP, g_cart_grid, &lrequests[1]);
-
-  /* Send t down */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_DN, g_cart_grid, &lrequests[2]);
-  MPI_Irecv(wt_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_DN, g_cart_grid, &lrequests[3]);
-
-  /* Send x up */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_UP, g_cart_grid, &lrequests[4]);
-  MPI_Irecv(wx_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_UP, g_cart_grid, &lrequests[5]);
-
-  /* Send x down */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_DN, g_cart_grid, &lrequests[6]);
-  MPI_Irecv(wx_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_DN, g_cart_grid, &lrequests[7]);
-
-  /* Send y up */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_UP, g_cart_grid, &lrequests[8]);
-  MPI_Irecv(wy_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_UP, g_cart_grid, &lrequests[9]);
-
-  /* Send y down */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_DN, g_cart_grid, &lrequests[10]);
-  MPI_Irecv(wy_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_DN, g_cart_grid, &lrequests[11]);
-
-  /* Send z up */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_UP, g_cart_grid, &lrequests[12]);
-  MPI_Irecv(wz_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_UP, g_cart_grid, &lrequests[13]);
-
-  /* Send z down */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_DN, g_cart_grid, &lrequests[14]);
-  MPI_Irecv(wz_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_DN, g_cart_grid, &lrequests[15]);
-  
-  err = MPI_Waitall(16, lrequests, lstatus);
-  
-  /* We now correct the field according to block partitionning               */
-  /* We could have avoid the previous corresponding MPI communication        */
-  /* We proceed like this for code simplicity, maybe will be optimized later */
-  
-  for(pm = 0; pm < 8; pm++) {
-    for(bt = 0; bt < nblks_t; bt++) {
-      for(bx = 0; bx < nblks_x; bx++) {
-	for(by = 0; by < nblks_y; by++) {
-	  for(bz = 0; bz < nblks_z; bz++) {
-	    ib = block_index(bt, bx, by, bz) * g_N_s;
-	    switch(pm){ 
-	    case 0: /* Direction +t */
-	      w_dest = wt + ib;
-	      if( bt == nblks_t - 1 ) {ib = block_index(0, bx, by, bz) * g_N_s; w_source = wt_buf + ib;}					 // got it from the MPI exchange
-	      else  {ib = block_index(bt + 1, bx, by, bz) * g_N_s; w_source = w + ib;}										 // got it from the diagonal block
-	      break; 
-	    case 1: /* Direction -t */
-	      w_dest = wt + ib + nb_blocks * g_N_s;
-	      if( bt == 0 ) {ib = block_index(nblks_t - 1, bx, by, bz) * g_N_s; w_source = wt_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange
-	      else  {ib = block_index(bt - 1, bx, by, bz) * g_N_s;w_source = w + ib;}										 // got it from the diagonal block
-	      break; 
-	    case 2: /* Direction +x */
-	      w_dest = wx + ib;
-	      if( bx == nblks_x - 1 ) {ib = block_index(bt, 0, by, bz) * g_N_s; w_source = wx_buf + ib;}					 // got it from the MPI exchange
-	      else  {ib = block_index(bt, bx + 1, by, bz) * g_N_s; w_source = w + ib;}									     // got it from the diagonal block
-	      break; 
-	    case 3: /* Direction -x */
-	      w_dest = wx + ib + nb_blocks * g_N_s;
-	      if( bx == 0 ) {ib = block_index(bt, nblks_x - 1, by, bz) * g_N_s; w_source = wx_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange
-	      else  {ib = block_index(bt, bx - 1, by, bz) * g_N_s;w_source = w + ib;}									     // got it from the diagonal block
-	      break; 
-	    case 4: /* Direction +y */
-	      w_dest = wy + ib;
-	      if( by == nblks_y - 1 ) {ib = block_index(bt, bx, 0, bz) * g_N_s; w_source = wy_buf + ib;}			         // got it from the MPI exchange
-	      else  {ib = block_index(bt, bx, by + 1, bz) * g_N_s; w_source = w + ib;}									     // got it from the diagonal block
-	      break; 
-	    case 5: /* Direction -y */
-	      w_dest = wy + ib + nb_blocks * g_N_s;
-	      if( by == 0 ) {ib = block_index(bt, bx, nblks_y - 1, bz) * g_N_s; w_source = wy_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange
-	      else  {ib = block_index(bt, bx, by - 1, bz) * g_N_s;w_source = w + ib;}									     // got it from the diagonal block
-	      break; 
-	    case 6: /* Direction +z */
-	      w_dest = wz + ib;
-	      if( bz == nblks_z - 1 ) {ib = block_index(bt, bx, by, 0) * g_N_s; w_source = wz_buf + ib;	}		             // got it from the MPI exchange
-	      else  {ib = block_index(bt, bx, by, bz + 1) * g_N_s; w_source = w + ib;	}						             // got it from the diagonal block
-	      break; 
-	    case 7: /* Direction -z */
-	      w_dest = wz + ib + nb_blocks * g_N_s;
-	      if( bz == 0 ) {ib = block_index(bt, bx, by, nblks_z - 1) * g_N_s; w_source = wz_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange
-	      else  {ib = block_index(bt, bx, by, bz - 1) * g_N_s; w_source = w + ib; }                                      // got it from the diagonal block
-	      break; 
-	      
-	    default: 
-	      w_dest = NULL;
-	      w_source = NULL;
-	    }
-	    memcpy(w_dest, w_source, g_N_s * sizeof(_Complex double));
-	  }
-	}
-      }
-    }
-  }
-  
-  free(w_buf);
-  
-#endif
-  return;
-}
-
-void little_field_gather_eo(int eo, _Complex double * w) {
-#ifdef MPI
-  int err, bt, bx, by, bz, pm, ib,ib2;
-  _Complex double *wt, *wx, *wy, *wz;
-  _Complex double *wt_buf, *wx_buf, *wy_buf, *wz_buf, *w_buf, *w_source, *w_dest;
-
-  w_buf = calloc(8 * nb_blocks * g_N_s, sizeof(_Complex double)); // +-t +-x +-y +-z
-  
-  wt = w + ( 0*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction t starts
-  wx = w + ( 1*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction x starts
-  wy = w + ( 2*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction y starts
-  wz = w + ( 3*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction z starts
-
-  wt_buf = w_buf + ( 0*(2*nb_blocks)) * g_N_s; // Were data in the direction t starts
-  wx_buf = w_buf + ( 1*(2*nb_blocks)) * g_N_s; // Were data in the direction x starts
-  wy_buf = w_buf + ( 2*(2*nb_blocks)) * g_N_s; // Were data in the direction y starts
-  wz_buf = w_buf + ( 3*(2*nb_blocks)) * g_N_s; // Were data in the direction z starts
-
-  /* We first exchange the fields regardless of block considerations                   */
-  /* The data need to be received in an intermediate buffer because of later shuffling */
-
-  /* Send t up */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_UP, g_cart_grid, &lrequests[0]);
-  MPI_Irecv(wt_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_UP, g_cart_grid, &lrequests[1]);
-
-  /* Send t down */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_DN, g_cart_grid, &lrequests[2]);
-  MPI_Irecv(wt_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_DN, g_cart_grid, &lrequests[3]);
-
-  /* Send x up */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_UP, g_cart_grid, &lrequests[4]);
-  MPI_Irecv(wx_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_UP, g_cart_grid, &lrequests[5]);
-
-  /* Send x down */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_DN, g_cart_grid, &lrequests[6]);
-  MPI_Irecv(wx_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_DN, g_cart_grid, &lrequests[7]);
-
-  /* Send y up */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_UP, g_cart_grid, &lrequests[8]);
-  MPI_Irecv(wy_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_UP, g_cart_grid, &lrequests[9]);
-
-  /* Send y down */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_DN, g_cart_grid, &lrequests[10]);
-  MPI_Irecv(wy_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_DN, g_cart_grid, &lrequests[11]);
-
-  /* Send z up */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_UP, g_cart_grid, &lrequests[12]);
-  MPI_Irecv(wz_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_UP, g_cart_grid, &lrequests[13]);
-
-  /* Send z down */
-  MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_DN, g_cart_grid, &lrequests[14]);
-  MPI_Irecv(wz_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_DN, g_cart_grid, &lrequests[15]);
-  
-  err = MPI_Waitall(16, lrequests, lstatus);
-  
-  /* We now correct the field according to block partitionning               */
-  /* We could have avoid the previous corresponding MPI communication        */
-  /* We proceed like this for code simplicity, maybe will be optimized later */
-
-  for(pm = 0; pm < 8; pm++) {
-    ib2=0;
-    for(bt = 0; bt < nblks_t; bt++) {
-      for(bx = 0; bx < nblks_x; bx++) {
-	for(by = 0; by < nblks_y; by++) {
-	  for(bz = 0; bz < nblks_z; bz++) {
-	    if ((bt+bx+by+bz)%2==eo) {
-	      ib2 = index_block_eo[block_index(bt, bx, by, bz)] * g_N_s;
-	      
-	      switch(pm){ 
-	      case 0: /* Direction +t */
-		w_dest = wt + ib2;
-		if( bt == nblks_t - 1 ) {ib = index_block_eo[block_index(0,bx, by,bz)] * g_N_s; w_source = wt_buf + ib;	}		             // got it from the MPI exchange
-		else  {ib = index_block_eo[block_index(bt+1, bx, by, bz)] * g_N_s; w_source = w + ib;	}						             // got it from the diagonal block
-		break; 
-	      case 1: /* Direction -t */
-		w_dest = wt + ib2 + nb_blocks * g_N_s;
-		if( bt == 0) {ib = index_block_eo[block_index(nblks_t-1, bx,by,bz)] * g_N_s; w_source = wt_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange
-		else  {ib = index_block_eo[block_index(bt-1,bx, by, bz)] * g_N_s; w_source = w + ib; }                                      // got it from the diagonal block
-		break; 
-	      case 2: /* Direction +x */
-		w_dest = wx + ib2;
-		if( bx == nblks_x - 1 ) {ib = index_block_eo[block_index(bt, 0, by,bz)] * g_N_s; w_source = wx_buf + ib;	}		             // got it from the MPI exchange
-		else  {ib = index_block_eo[block_index(bt, bx+1, by, bz)] * g_N_s; w_source = w + ib;	}						             // got it from the diagonal block
-		break; 
-	      case 3: /* Direction -x */
-		w_dest = wx + ib2 + nb_blocks * g_N_s;
-		if( bx == 0) {ib = index_block_eo[block_index(bt, nblks_x-1, by,bz)] * g_N_s; w_source = wx_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange
-		else  {ib = index_block_eo[block_index(bt, bx-1, by, bz)] * g_N_s; w_source = w + ib; }                                      // got it from the diagonal block
-		break; 
-	      case 4: /* Direction +y */
-		w_dest = wy + ib2;
-		if( by == nblks_y - 1 ) {ib = index_block_eo[block_index(bt, bx, 0,bz)] * g_N_s; w_source = wy_buf + ib;	}		             // got it from the MPI exchange
-		else  {ib = index_block_eo[block_index(bt, bx, by+1, bz)] * g_N_s; w_source = w + ib;	}						             // got it from the diagonal block
-		break; 
-	      case 5: /* Direction -y */
-		w_dest = wy + ib2 + nb_blocks * g_N_s;
-		if( by == 0) {ib = index_block_eo[block_index(bt, bx, nblks_y-1, bz)] * g_N_s; w_source = wy_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange
-		else  {ib = index_block_eo[block_index(bt, bx, by-1, bz)] * g_N_s; w_source = w + ib; }                                      // got it from the diagonal block
-		break; 
-	      case 6: /* Direction +z */
-		w_dest = wz + ib2;
-		if( bz == nblks_z - 1 ) {ib = index_block_eo[block_index(bt, bx, by, 0)] * g_N_s; w_source = wz_buf + ib;	}		             // got it from the MPI exchange
-		else  {ib = index_block_eo[block_index(bt, bx, by, bz + 1)] * g_N_s; w_source = w + ib;	}						             // got it from the diagonal block
-		break; 
-	      case 7: /* Direction -z */
-		w_dest = wz + ib2 + nb_blocks * g_N_s;
-		if( bz == 0) {ib = index_block_eo[block_index(bt, bx, by, nblks_z - 1)] * g_N_s; w_source = wz_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange
-		else  {ib = index_block_eo[block_index(bt, bx, by, bz - 1)] * g_N_s; w_source = w + ib; }                                      // got it from the diagonal block
-		break; 
-	      default:
-		w_dest = NULL;
-		w_source = NULL;
-	      }
-	      memcpy(w_dest, w_source, g_N_s * sizeof(_Complex double));
-	    }
-	  }
-	}
-      }
-    }
-  }
-  free(w_buf);
+#define _PSWITCH(s) s
+#define _PTSWITCH(s) s
+#if (defined NOF77UNDERSCORE || defined NOF77_)
+#define _MV(x) zgemv
+#else
+#define _MV(x) zgemv_
 #endif
-  return;
-}
+#define _C_TYPE _Complex double
 
+#include"little_D_body.c"
 
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
+#undef _MV
 
-void little_D(_Complex double * v, _Complex double *w) {
-  int i, j, sq = g_N_s*g_N_s;
-  CONE = 1.0;
-  CMONE = -1.0;
-  CZERO = 0.0;
-
-  if(dfl_subspace_updated) {
-    compute_little_D();
-    dfl_subspace_updated = 0;
-  }
-  
-#ifdef MPI
-  /*init_little_field_exchange(w);*/
-  little_field_gather(w);
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
+#if (defined NOF77UNDERSCORE || defined NOF77_)
+#define _MV(x) cgemv
+#else
+#define _MV(x) cgemv_
 #endif
-  
-  /* all the mpilocal stuff first */
-  for(i = 0; i < nb_blocks; i++) {
-    /* diagonal term */
-    _FT(zgemv)("N", &g_N_s, &g_N_s, &CONE, block_list[i].little_dirac_operator,
-               &g_N_s, w + i * g_N_s, &ONE, &CZERO, v + i * g_N_s, &ONE, 1);
-    
-    /* offdiagonal terms */
-    for(j = 1; j < 9; j++) {
-      _FT(zgemv)("N", &g_N_s, &g_N_s, &CONE, block_list[i].little_dirac_operator + j * sq,
-		 &g_N_s, w + (nb_blocks * j + i) * g_N_s, &ONE, &CONE, v + i * g_N_s, &ONE, 1);
-    }
-  }
-  return;
-}
+#define _C_TYPE _Complex float
 
+#include"little_D_body.c"
 
-void little_D_sym(_Complex double * v, _Complex double *w) {
-  
-  _Complex double* tmpc1, * tmpc2, * tmpc3;
-  tmpc1 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double));
-  tmpc2 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double));
-  tmpc3 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double));
-  
-  if(dfl_subspace_updated) {
-    compute_little_D();
-    dfl_subspace_updated = 0;
-  }
-  
-  little_D_hop(0,tmpc1, w);
-  little_D_ee_inv(tmpc2,tmpc1);
-  little_D_hop(1,tmpc3, tmpc2);
-  little_Dhat_lhs(v, w,tmpc3);
-  
-  free(tmpc1);
-  free(tmpc2);
-  free(tmpc3);
-  return;
-}
-
-
-void little_D_ee_inv(_Complex double * v, _Complex double *w) {
-  int i;
-  CONE = 1.0;
-  CMONE = -1.0;
-  CZERO = 0.0;
-  
-  for(i = 0; i < nb_blocks/2; i++) {
-    _FT(zgemv)("N", &g_N_s, &g_N_s, &CONE, block_list[i].little_dirac_operator_eo,
-               &g_N_s, w + i * g_N_s, &ONE, &CZERO, v + i * g_N_s, &ONE, 1);
-  }
-  return;
-}
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
+#undef _MV
 
-
-void little_D_hop(int eo,_Complex double * v, _Complex double *w) {
-  int i, j, i_eo,sq = g_N_s*g_N_s;
-  CONE = 1.0;
-  CMONE = -1.0;
-  CZERO = 0.0;
-
-  i_eo=(eo+1)%2;
-  
-#ifdef MPI
-  /*init_little_field_exchange(w);*/
-  little_field_gather_eo(eo,w+i_eo*nb_blocks*g_N_s/2);
+#ifdef TM_USE_MPI
+// in xchange/little_field_gather.c
+extern MPI_Request lrequests[16];
+extern MPI_Status lstatus[16];
+extern int waitcount;
 #endif
-  
-  for(i = 0; i < nb_blocks/2; i++) {
-    for(j = 1; j < 9; j++) {
-      _FT(zgemv)("N", &g_N_s, &g_N_s, &CONE, block_list[eo*(nb_blocks/2)+i].little_dirac_operator_eo + j * sq,
-		 &g_N_s, w + (nb_blocks * j + (nb_blocks/2)*i_eo+i) * g_N_s, &ONE, &CONE, v + (eo*nb_blocks/2+i) * g_N_s, &ONE, 1);
-    } 
-  }
-  return;
-}
-
-void little_Dhat_lhs(_Complex double * v, _Complex double *w, _Complex double *u) {
-  int i,j;
-  CONE = 1.0;
-  CMONE = -1.0;
-  CZERO = 0.0;
-
-
-  for(i = nb_blocks/2; i < nb_blocks; i++) {
-    _FT(zgemv)("N", &g_N_s, &g_N_s, &CONE, block_list[i].little_dirac_operator_eo,
-               &g_N_s, w + i * g_N_s, &ONE, &CZERO, v + i * g_N_s, &ONE, 1);
-  }
-  
-  for (i=nb_blocks/2; i < nb_blocks; i++) {
-    for (j=0;j<g_N_s;j++) {
-      *(v+ i * g_N_s+ j) = *(v+ i * g_N_s+ j) - *(u+ i * g_N_s+ j);
-    }
-  }
-  return;
-}
-
-
-
-void little_Dhat_rhs(int eo, _Complex double * v, double r, _Complex double *w) {
-  int i, j;
-  
-  for(i = 0; i < nb_blocks/2; i++) {
-    for (j=0;j<g_N_s;j++) {
-      *(v+eo*nb_blocks*g_N_s/2+i*g_N_s+j) = *(w+eo*nb_blocks*g_N_s/2+i*g_N_s+j) + r * *(v+eo*nb_blocks*g_N_s/2+i*g_N_s+j);
-    }
-  }
-  return;
-}
 
 
 void init_little_field_exchange(_Complex double * w) {
-#ifdef MPI
+#ifdef TM_USE_MPI
   int i = 0;
 #  if (defined PARALLELT || defined PARALLELX)
   int no_dirs = 2;
@@ -819,7 +338,7 @@ void init_little_field_exchange(_Complex double * w) {
 }
 
 void wait_little_field_exchange(const int mu) {
-#ifdef MPI
+#ifdef TM_USE_MPI
   int err;
   err = MPI_Waitall(2, &lrequests[2*mu], &lstatus[2*mu]);
   waitcount -= 2;
diff --git a/little_D.h b/little_D.h
index b389f91f3..95903c75a 100644
--- a/little_D.h
+++ b/little_D.h
@@ -25,11 +25,21 @@
 
 extern int dfl_subspace_updated;
 void little_D(_Complex double * v, _Complex double *w);
+void little_Q_pm(_Complex double * v, _Complex double *w);
 void little_D_sym(_Complex double * v, _Complex double *w);
 void little_D_ee_inv(_Complex double * v, _Complex double *w);
 void little_D_hop(int eo,_Complex double * v, _Complex double *w);
 void little_Dhat_lhs(_Complex double * v, _Complex double *w, _Complex double *u);
 void little_Dhat_rhs(int eo, _Complex double * v, double r, _Complex double *w);
+/* Single-precision variants of the routines declared above: same argument lists with _Complex float vectors. */
+void little_D_32(_Complex float * v, _Complex float *w);
+void little_Q_pm_32(_Complex float * v, _Complex float *w);
+void little_D_sym_32(_Complex float * v, _Complex float *w);
+void little_D_ee_inv_32(_Complex float * v, _Complex float *w);
+void little_D_hop_32(int eo,_Complex float * v, _Complex float *w);
+void little_Dhat_lhs_32(_Complex float * v, _Complex float *w, _Complex float *u);
+void little_Dhat_rhs_32(int eo, _Complex float * v, double r, _Complex float *w);
+
 void unit_little_D(_Complex double *v, _Complex double *w);
 void invert_little_D_spinor(spinor *r, spinor *s);
 void invert_little_D_eo_spinor(spinor *r, spinor *s);
diff --git a/little_D_body.c b/little_D_body.c
new file mode 100644
index 000000000..164614b17
--- /dev/null
+++ b/little_D_body.c
@@ -0,0 +1,149 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2008 Albert Deuzeman, Siebren Reker, Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+/* For every block i, adds the products of its 9 stored g_N_s x g_N_s matrices with segments of w onto v_i. */
+void _PSWITCH(little_D)(_C_TYPE * v, _C_TYPE *w) {
+  int sq = g_N_s*g_N_s; /* element count of one matrix, used as the stride between a block's matrices */
+  _PSWITCH(CONE) = 1.0;
+  _PSWITCH(CMONE) = -1.0;
+  _PSWITCH(CZERO) = 0.0;
+  /* recompute the little operator first when the deflation subspace is flagged as updated */
+  if(dfl_subspace_updated) {
+    compute_little_D(0);
+    dfl_subspace_updated = 0;
+  }
+  /* NOTE(review): presumably fills the j > 0 segments of w with neighbour data -- confirm in little_field_gather */
+  _PSWITCH(little_field_gather)(w);
+  /* alpha and beta are both CONE: each call adds matrix*vector to v_i, so v is never cleared here */
+#ifdef TM_USE_OMP
+#pragma omp parallel for
+#endif
+  for(int i = 0; i < nb_blocks; i++) {
+    for(int j = 0; j < 9; j++) {
+      _MV(zgemv)("N", &g_N_s, &g_N_s, &_PSWITCH(CONE), _PSWITCH(block_list[i].little_dirac_operator) + j * sq,
+                 &g_N_s, w + (nb_blocks * j + i) * g_N_s, &ONE, &_PSWITCH(CONE), v + i * g_N_s, &ONE, 1);
+    }
+  }
+  return;
+}
+
+/* Applies little_D twice (w -> zeroed scratch -> v), then calls lassign_add_mul(v, w, g_mu^2 + g_mu2^2, n). */
+void _PSWITCH(little_Q_pm)(_C_TYPE * v, _C_TYPE *w) {
+  _C_TYPE * const scratch = (_C_TYPE*) aligned_malloc_zero(nb_blocks * 9 * g_N_s * sizeof(_C_TYPE));
+  const double saved_mu = g_mu;
+  if(dfl_subspace_updated) {
+    g_mu = 0.; /* the little operator is rebuilt with g_mu set to zero, then g_mu is restored */
+    compute_little_D(1);
+    g_mu = saved_mu;
+    dfl_subspace_updated = 0;
+  }
+  _PSWITCH(little_D)(scratch, w);
+  _PSWITCH(little_D)(v, scratch);
+  aligned_free(scratch);
+  _PSWITCH(lassign_add_mul)(v, w, g_mu*g_mu + g_mu2*g_mu2, nb_blocks*g_N_s);
+}
+
+/* Chains little_D_hop(0), little_D_ee_inv and little_D_hop(1) through work vectors, then little_Dhat_lhs. */
+void _PSWITCH(little_D_sym)(_C_TYPE * v, _C_TYPE *w) {
+  /* a single zeroed allocation, cut into three consecutive vectors of seg elements each */
+  const int seg = nb_blocks * 9 * g_N_s;
+  _C_TYPE * const buf_a = (_C_TYPE*) aligned_malloc_zero(3*nb_blocks * 9 * g_N_s * sizeof(_C_TYPE));
+  _C_TYPE * const buf_b = buf_a + seg;
+  _C_TYPE * const buf_c = buf_a + 2*seg;
+
+  if(dfl_subspace_updated) {
+    compute_little_D(0);
+    dfl_subspace_updated = 0;
+  }
+
+  _PSWITCH(little_D_hop)(0, buf_a, w);
+  _PSWITCH(little_D_ee_inv)(buf_b, buf_a);
+  _PSWITCH(little_D_hop)(1, buf_c, buf_b);
+  _PSWITCH(little_Dhat_lhs)(v, w, buf_c);
+
+  /* buf_a is the base of the allocation, so this releases all three vectors */
+  aligned_free(buf_a);
+}
+
+/* For the first nb_blocks/2 blocks: v_i = little_dirac_operator_eo(block i) * w_i. */
+void _PSWITCH(little_D_ee_inv)(_C_TYPE * v, _C_TYPE *w) {
+  int i;
+  _PSWITCH(CONE) = 1.0;
+  _PSWITCH(CMONE) = -1.0;
+  _PSWITCH(CZERO) = 0.0;
+  /* beta = CZERO, so v_i is overwritten; the remaining blocks of v are not touched */
+  for(i = 0; i < nb_blocks/2; i++) {
+    _MV(zgemv)("N", &g_N_s, &g_N_s, &_PSWITCH(CONE), _PSWITCH(block_list[i].little_dirac_operator_eo),
+               &g_N_s, w + i * g_N_s, &ONE, &_PSWITCH(CZERO), v + i * g_N_s, &ONE, 1);
+  }
+  return;
+}
+
+/* Adds, for each block of half eo, the products of its matrices j = 1..8 with segments of w onto v. */
+void _PSWITCH(little_D_hop)(int eo, _C_TYPE * v, _C_TYPE *w) {
+  int i, j, i_eo,sq = g_N_s*g_N_s;
+  _PSWITCH(CONE) = 1.0;
+  _PSWITCH(CMONE) = -1.0;
+  _PSWITCH(CZERO) = 0.0;
+  /* i_eo is the other half: 1 for eo = 0 and 0 for eo = 1 */
+  i_eo = (eo+1) % 2;
+  /* NOTE(review): presumably makes the i_eo half of w available in the j > 0 segments -- confirm in the gather */
+  _PSWITCH(little_field_gather_eo)(eo, w + i_eo*nb_blocks*g_N_s/2);
+  /* beta = CONE: results are accumulated onto v, which is therefore expected to be initialised by the caller */
+  for(j = 1; j < 9; j++) {
+    for(i = 0; i < nb_blocks/2; i++) {
+      _MV(zgemv)("N", &g_N_s, &g_N_s, &_PSWITCH(CONE), _PSWITCH(block_list[eo*(nb_blocks/2)+i].little_dirac_operator_eo) + j * sq,
+                 &g_N_s, w + (nb_blocks * j + (nb_blocks/2)*i_eo+i) * g_N_s, &ONE, &_PSWITCH(CONE), v + (eo*nb_blocks/2+i) * g_N_s, &ONE, 1);
+    } 
+  }
+  return;
+}
+
+/* On the second half of the blocks (nb_blocks/2 .. nb_blocks-1):
+ *   v_i = little_dirac_operator_eo(block i) * w_i - u_i.  The first half of v is not touched. */
+void _PSWITCH(little_Dhat_lhs)(_C_TYPE * v, _C_TYPE *w, _C_TYPE *u) {
+  const int half = nb_blocks/2;
+  _PSWITCH(CONE) = 1.0;
+  _PSWITCH(CMONE) = -1.0;
+  _PSWITCH(CZERO) = 0.0;
+
+  /* per-block matrix-vector product; beta = CZERO, so v_i is overwritten */
+  for(int blk = half; blk < nb_blocks; blk++) {
+    _MV(zgemv)("N", &g_N_s, &g_N_s, &_PSWITCH(CONE), _PSWITCH(block_list[blk].little_dirac_operator_eo),
+               &g_N_s, w + blk * g_N_s, &ONE, &_PSWITCH(CZERO), v + blk * g_N_s, &ONE, 1);
+  }
+
+  /* the blocks of the second half are contiguous, so subtracting u is one flat sweep */
+  for(int k = half * g_N_s; k < nb_blocks * g_N_s; k++) {
+    v[k] = v[k] - u[k];
+  }
+}
+
+
+/* On the half of the vectors selected by eo: v = w + r * v, element by element. */
+void _PSWITCH(little_Dhat_rhs)(int eo, _C_TYPE * v, double r, _C_TYPE *w) {
+  const int offset = eo*nb_blocks*g_N_s/2;
+  const int len = (nb_blocks/2) * g_N_s;
+  _C_TYPE * const out = v + offset;
+  _C_TYPE * const in = w + offset;
+  /* nb_blocks/2 blocks of g_N_s entries each, visited in storage order */
+  for(int k = 0; k < len; k++) {
+    out[k] = in[k] + r * out[k];
+  }
+}
diff --git a/matrix_utils.c b/matrix_utils.c
new file mode 100644
index 000000000..11594cf83
--- /dev/null
+++ b/matrix_utils.c
@@ -0,0 +1,138 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2013 Albert Deuzeman 
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ************************************************************************/
+
+#if HAVE_CONFIG_H
+#include <tmlqcd_config.h>
+#endif
+#include <math.h>
+#include <complex.h> 
+
+#if (defined SSE || defined SSE2 || defined SSE3)
+# include "sse.h"
+#endif
+#include "su3.h"
+/* Going by the macro names: *out = f0*1 + f1*(*in) + f2*(*in)^2, built up as (f2*in + f1*1)*in + f0*1. */
+#ifndef TM_USE_OMP
+static
+#endif
+void exponent_from_coefficients(su3 *out, _Complex double f0, _Complex double f1, _Complex double f2, su3 const *in)                                  
+{
+  su3 ALIGN tmp;
+  _complex_times_su3(tmp, f2, *in);           /* presumably tmp = f2 * in */
+  _su3_add_equals_complex_identity(tmp, f1);  /* presumably tmp += f1 * identity */
+  _su3_times_su3(*out, tmp, *in);             /* presumably out = tmp * in */
+  _su3_add_equals_complex_identity(*out, f0); /* presumably out += f0 * identity */
+}
+
+/* Closed-form matrix exponential via Cayley-Hamilton, after Peardon & Morningstar (P&M below):
+ * writes f0 + f1*A + f2*A^2 into *expA; *A is only read. The comments below speak of exp(iQ), i.e. A = iQ
+ * is assumed -- the gradient flow passes a traceless anti-Hermitian matrix, which has that form. */
+void cayley_hamilton_exponent(su3* expA, su3 const *A)
+{
+  static double const fac_1_3 = 1 / 3.0;
+
+  _Complex double f0,f1,f2;
+
+  /* c0 = det[Q] = i * det[A] for A = iQ; only the real part survives the conversion to double */
+  double c0 = I * (A->c00 * (A->c11 * A->c22 - A->c12 * A->c21) + 
+                   A->c01 * (A->c12 * A->c20 - A->c10 * A->c22) +
+                   A->c02 * (A->c10 * A->c21 - A->c11 * A->c20)  );
+
+  /* c1 = 0.5 * Tr[QQ] = -0.5 * Tr[AA] */
+  double c1 = -0.5 * (A->c00 * A->c00 + A->c01 * A->c10 + A->c02 * A->c20 +
+                      A->c10 * A->c01 + A->c11 * A->c11 + A->c12 * A->c21 +
+                      A->c20 * A->c02 + A->c21 * A->c12 + A->c22 * A->c22  );
+
+  /* There is a special, but common (cold start) case where the given matrix is actually 0!
+   * We need to account for it: the result is the identity and no coefficients are needed. */
+  if (c0 == 0 && c1 == 0) 
+  {
+    _su3_one(*expA);
+    return;
+  }
+
+  /* P&M give symmetry relations that can be used when c0 < 0, to avoid the numerically problematic c0 -> -c0_max limit.
+     We note the sign here for future reference, then continue with c0 as if it were positive. */
+  int c0_negative = (c0 < 0);
+  c0 = fabs(c0);
+
+  /* The call to fmin below is needed, because for small deviations alpha from zero -- O(10e-12) -- rounding errors can cause c0 > c0max by epsilon.
+     In that case, acos(c0/c0max) will produce NaNs, whereas the mathematically correct conclusion would be that theta is zero to machine precision! 
+     Note that this approach will *not* produce identity and zero for all output, but rather the correct answer of order (I + alpha) for exp(iQ). */
+
+  double c0max = 2.0 * pow(fac_1_3 * c1, 1.5);
+  double theta_3 = fac_1_3 * acos(fmin(c0 / c0max, 1.0));
+
+  double u = sqrt(fac_1_3 * c1) * cos(theta_3);
+  double w = sqrt(c1) * sin(theta_3);
+
+  /* Calculate and cache some repeating factors. *
+   * We can fold in the sign immediately -- c.f. f_j(-c0, c1) = -1^j * conj(f_j(c0, c1)) 
+   * This should just amount to potentially adding a minus to all imaginary components and an overall phase for f1. */
+  _Complex double ma = cexp(2 * I * u);
+  _Complex double mb = cexp(-I * u);
+  double cw = cos(w);
+  double u2 = u * u;
+  double w2 = w * w;
+
+  /* Modification w.r.t. Peardon & Morningstar:  w is always positive, so |w| =  w */
+  double xi0 = (w > 0.05) ? (sin(w) / w) 
+                          : 1 - 0.16666666666666667 *  w2 * (1 - 0.05 *  w2 * (1 - 0.023809523809523808 *  w2));
+  double divisor = 1.0 / (9.0 * u2 -  w2);
+
+  /* Relative to P&M, f1 carries an extra factor -i and f2 an extra factor -1, because they multiply A = iQ here. */
+  f0 = divisor * (ma * (u * u -  w * w) + mb * (8 * u * u * cw + 2 * I * u * (3 * u * u +  w * w) * xi0));
+  f1 = divisor * (-2 * I * u * ma + mb * (2 * I * u * cw + (3 * u * u -  w * w) * xi0));
+  f2 = divisor * (mb * (cw + 3 * I * u * xi0) - ma);
+
+  /* The first point where we use the symmetry relations to calculate the negative c0 possibility */
+  if (c0_negative)
+  {
+    f0 = conj(f0);
+    f1 = conj(f1);
+    f2 = conj(f2);
+  }
+
+  exponent_from_coefficients(expA, f0, f1, f2, A);
+}
+
+/* In place: replaces *in by (in - in^dagger)/2 with its trace removed. The diagonal becomes purely
+ * imaginary with zero sum, and each lower off-diagonal entry is set to -conj of its mirror entry. */
+void project_traceless_antiherm(su3 *in)
+{
+  static const double third = 1.00 / 3.00;
+  const double mean_im = third * (cimag(in->c00) + cimag(in->c11) + cimag(in->c22));
+
+  /* diagonal: keep only the imaginary parts, shifted so that they add up to zero */
+  in->c00 = (cimag(in->c00) - mean_im) * I;
+  in->c11 = (cimag(in->c11) - mean_im) * I;
+  in->c22 = (cimag(in->c22) - mean_im) * I;
+
+  /* off-diagonal: the upper entry is anti-Hermitised first, the lower one is then derived from it */
+  in->c01 = (in->c01 - conj(in->c10)) * 0.50;
+  in->c10 = -conj(in->c01);
+
+  in->c02 = (in->c02 - conj(in->c20)) * 0.50;
+  in->c20 = -conj(in->c02);
+  in->c12 = (in->c12 - conj(in->c21)) * 0.50;
+  in->c21 = -conj(in->c12);
+}
+
diff --git a/matrix_utils.h b/matrix_utils.h
new file mode 100644
index 000000000..c4dec2f8c
--- /dev/null
+++ b/matrix_utils.h
@@ -0,0 +1,28 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2013 Albert Deuzeman 
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ************************************************************************/
+
+#ifndef _MATRIX_UTILS_H
+#define _MATRIX_UTILS_H
+#include "su3.h" /* su3 appears in both prototypes; included here so the header does not rely on include order */
+void cayley_hamilton_exponent(su3* expA, su3 const *A); /* writes to *expA, only reads *A */
+void project_traceless_antiherm(su3* M);                /* modifies *M in place */
+
+#endif
diff --git a/meas/Makefile.in b/meas/Makefile.in
new file mode 100644
index 000000000..2eb71df27
--- /dev/null
+++ b/meas/Makefile.in
@@ -0,0 +1,95 @@
+
+srcdir = @srcdir@
+top_builddir =  @top_builddir@
+abs_top_builddir = @abs_top_builddir@
+top_srcdir = @top_srcdir@
+abs_top_srcdir = @abs_top_srcdir@
+subdir = meas
+builddir = @builddir@
+
+# build settings substituted by configure (DEFS used to be assigned twice with the same value)
+CFLAGS = @CFLAGS@ @MEASDIR@
+DEPFLAGS = @DEPFLAGS@
+LDFLAGS = @LDFLAGS@
+DEFS = @DEFS@
+OPTARGS = @OPTARGS@
+
+AR = @AR@
+RANLIB = @RANLIB@
+CC = @CC@
+CCDEP = @CCDEP@
+CCLD = $(CC)
+LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@
+LEX = @LEX@
+AUTOCONF = @AUTOCONF@
+
+LEMON_AVAILABLE = @LEMON_AVAILABLE@
+
+INCLUDES = @INCLUDES@
+LDADD =
+COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS}
+
+LIBRARIES = libmeas
+
+libmeas_TARGETS = measurements \
+	oriented_plaquettes \
+	correlators \
+	pion_norm \
+	polyakov_loop \
+	measure_clover_field_strength_observables \
+	gradient_flow
+
+libmeas_OBJECTS = $(addsuffix .o, ${libmeas_TARGETS})
+
+# default rule
+
+all: Makefile dep libmeas.a
+
+# rules for debugging
+debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@
+debug all-debug: all
+
+# rules for profiling information
+profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@
+profile all-profile: all
+
+
+#include dep rules
+-include $(addsuffix .d,${libmeas_TARGETS})
+
+include ${top_srcdir}/Makefile.global
+
+# rule to compile objects
+
+%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
+	$(COMPILE) -c $<
+
+libmeas.a: ${libmeas_OBJECTS} Makefile
+	@rm -f libmeas.a
+	@${AR} cru libmeas.a $(libmeas_OBJECTS)
+	@$(RANLIB) libmeas.a
+	@cp libmeas.a ${top_builddir}/lib/libmeas.a
+
+# rule to generate .d files
+
+$(addsuffix .d,$(libmeas_TARGETS)): %.d: ${srcdir}/%.c Makefile
+	@$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@
+
+# rule to make dependencies
+
+dep: ${addsuffix .d, ${libmeas_TARGETS}}
+
+# rules to clean, each one building on the previous:
+#   compile-clean: objects and *.d files; clean: also both copies of libmeas.a; distclean: also the Makefile
+compile-clean: Makefile
+	rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d
+
+clean: compile-clean
+	rm -f $(addsuffix .a, ${LIBRARIES})
+	rm -f ../lib/libmeas.a
+
+distclean: clean
+	rm -f Makefile
+
+
+.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile
diff --git a/meas/correlators.c b/meas/correlators.c
new file mode 100644
index 000000000..1d746eba1
--- /dev/null
+++ b/meas/correlators.c
@@ -0,0 +1,250 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include "global.h"
+#include "start.h"
+#include "ranlxs.h"
+#include "su3spinor.h"
+#include "source_generation.h"
+#include "operator.h"
+#include "invert_eo.h"
+#include "solver/solver.h"
+#include "geometry_eo.h"
+#include "linalg/convert_eo_to_lexic.h"
+#include "measurements.h"
+#include "correlators.h"
+#include "gettime.h"
+
+
+/******************************************************
+ *
+ * This routine computes the correlators
+ * <PP>, <PA> and <PV> (<source sink>)
+ * using a stochastic time slice source
+ * and only one inversion (actually A_0)
+ * 
+ * for <AP> we would need another inversion
+ *
+ *
+ *
+ ******************************************************/
+
+#define TM_OMEAS_FILENAME_LENGTH 100
+/* traj labels the output file; id indexes measurement_list; ieo is not used in this routine. */
+void correlators_measurement(const int traj, const int id, const int ieo) {
+  int i, j, t, tt, t0;
+  double *Cpp = NULL, *Cpa = NULL, *Cp4 = NULL;
+  double res = 0., respa = 0., resp4 = 0.;
+  double atime, etime;
+  float tmp;
+  operator * optr;
+#ifdef TM_USE_MPI
+  double mpi_res = 0., mpi_respa = 0., mpi_resp4 = 0.;
+  // send buffer for MPI_Gather
+  double *sCpp = NULL, *sCpa = NULL, *sCp4 = NULL;
+#endif
+  FILE *ofs;
+  char filename[TM_OMEAS_FILENAME_LENGTH];
+  spinor phi;
+
+  init_operators();
+  if(no_operators < 1) {
+    if(g_proc_id == 0) {
+      fprintf(stderr, "Warning! no operators defined in input file, cannot perform online correlator mesurements!\n");
+    }
+    return;
+  }
+  if(no_operators > 1 && g_proc_id == 0) {
+    fprintf(stderr, "Warning! number of operators defined larger than 1, using only the first!\n");
+  }
+  optr = &operator_list[0];
+  // we don't want to do inversion twice for this purpose here
+  optr->DownProp = 0;
+  if(optr->type != TMWILSON && optr->type != WILSON && optr->type != CLOVER) {
+    if(g_proc_id == 0) {
+      fprintf(stderr, "Warning! correlator online measurement currently only implemented for TMWILSON, WILSON and CLOVER\n");
+      fprintf(stderr, "Cannot perform correlator online measurement!\n");
+    }
+    return;
+  }
+
+  if(ranlxs_init == 0) {
+    rlxs_init(1, 123456);
+  }
+
+  // there are three modes of operation
+  // 1) one single time-slice source (default)
+  // 2) no_samples time-slice sources on random time-slices
+  // 3) one sample on all time-slices
+  int max_samples = measurement_list[id].all_time_slices ? 1 : measurement_list[id].no_samples;
+  int max_time_slices = measurement_list[id].all_time_slices ? measurement_list[id].max_source_slice : 1;
+  atime = gettime(); // defined even if the loops below run zero times; restarted for every source
+  for(int sample = 0; sample < max_samples; sample++ ){
+    for(int ts = 0; ts < max_time_slices; ts++){
+      if( max_samples == 1 && max_time_slices == 1 ){
+        snprintf(filename, TM_OMEAS_FILENAME_LENGTH, 
+                 "%s%06d", "onlinemeas." ,traj);
+      } else if ( max_samples == 1 && max_time_slices > 1){
+        snprintf(filename, TM_OMEAS_FILENAME_LENGTH, 
+                 "%s.t%03d.%06d", "onlinemeas", ts, traj );
+      } else {
+        snprintf(filename, TM_OMEAS_FILENAME_LENGTH,
+                 "%s.s%03d.%06d", "onlinemeas", sample, traj);
+      }
+      /* generate random timeslice */
+      t0 = ts;
+      if( !measurement_list[id].all_time_slices ){
+        ranlxs(&tmp, 1);
+        t0 = (int)(measurement_list[id].max_source_slice*tmp);
+      }
+#ifdef TM_USE_MPI
+      MPI_Bcast(&t0, 1, MPI_INT, 0, MPI_COMM_WORLD);
+#endif
+      if(g_debug_level > 1 && g_proc_id == 0) {
+        printf("# timeslice set to %d (T=%d) for online measurement\n", t0, g_nproc_t*T);
+        printf("# online measurements parameters: kappa = %.12f, mu = %.12f\n", optr->kappa, optr->mu/2./optr->kappa);
+      }
+      atime = gettime();
+#ifdef TM_USE_MPI
+      sCpp = (double*) calloc(T, sizeof(double));
+      sCpa = (double*) calloc(T, sizeof(double));
+      sCp4 = (double*) calloc(T, sizeof(double));
+      if(g_mpi_time_rank == 0) {
+        Cpp = (double*) calloc(g_nproc_t*T, sizeof(double));
+        Cpa = (double*) calloc(g_nproc_t*T, sizeof(double));
+        Cp4 = (double*) calloc(g_nproc_t*T, sizeof(double));
+      }
+#else
+      Cpp = (double*) calloc(T, sizeof(double));
+      Cpa = (double*) calloc(T, sizeof(double));
+      Cp4 = (double*) calloc(T, sizeof(double));
+#endif
+      source_generation_pion_only(g_spinor_field[0], g_spinor_field[1], 
+                                  t0, sample, traj, measurement_list[id].seed);
+      optr->sr0 = g_spinor_field[0];
+      optr->sr1 = g_spinor_field[1];
+      optr->prop0 = g_spinor_field[2];
+      optr->prop1 = g_spinor_field[3];
+
+      // op_id = 0, index_start = 0, write_prop = 0
+      optr->inverter(0, 0, 0);
+
+      /* now we bring it to normal format */
+      /* here we use implicitly DUM_MATRIX and DUM_MATRIX+1 */
+      convert_eo_to_lexic(g_spinor_field[DUM_MATRIX], g_spinor_field[2], g_spinor_field[3]);
+      /* now we sum only over local space for every t */
+      for(t = 0; t < T; t++) {
+        j = g_ipt[t][0][0][0];
+        res = 0.;
+        respa = 0.;
+        resp4 = 0.;
+        for(i = j; i < j+LX*LY*LZ; i++) {
+          res += _spinor_prod_re(g_spinor_field[DUM_MATRIX][i], g_spinor_field[DUM_MATRIX][i]);
+          _gamma0(phi, g_spinor_field[DUM_MATRIX][i]);
+          respa += _spinor_prod_re(g_spinor_field[DUM_MATRIX][i], phi);
+          _gamma5(phi, phi);
+          resp4 += _spinor_prod_im(g_spinor_field[DUM_MATRIX][i], phi);
+        }
+#if defined TM_USE_MPI
+        MPI_Reduce(&res, &mpi_res, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices);
+        res = mpi_res;
+        MPI_Reduce(&respa, &mpi_respa, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices);
+        respa = mpi_respa;
+        MPI_Reduce(&resp4, &mpi_resp4, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices);
+        resp4 = mpi_resp4;
+        sCpp[t] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
+        sCpa[t] = -respa/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
+        sCp4[t] = +resp4/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
+#else
+        Cpp[t] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
+        Cpa[t] = -respa/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
+        Cp4[t] = +resp4/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
+#endif
+      }
+
+#ifdef TM_USE_MPI
+      /* some gymnastics needed in case of parallelisation */
+      if(g_mpi_time_rank == 0) {
+        MPI_Gather(sCpp, T, MPI_DOUBLE, Cpp, T, MPI_DOUBLE, 0, g_mpi_SV_slices);
+        MPI_Gather(sCpa, T, MPI_DOUBLE, Cpa, T, MPI_DOUBLE, 0, g_mpi_SV_slices);
+        MPI_Gather(sCp4, T, MPI_DOUBLE, Cp4, T, MPI_DOUBLE, 0, g_mpi_SV_slices);
+      }
+#endif
+
+      /* and write everything into a file; a failed fopen skips the output but still reaches the frees below */
+      if(g_mpi_time_rank == 0 && g_proc_coords[0] == 0) {
+        ofs = fopen(filename, "w");
+        if(ofs == NULL) {
+          fprintf(stderr, "Error! could not open %s for writing, skipping output of online correlators\n", filename);
+        } else {
+          fprintf( ofs, "1  1  0  %e  %e\n", Cpp[t0], 0.);
+          for(t = 1; t < g_nproc_t*T/2; t++) {
+            tt = (t0+t)%(g_nproc_t*T);
+            fprintf( ofs, "1  1  %d  %e  ", t, Cpp[tt]);
+            tt = (t0+g_nproc_t*T-t)%(g_nproc_t*T);
+            fprintf( ofs, "%e\n", Cpp[tt]);
+          }
+          tt = (t0+g_nproc_t*T/2)%(g_nproc_t*T);
+          fprintf( ofs, "1  1  %d  %e  %e\n", t, Cpp[tt], 0.);
+
+          fprintf( ofs, "2  1  0  %e  %e\n", Cpa[t0], 0.);
+          for(t = 1; t < g_nproc_t*T/2; t++) {
+            tt = (t0+t)%(g_nproc_t*T);
+            fprintf( ofs, "2  1  %d  %e  ", t, Cpa[tt]);
+            tt = (t0+g_nproc_t*T-t)%(g_nproc_t*T);
+            fprintf( ofs, "%e\n", Cpa[tt]);
+          }
+          tt = (t0+g_nproc_t*T/2)%(g_nproc_t*T);
+          fprintf( ofs, "2  1  %d  %e  %e\n", t, Cpa[tt], 0.);
+
+          fprintf( ofs, "6  1  0  %e  %e\n", Cp4[t0], 0.);
+          for(t = 1; t < g_nproc_t*T/2; t++) {
+            tt = (t0+t)%(g_nproc_t*T);
+            fprintf( ofs, "6  1  %d  %e  ", t, Cp4[tt]);
+            tt = (t0+g_nproc_t*T-t)%(g_nproc_t*T);
+            fprintf( ofs, "%e\n", Cp4[tt]);
+          }
+          tt = (t0+g_nproc_t*T/2)%(g_nproc_t*T);
+          fprintf( ofs, "6  1  %d  %e  %e\n", t, Cp4[tt], 0.);
+          fclose(ofs);
+        }
+      }
+#ifdef TM_USE_MPI
+      if(g_mpi_time_rank == 0) {
+        free(Cpp); free(Cpa); free(Cp4);
+      }
+      free(sCpp); free(sCpa); free(sCp4);
+#else
+      free(Cpp); free(Cpa); free(Cp4);
+#endif
+    } // for(max_time_slices)
+  } // for(max_samples)
+  etime = gettime();
+  if(g_proc_id == 0 && g_debug_level > 0) {
+    printf("ONLINE: measurement done int t/s = %1.4e\n", etime - atime);
+  }
+}
diff --git a/online_measurement.h b/meas/correlators.h
similarity index 92%
rename from online_measurement.h
rename to meas/correlators.h
index 8d03cb24c..c9a1c4ac0 100644
--- a/online_measurement.h
+++ b/meas/correlators.h
@@ -21,6 +21,6 @@
 #ifndef _ONLINE_MEASUREMENT_H
 #define _ONLINE_MEASUREMENT_H
 
-void online_measurement(const int traj, const int t0, const int ieo);
+void correlators_measurement(const int traj, const int id, const int ieo); /* id: index into measurement_list */
 
 #endif
diff --git a/meas/field_strength_types.h b/meas/field_strength_types.h
new file mode 100644
index 000000000..b245d2f0d
--- /dev/null
+++ b/meas/field_strength_types.h
@@ -0,0 +1,29 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2018 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef FIELD_STRENGTH_TYPES_H
+#define FIELD_STRENGTH_TYPES_H
+/* Result pair passed by pointer to measure_clover_field_strength_observables(). */
+typedef struct field_strength_obs_t {
+  double E; /* written out as "Esym" by the gradient flow measurement -- presumably the energy density */
+  double Q; /* written out as "Qsym" by the gradient flow measurement -- presumably the topological charge */
+} field_strength_obs_t;
+
+#endif
diff --git a/meas/gradient_flow.c b/meas/gradient_flow.c
new file mode 100644
index 000000000..cb8db7476
--- /dev/null
+++ b/meas/gradient_flow.c
@@ -0,0 +1,240 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2013      Albert Deuzeman 
+ *               2015,2018 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include <tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "fatal_error.h"
+#include "aligned_malloc.h"
+#include "expo.h"
+#include "get_staples.h"
+#include "get_rectangle_staples.h"
+#include "gettime.h"
+#include "measure_gauge_action.h"
+#include "matrix_utils.h"
+#include "xchange/xchange_gauge.h"
+#include "gradient_flow.h"
+#include "measure_clover_field_strength_observables.h"
+#include "meas/field_strength_types.h"
+#include "meas/measurements.h"
+
+/* One three-stage flow step of size eps: x0 is advanced in place; x1, x2 and z are work fields; type is unused. */
+void step_gradient_flow(su3 ** x0, su3 ** x1, su3 ** x2, su3 ** z, const unsigned int type, const double eps ) {
+  double zfac[5] = { 1, (8.0)/(9.0), (-17.0)/(36.0), (3.0)/(4.0), -1 };
+  double zepsfac[3] = { 0.25, 1, 1 };
+  su3** fields[4];
+  /* t1 is only read under the same debug-level test; initialised so that it is never indeterminate */
+  double t1 = 0.0;
+  if( g_debug_level >= 4 ) t1 = gettime();
+  /* stage f reads fields[f] and writes fields[f+1]; the last stage lands back in x0 */
+  fields[0] = x0;
+  fields[1] = x1;
+  fields[2] = x2;
+  fields[3] = x0;
+
+#ifdef TM_USE_MPI
+  xchange_gauge(x0);
+#endif
+
+#ifdef TM_USE_OMP
+#pragma omp parallel
+#endif
+  {
+  /* temporaries are declared inside the parallel region, so every thread has its own */
+  su3 ALIGN w,w1;
+  su3 ALIGN z_tmp;
+
+  // implementation of third-order Runge-Kutta integrator following Luescher's hep-lat/1006.4518
+  // this can probably be improved...
+
+  for( int f = 0; f < 3; ++f ){
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+    for( int x = 0; x < VOLUME; ++x ){
+      for( int mu = 0; mu < 4; ++mu ){
+        get_staples(&w1, x, mu, fields[f]);
+        // usually we dagger the staples, but the sign convention seems to require this
+        _su3_times_su3d(z_tmp,w1,fields[f][x][mu]);
+        project_traceless_antiherm(&z_tmp);
+
+        // implementing the Iwasaki, Symanzik or DBW2 flow from here should be a trivial extension
+        // but it will require adding some (more) parameters and making sure that g_dbw2rand exists
+        // also in the inverter if the measurement is to be carried out there
+        //get_rectangle_staples_general(&w2,x,mu,fields[f]);
+        //_su3_times_su3d(w1,w2,fields[f][x][mu]);
+
+        if(f==0){
+          _real_times_su3(z[x][mu],eps,z_tmp);
+        }else{
+          _real_times_su3(z_tmp,eps*zfac[2*f-1],z_tmp);
+          _su3_refac_acc(z_tmp,zfac[2*f],z[x][mu]);
+          z[x][mu] = z_tmp;
+        }
+        _real_times_su3(z_tmp,zepsfac[f],z[x][mu]);
+        project_traceless_antiherm(&z_tmp);
+        cayley_hamilton_exponent(&w,&z_tmp);
+        _su3_times_su3(fields[f+1][x][mu],w,fields[f][x][mu]);
+      }
+    }
+#ifdef TM_USE_MPI
+#ifdef TM_USE_OMP
+#pragma omp single
+#endif
+    {
+      double tex = gettime();
+      xchange_gauge(fields[f+1]); 
+      if( g_proc_id == 0 && g_debug_level >= 4 ){
+        printf("Time for gauge exchange in gradient flow step: %lf\n", gettime()-tex);
+      }
+    }
+#endif
+  }
+
+  } /* OpenMP parallel closing brace */
+  
+  if( g_proc_id == 0 && g_debug_level >= 4 ){
+    printf("Time for gradient flow step: %lf\n", gettime()-t1);
+  }
+
+}
+/* Flows a copy of g_gauge_field in steps of gf_eps until t[1] reaches gf_tmax; rank 0 writes gradflow.<traj>. */
+void gradient_flow_measurement(const int traj, const int id, const int ieo) {
+  /* index 0 / 1 / 2 = previous / reported / latest flow time; ieo is not used in this routine */
+  double t[3], P[3];
+  field_strength_obs_t fso[3];
+  double W=0, tsqE=0;
+  double t1, t2;
+
+  double eps = measurement_list[id].gf_eps;
+  double tmax = measurement_list[id].gf_tmax;
+
+  if( g_proc_id == 0 ) {
+    printf("# Doing gradient flow measurement. id=%d\n", id);
+    printf("# eps=%lf, tmax=%lf\n", eps, tmax);
+  }
+  /* only rank 0 opens, writes and closes the output file; outfile stays unset on all other ranks */
+  FILE *outfile;
+  if( g_proc_id == 0 ) {
+    char filename[100];
+    snprintf(filename,100,"gradflow.%06d",traj);
+    outfile = fopen(filename,"w");
+
+    if( outfile == NULL ) {
+      char error_message[200];
+      snprintf(error_message,200,"Couldn't open %s for writing during measurement %d!",filename, id);
+      fatal_error(error_message,"gradient_flow_measurement");
+    }
+
+    fprintf(outfile, "traj t P Eplaq Esym tsqEplaq tsqEsym Wsym Qsym\n");
+  }
+
+  aligned_su3_field_t vt = aligned_su3_field_alloc(VOLUMEPLUSRAND+g_dbw2rand);
+  aligned_su3_field_t x1 = aligned_su3_field_alloc(VOLUMEPLUSRAND+g_dbw2rand);
+  aligned_su3_field_t x2 = aligned_su3_field_alloc(VOLUMEPLUSRAND+g_dbw2rand);
+  aligned_su3_field_t z = aligned_su3_field_alloc(VOLUME);
+  /* the flow acts on the copy vt; g_gauge_field is only exchanged (under MPI) and read here */
+#ifdef TM_USE_MPI
+  xchange_gauge(g_gauge_field);
+#endif
+  memcpy(vt.field[0],g_gauge_field[0],sizeof(su3)*4*(VOLUMEPLUSRAND+g_dbw2rand));
+
+  t[0] = fso[0].E = fso[0].Q = P[0] = 0.0;
+  t[1] = fso[1].E = fso[1].Q = P[1] = 0.0;
+  t[2] = fso[2].E = fso[2].Q = P[2] = 0.0;
+
+  t1 = gettime();
+  measure_clover_field_strength_observables(vt.field, &fso[2]);
+  P[2] = measure_plaquette(vt.field)/(6.0*VOLUME*g_nproc);
+  t2 = gettime();
+  if(g_proc_id==0 && g_debug_level > 1) {
+    printf("# GRADFLOW: time for field strength observables measurement: %lf\n",t2-t1);
+  }
+  /* two flow steps per pass, so that W can use the central difference (E[2] - E[0])/(2*eps) around t[1] */
+  while( t[1] < tmax ) {
+    t[0] = t[2];
+    fso[0].E = fso[2].E;
+    fso[0].Q = fso[2].Q;
+    P[0] = P[2];
+    for(int step = 1; step < 3; ++step) {
+      t[step] = t[step-1]+eps;
+      step_gradient_flow(vt.field,x1.field,x2.field,z.field,0,eps);
+      measure_clover_field_strength_observables(vt.field, &fso[step]);
+      P[step] = measure_plaquette(vt.field)/(6.0*VOLUME*g_nproc);
+    }
+    W = t[1]*t[1]*( 2*fso[1].E + t[1]*((fso[2].E - fso[0].E)/(2*eps)) ) ;
+    tsqE = t[1]*t[1]*fso[1].E;
+    
+    if(g_proc_id==0 && g_debug_level >= 3){
+      printf("# GRADFLOW: sym(plaq)  t=%lf 1-P(t)=%1.8lf E(t)=%2.8lf(%2.8lf) t^2E=%2.8lf(%2.8lf) W(t)=%2.8lf Q(t)=%.8lf \n",
+             t[1],
+             1-P[1],
+             fso[1].E,
+             36*(1-P[1]),
+             tsqE,
+             t[1]*t[1]*36*(1-P[1]),
+             W, 
+             fso[1].Q );
+    }
+    if(g_proc_id==0){
+      fprintf(outfile,"%06d %f %2.12lf %2.12lf %2.12lf %2.12lf %2.12lf %2.12lf %.12lf \n",
+                      traj,
+                      t[1],
+                      P[1],
+                      36*(1-P[1]),
+                      fso[1].E,
+                      t[1]*t[1]*36*(1-P[1]),
+                      tsqE,
+                      W,
+                      fso[1].Q );
+      fflush(outfile);
+    }
+  }
+
+  aligned_su3_field_free(&vt);
+  aligned_su3_field_free(&x1);
+  aligned_su3_field_free(&x2);
+  aligned_su3_field_free(&z);
+ 
+  t2 = gettime();
+  
+  if( g_proc_id == 0 ) {
+    if(g_debug_level>1){
+      printf("# GRADFLOW: Gradient flow measurement done in %f seconds!\n",t2-t1);
+    }
+    fclose(outfile);
+  }
+
+  return;
+}
+
diff --git a/meas/gradient_flow.h b/meas/gradient_flow.h
new file mode 100644
index 000000000..43a24ee3e
--- /dev/null
+++ b/meas/gradient_flow.h
@@ -0,0 +1,30 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2013 Albert Deuzeman
+ *               2015 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _GRADIENT_FLOW_H
+#define _GRADIENT_FLOW_H
+
+#include "su3.h"
+
+void step_gradient_flow(su3 ** vt, su3 ** x1, su3 ** x2, su3 ** z, const unsigned int type, const double eps);
+void gradient_flow_measurement(const int traj, const int id, const int ieo);
+
+#endif
diff --git a/meas/measure_clover_field_strength_observables.c b/meas/measure_clover_field_strength_observables.c
new file mode 100644
index 000000000..e37a5038a
--- /dev/null
+++ b/meas/measure_clover_field_strength_observables.c
@@ -0,0 +1,225 @@
+/***********************************************************************
+*
+* Copyright (C) 1995 Ulli Wolff, Stefan Sint
+*               2001,2005 Martin Hasenbusch
+*               2011,2012 Carsten Urbach
+*               2013      Albert Deuzeman
+*               2015,2018 Bartosz Kostrzewa
+*
+* This file is part of tmLQCD.
+*
+* tmLQCD is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+* 
+* tmLQCD is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+* 
+* You should have received a copy of the GNU General Public License
+* along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <errno.h>
+#include <time.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include "global.h"
+#include "su3.h"
+#include "sse.h"
+#include "su3adj.h"
+#include "matrix_utils.h"
+#include "field_strength_types.h" 
+#include "kahan_summation.h"
+#include "omp_accumulator.h"
+#include "tensors.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+void measure_clover_field_strength_observables(const su3 ** const gf, field_strength_obs_t * const fso)
+{
+  // Measures the clover energy density E and topological charge Q of the gauge field gf
+  // and stores the normalised results, summed over threads and MPI ranks, in fso->E, fso->Q.
+  //
+  // Normalisation of E: we have iG_\mu\nu = 1/4 P_T.A. [clover], where P_T.A. is the
+  // projection to the traceless anti-hermitian part. Our projection includes a factor of
+  // 0.5, so instead of the usual (1/8)^2 we get (1/4)^2 of the clover; a further 1/4 comes
+  // from the definition of the energy density <E> = 1/4 Tr[ G_\mu\nu G_\mu\nu ]. The minus
+  // sign compensates for the i^2 in the lattice definition of G_\mu\nu.
+  // The additional multiplication by 4 (the first factor below) originates in the fact that we
+  // only accumulate over the upper triangle of G_\mu\nu, i.e. iG_\mu\nu = 1/2 P_T.A. [upper triangle]
+  
+  const double energy_density_normalization = - 4 / ( 4 * 16.0 * VOLUME * g_nproc);
+
+  // for the topological charge, we would naively have a normalisation of -1/(16 * 32 * pi^2)
+  // but we only sum up contributions from the upper triangle of Gmunu, saving a factor of 4
+  const double topo_charge_normalization = - 1 / ( 4 * 32.0 * M_PI * M_PI );
+  double Eres = 0;
+  double Qres = 0;
+ 
+  // Euclidean 4D totally antisymmetric (Levi-Civita) tensor
+  epsilon4_t eps4 = new_epsilon4();
+
+#ifdef TM_USE_OMP
+  // accumulators for thread-local sums
+  omp_re_acc_t Eacc = new_omp_re_acc();
+  omp_re_acc_t Qacc = new_omp_re_acc();
+  // this involves memory allocation, so we need corresponding frees!
+  omp_re_acc_init(&Eacc, 1);
+  omp_re_acc_init(&Qacc, 1);
+#endif
+
+#ifdef TM_USE_MPI
+  double ALIGN mres=0;
+#endif
+
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+    su3 ALIGN v1, v2;
+    double ALIGN E;
+    double ALIGN Q;
+    
+    // Kahan accumulators for the energy density and the topological charge
+    kahan_re_t E_kahan = new_kahan_re();
+    kahan_re_t Q_kahan = new_kahan_re();
+
+    // for the measurement of the topological charge density, we need to temporarily
+    // store the components of Gmunu
+    // for simplicity of notation, we allocate 4x4 but will only use the
+    // upper triangle (k < l)
+    su3 ALIGN Gmunu[4][4];
+  
+    /*  compute the clover-leaves, store them in Gmunu and compute the energy density
+     *  later compute the topological charge */
+  /*  l  __   __
+        |  | |  |
+        |__| |__|
+        __   __
+        |  | |  |
+        |__| |__| k  */
+  
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+    for(int x = 0; x < VOLUME; x++)
+    {
+      for(int k = 0; k < 4; k++)
+      {
+        for(int l = k+1; l < 4; l++)
+        {
+          int xpk = g_iup[x][k];
+          int xpl = g_iup[x][l];
+          int xmk = g_idn[x][k];
+          int xml = g_idn[x][l];
+          int xpkml = g_idn[xpk][l];
+          int xplmk = g_idn[xpl][k];
+          int xmkml = g_idn[xml][k];
+          const su3 *w1 = &gf[x][k];
+          const su3 *w2 = &gf[xpk][l];
+          const su3 *w3 = &gf[xpl][k];
+          const su3 *w4 = &gf[x][l];
+          _su3_times_su3(v1, *w1, *w2);
+          _su3_times_su3(v2, *w4, *w3);
+          _su3_times_su3d(Gmunu[k][l], v1, v2);
+          w1 = &gf[x][l];
+          w2 = &gf[xplmk][k];
+          w3 = &gf[xmk][l];
+          w4 = &gf[xmk][k];
+          _su3_times_su3d(v1, *w1, *w2);
+          _su3d_times_su3(v2, *w3, *w4);
+          _su3_times_su3_acc(Gmunu[k][l], v1, v2);
+          w1 = &gf[xmk][k];
+          w2 = &gf[xmkml][l];
+          w3 = &gf[xmkml][k];
+          w4 = &gf[xml][l];
+          _su3_times_su3(v1, *w2, *w1);
+          _su3_times_su3(v2, *w3, *w4);
+          _su3d_times_su3_acc(Gmunu[k][l], v1, v2);
+          w1 = &gf[xml][l];
+          w2 = &gf[xml][k];
+          w3 = &gf[xpkml][l];
+          w4 = &gf[x][k];
+          _su3d_times_su3(v1, *w1, *w2);
+          _su3_times_su3d(v2, *w3, *w4);
+          _su3_times_su3_acc(Gmunu[k][l], v1, v2);
+          project_traceless_antiherm(&Gmunu[k][l]);
+          
+          // compute and accumulate the energy density at this stage
+          _trace_su3_times_su3(E, Gmunu[k][l], Gmunu[k][l]);
+          kahan_sum_re_step(E, &E_kahan);
+        }
+      }
+      
+      // sum up the topological charge contribution of this site now
+      for( int i = 0; i < eps4.N; i++ ){
+        int i1 = eps4.eps_idx[i][0];
+        int i2 = eps4.eps_idx[i][1];
+        int i3 = eps4.eps_idx[i][2];
+        int i4 = eps4.eps_idx[i][3];
+
+        // when Gmunu components from the lower triangle are to be used, we can simply skip
+        // them and multiply our normalisation by a factor of two for each of the two index pairs
+        if( eps4.eps_idx[i][1] < eps4.eps_idx[i][0] ){
+          continue;
+        }
+        if( eps4.eps_idx[i][3] < eps4.eps_idx[i][2] ){
+          continue;
+        }
+        
+        _trace_su3_times_su3( Q, 
+                              Gmunu[ i1 ][ i2 ],
+                              Gmunu[ i3 ][ i4 ] );
+
+        // (Kahan) accumulate the topological charge and take care of the signs coming
+        // from the Levi-Civita tensor
+        kahan_sum_re_step(eps4.eps_val[i]*Q, &Q_kahan);
+      }
+
+    }
+    
+    E = kahan_sum_re_final(&E_kahan);
+    Q = kahan_sum_re_final(&Q_kahan);
+
+#ifdef TM_USE_OMP
+    omp_re_acc_add(&Eacc, &E, 1);
+    omp_re_acc_add(&Qacc, &Q, 1);
+  } /* OpenMP parallel closing brace */
+
+  omp_re_acc_reduce(&Eres, &Eacc);  
+  omp_re_acc_reduce(&Qres, &Qacc);
+  // free omp accumulator memory
+  omp_re_acc_free(&Eacc);
+  omp_re_acc_free(&Qacc); 
+
+#else
+  Eres = E;
+  Qres = Q;
+#endif
+
+#ifdef TM_USE_MPI
+  MPI_Allreduce(&Eres, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  Eres = mres;
+  MPI_Allreduce(&Qres, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  Qres = mres;
+#endif
+  fso->E = energy_density_normalization * Eres;
+  fso->Q = topo_charge_normalization * Qres;
+}
diff --git a/meas/measure_clover_field_strength_observables.h b/meas/measure_clover_field_strength_observables.h
new file mode 100644
index 000000000..93b177b3c
--- /dev/null
+++ b/meas/measure_clover_field_strength_observables.h
@@ -0,0 +1,28 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015,2018 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef MEASURE_CLOVER_FIELD_STRENGTH_OBSERVABLES_H
+#define MEASURE_CLOVER_FIELD_STRENGTH_OBSERVABLES_H
+
+#include "meas/field_strength_types.h"
+
+void measure_clover_field_strength_observables(const su3 ** const gf, field_strength_obs_t * const ret);
+
+#endif  
diff --git a/measurements.c b/meas/measurements.c
similarity index 83%
rename from measurements.c
rename to meas/measurements.c
index 24765eda5..b18d263f9 100644
--- a/measurements.c
+++ b/meas/measurements.c
@@ -20,7 +20,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -29,10 +29,12 @@
 #include "global.h"
 #include "default_input_values.h"
 #include "read_input.h"
+
 #include "pion_norm.h"
-#include "online_measurement.h"
+#include "correlators.h"
 #include "polyakov_loop.h"
-#include "measure_oriented_plaquettes.h"
+#include "oriented_plaquettes.h"
+#include "gradient_flow.h"
 #include "measurements.h"
 
 measurement measurement_list[max_no_measurements];
@@ -55,13 +57,14 @@ int init_measurements(){
  int i;
   for(i = 0; i < no_measurements; i++) {
  
+    measurement_list[i].seed = random_seed;
     if(measurement_list[i].type == ONLINE) {
-      measurement_list[i].measurefunc = &online_measurement;
+      measurement_list[i].measurefunc = &correlators_measurement;
       measurement_list[i].max_source_slice = g_nproc_t*T;
     }
 
     if(measurement_list[i].type == PIONNORM) {
-      measurement_list[i].measurefunc = &pion_norm;
+      measurement_list[i].measurefunc = &pion_norm_measurement;
       measurement_list[i].max_source_slice = g_nproc_z*LZ;
     }
     
@@ -70,7 +73,11 @@ int init_measurements(){
     }
 
     if(measurement_list[i].type == ORIENTED_PLAQUETTES) {
-      measurement_list[i].measurefunc = oriented_plaquettes_measurement;
+      measurement_list[i].measurefunc = &oriented_plaquettes_measurement;
+    }
+
+    if(measurement_list[i].type == GRADIENT_FLOW) {
+      measurement_list[i].measurefunc = &gradient_flow_measurement;
     }
     
     measurement_list[i].id = i;
diff --git a/measurements.h b/meas/measurements.h
similarity index 86%
rename from measurements.h
rename to meas/measurements.h
index 676d27965..73e695a17 100644
--- a/measurements.h
+++ b/meas/measurements.h
@@ -29,7 +29,8 @@ enum MEAS_TYPE {
   ONLINE, 
   PIONNORM, 
   POLYAKOV, 
-  ORIENTED_PLAQUETTES 
+  ORIENTED_PLAQUETTES,
+  GRADIENT_FLOW 
   };
 
 typedef struct {
@@ -43,7 +44,10 @@ typedef struct {
   int max_iter;
   /* for polyakov loop */
   int direction;
-  
+
+  // random seed
+  unsigned int seed;
+
   /* how it's usually called */
   char name[100];
 
@@ -52,7 +56,16 @@ typedef struct {
     T(LZ) by init_measurements
   */
   int max_source_slice;
-  
+
+  /* for correlators, can also measure all time-slices or average over samples */
+  int all_time_slices;
+  int no_samples;
+
+  // step size for gradient flow measurement
+  double gf_eps;
+  // maximum flow time for gf measurement
+  double gf_tmax;
+
   /* functions for the measurement */
   void (*measurefunc) (const int traj, const int id, const int ieo);
 } measurement;
diff --git a/measure_oriented_plaquettes.c b/meas/oriented_plaquettes.c
similarity index 92%
rename from measure_oriented_plaquettes.c
rename to meas/oriented_plaquettes.c
index aa111aab1..9ca94de3e 100644
--- a/measure_oriented_plaquettes.c
+++ b/meas/oriented_plaquettes.c
@@ -20,24 +20,24 @@
  ************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 
 #include <string.h>
 #include <stdio.h>
 
+#include "global.h"
 #include "su3.h"
 #include "geometry_eo.h"
-#include "global.h"
-#include "measure_oriented_plaquettes.h"
+#include "oriented_plaquettes.h"
 #include "fatal_error.h"
-
+#include "measurements.h"
 
 void measure_oriented_plaquettes(const su3 ** const gf, double *plaq) {
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mplaq[6];
 #endif
 
@@ -77,7 +77,7 @@ void measure_oriented_plaquettes(const su3 ** const gf, double *plaq) {
     plaq[j] = kc[j]/(g_nproc*VOLUME);
   }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(plaq, mplaq, 6, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   for(int j = 0; j < 6; j++)
     plaq[j] = mplaq[j];
@@ -87,6 +87,10 @@ void measure_oriented_plaquettes(const su3 ** const gf, double *plaq) {
 
 void oriented_plaquettes_measurement(const int traj, const int id, const int ieo) {
   double plaq[6];
+
+  if( g_proc_id == 0 ) {
+    printf("# Doing oriented plaquettes measurement.\n");
+  }
   measure_oriented_plaquettes((const su3** const)g_gauge_field,plaq);
 
   if( g_proc_id == 0 ) {
diff --git a/measure_oriented_plaquettes.h b/meas/oriented_plaquettes.h
similarity index 100%
rename from measure_oriented_plaquettes.h
rename to meas/oriented_plaquettes.h
diff --git a/pion_norm.c b/meas/pion_norm.c
similarity index 94%
rename from pion_norm.c
rename to meas/pion_norm.c
index acf24cbe2..cd3523ad8 100644
--- a/pion_norm.c
+++ b/meas/pion_norm.c
@@ -20,7 +20,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -41,7 +41,7 @@
 #include "pion_norm.h"
 #include "gettime.h"
 
-void pion_norm(const int traj, const int id, const int ieo) {
+void pion_norm_measurement(const int traj, const int id, const int ieo) {
   int i, j, z, zz, z0;
   double *Cpp;
   double res = 0.;
@@ -49,7 +49,7 @@ void pion_norm(const int traj, const int id, const int ieo) {
   double atime, etime;
   float tmp;
   solver_params_t tmp_solver_params;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double mpi_res = 0.;
 #endif
   FILE *ofs, *ofs2;
@@ -69,7 +69,7 @@ void pion_norm(const int traj, const int id, const int ieo) {
   }
   ranlxs(&tmp, 1);
   z0 = (int)(measurement_list[id].max_source_slice*tmp);
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Bcast(&z0, 1, MPI_INT, 0, MPI_COMM_WORLD);
 #endif
 
@@ -87,7 +87,8 @@ void pion_norm(const int traj, const int id, const int ieo) {
   /* invert on the stochastic source */
   invert_eo(g_spinor_field[2], g_spinor_field[3], 
             g_spinor_field[0], g_spinor_field[1],
-            1.e-14, measurement_list[id].max_iter, CG, 1, 0, ieo, 0, NULL,tmp_solver_params, -1);
+            1.e-14, measurement_list[id].max_iter, CG, 1, 0, ieo, 0, NULL,tmp_solver_params, -1,
+            NO_EXT_INV, SLOPPY_DOUBLE, NO_COMPRESSION);
 
   /* now we bring it to normal format */
   /* here we use implicitly DUM_MATRIX and DUM_MATRIX+1 */
@@ -107,14 +108,14 @@ void pion_norm(const int traj, const int id, const int ieo) {
 
 
     
-#if defined MPI
+#if defined TM_USE_MPI
     MPI_Reduce(&res, &mpi_res, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_z_slices);
     res = mpi_res;
 #endif
     Cpp[z+g_proc_coords[3]*LZ] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_t*T)*2.;
   }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   /* some gymnastics needed in case of parallelisation */
   if(g_mpi_z_rank == 0) {
     MPI_Gather(&Cpp[g_proc_coords[3]*LZ], LZ, MPI_DOUBLE, Cpp, LZ, MPI_DOUBLE, 0, g_mpi_ST_slices);
diff --git a/pion_norm.h b/meas/pion_norm.h
similarity index 93%
rename from pion_norm.h
rename to meas/pion_norm.h
index 720664b4a..49ca916d4 100644
--- a/pion_norm.h
+++ b/meas/pion_norm.h
@@ -23,7 +23,7 @@
 #ifndef _PION_NORM_H
 #define _PION_NORM_H
 
-void pion_norm(const int traj, const int id, const int ieo);
+void pion_norm_measurement(const int traj, const int id, const int ieo);
 
 #endif
 
diff --git a/polyakov_loop.c b/meas/polyakov_loop.c
similarity index 98%
rename from polyakov_loop.c
rename to meas/polyakov_loop.c
index faa885155..62a481a69 100644
--- a/polyakov_loop.c
+++ b/meas/polyakov_loop.c
@@ -29,13 +29,13 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <time.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -56,7 +56,7 @@ void polyakov_loop(_Complex double * pl_, const int mu) {
   su3 *v = NULL , *w = NULL;
   static _Complex double pl; 
   /* For the Kahan summation:*/
-#ifdef MPI
+#ifdef TM_USE_MPI
   static _Complex double pls; 
 #endif
   static _Complex double ks = 0.0, kc = 0.0, tr, ts, tt;
@@ -67,7 +67,7 @@ void polyakov_loop(_Complex double * pl_, const int mu) {
     fprintf(stderr, "Wrong parameter for Polyakov loop calculation in polyakov_loop.c:\n");
     fprintf(stderr, "Only direction %d and %d are allowed.\n",2,3);
     fprintf(stderr, "Actual value is %d! Aborting...\n",mu);
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Abort(MPI_COMM_WORLD, 10);
     MPI_Finalize();
 #endif
@@ -142,7 +142,7 @@ void polyakov_loop(_Complex double * pl_, const int mu) {
   
   
   /* Collect the results and return:*/
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(&pl, &pls, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   pl=pls;
 #endif
@@ -170,7 +170,7 @@ int polyakov_loop_0(const int nstore, _Complex double *pl) {
   
   FILE *ofs = NULL;
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   int iproc;
   MPI_Status status;
   su3 *tmp_nnb = NULL;
@@ -217,7 +217,7 @@ int polyakov_loop_0(const int nstore, _Complex double *pl) {
   
   /********************************************************************************/
     
-#ifdef MPI
+#ifdef TM_USE_MPI
   /***************
    * global part *
    ***************/
@@ -273,7 +273,7 @@ int polyakov_loop_0(const int nstore, _Complex double *pl) {
     pl_tmp = ks + kc;
   }
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   /* (3) sum over all contributions from all nodes (also nodes with pl_tmp=0;
      apparently the easiest way) */
   MPI_Reduce(&pl_tmp, pl, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, g_cart_grid);
@@ -303,7 +303,7 @@ int polyakov_loop_0(const int nstore, _Complex double *pl) {
     fprintf(ofs, "%25.16e\t%25.16e\n", creal(*pl), cimag(*pl)); 
     fclose(ofs);
   }
-#ifdef MPI
+#ifdef TM_USE_MPI
   free(tmp_nnb);
 #endif
   free(tmp_loc);
@@ -335,7 +335,7 @@ int polyakov_loop_dir(
 
   FILE *ofs;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   int rank_slice, rank_ray;
   MPI_Comm slice, ray;
   su3 *tmp_ray;
@@ -434,7 +434,7 @@ int polyakov_loop_dir(
 
   /********************************************************************************/
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   /***************
    * global part *
    ***************/
@@ -484,7 +484,7 @@ int polyakov_loop_dir(
     kc = 0.0;
     ks = 0.0;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  ifdef PARALLELXYZT
     u = tmp_ray;
 #  else
@@ -506,7 +506,7 @@ int polyakov_loop_dir(
     }
     pl_tmp = ks + kc;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Reduce(&pl_tmp, &pl, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, slice);
   }
 #  ifndef PARALLELXYZT
@@ -524,7 +524,7 @@ int polyakov_loop_dir(
   /* normalization pl |-> pl / ( 3 * 3-dim. volume)*/
   VOLUME3 = VOL3;
    
-#ifdef MPI
+#ifdef TM_USE_MPI
   if(rank_slice==0 && rank_ray==0) { /* this process has the sum 
 					of the Polyakov loop values */
     if(dir==0) { 
@@ -550,7 +550,7 @@ int polyakov_loop_dir(
     }
     fprintf(ofs, "%4d\t%2d\t%25.16e\t%25.16e\n", nstore, dir, creal(pl), cimag(pl));
     fclose(ofs);
-#if defined MPI
+#if defined TM_USE_MPI
   }
 #endif
   free(tmp_loc);
diff --git a/polyakov_loop.h b/meas/polyakov_loop.h
similarity index 100%
rename from polyakov_loop.h
rename to meas/polyakov_loop.h
diff --git a/measure_gauge_action.c b/measure_gauge_action.c
index 04bb5f358..92aa3c2bb 100644
--- a/measure_gauge_action.c
+++ b/measure_gauge_action.c
@@ -27,12 +27,12 @@
  ************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "su3.h"
@@ -45,11 +45,11 @@
 
 double measure_plaquette(const su3 ** const gf) {
   static double res;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
@@ -61,7 +61,7 @@ double measure_plaquette(const su3 ** const gf) {
   double ALIGN ac, ks, kc, tr, ts, tt;
 
   kc=0.0; ks=0.0;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < VOLUME; ix++){
@@ -85,20 +85,20 @@ double measure_plaquette(const su3 ** const gf) {
     }
   }
   kc=(kc+ks)/3.0;
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = kc;
 #else
   res = kc;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP parallel closing brace */
 
   res = 0.0;
   for(int i=0; i < omp_num_threads; ++i)
     res += g_omp_acc_re[i];
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   res = mres;
 #endif
@@ -107,11 +107,11 @@ double measure_plaquette(const su3 ** const gf) {
 
 double measure_gauge_action(const su3 ** const gf, const double lambda) {
   static double res;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
@@ -123,7 +123,7 @@ double measure_gauge_action(const su3 ** const gf, const double lambda) {
   double ALIGN ac, ks, kc, tr, ts, tt;
 
   kc=0.0; ks=0.0;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (int ix = 0; ix < VOLUME; ix++){
@@ -167,20 +167,20 @@ double measure_gauge_action(const su3 ** const gf, const double lambda) {
     }
   }
   kc=(kc+ks)/3.0;
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = kc;
 #else
   res = kc;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP parallel closing brace */
 
   res = 0.0;
   for(int i=0; i < omp_num_threads; ++i)
     res += g_omp_acc_re[i];
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   res = mres;
 #endif
diff --git a/measure_rectangles.c b/measure_rectangles.c
index 82ad165bd..a7de2e5fb 100644
--- a/measure_rectangles.c
+++ b/measure_rectangles.c
@@ -32,12 +32,12 @@
  *******************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -50,11 +50,11 @@
 
 double measure_rectangles(const su3 ** const gf) {
   static double res;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
@@ -67,7 +67,7 @@ double measure_rectangles(const su3 ** const gf) {
 
   kc = 0.0;
   ks = 0.0;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (i = 0; i < VOLUME; i++) {
@@ -117,13 +117,13 @@ double measure_rectangles(const su3 ** const gf) {
     }
   }
   kc=(kc+ks)/3.0;
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = kc;
 #else
   res = kc;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP parallel closing brace */
   
   res = 0.0;
@@ -131,7 +131,7 @@ double measure_rectangles(const su3 ** const gf) {
     res += g_omp_acc_re[i];
 #else
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   res = mres;
 #endif
diff --git a/misc_types.h b/misc_types.h
new file mode 100644
index 000000000..36b2564ba
--- /dev/null
+++ b/misc_types.h
@@ -0,0 +1,93 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef MISC_TYPES_H
+#define MISC_TYPES_H
+
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+
+#ifdef TM_USE_QPHIX
+#include "qphix/qphix_config.h"
+#endif
+
+#ifdef QPHIX_QMP_COMMS
+#include <qmp.h>
+#endif
+
+#include "tm_debug_printf.h"
+
+#define TM_GAUGE_FIELD_NAME_LENGTH 100
+#define TM_GAUGE_PROPAGATE_THRESHOLD 10.0
+#define TM_GAUGE_PROPAGATE_MIN 0.01
+
+/* enumeration type for the identity of the program
+ * which is being executed
+ * this is useful to unify various utility functions which
+ * otherwise lead to a lot of code duplication */
+typedef enum tm_ProgramId_t {
+  TM_PROGRAM_HMC_TM = 0,
+  TM_PROGRAM_INVERT,
+  TM_PROGRAM_OFFLINE_MEASUREMENT,
+  TM_PROGRAM_BENCHMARK,
+  TM_PROGRAM_EXTERNAL
+} tm_ProgramId_t;
+
+/* enumeration type for return value 
+ * we follow http://tldp.org/LDP/abs/html/exitcodes.html for the starting 
+ * value */
+typedef enum tm_ExitCode_t {
+  TM_EXIT_SUCCESS = 0,
+  TM_EXIT_INVALID_CMDLINE_ARG = 166
+} tm_ExitCode_t;
+
+/* enumeration type for the sloppy prec. of the inverter */
+typedef enum SloppyPrecision_s {
+  SLOPPY_DOUBLE = 0,
+  SLOPPY_SINGLE,
+  SLOPPY_HALF
+} SloppyPrecision;
+
+/* enumeration type for the compression of the inverter */
+typedef enum CompressionType_s {
+  NO_COMPRESSION = 18,
+  COMPRESSION_12 = 12,
+  COMPRESSION_8  = 8
+} CompressionType;
+
+/* enumeration type for the external inverter */
+typedef enum ExternalInverter_s {
+  NO_EXT_INV = 0,
+  QUDA_INVERTER,
+  QPHIX_INVERTER
+} ExternalInverter;
+
+typedef enum backup_restore_t {
+  TM_BACKUP_GLOBALS = 0,
+  TM_RESTORE_GLOBALS
+} backup_restore_t;
+
+typedef enum real_imag_t {
+  TM_REAL = 0,
+  TM_IMAG
+} real_imag_t;
+
+#endif // MISC_TYPES_H
diff --git a/monomial/Makefile.in b/monomial/Makefile.in
index 1e3a3e4cc..596123828 100644
--- a/monomial/Makefile.in
+++ b/monomial/Makefile.in
@@ -32,6 +32,7 @@ COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS}
 LIBRARIES = libmonomial
 libmonomial_TARGETS =  nddetratio_monomial monomial det_monomial detratio_monomial \
 	gauge_monomial ndpoly_monomial clover_trlog_monomial cloverdet_monomial cloverdetratio_monomial \
+	cloverdetratio_rwmonomial \
 	clovernd_trlog_monomial poly_monomial cloverndpoly_monomial moment_energy \
 	ndrat_monomial ndratcor_monomial rat_monomial ratcor_monomial monitor_forces
 
@@ -62,10 +63,10 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-${libmonomial_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${libmonomial_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${OPTARGS} -c $<
 
-${libmonomial_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${libmonomial_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${SOPTARGS} -c $<
 
 # rule to make libmonomial
diff --git a/monomial/clover_trlog_monomial.c b/monomial/clover_trlog_monomial.c
index c7abdd53c..ccaf32ce8 100644
--- a/monomial/clover_trlog_monomial.c
+++ b/monomial/clover_trlog_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index 48ce9feae..cf30a2513 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -34,17 +34,20 @@
 #include "gettime.h"
 #include "linalg_eo.h"
 #include "deriv_Sb.h"
+#include "deriv_Sb_D_psi.h"
 #include "gamma.h"
 #include "operator/tm_operators.h"
 #include "operator/Hopping_Matrix.h"
 #include "solver/chrono_guess.h"
 #include "solver/solver.h"
+#include "solver/monomial_solve.h"
 #include "operator/clover_leaf.h"
 #include "read_input.h"
 #include "hamiltonian_field.h"
 #include "boundary.h"
 #include "monomial/monomial.h"
 #include "operator/clovertm_operators.h"
+#include "operator/clovertm_operators_32.h"
 #include "cloverdet_monomial.h"
 
 /* think about chronological solver ! */
@@ -52,6 +55,7 @@
 void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
   double atime, etime;
+  int N = VOLUME/2;
   atime = gettime();
   for(int i = 0; i < VOLUME; i++) { 
     for(int mu = 0; mu < 4; mu++) { 
@@ -63,20 +67,27 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   mnl->forcefactor = 1.;
   /*********************************************************************
    * 
-   * even/odd version 
    *
    * This a term is det(\hat Q^2(\mu))
    *
    *********************************************************************/
   
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
+  g_c_sw = mnl->c_sw;
   g_mu = mnl->mu;
   g_mu3 = mnl->rho;
+  g_kappa = mnl->kappa;
   boundary(mnl->kappa);
   
   // we compute the clover term (1 + T_ee(oo)) for all sites x
   sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
   // we invert it for the even sites only
-  sw_invert(EE, mnl->mu);
+  if(!mnl->even_odd_flag) {
+    N = VOLUME;
+  }
+  else {
+    sw_invert(EE, mnl->mu);
+  }
   
   if(mnl->solver != CG && g_proc_id == 0) {
     fprintf(stderr, "Bicgstab currently not implemented, using CG instead! (cloverdet_monomial.c)\n");
@@ -85,41 +96,49 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   // Invert Q_{+} Q_{-}
   // X_o -> w_fields[1]
   chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array,
-	       mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
-  mnl->iter1 += cg_her(mnl->w_fields[1], mnl->pf, mnl->maxiter, mnl->forceprec, 
-		       g_relative_precision_flag, VOLUME/2, mnl->Qsq);
+               mnl->csg_N, mnl->csg_n, N, mnl->Qsq);
+  mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->pf, mnl->solver_params, mnl->maxiter,
+                                 mnl->forceprec, g_relative_precision_flag, N, mnl->Qsq,
+                                 mnl->solver);
   chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
-		      mnl->csg_N, &mnl->csg_n, VOLUME/2);
+                      mnl->csg_N, &mnl->csg_n, N);
   
   // Y_o -> w_fields[0]
   mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
+  if(mnl->even_odd_flag) {
+    // apply Hopping Matrix M_{eo}
+    // to get the even sites of X_e
+    H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu);
+    // \delta Q sandwiched by Y_o^\dagger and X_e
+    deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
+    
+    // to get the even sites of Y_e
+    H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu);
+    // \delta Q sandwiched by Y_e^\dagger and X_o
+    // uses the gauge field in hf and changes the derivative fields in hf
+    deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor);
+    
+    // here comes the clover term...
+    // computes the insertion matrices for S_eff
+    // result is written to swp and swm
+    // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e
+    sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+    
+    // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
+    sw_spinor_eo(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
   
-  // apply Hopping Matrix M_{eo}
-  // to get the even sites of X_e
-  H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu);
-  // \delta Q sandwitched by Y_o^\dagger and X_e
-  deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
-  
-  // to get the even sites of Y_e
-  H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu);
-  // \delta Q sandwitched by Y_e^\dagger and X_o
-  // uses the gauge field in hf and changes the derivative fields in hf
-  deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor);
-  
-  // here comes the clover term...
-  // computes the insertion matrices for S_eff
-  // result is written to swp and swm
-  // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e
-  sw_spinor(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
-  
-  // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
-  sw_spinor(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
-  
-  // compute the contribution for the det-part
-  // we again compute only the insertion matrices for S_det
-  // the result is added to swp and swm
-  // even sites only!
-  sw_deriv(EE, mnl->mu);
+    // compute the contribution for the det-part
+    // we again compute only the insertion matrices for S_det
+    // the result is added to swp and swm
+    // even sites only!
+    sw_deriv(EE, mnl->mu);
+  }
+  else {
+    /* \delta Q sandwiched by Y^\dagger and X */
+    deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor);
+
+    sw_spinor(mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+  }
   
   // now we compute
   // finally, using the insertion matrices stored in swm and swp
@@ -127,9 +146,7 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   // uses the gaugefields in hf and changes the derivative field in hf
   sw_all(hf, mnl->kappa, mnl->c_sw);
 
-  g_mu = g_mu1;
-  g_mu3 = 0.;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_debug_level > 1 && g_proc_id == 0) {
     printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime);
@@ -143,10 +160,13 @@ void cloverdet_heatbath(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
   double atime, etime;
   atime = gettime();
+  int N = VOLUME/2;
 
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_mu = mnl->mu;
   g_mu3 = mnl->rho;
   g_c_sw = mnl->c_sw;
+  g_kappa = mnl->kappa;
   boundary(mnl->kappa);
   mnl->csg_n = 0;
   mnl->csg_n2 = 0;
@@ -155,18 +175,22 @@ void cloverdet_heatbath(const int id, hamiltonian_field_t * const hf) {
 
   init_sw_fields();
   sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
-  sw_invert(EE, mnl->mu);
 
-  random_spinor_field_eo(mnl->w_fields[0], mnl->rngrepro, RN_GAUSS);
-  mnl->energy0 = square_norm(mnl->w_fields[0], VOLUME/2, 1);
+  if(!mnl->even_odd_flag) {
+    N = VOLUME;
+    random_spinor_field_lexic(mnl->w_fields[0], mnl->rngrepro, RN_GAUSS);
+  }
+  else {
+    sw_invert(EE, mnl->mu);
+    random_spinor_field_eo(mnl->w_fields[0], mnl->rngrepro, RN_GAUSS);
+  }
+  mnl->energy0 = square_norm(mnl->w_fields[0], N, 1);
   
   mnl->Qp(mnl->pf, mnl->w_fields[0]);
   chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
-		      mnl->csg_N, &mnl->csg_n, VOLUME/2);
+                      mnl->csg_N, &mnl->csg_n, N);
 
-  g_mu = g_mu1;
-  g_mu3 = 0.;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
@@ -185,29 +209,43 @@ double cloverdet_acc(const int id, hamiltonian_field_t * const hf) {
   int save_sloppy = g_sloppy_precision_flag;
   double atime, etime;
   atime = gettime();
+  int N = VOLUME/2;
 
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_mu = mnl->mu;
   g_mu3 = mnl->rho;
   g_c_sw = mnl->c_sw;
+  g_kappa = mnl->kappa;
   boundary(mnl->kappa);
 
   sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
-  sw_invert(EE, mnl->mu);
 
-  chrono_guess(mnl->w_fields[0], mnl->pf, mnl->csg_field, mnl->csg_index_array,
-	       mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
+  if(!mnl->even_odd_flag) {
+    N = VOLUME;
+  }
+  else {
+    sw_invert(EE, mnl->mu);
+  }
+
   g_sloppy_precision_flag = 0;
-  mnl->iter0 = cg_her(mnl->w_fields[0], mnl->pf, mnl->maxiter, mnl->accprec,  
-		      g_relative_precision_flag, VOLUME/2, mnl->Qsq); 
-  mnl->Qm(mnl->w_fields[0], mnl->w_fields[0]);
-  
+
+  if( mnl->solver == MG ){ /* MG: one solve with Qp */
+      chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array,
+                   mnl->csg_N, mnl->csg_n, N, mnl->Qp);
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->pf, mnl->solver_params, mnl->maxiter, mnl->accprec,
+                                     g_relative_precision_flag, N, mnl->Qp, mnl->solver);
+  } else { /* otherwise solve with Qsq, then apply Qm */
+      chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array,
+                   mnl->csg_N, mnl->csg_n, N, mnl->Qsq);
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->pf, mnl->solver_params, mnl->maxiter, mnl->accprec,
+                                     g_relative_precision_flag, N, mnl->Qsq, mnl->solver);
+      mnl->Qm(mnl->w_fields[0], mnl->w_fields[0]);
+  }
   g_sloppy_precision_flag = save_sloppy;
   /* Compute the energy contr. from first field */
-  mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME/2, 1);
+  mnl->energy1 = square_norm(mnl->w_fields[0], N, 1);
 
-  g_mu = g_mu1;
-  g_mu3 = 0.;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
@@ -215,7 +253,7 @@ double cloverdet_acc(const int id, hamiltonian_field_t * const hf) {
     }
     if(g_debug_level > 3) {
       printf("called cloverdet_acc for id %d dH = %1.10e\n", 
-	     id, mnl->energy1 - mnl->energy0);
+             id, mnl->energy1 - mnl->energy0);
     }
   }
   return(mnl->energy1 - mnl->energy0);
diff --git a/monomial/cloverdetratio_monomial.c b/monomial/cloverdetratio_monomial.c
index 17a24409d..9e119885b 100644
--- a/monomial/cloverdetratio_monomial.c
+++ b/monomial/cloverdetratio_monomial.c
@@ -20,7 +20,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -37,8 +37,10 @@
 #include "operator/Hopping_Matrix.h"
 #include "solver/chrono_guess.h"
 #include "solver/solver.h"
+#include "solver/monomial_solve.h"
 #include "read_input.h"
 #include "operator/clovertm_operators.h"
+#include "operator/clovertm_operators_32.h"
 #include "operator/clover_leaf.h"
 #include "monomial/monomial.h"
 #include "boundary.h"
@@ -63,6 +65,7 @@ void cloverdetratio_derivative_orig(const int no, hamiltonian_field_t * const hf
    *********************************************************************/
   /* First term coming from the second field */
   /* Multiply with W_+ */
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_mu = mnl->mu;
   g_mu3 = mnl->rho2; //rho2
   boundary(mnl->kappa);
@@ -72,7 +75,7 @@ void cloverdetratio_derivative_orig(const int no, hamiltonian_field_t * const hf
   // we invert it for the even sites only including mu
   sw_invert(EE, mnl->mu);
   
-  if(mnl->solver != CG) {
+  if(mnl->solver == BICGSTAB && g_proc_id==0) {
     fprintf(stderr, "Bicgstab currently not implemented, using CG instead! (detratio_monomial.c)\n");
   }
   
@@ -83,8 +86,8 @@ void cloverdetratio_derivative_orig(const int no, hamiltonian_field_t * const hf
   /* X_W -> w_fields[1] */
   chrono_guess(mnl->w_fields[1], mnl->w_fields[2], mnl->csg_field, 
 	       mnl->csg_index_array, mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
-  mnl->iter1 += cg_her(mnl->w_fields[1], mnl->w_fields[2], mnl->maxiter, 
-		       mnl->forceprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq);
+  mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[2], mnl->solver_params, mnl->maxiter, 
+		       mnl->forceprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
   chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
 		      mnl->csg_N, &mnl->csg_n, VOLUME/2);
   /* Y_W -> w_fields[0]  */
@@ -105,10 +108,10 @@ void cloverdetratio_derivative_orig(const int no, hamiltonian_field_t * const hf
   // computes the insertion matrices for S_eff
   // result is written to swp and swm
   // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e  
-  sw_spinor(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+  sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
   
   // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
-  sw_spinor(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+  sw_spinor_eo(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
 
   g_mu3 = mnl->rho2; // rho2
   
@@ -131,16 +134,14 @@ void cloverdetratio_derivative_orig(const int no, hamiltonian_field_t * const hf
   // computes the insertion matrices for S_eff
   // result is written to swp and swm
   // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e
-  sw_spinor(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+  sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
   
   // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
-  sw_spinor(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+  sw_spinor_eo(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
 
   sw_all(hf, mnl->kappa, mnl->c_sw);
   
-  g_mu = g_mu1;
-  g_mu3 = 0.;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_debug_level > 1 && g_proc_id == 0) {
     printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime);
@@ -171,15 +172,18 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
    *********************************************************************/
   /* First term coming from the second field */
   /* Multiply with W_+ */
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_mu = mnl->mu;
-  boundary(mnl->kappa);
+  g_kappa = mnl->kappa;
+  g_c_sw = mnl->c_sw;
+  boundary(g_kappa);
 
   // we compute the clover term (1 + T_ee(oo)) for all sites x
   sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
   // we invert it for the even sites only including mu
   sw_invert(EE, mnl->mu);
   
-  if(mnl->solver != CG) {
+  if(mnl->solver == BICGSTAB && g_proc_id == 0) {
     fprintf(stderr, "Bicgstab currently not implemented, using CG instead! (cloverdetratio_monomial.c)\n");
   }
   
@@ -192,8 +196,8 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
   // X_W -> w_fields[1] 
   chrono_guess(mnl->w_fields[1], mnl->w_fields[2], mnl->csg_field, 
 	       mnl->csg_index_array, mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
-  mnl->iter1 += cg_her(mnl->w_fields[1], mnl->w_fields[2], mnl->maxiter, 
-		       mnl->forceprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq);
+  mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[2], mnl->solver_params, mnl->maxiter, 
+		       mnl->forceprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
   chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
 		      mnl->csg_N, &mnl->csg_n, VOLUME/2);
   // Apply Q_{-} to get Y_W -> w_fields[0] 
@@ -216,16 +220,14 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
   // computes the insertion matrices for S_eff
   // result is written to swp and swm
   // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e  
-  sw_spinor(EO, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+  sw_spinor_eo(EO, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
   
   // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
-  sw_spinor(OE, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+  sw_spinor_eo(OE, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
 
   sw_all(hf, mnl->kappa, mnl->c_sw);
-  
-  g_mu = g_mu1;
-  g_mu3 = 0.;
-  boundary(g_kappa);
+
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
     if(g_debug_level > 1 && g_proc_id == 0) {
     printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime);
@@ -238,9 +240,11 @@ void cloverdetratio_heatbath(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
   double atime, etime;
   atime = gettime();
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_mu = mnl->mu;
   g_c_sw = mnl->c_sw;
-  boundary(mnl->kappa);
+  g_kappa = mnl->kappa;
+  boundary(g_kappa);
   mnl->csg_n = 0;
   mnl->csg_n2 = 0;
   mnl->iter0 = 0;
@@ -258,12 +262,20 @@ void cloverdetratio_heatbath(const int id, hamiltonian_field_t * const hf) {
   g_mu3 = mnl->rho2;
   zero_spinor_field(mnl->pf,VOLUME/2);
 
-  mnl->iter0 = cg_her(mnl->pf, mnl->w_fields[1], mnl->maxiter, mnl->accprec,  
-		      g_relative_precision_flag, VOLUME/2, mnl->Qsq); 
-
-  chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
-		      mnl->csg_N, &mnl->csg_n, VOLUME/2);
-  mnl->Qm(mnl->pf, mnl->pf);
+  if( mnl->solver == MG ){
+      mnl->iter0 = solve_degenerate(mnl->pf, mnl->w_fields[1], mnl->solver_params, mnl->maxiter, mnl->accprec,  
+				    g_relative_precision_flag, VOLUME/2, mnl->Qp, mnl->solver); 
+      
+      chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
+			  mnl->csg_N, &mnl->csg_n, VOLUME/2);
+  } else {
+      mnl->iter0 = solve_degenerate(mnl->pf, mnl->w_fields[1], mnl->solver_params, mnl->maxiter, mnl->accprec,  
+				    g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver); 
+      
+      chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
+			  mnl->csg_N, &mnl->csg_n, VOLUME/2);
+      mnl->Qm(mnl->pf, mnl->pf);
+  }
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
@@ -273,9 +285,7 @@ void cloverdetratio_heatbath(const int id, hamiltonian_field_t * const hf) {
       printf("called cloverdetratio_heatbath for id %d energy %f\n", id, mnl->energy0);
     }
   }
-  g_mu3 = 0.;
-  g_mu = g_mu1;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   return;
 }
 
@@ -284,7 +294,10 @@ double cloverdetratio_acc(const int id, hamiltonian_field_t * const hf) {
   int save_sloppy = g_sloppy_precision_flag;
   double atime, etime;
   atime = gettime();
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_mu = mnl->mu;
+  g_kappa = mnl->kappa;
+  g_c_sw = mnl->c_sw;
   boundary(mnl->kappa);
 
   sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
@@ -297,18 +310,21 @@ double cloverdetratio_acc(const int id, hamiltonian_field_t * const hf) {
   chrono_guess(mnl->w_fields[0], mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, 
 	       mnl->csg_N, mnl->csg_n, VOLUME/2, &Qtm_plus_psi);
   g_sloppy_precision_flag = 0;    
-  mnl->iter0 += cg_her(mnl->w_fields[0], mnl->w_fields[1], mnl->maxiter, mnl->accprec,  
-		      g_relative_precision_flag, VOLUME/2, mnl->Qsq);
-  mnl->Qm(mnl->w_fields[0], mnl->w_fields[0]);
 
+  if( mnl->solver == MG ){
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter, 
+				     mnl->accprec, g_relative_precision_flag, VOLUME/2, mnl->Qp, mnl->solver);
+  } else {
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter, 
+				     mnl->accprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
+      mnl->Qm(mnl->w_fields[0], mnl->w_fields[0]);
+  }
   g_sloppy_precision_flag = save_sloppy;
 
   /* Compute the energy contr. from second field */
   mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME/2, 1);
 
-  g_mu = g_mu1;
-  g_mu3 = 0.;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
diff --git a/monomial/cloverdetratio_rwmonomial.c b/monomial/cloverdetratio_rwmonomial.c
new file mode 100644
index 000000000..2d89c1b3f
--- /dev/null
+++ b/monomial/cloverdetratio_rwmonomial.c
@@ -0,0 +1,106 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2012 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include "global.h"
+#include "su3.h"
+#include "start.h"
+#include "gettime.h"
+#include "linalg_eo.h"
+#include "deriv_Sb.h"
+#include "gamma.h"
+#include "operator/tm_operators.h"
+#include "operator/Hopping_Matrix.h"
+#include "solver/chrono_guess.h"
+#include "solver/solver.h"
+#include "solver/monomial_solve.h"
+#include "read_input.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clovertm_operators_32.h"
+#include "operator/clover_leaf.h"
+#include "monomial/monomial.h"
+#include "boundary.h"
+#include "cloverdetratio_rwmonomial.h"
+#include "expo.h"
+#include "xchange/xchange.h"
+#include "init/init_gauge_tmp.h"
+#include "DDalphaAMG_interface.h"
+
+/* Reweighting acceptance step: applies Qp at (mu2, kappa2) to pf, then inverts at
+ * (mu, kappa) -- Qp directly for MG, otherwise Qsq followed by Qm -- and returns
+ * energy1 - energy0. Globals are saved/restored via mnl_backup_restore_globals. */
+double cloverdetratio_rwacc(const int id, hamiltonian_field_t * const hf) {
+  monomial * mnl = &monomial_list[id];
+  int save_sloppy = g_sloppy_precision_flag;
+  double atime, etime;
+  atime = gettime();
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
+  g_mu = mnl->mu2;
+  boundary(mnl->kappa2);
+
+  /* clover term and its even-site inverse for the (kappa2, mu2) parameters */
+  init_sw_fields();
+  sw_term( (const su3**) hf->gaugefield, mnl->kappa2, mnl->c_sw);
+  sw_invert(EE, mnl->mu2);
+  g_mu3 = 0.;
+  mnl->Qp(mnl->w_fields[1], mnl->pf);
+
+  /* switch to the (kappa, mu) parameters and rebuild the clover term */
+  g_mu3 = 0.;
+  g_mu = mnl->mu;
+  boundary(mnl->kappa);
+  sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw);
+  sw_invert(EE, mnl->mu);
+
+  chrono_guess(mnl->w_fields[0], mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
+               mnl->csg_N, mnl->csg_n, VOLUME/2, &Qtm_plus_psi);
+  g_sloppy_precision_flag = 0;
+  if( mnl->solver == MG ) {
+    mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter,
+                                   mnl->accprec, g_relative_precision_flag, VOLUME/2, mnl->Qp, mnl->solver);
+  } else {
+    mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter,
+                                   mnl->accprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
+    mnl->Qm(mnl->w_fields[0], mnl->w_fields[0]);
+  }
+  g_sloppy_precision_flag = save_sloppy;
+
+  /* Compute the energy contr. from second field */
+  mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME/2, 1);
+
+  /* undo the g_mu/g_mu3/boundary changes made above, as the sibling monomials do */
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
+  etime = gettime();
+  if(g_proc_id == 0) {
+    if(g_debug_level > 1) {
+      printf("# Time for %s monomial rwacc step: %e s\n", mnl->name, etime-atime);
+    }
+    if(g_debug_level > 3) {
+      printf("called cloverdetratio_rwacc for id %d dH = %1.10e\n", id, mnl->energy1 - mnl->energy0);
+    }
+  }
+  return(mnl->energy1 - mnl->energy0);
+}
diff --git a/monomial/cloverdetratio_rwmonomial.h b/monomial/cloverdetratio_rwmonomial.h
new file mode 100644
index 000000000..258a2c8bf
--- /dev/null
+++ b/monomial/cloverdetratio_rwmonomial.h
@@ -0,0 +1,28 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2012 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+#ifndef _CLOVERDETRATIO_RWMONOMIAL_H
+#define _CLOVERDETRATIO_RWMONOMIAL_H
+
+#include "hamiltonian_field.h"
+/* reweighting acceptance step for monomial `id`; returns energy1 - energy0 */
+double cloverdetratio_rwacc(const int id, hamiltonian_field_t * const hf);
+
+#endif
diff --git a/monomial/clovernd_trlog_monomial.c b/monomial/clovernd_trlog_monomial.c
index fecc5400d..6a5d06bec 100644
--- a/monomial/clovernd_trlog_monomial.c
+++ b/monomial/clovernd_trlog_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/monomial/cloverndpoly_monomial.c b/monomial/cloverndpoly_monomial.c
index 2fcdeab1d..5e9c90b05 100644
--- a/monomial/cloverndpoly_monomial.c
+++ b/monomial/cloverndpoly_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -118,14 +118,14 @@ void cloverndpoly_derivative(const int id, hamiltonian_field_t * const hf) {
     deriv_Sb(OE, g_chi_dn_spinor_field[j-1], mnl->w_fields[3], hf, mnl->forcefactor);
 
     // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e
-    sw_spinor(EE, mnl->w_fields[3], mnl->w_fields[0], mnl->forcefactor);
+    sw_spinor_eo(EE, mnl->w_fields[3], mnl->w_fields[0], mnl->forcefactor);
     // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o
-    sw_spinor(OO, g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[mnl->MDPolyDegree], mnl->forcefactor);
+    sw_spinor_eo(OO, g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[mnl->MDPolyDegree], mnl->forcefactor);
 
     // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e
-    sw_spinor(EE, mnl->w_fields[2], mnl->w_fields[1], mnl->forcefactor);
+    sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[1], mnl->forcefactor);
     // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o
-    sw_spinor(OO, g_chi_dn_spinor_field[j-1], g_chi_up_spinor_field[mnl->MDPolyDegree], mnl->forcefactor);
+    sw_spinor_eo(OO, g_chi_dn_spinor_field[j-1], g_chi_up_spinor_field[mnl->MDPolyDegree], mnl->forcefactor);
   }
   // trlog part does not depend on the normalisation of the polynomial
   sw_deriv_nd(EE);
diff --git a/monomial/det_monomial.c b/monomial/det_monomial.c
index e12b04e7f..a45f978f7 100644
--- a/monomial/det_monomial.c
+++ b/monomial/det_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -35,6 +35,7 @@
 #include "operator/Hopping_Matrix.h"
 #include "solver/chrono_guess.h"
 #include "solver/solver.h"
+#include "solver/monomial_solve.h"
 #include "read_input.h"
 #include "hamiltonian_field.h"
 #include "boundary.h"
@@ -48,6 +49,11 @@ void det_derivative(const int id, hamiltonian_field_t * const hf) {
   double atime, etime;
   atime = gettime();
   mnl->forcefactor = 1.;
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
+  
+  g_mu = mnl->mu;
+  g_kappa = mnl->kappa;
+  boundary(g_kappa);
 
   if(mnl->even_odd_flag) {
     /*********************************************************************
@@ -58,19 +64,23 @@ void det_derivative(const int id, hamiltonian_field_t * const hf) {
      *
      *********************************************************************/
     
-    g_mu = mnl->mu;
-    boundary(mnl->kappa);
-
-    if(mnl->solver != CG) {
-      fprintf(stderr, "Bicgstab currently not implemented, using CG instead! (det_monomial.c)\n");
-    }
-    
     /* Invert Q_{+} Q_{-} */
     /* X_o -> w_fields[1] */
     chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array,
 		 mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
-    mnl->iter1 += cg_her(mnl->w_fields[1], mnl->pf, mnl->maxiter, mnl->forceprec, 
-			 g_relative_precision_flag, VOLUME/2, mnl->Qsq);
+
+    if(mnl->solver==BICGSTAB) 
+    {      
+	  fprintf(stderr, "Bicgstab currently not implemented, using CG instead! (det_monomial.c)\n");
+	  mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->pf, mnl->solver_params, mnl->maxiter, mnl->forceprec, 
+			 g_relative_precision_flag, VOLUME/2, mnl->Qsq, CG);
+    }
+    else{
+	  mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->pf, mnl->solver_params, mnl->maxiter, mnl->forceprec, 
+			 g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
+    }
+
+
     chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
 			mnl->csg_N, &mnl->csg_n, VOLUME/2);
     
@@ -95,18 +105,17 @@ void det_derivative(const int id, hamiltonian_field_t * const hf) {
      * This term is det(Q^2 + \mu_1^2)
      *
      *********************************************************************/
-    g_mu = mnl->mu;
-    boundary(mnl->kappa);
-    if(mnl->solver == CG) {
+    
+    if((mnl->solver == CG) || (mnl->solver == MIXEDCG) || (mnl->solver == RGMIXEDCG) || (mnl->solver == MG)) {
       /* Invert Q_{+} Q_{-} */
       /* X -> w_fields[1] */
       chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array,
 		   mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_pm_psi);
-      mnl->iter1 += cg_her(mnl->w_fields[1], mnl->pf, 
-			mnl->maxiter, mnl->forceprec, g_relative_precision_flag, 
-			VOLUME, &Q_pm_psi);
+      mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->pf, mnl->solver_params, 
+				     mnl->maxiter, mnl->forceprec, g_relative_precision_flag, 
+				     VOLUME, &Q_pm_psi, mnl->solver);
       chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
-			  mnl->csg_N, &mnl->csg_n, VOLUME/2);
+			  mnl->csg_N, &mnl->csg_n, VOLUME);
 
       /* Y -> w_fields[0]  */
       Q_minus_psi(mnl->w_fields[0], mnl->w_fields[1]);
@@ -117,30 +126,29 @@ void det_derivative(const int id, hamiltonian_field_t * const hf) {
       /* Y -> w_fields[0]  */
       chrono_guess(mnl->w_fields[0], mnl->pf, mnl->csg_field, mnl->csg_index_array,
 		   mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_plus_psi);
-      mnl->iter1 += bicgstab_complex(mnl->w_fields[0], mnl->pf, 
+      mnl->iter1 += solve_degenerate(mnl->w_fields[0], mnl->pf, mnl->solver_params, 
 				     mnl->maxiter, mnl->forceprec, g_relative_precision_flag, 
-				     VOLUME, &Q_plus_psi);
+				     VOLUME, &Q_plus_psi, mnl->solver);
       chrono_add_solution(mnl->w_fields[0], mnl->csg_field, mnl->csg_index_array,
 			  mnl->csg_N, &mnl->csg_n, VOLUME/2);
       
       /* Now Q_- */
       /* X -> w_fields[1] */
-      g_mu = -g_mu;
+      
       chrono_guess(mnl->w_fields[1], mnl->w_fields[0], mnl->csg_field2, 
 		   mnl->csg_index_array2, mnl->csg_N2, mnl->csg_n2, VOLUME/2, &Q_minus_psi);
-      mnl->iter1 += bicgstab_complex(mnl->w_fields[1], mnl->w_fields[0], 
+      mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[0], mnl->solver_params, 
 				     mnl->maxiter, mnl->forceprec, g_relative_precision_flag, 
-				     VOLUME, &Q_minus_psi);
+				     VOLUME, &Q_minus_psi, mnl->solver);
       chrono_add_solution(mnl->w_fields[1], mnl->csg_field2, mnl->csg_index_array2,
 			  mnl->csg_N2, &mnl->csg_n2, VOLUME/2);
-      g_mu = -g_mu;   
+        
     }
     
     /* \delta Q sandwitched by Y^\dagger and X */
     deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor);
   }
-  g_mu = g_mu1;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_debug_level > 1 && g_proc_id == 0) {
     printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime);
@@ -154,8 +162,12 @@ void det_heatbath(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
   double atime, etime;
   atime = gettime();
+  
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_mu = mnl->mu;
-  boundary(mnl->kappa);
+  g_kappa = mnl->kappa;
+  boundary(g_kappa);
+  
   mnl->csg_n = 0;
   mnl->csg_n2 = 0;
   mnl->iter0 = 0;
@@ -166,6 +178,7 @@ void det_heatbath(const int id, hamiltonian_field_t * const hf) {
     mnl->energy0 = square_norm(mnl->w_fields[0], VOLUME/2, 1);
 
     mnl->Qp(mnl->pf, mnl->w_fields[0]);
+    
     chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
 			mnl->csg_N, &mnl->csg_n, VOLUME/2);
     if(mnl->solver != CG) {
@@ -179,14 +192,13 @@ void det_heatbath(const int id, hamiltonian_field_t * const hf) {
 
     Q_plus_psi(mnl->pf, mnl->w_fields[0]);
     chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
-			mnl->csg_N, &mnl->csg_n, VOLUME/2);
+			mnl->csg_N, &mnl->csg_n, VOLUME);
     if(mnl->solver != CG) {
       chrono_add_solution(mnl->pf, mnl->csg_field2, mnl->csg_index_array2, 
-			  mnl->csg_N2, &mnl->csg_n2, VOLUME/2);
+			  mnl->csg_N2, &mnl->csg_n2, VOLUME);
     }
   }
-  g_mu = g_mu1;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
@@ -205,27 +217,39 @@ double det_acc(const int id, hamiltonian_field_t * const hf) {
   int save_sloppy = g_sloppy_precision_flag;
   double atime, etime;
   atime = gettime();
+  
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_mu = mnl->mu;
+  g_kappa = mnl->kappa;
   boundary(mnl->kappa);
+  
   if(mnl->even_odd_flag) {
-
-    chrono_guess(mnl->w_fields[0], mnl->pf, mnl->csg_field, mnl->csg_index_array,
-    	 mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
     g_sloppy_precision_flag = 0;
-    mnl->iter0 = cg_her(mnl->w_fields[0], mnl->pf, mnl->maxiter, mnl->accprec, g_relative_precision_flag,
-    			VOLUME/2, mnl->Qsq);
-    mnl->Qm(mnl->w_fields[1], mnl->w_fields[0]);
+    if( mnl->solver == MG ){
+	chrono_guess(mnl->w_fields[0], mnl->pf, mnl->csg_field, mnl->csg_index_array,
+		     mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qp);
+	mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->pf, mnl->solver_params, mnl->maxiter,
+				      mnl->accprec, g_relative_precision_flag, VOLUME/2, mnl->Qp, mnl->solver);
+	/* Compute the energy contr. from first field */
+	mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME/2, 1); 
+    } else {
+	chrono_guess(mnl->w_fields[0], mnl->pf, mnl->csg_field, mnl->csg_index_array,
+		     mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
+	mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->pf, mnl->solver_params, mnl->maxiter, 
+				       mnl->accprec, g_relative_precision_flag,VOLUME/2, mnl->Qsq, mnl->solver);
+	mnl->Qm(mnl->w_fields[1], mnl->w_fields[0]);
+	/* Compute the energy contr. from first field */
+	mnl->energy1 = square_norm(mnl->w_fields[1], VOLUME/2, 1);
+    }
     g_sloppy_precision_flag = save_sloppy;
-    /* Compute the energy contr. from first field */
-    mnl->energy1 = square_norm(mnl->w_fields[1], VOLUME/2, 1);
   }
   else {
-    if(mnl->solver == CG) {
+    if((mnl->solver == CG) || (mnl->solver == MIXEDCG) || (mnl->solver == RGMIXEDCG)) {
       chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array,
 		   mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_pm_psi);
-      mnl->iter0 = cg_her(mnl->w_fields[1], mnl->pf, 
-			  mnl->maxiter, mnl->accprec, g_relative_precision_flag, 
-			  VOLUME, &Q_pm_psi);
+      mnl->iter0 += solve_degenerate(mnl->w_fields[1], mnl->pf, mnl->solver_params, mnl->maxiter, 
+                                     mnl->accprec, g_relative_precision_flag, 
+			  VOLUME, &Q_pm_psi, mnl->solver);
       Q_minus_psi(mnl->w_fields[0], mnl->w_fields[1]);
       /* Compute the energy contr. from first field */
       mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME, 1);
@@ -233,14 +257,13 @@ double det_acc(const int id, hamiltonian_field_t * const hf) {
     else {
       chrono_guess(mnl->w_fields[0], mnl->pf, mnl->csg_field, mnl->csg_index_array,
 		   mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_plus_psi);
-      mnl->iter0 += bicgstab_complex(mnl->w_fields[0], mnl->pf, 
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->pf, mnl->solver_params, 
 				     mnl->maxiter, mnl->forceprec, g_relative_precision_flag, 
-				     VOLUME,  &Q_plus_psi);
+				     VOLUME,  &Q_plus_psi, mnl->solver);
       mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME, 1);
     }
   }
-  g_mu = g_mu1;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
diff --git a/monomial/detratio_monomial.c b/monomial/detratio_monomial.c
index c3d14bf82..4d1f3f492 100644
--- a/monomial/detratio_monomial.c
+++ b/monomial/detratio_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -36,11 +36,13 @@
 #include "operator/Hopping_Matrix.h"
 #include "solver/chrono_guess.h"
 #include "solver/solver.h"
+#include "solver/monomial_solve.h"
 #include "read_input.h"
 #include "gamma.h"
 #include "monomial/monomial.h"
 #include "boundary.h"
 #include "detratio_monomial.h"
+#include "misc_types.h"
 
 /* think about chronological solver ! */
 
@@ -50,6 +52,11 @@ void detratio_derivative(const int no, hamiltonian_field_t * const hf) {
   
   atime = gettime();
   mnl->forcefactor = 1.;
+  
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
+  g_mu = mnl->mu2;
+  g_kappa = mnl->kappa2;
+  boundary(g_kappa);  
 
   if(mnl->even_odd_flag) {
     /*
@@ -63,22 +70,26 @@ void detratio_derivative(const int no, hamiltonian_field_t * const hf) {
      *********************************************************************/
     /* First term coming from the second field */
     /* Multiply with W_+ */
-    g_mu = mnl->mu2;
-    boundary(mnl->kappa2);
-
-    if(mnl->solver != CG) {
-      fprintf(stderr, "Bicgstab currently not implemented, using CG instead! (detratio_monomial.c)\n");
-    }
-
     Qtm_plus_psi(mnl->w_fields[2], mnl->pf);
+    
     g_mu = mnl->mu;
-    boundary(mnl->kappa);
+    g_kappa = mnl->kappa;
+    boundary(g_kappa);
+    
     /* Invert Q_{+} Q_{-} */
     /* X_W -> w_fields[1] */
     chrono_guess(mnl->w_fields[1], mnl->w_fields[2], mnl->csg_field, 
 		 mnl->csg_index_array, mnl->csg_N, mnl->csg_n, VOLUME/2, &Qtm_pm_psi);
-    mnl->iter1 += cg_her(mnl->w_fields[1], mnl->w_fields[2], mnl->maxiter, 
-			 mnl->forceprec, g_relative_precision_flag, VOLUME/2, &Qtm_pm_psi);
+    
+    if(mnl->solver == BICGSTAB) {
+      fprintf(stderr, "Bicgstab currently not implemented, using CG instead! (detratio_monomial.c)\n"); 
+       mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[2], mnl->solver_params, mnl->maxiter, 
+			                                mnl->forceprec, g_relative_precision_flag, VOLUME/2, &Qtm_pm_psi, CG);
+    }
+    else{
+       mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[2], mnl->solver_params, mnl->maxiter, 
+			                                mnl->forceprec, g_relative_precision_flag, VOLUME/2, &Qtm_pm_psi, mnl->solver); 
+    }
     chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
 			mnl->csg_N, &mnl->csg_n, VOLUME/2);
     /* Y_W -> w_fields[0]  */
@@ -94,13 +105,14 @@ void detratio_derivative(const int no, hamiltonian_field_t * const hf) {
     H_eo_tm_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1);
     /* \delta Q sandwitched by Y_e^\dagger and X_o */
     deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); 
-
-    g_mu = mnl->mu2;
-    boundary(mnl->kappa2);
     
     /* Second term coming from the second field */
     /* The sign is opposite!! */
     mul_r(mnl->w_fields[0], -1., mnl->pf, VOLUME/2);
+    
+    g_mu = mnl->mu2;
+    g_kappa = mnl->kappa2;
+    boundary(g_kappa);
 
     /* apply Hopping Matrix M_{eo} */
     /* to get the even sites of X */
@@ -121,22 +133,23 @@ void detratio_derivative(const int no, hamiltonian_field_t * const hf) {
      *
      *********************************************************************/
     /* First term coming from the second field */
-    /* Multiply with W_+ */
-    g_mu = mnl->mu2;
-    boundary(mnl->kappa2);	
+    /* Multiply with W_+ */	
     Q_plus_psi(mnl->w_fields[2], mnl->pf);
+    
     g_mu = mnl->mu;
-    boundary(mnl->kappa);
-    if(mnl->solver == CG) {
+    g_kappa = mnl->kappa;
+    boundary(g_kappa);
+    
+    if((mnl->solver == CG) || (mnl->solver == MIXEDCG) || (mnl->solver == RGMIXEDCG)  || (mnl->solver == MG)) {
       /* If CG is used anyhow */
       /*       gamma5(mnl->w_fields[1], mnl->w_fields[2], VOLUME/2); */
       /* Invert Q_{+} Q_{-} */
       /* X_W -> w_fields[1] */
       chrono_guess(mnl->w_fields[1], mnl->w_fields[2], mnl->csg_field, 
 		   mnl->csg_index_array, mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_pm_psi);
-      mnl->iter1 += cg_her(mnl->w_fields[1], mnl->w_fields[2], 
+      mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[2], mnl->solver_params, 
 			   mnl->maxiter, mnl->forceprec, g_relative_precision_flag, 
-			   VOLUME, &Q_pm_psi);
+			   VOLUME, &Q_pm_psi, mnl->solver);
       chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
 			  mnl->csg_N, &mnl->csg_n, VOLUME/2);
       
@@ -150,31 +163,32 @@ void detratio_derivative(const int no, hamiltonian_field_t * const hf) {
       chrono_guess(mnl->w_fields[0], mnl->w_fields[2], mnl->csg_field, mnl->csg_index_array,
 		   mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_plus_psi);
       gamma5(mnl->w_fields[0], mnl->w_fields[0], VOLUME);
-      mnl->iter1 += bicgstab_complex(mnl->w_fields[0], mnl->w_fields[2], 
+      mnl->iter1 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[2], mnl->solver_params, 
 				     mnl->maxiter, mnl->forceprec, g_relative_precision_flag, 
-				     VOLUME, Q_plus_psi);
+				     VOLUME, Q_plus_psi, mnl->solver);
       chrono_add_solution(mnl->w_fields[0], mnl->csg_field, mnl->csg_index_array,
 			  mnl->csg_N, &mnl->csg_n, VOLUME/2);
 
       /* Now Q_- */
       /* X_o -> w_fields[1] */
-      g_mu = -g_mu;
+      
       chrono_guess(mnl->w_fields[1], mnl->w_fields[0], mnl->csg_field2, 
 		   mnl->csg_index_array2, mnl->csg_N2, mnl->csg_n2, VOLUME/2, &Q_minus_psi);
       gamma5(mnl->w_fields[1], mnl->w_fields[1], VOLUME);
-      mnl->iter1 += bicgstab_complex(mnl->w_fields[1],mnl->w_fields[0], 
+      mnl->iter1 += solve_degenerate(mnl->w_fields[1],mnl->w_fields[0], mnl->solver_params, 
 				     mnl->maxiter, mnl->forceprec, g_relative_precision_flag, 
-				     VOLUME, Q_minus_psi);
+				     VOLUME, Q_minus_psi, mnl->solver);
       chrono_add_solution(mnl->w_fields[1], mnl->csg_field2, mnl->csg_index_array2,
 			  mnl->csg_N2, &mnl->csg_n2, VOLUME/2);
-      g_mu = -g_mu;   
+        
     }
 
     /* \delta Q sandwitched by Y^\dagger and X */
     deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor); 
     
     g_mu = mnl->mu2;
-    boundary(mnl->kappa2);
+    g_kappa = mnl->kappa2;
+    boundary(g_kappa);
     
     /* Second term coming from the second field */
     /* The sign is opposite!! */
@@ -183,8 +197,7 @@ void detratio_derivative(const int no, hamiltonian_field_t * const hf) {
     /* \delta Q sandwitched by Y^\dagger and X */
     deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor);
   }
-  g_mu = g_mu1;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_debug_level > 1 && g_proc_id == 0) {
     printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime);
@@ -197,8 +210,12 @@ void detratio_heatbath(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
   double atime, etime;
   atime = gettime();
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
+  
   g_mu = mnl->mu;
-  boundary(mnl->kappa);
+  g_kappa = mnl->kappa;
+  boundary(g_kappa);
+  
   mnl->csg_n = 0;
   mnl->csg_n2 = 0;
   mnl->iter0 = 0;
@@ -208,32 +225,59 @@ void detratio_heatbath(const int id, hamiltonian_field_t * const hf) {
     mnl->energy0  = square_norm(mnl->w_fields[0], VOLUME/2, 1);
 
     mnl->Qp(mnl->w_fields[1], mnl->w_fields[0]);
+    
     g_mu = mnl->mu2;
-    boundary(mnl->kappa2);
+    g_kappa = mnl->kappa2;
+    boundary(g_kappa);
+    
     zero_spinor_field(mnl->w_fields[0], VOLUME/2);
-    mnl->iter0 = cg_her(mnl->w_fields[0], mnl->w_fields[1], mnl->maxiter, mnl->accprec, g_relative_precision_flag,
-    			VOLUME/2, mnl->Qsq);
-    mnl->Qm(mnl->pf, mnl->w_fields[0]);
-    chrono_add_solution(mnl->w_fields[0], mnl->csg_field, mnl->csg_index_array,
-			mnl->csg_N, &mnl->csg_n, VOLUME/2);
+    if( mnl->solver == MG ){
+      mnl->iter0 = solve_degenerate(mnl->pf, mnl->w_fields[1], mnl->solver_params, mnl->maxiter, mnl->accprec,
+				    g_relative_precision_flag, VOLUME/2, mnl->Qp, mnl->solver);
+      chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
+			  mnl->csg_N, &mnl->csg_n, VOLUME/2);      
+    } else {
+      mnl->iter0 = solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter,
+				    mnl->accprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
+      mnl->Qm(mnl->pf, mnl->w_fields[0]);
+
+      chrono_add_solution(mnl->w_fields[0], mnl->csg_field, mnl->csg_index_array,
+			  mnl->csg_N, &mnl->csg_n, VOLUME/2);
+    }
   }
   else {
     random_spinor_field_lexic(mnl->w_fields[0], mnl->rngrepro,RN_GAUSS);
     mnl->energy0 = square_norm(mnl->w_fields[0], VOLUME, 1);
 
     Q_plus_psi(mnl->w_fields[1], mnl->w_fields[0]);
+    
     g_mu = mnl->mu2;
-    boundary(mnl->kappa2);
+    g_kappa = mnl->kappa2;
+    boundary(g_kappa);
+    
     zero_spinor_field(mnl->pf,VOLUME);
-    mnl->iter0 += bicgstab_complex(mnl->pf, mnl->w_fields[1], mnl->maxiter, mnl->accprec, 
-				   g_relative_precision_flag, VOLUME, Q_plus_psi);
-    chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
-			mnl->csg_N, &mnl->csg_n, VOLUME/2);
-    if(mnl->solver != CG) {
+    if((mnl->solver == CG) || (mnl->solver == MIXEDCG)){
+      mnl->iter0 = solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params,
+                                    mnl->maxiter, mnl->accprec, 
+				    g_relative_precision_flag, VOLUME, Q_pm_psi, mnl->solver);
+      Q_minus_psi(mnl->pf, mnl->w_fields[0]);
+      chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
+			  mnl->csg_N, &mnl->csg_n, VOLUME/2);      
+    } else if( mnl->solver == MG ){
+      mnl->iter0 = solve_degenerate(mnl->pf, mnl->w_fields[1], mnl->solver_params, mnl->maxiter, mnl->accprec,
+				    g_relative_precision_flag, VOLUME, Q_plus_psi, mnl->solver);
+      chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
+			  mnl->csg_N, &mnl->csg_n, VOLUME/2);      
+    } else {
+      mnl->iter0 = solve_degenerate(mnl->pf, mnl->w_fields[1], mnl->solver_params, mnl->maxiter, mnl->accprec, 
+				    g_relative_precision_flag, VOLUME, Q_plus_psi, mnl->solver);
+      chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
+			  mnl->csg_N, &mnl->csg_n, VOLUME/2);
       chrono_add_solution(mnl->pf, mnl->csg_field2, mnl->csg_index_array2,
-			  mnl->csg_N2, &mnl->csg_n2, VOLUME/2);
+			  mnl->csg_N2, &mnl->csg_n2, VOLUME/2);           
     }
   }
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
@@ -243,8 +287,6 @@ void detratio_heatbath(const int id, hamiltonian_field_t * const hf) {
       printf("called detratio_heatbath for id %d energy %f\n", id, mnl->energy0);
     }
   }
-  g_mu = g_mu1;
-  boundary(g_kappa);
   return;
 }
 
@@ -253,36 +295,68 @@ double detratio_acc(const int id, hamiltonian_field_t * const hf) {
   int save_sloppy = g_sloppy_precision_flag;
   double etime, atime;
   atime = gettime();
+  
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
+  
   g_mu = mnl->mu2;
-  boundary(mnl->kappa2);
+  g_kappa = mnl->kappa2;
+  boundary(g_kappa);
+  
   if(even_odd_flag) {
     mnl->Qp(mnl->w_fields[1], mnl->pf);
+    
     g_mu = mnl->mu;
-    boundary(mnl->kappa);
-    chrono_guess(mnl->w_fields[0], mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, 
-		 mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
+    g_kappa = mnl->kappa;
+    boundary(g_kappa);
+    
     g_sloppy_precision_flag = 0;
-    mnl->iter0 += cg_her(mnl->w_fields[0], mnl->w_fields[1], mnl->maxiter, mnl->accprec, g_relative_precision_flag,
-			 VOLUME/2, mnl->Qsq);
-    mnl->Qm(mnl->w_fields[1], mnl->w_fields[0]);
+    if( mnl->solver == MG ){
+      chrono_guess(mnl->w_fields[0], mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, 
+		   mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qp);
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter,
+				    mnl->accprec, g_relative_precision_flag, VOLUME/2, mnl->Qp, mnl->solver);
+      /* Compute the energy contr. from second field */
+      mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME/2, 1); 
+    } else {
+      chrono_guess(mnl->w_fields[0], mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, 
+		   mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter,
+				     mnl->accprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
+      mnl->Qm(mnl->w_fields[1], mnl->w_fields[0]);
+      /* Compute the energy contr. from second field */
+      mnl->energy1 = square_norm(mnl->w_fields[1], VOLUME/2, 1);
+    }
     g_sloppy_precision_flag = save_sloppy;
-    /* Compute the energy contr. from second field */
-    mnl->energy1 = square_norm(mnl->w_fields[1], VOLUME/2, 1);
   }
   else {
     Q_plus_psi(mnl->w_fields[1], mnl->pf);
+    
     g_mu = mnl->mu;
-    boundary(mnl->kappa);
+    g_kappa = mnl->kappa;
+    boundary(g_kappa);
+    
     chrono_guess(mnl->w_fields[0], mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, 
 		 mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_plus_psi);
-    mnl->iter0 += bicgstab_complex(mnl->w_fields[0], mnl->w_fields[1], 
-				   mnl->maxiter, mnl->accprec, g_relative_precision_flag, 
-				   VOLUME, Q_plus_psi); 
-    /* Compute the energy contr. from second field */
-    mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME, 1);
+    g_sloppy_precision_flag = 0;
+    if((mnl->solver == CG) || (mnl->solver == MIXEDCG)){
+      
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter,
+				     mnl->accprec, g_relative_precision_flag, VOLUME, &Q_pm_psi, mnl->solver); 
+      Q_minus_psi(mnl->w_fields[1], mnl->w_fields[0]);
+      /* Compute the energy contr. from second field */
+      mnl->energy1 = square_norm(mnl->w_fields[1], VOLUME, 1);      
+    }
+    else{
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, 
+				     mnl->maxiter, mnl->accprec, g_relative_precision_flag, 
+				     VOLUME, Q_plus_psi, mnl->solver); 
+    
+      /* Compute the energy contr. from second field */
+      mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME, 1); 
+    }
+    g_sloppy_precision_flag = save_sloppy;
   }
-  g_mu = g_mu1;
-  boundary(g_kappa);
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
diff --git a/monomial/gauge_monomial.c b/monomial/gauge_monomial.c
index a7d5ae689..ea1ce8353 100644
--- a/monomial/gauge_monomial.c
+++ b/monomial/gauge_monomial.c
@@ -19,12 +19,12 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef OMP 
+#ifdef TM_USE_OMP 
 # include <omp.h>
 #endif
 #include "global.h"
@@ -55,7 +55,7 @@ void gauge_derivative(const int id, hamiltonian_field_t * const hf) {
   
   double atime, etime;
   atime = gettime();
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -65,7 +65,7 @@ void gauge_derivative(const int id, hamiltonian_field_t * const hf) {
   su3 *z;
   su3adj *xm;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(i = 0; i < VOLUME; i++) { 
@@ -84,7 +84,7 @@ void gauge_derivative(const int id, hamiltonian_field_t * const hf) {
     }
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   etime = gettime();
@@ -105,7 +105,7 @@ void gauge_EMderivative(const int id, hamiltonian_field_t * const hf) {
   
   double atime, etime;
   atime = gettime();
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -115,7 +115,7 @@ void gauge_EMderivative(const int id, hamiltonian_field_t * const hf) {
   su3 *z;
   su3adj *xm;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(i = 0; i < VOLUME; i++) { 
@@ -151,7 +151,7 @@ void gauge_EMderivative(const int id, hamiltonian_field_t * const hf) {
     }
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   etime = gettime();
diff --git a/monomial/moment_energy.c b/monomial/moment_energy.c
index 3366b49de..0e5ef80b6 100644
--- a/monomial/moment_energy.c
+++ b/monomial/moment_energy.c
@@ -23,7 +23,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -72,7 +72,7 @@ double moment_energy(su3adj ** const momenta) {
   /* from the loop I got: p^2 */
   /* the contribution to the E is however (p^2)/2: */
   kc=0.5*(ks+kc);
-#ifdef MPI
+#ifdef TM_USE_MPI
   ks = kc;
   MPI_Allreduce(&ks, &kc, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #endif
diff --git a/monomial/monitor_forces.c b/monomial/monitor_forces.c
index 2395b7e34..99fd25f59 100644
--- a/monomial/monitor_forces.c
+++ b/monomial/monitor_forces.c
@@ -20,16 +20,16 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -46,7 +46,7 @@ void monitor_forces(hamiltonian_field_t * const hf) {
 
   for(int id = 0; id < no_monomials; id++) {
     if(monomial_list[ id ].derivativefunction != NULL) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
       for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
@@ -57,12 +57,12 @@ void monitor_forces(hamiltonian_field_t * const hf) {
       
       monomial_list[ id ].derivativefunction(id, hf);
       
-#ifdef MPI
+#ifdef TM_USE_MPI
       xchange_deri(hf->derivative);
 #endif
       
       double sum = 0., max = 0., sum2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel private(sum2)
       {
 	int thread_num = omp_get_thread_num();
@@ -73,14 +73,14 @@ void monitor_forces(hamiltonian_field_t * const hf) {
 	  for(int mu = 0; mu < 4; mu++) {
 	    sum2 = _su3adj_square_norm(hf->derivative[i][mu]); 
 	    sum += sum2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 	    if(sum2 > g_omp_acc_re[thread_num]) g_omp_acc_re[thread_num] = sum2;
 #else
 	    if(sum2 > max) max = sum2;
 #endif
 	  }
 	}
-#ifdef OMP
+#ifdef TM_USE_OMP
       } /* OMP closing brace */
       max = g_omp_acc_re[0];
       for( int i = 1; i < omp_num_threads; i++) {
@@ -89,14 +89,14 @@ void monitor_forces(hamiltonian_field_t * const hf) {
 #endif
       
       // output for force monitoring
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Reduce(&sum, &sum2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
       sum = sum2;
       MPI_Reduce(&max, &sum2, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
       max = sum2;
 #endif
       if(g_proc_id == 0) {
-	printf("# squared force for monomial %s on timescale %d: aver: %1.2e max: %1.2e\n", 
+	printf("# squared force for monomial %s on timescale %d: aver: %1.8e max: %1.8e\n", 
 	       monomial_list[ id ].name,
 	       monomial_list[ id ].timescale,
 	       sum/((double)(VOLUME*g_nproc))/4., max);
diff --git a/monomial/monomial.c b/monomial/monomial.c
index d9493852d..1a37e8116 100644
--- a/monomial/monomial.c
+++ b/monomial/monomial.c
@@ -20,7 +20,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -28,11 +28,14 @@
 #include <errno.h>
 #include <string.h>
 #include "global.h"
+#include "boundary.h"
 #include "su3.h"
 #include "su3adj.h"
 #include "su3spinor.h"
 #include "operator/tm_operators.h"
+#include "operator/tm_operators_32.h"
 #include "operator/clovertm_operators.h"
+#include "operator/clovertm_operators_32.h"
 #include "operator/clover_leaf.h"
 #include "ranlxd.h"
 #include "sse.h"
@@ -41,8 +44,6 @@
 #include "read_input.h"
 #include "monomial/monomial.h"
 
-
-
 monomial monomial_list[max_no_monomials];
 int no_monomials = 0;
 int no_gauge_monomials = 0;
@@ -92,7 +93,25 @@ int add_monomial(const int type) {
   monomial_list[no_monomials].accprec = _default_g_eps_sq_acc;
   monomial_list[no_monomials].forceprec = _default_g_eps_sq_force;
   monomial_list[no_monomials].maxiter = _default_max_solver_iterations;
-  monomial_list[no_monomials].solver = _default_solver_flag;
+  if((monomial_list[no_monomials].type == NDRAT) ||
+     (monomial_list[no_monomials].type == NDRATCOR) ||
+     (monomial_list[no_monomials].type == NDCLOVERRAT) ||
+     (monomial_list[no_monomials].type == NDCLOVERRATCOR)
+  ) {
+    monomial_list[no_monomials].solver = _default_nd_solver_flag;    
+  }
+  else{
+    monomial_list[no_monomials].solver = _default_solver_flag;
+  }
+  monomial_list[no_monomials].solver_params.mcg_delta = _default_mixcg_innereps;
+  monomial_list[no_monomials].solver_params.solution_type = TM_SOLUTION_M_MDAG;
+  // the default is 1 because the QPhiX interface is generalised in such a way
+  // that normal solves correspond to solves with one shift, this does not 
+  // affect the used parameters in any way!
+  monomial_list[no_monomials].solver_params.no_shifts = 1;
+  monomial_list[no_monomials].solver_params.compression_type = _default_compression_type;
+  monomial_list[no_monomials].solver_params.external_inverter = _default_external_inverter;
+  monomial_list[no_monomials].solver_params.sloppy_precision = _default_operator_sloppy_precision_flag;
   monomial_list[no_monomials].even_odd_flag = _default_even_odd_flag;
   monomial_list[no_monomials].forcefactor = 1.;
   monomial_list[no_monomials].use_rectangles = 0;
@@ -125,7 +144,7 @@ int add_monomial(const int type) {
   monomial_list[no_monomials].rat.crange[1] = 11;
 
   monomial_list[no_monomials].initialised = 1;
-  if(monomial_list[no_monomials].type == NDDETRATIO) {
+  if(monomial_list[no_monomials].type == NDDETRATIO || monomial_list[no_monomials].type == NDCLOVERDETRATIO || monomial_list[no_monomials].type == CLOVERDETRATIORW) {
     monomial_list[no_monomials].timescale = -5;
   }
 
@@ -140,10 +159,13 @@ int init_monomials(const int V, const int even_odd_flag) {
   spinor * __pf = NULL;
   double sw_mu=0., sw_k=0., sw_c=0.;
   double swn_mubar=0., swn_epsbar = 0., swn_k=0., swn_c=0.;
+
+  if (g_exposu3_no_c == 0) init_exposu3();
+  
   for(int i = 0; i < no_monomials; i++) {
     if((monomial_list[i].type != GAUGE) && (monomial_list[i].type != SFGAUGE)) no++;
     /* non-degenerate monomials need two pseudo fermion fields */
-    if((monomial_list[i].type == NDPOLY) || (monomial_list[i].type == NDDETRATIO) || 
+    if((monomial_list[i].type == NDPOLY) || (monomial_list[i].type == NDDETRATIO) || (monomial_list[i].type == NDCLOVERDETRATIO) || 
        (monomial_list[i].type == NDCLOVER) || (monomial_list[i].type == NDRAT)||
        (monomial_list[i].type == NDRATCOR) || (monomial_list[i].type == NDCLOVERRATCOR) ||
        (monomial_list[i].type == NDCLOVERRAT)) no++;
@@ -179,9 +201,16 @@ int init_monomials(const int V, const int even_odd_flag) {
 	monomial_list[i].hbfunction = &det_heatbath;
 	monomial_list[i].accfunction = &det_acc;
 	monomial_list[i].derivativefunction = &det_derivative;
-	monomial_list[i].Qsq = &Qtm_pm_psi;
-	monomial_list[i].Qp = &Qtm_plus_psi;
-	monomial_list[i].Qm = &Qtm_minus_psi;
+	if(even_odd_flag) {
+	  monomial_list[i].Qsq = &Qtm_pm_psi;
+	  monomial_list[i].Qp = &Qtm_plus_psi;
+	  monomial_list[i].Qm = &Qtm_minus_psi;
+	}
+	else {
+	  monomial_list[i].Qsq = &Q_pm_psi;
+	  monomial_list[i].Qp = &Q_plus_psi;
+	  monomial_list[i].Qm = &Q_minus_psi;
+	}
 	if(g_proc_id == 0 && g_debug_level > 1) {
 	  printf("# Initialised monomial of type DET, no_monomials= %d\n", no_monomials);
 	}
@@ -190,10 +219,17 @@ int init_monomials(const int V, const int even_odd_flag) {
 	monomial_list[i].hbfunction = &cloverdet_heatbath;
 	monomial_list[i].accfunction = &cloverdet_acc;
 	monomial_list[i].derivativefunction = &cloverdet_derivative;
-	monomial_list[i].even_odd_flag = 1;
-	monomial_list[i].Qsq = &Qsw_pm_psi;
-	monomial_list[i].Qp = &Qsw_plus_psi;
-	monomial_list[i].Qm = &Qsw_minus_psi;
+	//monomial_list[i].derivativefunction = &det_derivative;
+	if(even_odd_flag) {
+	  monomial_list[i].Qsq = &Qsw_pm_psi;
+	  monomial_list[i].Qp = &Qsw_plus_psi;
+	  monomial_list[i].Qm = &Qsw_minus_psi;
+	}
+	else {
+	  monomial_list[i].Qsq = &Qsw_full_pm_psi;
+	  monomial_list[i].Qp = &Qsw_full_plus_psi;
+	  monomial_list[i].Qm = &Qsw_full_minus_psi;
+	}
 	init_swpm(VOLUME);
 	clover_monomials[no_clover_monomials] = i;
 	no_clover_monomials++;
@@ -214,11 +250,23 @@ int init_monomials(const int V, const int even_odd_flag) {
 	  printf("# Initialised monomial of type CLOVERDETRATIO, no_monomials= %d\n", no_monomials);
 	}
       }
+      else if(monomial_list[i].type == CLOVERDETRATIORW) {
+	monomial_list[i].accfunction = &cloverdetratio_rwacc;
+	monomial_list[i].even_odd_flag = 1;
+	monomial_list[i].Qsq = &Qsw_pm_psi;
+	monomial_list[i].Qp = &Qsw_plus_psi;
+	monomial_list[i].Qm = &Qsw_minus_psi;
+	init_swpm(VOLUME);
+	if(g_proc_id == 0 && g_debug_level > 1) {
+	  printf("# Initialised monomial of type CLOVERDETRATIORW, no_monomials= %d, currently only available for reweighting!\n", no_monomials);
+	}
+      }
       else if(monomial_list[i].type == DETRATIO) {
 	monomial_list[i].hbfunction = &detratio_heatbath;
 	monomial_list[i].accfunction = &detratio_acc;
 	monomial_list[i].derivativefunction = &detratio_derivative;
 	monomial_list[i].Qsq = &Qtm_pm_psi;
+	monomial_list[i].Qsq32 = &Qtm_pm_psi_32;	
 	monomial_list[i].Qp = &Qtm_plus_psi;
 	monomial_list[i].Qm = &Qtm_minus_psi;
 	if(g_proc_id == 0 && g_debug_level > 1) {
@@ -412,7 +460,18 @@ int init_monomials(const int V, const int even_odd_flag) {
 	monomial_list[i].timescale = -5;
 	no++;
 	if(g_proc_id == 0 && g_debug_level > 1) {
-	  printf("# Initialised monomial of type NDDETRATIO, no_monomials= %d\n", no_monomials);
+	  printf("# Initialised monomial of type NDDETRATIO, no_monomials= %d, currently only available for reweighting!\n", no_monomials);
+	}
+      }
+      else if(monomial_list[i].type == NDCLOVERDETRATIO) {
+	monomial_list[i].hbfunction = &dummy_heatbath;
+	monomial_list[i].accfunction = &nddetratio_acc;
+	monomial_list[i].derivativefunction = NULL;
+	monomial_list[i].pf2 = __pf+no*V;
+	monomial_list[i].timescale = -5;
+	no++;
+	if(g_proc_id == 0 && g_debug_level > 1) {
+	  printf("# Initialised monomial of type NDCLOVERDETRATIO, no_monomials= %d, currently only available for reweighting!\n", no_monomials);
 	}
       }
     }
@@ -557,7 +616,7 @@ int init_poly_monomial(const int V, const int id){
     } 
     else {
       fprintf(stderr,"Reading local normalization from file FAILED\n Borting Ab\n");
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Finalize();
 #endif
       exit(6);
@@ -576,7 +635,7 @@ int init_poly_monomial(const int V, const int id){
   if((rootsFile=fopen(mnl->MDPolyRootsFile,"r")) != (FILE*)NULL) {
     if (fgets(title, 100, rootsFile) == NULL) {
       fprintf(stderr, "Cant read Roots file: %s Aborting...\n", mnl->MDPolyRootsFile);
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Finalize();
 #endif
       exit(6);
@@ -590,7 +649,7 @@ int init_poly_monomial(const int V, const int id){
   }
   else {
     fprintf(stderr, "Roots File %s is missing! Aborting...\n", mnl->MDPolyRootsFile );
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Finalize();
 #endif
     exit(6);
@@ -630,3 +689,34 @@ double dummy_acc(const int id, hamiltonian_field_t * const hf) {
   }
   return(0.);
 }
+
+void mnl_backup_restore_globals(const backup_restore_t mode){ // TM_BACKUP_GLOBALS: snapshot the globals; any other mode: write the snapshot back
+  static double saved_kappa;   // static storage: the snapshot survives between the backup call and the restore call
+  static double saved_c_sw;
+  static double saved_mu;
+  static double saved_mu1;
+  static double saved_mu2;
+  static double saved_mu3;
+  static double saved_mubar;
+  static double saved_epsbar;
+  if( mode != TM_BACKUP_GLOBALS ){
+    g_kappa  = saved_kappa;    // restore path: copy every saved value back into its global
+    g_c_sw   = saved_c_sw;
+    g_mu     = saved_mu;
+    g_mu1    = saved_mu1;
+    g_mu2    = saved_mu2;
+    g_mu3    = saved_mu3;
+    g_mubar  = saved_mubar;
+    g_epsbar = saved_epsbar;
+    boundary(g_kappa);         // called last, once g_kappa holds the restored value
+  } else {
+    saved_kappa  = g_kappa;    // backup path: record the current value of every global
+    saved_c_sw   = g_c_sw;
+    saved_mu     = g_mu;
+    saved_mu1    = g_mu1;
+    saved_mu2    = g_mu2;
+    saved_mu3    = g_mu3;
+    saved_mubar  = g_mubar;
+    saved_epsbar = g_epsbar;
+  }
+}
diff --git a/monomial/monomial.h b/monomial/monomial.h
index 54995e1a1..00c25a1c8 100644
--- a/monomial/monomial.h
+++ b/monomial/monomial.h
@@ -25,6 +25,8 @@
 #include "su3spinor.h"
 #include "hamiltonian_field.h"
 #include "rational/rational.h"
+#include "solver/solver_params.h"
+#include "misc_types.h"
 
 #define DET 0
 #define DETRATIO 1
@@ -47,6 +49,8 @@
 #define RATCOR 18
 #define CLOVERRAT 19
 #define CLOVERRATCOR 20
+#define CLOVERDETRATIORW 21
+#define NDCLOVERDETRATIO 22
 
 #define max_no_monomials 30
 
@@ -60,6 +64,7 @@ typedef struct {
   int even_odd_flag;
   int rngrepro;
   int solver;
+  
   int iter0, iter1, iter2;
   int csg_N, csg_N2;
   int csg_n, csg_n2;
@@ -82,6 +87,7 @@ typedef struct {
   double epsilon;
   double forceprec;
   double accprec;
+  solver_params_t solver_params;
   /* force normalisation */
   double forcefactor;
   /* some book-keeping */
@@ -116,6 +122,7 @@ typedef struct {
   void (*derivativefunction) (const int no, hamiltonian_field_t * const hf);
   /* the operator definitions */
   void (*Qsq) (spinor * const, spinor * const);
+  void (*Qsq32) (spinor32 * const, spinor32 * const);  
   void (*Qp) (spinor * const, spinor * const);
   void (*Qm) (spinor * const, spinor * const);
 } monomial;
@@ -131,6 +138,7 @@ typedef struct {
 #include "monomial/clovernd_trlog_monomial.h"
 #include "monomial/cloverdet_monomial.h"
 #include "monomial/cloverdetratio_monomial.h"
+#include "monomial/cloverdetratio_rwmonomial.h"
 #include "monomial/cloverndpoly_monomial.h"
 #include "monomial/ndrat_monomial.h"
 #include "monomial/rat_monomial.h"
@@ -164,4 +172,7 @@ void dummy_derivative(const int id, hamiltonian_field_t * const hf);
 void dummy_heatbath(const int id, hamiltonian_field_t * const hf);
 double dummy_acc(const int id, hamiltonian_field_t * const hf);
 
+void mnl_set_globals(const int id);
+void mnl_backup_restore_globals(const backup_restore_t mode);
+
 #endif
diff --git a/monomial/nddetratio_monomial.c b/monomial/nddetratio_monomial.c
index 81f96cfc1..1008fd3e7 100644
--- a/monomial/nddetratio_monomial.c
+++ b/monomial/nddetratio_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -38,6 +38,8 @@
 #include "operator/Hopping_Matrix.h"
 #include "phmc.h"
 #include "boundary.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
 #include "gamma.h"
 #include "operator/tm_operators_nd.h"
 #include "chebyshev_polynomial_nd.h"
@@ -47,6 +49,7 @@
 #include "monomial/monomial.h"
 #include "hamiltonian_field.h"
 #include "nddetratio_monomial.h"
+#include "DDalphaAMG_interface.h"
 
 
 
@@ -54,24 +57,43 @@ double nddetratio_acc(const int id, hamiltonian_field_t * const hf) {
   int iter;
   monomial * mnl = &monomial_list[id];
   double atime, etime;
+  matrix_mult_nd Q_pm_ndpsi = Qtm_pm_ndpsi, Q_dagger_ndpsi = Qtm_dagger_ndpsi, Q_ndpsi = Qtm_ndpsi;
   atime = gettime();
   
   g_mubar = mnl->mubar;
   g_epsbar = mnl->epsbar;
   boundary(mnl->kappa);
 
-  iter = cg_her_nd(mnl->w_fields[0], mnl->w_fields[1], mnl->pf, mnl->pf2,
-		   mnl->maxiter, mnl->accprec, g_relative_precision_flag, 
-		   VOLUME/2, &Qtm_pm_ndpsi);
-  Qtm_dagger_ndpsi(mnl->w_fields[2], mnl->w_fields[3],
-			mnl->w_fields[0], mnl->w_fields[1]);
+  if(mnl->type == NDCLOVERDETRATIO) {
+    Q_pm_ndpsi = Qsw_pm_ndpsi;
+    Q_dagger_ndpsi = Qsw_dagger_ndpsi;
+    Q_ndpsi = Qsw_ndpsi;
+    init_sw_fields();
+    sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
+    sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+  }
+  if( mnl->solver == MG ) {
+    iter = MG_solver_nd(mnl->w_fields[2], mnl->w_fields[3], mnl->pf, mnl->pf2,
+                        mnl->accprec, mnl->maxiter, g_relative_precision_flag, 
+                        VOLUME/2, g_gauge_field, Q_ndpsi);
+  } else {
+    iter = cg_her_nd(mnl->w_fields[0], mnl->w_fields[1], mnl->pf, mnl->pf2,
+                     mnl->maxiter, mnl->accprec, g_relative_precision_flag, 
+                     VOLUME/2, Q_pm_ndpsi);
+    Q_dagger_ndpsi(mnl->w_fields[2], mnl->w_fields[3],
+                   mnl->w_fields[0], mnl->w_fields[1]);
+  }
 
   g_mubar = mnl->mubar2;
   g_epsbar = mnl->epsbar2;
   boundary(mnl->kappa2);
 
-  Qtm_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
-		  mnl->w_fields[2], mnl->w_fields[3]);
+  if(mnl->type == NDCLOVERDETRATIO) {
+    sw_term((const su3**) hf->gaugefield, mnl->kappa2, mnl->c_sw); 
+    sw_invert_nd(mnl->mubar2*mnl->mubar2 - mnl->epsbar2*mnl->epsbar2);
+  }
+  Q_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
+            mnl->w_fields[2], mnl->w_fields[3]);
   
   mnl->energy1  = scalar_prod_r(mnl->pf , mnl->w_fields[0], VOLUME/2, 1);
   mnl->energy1 += scalar_prod_r(mnl->pf2, mnl->w_fields[1], VOLUME/2, 1);
diff --git a/monomial/ndpoly_monomial.c b/monomial/ndpoly_monomial.c
index fcdff0bda..47356e260 100644
--- a/monomial/ndpoly_monomial.c
+++ b/monomial/ndpoly_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -502,7 +502,7 @@ int init_ndpoly_monomial(const int id) {
   */
   if(mnl->MDPolyLocNormConst < 0.0){
     fprintf(stderr, "Error, please specify LocNormConst in the input file! Aborting...\n");
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Finalize();
 #endif
     exit(6);
@@ -513,7 +513,7 @@ int init_ndpoly_monomial(const int id) {
   if((ifs = fopen(mnl->MDPolyRootsFile, "r")) != (FILE*)NULL) {
     if (fgets(title, 100, ifs) == NULL) {
       fprintf(stderr, "Error in reading %s! Aborting...\n", mnl->MDPolyRootsFile);
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Finalize();
 #endif
       exit(6);
@@ -528,7 +528,7 @@ int init_ndpoly_monomial(const int id) {
   }
   else {
     fprintf(stderr, "File %s is missing! Aborting...\n", mnl->MDPolyRootsFile);
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Finalize();
 #endif
     exit(6);
diff --git a/monomial/ndrat_monomial.c b/monomial/ndrat_monomial.c
index fbe0ac1b5..7439ea2d0 100644
--- a/monomial/ndrat_monomial.c
+++ b/monomial/ndrat_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -31,10 +31,13 @@
 #include "start.h"
 #include "gettime.h"
 #include "solver/solver.h"
+#include "solver/monomial_solve.h"
 #include "deriv_Sb.h"
 #include "init/init_chi_spinor_field.h"
 #include "operator/tm_operators.h"
+#include "operator/tm_operators_32.h"
 #include "operator/tm_operators_nd.h"
+#include "operator/tm_operators_nd_32.h"
 #include "operator/Hopping_Matrix.h"
 #include "monomial/monomial.h"
 #include "hamiltonian_field.h"
@@ -44,6 +47,7 @@
 #include "rational/rational.h"
 #include "phmc.h"
 #include "ndrat_monomial.h"
+#include "default_input_values.h"
 
 void nd_set_global_parameter(monomial * const mnl) {
 
@@ -54,7 +58,7 @@ void nd_set_global_parameter(monomial * const mnl) {
   boundary(g_kappa);
   phmc_cheb_evmin = mnl->EVMin;
   phmc_invmaxev = mnl->EVMaxInv;
-  phmc_cheb_evmax = 1.;
+  phmc_cheb_evmax = mnl->EVMax;
   phmc_Cpol = 1.;
   // used for preconditioning in cloverdetrat
   g_mu3 = 0.;
@@ -71,7 +75,6 @@ void nd_set_global_parameter(monomial * const mnl) {
 
 void ndrat_derivative(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
-  solver_pm_t solver_pm;
   double atime, etime;
   atime = gettime();
   nd_set_global_parameter(mnl);
@@ -87,23 +90,28 @@ void ndrat_derivative(const int id, hamiltonian_field_t * const hf) {
     sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
     // we invert it for the even sites only
     sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+    copy_32_sw_fields();
   }
   mnl->forcefactor = mnl->EVMaxInv;
 
-  solver_pm.max_iter = mnl->maxiter;
-  solver_pm.squared_solver_prec = mnl->forceprec;
-  solver_pm.no_shifts = mnl->rat.np;
-  solver_pm.shifts = mnl->rat.mu;
-  solver_pm.rel_prec = g_relative_precision_flag;
-  solver_pm.type = CGMMSND;
-  solver_pm.M_ndpsi = &Qtm_pm_ndpsi;
-  if(mnl->type == NDCLOVERRAT) solver_pm.M_ndpsi = &Qsw_pm_ndpsi;
-  solver_pm.sdim = VOLUME/2;
+  mnl->solver_params.max_iter = mnl->maxiter;
+  mnl->solver_params.squared_solver_prec = mnl->forceprec;
+  mnl->solver_params.no_shifts = mnl->rat.np;
+  mnl->solver_params.shifts = mnl->rat.mu;
+  mnl->solver_params.rel_prec = g_relative_precision_flag;
+  mnl->solver_params.type = mnl->solver; 
+  mnl->solver_params.M_ndpsi = &Qtm_pm_ndpsi;
+  mnl->solver_params.M_ndpsi32 = &Qtm_pm_ndpsi_32;    
+  if(mnl->type == NDCLOVERRAT) {
+    mnl->solver_params.M_ndpsi = &Qsw_pm_ndpsi;
+    mnl->solver_params.M_ndpsi32 = &Qsw_pm_ndpsi_32;
+  }
+  mnl->solver_params.sdim = VOLUME/2;
+
   // this generates all X_j,o (odd sites only) -> g_chi_up|dn_spinor_field
-  mnl->iter1 += cg_mms_tm_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
-			     mnl->pf, mnl->pf2,
-			     &solver_pm);
-  
+  mnl->iter1 += solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
+                             mnl->pf, mnl->pf2, &(mnl->solver_params) );
+
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     if(mnl->type == NDCLOVERRAT) {
       // multiply with Q_h * tau^1 + i mu_j to get Y_j,o (odd sites)
@@ -117,8 +125,7 @@ void ndrat_derivative(const int id, hamiltonian_field_t * const hf) {
       H_eo_sw_ndpsi(mnl->w_fields[2], mnl->w_fields[3], 
 		    g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j]);
 
-    }
-    else {
+    } else {
       // multiply with Q_h * tau^1 + i mu_j to get Y_j,o (odd sites)
       // needs phmc_Cpol = 1 to work for ndrat!
       Q_tau1_sub_const_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
@@ -155,18 +162,18 @@ void ndrat_derivative(const int id, hamiltonian_field_t * const hf) {
 
     if(mnl->type == NDCLOVERRAT) {
       // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e
-      sw_spinor(EE, mnl->w_fields[5], mnl->w_fields[2], 
-		mnl->rat.rmu[j]*mnl->forcefactor);
+      sw_spinor_eo(EE, mnl->w_fields[5], mnl->w_fields[2], 
+		   mnl->rat.rmu[j]*mnl->forcefactor);
       // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o
-      sw_spinor(OO, g_chi_up_spinor_field[j], mnl->w_fields[1],
-		mnl->rat.rmu[j]*mnl->forcefactor);
+      sw_spinor_eo(OO, g_chi_up_spinor_field[j], mnl->w_fields[1],
+		   mnl->rat.rmu[j]*mnl->forcefactor);
       
       // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e
-      sw_spinor(EE, mnl->w_fields[4], mnl->w_fields[3], 
-		mnl->rat.rmu[j]*mnl->forcefactor);
+      sw_spinor_eo(EE, mnl->w_fields[4], mnl->w_fields[3], 
+		   mnl->rat.rmu[j]*mnl->forcefactor);
       // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o
-      sw_spinor(OO, g_chi_dn_spinor_field[j], mnl->w_fields[0],
-		mnl->rat.rmu[j]*mnl->forcefactor);
+      sw_spinor_eo(OO, g_chi_dn_spinor_field[j], mnl->w_fields[0],
+		   mnl->rat.rmu[j]*mnl->forcefactor);
     }
   }
   // trlog part does not depend on the normalisation
@@ -186,7 +193,6 @@ void ndrat_derivative(const int id, hamiltonian_field_t * const hf) {
 
 void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
-  solver_pm_t solver_pm;
   double atime, etime;
   atime = gettime();
   nd_set_global_parameter(mnl);
@@ -195,6 +201,7 @@ void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
     init_sw_fields();
     sw_term((const su3**)hf->gaugefield, mnl->kappa, mnl->c_sw); 
     sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+    copy_32_sw_fields();
   }
   // we measure before the trajectory!
   if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) {
@@ -210,37 +217,29 @@ void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
   random_spinor_field_eo(mnl->pf2, mnl->rngrepro, RN_GAUSS);
   mnl->energy0 += square_norm(mnl->pf2, VOLUME/2, 1);
   // set solver parameters
-  solver_pm.max_iter = mnl->maxiter;
-  solver_pm.squared_solver_prec = mnl->accprec;
-  solver_pm.no_shifts = mnl->rat.np;
-  solver_pm.shifts = mnl->rat.nu;
-  solver_pm.type = CGMMSND;
-  solver_pm.M_ndpsi = &Qtm_pm_ndpsi;
-  if(mnl->type == NDCLOVERRAT) solver_pm.M_ndpsi = &Qsw_pm_ndpsi;
-  solver_pm.sdim = VOLUME/2;
-  solver_pm.rel_prec = g_relative_precision_flag;
-  mnl->iter0 = cg_mms_tm_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
-			     mnl->pf, mnl->pf2, &solver_pm);
+  mnl->solver_params.max_iter = mnl->maxiter;
+  mnl->solver_params.squared_solver_prec = mnl->accprec;
+  mnl->solver_params.no_shifts = mnl->rat.np;
+  mnl->solver_params.shifts = mnl->rat.nu;
+  mnl->solver_params.type = mnl->solver;
+  mnl->solver_params.M_ndpsi = &Qtm_pm_ndpsi;
+  mnl->solver_params.M_ndpsi32 = &Qtm_pm_ndpsi_32;    
+  if(mnl->type == NDCLOVERRAT) {
+    mnl->solver_params.M_ndpsi = &Qsw_pm_ndpsi;
+    mnl->solver_params.M_ndpsi32 = &Qsw_pm_ndpsi_32;
+  }
+  mnl->solver_params.sdim = VOLUME/2;
+  mnl->solver_params.rel_prec = g_relative_precision_flag;
+  mnl->iter0 = solve_mms_nd_plus(g_chi_up_spinor_field, g_chi_dn_spinor_field,
+                                 mnl->pf, mnl->pf2, &(mnl->solver_params) );
 
   assign(mnl->w_fields[2], mnl->pf, VOLUME/2);
   assign(mnl->w_fields[3], mnl->pf2, VOLUME/2);
 
   // apply C to the random field to generate pseudo-fermion fields
   for(int j = (mnl->rat.np-1); j > -1; j--) {
-    // Q_h * tau^1 - i nu_j
-    // this needs phmc_Cpol = 1 to work!
-    if(mnl->type == NDCLOVERRAT) {
-      Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np],
-			       g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], 
-			       I*mnl->rat.nu[j], 1., mnl->EVMaxInv);
-    }
-    else {
-      Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np],
-			     g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], 
-			     I*mnl->rat.nu[j], 1., mnl->EVMaxInv);
-    }
-    assign_add_mul(mnl->pf, g_chi_up_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2);
-    assign_add_mul(mnl->pf2, g_chi_dn_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2);
+      assign_add_mul(mnl->pf, g_chi_up_spinor_field[j], I*mnl->rat.rnu[j], VOLUME/2);
+      assign_add_mul(mnl->pf2, g_chi_dn_spinor_field[j], I*mnl->rat.rnu[j], VOLUME/2);
   }
 
   etime = gettime();
@@ -248,7 +247,7 @@ void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
     if(g_debug_level > 1) {
       printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime);
     }
-    if(g_debug_level > 3) { 
+    if(g_debug_level > 3) {
       printf("called ndrat_heatbath for id %d energy %f\n", id, mnl->energy0);
     }
   }
@@ -257,7 +256,6 @@ void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
 
 
 double ndrat_acc(const int id, hamiltonian_field_t * const hf) {
-  solver_pm_t solver_pm;
   monomial * mnl = &monomial_list[id];
   double atime, etime;
   atime = gettime();
@@ -265,21 +263,26 @@ double ndrat_acc(const int id, hamiltonian_field_t * const hf) {
   if(mnl->type == NDCLOVERRAT) {
     sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
     sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+    copy_32_sw_fields();
   }
   mnl->energy1 = 0.;
 
-  solver_pm.max_iter = mnl->maxiter;
-  solver_pm.squared_solver_prec = mnl->accprec;
-  solver_pm.no_shifts = mnl->rat.np;
-  solver_pm.shifts = mnl->rat.mu;
-  solver_pm.type = CGMMSND;
-  solver_pm.M_ndpsi = &Qtm_pm_ndpsi;
-  if(mnl->type == NDCLOVERRAT) solver_pm.M_ndpsi = &Qsw_pm_ndpsi;
-  solver_pm.sdim = VOLUME/2;
-  solver_pm.rel_prec = g_relative_precision_flag;
-  mnl->iter0 += cg_mms_tm_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
-			     mnl->pf, mnl->pf2,
-			     &solver_pm);
+  mnl->solver_params.max_iter = mnl->maxiter;
+  mnl->solver_params.squared_solver_prec = mnl->accprec;
+  mnl->solver_params.no_shifts = mnl->rat.np;
+  mnl->solver_params.shifts = mnl->rat.mu;
+  mnl->solver_params.type = mnl->solver;
+  
+  mnl->solver_params.M_ndpsi = &Qtm_pm_ndpsi;
+  mnl->solver_params.M_ndpsi32 = &Qtm_pm_ndpsi_32; 
+  if(mnl->type == NDCLOVERRAT) {
+    mnl->solver_params.M_ndpsi = &Qsw_pm_ndpsi;
+    mnl->solver_params.M_ndpsi32 = &Qsw_pm_ndpsi_32;
+  }
+  mnl->solver_params.sdim = VOLUME/2;
+  mnl->solver_params.rel_prec = g_relative_precision_flag;
+  mnl->iter0 += solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
+                            mnl->pf, mnl->pf2, &(mnl->solver_params) );
 
   // apply R to the pseudo-fermion fields
   assign(mnl->w_fields[0], mnl->pf, VOLUME/2);
@@ -298,8 +301,8 @@ double ndrat_acc(const int id, hamiltonian_field_t * const hf) {
     if(g_debug_level > 1) {
       printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime);
     }
-    if(g_debug_level > 0) { // shoud be 3
-      printf("called ndrat_acc for id %d dH = %1.10e\n", id, mnl->energy1 - mnl->energy0);
+    if(g_debug_level > 3) {
+      printf("called ndrat_acc for id %d, H_1 = %.10e, dH = %1.10e\n", id, mnl->energy1,  mnl->energy1 - mnl->energy0);
     }
   }
   return(mnl->energy1 - mnl->energy0);
@@ -308,26 +311,40 @@ double ndrat_acc(const int id, hamiltonian_field_t * const hf) {
 
 int init_ndrat_monomial(const int id) {
   monomial * mnl = &monomial_list[id];  
+  int scale = 0;
+
+  if(mnl->type == RAT || mnl->type == CLOVERRAT ||
+     mnl->type == RATCOR || mnl->type == CLOVERRATCOR) 
+    scale = 1;
+
+  if(scale) {
+    // When scale = 1
+    //   the rational approximation is done for the standard operator,
+    //   which has eigenvalues between EVMin and EVMax. The parameters
+    //   of the rational approximation are scaled, so an additional
+    //   scaling of the operator (EVMaxInv) is not required.
+    mnl->EVMin = mnl->StildeMin;
+    mnl->EVMax = mnl->StildeMax;
+    mnl->EVMaxInv = 1.;
+  } else {
+    // When scale = 0
+    //   the rational approximation is done for the normalized operator,
+    //   which has eigenvalues between StildeMin/StildeMax and 1. Thus the
+    //   operator needs to be scaled by EVMaxInv = 1/sqrt(StildeMax).
+    mnl->EVMin = mnl->StildeMin / mnl->StildeMax;
+    mnl->EVMax = 1.;
+    mnl->EVMaxInv = 1./sqrt(mnl->StildeMax);
+  }
 
-  mnl->EVMin = mnl->StildeMin / mnl->StildeMax;
-  mnl->EVMax = 1.;
-  mnl->EVMaxInv = 1./(sqrt(mnl->StildeMax));
+  init_rational(&mnl->rat, scale);
 
   if(mnl->type == RAT || mnl->type == CLOVERRAT ||
      mnl->type == RATCOR || mnl->type == CLOVERRATCOR) {
-    init_rational(&mnl->rat, 1);
-
     if(init_chi_spinor_field(VOLUMEPLUSRAND/2, (mnl->rat.np+2)/2) != 0) {
       fprintf(stderr, "Not enough memory for Chi fields! Aborting...\n");
       exit(0);
     }
-  }
-  else {
-    init_rational(&mnl->rat, 0);
-    mnl->EVMin = mnl->StildeMin / mnl->StildeMax;
-    mnl->EVMax = 1.;
-    mnl->EVMaxInv = 1./(sqrt(mnl->StildeMax));
-    
+  } else {
     if(init_chi_spinor_field(VOLUMEPLUSRAND/2, (mnl->rat.np+1)) != 0) {
       fprintf(stderr, "Not enough memory for Chi fields! Aborting...\n");
       exit(0);
diff --git a/monomial/ndratcor_monomial.c b/monomial/ndratcor_monomial.c
index 6c1eb7257..0f02d7af0 100644
--- a/monomial/ndratcor_monomial.c
+++ b/monomial/ndratcor_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -31,10 +31,13 @@
 #include "start.h"
 #include "gettime.h"
 #include "solver/solver.h"
+#include "solver/monomial_solve.h"
 #include "deriv_Sb.h"
 #include "init/init_chi_spinor_field.h"
 #include "operator/tm_operators.h"
+#include "operator/tm_operators_32.h"
 #include "operator/tm_operators_nd.h"
+#include "operator/tm_operators_nd_32.h"
 #include "operator/Hopping_Matrix.h"
 #include "monomial/monomial.h"
 #include "hamiltonian_field.h"
@@ -50,22 +53,22 @@
 void check_C_ndpsi(spinor * const k_up, spinor * const k_dn,
 		   spinor * const l_up, spinor * const l_dn,
 		   const int id, hamiltonian_field_t * const hf,
-		   solver_pm_t * solver_pm);
+		   solver_params_t * solver_params);
 
 // applies (Q^2 R^2 -1) phi
-double apply_Z_ndpsi(spinor * const k_up, spinor * const k_dn,
+void apply_Z_ndpsi(spinor * const k_up, spinor * const k_dn,
 		     spinor * const l_up, spinor * const l_dn,
 		     const int id, hamiltonian_field_t * const hf,
-		     solver_pm_t * solver_pm);
+		     solver_params_t * solver_params);
 
 
 
 void ndratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
-  solver_pm_t solver_pm;
   double atime, etime, delta;
-  spinor * up0, * dn0, * up1, * dn1, * tup, * tdn;
-  double coefs[6] = {1./4., -3./32., 7./122., -77./2048., 231./8192., -1463./65536.};
+  spinor * up0, * dn0, * up1, * dn1, * tup, * tdn, * Zup, * Zdn;
+  double coefs[6] = {1./4., -3./32., 7./128., -77./2048., 231./8192., -1463./65536.}; // series of (1+x)^(1/4)
+  double coefs_check[6] = {1./2., -1./8., 1./16., -5./128., 7./256., -21./1024.}; // series of (1+x)^(1/2)
   atime = gettime();
   nd_set_global_parameter(mnl);
   g_mu3 = 0.;
@@ -74,10 +77,11 @@ void ndratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
     init_sw_fields();
     sw_term((const su3**)hf->gaugefield, mnl->kappa, mnl->c_sw); 
     sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+    copy_32_sw_fields();
   }
   // we measure before the trajectory!
   if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) {
-    if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi);
+    if(mnl->type != NDCLOVERRATCOR) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi);
     else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi);
   }
 
@@ -89,30 +93,72 @@ void ndratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
   random_spinor_field_eo(mnl->pf2, mnl->rngrepro, RN_GAUSS);
   mnl->energy0 += square_norm(mnl->pf2, VOLUME/2, 1);
 
-  solver_pm.max_iter = mnl->maxiter;
-  solver_pm.squared_solver_prec = mnl->accprec;
-  solver_pm.no_shifts = mnl->rat.np;
-  solver_pm.shifts = mnl->rat.mu;
-  solver_pm.type = CGMMSND;
-  solver_pm.M_ndpsi = &Qtm_pm_ndpsi;
-  if(mnl->type == NDCLOVERRATCOR) solver_pm.M_ndpsi = &Qsw_pm_ndpsi;
-  solver_pm.sdim = VOLUME/2;
-  solver_pm.rel_prec = g_relative_precision_flag;
+  mnl->solver_params.max_iter = mnl->maxiter;
+  mnl->solver_params.squared_solver_prec = mnl->accprec;
+  mnl->solver_params.no_shifts = mnl->rat.np;
+  mnl->solver_params.shifts = mnl->rat.mu;
+  mnl->solver_params.type = mnl->solver;
+  mnl->solver_params.M_ndpsi = &Qtm_pm_ndpsi;
+  mnl->solver_params.M_ndpsi32 = &Qtm_pm_ndpsi_32;    
+  if(mnl->type == NDCLOVERRATCOR) {
+    mnl->solver_params.M_ndpsi = &Qsw_pm_ndpsi;
+    mnl->solver_params.M_ndpsi32 = &Qsw_pm_ndpsi_32;
+  }
+  mnl->solver_params.sdim = VOLUME/2;
+  mnl->solver_params.rel_prec = g_relative_precision_flag;
 
   // apply B to the random field to generate pseudo-fermion fields
-  assign(mnl->w_fields[0], mnl->pf, VOLUME/2);
-  assign(mnl->w_fields[1], mnl->pf2, VOLUME/2);
   up0 = mnl->w_fields[0]; dn0 = mnl->w_fields[1];
   up1 = mnl->w_fields[2]; dn1 = mnl->w_fields[3];
-	 
-  for(int i = 1; i < 8; i++) {
-    delta = apply_Z_ndpsi(up1, dn1, up0, dn0, id, hf, &solver_pm);
-    assign_add_mul_r(mnl->pf, up1, coefs[i-1], VOLUME/2);
-    assign_add_mul_r(mnl->pf2, dn1, coefs[i-1], VOLUME/2);
-    if(delta < mnl->accprec) break;
-    tup = up0; tdn = dn0;
-    up0 = up1; dn0 = dn1;
-    up1 = tup; dn1 = tdn;
+  Zup = mnl->w_fields[4]; Zdn = mnl->w_fields[5];
+
+  apply_Z_ndpsi(up0, dn0, mnl->pf, mnl->pf2, id, hf, &(mnl->solver_params));
+  // computing correction to energy1
+  delta = coefs_check[0]*(scalar_prod_r(mnl->pf, up0, VOLUME/2, 1) + scalar_prod_r(mnl->pf2, dn0, VOLUME/2, 1));
+  if(g_debug_level > 2 && g_proc_id == 0)
+    printf("# NDRATCOR heatbath: c_%d*(R * Z^%d * R) = %e\n", 1, 1, delta);
+  // debug for showing that the old check was giving a smaller delta
+  if(g_debug_level > 3) {
+    double delta_old = square_norm(up0, VOLUME/2, 1) + square_norm(dn0, VOLUME/2, 1);
+    if(g_proc_id == 0) {
+      printf("# NDRATCOR old check: || Z^%d * R ||^2 = %e\n", 1, delta_old);
+      printf("# NDRATCOR new check: (c_%d*(R * Z^%d * R))^2 = %e\n", 1, 1, delta*delta);
+    }
+  }
+
+  if(delta*delta > mnl->accprec) {
+    assign_add_mul_r(mnl->pf, up0, coefs[0], VOLUME/2);
+    assign_add_mul_r(mnl->pf2, dn0, coefs[0], VOLUME/2);
+    
+    // saving first application
+    assign(Zup, up0, VOLUME/2);
+    assign(Zdn, dn0, VOLUME/2);
+    
+    
+    for(int i = 2; i < 7; i++) { // coefs[] and coefs_check[] hold 6 entries, so i-1 must not exceed 5
+      // computing next order correction to energy1: pair the saved Z*R fields flavour by flavour (Zup/up0, Zdn/dn0)
+      delta = coefs_check[i-1]*(scalar_prod_r(Zup, up0, VOLUME/2, 1) + scalar_prod_r(Zdn, dn0, VOLUME/2, 1));
+      if(g_debug_level > 2 && g_proc_id == 0)
+        printf("# NDRATCOR heatbath: c_%d*(R * Z^%d * R) = %e\n", i, i, delta);
+      // debug for showing that the old check was giving a smaller delta; up0/dn0 hold Z^(i-1) * R here
+      if(g_debug_level > 3) {
+        double delta_old = square_norm(up0, VOLUME/2, 1) + square_norm(dn0, VOLUME/2, 1);
+        if(g_proc_id == 0) {
+          printf("# NDRATCOR old check: || Z^%d * R ||^2 = %e\n", i-1, delta_old);
+          printf("# NDRATCOR new check: (c_%d*(R * Z^%d * R))^2 = %e\n", i, i, delta*delta);
+        }
+      }
+      if(delta*delta < mnl->accprec) break; // stop once this order's estimated contribution is below the requested precision
+
+      apply_Z_ndpsi(up1, dn1, up0, dn0, id, hf, &(mnl->solver_params));
+      
+      assign_add_mul_r(mnl->pf, up1, coefs[i-1], VOLUME/2);
+      assign_add_mul_r(mnl->pf2, dn1, coefs[i-1], VOLUME/2);
+
+      tup = up0; tdn = dn0; // swap the two work-field pairs so the next order reads the field just written
+      up0 = up1; dn0 = dn1;
+      up1 = tup; dn1 = tdn;
+    }
   }
   etime = gettime();
   if(g_proc_id == 0) {
@@ -128,7 +174,6 @@ void ndratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
 
 
 double ndratcor_acc(const int id, hamiltonian_field_t * const hf) {
-  solver_pm_t solver_pm;
   monomial * mnl = &monomial_list[id];
   double atime, etime, delta;
   spinor * up0, * dn0, * up1, * dn1, * tup, * tdn;
@@ -139,41 +184,55 @@ double ndratcor_acc(const int id, hamiltonian_field_t * const hf) {
   if(mnl->type == NDCLOVERRATCOR) {
     sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
     sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+    copy_32_sw_fields();
   }
-  mnl->energy1 = 0.;
+  mnl->energy1 = square_norm(mnl->pf, VOLUME/2, 1) + square_norm(mnl->pf2, VOLUME/2, 1);
 
-  solver_pm.max_iter = mnl->maxiter;
-  solver_pm.squared_solver_prec = mnl->accprec;
-  solver_pm.no_shifts = mnl->rat.np;
-  solver_pm.shifts = mnl->rat.mu;
-  solver_pm.type = CGMMSND;
-  solver_pm.M_ndpsi = &Qtm_pm_ndpsi;
-  if(mnl->type == NDCLOVERRATCOR) solver_pm.M_ndpsi = &Qsw_pm_ndpsi;
-  solver_pm.sdim = VOLUME/2;
-  solver_pm.rel_prec = g_relative_precision_flag;
+  mnl->solver_params.max_iter = mnl->maxiter;
+  mnl->solver_params.squared_solver_prec = mnl->accprec;
+  mnl->solver_params.no_shifts = mnl->rat.np;
+  mnl->solver_params.shifts = mnl->rat.mu;
+  mnl->solver_params.type = mnl->solver;
+  mnl->solver_params.M_ndpsi = &Qtm_pm_ndpsi;
+  mnl->solver_params.M_ndpsi32 = &Qtm_pm_ndpsi_32;    
+  if(mnl->type == NDCLOVERRATCOR) {
+    mnl->solver_params.M_ndpsi = &Qsw_pm_ndpsi;
+    mnl->solver_params.M_ndpsi32 = &Qsw_pm_ndpsi_32;
+  }
+  mnl->solver_params.sdim = VOLUME/2;
+  mnl->solver_params.rel_prec = g_relative_precision_flag;
 
   // apply (Q R)^(-1) to pseudo-fermion fields
-  assign(mnl->w_fields[4], mnl->pf, VOLUME/2);
-  assign(mnl->w_fields[5], mnl->pf2, VOLUME/2);
   up0 = mnl->w_fields[0]; dn0 = mnl->w_fields[1];
   up1 = mnl->w_fields[2]; dn1 = mnl->w_fields[3];
 
-  delta = apply_Z_ndpsi(up0, dn0, mnl->pf, mnl->pf2, id, hf, &solver_pm);
-  assign_add_mul_r(mnl->w_fields[4], up0, coefs[0], VOLUME/2);
-  assign_add_mul_r(mnl->w_fields[5], dn0, coefs[0], VOLUME/2);
+  apply_Z_ndpsi(up0, dn0, mnl->pf, mnl->pf2, id, hf, &(mnl->solver_params));
+  delta = coefs[0]*(scalar_prod_r(mnl->pf, up0, VOLUME/2, 1) + scalar_prod_r(mnl->pf2, dn0, VOLUME/2, 1));
+  mnl->energy1 += delta;
+  if(g_debug_level > 2 && g_proc_id == 0)
+    printf("# NDRATCOR acc step: c_%d*(phi * Z^%d * phi) = %e\n", 1, 1, delta);
 
   for(int i = 2; i < 8; i++) {
-    if(delta < mnl->accprec) break;
-    delta = apply_Z_ndpsi(up1, dn1, up0, dn0, id, hf, &solver_pm);
-    assign_add_mul_r(mnl->w_fields[4], up1, coefs[i-1], VOLUME/2);
-    assign_add_mul_r(mnl->w_fields[5], dn1, coefs[i-1], VOLUME/2);
+    if(delta*delta < mnl->accprec) break;
+
+    delta = coefs[i-1]*(square_norm(up0, VOLUME/2, 1) + square_norm(dn0, VOLUME/2, 1)); 
+    mnl->energy1 += delta;
+    if(g_debug_level > 2 && g_proc_id == 0)
+      printf("# NDRATCOR acc step: c_%d*(phi * Z^%d * phi) = %e\n", i, i, delta);
+    i++; //incrementing i
+    if(delta*delta < mnl->accprec) break;
+
+    apply_Z_ndpsi(up1, dn1, up0, dn0, id, hf, &(mnl->solver_params));
+    delta = coefs[i-1]*(scalar_prod_r(up0, up1, VOLUME/2, 1) + scalar_prod_r(dn0, dn1, VOLUME/2, 1));
+    mnl->energy1 += delta;
+    if(g_debug_level > 2 && g_proc_id == 0)
+      printf("# NDRATCOR acc step: c_%d*(phi * Z^%d * phi) = %e\n", i, i, delta);
+
     tup = up0; tdn = dn0;
     up0 = up1; dn0 = dn1;
     up1 = tup; dn1 = tdn;
   }
 
-  mnl->energy1 = scalar_prod_r(mnl->pf, mnl->w_fields[4], VOLUME/2, 1);
-  mnl->energy1 += scalar_prod_r(mnl->pf2, mnl->w_fields[5], VOLUME/2, 1);
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
@@ -187,16 +246,14 @@ double ndratcor_acc(const int id, hamiltonian_field_t * const hf) {
 }
 
 // applies ((Q_h\tau_1 * R)^2 - 1)
-
-double apply_Z_ndpsi(spinor * const k_up, spinor * const k_dn,
+void apply_Z_ndpsi(spinor * const k_up, spinor * const k_dn,
 		     spinor * const l_up, spinor * const l_dn,
 		     const int id, hamiltonian_field_t * const hf,
-		     solver_pm_t * solver_pm) {
+		     solver_params_t * solver_params) {
   monomial * mnl = &monomial_list[id];
 
-  mnl->iter0 += cg_mms_tm_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
-			     l_up, l_dn,
-			     solver_pm);  
+  mnl->iter0 += solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
+			                       l_up, l_dn, solver_params);  
   
   // apply R to the pseudo-fermion fields
   assign(k_up, l_up, VOLUME/2);
@@ -209,9 +266,9 @@ double apply_Z_ndpsi(spinor * const k_up, spinor * const k_dn,
   }
 
   // apply R a second time
-  cg_mms_tm_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
+  mnl->iter0 += solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
 	       k_up, k_dn,
-	       solver_pm);
+	       solver_params);
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     assign_add_mul_r(k_up, g_chi_up_spinor_field[j], 
 		     mnl->rat.rmu[j], VOLUME/2);
@@ -223,26 +280,21 @@ double apply_Z_ndpsi(spinor * const k_up, spinor * const k_dn,
   mul_r(g_chi_dn_spinor_field[mnl->rat.np], mnl->rat.A*mnl->rat.A, 
 	k_dn, VOLUME/2);
   // apply Q^2 and compute the residue
-  solver_pm->M_ndpsi(k_up, k_dn,
+  solver_params->M_ndpsi(k_up, k_dn,
 		     g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np]);
   diff(k_up, k_up, l_up, VOLUME/2);
   diff(k_dn, k_dn, l_dn, VOLUME/2);
-  double resi = square_norm(k_up, VOLUME/2, 1) + square_norm(k_dn, VOLUME/2, 1);
-  if(g_debug_level > 2 && g_proc_id == 0) {
-    printf("# NDRATCOR: ||Z * phi|| = %e\n", resi);
-  }
-  return(resi);
+  
 }
 
 // computes ||(1 - C^dagger R C) phi||
-
 void check_C_ndpsi(spinor * const k_up, spinor * const k_dn,
 		   spinor * const l_up, spinor * const l_dn,
 		   const int id, hamiltonian_field_t * const hf,
-		   solver_pm_t * solver_pm) {
+		   solver_params_t * solver_params) {
   monomial * mnl = &monomial_list[id];
-  mnl->iter0 = cg_mms_tm_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
-			     l_up, l_dn, solver_pm);
+  mnl->iter0 = solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
+			     l_up, l_dn, solver_params);
 
   assign(k_up, l_up, VOLUME/2);
   assign(k_dn, l_dn, VOLUME/2);
@@ -265,10 +317,10 @@ void check_C_ndpsi(spinor * const k_up, spinor * const k_dn,
     assign_add_mul(k_dn, g_chi_dn_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2);
   }
   //apply R
-  solver_pm->shifts = mnl->rat.mu;
-  cg_mms_tm_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
+  solver_params->shifts = mnl->rat.mu;
+  solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
 	       k_up, k_dn,
-	       solver_pm);
+	       solver_params);
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     assign_add_mul_r(k_up, g_chi_up_spinor_field[j], 
 		     mnl->rat.rmu[j], VOLUME/2);
@@ -276,9 +328,9 @@ void check_C_ndpsi(spinor * const k_up, spinor * const k_dn,
 		     mnl->rat.rmu[j], VOLUME/2);
   }
   // apply C^dagger
-  solver_pm->shifts = mnl->rat.nu;
-  cg_mms_tm_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
-	       k_up, k_dn, solver_pm);
+  solver_params->shifts = mnl->rat.nu;
+  solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
+	       k_up, k_dn, solver_params);
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     // Q_h * tau^1 + i nu_j
     if(mnl->type == NDCLOVERRATCOR || mnl->type == NDCLOVERRAT) {
diff --git a/monomial/poly_monomial.c b/monomial/poly_monomial.c
index 07fc20455..1dd5c2c53 100644
--- a/monomial/poly_monomial.c
+++ b/monomial/poly_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/monomial/rat_monomial.c b/monomial/rat_monomial.c
index 3e3aeef7c..a2d223ec7 100644
--- a/monomial/rat_monomial.c
+++ b/monomial/rat_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -30,7 +30,8 @@
 #include "linalg_eo.h"
 #include "start.h"
 #include "gettime.h"
-#include "solver/solver.h"
+#include "solver/monomial_solve.h"
+#include "solver/solver_types.h"
 #include "deriv_Sb.h"
 #include "init/init_chi_spinor_field.h"
 #include "operator/tm_operators.h"
@@ -54,9 +55,10 @@
 
 void rat_derivative(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
-  solver_pm_t solver_pm;
-  double atime, etime, dummy;
+  double atime, etime;
   atime = gettime();
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
+  g_kappa = mnl->kappa;
   g_mu = 0;
   g_mu3 = 0.;
   boundary(mnl->kappa);
@@ -78,17 +80,17 @@ void rat_derivative(const int id, hamiltonian_field_t * const hf) {
   //mnl->forcefactor = mnl->EVMaxInv*mnl->EVMaxInv;
   mnl->forcefactor = 1.;
 
-  solver_pm.max_iter = mnl->maxiter;
-  solver_pm.squared_solver_prec = mnl->forceprec;
-  solver_pm.no_shifts = mnl->rat.np;
-  solver_pm.shifts = mnl->rat.mu;
-  solver_pm.rel_prec = g_relative_precision_flag;
-  solver_pm.type = CGMMS;
-  solver_pm.M_psi = mnl->Qsq;
-  solver_pm.sdim = VOLUME/2;
+  mnl->solver_params.max_iter = mnl->maxiter;
+  mnl->solver_params.squared_solver_prec = mnl->forceprec;
+  mnl->solver_params.no_shifts = mnl->rat.np;
+  mnl->solver_params.shifts = mnl->rat.mu;
+  mnl->solver_params.rel_prec = g_relative_precision_flag;
+  mnl->solver_params.type = mnl->solver;
+  mnl->solver_params.M_psi = mnl->Qsq;
+  mnl->solver_params.sdim = VOLUME/2;
   // this generates all X_j,o (odd sites only) -> g_chi_up_spinor_field
-  mnl->iter1 += cg_mms_tm(g_chi_up_spinor_field, mnl->pf,
-			  &solver_pm, &dummy);
+  mnl->iter1 += solve_mms_tm(g_chi_up_spinor_field, mnl->pf,
+                             &(mnl->solver_params) );
   
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     mnl->Qp(mnl->w_fields[0], g_chi_up_spinor_field[j]);
@@ -108,10 +110,10 @@ void rat_derivative(const int id, hamiltonian_field_t * const hf) {
 	       mnl->rat.rmu[j]*mnl->forcefactor);
 
       // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e
-      sw_spinor(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->rat.rmu[j]*mnl->forcefactor);
+      sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->rat.rmu[j]*mnl->forcefactor);
   
       // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
-      sw_spinor(OO, mnl->w_fields[0], g_chi_up_spinor_field[j], mnl->rat.rmu[j]*mnl->forcefactor);
+      sw_spinor_eo(OO, mnl->w_fields[0], g_chi_up_spinor_field[j], mnl->rat.rmu[j]*mnl->forcefactor);
 
     }
     else {
@@ -139,15 +141,17 @@ void rat_derivative(const int id, hamiltonian_field_t * const hf) {
   if(g_debug_level > 1 && g_proc_id == 0) {
     printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime);
   }
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   return;
 }
 
 
 void rat_heatbath(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
-  solver_pm_t solver_pm;
-  double atime, etime, dummy;
+  double atime, etime;
   atime = gettime();
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
+  g_kappa = mnl->kappa;
   // only for non-twisted operators
   g_mu = 0.;
   g_mu3 = 0.;
@@ -173,16 +177,17 @@ void rat_heatbath(const int id, hamiltonian_field_t * const hf) {
   mnl->energy0 = square_norm(mnl->pf, VOLUME/2, 1);
 
   // set solver parameters
-  solver_pm.max_iter = mnl->maxiter;
-  solver_pm.squared_solver_prec = mnl->accprec;
-  solver_pm.no_shifts = mnl->rat.np;
-  solver_pm.shifts = mnl->rat.nu;
-  solver_pm.type = CGMMS;
-  solver_pm.M_psi = mnl->Qsq;
-  solver_pm.sdim = VOLUME/2;
-  solver_pm.rel_prec = g_relative_precision_flag;
-  mnl->iter0 = cg_mms_tm(g_chi_up_spinor_field, mnl->pf,
-			 &solver_pm, &dummy);
+  mnl->solver_params.max_iter = mnl->maxiter;
+  mnl->solver_params.squared_solver_prec = mnl->accprec;
+  mnl->solver_params.no_shifts = mnl->rat.np;
+  mnl->solver_params.shifts = mnl->rat.nu;
+  mnl->solver_params.type = mnl->solver;
+  mnl->solver_params.M_psi = mnl->Qsq;
+  mnl->solver_params.sdim = VOLUME/2;
+  mnl->solver_params.rel_prec = g_relative_precision_flag;
+
+  mnl->iter0 = solve_mms_tm(g_chi_up_spinor_field, mnl->pf,
+                            &(mnl->solver_params) );
 
   assign(mnl->w_fields[2], mnl->pf, VOLUME/2);
 
@@ -203,15 +208,17 @@ void rat_heatbath(const int id, hamiltonian_field_t * const hf) {
       printf("called rat_heatbath for id %d energy %f\n", id, mnl->energy0);
     }
   }
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   return;
 }
 
 
 double rat_acc(const int id, hamiltonian_field_t * const hf) {
-  solver_pm_t solver_pm;
   monomial * mnl = &monomial_list[id];
-  double atime, etime, dummy;
+  double atime, etime;
   atime = gettime();
+  mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
+  g_kappa = mnl->kappa;
   // only for non-twisted operators
   g_mu = 0.;
   g_mu3 = 0.;
@@ -223,16 +230,16 @@ double rat_acc(const int id, hamiltonian_field_t * const hf) {
   }
   mnl->energy1 = 0.;
 
-  solver_pm.max_iter = mnl->maxiter;
-  solver_pm.squared_solver_prec = mnl->accprec;
-  solver_pm.no_shifts = mnl->rat.np;
-  solver_pm.shifts = mnl->rat.mu;
-  solver_pm.type = CGMMS;
-  solver_pm.M_psi = mnl->Qsq;
-  solver_pm.sdim = VOLUME/2;
-  solver_pm.rel_prec = g_relative_precision_flag;
-  mnl->iter0 += cg_mms_tm(g_chi_up_spinor_field, mnl->pf,
-			  &solver_pm, &dummy);
+  mnl->solver_params.max_iter = mnl->maxiter;
+  mnl->solver_params.squared_solver_prec = mnl->accprec;
+  mnl->solver_params.no_shifts = mnl->rat.np;
+  mnl->solver_params.shifts = mnl->rat.mu;
+  mnl->solver_params.type = mnl->solver;
+  mnl->solver_params.M_psi = mnl->Qsq;
+  mnl->solver_params.sdim = VOLUME/2;
+  mnl->solver_params.rel_prec = g_relative_precision_flag;
+  mnl->iter0 += solve_mms_tm(g_chi_up_spinor_field, mnl->pf,
+                             &(mnl->solver_params) );
 
   // apply R to the pseudo-fermion fields
   assign(mnl->w_fields[0], mnl->pf, VOLUME/2);
@@ -240,17 +247,18 @@ double rat_acc(const int id, hamiltonian_field_t * const hf) {
     assign_add_mul_r(mnl->w_fields[0], g_chi_up_spinor_field[j], 
 		     mnl->rat.rmu[j], VOLUME/2);
   }
-
+  
   mnl->energy1 = scalar_prod_r(mnl->pf, mnl->w_fields[0], VOLUME/2, 1);
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
       printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime);
     }
-    if(g_debug_level > 0) { // shoud be 3
+    if(g_debug_level > 3) {
       printf("called rat_acc for id %d dH = %1.10e\n", id, mnl->energy1 - mnl->energy0);
     }
   }
+  mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   return(mnl->energy1 - mnl->energy0);
 }
 
diff --git a/monomial/ratcor_monomial.c b/monomial/ratcor_monomial.c
index a7354c360..c1117cf28 100644
--- a/monomial/ratcor_monomial.c
+++ b/monomial/ratcor_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -48,21 +48,21 @@
 // computes ||(1 - C^dagger R C) phi||
 void check_C_psi(spinor * const k_up, spinor * const l_up, 
 		 const int id, hamiltonian_field_t * const hf,
-		 solver_pm_t * solver_pm);
+		 solver_params_t * solver_params);
 
 // applies (Q^2 R^2 -1) phi
 double apply_Z_psi(spinor * const k_up, spinor * const l_up, 
 		   const int id, hamiltonian_field_t * const hf,
-		   solver_pm_t * solver_pm);
+		   solver_params_t * solver_params);
 
 
 
 void ratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
-  solver_pm_t solver_pm;
   double atime, etime, delta;
   spinor * up0, * up1, * tup;
-  double coefs[6] = {1./4., -3./32., 7./122., -77./2048., 231./8192., -1463./65536.};
+  double coefs[7] = {1./4., -3./32., 7./128., -77./2048., 231./8192., -1463./65536., 4807./262144.}; // series of (1+x)^(1/4), terms x^1..x^7: the loop below reads coefs[i-1] for i = 1..7
+  double coefs_check[6] = {1./2., -1./8., 1./16., -5./128., 7./256., -21./1024.}; // series of (1+x)^(1/2)
   atime = gettime();
   nd_set_global_parameter(mnl);
   g_mu = 0.;
@@ -87,14 +87,14 @@ void ratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
   random_spinor_field_eo(mnl->pf, mnl->rngrepro, RN_GAUSS);
   mnl->energy0 = square_norm(mnl->pf, VOLUME/2, 1);
 
-  solver_pm.max_iter = mnl->maxiter;
-  solver_pm.squared_solver_prec = mnl->accprec;
-  solver_pm.no_shifts = mnl->rat.np;
-  solver_pm.shifts = mnl->rat.mu;
-  solver_pm.type = CGMMS;
-  solver_pm.M_psi = mnl->Qsq;
-  solver_pm.sdim = VOLUME/2;
-  solver_pm.rel_prec = g_relative_precision_flag;
+  mnl->solver_params.max_iter = mnl->maxiter;
+  mnl->solver_params.squared_solver_prec = mnl->accprec;
+  mnl->solver_params.no_shifts = mnl->rat.np;
+  mnl->solver_params.shifts = mnl->rat.mu;
+  mnl->solver_params.type = mnl->solver;
+  mnl->solver_params.M_psi = mnl->Qsq;
+  mnl->solver_params.sdim = VOLUME/2;
+  mnl->solver_params.rel_prec = g_relative_precision_flag;
 
   // apply B to the random field to generate pseudo-fermion fields
   assign(mnl->w_fields[0], mnl->pf, VOLUME/2);
@@ -102,7 +102,7 @@ void ratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
   up1 = mnl->w_fields[2]; 
 	 
   for(int i = 1; i < 8; i++) {
-    delta = apply_Z_psi(up1, up0, id, hf, &solver_pm);
+    delta = apply_Z_psi(up1, up0, id, hf, &(mnl->solver_params) );
     assign_add_mul_r(mnl->pf, up1, coefs[i-1], VOLUME/2);
     if(delta < mnl->accprec) break;
     tup = up0;
@@ -114,7 +114,7 @@ void ratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
     if(g_debug_level > 1) {
       printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime);
     }
-    if(g_debug_level > 3) { 
+    if(g_debug_level > 3) {
       printf("called ratcor_heatbath for id %d energy %f\n", id, mnl->energy0);
     }
   }
@@ -123,7 +123,6 @@ void ratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
 
 
 double ratcor_acc(const int id, hamiltonian_field_t * const hf) {
-  solver_pm_t solver_pm;
   monomial * mnl = &monomial_list[id];
   double atime, etime, delta;
   spinor * up0, * up1, * tup;
@@ -141,26 +140,26 @@ double ratcor_acc(const int id, hamiltonian_field_t * const hf) {
   }
   mnl->energy1 = 0.;
 
-  solver_pm.max_iter = mnl->maxiter;
-  solver_pm.squared_solver_prec = mnl->accprec;
-  solver_pm.no_shifts = mnl->rat.np;
-  solver_pm.shifts = mnl->rat.mu;
-  solver_pm.type = CGMMS;
-  solver_pm.M_psi = mnl->Qsq;
-  solver_pm.sdim = VOLUME/2;
-  solver_pm.rel_prec = g_relative_precision_flag;
+  mnl->solver_params.max_iter = mnl->maxiter;
+  mnl->solver_params.squared_solver_prec = mnl->accprec;
+  mnl->solver_params.no_shifts = mnl->rat.np;
+  mnl->solver_params.shifts = mnl->rat.mu;
+  mnl->solver_params.type = mnl->solver; // use the monomial's configured solver, matching ratcor_heatbath
+  mnl->solver_params.M_psi = mnl->Qsq;
+  mnl->solver_params.sdim = VOLUME/2;
+  mnl->solver_params.rel_prec = g_relative_precision_flag;
 
   // apply (Q R)^(-1) to pseudo-fermion fields
   assign(mnl->w_fields[4], mnl->pf, VOLUME/2);
   up0 = mnl->w_fields[0];
   up1 = mnl->w_fields[2];
 
-  delta = apply_Z_psi(up0, mnl->pf, id, hf, &solver_pm);
+  delta = apply_Z_psi(up0, mnl->pf, id, hf, &(mnl->solver_params) );
   assign_add_mul_r(mnl->w_fields[4], up0, coefs[0], VOLUME/2);
 
   for(int i = 2; i < 8; i++) {
     if(delta < mnl->accprec) break;
-    delta = apply_Z_psi(up1, up0, id, hf, &solver_pm);
+    delta = apply_Z_psi(up1, up0, id, hf, &(mnl->solver_params) );
     assign_add_mul_r(mnl->w_fields[4], up1, coefs[i-1], VOLUME/2);
     tup = up0;
     up0 = up1;
@@ -173,7 +172,7 @@ double ratcor_acc(const int id, hamiltonian_field_t * const hf) {
     if(g_debug_level > 1) {
       printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime);
     }
-    if(g_debug_level > 3) { // shoud be 3
+    if(g_debug_level > 3) {
       printf("called ratcor_acc for id %d dH = %1.10e\n", id, mnl->energy1 - mnl->energy0);
     }
   }
@@ -184,12 +183,11 @@ double ratcor_acc(const int id, hamiltonian_field_t * const hf) {
 
 double apply_Z_psi(spinor * const k_up,	spinor * const l_up, 
 		     const int id, hamiltonian_field_t * const hf,
-		     solver_pm_t * solver_pm) {
+		     solver_params_t * solver_params) {
   monomial * mnl = &monomial_list[id];
-  double dummy;
 
-  mnl->iter0 += cg_mms_tm(g_chi_up_spinor_field, l_up,
-			  solver_pm, &dummy);  
+  mnl->iter0 += solve_mms_tm(g_chi_up_spinor_field, l_up,
+                             solver_params);  
   
   // apply R to the pseudo-fermion fields
   assign(k_up, l_up, VOLUME/2);
@@ -199,8 +197,9 @@ double apply_Z_psi(spinor * const k_up,	spinor * const l_up,
   }
 
   // apply R a second time
-  cg_mms_tm(g_chi_up_spinor_field, k_up,
-	    solver_pm, &dummy);
+  mnl->iter0 += solve_mms_tm(g_chi_up_spinor_field, k_up,
+                             solver_params);
+
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     assign_add_mul_r(k_up, g_chi_up_spinor_field[j], 
 		     mnl->rat.rmu[j], VOLUME/2);
@@ -208,10 +207,12 @@ double apply_Z_psi(spinor * const k_up,	spinor * const l_up,
   mul_r(g_chi_up_spinor_field[mnl->rat.np], mnl->rat.A*mnl->rat.A, 
 	k_up, VOLUME/2);
 
-  // apply Q^2 and compute the residue
-  solver_pm->M_psi(k_up, g_chi_up_spinor_field[mnl->rat.np]);
+  // apply Q^2 
+  solver_params->M_psi(k_up, g_chi_up_spinor_field[mnl->rat.np]);
+  // compute the residue
   diff(k_up, k_up, l_up, VOLUME/2);
   double resi = square_norm(k_up, VOLUME/2, 1);
+  
   if(g_debug_level > 2 && g_proc_id == 0) {
     printf("# RATCOR: ||Z * phi|| = %e\n", resi);
   }
@@ -222,10 +223,10 @@ double apply_Z_psi(spinor * const k_up,	spinor * const l_up,
 
 void check_C_psi(spinor * const k_up, spinor * const l_up,
 		 const int id, hamiltonian_field_t * const hf,
-		 solver_pm_t * solver_pm) {
+		 solver_params_t * solver_params) {
   monomial * mnl = &monomial_list[id];
-  double dummy;
-  mnl->iter0 = cg_mms_tm(g_chi_up_spinor_field, l_up, solver_pm, &dummy);
+
+  mnl->iter0 = solve_mms_tm(g_chi_up_spinor_field, l_up, solver_params);
 
   assign(k_up, l_up, VOLUME/2);
 
@@ -243,17 +244,18 @@ void check_C_psi(spinor * const k_up, spinor * const l_up,
     assign_add_mul(k_up, g_chi_up_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2);
   }
   //apply R
-  solver_pm->shifts = mnl->rat.mu;
-  cg_mms_tm(g_chi_up_spinor_field, k_up,
-	    solver_pm, &dummy);
+  solver_params->shifts = mnl->rat.mu;
+  mnl->iter0 += solve_mms_tm(g_chi_up_spinor_field, k_up,
+                             solver_params);
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     assign_add_mul_r(k_up, g_chi_up_spinor_field[j], 
 		     mnl->rat.rmu[j], VOLUME/2);
   }
   // apply C^dagger
-  solver_pm->shifts = mnl->rat.nu;
-  cg_mms_tm(g_chi_up_spinor_field, k_up,
-	    solver_pm, &dummy);
+  solver_params->shifts = mnl->rat.nu;
+  mnl->iter0 += solve_mms_tm(g_chi_up_spinor_field, k_up,
+	    solver_params);
+
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     if(mnl->type == NDCLOVERRATCOR || mnl->type == NDCLOVERRAT) {
       //Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np],
diff --git a/monomial/sf_gauge_monomial.c b/monomial/sf_gauge_monomial.c
index 075f62c49..e9c5f1c91 100644
--- a/monomial/sf_gauge_monomial.c
+++ b/monomial/sf_gauge_monomial.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/mpi_init.c b/mpi_init.c
index 90656ca78..545bdd627 100644
--- a/mpi_init.c
+++ b/mpi_init.c
@@ -18,11 +18,11 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #ifdef _USE_SHMEM
@@ -31,9 +31,10 @@
 #include "global.h"
 #include "read_input.h"
 #include "mpi_init.h"
+#ifdef TM_USE_BSM
 #include "operator/D_psi_BSM2f.h"
-
-#ifdef MPI
+#endif
+#ifdef TM_USE_MPI
 /* Datatypes for the data exchange */
 MPI_Datatype mpi_su3;
 MPI_Datatype gauge_point;
@@ -44,8 +45,10 @@ MPI_Datatype deri_time_slice_cont;
 MPI_Datatype deri_time_slice_split;
 
 MPI_Datatype field_point;
+MPI_Datatype field_point32;
 MPI_Datatype field_time_slice_cont;
 MPI_Datatype lfield_time_slice_cont;
+MPI_Datatype lfield_time_slice_cont32;
 MPI_Datatype gauge_x_slice_cont;
 MPI_Datatype gauge_x_subslice;
 MPI_Datatype gauge_x_slice_gath;
@@ -53,8 +56,11 @@ MPI_Datatype field_x_slice_cont;
 MPI_Datatype field_x_subslice;
 MPI_Datatype field_x_slice_gath;
 MPI_Datatype lfield_x_slice_cont;
+MPI_Datatype lfield_x_slice_cont32;
 MPI_Datatype lfield_x_subslice;
+MPI_Datatype lfield_x_subslice32;
 MPI_Datatype lfield_x_slice_gath;
+MPI_Datatype lfield_x_slice_gath32;
 MPI_Datatype deri_x_slice_cont;
 MPI_Datatype deri_x_subslice;
 MPI_Datatype deri_x_slice_gath;
@@ -70,14 +76,19 @@ MPI_Datatype field_y_slice_gath;
 MPI_Datatype field_y_slice_cont;
 MPI_Datatype field_y_subslice;
 MPI_Datatype lfield_y_slice_gath;
+MPI_Datatype lfield_y_slice_gath32;
 MPI_Datatype lfield_y_slice_cont;
+MPI_Datatype lfield_y_slice_cont32;
 MPI_Datatype lfield_y_subslice;
+MPI_Datatype lfield_y_subslice32;
 
 MPI_Datatype field_z_slice_gath;
 MPI_Datatype field_z_subslice;
 MPI_Datatype field_z_slice_cont;
 MPI_Datatype lfield_z_slice_gath;
+MPI_Datatype lfield_z_slice_gath32;
 MPI_Datatype lfield_z_slice_cont;
+MPI_Datatype lfield_z_slice_cont32;
 MPI_Datatype field_z_slice_half;
 
 MPI_Datatype deri_y_slice_cont;
@@ -199,7 +210,7 @@ void reduce_su3_ray(
 
 void tmlqcd_mpi_init(int argc,char *argv[]) {
   int i;
-#ifdef MPI
+#ifdef TM_USE_MPI
   int periods[] = {1,1,1,1};
   int dims[] = {0,0,0,0};
   int ndims = 0;
@@ -215,8 +226,7 @@ void tmlqcd_mpi_init(int argc,char *argv[]) {
     g_nb_list[i] = 0;
   }
 
-
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  ifdef _USE_SHMEM
   /* we need that the PE number in MPI_COMM_WORL  */
   /* exactly correspond to the one in g_cart_grid */
@@ -515,6 +525,7 @@ void tmlqcd_mpi_init(int argc,char *argv[]) {
   /* The spinor fields */
   /* this is a single spinor field on one space-time point */
   MPI_Type_contiguous(24, MPI_DOUBLE, &field_point);
+  MPI_Type_contiguous(24, MPI_FLOAT, &field_point32);  
   /* Tis is an even or odd spinor field time slice, continuous */
 /*   MPI_Type_contiguous(LX*LY*LZ/2, field_point, &field_time_slice_cont);  */
   MPI_Type_contiguous(LX*LY*LZ*12, MPI_DOUBLE, &field_time_slice_cont); 
@@ -524,7 +535,8 @@ void tmlqcd_mpi_init(int argc,char *argv[]) {
   /* this is the not even/odd field */
   MPI_Type_contiguous(LX*LY*LZ, field_point, &lfield_time_slice_cont);
   MPI_Type_commit(&lfield_time_slice_cont);
-
+  MPI_Type_contiguous(LX*LY*LZ, field_point32, &lfield_time_slice_cont32);
+  MPI_Type_commit(&lfield_time_slice_cont32);
 
   /* This is an even or odd continuous spinor field x-slice */
   MPI_Type_contiguous(T*LY*LZ/2, field_point, &field_x_slice_cont); 
@@ -544,6 +556,12 @@ void tmlqcd_mpi_init(int argc,char *argv[]) {
   MPI_Type_vector(T, 1, LX, lfield_x_subslice, &lfield_x_slice_gath);
   MPI_Type_commit(&lfield_x_slice_gath);
   MPI_Type_commit(&lfield_x_slice_cont);
+  
+  MPI_Type_contiguous(T*LY*LZ, field_point32, &lfield_x_slice_cont32);
+  MPI_Type_contiguous(LY*LZ, field_point32, &lfield_x_subslice32);
+  MPI_Type_vector(T, 1, LX, lfield_x_subslice32, &lfield_x_slice_gath32);
+  MPI_Type_commit(&lfield_x_slice_gath32);
+  MPI_Type_commit(&lfield_x_slice_cont32);  
 
   /* This is an even or odd continuous spinor field y-slice */
   MPI_Type_contiguous(T*LX*LZ/2, field_point, &field_y_slice_cont); 
@@ -563,6 +581,12 @@ void tmlqcd_mpi_init(int argc,char *argv[]) {
   MPI_Type_vector(T*LX, 1, LY, lfield_y_subslice, &lfield_y_slice_gath);
   MPI_Type_commit(&lfield_y_slice_cont);
   MPI_Type_commit(&lfield_y_slice_gath);
+  
+  MPI_Type_contiguous(T*LX*LZ, field_point32, &lfield_y_slice_cont32);
+  MPI_Type_contiguous(LZ, field_point32, &lfield_y_subslice32);
+  MPI_Type_vector(T*LX, 1, LY, lfield_y_subslice32, &lfield_y_slice_gath32);
+  MPI_Type_commit(&lfield_y_slice_cont32);
+  MPI_Type_commit(&lfield_y_slice_gath32);  
 
   /* If z-dir is parallelized, I have assumed that both LZ and T*LX*LY are even */
   /* This is an even or odd continuous spinor field z-slice */
@@ -579,6 +603,11 @@ void tmlqcd_mpi_init(int argc,char *argv[]) {
   MPI_Type_vector(T*LX*LY, 1, LZ, field_point, &lfield_z_slice_gath);
   MPI_Type_commit(&lfield_z_slice_cont);
   MPI_Type_commit(&lfield_z_slice_gath);
+  
+  MPI_Type_contiguous(T*LX*LY, field_point32, &lfield_z_slice_cont32);
+  MPI_Type_vector(T*LX*LY, 1, LZ, field_point32, &lfield_z_slice_gath32);
+  MPI_Type_commit(&lfield_z_slice_cont32);
+  MPI_Type_commit(&lfield_z_slice_gath32);  
 
 #ifdef _USE_TSPLITPAR
   /* here I construct the xt yt zt edges for use in _USE_TSPLITPAR  */
@@ -718,7 +747,7 @@ void tmlqcd_mpi_init(int argc,char *argv[]) {
 
   MPI_Op_create(reduce_su3_ray, 0, &mpi_reduce_su3_ray);
 
-#else /*ifdef MPI */
+#else /*ifdef TM_USE_MPI */
   g_nproc = 1;
   g_proc_id = 0;
   g_nproc_x = 1;
@@ -749,14 +778,14 @@ void tmlqcd_mpi_init(int argc,char *argv[]) {
   N_PROC_Z = 1;
 #  endif
   g_dbw2rand = 0;
-#endif   /*ifdef MPI */
+#endif   /*ifdef TM_USE_MPI */
 
   /* Here we perform some checks in order not to */
   /* run into trouble later                      */
 #if (defined PARALLELXYZT || defined PARALLELXYZ )
   if((T*LX*LY)%2 != 0 && even_odd_flag == 1) {
     fprintf(stderr, "T*LX*LY must be even!\nAborting prgram...\n");
-#  ifdef MPI 
+#  ifdef TM_USE_MPI 
     MPI_Finalize();
 #  endif
     exit(-1);
@@ -765,11 +794,10 @@ void tmlqcd_mpi_init(int argc,char *argv[]) {
 
   if(LZ%2 != 0 && even_odd_flag == 1) {
     fprintf(stderr, "LZ must be even!\nAborting prgram...\n");
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Finalize();
 #endif
     exit(-1);
   }
-  init_D_psi_BSM2f();
 }
 
diff --git a/mpi_init.h b/mpi_init.h
index 8a419ae34..3912ee506 100644
--- a/mpi_init.h
+++ b/mpi_init.h
@@ -19,25 +19,29 @@
 #ifndef _MPI_INIT_H
 #define _MPI_INIT_H
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 
 
 /* Datatypes for the data exchange */
 extern MPI_Datatype mpi_su3;
 extern MPI_Datatype field_point;
+extern MPI_Datatype field_point32;
 extern MPI_Datatype gauge_time_slice_cont;
 extern MPI_Datatype gauge_time_slice_split;
 extern MPI_Datatype deri_time_slice_cont;
 extern MPI_Datatype deri_time_slice_split;
 extern MPI_Datatype field_time_slice_cont;
 extern MPI_Datatype lfield_time_slice_cont;
+extern MPI_Datatype lfield_time_slice_cont32;
 extern MPI_Datatype gauge_x_slice_cont;
 extern MPI_Datatype gauge_x_slice_gath;
 extern MPI_Datatype field_x_slice_cont;
 extern MPI_Datatype field_x_slice_gath;
 extern MPI_Datatype lfield_x_slice_cont;
+extern MPI_Datatype lfield_x_slice_cont32;
 extern MPI_Datatype lfield_x_slice_gath;
+extern MPI_Datatype lfield_x_slice_gath32;
 extern MPI_Datatype deri_x_slice_cont;
 extern MPI_Datatype deri_x_slice_gath;
 extern MPI_Datatype gauge_xt_edge_cont;
@@ -70,7 +74,9 @@ extern MPI_Datatype gauge_y_slice_gath;
 extern MPI_Datatype field_y_slice_cont;
 extern MPI_Datatype field_y_slice_gath;
 extern MPI_Datatype lfield_y_slice_cont;
+extern MPI_Datatype lfield_y_slice_cont32;
 extern MPI_Datatype lfield_y_slice_gath;
+extern MPI_Datatype lfield_y_slice_gath32;
 extern MPI_Datatype deri_y_slice_cont;
 extern MPI_Datatype deri_y_slice_gath;
 
@@ -83,7 +89,9 @@ extern MPI_Datatype gauge_z_slice_cont;
 extern MPI_Datatype field_z_slice_cont;
 extern MPI_Datatype field_z_slice_gath;
 extern MPI_Datatype lfield_z_slice_cont;
+extern MPI_Datatype lfield_z_slice_cont32;
 extern MPI_Datatype lfield_z_slice_gath;
+extern MPI_Datatype lfield_z_slice_gath32;
 extern MPI_Datatype field_z_slice_half;
 
 extern MPI_Datatype halffield_point;
@@ -123,7 +131,7 @@ extern MPI_Datatype jfield_z_slice_gath;
 extern MPI_Datatype jfield_y_subslice;
 #endif
 
-#if ( defined PARALLELXYZT || defined PARALLELXYZ )
+#if ( defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXYZ )
 extern MPI_Datatype field_z_slice_even_dn;
 extern MPI_Datatype field_z_slice_even_up;
 extern MPI_Datatype field_z_slice_odd_dn;
@@ -134,6 +142,19 @@ extern spinor * field_buffer_z ALIGN;
 extern spinor * field_buffer_z2 ALIGN;
 extern spinor * field_buffer_z3 ALIGN;
 extern spinor * field_buffer_z4 ALIGN;
+extern spinor * field_buffer_y ALIGN;
+extern spinor * field_buffer_y2 ALIGN;
+extern spinor * field_buffer_y3 ALIGN;
+extern spinor * field_buffer_y4 ALIGN;
+extern spinor * field_buffer_x ALIGN;
+extern spinor * field_buffer_x2 ALIGN;
+extern spinor * field_buffer_x3 ALIGN;
+extern spinor * field_buffer_x4 ALIGN;
+extern spinor * field_buffer_t ALIGN;
+extern spinor * field_buffer_t2 ALIGN;
+extern spinor * field_buffer_t3 ALIGN;
+extern spinor * field_buffer_t4 ALIGN;
+
 extern halfspinor * halffield_buffer_z ALIGN;
 extern halfspinor * halffield_buffer_z2 ALIGN;
 # endif
diff --git a/offline_measurement.c b/offline_measurement.c
new file mode 100644
index 000000000..da76a652c
--- /dev/null
+++ b/offline_measurement.c
@@ -0,0 +1,405 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2012 Carsten Urbach, Albert Deuzeman, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Offline version of the online measurements for twisted mass QCD
+ *
+ *******************************************************************************/
+
+#define MAIN_PROGRAM
+
+#include"lime.h"
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <string.h>
+#include <signal.h>
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include "global.h"
+#include "git_hash.h"
+#include "getopt.h"
+#include "linalg_eo.h"
+#include "geometry_eo.h"
+#include "start.h"
+#include "measure_gauge_action.h"
+#ifdef TM_USE_MPI
+#include "xchange/xchange.h"
+#endif
+#include <io/utils.h>
+#include "read_input.h"
+#include "mpi_init.h"
+#include "sighandler.h"
+#include "boundary.h"
+#include "solver/solver.h"
+#include "init/init.h"
+#include "invert_eo.h"
+#include "monomial/monomial.h"
+#include "ranlxd.h"
+#include "phmc.h"
+#include "operator/D_psi.h"
+#include "little_D.h"
+#include "reweighting_factor.h"
+#include "linalg/convert_eo_to_lexic.h"
+#include "block.h"
+#include "operator.h"
+#include "sighandler.h"
+#include "solver/dfl_projector.h"
+#include "solver/generate_dfl_subspace.h"
+#include "prepare_source.h"
+#include <io/params.h>
+#include <io/gauge.h>
+#include <io/spinor.h>
+#include <io/utils.h>
+#include "solver/dirac_operator_eigenvectors.h"
+#include "P_M_eta.h"
+#include "operator/tm_operators.h"
+#include "operator/Dov_psi.h"
+#include "gettime.h"
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
+#ifdef TM_USE_QPHIX
+#  include "qphix_interface.h"
+#endif
+#ifdef DDalphaAMG
+#  include "DDalphaAMG_interface.h"
+#endif
+#include "meas/measurements.h"
+
+#define CONF_FILENAME_LENGTH 500
+
+extern int nstore;
+int check_geometry();
+
+static void usage();
+static void process_args(int argc, char *argv[], char ** input_filename, char ** filename);
+static void set_default_filenames(char ** input_filename, char ** filename);
+
+/* Driver for offline measurements: reads the input file, allocates gauge and spinor fields,
+ * sets up geometry, operators and measurements, then for each of Nmeas configurations
+ * (<gauge_input_filename>.<nstore>, nstore advancing by Nsave) reads the gauge field,
+ * prints the plaquette and runs every entry of measurement_list. Returns 0 on success. */
+int main(int argc, char *argv[])
+{
+  FILE *parameterfile = NULL;
+  int j, i;
+  char datafilename[206];
+  char parameterfilename[206];
+  char conf_filename[CONF_FILENAME_LENGTH];
+  char * input_filename = NULL;
+  char * filename = NULL;
+  double plaquette_energy;
+
+#ifdef _KOJAK_INST
+#pragma pomp inst init
+#pragma pomp inst begin(main)
+#endif
+
+#if (defined SSE || defined SSE2 || defined SSE3)
+  signal(SIGILL, &catch_ill_inst);
+#endif
+
+  DUM_DERI = 8;
+  DUM_MATRIX = DUM_DERI + 5;
+  NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
+
+  verbose = 0;
+  g_use_clover_flag = 0;
+
+  process_args(argc,argv,&input_filename,&filename);
+  set_default_filenames(&input_filename, &filename);
+  init_parallel_and_read_input(argc, argv, input_filename);
+
+  /* g_dflgcr_flag switches even/odd off; Nsave == 0 would never advance nstore in the loop below */
+  if (g_dflgcr_flag == 1) {
+    even_odd_flag = 0;
+  }
+  if (Nsave == 0) {
+    Nsave = 1;
+  }
+
+  if (g_running_phmc) {
+    NO_OF_SPINORFIELDS = DUM_MATRIX + 8;
+  }
+
+  tmlqcd_mpi_init(argc, argv);
+
+  /* starts the single and double precision random number */
+  /* generator                                            */
+  start_ranlux(rlxd_level, random_seed^nstore);
+
+  /* we need to make sure that we don't have even_odd_flag = 1 */
+  /* if any of the operators doesn't use it                    */
+  /* in this way even/odd can still be used by other operators */
+  for(j = 0; j < no_operators; j++) if(!operator_list[j].even_odd_flag) even_odd_flag = 0;
+
+#ifndef TM_USE_MPI
+  g_dbw2rand = 0;
+#endif
+
+#ifdef _GAUGE_COPY
+  j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
+#else
+  j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
+#endif
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
+    exit(-1);
+  }
+  j = init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for geometry indices! Aborting...\n");
+    exit(-1);
+  }
+  if (no_monomials > 0) {
+    if (even_odd_flag) {
+      j = init_monomials(VOLUMEPLUSRAND / 2, even_odd_flag);
+    }
+    else {
+      j = init_monomials(VOLUMEPLUSRAND, even_odd_flag);
+    }
+    if (j != 0) {
+      fprintf(stderr, "Not enough memory for monomial pseudo fermion fields! Aborting...\n");
+      exit(-1);
+    }
+  }
+  if (even_odd_flag) {
+    j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS);
+  }
+  else {
+    j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
+  }
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
+    exit(-1);
+  }
+
+  if (g_running_phmc) {
+    j = init_chi_spinor_field(VOLUMEPLUSRAND / 2, 20);
+    if (j != 0) {
+      fprintf(stderr, "Not enough memory for PHMC Chi fields! Aborting...\n");
+      exit(-1);
+    }
+  }
+
+  g_mu = g_mu1;
+
+  if (g_cart_id == 0) {
+    /* build <filename>.data and <filename>.para; the stem is cut at 200 characters so both fit */
+    snprintf(datafilename, sizeof(datafilename),
+             "%.200s.data", filename);
+    snprintf(parameterfilename, sizeof(parameterfilename),
+             "%.200s.para", filename);
+
+    parameterfile = fopen(parameterfilename, "w");
+    if (parameterfile == NULL) {
+      fprintf(stderr, "Could not open %s for writing! Aborting...\n", parameterfilename);
+      exit(-1);
+    }
+    write_first_messages(parameterfile, "invert", git_hash);
+    fclose(parameterfile);
+  }
+
+  /* define the geometry */
+  geometry();
+  int status = check_geometry();
+
+  if (status != 0) {
+    fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n");
+    exit(1);
+  }
+
+  /* define the boundary conditions for the fermion fields */
+  boundary(g_kappa);
+
+  phmc_invmaxev = 1.;
+
+  init_operators();
+
+  /* list and initialize measurements*/
+  if(g_proc_id == 0) {
+    printf("\n");
+    for(int m = 0; m < no_measurements; m++) {
+      printf("# measurement id %d, type = %d\n", m, measurement_list[m].type);
+    }
+  }
+  init_measurements();
+
+  /* this could be maybe moved to init_operators */
+#ifdef _USE_HALFSPINOR
+  j = init_dirac_halfspinor();
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
+    exit(-1);
+  }
+  if (g_sloppy_precision_flag == 1) {
+    j = init_dirac_halfspinor32();
+    if (j != 0) {
+      fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
+      exit(-1);
+    }
+  }
+#  if (defined _PERSISTENT)
+  if (even_odd_flag)
+    init_xchange_halffield();
+#  endif
+#endif
+
+  for (j = 0; j < Nmeas; j++) {
+    int n_written = snprintf(conf_filename, CONF_FILENAME_LENGTH, "%s.%.4d", gauge_input_filename, nstore);
+    if( n_written < 0 || n_written >= CONF_FILENAME_LENGTH ){  /* == LENGTH already means truncation */
+      char error_message[500];
+      snprintf(error_message,
+               500,
+               "Encoding error or gauge configuration filename "
+               "longer than %d characters! See offline_measurement.c CONF_FILENAME_LENGTH\n", 
+               CONF_FILENAME_LENGTH);
+      fatal_error(error_message, "offline_measurement.c");
+    }
+    if (g_cart_id == 0) {
+      printf("#\n# Trying to read gauge field from file %s in %s precision.\n",
+            conf_filename, (gauge_precision_read_flag == 32 ? "single" : "double"));
+      fflush(stdout);
+    }
+    if( (i = read_gauge_field(conf_filename,g_gauge_field)) !=0) {
+      fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename);
+      exit(-2);
+    }
+
+    if (g_cart_id == 0) {
+      printf("# Finished reading gauge field.\n");
+      fflush(stdout);
+    }
+
+#ifdef TM_USE_MPI
+    xchange_gauge(g_gauge_field);
+#endif
+
+    /*compute the energy of the gauge field*/
+    plaquette_energy = measure_plaquette( (const su3** const) g_gauge_field);
+
+    if (g_cart_id == 0) {
+      printf("# The computed plaquette value is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc));
+      fflush(stdout);
+    }
+
+    if (g_cart_id == 0) {
+      fprintf(stdout, "#\n"); /*Indicate starting of the operator part*/
+    }
+
+    /* offline measurements */
+    measurement * meas;
+    for(int imeas = 0; imeas < no_measurements; imeas++){
+      meas = &measurement_list[imeas];
+      if (g_proc_id == 0) {
+        fprintf(stdout, "#\n# Beginning offline measurement.\n");
+      }
+      meas->measurefunc(nstore, imeas, even_odd_flag);
+    }      
+    nstore += Nsave;
+  }
+
+#ifdef TM_USE_OMP
+  free_omp_accumulators();
+#endif
+#ifdef TM_USE_QUDA
+  _endQuda();
+#endif
+
+  free_blocks();
+  free_dfl_subspace();
+  free_geometry_indices();
+  free_spinor_field();
+
+  free_chi_spinor_field();
+
+  free(filename);
+  free(input_filename);
+
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Finalize();
+#endif
+  return(0);
+#ifdef _KOJAK_INST
+#pragma pomp inst end(main)
+#endif
+}
+
+/* Print the command-line help to stdout and terminate the program with exit status 0. */
+static void usage()
+{
+  fprintf(stdout, "Offline version of the online measurements for twisted mass QCD\n");
+  fprintf(stdout, "Version %s \n\nPlease send bug reports to %s\n", PACKAGE_VERSION, PACKAGE_BUGREPORT);
+  fprintf(stdout, "Usage:   offline_measurement [options]\n");
+  fprintf(stdout, "Options: [-f input-filename]\n");
+  fprintf(stdout, "         [-v] more verbosity\n");
+  fprintf(stdout, "         [-h|-? this help]\n");
+  fprintf(stdout, "         [-V] print version information and exit\n");
+  exit(0);
+}
+
+/* Parse argv: -f <name> (heap copy in *input_filename), -v, -V; other options print the usage text. */
+static void process_args(int argc, char *argv[], char ** input_filename, char ** filename) {
+  int c;
+  while ((c = getopt(argc, argv, "h?vVf:o:")) != -1) {
+    switch (c) {
+      case 'f':
+        *input_filename = calloc(200, sizeof(char));
+        if (*input_filename == NULL) {
+          fprintf(stderr, "Not enough memory for the input filename! Aborting...\n");
+          exit(-1);
+        }
+        snprintf(*input_filename, 200, "%s", optarg);  /* truncates, but always NUL-terminates */
+        break;
+      case 'v':
+        verbose = 1;
+        break;
+      case 'V':
+        if(g_proc_id == 0) fprintf(stdout,"%s %s\n",PACKAGE_STRING,git_hash);
+        exit(0);
+      case 'h':
+      case '?':
+      default:  /* also reached for -o: getopt accepts it but no case stores *filename */
+        if( g_proc_id == 0 ) usage();
+        break;
+    }
+  }
+}
+
+/* Give every name that is still NULL a heap-allocated default; names already set are left alone. */
+static void set_default_filenames(char ** input_filename, char ** filename) {
+  char ** const slot[2] = { input_filename, filename };
+  const char * const fallback[2] = { "offline_measurement.input", "output" };
+  for (int k = 0; k < 2; ++k) {
+    if (*slot[k] != NULL) continue;  /* already chosen, e.g. through -f */
+    *slot[k] = calloc(strlen(fallback[k]) + 1, sizeof(char));  /* sized from the string itself */
+    if (*slot[k] == NULL) { fprintf(stderr, "Not enough memory for file names! Aborting...\n"); exit(-1); }
+    strcpy(*slot[k], fallback[k]);
+  }
+}
+
diff --git a/omp_accumulator.h b/omp_accumulator.h
new file mode 100644
index 000000000..6474efaac
--- /dev/null
+++ b/omp_accumulator.h
@@ -0,0 +1,205 @@
+/***********************************************************************
+ * Copyright (C) 2018 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef OMP_ACCUMULATOR_H
+#define OMP_ACCUMULATOR_H
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP
+#include <omp.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include "global.h"
+
+/* Debug-only sanity checks (plain assert, so they vanish under NDEBUG); each expands with its own ';'. */
+#define TM_CHECK_INIT_OMP_ACC(x) assert( (x)->init == 1 );
+#define TM_CHECK_BOUNDS_OMP_ACC(x, count) assert( (x)->num_values == (count) );
+/* The executing team must have exactly (x)->num_threads threads (1 without OpenMP). */
+#ifdef TM_USE_OMP
+#define TM_CHECK_NUM_THREADS_OMP_ACC(x) assert( (x)->num_threads == omp_get_num_threads() );
+#else
+#define TM_CHECK_NUM_THREADS_OMP_ACC(x) assert( (x)->num_threads == 1 ); 
+#endif
+/* Passes only outside a parallel region; expands to nothing without OpenMP. */
+#ifdef TM_USE_OMP
+#define TM_CHECK_SINGLE_THREAD assert( omp_in_parallel() != 1 );
+#else
+#define TM_CHECK_SINGLE_THREAD
+#endif
+
+typedef struct omp_re_acc_t {  /* per-thread accumulator for double values */
+  double * mem;   /* one allocation holding num_threads*num_values doubles */
+  double ** acc;  /* acc[thread] points at that thread's num_values-long slice of mem */
+
+  int num_threads;  /* number of per-thread slices, set by omp_re_acc_init */
+  int num_values;   /* values accumulated per thread */
+
+  int init;  /* 1 after omp_re_acc_init, 0 after new_omp_re_acc or omp_re_acc_free */
+
+} omp_re_acc_t;
+
+typedef struct omp_cplx_acc_t {  /* per-thread accumulator for complex double values */
+  complex double * mem;   /* one allocation holding num_threads*num_values elements */
+  complex double ** acc;  /* acc[thread] points at that thread's num_values-long slice of mem */
+
+  int num_threads;  /* number of per-thread slices, set by omp_cplx_acc_init */
+  int num_values;   /* values accumulated per thread */
+
+  int init;  /* 1 after omp_cplx_acc_init, 0 after new_omp_cplx_acc or omp_cplx_acc_free */
+
+} omp_cplx_acc_t;
+
+/* Return an accumulator in the empty state: both pointers NULL, the counters and the
+ * init flag zero. Nothing is allocated here. */
+static inline omp_re_acc_t new_omp_re_acc(void) {
+  omp_re_acc_t blank;
+  blank.num_values = blank.num_threads = blank.init = 0;
+  blank.acc = NULL;
+  blank.mem = NULL;
+  return blank;
+}
+
+/* Return a complex accumulator in the empty state: both pointers NULL, the counters and
+ * the init flag zero. Nothing is allocated here. */
+static inline omp_cplx_acc_t new_omp_cplx_acc(void) {
+  omp_cplx_acc_t blank;
+  blank.num_values = blank.num_threads = blank.init = 0;
+  blank.acc = NULL;
+  blank.mem = NULL;
+  return blank;
+}
+
+static inline void omp_re_acc_free(omp_re_acc_t * const acc ){ /* release both buffers, clear init */
+  free( acc->mem ); acc->mem = NULL; /* cleared so a repeated call cannot double-free */
+  free( acc->acc ); acc->acc = NULL;
+  acc->init = 0;
+}
+
+static inline void omp_cplx_acc_free(omp_cplx_acc_t * const acc ){ /* release both buffers, clear init */
+  free( acc->mem ); acc->mem = NULL; /* cleared so a repeated call cannot double-free */
+  free( acc->acc ); acc->acc = NULL;
+  acc->init = 0;
+}
+
+static inline void omp_re_acc_reset(omp_re_acc_t * const acc){ /* zero the values of every thread */
+  TM_CHECK_SINGLE_THREAD;       /* asserts we are outside a parallel region */
+  TM_CHECK_INIT_OMP_ACC(acc);   /* asserts the buffers exist */
+  memset(acc->mem, 0, sizeof(double)*(size_t)(acc->num_threads*acc->num_values));
+}
+
+static inline void omp_cplx_acc_reset(omp_cplx_acc_t * const acc){ /* zero the values of every thread */
+  TM_CHECK_SINGLE_THREAD;       /* asserts we are outside a parallel region */
+  TM_CHECK_INIT_OMP_ACC(acc);   /* asserts the buffers exist */
+  memset(acc->mem, 0, sizeof(complex double)*(size_t)(acc->num_threads*acc->num_values));
+}
+
+/* (Re)allocate acc for num_values values per thread, all zero; aborts if the allocation fails. */
+static inline void omp_re_acc_init(omp_re_acc_t * const acc, const int num_values) {
+  TM_CHECK_SINGLE_THREAD;
+  if( acc->init != 0 ) omp_re_acc_free(acc);  /* drop the buffers of an earlier init */
+#ifdef TM_USE_OMP
+  acc->num_threads = omp_num_threads;
+#else
+  acc->num_threads = 1;
+#endif
+  acc->num_values = num_values;
+  acc->acc = (double**) calloc( acc->num_threads, sizeof( double* ) );
+  acc->mem = (double*) calloc( acc->num_threads*acc->num_values, sizeof(double) );  /* zero-filled */
+  if( acc->num_threads > 0 && ( acc->acc == NULL || ( acc->mem == NULL && num_values != 0 ) ) ){
+    fprintf(stderr, "omp_re_acc_init: no memory for %d x %d values! Aborting...\n", acc->num_threads, num_values);
+    exit(-1);
+  }
+  for( int t = 0; t < acc->num_threads; ++t ) acc->acc[t] = acc->mem + t*acc->num_values;
+  acc->init = 1;
+}
+
+/* (Re)allocate acc for num_values values per thread, all zero; aborts if the allocation fails. */
+static inline void omp_cplx_acc_init(omp_cplx_acc_t * const acc, const int num_values) {
+  TM_CHECK_SINGLE_THREAD;
+  if( acc->init != 0 ) omp_cplx_acc_free(acc);  /* drop the buffers of an earlier init */
+#ifdef TM_USE_OMP
+  acc->num_threads = omp_num_threads;
+#else
+  acc->num_threads = 1;
+#endif
+  acc->num_values = num_values;
+  acc->acc = (complex double**) calloc( acc->num_threads, sizeof( complex double* ) );
+  acc->mem = (complex double*) calloc( acc->num_threads*acc->num_values, sizeof(complex double) );
+  if( acc->num_threads > 0 && ( acc->acc == NULL || ( acc->mem == NULL && num_values != 0 ) ) ){
+    fprintf(stderr, "omp_cplx_acc_init: no memory for %d x %d values! Aborting...\n", acc->num_threads, num_values);
+    exit(-1);
+  }
+  for( int t = 0; t < acc->num_threads; ++t ) acc->acc[t] = acc->mem + t*acc->num_values;
+  acc->init = 1;
+}
+
+/* Add values[0..num_values) element-wise onto the calling thread's slice of acc. */
+static inline void omp_re_acc_add(omp_re_acc_t * const acc, const double * const values, const int num_values){
+  TM_CHECK_INIT_OMP_ACC(acc);
+  TM_CHECK_BOUNDS_OMP_ACC(acc, num_values);
+  TM_CHECK_NUM_THREADS_OMP_ACC(acc);
+#ifdef TM_USE_OMP
+  const int row = omp_get_thread_num();
+#else
+  const int row = 0;
+#endif
+  for(int k = 0; k < num_values; ++k) acc->acc[row][k] += values[k];
+}
+
+/* Add values[0..num_values) element-wise onto the calling thread's slice of acc. */
+static inline void omp_cplx_acc_add(omp_cplx_acc_t * const acc, const complex double * const values, const int num_values){
+  TM_CHECK_INIT_OMP_ACC(acc);
+  TM_CHECK_BOUNDS_OMP_ACC(acc, num_values);
+  TM_CHECK_NUM_THREADS_OMP_ACC(acc);
+#ifdef TM_USE_OMP
+  const int row = omp_get_thread_num();
+#else
+  const int row = 0;
+#endif
+  for(int k = 0; k < num_values; ++k) acc->acc[row][k] += values[k];
+}
+
+/* Add every thread's partial sums onto ret[0..num_values); ret is not cleared first. */
+static inline void omp_re_acc_reduce(double * const ret, omp_re_acc_t * const acc){
+  TM_CHECK_SINGLE_THREAD;
+  TM_CHECK_INIT_OMP_ACC(acc);
+  for(int t = 0; t < acc->num_threads; ++t){
+    const double * const row = acc->acc[t];
+    for(int k = 0; k < acc->num_values; ++k) ret[k] += row[k];
+  }
+}
+
+/* Add every thread's partial sums onto ret[0..num_values); ret is not cleared first. */
+static inline void omp_cplx_acc_reduce(complex double * const ret, omp_cplx_acc_t * const acc){
+  TM_CHECK_SINGLE_THREAD;
+  TM_CHECK_INIT_OMP_ACC(acc);
+  for(int t = 0; t < acc->num_threads; ++t){
+    const complex double * const row = acc->acc[t];
+    for(int k = 0; k < acc->num_values; ++k) ret[k] += row[k];
+  }
+}
+
+
+#endif  // header guard
diff --git a/online_measurement.c b/online_measurement.c
deleted file mode 100644
index 151597868..000000000
--- a/online_measurement.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/***********************************************************************
- *
- * Copyright (C) 2008 Carsten Urbach
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- * 
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- ***********************************************************************/
-
-#ifdef HAVE_CONFIG_H
-# include<config.h>
-#endif
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <time.h>
-#include "global.h"
-#include "start.h"
-#include "ranlxs.h"
-#include "su3spinor.h"
-#include "source_generation.h"
-#include "operator.h"
-#include "invert_eo.h"
-#include "solver/solver.h"
-#include "geometry_eo.h"
-#include "linalg/convert_eo_to_lexic.h"
-#include "measurements.h"
-#include "online_measurement.h"
-#include "gettime.h"
-
-
-/******************************************************
- *
- * This routine computes the correlators
- * <PP>, <PA> and <PV> (<source sink>)
- * using a stochastic time slice source
- * and only one inversion (actually A_0)
- * 
- * for <AP> we would need another inversion
- *
- *
- *
- ******************************************************/
-
-void online_measurement(const int traj, const int id, const int ieo) {
-  int i, j, t, tt, t0;
-  double *Cpp = NULL, *Cpa = NULL, *Cp4 = NULL;
-  double res = 0., respa = 0., resp4 = 0.;
-  double atime, etime;
-  float tmp;
-  operator * optr;
-#ifdef MPI
-  double mpi_res = 0., mpi_respa = 0., mpi_resp4 = 0.;
-  // send buffer for MPI_Gather
-  double *sCpp = NULL, *sCpa = NULL, *sCp4 = NULL;
-#endif
-  FILE *ofs;
-  char *filename;
-  char buf[100];
-  spinor phi;
-  filename=buf;
-  sprintf(filename,"%s%.6d", "onlinemeas." ,traj);
-
-  init_operators();
-  if(no_operators < 1) {
-    if(g_proc_id == 0) {
-      fprintf(stderr, "Warning! no operators defined in input file, cannot perform online correlator mesurements!\n");
-    }
-    return;
-  }
-  if(no_operators > 1 && g_proc_id == 0) {
-    fprintf(stderr, "Warning! number of operators defined larger than 1, using only the first!\n");
-  }
-  optr = &operator_list[0];
-  // we don't want to do inversion twice for this purpose here
-  optr->DownProp = 0;
-  if(optr->type != TMWILSON && optr->type != WILSON && optr->type != CLOVER) {
-    if(g_proc_id == 0) {
-      fprintf(stderr, "Warning! correlator online measurement currently only implemented for TMWILSON, WILSON and CLOVER\n");
-      fprintf(stderr, "Cannot perform online measurement!\n");
-    }
-    return;
-  }
-
-  /* generate random timeslice */
-  if(ranlxs_init == 0) {
-    rlxs_init(1, 123456);
-  }
-  ranlxs(&tmp, 1);
-  t0 = (int)(measurement_list[id].max_source_slice*tmp);
-#ifdef MPI
-  MPI_Bcast(&t0, 1, MPI_INT, 0, MPI_COMM_WORLD);
-#endif
-  if(g_debug_level > 1 && g_proc_id == 0) {
-    printf("# timeslice set to %d (T=%d) for online measurement\n", t0, g_nproc_t*T);
-    printf("# online measurements parameters: kappa = %g, mu = %g\n", optr->kappa, optr->mu/2./optr->kappa);
-  }
-  atime = gettime();
-
-#ifdef MPI
-  sCpp = (double*) calloc(T, sizeof(double));
-  sCpa = (double*) calloc(T, sizeof(double));
-  sCp4 = (double*) calloc(T, sizeof(double));
-  if(g_mpi_time_rank == 0) {
-    Cpp = (double*) calloc(g_nproc_t*T, sizeof(double));
-    Cpa = (double*) calloc(g_nproc_t*T, sizeof(double));
-    Cp4 = (double*) calloc(g_nproc_t*T, sizeof(double));
-  }
-#else
-  Cpp = (double*) calloc(T, sizeof(double));
-  Cpa = (double*) calloc(T, sizeof(double));
-  Cp4 = (double*) calloc(T, sizeof(double));
-#endif
-  source_generation_pion_only(g_spinor_field[0], g_spinor_field[1], 
-			      t0, 0, traj);
-  optr->sr0 = g_spinor_field[0];
-  optr->sr1 = g_spinor_field[1];
-  optr->prop0 = g_spinor_field[2];
-  optr->prop1 = g_spinor_field[3];
-
-  // op_id = 0, index_start = 0, write_prop = 0
-  optr->inverter(0, 0, 0);
-
-  /* now we bring it to normal format */
-  /* here we use implicitly DUM_MATRIX and DUM_MATRIX+1 */
-  convert_eo_to_lexic(g_spinor_field[DUM_MATRIX], g_spinor_field[2], g_spinor_field[3]);
-  
-  /* now we sum only over local space for every t */
-  for(t = 0; t < T; t++) {
-    j = g_ipt[t][0][0][0];
-    res = 0.;
-    respa = 0.;
-    resp4 = 0.;
-    for(i = j; i < j+LX*LY*LZ; i++) {
-      res += _spinor_prod_re(g_spinor_field[DUM_MATRIX][i], g_spinor_field[DUM_MATRIX][i]);
-      _gamma0(phi, g_spinor_field[DUM_MATRIX][i]);
-      respa += _spinor_prod_re(g_spinor_field[DUM_MATRIX][i], phi);
-      _gamma5(phi, phi);
-      resp4 += _spinor_prod_im(g_spinor_field[DUM_MATRIX][i], phi);
-    }
-
-#if defined MPI
-    MPI_Reduce(&res, &mpi_res, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices);
-    res = mpi_res;
-    MPI_Reduce(&respa, &mpi_respa, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices);
-    respa = mpi_respa;
-    MPI_Reduce(&resp4, &mpi_resp4, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices);
-    resp4 = mpi_resp4;
-    sCpp[t] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
-    sCpa[t] = -respa/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
-    sCp4[t] = +resp4/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
-#else
-    Cpp[t] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
-    Cpa[t] = -respa/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
-    Cp4[t] = +resp4/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa;
-#endif
-  }
-
-#ifdef MPI
-  /* some gymnastics needed in case of parallelisation */
-  if(g_mpi_time_rank == 0) {
-    MPI_Gather(sCpp, T, MPI_DOUBLE, Cpp, T, MPI_DOUBLE, 0, g_mpi_SV_slices);
-    MPI_Gather(sCpa, T, MPI_DOUBLE, Cpa, T, MPI_DOUBLE, 0, g_mpi_SV_slices);
-    MPI_Gather(sCp4, T, MPI_DOUBLE, Cp4, T, MPI_DOUBLE, 0, g_mpi_SV_slices);
-  }
-#endif
-
-  /* and write everything into a file */
-  if(g_mpi_time_rank == 0 && g_proc_coords[0] == 0) {
-    ofs = fopen(filename, "w");
-    fprintf( ofs, "1  1  0  %e  %e\n", Cpp[t0], 0.);
-    for(t = 1; t < g_nproc_t*T/2; t++) {
-      tt = (t0+t)%(g_nproc_t*T);
-      fprintf( ofs, "1  1  %d  %e  ", t, Cpp[tt]);
-      tt = (t0+g_nproc_t*T-t)%(g_nproc_t*T);
-      fprintf( ofs, "%e\n", Cpp[tt]);
-    }
-    tt = (t0+g_nproc_t*T/2)%(g_nproc_t*T);
-    fprintf( ofs, "1  1  %d  %e  %e\n", t, Cpp[tt], 0.);
-
-    fprintf( ofs, "2  1  0  %e  %e\n", Cpa[t0], 0.);
-    for(t = 1; t < g_nproc_t*T/2; t++) {
-      tt = (t0+t)%(g_nproc_t*T);
-      fprintf( ofs, "2  1  %d  %e  ", t, Cpa[tt]);
-      tt = (t0+g_nproc_t*T-t)%(g_nproc_t*T);
-      fprintf( ofs, "%e\n", Cpa[tt]);
-    }
-    tt = (t0+g_nproc_t*T/2)%(g_nproc_t*T);
-    fprintf( ofs, "2  1  %d  %e  %e\n", t, Cpa[tt], 0.);
-
-    fprintf( ofs, "6  1  0  %e  %e\n", Cp4[t0], 0.);
-    for(t = 1; t < g_nproc_t*T/2; t++) {
-      tt = (t0+t)%(g_nproc_t*T);
-      fprintf( ofs, "6  1  %d  %e  ", t, Cp4[tt]);
-      tt = (t0+g_nproc_t*T-t)%(g_nproc_t*T);
-      fprintf( ofs, "%e\n", Cp4[tt]);
-    }
-    tt = (t0+g_nproc_t*T/2)%(g_nproc_t*T);
-    fprintf( ofs, "6  1  %d  %e  %e\n", t, Cp4[tt], 0.);
-    fclose(ofs);
-  }
-#ifdef MPI
-  if(g_mpi_time_rank == 0) {
-    free(Cpp); free(Cpa); free(Cp4);
-  }
-  free(sCpp); free(sCpa); free(sCp4);
-#else
-  free(Cpp); free(Cpa); free(Cp4);
-#endif
-  etime = gettime();
-  
-  if(g_proc_id == 0 && g_debug_level > 0) {
-    printf("ONLINE: measurement done int t/s = %1.4e\n", etime - atime);
-  }
-  return;
-}
diff --git a/operator.c b/operator.c
index 92e800ce7..0b39df110 100644
--- a/operator.c
+++ b/operator.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -27,7 +27,7 @@
 #include <math.h>
 #include <errno.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -37,10 +37,13 @@
 #include "operator/tm_operators.h"
 #include "linalg_eo.h"
 #include "operator/D_psi.h"
+#if defined TM_USE_BSM
 #include "operator/D_psi_BSM.h"
 #include "operator/D_psi_BSM2b.h"
 #include "operator/D_psi_BSM2f.h"
 #include "operator/D_psi_BSM2m.h"
+#include "operator/D_psi_BSM3.h"
+#endif
 #include "operator/Dov_psi.h"
 #include "operator/tm_operators_nd.h"
 #include "operator/Hopping_Matrix.h"
@@ -48,8 +51,10 @@
 #include "invert_doublet_eo.h"
 #include "invert_overlap.h"
 #include "invert_clover_eo.h"
+#if TM_USE_BSM
 #include "init/init_scalar_field.h"
 #include "init/init_bsm_2hop_lookup.h"
+#endif
 #include "boundary.h"
 #include "start.h"
 #include "solver/eigenvalues.h"
@@ -60,17 +65,26 @@
 #include <io/utils.h>
 #include "test/overlaptests.h"
 #include "solver/index_jd.h"
+#include "little_D.h"
 #include "operator/clovertm_operators.h"
+#include "operator/clovertm_operators_32.h"
 #include "operator/clover_leaf.h"
 #include "operator.h"
 #include "gettime.h"
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
+#ifdef DDalphaAMG
+#  include "DDalphaAMG_interface.h"
+#endif
 
 
 void dummy_D(spinor * const, spinor * const);
+void dummy_Mee(spinor * const, spinor * const, double const);
+void dummy_M(spinor * const, spinor * const, spinor * const, spinor * const);
 void dummy_DbD(spinor * const s, spinor * const r, spinor * const p, spinor * const q);
 void op_invert(const int op_id, const int index_start, const int write_prop);
 void op_write_prop(const int op_id, const int index_start, const int append_);
-
 operator operator_list[max_no_operators];
 
 int no_operators = 0;
@@ -86,7 +100,11 @@ int add_operator(const int type) {
   optr->kappa = _default_g_kappa;
   optr->mu = _default_g_mu;
   optr->c_sw = _default_c_sw;
-  optr->sloppy_precision = _default_g_sloppy_precision_flag;
+  optr->sloppy_precision = _default_operator_sloppy_precision_flag;
+  optr->compression_type = _default_compression_type;
+  optr->external_inverter = _default_external_inverter;
+  optr->solver_params.solution_type = TM_SOLUTION_M;
+  optr->solver_params.no_shifts = 1;
   optr->coefs = NULL;
   optr->rel_prec = _default_g_relative_precision_flag;
   optr->eps_sq = _default_solver_precision;
@@ -105,16 +123,22 @@ int add_operator(const int type) {
   optr->prop3 = NULL;
   optr->error_code = 0;
   optr->prop_precision = _default_prop_precision_flag;
+  optr->write_prop_flag = _default_write_prop_flag;
   optr->no_flavours = 1;
   optr->DownProp = 0;
   optr->conf_input = _default_gauge_input_filename;
   optr->no_extra_masses = 0;
 
+#if defined TM_USE_BSM
   optr->npergauge = 1;
+  optr->nscalarstep = 1;
   optr->n = 0;
-
-  optr->applyM = &dummy_D;
-  optr->applyQ = &dummy_D;
+#endif
+  optr->applyM = &dummy_M;
+  optr->applyQ = &dummy_M;
+  optr->applyMee = &dummy_Mee;
+  optr->applyMeeInv = &dummy_Mee;
+  (optr->solver_params).mcg_delta = _default_mixcg_innereps;
   optr->applyQp = &dummy_D;
   optr->applyQm = &dummy_D;
   optr->applyMp = &dummy_D;
@@ -138,10 +162,17 @@ int add_operator(const int type) {
     optr->m = 0.;
     optr->inverter = &op_invert;
   }
-  if(optr->type == DBTMWILSON || optr->type == DBCLOVER || optr->type == BSM || optr->type == BSM2m || optr->type == BSM2b || optr->type == BSM2f ) {
-    optr->no_flavours = 2;
-    g_running_phmc = 1;
+  if(optr->type == DBTMWILSON || optr->type == DBCLOVER ){
+      optr->no_flavours = 2;
+      g_running_phmc = 1;
   }
+
+#if defined TM_USE_BSM
+    if ( optr->type == BSM || optr->type == BSM2m || optr->type == BSM2b || optr->type == BSM2f || optr->type == BSM3 ) {
+      optr->no_flavours = 2;
+      g_running_phmc = 1;
+    }
+#endif
   
   optr->precWS=NULL;
 
@@ -164,7 +195,11 @@ int init_operators() {
         if(optr->c_sw > 0) {
           init_sw_fields();
         }
+        optr->applyM = &M_full;
+        optr->applyQ = &Q_full;
         if(optr->even_odd_flag) {
+          optr->applyMee    = &Mee_psi;
+          optr->applyMeeInv = &Mee_inv_psi;
           optr->applyQp = &Qtm_plus_psi;
           optr->applyQm = &Qtm_minus_psi;
           optr->applyQsq = &Qtm_pm_psi;
@@ -176,33 +211,65 @@ int init_operators() {
           optr->applyQm = &Q_minus_psi;
           optr->applyQsq = &Q_pm_psi;
           optr->applyMp = &D_psi;
-          optr->applyMm = &D_psi;
+          optr->applyMm = &M_minus_psi;
         }
-        if(optr->solver == 12) {
-          if (g_cart_id == 0 && optr->even_odd_flag == 1)
-            fprintf(stderr, "CG Multiple mass solver works only without even/odd! Forcing!\n");
-          optr->even_odd_flag = 0;
+        if(optr->solver == CGMMS) {
+          if( optr->external_inverter != QPHIX_INVERTER ){
+            if (g_cart_id == 0 && optr->even_odd_flag == 1)
+              fprintf(stderr, "CG Multiple mass solver works only without even/odd! Forcing!\n");
+            optr->even_odd_flag = 0;
+          }
           if (g_cart_id == 0 && optr->DownProp)
             fprintf(stderr, "CGMMS doesn't need AddDownPropagator! Switching Off!\n");
           optr->DownProp = 0;
         }
-              
-        if(optr->solver == INCREIGCG){
-          if (g_cart_id == 0 && optr->DownProp){
-             fprintf(stderr,"Warning: When even-odd preconditioning is used, the eigenvalues for +mu and -mu will be little different\n");
-             fprintf(stderr,"Incremental EigCG solver will still work however.\n");
+        if(optr->solver == INCREIGCG) {
+          if (g_cart_id == 0 && optr->DownProp) {
+            fprintf(stderr,"Warning: When even-odd preconditioning is used, the eigenvalues for +mu and -mu will be little different\n");
+            fprintf(stderr,"Incremental EigCG solver will still work however.\n");
           }
-          
-
           if (g_cart_id == 0 && optr->even_odd_flag == 0)
-             fprintf(stderr,"Incremental EigCG solver is added only with Even-Odd preconditioning!. Forcing\n");
-          optr->even_odd_flag = 1; 
+            fprintf(stderr,"Incremental EigCG solver is added only with Even-Odd preconditioning!. Forcing\n");
+          optr->even_odd_flag = 1;
+        }
+      }
+      else if(optr->type == CLOVER) {
+        if(optr->c_sw > 0) {
+          init_sw_fields();
+        }
+        optr->applyM = &Msw_full;
+        optr->applyQ = &Qsw_full;
+        if(optr->even_odd_flag) {
+          optr->applyMee    = &Mee_sw_psi;
+          optr->applyMeeInv = &Mee_sw_inv_psi;
+          optr->applyQp = &Qsw_plus_psi;
+          optr->applyQm = &Qsw_minus_psi;
+          optr->applyQsq = &Qsw_pm_psi;
+          optr->applyMp = &Msw_plus_psi;
+          optr->applyMm = &Msw_minus_psi;
+        }
+        else {
+          optr->applyQp = &Qsw_full_plus_psi;
+          optr->applyQm = &Qsw_full_minus_psi;
+          optr->applyQsq = &Qsw_full_pm_psi;
+          optr->applyMp = &D_psi;
+          optr->applyMm = &Msw_full_minus_psi;
+        }
+        if(optr->solver == CGMMS) {
+          if( optr->external_inverter != QPHIX_INVERTER ){
+            if (g_cart_id == 0 && optr->even_odd_flag == 1)
+              fprintf(stderr, "CG Multiple mass solver works only without even/odd! Forcing!\n");
+            optr->even_odd_flag = 0;
+          }
+          if (g_cart_id == 0 && optr->DownProp)
+            fprintf(stderr, "CGMMS doesn't need AddDownPropagator! Switching Off!\n");
+          optr->DownProp = 0;
         }
       }
       else if(optr->type == OVERLAP) {
         optr->even_odd_flag = 0;
-        optr->applyM = &Dov_psi;
-        optr->applyQ = &Qov_psi;
+        optr->applyMp = &Dov_psi;
+        optr->applyQp = &Qov_psi;
       }
       else if(optr->type == DBTMWILSON) {
         optr->even_odd_flag = 1;
@@ -210,15 +277,16 @@ int init_operators() {
         /* TODO: this should be here!       */
         /* Chi`s-spinors  memory allocation */
         /*       if(init_chi_spinor_field(VOLUMEPLUSRAND/2, 20) != 0) { */
-        /*  fprintf(stderr, "Not enough memory for 20 NDPHMC Chi fields! Aborting...\n"); */
-        /*  exit(0); */
+        /*   fprintf(stderr, "Not enough memory for 20 NDPHMC Chi fields! Aborting...\n"); */
+        /*   exit(0); */
         /*       } */
       }
       else if(optr->type == DBCLOVER) {
         optr->even_odd_flag = 1;
-        optr->applyDbQsq = &Qtm_pm_ndpsi;
+        optr->applyDbQsq = &Qsw_pm_ndpsi;
       }
-      else if(optr->type == BSM || optr->type == BSM2b || optr->type == BSM2m || optr->type== BSM2f ) {
+#if defined TM_USE_BSM
+      else if(optr->type == BSM || optr->type == BSM2b || optr->type == BSM2m || optr->type== BSM2f || optr->type==BSM3) {
         // For the BSM operator we don't use kappa normalisation,
         // as a result, when twisted boundary conditions are applied this needs to be unity.
         // In addition, unlike in the Wilson case, the hopping term comes with a plus sign.
@@ -243,6 +311,10 @@ int init_operators() {
           optr->applyMbi    = &D_psi_BSM2f;
           optr->applyMdagbi = &D_psi_dagger_BSM2f;
           optr->applyQsqbi  = &Q2_psi_BSM2f;
+        } else if( optr->type == BSM3 ){
+          optr->applyMbi    = &D_psi_BSM3;
+          optr->applyMdagbi = &D_psi_dagger_BSM3;
+          optr->applyQsqbi  = &Q2_psi_BSM3;
         }
         // generate space for 4
         int j = init_scalar_field(VOLUMEPLUSRAND, 4);
@@ -251,7 +323,18 @@ int init_operators() {
           exit(0);
         }
       }
-    }
+#endif
+      if(optr->external_inverter==QUDA_INVERTER ) {
+#ifdef TM_USE_QUDA
+        _initQuda();
+#else
+        if(g_proc_id == 0) {
+          fprintf(stderr, "Error: You're trying to use QUDA but this build was not configured for QUDA usage.\n");
+          exit(-2);
+        }
+#endif
+      }
+    } /* loop over operators */
   }
   return(0);
 }
@@ -263,6 +346,21 @@ void dummy_D(spinor * const s, spinor * const r) {
   return;
 }
 
+/* Placeholder installed by add_operator in the applyMee/applyMeeInv slots. */
+/* It reads and writes none of its arguments; only the process with g_proc_id == 0 prints a warning. */
+void dummy_Mee(spinor * const s, spinor * const r, double const d) {
+  if(g_proc_id != 0) return;
+  fprintf(stderr, "dummy_Mee was called. Was that really intended?\n");
+}
+
+/* Placeholder installed by add_operator in the applyM/applyQ slots. */
+/* It reads and writes none of its four spinor arguments; only the process with g_proc_id == 0 prints a warning. */
+void dummy_M(spinor * const s, spinor * const r, spinor * const t, spinor * const k) {
+  if(g_proc_id != 0) return;
+  fprintf(stderr, "dummy_M was called. Was that really intended?\n");
+}
+
+
 void dummy_DbD(spinor * const s, spinor * const r, spinor * const p, spinor * const q) {
   if(g_proc_id == 0) {
     fprintf(stderr, "dummy_DbD was called. Was that really intended?\n");
@@ -275,55 +373,66 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
   double atime = 0., etime = 0., nrm1 = 0., nrm2 = 0.;
   int i;
   optr->iterations = 0;
-  optr->reached_prec = -1.;
-  g_kappa = optr->kappa;
+  optr->reached_prec = -1.; 
+  
+  op_backup_restore_globals(TM_BACKUP_GLOBALS);
+  op_set_globals(op_id);
   boundary(g_kappa);
-
+  
   atime = gettime();
   if(optr->type == TMWILSON || optr->type == WILSON || optr->type == CLOVER) {
-    g_mu = optr->mu;
-    g_c_sw = optr->c_sw;
     if(optr->type == CLOVER) {
       if (g_cart_id == 0 && g_debug_level > 1) {
-  printf("#\n# csw = %e, computing clover leafs\n", g_c_sw);
+        printf("#\n# csw = %.12f, computing clover leafs\n", g_c_sw);
       }
       init_sw_fields(VOLUME);
       sw_term( (const su3**) g_gauge_field, optr->kappa, optr->c_sw); 
     }
-
+    
+    // this loop is for +mu (i=0) and -mu (i=1)
+    // the latter if AddDownPropagator = yes is chosen
     for(i = 0; i < 2; i++) {
-      // we need this here again for the sign switch at i == 1
-      g_mu = optr->mu;
       if (g_cart_id == 0) {
-        printf("#\n# 2 kappa mu = %e, kappa = %e, c_sw = %e\n", g_mu, g_kappa, g_c_sw);
+        printf("#\n# 2 kappa mu = %.12f, kappa = %.12f, c_sw = %.12f\n", g_mu, g_kappa, g_c_sw);
+      }
+      if(i > 0) {
+        zero_spinor_field(optr->prop0, VOLUME/2);
+        zero_spinor_field(optr->prop1, VOLUME/2);
       }
       if(optr->type != CLOVER) {
-  if(use_preconditioning){
-    g_precWS=(void*)optr->precWS;
-  }
-  else {
-    g_precWS=NULL;
-  }
-  
-  optr->iterations = invert_eo( optr->prop0, optr->prop1, optr->sr0, optr->sr1,
-              optr->eps_sq, optr->maxiter,
-              optr->solver, optr->rel_prec,
-              0, optr->even_odd_flag,optr->no_extra_masses, optr->extra_masses, optr->solver_params, optr->id );
-  
-  /* check result */
-  M_full(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], optr->prop0, optr->prop1);
+        if(use_preconditioning){
+          g_precWS=(void*)optr->precWS;
+        }
+        else {
+          g_precWS=NULL;
+        }
+        optr->iterations = invert_eo( optr->prop0, optr->prop1, optr->sr0, optr->sr1,
+                                      optr->eps_sq, optr->maxiter,
+                                      optr->solver, optr->rel_prec,
+                                      0, optr->even_odd_flag,optr->no_extra_masses,
+                                      optr->extra_masses, optr->solver_params, optr->id,
+                                      optr->external_inverter, optr->sloppy_precision, optr->compression_type);
+
+        /* check result */
+        M_full(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], optr->prop0, optr->prop1);
       }
       else {
-  /* this must be EE here!   */
-  /* to match clover_inv in Qsw_psi */
-  sw_invert(EE, optr->mu);
-
-  optr->iterations = invert_clover_eo(optr->prop0, optr->prop1, optr->sr0, optr->sr1,
-              optr->eps_sq, optr->maxiter,
-              optr->solver, optr->rel_prec,optr->solver_params,
-              &g_gauge_field, &Qsw_pm_psi, &Qsw_minus_psi);
-  /* check result */
-  Msw_full(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], optr->prop0, optr->prop1);
+        /* this must be EE here!   */
+        /* to match clover_inv in Qsw_psi */
+        if(optr->even_odd_flag || optr->solver == DFLFGMRES || optr->solver == DFLGCR)
+          sw_invert(EE, g_mu); //this is needed only when we use even-odd preconditioning
+          
+        /* only now copy double sw and sw_inv fields to 32bit versions */
+        copy_32_sw_fields();
+        
+        optr->iterations = invert_clover_eo(optr->prop0, optr->prop1, optr->sr0, optr->sr1,
+                                            optr->eps_sq, optr->maxiter,
+                                            optr->solver, optr->rel_prec,
+                                            optr->even_odd_flag, optr->solver_params,
+                                            &g_gauge_field, optr->applyQsq, optr->applyQm,
+                                            optr->external_inverter, optr->sloppy_precision, optr->compression_type);
+        /* check result */
+        optr->applyM(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], optr->prop0, optr->prop1);
       }
 
       diff(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], optr->sr0, VOLUME / 2);
@@ -339,61 +448,55 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
         mul_r(optr->prop0, (2*optr->kappa), optr->prop0, VOLUME / 2);
         mul_r(optr->prop1, (2*optr->kappa), optr->prop1, VOLUME / 2);
       }
-      if (optr->solver != CGMMS && write_prop) /* CGMMS handles its own I/O */
+      /* CGMMS handles its own I/O */
+      if (optr->solver != CGMMS && write_prop) { 
         optr->write_prop(op_id, index_start, i);
+      }
       if(optr->DownProp) {
-        optr->mu = -optr->mu;
-      } else 
+        g_mu = -g_mu;
+        dfl_subspace_updated = 1;
+      } 
+      else 
         break;
     }
-  }
-  else if(optr->type == DBTMWILSON || optr->type == DBCLOVER) {
-    g_mubar = optr->mubar;
-    g_epsbar = optr->epsbar;
-    g_c_sw = 0.;
+  } else if(optr->type == DBTMWILSON || optr->type == DBCLOVER) {
     if(optr->type == DBCLOVER) {
-      g_c_sw = optr->c_sw;
       if (g_cart_id == 0 && g_debug_level > 1) {
-  printf("#\n# csw = %e, computing clover leafs\n", g_c_sw);
+        printf("#\n# csw = %e, computing clover leafs\n", g_c_sw);
       }
       init_sw_fields(VOLUME);
       sw_term( (const su3**) g_gauge_field, optr->kappa, optr->c_sw); 
       sw_invert_nd(optr->mubar*optr->mubar-optr->epsbar*optr->epsbar);
+      /* now copy double sw and sw_inv fields to 32bit versions */
+      copy_32_sw_fields();
     }
 
     for(i = 0; i < SourceInfo.no_flavours; i++) {
       if(optr->type != DBCLOVER) {
-  optr->iterations = invert_doublet_eo( optr->prop0, optr->prop1, optr->prop2, optr->prop3, 
-                optr->sr0, optr->sr1, optr->sr2, optr->sr3,
-                optr->eps_sq, optr->maxiter,
-                optr->solver, optr->rel_prec);
-      }
-      else {
-  optr->iterations = invert_cloverdoublet_eo( optr->prop0, optr->prop1, optr->prop2, optr->prop3, 
-                optr->sr0, optr->sr1, optr->sr2, optr->sr3,
-                optr->eps_sq, optr->maxiter,
-                optr->solver, optr->rel_prec);
-      }
-      g_mu = optr->mubar;
-      if(optr->type != DBCLOVER) {
-  M_full(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], optr->prop0, optr->prop1); 
-      }
-      else {
-  Msw_full(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], optr->prop0, optr->prop1); 
-      }
-      assign_add_mul_r(g_spinor_field[DUM_DERI+1], optr->prop2, -optr->epsbar, VOLUME/2);
-      assign_add_mul_r(g_spinor_field[DUM_DERI+2], optr->prop3, -optr->epsbar, VOLUME/2);
-
-      g_mu = -g_mu;
-      if(optr->type != DBCLOVER) {
-  M_full(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4], optr->prop2, optr->prop3); 
+        optr->iterations = invert_doublet_eo( optr->prop0, optr->prop1, optr->prop2, optr->prop3,
+                                              optr->sr0, optr->sr1, optr->sr2, optr->sr3,
+                                              optr->eps_sq, optr->maxiter,
+                                              optr->solver, optr->rel_prec,
+                                              optr->solver_params, optr->external_inverter, 
+                                              optr->sloppy_precision, optr->compression_type);
+        // checking solution
+        M_full_ndpsi( g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2],
+                      g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4],
+                      optr->prop0, optr->prop1, optr->prop2, optr->prop3 );
       }
       else {
-  Msw_full(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4], optr->prop2, optr->prop3);
+        optr->iterations = invert_cloverdoublet_eo( optr->prop0, optr->prop1, optr->prop2, optr->prop3,
+                                                    optr->sr0, optr->sr1, optr->sr2, optr->sr3,
+                                                    optr->eps_sq, optr->maxiter,
+                                                    optr->solver, optr->rel_prec,
+                                                    optr->solver_params, optr->external_inverter, 
+                                                    optr->sloppy_precision, optr->compression_type);
+        // checking solution
+        Msw_full_ndpsi( g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2],
+                        g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4],
+                        optr->prop0, optr->prop1, optr->prop2, optr->prop3 );
       }
-      assign_add_mul_r(g_spinor_field[DUM_DERI+3], optr->prop0, -optr->epsbar, VOLUME/2);
-      assign_add_mul_r(g_spinor_field[DUM_DERI+4], optr->prop1, -optr->epsbar, VOLUME/2);
-
+ 
       diff(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], optr->sr0, VOLUME/2); 
       diff(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI+2], optr->sr1, VOLUME/2); 
       diff(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+3], optr->sr2, VOLUME/2); 
@@ -404,7 +507,7 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
       nrm1 += square_norm(g_spinor_field[DUM_DERI+3], VOLUME/2, 1); 
       nrm1 += square_norm(g_spinor_field[DUM_DERI+4], VOLUME/2, 1); 
       optr->reached_prec = nrm1;
-      g_mu = g_mu1;
+
       /* For standard normalisation */
       /* we have to mult. by 2*kappa */
       mul_r(g_spinor_field[DUM_DERI], (2*optr->kappa), optr->prop0, VOLUME/2);
@@ -429,7 +532,7 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
       mul_r(optr->prop3, 1./(2*optr->kappa), g_spinor_field[DUM_DERI+3], VOLUME/2);
 
       /* mirror source, but not for volume sources */
-      if(i == 0 && SourceInfo.no_flavours == 2 && SourceInfo.type != 1) {
+      if(i == 0 && SourceInfo.no_flavours == 2 && SourceInfo.type != SRC_TYPE_VOL) {
         if (g_cart_id == 0) {
           fprintf(stdout, "# Inversion done in %d iterations, squared residue = %e!\n",
                   optr->iterations, optr->reached_prec);
@@ -439,18 +542,16 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
 
         mul_one_pm_itau2(optr->sr0, optr->sr2, g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI], +1., VOLUME/2);
         mul_one_pm_itau2(optr->sr1, optr->sr3, g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+1], +1., VOLUME/2);
-
       }
       /* volume sources need only one inversion */
-      else if(SourceInfo.type == 1) i++;
+      else if(SourceInfo.type == SRC_TYPE_VOL) i++;
     }
-  }
-  else if(optr->type == OVERLAP) {
+  } else if(optr->type == OVERLAP) {
     g_mu = 0.;
     m_ov=optr->m;
     eigenvalues(&optr->no_ev, 5000, optr->ev_prec, 0, optr->ev_readwrite, nstore, optr->even_odd_flag);
-/*     ov_check_locality(); */
-/*      index_jd(&optr->no_ev_index, 5000, 1.e-12, optr->conf_input, nstore, 4); */
+    /*     ov_check_locality(); */
+    /*      index_jd(&optr->no_ev_index, 5000, 1.e-12, optr->conf_input, nstore, 4); */
     ov_n_cheby=optr->deg_poly;
 
     if(use_preconditioning==1)
@@ -465,35 +566,64 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
 
     if(write_prop) optr->write_prop(op_id, index_start, 0);
   }
-  else if( optr->type == BSM || optr->type == BSM2b || optr->type == BSM2m || optr->type == BSM2f ) {
+#if defined TM_USE_BSM
+  else if( optr->type == BSM || optr->type == BSM2b || optr->type == BSM2m || optr->type == BSM2f || optr->type == BSM3 ) {
+    if (g_cart_id == 0 ) {
+     printf("# csw = %e, computing clover leafs\n", csw_BSM);
+    }
+    if (optr->type==BSM3){
+      init_sw_fields(VOLUME);
+      //Note here the factor of 1/2. has been applied since
+      //the routine assign_mul_one_sw_pm_imu_site_lexic computes
+      //1+i *csw*\sum_{\mu,nu} \sigma_mu,nuF_mu,nu/2.
+      sw_term( (const su3**) g_smeared_gauge_field, 1.,  csw_BSM/2.);
+    }
+
+    bispinor *src  = (bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND );
+    bispinor *dest = (bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND );
+    bispinor *temp = (bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND );
+    bispinor *temp2= (bispinor *)malloc(sizeof(bispinor)*VOLUMEPLUSRAND );
+
+    if ( src == NULL || dest == NULL || temp == NULL || temp2 == NULL ){
+      printf("Error in allocating memory in invert\n");
+      exit(1);
+    }
     for(i = 0; i < SourceInfo.no_flavours; i++) {
 
       convert_eo_to_lexic(g_spinor_field[DUM_DERI], optr->sr0, optr->sr1);
       convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], optr->sr2, optr->sr3);
-      compact(g_bispinor_field[1], g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1]);
+      compact(src, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1]);
       
-      optr->iterations = cg_her_bi(g_bispinor_field[0], g_bispinor_field[1],
+      optr->iterations = cg_her_bi(dest, src,
                 optr->maxiter, optr->eps_sq, optr->rel_prec, VOLUME, optr->applyQsqbi);
 
-      optr->applyQsqbi(g_bispinor_field[2], g_bispinor_field[0]);
-      assign_diff_mul((spinor*)g_bispinor_field[2], (spinor*)g_bispinor_field[1], 1.0, 2*VOLUME);
-      double squarenorm = square_norm((spinor*)g_bispinor_field[2], 2*VOLUME, 1);
+      optr->applyQsqbi(temp, dest);
+      assign_diff_mul((spinor*)temp, (spinor*)src, 1.0, 2*VOLUME);
+      double squarenorm = square_norm((spinor*)temp, 2*VOLUME, 1);
       optr->reached_prec = squarenorm;
       if(g_proc_id==0) {
         printf("# BSM Dirac inversion ||A*result1-b||^2 = %e\n", squarenorm);
         fflush(stdout);
       }
 
-      optr->applyMdagbi(g_bispinor_field[2], g_bispinor_field[0]);
-      optr->applyMbi(g_bispinor_field[0], g_bispinor_field[2]);
-      assign_diff_mul((spinor*)g_bispinor_field[0], (spinor*)g_bispinor_field[1], 1.0, 2*VOLUME);
-      squarenorm = square_norm((spinor*)g_bispinor_field[0], 2*VOLUME, 1);
+      optr->applyMdagbi(temp, dest);
+      optr->applyMbi(temp2, temp);
+      assign_diff_mul((spinor*)temp2, (spinor*)src, 1.0, 2*VOLUME);
+      squarenorm = square_norm((spinor*)temp2, 2*VOLUME, 1);
       if(g_proc_id==0) {
         printf("# BSM Dirac inversion || D(D^dag [DD^dag + m_0 ]^-1 b) - b ||^2 = %e\n\n", squarenorm);
         fflush(stdout);
       }
-      
-      decompact(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], g_bispinor_field[2]);
+      if (propagatorsonthefly_BSM == 1){
+        if (g_cart_id == 0){printf("#SourceInfo.t=%d, ix=%d\n", SourceInfo.t, SourceInfo.ix);fflush(stdout);};
+        if ( ( ( vectorcurrentcurrent_BSM == 1 ) || ( axialcurrentcurrent_BSM == 1 ) ) && ( SourceInfo.t != 0 ) ){
+           assign( (spinor *)optr->prop_ntmone[4*SourceInfo.ix+2*(1-i)], (spinor *)temp, 2*VOLUME);
+        }
+        else{
+           assign( (spinor *)optr->prop_zero[4*SourceInfo.ix+2*(1-i)], (spinor *)temp, 2*VOLUME);
+        }
+      }
+      decompact(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], temp);
 
       convert_lexic_to_eo(optr->prop0, optr->prop1, g_spinor_field[DUM_DERI]);
       convert_lexic_to_eo(optr->prop2, optr->prop3, g_spinor_field[DUM_DERI+1]);
@@ -503,17 +633,17 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
 
       convert_eo_to_lexic(g_spinor_field[DUM_DERI], optr->sr0, optr->sr1);
       convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], optr->sr2, optr->sr3);
-      compact(g_bispinor_field[0], g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1]);
+      compact(src, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1]);
 
-      optr->applyMbi(g_bispinor_field[1], g_bispinor_field[0]);
+      optr->applyMbi(temp, src);
       
       // accumulate number of iterations
-      optr->iterations += cg_her_bi(g_bispinor_field[2], g_bispinor_field[1],
+      optr->iterations += cg_her_bi(dest, temp,
                                     optr->maxiter, optr->eps_sq, optr->rel_prec, VOLUME, optr->applyQsqbi);
 
-      optr->applyQsqbi(g_bispinor_field[0], g_bispinor_field[2]);
-      assign_diff_mul((spinor*)g_bispinor_field[0], (spinor*)g_bispinor_field[1], 1.0, 2*VOLUME);
-      squarenorm = square_norm((spinor*)g_bispinor_field[0], 2*VOLUME, 1);
+      optr->applyQsqbi(temp2, dest);
+      assign_diff_mul((spinor*)temp2, (spinor*)temp, 1.0, 2*VOLUME);
+      squarenorm = square_norm((spinor*)temp2, 2*VOLUME, 1);
       // store the larger of the two residual norms
       optr->reached_prec = optr->reached_prec > squarenorm ? optr->reached_prec : squarenorm;
       if(g_proc_id==0) {
@@ -521,15 +651,29 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
         fflush(stdout);
       }
 
-      compact(g_bispinor_field[0], g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1]);
-      optr->applyMdagbi(g_bispinor_field[1], g_bispinor_field[2]);
-      assign_diff_mul((spinor*)g_bispinor_field[1], (spinor*)g_bispinor_field[0], 1.0, 2*VOLUME);
-      squarenorm = square_norm((spinor*)g_bispinor_field[1], 2*VOLUME, 1);
+      compact(src, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1]);
+      optr->applyMdagbi(temp, dest);
+      assign_diff_mul((spinor*)temp, (spinor*)src, 1.0, 2*VOLUME);
+      squarenorm = square_norm((spinor*)temp, 2*VOLUME, 1);
       if(g_proc_id==0) {
         printf("# BSM Dirac inversion || D^dag ([DD^dag + m_0 ]^-1 D b) - b ||^2 = %e\n\n", squarenorm);
         fflush(stdout);
       }
-      decompact(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], g_bispinor_field[2]);
+      if (propagatorsonthefly_BSM == 1){
+        if (g_cart_id == 0){printf("#SourceInfo.t=%d, ix=%d\n", SourceInfo.t, SourceInfo.ix);fflush(stdout);};
+        if ( ( ( vectorcurrentcurrent_BSM == 1 ) || ( axialcurrentcurrent_BSM == 1 ) ) && ( SourceInfo.t != 0 ) ){
+           assign( (spinor *)optr->prop_ntmone[4*SourceInfo.ix+2*(1-i)+1], (spinor *)dest, 2*VOLUME);
+        }
+        else{
+           assign( (spinor *)optr->prop_zero[4*SourceInfo.ix+2*(1-i)+1], (spinor *)dest, 2*VOLUME);
+        }
+      }
+
+/*      if (propagatorsonthefly_BSM == 1){
+        assign((spinor*)optr->prop_zero[4*SourceInfo.ix+2*(1-i)+1],(spinor *)dest, 2*VOLUME);
+      }*/
+
+      decompact(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], dest);
 
       convert_lexic_to_eo(optr->prop0, optr->prop1, g_spinor_field[DUM_DERI]);
       convert_lexic_to_eo(optr->prop2, optr->prop3, g_spinor_field[DUM_DERI+1]);
@@ -550,17 +694,22 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
       /* volume sources need only one inversion */
       else if(SourceInfo.type == 1) i++;
     }
+    free(src  );
+    free(dest );
+    free(temp );
+    free(temp2);
   }
+#endif
   etime = gettime();
   if (g_cart_id == 0 && g_debug_level > 0) {
     fprintf(stdout, "# Inversion done in %d iterations, squared residue = %e!\n",
             optr->iterations, optr->reached_prec);
     fprintf(stdout, "# Inversion done in %1.2e sec. \n", etime - atime);
   }
+  op_backup_restore_globals(TM_RESTORE_GLOBALS);
   return;
 }
 
-
 void op_write_prop(const int op_id, const int index_start, const int append_) {
   operator * optr = &operator_list[op_id];
   const unsigned int strl = 100;
@@ -582,40 +731,108 @@ void op_write_prop(const int op_id, const int index_start, const int append_) {
   else {
     strcpy(ending, "inverted");
   }
-  
   // 1 == volume source
-  if(SourceInfo.type != 1) {
+  if(SourceInfo.type == SRC_TYPE_POINT || SourceInfo.type == SRC_TYPE_TS) {
+    // timeslice sources are usually used for smearing/fuzzing and dilution; this is tracked via SourceInfo.ix in the filename
     if (PropInfo.splitted) {
-      /* operators with additional external fields require one more index */
-      if( optr->type==BSM || optr->type==BSM2b || optr->type==BSM2m ){
-        snprintf(filename, strl, "%s.%.4d.%.2d.%.2d.%03d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, optr->n, ending);
-      }else{
-        snprintf(filename, strl, "%s.%.4d.%.2d.%.2d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, ending);
+        /* operators with additional external fields require one more index */
+#ifdef TM_USE_BSM
+      if( optr->type==BSM || optr->type==BSM2b || optr->type==BSM2m || optr->type==BSM2f || optr->type==BSM3){
+        if(T_global > 99) {
+          snprintf(filename, strl, "%s.%.4d.%.3d.%.2d.%.8d.%s", PropInfo.basename, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, optr->n, ending);
+        }
+        else{
+          snprintf(filename, strl, "%s.%.4d.%.2d.%.2d.%.8d.%s", PropInfo.basename, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, optr->n, ending);
+        } 
+      } 
+      else {
+         printf("invalid operator for the BSM project\n");
+         exit(1);
+      }
+#else
+      if(T_global > 99) {
+        snprintf(filename, strl, "%s.%.4d.%.3d.%.2d.%s", PropInfo.basename, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix,  ending);
       }
+      else{
+        snprintf(filename, strl, "%s.%.4d.%.2d.%.2d.%s", PropInfo.basename, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix,  ending);
+      }
+#endif
     }
     else {
-      if( optr->type==BSM || optr->type == BSM2b || optr->type==BSM2m ){
-        snprintf(filename, strl, "%s.%.4d.%.2d.%03d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, optr->n, ending);
-      }else{
-        snprintf(filename, strl, "%s.%.4d.%.2d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, ending);
-      }
+#ifdef TM_USE_BSM
+     if( optr->type==BSM || optr->type == BSM2b || optr->type==BSM2m || optr->type==BSM2f || optr->type==BSM3){
+       if(T_global > 99) {
+         snprintf(filename, strl, "%s.%.4d.%.3d.%.8d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, optr->n, ending);
+       }
+       else {
+         snprintf(filename, strl, "%s.%.4d.%.2d.%.8d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, optr->n, ending);
+       }
+     }
+     else {
+       printf("invalid operator for the BSM project\n");
+       exit(1);
+     }
+#else 
+     /* time index is zero-padded to 3 digits only when T_global > 99, otherwise 2 (same rule as the splitted case) */
+     if(T_global > 99) {
+         snprintf(filename, strl, "%s.%.4d.%.3d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t,  ending);
+     }
+     else {
+         snprintf(filename, strl, "%s.%.4d.%.2d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t,  ending);
+     }
+#endif 
+   } 
+  }
+  else if (SourceInfo.type == SRC_TYPE_VOL) {
+#if defined TM_USE_BSM
+    if(optr->type==BSM || optr->type==BSM2b || optr->type==BSM2m || optr->type==BSM2f || optr->type==BSM3 ){
+      snprintf(filename, strl, "%s.%.4d.%.5d.%.8d.%s", PropInfo.basename, SourceInfo.nstore, SourceInfo.sample, optr->n, ending);
+    } 
+    else {
+      printf("invalid operator for the BSM project\n");
+      exit(1);
+    }
+#else 
+    snprintf(filename, strl, "%s.%.4d.%.5d.%s", PropInfo.basename, SourceInfo.nstore, SourceInfo.sample, ending);
+#endif
+  }
+  else if(SourceInfo.type == SRC_TYPE_PION_TS || SourceInfo.type == SRC_TYPE_GEN_PION_TS) {
+#if defined TM_USE_BSM
+    /* same fields as the non-BSM name below plus the 8-digit optr->n index; every field is '.'-separated */
+    if(optr->type==BSM || optr->type==BSM2b || optr->type==BSM2m || optr->type==BSM2f || optr->type==BSM3 ){
+      snprintf(filename, strl, "%s.%.4d.%.5d.%.2d.%.8d.%s", PropInfo.basename, SourceInfo.nstore, SourceInfo.sample, SourceInfo.t,optr->n, ending);
     }
+    else {
+      printf("invalid operator for the BSM project\n");
+      exit(1);
+    }
+#else
+    snprintf(filename, strl, "%s.%.4d.%.5d.%.2d.%s", PropInfo.basename, SourceInfo.nstore, SourceInfo.sample, SourceInfo.t, ending);
+#endif
   }
   else {
-    if(optr->type==BSM || optr->type==BSM2b || optr->type==BSM2m ){
-      snprintf(filename, strl, "%s.%.4d.%.5d.%03d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.sample, optr->n, ending);
-    } else {
-      snprintf(filename, strl, "%s.%.4d.%.5d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.sample, ending);
+#if defined TM_USE_BSM
+    if(optr->type==BSM || optr->type==BSM2b || optr->type==BSM2m || optr->type==BSM2f || optr->type==BSM3 ){
+      snprintf(filename, strl, "%s.%.4d.%.5d.%.8d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.sample, optr->n, ending);
     }
+    else {
+      printf("invalid operator for the BSM project\n");
+      exit(1);
+    }
+#else 
+    snprintf(filename, strl, "%s.%.4d.%.5d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.sample, ending);
+#endif
   }
-
   if(!PropInfo.splitted || append_)
     append = 1;
   /* the 1 is for appending */
   construct_writer(&writer, filename, append);
+//  printf("\n\nSourceinfo = %d index_start=%d\n\n",SourceInfo.ix, index_start);
   if (PropInfo.splitted || SourceInfo.ix == index_start) {
+    write_propagator_type(writer, 0);
+
     inverterInfo = construct_paramsInverterInfo(optr->reached_prec, optr->iterations, 
-            optr->solver, optr->no_flavours);
+                                                optr->solver, optr->no_flavours);
     write_spinor_info(writer, PropInfo.format, inverterInfo, append);
     free(inverterInfo);
   }
@@ -625,10 +842,10 @@ void op_write_prop(const int op_id, const int index_start, const int append_) {
     sourceFormat = construct_paramsSourceFormat(SourceInfo.precision, optr->no_flavours, 4, 3);
     write_source_format(writer, sourceFormat);
     status = write_spinor(writer, &operator_list[op_id].sr0, &operator_list[op_id].sr1, 
-        1, SourceInfo.precision);
+                          1, SourceInfo.precision);
     if(optr->no_flavours == 2) {
       status = write_spinor(writer, &operator_list[op_id].sr2, &operator_list[op_id].sr3, 
-          1, SourceInfo.precision);
+                            1, SourceInfo.precision);
     }
     free(sourceFormat);
   }
@@ -640,6 +857,54 @@ void op_write_prop(const int op_id, const int index_start, const int append_) {
     status = write_spinor(writer, &operator_list[op_id].prop2, &operator_list[op_id].prop3, 1, optr->prop_precision);
   }
   status = write_spinor(writer, &operator_list[op_id].prop0, &operator_list[op_id].prop1, 1, optr->prop_precision);
+  // check status for errors!?
   destruct_writer(writer);
   return;
 }
+
+void op_backup_restore_globals(const backup_restore_t mode){ /* save or restore five g_* globals */
+  static double backup_kappa;   /* one slot per global: a second backup overwrites the first */
+  static double backup_mu;
+  static double backup_c_sw;
+  static double backup_mubar;
+  static double backup_epsbar;  /* statics are zero-initialised: a restore before any backup writes 0.0 */
+  if( mode == TM_BACKUP_GLOBALS ){ /* backup: copy the current globals into the slots */
+    backup_kappa  = g_kappa;
+    backup_c_sw   = g_c_sw;
+    backup_mu     = g_mu;
+    backup_mubar  = g_mubar;
+    backup_epsbar = g_epsbar;
+  } else { /* every mode other than TM_BACKUP_GLOBALS is treated as "restore" */
+    g_kappa  = backup_kappa;
+    g_c_sw   = backup_c_sw;
+    g_mu     = backup_mu;
+    g_mubar  = backup_mubar;
+    g_epsbar = backup_epsbar;
+    boundary(g_kappa); /* NOTE(review): presumably refreshes kappa-dependent boundary data - confirm */
+  }
+}
+  
+void op_set_globals(const int op_id){ /* load operator_list[op_id] parameters into the g_* globals */
+  operator* op = &operator_list[op_id]; /* op_id is used as an index without a range check */
+  /* kappa and mu are taken over for every operator type */
+  g_kappa = op->kappa;
+  g_mu    = op->mu;
+  /* the remaining globals are only touched for the operator types tested below */
+  if( op->type == CLOVER || op->type == DBCLOVER ){
+    g_c_sw = op->c_sw;
+  }
+  if( op->type == DBTMWILSON || op-> type == DBCLOVER){
+    g_mubar = op->mubar;
+    g_epsbar = op->epsbar;
+  }
+  if(g_debug_level > 2 && g_proc_id == 0){ /* report only from process 0 at debug level > 2 */
+    printf("# op_set_globals set globals to:\n");
+    printf("# g_kappa = %.12lf\n", g_kappa);
+    printf("# g_c_sw = %.12lf\n", g_c_sw);
+    printf("# g_mu = %.12lf\n", g_mu);
+    printf("# g_mu2 = %.12lf\n", g_mu2); /* g_mu2 and g_mu3 are printed but not modified here */
+    printf("# g_mu3 = %.12lf\n", g_mu3);
+    printf("# g_mubar = %.12lf\n", g_mubar);
+    printf("# g_epsbar = %.12lf\n", g_epsbar);
+  }
+}
diff --git a/operator.h b/operator.h
index c977d9539..7d452f88d 100644
--- a/operator.h
+++ b/operator.h
@@ -1,4 +1,3 @@
-
 /***********************************************************************
  *
  * Copyright (C) 2009 Carsten Urbach
@@ -26,22 +25,10 @@
 #include "solver/dirac_operator_eigenvectors.h"
 #include "su3.h"
 #include "solver/solver_params.h"
-                                    
-
-typedef enum op_type {
-  TMWILSON = 0,
-  OVERLAP,
-  WILSON,
-  DBTMWILSON,
-  CLOVER,
-  DBCLOVER,
-  BSM,
-  BSM2b,
-  BSM2m,
-  BSM2f
-} op_type;
-
-#define max_no_operators 10
+#include "operator_types.h"
+#include "misc_types.h"
+
+#define max_no_operators 50
 
 typedef struct {
   /* ID of the operator */
@@ -52,7 +39,7 @@ typedef struct {
   int deg_poly;
   int no_ev;
   
-  int sloppy_precision;
+  SloppyPrecision sloppy_precision;
   int even_odd_flag;
   int solver;
   int N_s;
@@ -61,9 +48,12 @@ typedef struct {
   int maxiter;
   int iterations;
   int prop_precision;
+  int write_prop_flag;
   int no_flavours;
   int DownProp;
   int no_ev_index;
+  ExternalInverter external_inverter;
+  CompressionType compression_type;
 
   int error_code;
 
@@ -90,6 +80,9 @@ typedef struct {
   spinor *sr0, *sr1, *sr2, *sr3;
   /* generic place for propagators */
   spinor *prop0, *prop1, *prop2, *prop3;
+  /* generic place for all propagators for a point source */
+  bispinor **prop_zero;
+  bispinor **prop_ntmone;
 
   /*solver parameters struct*/
   solver_params_t solver_params;
@@ -98,27 +91,49 @@ typedef struct {
   double extra_masses[MAX_EXTRA_MASSES];
   int no_extra_masses;
 
+#if TM_USE_BSM
   /* for the BSM operator, support for multiple scalar fields per sample/index */
   int npergauge;
+  int nscalarstep;
   int n;
+#endif
 
 
   /* chebyshef coefficients for the overlap */
   double * coefs;
+
+ 
   /* various versions of the Dirac operator */
-  void (*applyM) (spinor * const, spinor * const);
-  void (*applyQ) (spinor * const, spinor * const);
-  /* with even/odd */
-  void (*applyQp) (spinor * const, spinor * const);
-  void (*applyQm) (spinor * const, spinor * const);
+  /* ---------------------------------------*/
+  //even-even part of the even-odd operator 
+  void (*applyMee) (spinor * const, spinor * const, double const);
+  //inverse of the even-even part of the even-odd operator 
+  void (*applyMeeInv) (spinor * const, spinor * const, double const);
+  //full operator M acting on a spinor given as even and odd parts separately
+  void (*applyM) (spinor * const, spinor * const, spinor * const, spinor * const); 
+  //full operator Q=gamma5*M on a spinor given as even and odd parts separately
+  void (*applyQ) (spinor * const, spinor * const, spinor * const, spinor * const); 
+  //either: the full operator Q^+ on lexicographic spinor
+  //or    : eo-preconditioned Q^+ on odd part of an eo ordered spinor
+  void (*applyQp) (spinor * const, spinor * const); 
+  //either : the full operator Q^- on lexicographic spinor
+  //or     : eo-preconditioned Q^- on odd part of an eo ordered spinor
+  void (*applyQm) (spinor * const, spinor * const); 
+  //either: the full operator Q^+*Q^- on lexicographic spinor
+  //or    : eo-preconditioned Q^+*Q^- on odd part of an eo ordered spinor
   void (*applyQsq) (spinor * const, spinor * const);
 
   void (*applyMbi)    (bispinor * const, bispinor * const);
   void (*applyMdagbi) (bispinor * const, bispinor * const);
   void (*applyQsqbi)  (bispinor * const, bispinor * const);
   
-  void (*applyMp) (spinor * const, spinor * const);
-  void (*applyMm) (spinor * const, spinor * const);
+  //either: the full operator M^+ on lexicographic spinor
+  //or    : eo-preconditioned M^+ on odd part of an eo ordered spinor
+  void (*applyMp) (spinor * const, spinor * const); 
+  //either: the full operator M^- on lexicographic spinor
+  //or    : eo-preconditioned M^- on odd part of an eo ordered spinor
+  void (*applyMm) (spinor * const, spinor * const); 
+  //EO preconditioned Hermitian operator for the non-degenerate doublet (more explanation needed here).
   void (*applyDbQsq) (spinor * const, spinor * const, spinor * const, spinor * const);
   /* the generic invert function */
   void (*inverter) (const int op_id, const int index_start, const int write_prop);
@@ -137,4 +152,7 @@ extern int no_operators;
 int add_operator(const int type);
 int init_operators();
 
+void op_set_globals(const int op_id);
+void op_backup_restore_globals(const backup_restore_t mode);
+
 #endif
diff --git a/operator/Block_D_psi_body.c b/operator/Block_D_psi_body.c
new file mode 100644
index 000000000..84c7496c5
--- /dev/null
+++ b/operator/Block_D_psi_body.c
@@ -0,0 +1,156 @@
+/* apply the Dirac operator to the block local spinor field s */
+/* and store the result in block local spinor field rr        */
+/* for block blk                                              */
+/* the block local gauge field is assumed to be in the order  */
+/* that is needed in local_D, which means also that it is a   */
+/* double copy                                                */
+// CU: has problems with SSE2,3
+void _PSWITCH(Block_D_psi)(block * blk, _PTSWITCH(spinor) * const rr, _PTSWITCH(spinor) * const s) {
+  /* pure dispatcher on the global g_c_sw: > 0 selects Block_Dsw_psi, anything else Block_Dtm_psi */
+  if(g_c_sw > 0)
+    _PSWITCH(Block_Dsw_psi)(blk, rr, s);
+  else 
+    _PSWITCH(Block_Dtm_psi)(blk,rr,s);
+
+  return;
+}
+ 
+// this version is for c_sw=0: the site-local factor is (1 + i*g_mu) resp. its conjugate, no clover term
+void _PSWITCH(Block_Dtm_psi)(block * blk, _PTSWITCH(spinor) * const rr, _PTSWITCH(spinor) * const s) {
+  int i;
+  _PTSWITCH(spinor) *r = rr; /* output cursor, advanced once per site */
+  _PTSWITCH(spinor) *t = s;  /* input cursor for the site-local term, advanced once per site */
+  _PSWITCH(su3) * u = blk->_PSWITCH(u);
+  int * idx = blk->idx;
+  static _C_TYPE rhoa, rhob; /* both are reassigned below on every call; static storage is shared by all callers */
+  _PTSWITCH(spinor) ALIGN tmpr;
+  if(blk_gauge_eo) { /* flag set: re-initialise the block gauge field before using blk->u */
+    _PSWITCH(init_blocks_gaugefield)();
+  }
+  rhoa = 1.0 + (_F_TYPE)g_mu * I;
+  rhob = conj(rhoa);
+
+  /* set the boundary term to zero */
+  _spinor_null(rr[blk->volume]); /* element with index blk->volume, i.e. one past the last site of the loop below */
+  _spinor_null(s[blk->volume]);  /* note: this also writes to the input field s */
+
+  for(i = 0; i < blk->volume; i++) {
+    _complex_times_vector(tmpr.s0, rhoa, t->s0); /* components s0,s1 get rhoa ... */
+    _complex_times_vector(tmpr.s1, rhoa, t->s1);
+    _complex_times_vector(tmpr.s2, rhob, t->s2); /* ... components s2,s3 get conj(rhoa) */
+    _complex_times_vector(tmpr.s3, rhob, t->s3);
+    /* NOTE(review): local_H presumably adds the hopping part to tmpr and stores the sum in *r - confirm */
+    _PSWITCH(local_H)(r, s, u, idx, &tmpr);
+
+    r++;
+    t++;
+    idx += 8; /* idx and u advance by 8 entries per site */
+    u += 8;
+  }
+
+  return;
+}
+
+
+/* clover version (selected by Block_D_psi for g_c_sw > 0):   */
+/* apply the Dirac operator to the block local spinor field s */
+/* and store the result in block local spinor field rr for    */
+/* block blk. The block local gauge field is assumed to be in */
+/* the order that is needed in local_D, which means also that */
+/* it is a double copy                                        */
+// CU: has problems with SSE2,3
+void _PSWITCH(Block_Dsw_psi)(block * blk, _PTSWITCH(spinor) * const rr, _PTSWITCH(spinor) * const s) {
+  _PTSWITCH(spinor) *r = rr; /* output cursor, advanced once per site */
+  _PTSWITCH(spinor) *t = s;  /* input cursor for the site-local term, advanced once per site */
+  _PSWITCH(su3) * u = blk->_PSWITCH(u);
+  int * idx = blk->idx;
+  // the site-local term is produced by assign_mul_one_sw_pm_imu_site_lexic in the loop below
+  _PTSWITCH(spinor) ALIGN tmpr;
+
+  int it, ix, iy, iz; //lexicographic index of the site w.r.t the block
+  int bt, bx, by, bz; //block coordinate on the local mpi process
+  int dT, dX, dY, dZ; //block size
+  int sT, sX, sY, sZ; //constant shifts
+  int lx; //lexicographic index of the block site w.r.t the local mpi process
+
+  dT = blk->BT;
+  dX = blk->BLX;
+  dY = blk->BLY;
+  dZ = blk->BLZ;
+
+  bt = blk->mpilocal_coordinate[0];
+  bx = blk->mpilocal_coordinate[1];
+  by = blk->mpilocal_coordinate[2];
+  bz = blk->mpilocal_coordinate[3];
+
+  sT = bt * dT; /* origin of this block in process-local coordinates */
+  sX = bx * dX;
+  sY = by * dY;
+  sZ = bz * dZ;
+  /* flag set: re-initialise the block gauge field before using blk->u */
+  if(blk_gauge_eo) {
+    _PSWITCH(init_blocks_gaugefield)(); /* precision-matched initialiser, as in Block_Dtm_psi */
+  }
+  
+  /* set the boundary term to zero */
+  _spinor_null(rr[blk->volume]); /* element with index blk->volume, one past the last site of the loop */
+  _spinor_null(s[blk->volume]);  /* note: this also writes to the input field s */
+
+  for(int i = 0; i < blk->volume; i++) {
+    /* decompose the block-local site index i into (it, ix, iy, iz), z running fastest */
+    iz = i % dZ;
+    iy = (i / dZ) % dY;
+    ix = (i / (dZ * dY)) % dX;
+    it = i / (dZ * dY * dX);
+    /* translate to the process-local site index via the global lookup table g_ipt */
+    lx = g_ipt[it + sT][ix + sX][iy + sY][iz + sZ];
+
+    _PSWITCH(assign_mul_one_sw_pm_imu_site_lexic)(lx, &tmpr, t, (_F_TYPE)g_mu);
+    /* NOTE(review): local_H presumably adds the hopping part to tmpr and stores the sum in *r - confirm */
+    _PSWITCH(local_H)(r, s, u, idx, &tmpr);
+
+    r++;
+    t++;
+    idx += 8; /* idx and u advance by 8 entries per site */
+    u += 8;
+  }
+
+  return;
+}
+
+
+/* Apply Hopping Matrix to an even (eo == 0) or odd (eo == 1) spinor: rr is written, s is read */
+void _PSWITCH(Block_H_psi)(block * blk, _PTSWITCH(spinor) * const rr, _PTSWITCH(spinor) * const s, 
+			   const int eo) {
+  int i;
+  _PTSWITCH(spinor) *r = rr; /* output cursor, advanced once per site */
+  _PSWITCH(su3) * u = blk->_PSWITCH(u);
+  int * eoidx = blk->evenidx; /* default: even index table, replaced below when eo == 1 */
+  _PTSWITCH(spinor) ALIGN tmpr;
+
+  if(!blk_gauge_eo) { /* flag clear: (re)build the even/odd ordered block gauge field first */
+    _PSWITCH(init_blocks_eo_gaugefield)();
+  }
+
+  /* for OE */
+  if(eo == 1) {
+    u = blk->_PSWITCH(u) + blk->volume*8/2; /* second half of the block gauge field */
+    eoidx = blk->oddidx;
+  }
+
+  /* set the boundary term to zero */
+  _spinor_null(rr[blk->volume/2]); /* element with index blk->volume/2, one past the last site of the loop */
+  _spinor_null(s[blk->volume/2]);  /* note: this also writes to the input field s */
+  
+  for(i = 0; i < blk->volume/2; i++) {
+    _spinor_null(tmpr); /* zero site-local term, so only what local_H contributes ends up in *r */
+
+    _PSWITCH(local_H)(r, s, u, eoidx, &tmpr);
+
+    r++;
+    eoidx += 8; /* eoidx and u advance by 8 entries per site */
+    u += 8;
+  }
+
+  return;
+}
diff --git a/operator/D_psi.c b/operator/D_psi.c
index 15ed29e5f..beb6a7891 100644
--- a/operator/D_psi.c
+++ b/operator/D_psi.c
@@ -30,9 +30,17 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-
+#ifdef SSE2
+#  define OpSSE2
+#  undef SSE2
+#endif
+#ifdef SSE3
+#  define OpSSE3
+#  undef SSE3
+#endif
+#include "buffers/utils.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
@@ -40,1163 +48,1452 @@
 #include "su3.h"
 #include "sse.h"
 #include "boundary.h"
-#ifdef MPI
+#include "gamma.h"
+#include "linalg_eo.h"
+#include "buffers/utils.h"
+#ifdef TM_USE_MPI
 # include "xchange/xchange.h"
 #endif
 #include "update_backward_gauge.h"
 #include "block.h"
 #include "operator/D_psi.h"
+#include "operator/clovertm_operators.h"
 #include "solver/dirac_operator_eigenvectors.h"
 
-#if (defined SSE23 || defined SSE33)
-
-#elif (defined BGL && defined XLC)
-
-/* We have 32 registers available */
-static double _Complex reg00, reg01, reg02, reg03, reg04, reg05;
-static double _Complex reg10, reg11, reg12, reg13, reg14, reg15;
-/* For the gauge field, reuse the first three!*/
-static double _Complex u00, u01, u02, u10, u11, u12;
-static double _Complex reg20, reg21;
-/* The following contains the result spinor (12 regs) */
-static double _Complex rs00, rs01, rs02, rs10, rs11, rs12, rs20, rs21, rs22, 
-  rs30, rs31, rs32;
-
-
-/* this is the hopping part only */
-void local_H(spinor * const rr, spinor * const s, su3 * u, int * _idx) {
-
-  int * idx = _idx;
-  su3 * restrict up ALIGN;
-  su3 * restrict um ALIGN;
-  spinor * restrict sp ALIGN;
-  spinor * restrict sm ALIGN;
-
-#pragma disjoint(*s, *sp, *sm, *rr, *up, *um)
-
-  __alignx(16,rr);
-  __alignx(16,s);
-
-  /*********************** direction +0 ************************/
-  up = u;
-  sp = (spinor *) s + (*idx);
-  idx++;
-
-  um = up+1;
-  _prefetch_su3(um); 
-  sm = (spinor *) s + (*idx);
-  _prefetch_spinor(sm); 
-  idx++;
-
-  _bgl_load_reg0(sp->s0);
-  _bgl_load_reg1(sp->s1);
-  _bgl_load_reg0_up(sp->s2);
-  _bgl_load_reg1_up(sp->s3);
-  _bgl_vector_add_reg0();
-  _bgl_vector_add_reg1();
-  /* result is now in regx0, regx1, regx2 x = 0,1 */
-  
-  _bgl_su3_multiply_double((*up));
-  _bgl_vector_cmplx_mul_double(phase_0);
-  _bgl_add_to_rs0_reg0();
-  _bgl_add_to_rs2_reg0();
-  _bgl_add_to_rs1_reg1();
-  _bgl_add_to_rs3_reg1();
-
-  /*********************** direction -0 ************************/
-  up = um+1;
-  _prefetch_su3(up); 
-  sp = (spinor*) s + (*idx);
-  _prefetch_spinor(sp); 
-  idx++;
-
-  _bgl_load_reg0(sm->s0);
-  _bgl_load_reg1(sm->s1);
-  _bgl_load_reg0_up(sm->s2);
-  _bgl_load_reg1_up(sm->s3);
-  _bgl_vector_sub_reg0();
-  _bgl_vector_sub_reg1();
-  
-  _bgl_su3_inverse_multiply_double((*um));
-  _bgl_vector_cmplxcg_mul_double(phase_0);
-  
-  _bgl_add_to_rs0_reg0();
-  _bgl_sub_from_rs2_reg0();
-  _bgl_add_to_rs1_reg1();
-  _bgl_sub_from_rs3_reg1();
-  
-  /*********************** direction +1 ************************/
-  
-  um = up+1;
-  _prefetch_su3(um); 
-  sm = (spinor*) s + (*idx);
-  _prefetch_spinor(sm); 
-  idx++;
-
-  _bgl_load_reg0(sp->s0);
-  _bgl_load_reg1(sp->s1);
-  _bgl_load_reg0_up(sp->s3);
-  _bgl_load_reg1_up(sp->s2);
-  _bgl_vector_i_mul_add_reg0();
-  _bgl_vector_i_mul_add_reg1();
-  
-  _bgl_su3_multiply_double((*up));
-  _bgl_vector_cmplx_mul_double(phase_1);
-  
-  _bgl_add_to_rs0_reg0();
-  _bgl_i_mul_sub_from_rs3_reg0();
-  _bgl_add_to_rs1_reg1();
-  _bgl_i_mul_sub_from_rs2_reg1();
-  
-  /*********************** direction -1 ************************/
-
-  up = um+1;
-  _prefetch_su3(up); 
-  sp = (spinor*) s + (*idx);
-  _prefetch_spinor(sp); 
-  idx++;
-
-  _bgl_load_reg0(sm->s0);
-  _bgl_load_reg1(sm->s1);
-  _bgl_load_reg0_up(sm->s3);
-  _bgl_load_reg1_up(sm->s2);
-  _bgl_vector_i_mul_sub_reg0();
-  _bgl_vector_i_mul_sub_reg1();
-  
-  _bgl_su3_inverse_multiply_double((*um));
-  _bgl_vector_cmplxcg_mul_double(phase_1);
-  
-  _bgl_add_to_rs0_reg0();
-  _bgl_add_to_rs1_reg1();
-  _bgl_i_mul_add_to_rs3_reg0();
-  _bgl_i_mul_add_to_rs2_reg1();      
-  
-  /*********************** direction +2 ************************/
-  
-  um = up+1;
-  _prefetch_su3(um); 
-  sm = (spinor*) s + (*idx);
-  _prefetch_spinor(sm); 
-  idx++;
-
-  _bgl_load_reg0(sp->s0);
-  _bgl_load_reg1(sp->s1);
-  _bgl_load_reg1_up(sp->s2);
-  _bgl_load_reg0_up(sp->s3);
-  _bgl_vector_add_reg0();
-  _bgl_vector_sub_reg1();
-  
-  _bgl_su3_multiply_double((*up));
-  _bgl_vector_cmplx_mul_double(phase_2);
-  
-  _bgl_add_to_rs0_reg0();
-  _bgl_add_to_rs1_reg1();
-  _bgl_sub_from_rs2_reg1();
-  _bgl_add_to_rs3_reg0();
-  
-
-  /*********************** direction -2 ************************/
-  up = um+1;
-  _prefetch_su3(up); 
-  sp = (spinor*) s + (*idx);
-  _prefetch_spinor(sp); 
-  idx++;
-
-  _bgl_load_reg0(sm->s0);
-  _bgl_load_reg1(sm->s1);
-  _bgl_load_reg1_up(sm->s2);
-  _bgl_load_reg0_up(sm->s3);
-  _bgl_vector_sub_reg0();
-  _bgl_vector_add_reg1();
-  
-  _bgl_su3_inverse_multiply_double((*um));
-  _bgl_vector_cmplxcg_mul_double(phase_2);
-  
-  _bgl_add_to_rs0_reg0();
-  _bgl_add_to_rs1_reg1();
-  _bgl_add_to_rs2_reg1();
-  _bgl_sub_from_rs3_reg0();
-  
-  /*********************** direction +3 ************************/
-  um = up+1;
-  _prefetch_su3(um); 
-  sm = (spinor*) s + (*idx);
-  _prefetch_spinor(sm); 
-
-  _bgl_load_reg0(sp->s0);
-  _bgl_load_reg1(sp->s1);
-  _bgl_load_reg0_up(sp->s2);
-  _bgl_load_reg1_up(sp->s3);
-  _bgl_vector_i_mul_add_reg0();
-  _bgl_vector_i_mul_sub_reg1();
-  
-  _bgl_su3_multiply_double((*up));
-  _bgl_vector_cmplx_mul_double(phase_3);
-  
-  _bgl_add_to_rs0_reg0();
-  _bgl_add_to_rs1_reg1();
-  _bgl_i_mul_sub_from_rs2_reg0();
-  _bgl_i_mul_add_to_rs3_reg1();
-  
-  /*********************** direction -3 ************************/
-
-  _bgl_load_reg0(sm->s0);
-  _bgl_load_reg1(sm->s1);
-  _bgl_load_reg0_up(sm->s2);
-  _bgl_load_reg1_up(sm->s3);
-  _bgl_vector_i_mul_sub_reg0();
-  _bgl_vector_i_mul_add_reg1();
-  
-  _bgl_su3_inverse_multiply_double((*um));
-  _bgl_vector_cmplxcg_mul_double(phase_3);
-  
-  _bgl_add_to_rs0_reg0();
-  _bgl_store_rs0(rr->s0);
-  _bgl_i_mul_add_to_rs2_reg0();
-  _bgl_store_rs2(rr->s2);
-  
-  _bgl_add_to_rs1_reg1();
-  _bgl_store_rs1(rr->s1);
-  _bgl_i_mul_sub_from_rs3_reg1();
-  _bgl_store_rs3(rr->s3);
 
+
+#define _C_TYPE _Complex float
+#define _F_TYPE float
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
+
+#include"D_psi_body.c"
+
+#undef _C_TYPE
+#undef _F_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
+
+
+#if (!defined SSE2 && !defined SSE3)
+
+#  define _C_TYPE _Complex double
+#  define _F_TYPE double
+#  define _PSWITCH(s) s
+#  define _PTSWITCH(s) s
+
+#  include"D_psi_body.c"
+
+#  undef _C_TYPE
+#  undef _F_TYPE
+#  undef _PSWITCH
+#  undef _PTSWITCH
+
+#endif
+
+void D_psi_prec(spinor * const P, spinor * const Q){
+  /* spinorPrecondition, D_psi, spinorPrecondition in sequence; NOTE(review): (dest, src) argument order assumed - confirm */
+  /* todo: do preconditioning */
+  spinorPrecWS *ws=(spinorPrecWS*)g_precWS;
+  static _Complex double alpha = -1.0; /* the -1.0 initialiser is never used: alpha is set to -0.5 before each call */
+
+  alpha = -0.5;
+  spinorPrecondition(P,Q,ws,T,L,alpha,0,1);
+  D_psi(g_spinor_field[DUM_MATRIX],P); /* g_spinor_field[DUM_MATRIX] is the intermediate field between the two calls */
+  alpha = -0.5; /* redundant: alpha already holds -0.5 */
+  spinorPrecondition(P,g_spinor_field[DUM_MATRIX],ws,T,L,alpha,0,1);
 }
 
+// now come single and double precision versions of
+// block local operators
+#define _F_TYPE float
+#define _C_TYPE _Complex float
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
 
-#else
+#include "Block_D_psi_body.c"
 
+#undef _F_TYPE
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
+
+#define _F_TYPE double
+#define _C_TYPE _Complex double
+#define _PSWITCH(s) s
+#define _PTSWITCH(s) s
+
+#include "Block_D_psi_body.c"
+
+#undef _F_TYPE
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
+
+#ifdef TM_USE_OMP
+#  define static
+#endif
+
+
+/* direction +t */
+void boundary_D_0(spinor * const r, spinor * const s, su3 * const u) {
+  /* fills all four components of *r from *s, the link *u and the global phase_0 */
+  static su3_vector chi, psi; /* automatic when TM_USE_OMP is set: `static` is #define'd away just above */
+
+  _vector_add(psi,s->s0,s->s2); /* first pair: combine s0 with s2 */
+
+  _su3_multiply(chi,(*u),psi);
+
+  _complex_times_vector(r->s0, phase_0, chi);
+  _vector_assign(r->s2,r->s0); /* r->s2 is taken from r->s0 */
+
+  _vector_add(psi,s->s1,s->s3); /* second pair: same steps for s1 with s3 */
+
+  _su3_multiply(chi,(*u),psi);
+
+  _complex_times_vector(r->s1, phase_0, chi);
+  _vector_assign(r->s3, r->s1); /* r->s3 is taken from r->s1 */
+
+  return;
+}
 
-static inline void p0add(spinor * restrict const tmpr , spinor const * restrict const s, 
-			 su3 const * restrict const u, const _Complex double phase) {
 
-#ifdef OMP
+static inline void p0add_bispinor(bispinor * restrict const tmpr , bispinor const * restrict const s,
+                         su3 const * restrict const u, const _Complex double phase) {
+
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
-  _vector_add(psi,s->s0, s->s2);
+  _vector_add(psi,s->sp_up.s0, s->sp_up.s2);
+  _su3_multiply(chi, (*u), psi);
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_up.s0, psi);
+  _vector_add_assign(tmpr->sp_up.s2, psi);
+
+  _vector_add(psi,s->sp_dn.s0, s->sp_dn.s2);
+  _su3_multiply(chi, (*u), psi);
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_dn.s0, psi);
+  _vector_add_assign(tmpr->sp_dn.s2, psi);
+
+
+  _vector_add(psi, s->sp_up.s1, s->sp_up.s3);
   _su3_multiply(chi, (*u), psi);
 
   _complex_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s0, psi);
-  _vector_add_assign(tmpr->s2, psi);
+  _vector_add_assign(tmpr->sp_up.s1, psi);
+  _vector_add_assign(tmpr->sp_up.s3, psi);
 
-  _vector_add(psi, s->s1, s->s3);
+  _vector_add(psi, s->sp_dn.s1, s->sp_dn.s3);
   _su3_multiply(chi, (*u), psi);
 
   _complex_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s1, psi);
-  _vector_add_assign(tmpr->s3, psi);
+  _vector_add_assign(tmpr->sp_dn.s1, psi);
+  _vector_add_assign(tmpr->sp_dn.s3, psi);
+
+
+  return;
+}
+
+
+/* direction -t: fills all four components of *r from *s, the link *u and the global phase_0 */
+void boundary_D_1(spinor * const r, spinor * const s, su3 * restrict u) {
+  su3_vector chi, psi; /* automatic: `static` is a real keyword again here (#undef in p0add_bispinor) and would be shared by OpenMP threads */
+  /* psi and chi are recomputed from s and u before every use, nothing is carried over between calls */
+  _vector_sub(psi, s->s0, s->s2); /* first pair: s0 with s2 */
+
+  _su3_inverse_multiply(chi, (*u), psi);
+
+  _complexcjg_times_vector(r->s0, phase_0, chi);
+  _vector_minus_assign(r->s2, r->s0); /* r->s2 is derived from r->s0 */
+
+  _vector_sub(psi,s->s1,s->s3); /* second pair: same steps for s1 with s3 */
+
+  _su3_inverse_multiply(chi,(*u),psi);
+
+  _complexcjg_times_vector(r->s1,phase_0,chi);
+  _vector_minus_assign(r->s3, r->s1); /* r->s3 is derived from r->s1 */
 
   return;
 }
 
 
-static inline void m0add(spinor * restrict const tmpr, spinor const * restrict const s, 
+
+static inline void m0add_bispinor(bispinor * restrict const tmpr, bispinor const * restrict const s, 
 			 su3 const * restrict const u, const _Complex double phase) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
-  _vector_sub(psi, s->s0, s->s2);
+  _vector_sub(psi, s->sp_up.s0, s->sp_up.s2);
+  _su3_inverse_multiply(chi, (*u), psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_up.s0, psi);
+  _vector_sub_assign(tmpr->sp_up.s2, psi);
+
+  _vector_sub(psi, s->sp_dn.s0, s->sp_dn.s2);
+  _su3_inverse_multiply(chi, (*u), psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_dn.s0, psi);
+  _vector_sub_assign(tmpr->sp_dn.s2, psi);
+
+  _vector_sub(psi, s->sp_up.s1, s->sp_up.s3);
   _su3_inverse_multiply(chi, (*u), psi);
 
   _complexcjg_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s0, psi);
-  _vector_sub_assign(tmpr->s2, psi);
+  _vector_add_assign(tmpr->sp_up.s1, psi);
+  _vector_sub_assign(tmpr->sp_up.s3, psi);
 
-  _vector_sub(psi, s->s1, s->s3);
+  _vector_sub(psi, s->sp_dn.s1, s->sp_dn.s3);
   _su3_inverse_multiply(chi, (*u), psi);
 
   _complexcjg_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s1, psi);
-  _vector_sub_assign(tmpr->s3, psi);
+  _vector_add_assign(tmpr->sp_dn.s1, psi);
+  _vector_sub_assign(tmpr->sp_dn.s3, psi);
+
 
   return;
 }
 
-static inline void p1add(spinor * restrict const tmpr, spinor const * restrict const s, 
-			 su3 const * restrict const u, const _Complex double phase) {
-#ifdef OMP
+/* direction +x: fills all four components of *r from *s, the link *u and the global phase_1 */
+void boundary_D_2(spinor * const r, spinor * const s, su3 * restrict u) {
+  /* automatic on purpose: the *_bispinor helpers above #undef `static`, so a static here is shared by OpenMP threads */
+  su3_vector chi, psi;
+  /* psi and chi are recomputed from s and u before every use, nothing is carried over between calls */
+  _vector_i_add(psi,s->s0,s->s3); /* first pair: s0 with s3 */
+
+  _su3_multiply(chi,(*u),psi);
+
+  _complex_times_vector(r->s0, phase_1, chi);
+  _vector_null(r->s3); /* zero first, so the following *_assign leaves only the r->s0 term in r->s3 */
+  _vector_i_sub_assign(r->s3, r->s0);
+
+  _vector_i_add(psi,s->s1,s->s2); /* second pair: same steps for s1 with s2 */
+
+  _su3_multiply(chi,(*u),psi);
+
+  _complex_times_vector(r->s1, phase_1, chi);
+  _vector_null(r->s2);
+  _vector_i_sub_assign(r->s2, r->s1);
+
+  return;
+}
+
+static inline void p1add_bispinor(bispinor * restrict const tmpr, bispinor const * restrict const s,
+                         su3 const * restrict const u, const _Complex double phase) {
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
-  _vector_i_add(psi,s->s0,s->s3);
+  _vector_i_add(psi,s->sp_up.s0,s->sp_up.s3);
+  _su3_multiply(chi,(*u),psi);
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_up.s0, psi);
+  _vector_i_sub_assign(tmpr->sp_up.s3, psi);
+
+  _vector_i_add(psi, s->sp_up.s1, s->sp_up.s2);
+  _su3_multiply(chi, (*u), psi);
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_up.s1, psi);
+  _vector_i_sub_assign(tmpr->sp_up.s2, psi);
+
+  _vector_i_add(psi,s->sp_dn.s0,s->sp_dn.s3);
   _su3_multiply(chi,(*u),psi);
 
   _complex_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s0, psi);
-  _vector_i_sub_assign(tmpr->s3, psi);
- 
-  _vector_i_add(psi, s->s1, s->s2);
+  _vector_add_assign(tmpr->sp_dn.s0, psi);
+  _vector_i_sub_assign(tmpr->sp_dn.s3, psi);
+
+  _vector_i_add(psi, s->sp_dn.s1, s->sp_dn.s2);
   _su3_multiply(chi, (*u), psi);
 
   _complex_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s1, psi);
-  _vector_i_sub_assign(tmpr->s2, psi);
+  _vector_add_assign(tmpr->sp_dn.s1, psi);
+  _vector_i_sub_assign(tmpr->sp_dn.s2, psi);
 
   return;
 }
 
-static inline void m1add(spinor * restrict const tmpr, spinor const * restrict const s, 
-			 su3 const * restrict const u, const _Complex double phase) {
-#ifdef OMP
+
+
+/* direction -x: fills all four components of *r from *s, the link *u and the global phase_1 */
+void boundary_D_3(spinor * const r, spinor * const s, su3 * restrict u) {
+  /* automatic on purpose: the *_bispinor helpers above #undef `static`, so a static here is shared by OpenMP threads */
+  su3_vector chi, psi;
+  /* psi and chi are recomputed from s and u before every use, nothing is carried over between calls */
+  _vector_i_sub(psi,s->s0,s->s3); /* first pair: s0 with s3 */
+
+  _su3_inverse_multiply(chi,(*u),psi);
+
+  _complexcjg_times_vector(r->s0, phase_1, chi);
+  _vector_null(r->s3); /* zero first, so the following *_assign leaves only the r->s0 term in r->s3 */
+  _vector_i_add_assign(r->s3, r->s0);
+
+  _vector_i_sub(psi,s->s1,s->s2); /* second pair: same steps for s1 with s2 */
+
+  _su3_inverse_multiply(chi,(*u),psi);
+
+  _complexcjg_times_vector(r->s1, phase_1, chi);
+  _vector_null(r->s2);
+  _vector_i_add_assign(r->s2, r->s1);
+
+  return;
+}
+
+static inline void m1add_bispinor(bispinor * restrict const tmpr, bispinor const * restrict const s,
+                         su3 const * restrict const u, const _Complex double phase) {
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
-  _vector_i_sub(psi,s->s0, s->s3);
+  _vector_i_sub(psi,s->sp_up.s0, s->sp_up.s3);
   _su3_inverse_multiply(chi,(*u), psi);
 
   _complexcjg_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s0, psi);
-  _vector_i_add_assign(tmpr->s3, psi);
+  _vector_add_assign(tmpr->sp_up.s0, psi);
+  _vector_i_add_assign(tmpr->sp_up.s3, psi);
 
-  _vector_i_sub(psi, s->s1, s->s2);
+  _vector_i_sub(psi, s->sp_up.s1, s->sp_up.s2);
   _su3_inverse_multiply(chi, (*u), psi);
 
   _complexcjg_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s1, psi);
-  _vector_i_add_assign(tmpr->s2, psi);
+  _vector_add_assign(tmpr->sp_up.s1, psi);
+  _vector_i_add_assign(tmpr->sp_up.s2, psi);
+
+  _vector_i_sub(psi,s->sp_dn.s0, s->sp_dn.s3);
+  _su3_inverse_multiply(chi,(*u), psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_dn.s0, psi);
+  _vector_i_add_assign(tmpr->sp_dn.s3, psi);
+
+  _vector_i_sub(psi, s->sp_dn.s1, s->sp_dn.s2);
+  _su3_inverse_multiply(chi, (*u), psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_dn.s1, psi);
+  _vector_i_add_assign(tmpr->sp_dn.s2, psi);
+
 
   return;
 }
 
-static inline void p2add(spinor * restrict const tmpr, spinor const * restrict const s, 
-			 su3 const * restrict const u, const _Complex double phase) {
-#ifdef OMP
-#define static
-#endif
+/* direction +y */
+void boundary_D_4(spinor * const r, spinor * const s, su3 * restrict u) {
+
   static su3_vector chi, psi;
-#ifdef OMP
-#undef static
-#endif
 
   _vector_add(psi,s->s0,s->s3);
-  _su3_multiply(chi, (*u), psi);
 
-  _complex_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s0, psi);
-  _vector_add_assign(tmpr->s3, psi);
+  _su3_multiply(chi,(*u),psi);
+
+  _complex_times_vector(r->s0, phase_2, chi);
+  _vector_assign(r->s3, r->s0);
 
   _vector_sub(psi,s->s1,s->s2);
-  _su3_multiply(chi, (*u), psi);
 
-  _complex_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s1, psi);
-  _vector_sub_assign(tmpr->s2, psi);
+  _su3_multiply(chi,(*u),psi);
 
+  _complex_times_vector(r->s1, phase_2, chi);
+  _vector_minus_assign(r->s2, r->s1);
 
   return;
 }
 
-static inline void m2add(spinor * restrict const tmpr, spinor const * restrict const s, 
-			 su3 const * restrict const u, const _Complex double phase) {
-#ifdef OMP
+static inline void p2add_bispinor(bispinor * restrict const tmpr, bispinor const * restrict const s,
+                         su3 const * restrict const u, const _Complex double phase) {
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
-  _vector_sub(psi, s->s0, s->s3);
-  _su3_inverse_multiply(chi, (*u), psi);
-
-  _complexcjg_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s0, psi);
-  _vector_sub_assign(tmpr->s3, psi);
+  _vector_add(psi,s->sp_up.s0,s->sp_up.s3);
+  _su3_multiply(chi, (*u), psi);
 
-  _vector_add(psi, s->s1, s->s2);
-  _su3_inverse_multiply(chi, (*u),psi);
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_up.s0, psi);
+  _vector_add_assign(tmpr->sp_up.s3, psi);
 
-  _complexcjg_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s1, psi);
-  _vector_add_assign(tmpr->s2, psi);
+  _vector_sub(psi,s->sp_up.s1,s->sp_up.s2);
+  _su3_multiply(chi, (*u), psi);
 
-  return;
-}
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_up.s1, psi);
+  _vector_sub_assign(tmpr->sp_up.s2, psi);
 
-static inline void p3add(spinor * restrict const tmpr, spinor const * restrict const s, 
-			 su3 const * restrict const u, const _Complex double phase) {
-#ifdef OMP
-#define static
-#endif
-  static su3_vector chi, psi;
-#ifdef OMP
-#undef static
-#endif
 
-  _vector_i_add(psi, s->s0, s->s2);
+  _vector_add(psi,s->sp_dn.s0,s->sp_dn.s3);
   _su3_multiply(chi, (*u), psi);
 
   _complex_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s0, psi);
-  _vector_i_sub_assign(tmpr->s2, psi);
+  _vector_add_assign(tmpr->sp_dn.s0, psi);
+  _vector_add_assign(tmpr->sp_dn.s3, psi);
 
-  _vector_i_sub(psi,s->s1, s->s3);
+  _vector_sub(psi,s->sp_dn.s1,s->sp_dn.s2);
   _su3_multiply(chi, (*u), psi);
 
   _complex_times_vector(psi, phase, chi);
-  _vector_add_assign(tmpr->s1, psi);
-  _vector_i_add_assign(tmpr->s3, psi);
+  _vector_add_assign(tmpr->sp_dn.s1, psi);
+  _vector_sub_assign(tmpr->sp_dn.s2, psi);
+
+  return;
+}
+
+
+/* direction -y: fills all four components of *r from *s, the link *u and the global phase_2 */
+void boundary_D_5(spinor * const r, spinor * const s, su3 * restrict u) {
+  /* automatic on purpose: the *_bispinor helpers above #undef `static`, so a static here is shared by OpenMP threads */
+  su3_vector chi, psi;
+  /* psi and chi are recomputed from s and u before every use, nothing is carried over between calls */
+  _vector_sub(psi,s->s0,s->s3); /* first pair: s0 with s3 */
+
+  _su3_inverse_multiply(chi,(*u),psi);
+
+  _complexcjg_times_vector(r->s0, phase_2, chi);
+  _vector_minus_assign(r->s3, r->s0); /* r->s3 is derived from r->s0 */
+
+  _vector_add(psi,s->s1,s->s2); /* second pair: s1 with s2 */
+
+  _su3_inverse_multiply(chi,(*u),psi);
+
+  _complexcjg_times_vector(r->s1, phase_2, chi);
+  _vector_assign(r->s2, r->s1); /* r->s2 is taken from r->s1 */
+
 
   return;
 }
 
-static inline void m3addandstore(spinor * restrict const r, spinor const * restrict const s, 
-				 su3 const * restrict const u, const _Complex double phase,
-         spinor const * restrict const tmpr) {
-#ifdef OMP
+static inline void m2add_bispinor(bispinor * restrict const tmpr, bispinor const * restrict const s,
+                         su3 const * restrict const u, const _Complex double phase) { /* adds into tmpr */
+#ifdef TM_USE_OMP /* with OpenMP, "static" expands to nothing so chi/psi below are plain locals */
 #define static
 #endif
   static su3_vector chi, psi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
-  _vector_i_sub(psi,s->s0, s->s2);
+  _vector_sub(psi, s->sp_up.s0, s->sp_up.s3); /* sp_up half; same s0/s3 pairing as boundary_D_5 */
   _su3_inverse_multiply(chi, (*u), psi);
 
   _complexcjg_times_vector(psi, phase, chi);
-  _vector_add(r->s0, tmpr->s0, psi);
-  _vector_i_add(r->s2, tmpr->s2, psi);
+  _vector_add_assign(tmpr->sp_up.s0, psi);
+  _vector_sub_assign(tmpr->sp_up.s3, psi);
+  /* second sp_up pair: built from s1 and s2, then added to both s1 and s2 of tmpr */
+  _vector_add(psi, s->sp_up.s1, s->sp_up.s2);
+  _su3_inverse_multiply(chi, (*u),psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_up.s1, psi);
+  _vector_add_assign(tmpr->sp_up.s2, psi);
 
-  _vector_i_add(psi, s->s1, s->s3);
+  /* sp_dn half: the same two steps with the same link and phase */
+  _vector_sub(psi, s->sp_dn.s0, s->sp_dn.s3);
   _su3_inverse_multiply(chi, (*u), psi);
 
   _complexcjg_times_vector(psi, phase, chi);
-  _vector_add(r->s1, tmpr->s1, psi);
-  _vector_i_sub(r->s3, tmpr->s3, psi);
+  _vector_add_assign(tmpr->sp_dn.s0, psi);
+  _vector_sub_assign(tmpr->sp_dn.s3, psi);
 
-  return;
-}
+  _vector_add(psi, s->sp_dn.s1, s->sp_dn.s2);
+  _su3_inverse_multiply(chi, (*u),psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_dn.s1, psi);
+  _vector_add_assign(tmpr->sp_dn.s2, psi);
 
-/* this is the hopping part only */
-static inline void local_H(spinor * const rr, spinor const * const s, su3 const * restrict u, int * _idx, spinor * const restrict tmpr) {
-
-  int * idx = _idx;
-
-  /****** direction +0 ******/
-  p0add(tmpr, s + (*idx), u, phase_0);
-  u++;
-  idx++;
-  /****** direction -0 ******/
-  m0add(tmpr, s + (*idx), u, phase_0);
-  u++;
-  idx++;
-  /****** direction +1 ******/
-  p1add(tmpr, s + (*idx), u, phase_1);
-  u++;
-  idx++;
-  /****** direction -1 ******/
-  m1add(tmpr, s + (*idx), u, phase_1);
-  u++;
-  idx++;
-  /****** direction +2 ******/
-  p2add(tmpr, s + (*idx), u, phase_2);
-  u++;
-  idx++;
-  /****** direction -2 ******/
-  m2add(tmpr, s + (*idx), u, phase_2);
-  u++;
-  idx++;
-  /****** direction +3 ******/
-  p3add(tmpr, s + (*idx), u, phase_3);
-  u++;
-  idx++;
-  /****** direction -3 ******/
-  m3addandstore(rr, s + (*idx), u, phase_3, tmpr);
 
   return;
 }
 
 
-#endif
+/* direction +z: fills r from s using the link *u and phase_3 (no inverse, no conjugation) */
+void boundary_D_6(spinor * const r, spinor * const s, su3 * restrict u) {
 
-#if (defined SSE2 || defined SSE3)
+  static su3_vector chi, psi; /* NOTE(review): static scratch unless "static" is macro-blanked */
 
-/* Serially Checked ! */
-void D_psi(spinor * const P, spinor * const Q){
+  _vector_i_add(psi,s->s0,s->s2); /* first pair: s0 combined with i*s2 (per macro name) */
 
-  if(P==Q){
-    printf("Error in D_psi (operator.c):\n");
-    printf("Arguments must be differen spinor fields\n");
-    printf("Program aborted\n");
-    exit(1);
-  }
+  _su3_multiply(chi,(*u),psi);
 
-#ifdef _GAUGE_COPY2
-  if(g_update_gauge_copy) {
-      update_backward_gauge(g_gauge_field);
+  _complex_times_vector(r->s0, phase_3, chi);
+  _vector_null(r->s2); /* clear first so the i_sub_assign below leaves r->s2 = -i * r->s0 */
+  _vector_i_sub_assign(r->s2, r->s0);
+  /* second pair: s1 combined with -i*s3; r->s3 ends up as +i * r->s1 */
+  _vector_i_sub(psi,s->s1,s->s3);
+
+  _su3_multiply(chi,(*u),psi);
+
+  _complex_times_vector(r->s1, phase_3, chi);
+  _vector_null(r->s3);
+  _vector_i_add_assign(r->s3, r->s1);
+
+  return;
+}
+
+static inline void p3add_bispinor(bispinor * restrict const tmpr, bispinor const * restrict const s,
+                         su3 const * restrict const u, const _Complex double phase) {
+#ifdef TM_USE_OMP /* with OpenMP, "static" expands to nothing so chi/psi below are plain locals */
+#define static
+#endif
+  static su3_vector chi, psi;
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* sp_up half: same spin pattern as boundary_D_6 (direction +z), but accumulated into tmpr */
+  _vector_i_add(psi, s->sp_up.s0, s->sp_up.s2);
+  _su3_multiply(chi, (*u), psi);
+  /* psi is reused as the destination for phase * chi */
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_up.s0, psi);
+  _vector_i_sub_assign(tmpr->sp_up.s2, psi);
+
+  _vector_i_sub(psi,s->sp_up.s1, s->sp_up.s3);
+  _su3_multiply(chi, (*u), psi);
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_up.s1, psi);
+  _vector_i_add_assign(tmpr->sp_up.s3, psi);
+  /* sp_dn half: identical sequence on the second spinor of the pair, same link and phase */
+  _vector_i_add(psi, s->sp_dn.s0, s->sp_dn.s2);
+  _su3_multiply(chi, (*u), psi);
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_dn.s0, psi);
+  _vector_i_sub_assign(tmpr->sp_dn.s2, psi);
+
+  _vector_i_sub(psi,s->sp_dn.s1, s->sp_dn.s3);
+  _su3_multiply(chi, (*u), psi); 
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->sp_dn.s1, psi);
+  _vector_i_add_assign(tmpr->sp_dn.s3, psi);
+
+
+  return;
+}
+
+
+/* direction -z: fills r from s using the inverse link *u and the conjugate of phase_3 */
+void boundary_D_7(spinor * const r, spinor * const s, su3 * restrict u) {
+  /* NOTE(review): chi/psi are static scratch unless "static" is macro-blanked here - confirm */
+  static su3_vector chi, psi;
+  /* first pair (reading the macro names): s0 combined with -i*s2 */
+  _vector_i_sub(psi,s->s0,s->s2);
+
+  _su3_inverse_multiply(chi,(*u),psi);
+
+  _complexcjg_times_vector(r->s0, phase_3, chi);
+  _vector_null(r->s2); /* clear first so the i_add_assign below leaves r->s2 = +i * r->s0 */
+  _vector_i_add_assign(r->s2, r->s0);
+  /* second pair: s1 combined with +i*s3; r->s3 ends up as -i * r->s1 */
+  _vector_i_add(psi,s->s1,s->s3);
+
+  _su3_inverse_multiply(chi,(*u),psi);
+
+  _complexcjg_times_vector(r->s1, phase_3, chi);
+  _vector_null(r->s3);
+  _vector_i_sub_assign(r->s3, r->s1);
+
+  return;
+
+}
+
+static inline void m3addandstore_bispinor(bispinor * restrict const r, bispinor const * restrict const s,
+                                 su3 const * restrict const u, const _Complex double phase,
+         bispinor const * restrict const tmpr) {
+#ifdef TM_USE_OMP /* with OpenMP, "static" expands to nothing so chi/psi below are plain locals */
+#define static
+#endif
+  static su3_vector chi, psi;
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* sp_up half: same spin pattern as boundary_D_7 (direction -z); tmpr is only read here */
+  _vector_i_sub(psi,s->sp_up.s0, s->sp_up.s2);
+  _su3_inverse_multiply(chi, (*u), psi);
+  /* each output component is written as tmpr component combined with psi, i.e. add and store */
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add(r->sp_up.s0, tmpr->sp_up.s0, psi);
+  _vector_i_add(r->sp_up.s2, tmpr->sp_up.s2, psi);
+
+  _vector_i_add(psi, s->sp_up.s1, s->sp_up.s3);
+  _su3_inverse_multiply(chi, (*u), psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add(r->sp_up.s1, tmpr->sp_up.s1, psi);
+  _vector_i_sub(r->sp_up.s3, tmpr->sp_up.s3, psi);
+  /* sp_dn half: identical sequence on the second spinor of the pair, same link and phase */
+  _vector_i_sub(psi,s->sp_dn.s0, s->sp_dn.s2);
+  _su3_inverse_multiply(chi, (*u), psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add(r->sp_dn.s0, tmpr->sp_dn.s0, psi);
+  _vector_i_add(r->sp_dn.s2, tmpr->sp_dn.s2, psi);
+
+  _vector_i_add(psi, s->sp_dn.s1, s->sp_dn.s3);
+  _su3_inverse_multiply(chi, (*u), psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add(r->sp_dn.s1, tmpr->sp_dn.s1, psi);
+  _vector_i_sub(r->sp_dn.s3, tmpr->sp_dn.s3, psi);
+
+
+  return;
+}
+
+#ifdef TM_USE_OMP /* make sure no "static" macro override is left active past this point */
+#  undef static
+#endif
+
+
+// The SSE versions follow.
+// They are currently disabled because they are incompatible (with what is not stated here).
+#if (defined SSE2 || defined SSE3)
+#ifdef OpSSE2 /* NOTE(review): OpSSE2/OpSSE3 are set outside this view - confirm their meaning */
+# define SSE2
+#endif
+#ifdef OpSSE3
+# define SSE3
+#endif
+#ifdef  _SSE_H /* presumably the include guard of sse.h; cleared before the include below */
+# undef  _SSE_H
+#endif
+#ifdef _SU3_H /* presumably the include guard of su3.h; cleared before the include below */
+# undef _SU3_H
+#endif
+#include "su3.h"
+#include "sse.h"
+
+
+// checked!  SSE kernel: for each site ix, P[ix] = diagonal term on Q[ix] plus the eight hops below
+void Dtm_psi(spinor * const P, spinor * const Q){
+
+  if(P==Q){ /* in-place use is rejected: Q is still being read while P is written */
+    printf("Error in Dtm_psi (D_psi.c):\n");
+    printf("Arguments must be differen spinor fields\n");
+    printf("Program aborted\n");
+    exit(1);
+  }
+
+#ifdef _GAUGE_COPY2
+  if(g_update_gauge_copy) {
+    update_backward_gauge(g_gauge_field);
   }
 #endif
 
-# if defined MPI
+# if defined TM_USE_MPI
   xchange_lexicfield(Q);
 # endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
-  int ix,iy,iz;
-  su3 *up,*um;
-  spinor *s,*sp,*sm,*rn;
-  _Complex double fact1, fact2;
-  spinor rs __attribute__ ((aligned (16)));
-
-  fact1 = 1. + g_mu * I;
-  fact2 = conj(fact1);
-
-#ifndef OMP
-  iy=g_iup[0][0];
-  sp=(spinor *) Q + iy;
-  up=&g_gauge_field[0][0];
+    int ix,iy,iz;
+    su3 *up,*um;
+    spinor *s,*sp,*sm,*rn;
+    _Complex double fact1, fact2;
+    spinor rs __attribute__ ((aligned (16)));
+
+    fact1 = 1. + g_mu * I; /* multiplies components s0 and s1 of the local spinor */
+    fact2 = conj(fact1); /* multiplies components s2 and s3 of the local spinor */
+
+#ifndef TM_USE_OMP /* serial build: sp/up are primed once here and refreshed at the end of each iteration */
+    iy=g_iup[0][0];
+    sp=(spinor *) Q + iy;
+    up=&g_gauge_field[0][0];
 #endif
 
-  /************************ loop over all lattice sites *************************/
-#ifdef OMP
+    /************************ loop over all lattice sites *************************/
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for (ix=0;ix<VOLUME;ix++){
-#ifdef OMP
-    iy=g_iup[ix][0];
-    up=&g_gauge_field[ix][0];
-    sp=(spinor *) Q + iy;
+    for (ix=0;ix<VOLUME;ix++){
+#ifdef TM_USE_OMP /* OpenMP build: iterations are split among threads, so sp/up are recomputed per site */
+      iy=g_iup[ix][0];
+      up=&g_gauge_field[ix][0];
+      sp=(spinor *) Q + iy;
 #endif
-    s=(spinor *) Q + ix;
-    _prefetch_spinor(s);
+      s=(spinor *) Q + ix;
+      _prefetch_spinor(s);
 
-    /******************************* direction +0 *********************************/
+      /******************************* direction +0 *********************************/
 
-    iy=g_idn[ix][0];
+      iy=g_idn[ix][0];
       
-    sm = (spinor *) Q + iy;
-    _prefetch_spinor(sm);       
-
-    _sse_load(sp->s0);
-    _sse_load_up(sp->s2);
-    _sse_vector_add();
-
-    _sse_su3_multiply((*up));
-    _sse_vector_cmplx_mul(phase_0);
-    _sse_store_up(rs.s2);
-
-    _sse_load_up(s->s0);
-    _sse_vector_cmplx_mul(fact1);
-/*     _sse_vector_mul(fact1); */
-    _sse_load(rs.s2);
-    _sse_vector_add();
-    _sse_store(rs.s0);
-
-    _sse_load_up(s->s2);
-    _sse_vector_cmplx_mul(fact2);
-/*     _sse_vector_mul(fact1);       */
-    _sse_load(rs.s2);
-    _sse_vector_add();
-    _sse_store(rs.s2);      
+      sm = (spinor *) Q + iy;
+      _prefetch_spinor(sm);       
+
+      _sse_load(sp->s0);
+      _sse_load_up(sp->s2);
+      _sse_vector_add();
+
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_0);
+      _sse_store_up(rs.s2); /* parked in rs.s2; loaded again below for both rs.s0 and rs.s2 */
+
+      // the diagonal bit
+      _sse_load_up(s->s0);
+      _sse_vector_cmplx_mul(fact1);
+      _sse_load(rs.s2);
+      _sse_vector_add();
+      _sse_store(rs.s0);
+
+      // g5 in the twisted term
+      _sse_load_up(s->s2);
+      _sse_vector_cmplx_mul(fact2);
+      _sse_load(rs.s2);
+      _sse_vector_add();
+      _sse_store(rs.s2);      
       
-    um=&g_gauge_field[iy][0];
-    _prefetch_su3(um);
+      um=&g_gauge_field[iy][0];
+      _prefetch_su3(um);
       
-    _sse_load(sp->s1);
-    _sse_load_up(sp->s3);
-    _sse_vector_add();
+      _sse_load(sp->s1);
+      _sse_load_up(sp->s3);
+      _sse_vector_add();
       
-    _sse_su3_multiply((*up));
-    _sse_vector_cmplx_mul(phase_0);
-    _sse_store_up(rs.s3);
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_0);
+      _sse_store_up(rs.s3);
     
-    _sse_load_up(s->s1);
-    _sse_vector_cmplx_mul(fact1);
-/*     _sse_vector_mul(fact1); */
-    _sse_load(rs.s3);
-    _sse_vector_add();
-    _sse_store(rs.s1);
-
-    _sse_load_up(s->s3);
-    _sse_vector_cmplx_mul(fact2);
-/*     _sse_vector_mul(fact1);       */
-    _sse_load(rs.s3);
-    _sse_vector_add();
-    _sse_store(rs.s3); 
-
-    /******************************* direction -0 *********************************/
-
-    iy=g_iup[ix][1];
-
-    sp = (spinor *) Q + iy;
-    _prefetch_spinor(sp);
-
-    _sse_load(sm->s0);
-    _sse_load_up(sm->s2);
-    _sse_vector_sub();
+      // the diagonal bit
+      _sse_load_up(s->s1);
+      _sse_vector_cmplx_mul(fact1);
+      _sse_load(rs.s3);
+      _sse_vector_add();
+      _sse_store(rs.s1);
+
+      // g5 in the twisted term
+      _sse_load_up(s->s3);
+      _sse_vector_cmplx_mul(fact2);
+      _sse_load(rs.s3);
+      _sse_vector_add();
+      _sse_store(rs.s3); 
+
+      /******************************* direction -0 *********************************/
+
+      iy=g_iup[ix][1];
+
+      sp = (spinor *) Q + iy;
+      _prefetch_spinor(sp);
+
+      _sse_load(sm->s0);
+      _sse_load_up(sm->s2);
+      _sse_vector_sub();
       
-    _sse_su3_inverse_multiply((*um));
-    _sse_vector_cmplxcg_mul(phase_0);
-    _sse_load(rs.s0);
-    _sse_vector_add();
-    _sse_store(rs.s0);
-
-    _sse_load(rs.s2);
-    _sse_vector_sub();
-    _sse_store(rs.s2);
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_0);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
+
+      _sse_load(rs.s2);
+      _sse_vector_sub();
+      _sse_store(rs.s2);
       
-    up+=1;
-    _prefetch_su3(up);
+      up+=1; /* step to the next link of this site, g_gauge_field[ix][1] */
+      _prefetch_su3(up);
       
-    _sse_load(sm->s1);
-    _sse_load_up(sm->s3);
-    _sse_vector_sub();
+      _sse_load(sm->s1);
+      _sse_load_up(sm->s3);
+      _sse_vector_sub();
       
-    _sse_su3_inverse_multiply((*um));
-    _sse_vector_cmplxcg_mul(phase_0);
-    _sse_load(rs.s1);
-    _sse_vector_add();
-    _sse_store(rs.s1);
-
-    _sse_load(rs.s3);
-    _sse_vector_sub();
-    _sse_store(rs.s3);
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_0);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
+
+      _sse_load(rs.s3);
+      _sse_vector_sub();
+      _sse_store(rs.s3);
       
-    /******************************* direction +1 *********************************/
+      /******************************* direction +1 *********************************/
 
-    iy=g_idn[ix][1];
+      iy=g_idn[ix][1];
       
-    sm = (spinor *) Q + iy;
-    _prefetch_spinor(sm);
-
-    _sse_load(sp->s0);
-    _sse_load_up(sp->s3);
-    _sse_vector_i_mul();
-    _sse_vector_add();
-
-    _sse_su3_multiply((*up));
-    _sse_vector_cmplx_mul(phase_1);
-    _sse_load(rs.s0);
-    _sse_vector_add();
-    _sse_store(rs.s0);
-
-    _sse_load(rs.s3);
-    _sse_vector_i_mul();      
-    _sse_vector_sub();
-    _sse_store(rs.s3); 
+      sm = (spinor *) Q + iy;
+      _prefetch_spinor(sm);
+
+      _sse_load(sp->s0);
+      _sse_load_up(sp->s3);
+      _sse_vector_i_mul();
+      _sse_vector_add();
+
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_1);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
+
+      _sse_load(rs.s3);
+      _sse_vector_i_mul();      
+      _sse_vector_sub();
+      _sse_store(rs.s3); 
       
-    um=&g_gauge_field[iy][1];
-    _prefetch_su3(um);
+      um=&g_gauge_field[iy][1];
+      _prefetch_su3(um);
 
-    _sse_load(sp->s1);
-    _sse_load_up(sp->s2);
-    _sse_vector_i_mul();
-    _sse_vector_add();
+      _sse_load(sp->s1);
+      _sse_load_up(sp->s2);
+      _sse_vector_i_mul();
+      _sse_vector_add();
 
-    _sse_su3_multiply((*up));
-    _sse_vector_cmplx_mul(phase_1);
-    _sse_load(rs.s1);
-    _sse_vector_add();
-    _sse_store(rs.s1);
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_1);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
 
-    _sse_load(rs.s2);
-    _sse_vector_i_mul();      
-    _sse_vector_sub();
-    _sse_store(rs.s2);       
+      _sse_load(rs.s2);
+      _sse_vector_i_mul();      
+      _sse_vector_sub();
+      _sse_store(rs.s2);       
 
-    /******************************* direction -1 *********************************/
+      /******************************* direction -1 *********************************/
 
-    iy=g_iup[ix][2];
+      iy=g_iup[ix][2];
 
-    sp = (spinor *) Q + iy;
-    _prefetch_spinor(sp);
+      sp = (spinor *) Q + iy;
+      _prefetch_spinor(sp);
 
-    _sse_load(sm->s0);
-    _sse_load_up(sm->s3);
-    _sse_vector_i_mul();
-    _sse_vector_sub();
+      _sse_load(sm->s0);
+      _sse_load_up(sm->s3);
+      _sse_vector_i_mul();
+      _sse_vector_sub();
       
-    _sse_su3_inverse_multiply((*um));
-    _sse_vector_cmplxcg_mul(phase_1);
-    _sse_load(rs.s0);
-    _sse_vector_add();
-    _sse_store(rs.s0);
-
-    _sse_load(rs.s3);
-    _sse_vector_i_mul();      
-    _sse_vector_add();
-    _sse_store(rs.s3);
-
-    up+=1;
-    _prefetch_su3(up);
-
-    _sse_load(sm->s1);
-    _sse_load_up(sm->s2);
-    _sse_vector_i_mul();
-    _sse_vector_sub();
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_1);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
+
+      _sse_load(rs.s3);
+      _sse_vector_i_mul();      
+      _sse_vector_add();
+      _sse_store(rs.s3);
+
+      up+=1;
+      _prefetch_su3(up);
+
+      _sse_load(sm->s1);
+      _sse_load_up(sm->s2);
+      _sse_vector_i_mul();
+      _sse_vector_sub();
       
-    _sse_su3_inverse_multiply((*um));
-    _sse_vector_cmplxcg_mul(phase_1);
-    _sse_load(rs.s1);
-    _sse_vector_add();
-    _sse_store(rs.s1);
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_1);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
 
-    _sse_load(rs.s2);
-    _sse_vector_i_mul();      
-    _sse_vector_add();
-    _sse_store(rs.s2);
+      _sse_load(rs.s2);
+      _sse_vector_i_mul();      
+      _sse_vector_add();
+      _sse_store(rs.s2);
 
-    /******************************* direction +2 *********************************/
+      /******************************* direction +2 *********************************/
 
-    iy=g_idn[ix][2];
+      iy=g_idn[ix][2];
 
-    sm = (spinor *) Q + iy;
-    _prefetch_spinor(sm);
+      sm = (spinor *) Q + iy;
+      _prefetch_spinor(sm);
 
-    _sse_load(sp->s0);
-    _sse_load_up(sp->s3);
-    _sse_vector_add();
+      _sse_load(sp->s0);
+      _sse_load_up(sp->s3);
+      _sse_vector_add();
 
-    _sse_su3_multiply((*up));
-    _sse_vector_cmplx_mul(phase_2);
-    _sse_load(rs.s0);
-    _sse_vector_add();
-    _sse_store(rs.s0);
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_2);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
 
-    _sse_load(rs.s3);
-    _sse_vector_add();
-    _sse_store(rs.s3);
+      _sse_load(rs.s3);
+      _sse_vector_add();
+      _sse_store(rs.s3);
       
-    um=&g_gauge_field[iy][2];
-    _prefetch_su3(um);
+      um=&g_gauge_field[iy][2];
+      _prefetch_su3(um);
 
-    _sse_load(sp->s1);
-    _sse_load_up(sp->s2);
-    _sse_vector_sub();
+      _sse_load(sp->s1);
+      _sse_load_up(sp->s2);
+      _sse_vector_sub();
 
-    _sse_su3_multiply((*up));
-    _sse_vector_cmplx_mul(phase_2);
-    _sse_load(rs.s1);
-    _sse_vector_add();
-    _sse_store(rs.s1);
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_2);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
 
-    _sse_load(rs.s2);
-    _sse_vector_sub();
-    _sse_store(rs.s2);      
+      _sse_load(rs.s2);
+      _sse_vector_sub();
+      _sse_store(rs.s2);      
 
-    /******************************* direction -2 *********************************/
+      /******************************* direction -2 *********************************/
 
-    iy=g_iup[ix][3];
+      iy=g_iup[ix][3];
 
-    sp = (spinor *) Q + iy;
-    _prefetch_spinor(sp);
+      sp = (spinor *) Q + iy;
+      _prefetch_spinor(sp);
 
-    _sse_load(sm->s0);
-    _sse_load_up(sm->s3);
-    _sse_vector_sub();
+      _sse_load(sm->s0);
+      _sse_load_up(sm->s3);
+      _sse_vector_sub();
       
-    _sse_su3_inverse_multiply((*um));
-    _sse_vector_cmplxcg_mul(phase_2);
-    _sse_load(rs.s0);
-    _sse_vector_add();
-    _sse_store(rs.s0);
-
-    _sse_load(rs.s3);
-    _sse_vector_sub();
-    _sse_store(rs.s3);
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_2);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
+
+      _sse_load(rs.s3);
+      _sse_vector_sub();
+      _sse_store(rs.s3);
       
-    up+=1;
-    _prefetch_su3(up);
+      up+=1;
+      _prefetch_su3(up);
 
-    _sse_load(sm->s1);
-    _sse_load_up(sm->s2);
-    _sse_vector_add();
+      _sse_load(sm->s1);
+      _sse_load_up(sm->s2);
+      _sse_vector_add();
       
-    _sse_su3_inverse_multiply((*um));
-    _sse_vector_cmplxcg_mul(phase_2);
-    _sse_load(rs.s1);
-    _sse_vector_add();
-    _sse_store(rs.s1);
-
-    _sse_load(rs.s2);
-    _sse_vector_add();
-    _sse_store(rs.s2);      
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_2);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
+
+      _sse_load(rs.s2);
+      _sse_vector_add();
+      _sse_store(rs.s2);      
       
-    /******************************* direction +3 *********************************/
+      /******************************* direction +3 *********************************/
 
-    iy=g_idn[ix][3];
+      iy=g_idn[ix][3];
 
-    sm = (spinor *) Q + iy;
-    _prefetch_spinor(sm);
+      sm = (spinor *) Q + iy;
+      _prefetch_spinor(sm);
 
-    _sse_load(sp->s0);
-    _sse_load_up(sp->s2);
-    _sse_vector_i_mul();
-    _sse_vector_add();
+      _sse_load(sp->s0);
+      _sse_load_up(sp->s2);
+      _sse_vector_i_mul();
+      _sse_vector_add();
 
-    _sse_su3_multiply((*up));
-    _sse_vector_cmplx_mul(phase_3);
-    _sse_load(rs.s0);
-    _sse_vector_add();
-    _sse_store(rs.s0);
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_3);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
 
-    _sse_load(rs.s2);
-    _sse_vector_i_mul();      
-    _sse_vector_sub();
-    _sse_store(rs.s2);
+      _sse_load(rs.s2);
+      _sse_vector_i_mul();      
+      _sse_vector_sub();
+      _sse_store(rs.s2);
       
-    um=&g_gauge_field[iy][3];
-    _prefetch_su3(um);
-
-    _sse_load(sp->s1);
-    _sse_load_up(sp->s3);
-    _sse_vector_i_mul();
-    _sse_vector_sub();
-
-    _sse_su3_multiply((*up));
-    _sse_vector_cmplx_mul(phase_3);
-    _sse_load(rs.s1);
-    _sse_vector_add();
-    _sse_store(rs.s1);
-
-    _sse_load(rs.s3);
-    _sse_vector_i_mul();      
-    _sse_vector_add();
-    _sse_store(rs.s3);
+      um=&g_gauge_field[iy][3];
+      _prefetch_su3(um);
+
+      _sse_load(sp->s1);
+      _sse_load_up(sp->s3);
+      _sse_vector_i_mul();
+      _sse_vector_sub();
+
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_3);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
+
+      _sse_load(rs.s3);
+      _sse_vector_i_mul();      
+      _sse_vector_add();
+      _sse_store(rs.s3);
       
-    /******************************* direction -3 *********************************/
+      /******************************* direction -3 *********************************/
 
-    iz=(ix+1+VOLUME)%VOLUME;
+      iz=(ix+1+VOLUME)%VOLUME; /* index of the next site, wrapping to 0 after the last one */
 
-    iy=g_iup[iz][0];
+      iy=g_iup[iz][0];
       
-    sp = (spinor *) Q + iy;
-    _prefetch_spinor(sp);
+      sp = (spinor *) Q + iy;
+      _prefetch_spinor(sp);
 
-    _sse_load(sm->s0);
-    _sse_load_up(sm->s2);
-    _sse_vector_i_mul();
-    _sse_vector_sub();
+      _sse_load(sm->s0);
+      _sse_load_up(sm->s2);
+      _sse_vector_i_mul();
+      _sse_vector_sub();
       
-    _sse_su3_inverse_multiply((*um));
-    _sse_vector_cmplxcg_mul(phase_3);
-    rn = (spinor *) P + ix;
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_3);
+      rn = (spinor *) P + ix; /* output spinor for this site; written only in this last direction */
       
-    _sse_load(rs.s0);
-    _sse_vector_add();
-/*     _sse_vector_mul(fact2); */
-    _sse_store_nt(rn->s0);
-
-    _sse_load(rs.s2);
-    _sse_vector_i_mul();      
-    _sse_vector_add();
-/*     _sse_vector_mul(fact2);       */
-    _sse_store_nt(rn->s2);
-
-    up=&g_gauge_field[iz][0];
-    _prefetch_su3(up);
-
-    _sse_load(sm->s1);
-    _sse_load_up(sm->s3);
-    _sse_vector_i_mul();
-    _sse_vector_add();
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store_nt(rn->s0);
+
+      _sse_load(rs.s2);
+      _sse_vector_i_mul();      
+      _sse_vector_add();
+      _sse_store_nt(rn->s2);
+
+      up=&g_gauge_field[iz][0]; /* first link of the next site, ready for its +0 hop */
+      _prefetch_su3(up);
+
+      _sse_load(sm->s1);
+      _sse_load_up(sm->s3);
+      _sse_vector_i_mul();
+      _sse_vector_add();
       
-    _sse_su3_inverse_multiply((*um));
-    _sse_vector_cmplxcg_mul(phase_3);
-    _sse_load(rs.s1);
-    _sse_vector_add();
-/*     _sse_vector_mul(fact2);       */
-    _sse_store_nt(rn->s1);
-
-    _sse_load(rs.s3);
-    _sse_vector_i_mul();      
-    _sse_vector_sub();
-/*     _sse_vector_mul(fact2); */
-    _sse_store_nt(rn->s3);
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_3);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store_nt(rn->s1);
+
+      _sse_load(rs.s3);
+      _sse_vector_i_mul();      
+      _sse_vector_sub();
+      _sse_store_nt(rn->s3);
       
-    /******************************** end of loop *********************************/
+      /******************************** end of loop *********************************/
 
-  }
-#ifdef OMP
+    }
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
 
-#elif ((defined BGL) && (defined XLC))
-
-
-/**********************************
- *
- * Blue Gene/L Version
- *
- * Author: Carsten Urbach
- *
- **********************************/
-/* Checked! */
-void D_psi(spinor * const P, spinor * const Q){
-  int ix,iy,iz;
-  static _Complex double fact1;
-  su3 * restrict up ALIGN;
-  su3 * restrict um ALIGN;
-  spinor * restrict s ALIGN;
-  spinor * restrict sp ALIGN;
-  spinor * restrict sm ALIGN;
-  spinor * restrict rn ALIGN;
+void Dsw_psi(spinor * const P, spinor * const Q){
 
-#pragma disjoint(*s, *sp, *sm, *rn, *up, *um, *P, *Q)
-
-  __alignx(16,P);
-  __alignx(16,Q);
+  if(P==Q){
+    printf("Error in Dsw_psi (D_psi.c):\n");
+    printf("Arguments must be differen spinor fields\n");
+    printf("Program aborted\n");
+    exit(1);
+  }
 
-#ifdef _GAUGE_COPY
+#ifdef _GAUGE_COPY2
   if(g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
 #endif
 
-#    if (defined MPI && !(defined _NO_COMM))
+# if defined TM_USE_MPI
   xchange_lexicfield(Q);
-#    endif
-
-  fact1 = 1.0 + g_mu * I;
-
-  iy=g_iup[0][0];
-  sp=(spinor *) Q + iy;
-  up=&g_gauge_field[0][0];
-
-  /**************** loop over all lattice sites ******************/
-  for(ix = 0; ix < VOLUME; ix++){
-    s=(spinor *) Q + ix;
-    rn = (spinor *) P + ix;
-    /*********************** direction +0 ************************/
-
-    iy=g_idn[ix][0]; 
-
-    um=&g_gauge_field[iy][0]; 
-
-    _prefetch_su3(um); 
-    sm = (spinor*) Q + iy;
-    _prefetch_spinor(sm); 
-
-    _bgl_load_reg0(sp->s0);
-    _bgl_load_reg1(sp->s1);
-    _bgl_load_reg0_up(sp->s2);
-    _bgl_load_reg1_up(sp->s3);
-    _bgl_vector_add_reg0();
-    _bgl_vector_add_reg1();
-    /* result is now in regx0, regx1, regx2 x = 0,1 */
+# endif
 
-    _bgl_su3_multiply_double((*up));
-    _bgl_vector_cmplx_mul_double(phase_0);
-    _bgl_load_rs0(s->s0);
-    _bgl_load_rs1(s->s1);
-    _bgl_load_rs2(s->s2);
-    _bgl_load_rs3(s->s3);
-    _bgl_vector_cmplx_mul_rs(fact1);
-    _bgl_add_to_rs0_reg0();
-    _bgl_add_to_rs2_reg0();
-    _bgl_add_to_rs1_reg1();
-    _bgl_add_to_rs3_reg1();
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+    int ix,iy,iz;
+    su3 * restrict up,* restrict um;
+    spinor *s,*sp,*sm,*rn;
+    spinor ALIGN stmp,rs;
 
-    /*********************** direction -0 ************************/
+#ifndef TM_USE_OMP
+    iy=g_iup[0][0];
+    sp=(spinor *) Q + iy;
+    up=&g_gauge_field[0][0];
+#endif
 
-    iy=g_iup[ix][1]; 
+    /************************ loop over all lattice sites *************************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+    for (ix=0;ix<VOLUME;ix++){
+#ifdef TM_USE_OMP
+      iy=g_iup[ix][0];
+      up=&g_gauge_field[ix][0];
+      sp=(spinor *) Q + iy;
+#endif
+      s=(spinor *) Q + ix;
+      _prefetch_spinor(s);
 
-    up+=1;
-    _prefetch_su3(up); 
-    sp = (spinor *) Q + iy;
-    _prefetch_spinor(sp); 
+      /******************************* direction +0 *********************************/
 
-    _bgl_load_reg0(sm->s0);
-    _bgl_load_reg1(sm->s1);
-    _bgl_load_reg0_up(sm->s2);
-    _bgl_load_reg1_up(sm->s3);
-    _bgl_vector_sub_reg0();
-    _bgl_vector_sub_reg1();
+      iy=g_idn[ix][0];
+      
+      sm = (spinor *) Q + iy;
+      _prefetch_spinor(sm);       
 
-    _bgl_su3_inverse_multiply_double((*um));
-    _bgl_vector_cmplxcg_mul_double(phase_0);
+      _sse_load(sp->s0);
+      _sse_load_up(sp->s2);
+      _sse_vector_add();
 
-    _bgl_add_to_rs0_reg0();
-    _bgl_sub_from_rs2_reg0();
-    _bgl_add_to_rs1_reg1();
-    _bgl_sub_from_rs3_reg1();
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_0);
+      _sse_store_up(rs.s2);
 
-    /*********************** direction +1 ************************/
+      // apply the clover plus twisted term to diagonal bit
+      assign_mul_one_sw_pm_imu_site_lexic(ix, &stmp, s, g_mu);
 
-    iy=g_idn[ix][1]; 
+      _sse_load_up(stmp.s0);
+      _sse_load(rs.s2);
+      _sse_vector_add();
+      _sse_store(rs.s0);
 
-    um=&g_gauge_field[iy][1]; 
+      _sse_load_up(stmp.s2);
+      _sse_load(rs.s2);
+      _sse_vector_add();
+      _sse_store(rs.s2);      
+      
+      um=&g_gauge_field[iy][0];
+      _prefetch_su3(um);
+      
+      _sse_load(sp->s1);
+      _sse_load_up(sp->s3);
+      _sse_vector_add();
+      
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_0);
+      _sse_store_up(rs.s3);
+    
+      _sse_load_up(stmp.s1);
+      _sse_load(rs.s3);
+      _sse_vector_add();
+      _sse_store(rs.s1);
 
-    _prefetch_su3(um); 
-    sm = (spinor *) Q + iy;
-    _prefetch_spinor(sm); 
+      _sse_load_up(stmp.s3);
+      _sse_load(rs.s3);
+      _sse_vector_add();
+      _sse_store(rs.s3); 
 
-    _bgl_load_reg0(sp->s0);
-    _bgl_load_reg1(sp->s1);
-    _bgl_load_reg0_up(sp->s3);
-    _bgl_load_reg1_up(sp->s2);
-    _bgl_vector_i_mul_add_reg0();
-    _bgl_vector_i_mul_add_reg1();
+      /******************************* direction -0 *********************************/
 
-    _bgl_su3_multiply_double((*up));
-    _bgl_vector_cmplx_mul_double(phase_1);
+      iy=g_iup[ix][1];
 
-    _bgl_add_to_rs0_reg0();
-    _bgl_i_mul_sub_from_rs3_reg0();
-    _bgl_add_to_rs1_reg1();
-    _bgl_i_mul_sub_from_rs2_reg1();
+      sp = (spinor *) Q + iy;
+      _prefetch_spinor(sp);
 
-    /*********************** direction -1 ************************/
+      _sse_load(sm->s0);
+      _sse_load_up(sm->s2);
+      _sse_vector_sub();
+      
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_0);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
+
+      _sse_load(rs.s2);
+      _sse_vector_sub();
+      _sse_store(rs.s2);
+      
+      up+=1;
+      _prefetch_su3(up);
+      
+      _sse_load(sm->s1);
+      _sse_load_up(sm->s3);
+      _sse_vector_sub();
+      
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_0);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
+
+      _sse_load(rs.s3);
+      _sse_vector_sub();
+      _sse_store(rs.s3);
+      
+      /******************************* direction +1 *********************************/
 
-    iy=g_iup[ix][2]; 
+      iy=g_idn[ix][1];
+      
+      sm = (spinor *) Q + iy;
+      _prefetch_spinor(sm);
+
+      _sse_load(sp->s0);
+      _sse_load_up(sp->s3);
+      _sse_vector_i_mul();
+      _sse_vector_add();
+
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_1);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
+
+      _sse_load(rs.s3);
+      _sse_vector_i_mul();      
+      _sse_vector_sub();
+      _sse_store(rs.s3); 
+      
+      um=&g_gauge_field[iy][1];
+      _prefetch_su3(um);
 
-    up+=1;
-    _prefetch_su3(up); 
-    sp = (spinor *) Q + iy;
-    _prefetch_spinor(sp); 
+      _sse_load(sp->s1);
+      _sse_load_up(sp->s2);
+      _sse_vector_i_mul();
+      _sse_vector_add();
 
-    _bgl_load_reg0(sm->s0);
-    _bgl_load_reg1(sm->s1);
-    _bgl_load_reg0_up(sm->s3);
-    _bgl_load_reg1_up(sm->s2);
-    _bgl_vector_i_mul_sub_reg0();
-    _bgl_vector_i_mul_sub_reg1();
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_1);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
 
-    _bgl_su3_inverse_multiply_double((*um));
-    _bgl_vector_cmplxcg_mul_double(phase_1);
-
-    _bgl_add_to_rs0_reg0();
-    _bgl_add_to_rs1_reg1();
-    _bgl_i_mul_add_to_rs3_reg0();
-    _bgl_i_mul_add_to_rs2_reg1();
+      _sse_load(rs.s2);
+      _sse_vector_i_mul();      
+      _sse_vector_sub();
+      _sse_store(rs.s2);       
 
-    /*********************** direction +2 ************************/
+      /******************************* direction -1 *********************************/
 
-    iy=g_idn[ix][2];
+      iy=g_iup[ix][2];
 
-    um=&g_gauge_field[iy][2];
-    _prefetch_su3(um);
-    sm = (spinor *) Q + iy;
-    _prefetch_spinor(sm);
+      sp = (spinor *) Q + iy;
+      _prefetch_spinor(sp);
 
-    _bgl_load_reg0(sp->s0);
-    _bgl_load_reg1(sp->s1);
-    _bgl_load_reg1_up(sp->s2);
-    _bgl_load_reg0_up(sp->s3);
-    _bgl_vector_add_reg0();
-    _bgl_vector_sub_reg1();
+      _sse_load(sm->s0);
+      _sse_load_up(sm->s3);
+      _sse_vector_i_mul();
+      _sse_vector_sub();
+      
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_1);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
+
+      _sse_load(rs.s3);
+      _sse_vector_i_mul();      
+      _sse_vector_add();
+      _sse_store(rs.s3);
+
+      up+=1;
+      _prefetch_su3(up);
+
+      _sse_load(sm->s1);
+      _sse_load_up(sm->s2);
+      _sse_vector_i_mul();
+      _sse_vector_sub();
+      
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_1);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
 
-    _bgl_su3_multiply_double((*up));
-    _bgl_vector_cmplx_mul_double(phase_2);
+      _sse_load(rs.s2);
+      _sse_vector_i_mul();      
+      _sse_vector_add();
+      _sse_store(rs.s2);
 
-    _bgl_add_to_rs0_reg0();
-    _bgl_add_to_rs1_reg1();
-    _bgl_sub_from_rs2_reg1();
-    _bgl_add_to_rs3_reg0();
+      /******************************* direction +2 *********************************/
 
+      iy=g_idn[ix][2];
 
-    /*********************** direction -2 ************************/
+      sm = (spinor *) Q + iy;
+      _prefetch_spinor(sm);
 
-    iy=g_iup[ix][3]; 
+      _sse_load(sp->s0);
+      _sse_load_up(sp->s3);
+      _sse_vector_add();
 
-    up+=1;
-    _prefetch_su3(up); 
-    sp = (spinor *) Q + iy;
-    _prefetch_spinor(sp); 
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_2);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
 
-    _bgl_load_reg0(sm->s0);
-    _bgl_load_reg1(sm->s1);
-    _bgl_load_reg1_up(sm->s2);
-    _bgl_load_reg0_up(sm->s3);
-    _bgl_vector_sub_reg0();
-    _bgl_vector_add_reg1();
+      _sse_load(rs.s3);
+      _sse_vector_add();
+      _sse_store(rs.s3);
+      
+      um=&g_gauge_field[iy][2];
+      _prefetch_su3(um);
 
-    _bgl_su3_inverse_multiply_double((*um));
-    _bgl_vector_cmplxcg_mul_double(phase_2);
+      _sse_load(sp->s1);
+      _sse_load_up(sp->s2);
+      _sse_vector_sub();
 
-    _bgl_add_to_rs0_reg0();
-    _bgl_add_to_rs1_reg1();
-    _bgl_add_to_rs2_reg1();
-    _bgl_sub_from_rs3_reg0();
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_2);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
 
-    /*********************** direction +3 ************************/
+      _sse_load(rs.s2);
+      _sse_vector_sub();
+      _sse_store(rs.s2);      
 
-    iy=g_idn[ix][3]; 
+      /******************************* direction -2 *********************************/
 
-    um=&g_gauge_field[iy][3]; 
-    _prefetch_su3(um); 
-    sm = (spinor *) Q + iy;
-    _prefetch_spinor(sm); 
+      iy=g_iup[ix][3];
 
-    _bgl_load_reg0(sp->s0);
-    _bgl_load_reg1(sp->s1);
-    _bgl_load_reg0_up(sp->s2);
-    _bgl_load_reg1_up(sp->s3);
-    _bgl_vector_i_mul_add_reg0();
-    _bgl_vector_i_mul_sub_reg1();
+      sp = (spinor *) Q + iy;
+      _prefetch_spinor(sp);
 
-    _bgl_su3_multiply_double((*up));
-    _bgl_vector_cmplx_mul_double(phase_3);
+      _sse_load(sm->s0);
+      _sse_load_up(sm->s3);
+      _sse_vector_sub();
+      
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_2);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
+
+      _sse_load(rs.s3);
+      _sse_vector_sub();
+      _sse_store(rs.s3);
+      
+      up+=1;
+      _prefetch_su3(up);
 
-    _bgl_add_to_rs0_reg0();
-    _bgl_add_to_rs1_reg1();
-    _bgl_i_mul_sub_from_rs2_reg0();
-    _bgl_i_mul_add_to_rs3_reg1();
+      _sse_load(sm->s1);
+      _sse_load_up(sm->s2);
+      _sse_vector_add();
+      
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_2);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
+
+      _sse_load(rs.s2);
+      _sse_vector_add();
+      _sse_store(rs.s2);      
+      
+      /******************************* direction +3 *********************************/
 
-    /*********************** direction -3 ************************/
+      iy=g_idn[ix][3];
 
-    iz=(ix+1+VOLUME)%VOLUME;
+      sm = (spinor *) Q + iy;
+      _prefetch_spinor(sm);
 
-    iy=g_iup[iz][0];
+      _sse_load(sp->s0);
+      _sse_load_up(sp->s2);
+      _sse_vector_i_mul();
+      _sse_vector_add();
 
-    up=&g_gauge_field[iz][0];
-    _prefetch_su3(up); 
-    sp = (spinor *) Q + iy;
-    _prefetch_spinor(sp); 
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_3);
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store(rs.s0);
 
-    _bgl_load_reg0(sm->s0);
-    _bgl_load_reg1(sm->s1);
-    _bgl_load_reg0_up(sm->s2);
-    _bgl_load_reg1_up(sm->s3);
-    _bgl_vector_i_mul_sub_reg0();
-    _bgl_vector_i_mul_add_reg1();
+      _sse_load(rs.s2);
+      _sse_vector_i_mul();      
+      _sse_vector_sub();
+      _sse_store(rs.s2);
+      
+      um=&g_gauge_field[iy][3];
+      _prefetch_su3(um);
+
+      _sse_load(sp->s1);
+      _sse_load_up(sp->s3);
+      _sse_vector_i_mul();
+      _sse_vector_sub();
+
+      _sse_su3_multiply((*up));
+      _sse_vector_cmplx_mul(phase_3);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store(rs.s1);
+
+      _sse_load(rs.s3);
+      _sse_vector_i_mul();      
+      _sse_vector_add();
+      _sse_store(rs.s3);
+      
+      /******************************* direction -3 *********************************/
 
-    _bgl_su3_inverse_multiply_double((*um));
-    _bgl_vector_cmplxcg_mul_double(phase_3);
+      iz=(ix+1+VOLUME)%VOLUME;
 
-    _bgl_add_to_rs0_reg0();
-    _bgl_store_rs0(rn->s0);
-    _bgl_i_mul_add_to_rs2_reg0();
-    _bgl_store_rs2(rn->s2);
+      iy=g_iup[iz][0];
+      
+      sp = (spinor *) Q + iy;
+      _prefetch_spinor(sp);
 
-    _bgl_add_to_rs1_reg1();
-    _bgl_store_rs1(rn->s1);
-    _bgl_i_mul_sub_from_rs3_reg1();
-    _bgl_store_rs3(rn->s3);
+      _sse_load(sm->s0);
+      _sse_load_up(sm->s2);
+      _sse_vector_i_mul();
+      _sse_vector_sub();
+      
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_3);
+      rn = (spinor *) P + ix;
+      
+      _sse_load(rs.s0);
+      _sse_vector_add();
+      _sse_store_nt(rn->s0);
+
+      _sse_load(rs.s2);
+      _sse_vector_i_mul();      
+      _sse_vector_add();
+      _sse_store_nt(rn->s2);
+
+      up=&g_gauge_field[iz][0];
+      _prefetch_su3(up);
+
+      _sse_load(sm->s1);
+      _sse_load_up(sm->s3);
+      _sse_vector_i_mul();
+      _sse_vector_add();
+      
+      _sse_su3_inverse_multiply((*um));
+      _sse_vector_cmplxcg_mul(phase_3);
+      _sse_load(rs.s1);
+      _sse_vector_add();
+      _sse_store_nt(rn->s1);
+
+      _sse_load(rs.s3);
+      _sse_vector_i_mul();      
+      _sse_vector_sub();
+      _sse_store_nt(rn->s3);
+      
+      /******************************** end of loop *********************************/
 
-    /************************ end of loop ************************/
-  }
+    }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
 }
 
+void D_psi(spinor * const P, spinor * const Q){ /* dispatcher: forwards (P,Q) unchanged */
+   if(g_c_sw > 0) /* a positive global g_c_sw selects Dsw_psi */
+     Dsw_psi(P,Q);
+   else /* g_c_sw <= 0 selects Dtm_psi */
+     Dtm_psi(P,Q);
 
-#else
+   return ;
+}
 
-/* Serially Checked ! */
+#endif // SSE
 
-void D_psi(spinor * const P, spinor * const Q){
+void D_psi_bispinor(bispinor * const P, bispinor * const Q){
   if(P==Q){
     printf("Error in D_psi (operator.c):\n");
     printf("Arguments must be different spinor fields\n");
@@ -1208,386 +1505,116 @@ void D_psi(spinor * const P, spinor * const Q){
       update_backward_gauge(g_gauge_field);
   }
 #endif
-# if defined MPI
-  xchange_lexicfield(Q);
-# endif
+#if defined TM_USE_MPI
+  generic_exchange(Q, sizeof(bispinor));
+#endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   int ix,iy;
   su3 * restrict up,* restrict um;
-  spinor * restrict rr; 
-  spinor const * restrict s;
-  spinor const * restrict sp;
-  spinor const * restrict sm;
+  bispinor * restrict rr;
+  bispinor const * restrict s;
+  bispinor const * restrict sp;
+  bispinor const * restrict sm;
   _Complex double rho1, rho2;
-  spinor tmpr;
+  bispinor tmpr;
 
-  rho1 = 1. + g_mu * I;
+  rho1 = 0.5*(1. + g_mu * I);
   rho2 = conj(rho1);
 
   /************************ loop over all lattice sites *************************/
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix=0;ix<VOLUME;ix++)
   {
-    rr  = (spinor *) P +ix;
-    s  = (spinor *) Q +ix;
+    rr  = (bispinor *) P +ix;
+    s  = (bispinor *) Q +ix;
+
+    _bispinor_null( tmpr);
+
+    if(g_c_sw > 0) {
+      (assign_mul_one_sw_pm_imu_site_lexic)(ix, &(tmpr.sp_up), &(s->sp_up), +g_mu);
+      (assign_mul_one_sw_pm_imu_site_lexic)(ix, &(tmpr.sp_dn), &(s->sp_dn), -g_mu);
+    }
+    else {
+      _complex_times_vector(tmpr.sp_up.s0, rho1, s->sp_up.s0);
+      _complex_times_vector(tmpr.sp_up.s1, rho1, s->sp_up.s1);
+      _complex_times_vector(tmpr.sp_up.s2, rho2, s->sp_up.s2);
+      _complex_times_vector(tmpr.sp_up.s3, rho2, s->sp_up.s3);
+
+      _complex_times_vector(tmpr.sp_dn.s0, rho2, s->sp_dn.s0);
+      _complex_times_vector(tmpr.sp_dn.s1, rho2, s->sp_dn.s1);
+      _complex_times_vector(tmpr.sp_dn.s2, rho1, s->sp_dn.s2);
+      _complex_times_vector(tmpr.sp_dn.s3, rho1, s->sp_dn.s3);
+
+    }
+
+    _vector_add_mul(tmpr.sp_up.s0, (3.5), s->sp_up.s0);
+    _vector_add_mul(tmpr.sp_up.s1, (3.5), s->sp_up.s1);
+    _vector_add_mul(tmpr.sp_up.s2, (3.5), s->sp_up.s2);
+    _vector_add_mul(tmpr.sp_up.s3, (3.5), s->sp_up.s3);
+
+    _vector_add_mul(tmpr.sp_dn.s0, (3.5), s->sp_dn.s0);
+    _vector_add_mul(tmpr.sp_dn.s1, (3.5), s->sp_dn.s1);
+    _vector_add_mul(tmpr.sp_dn.s2, (3.5), s->sp_dn.s2);
+    _vector_add_mul(tmpr.sp_dn.s3, (3.5), s->sp_dn.s3);
 
-    _complex_times_vector(tmpr.s0, rho1, s->s0);
-    _complex_times_vector(tmpr.s1, rho1, s->s1);
-    _complex_times_vector(tmpr.s2, rho2, s->s2);
-    _complex_times_vector(tmpr.s3, rho2, s->s3);
 
     /******************************* direction +0 *********************************/
     iy=g_iup[ix][0];
-    sp = (spinor *) Q +iy;
+    sp = (bispinor *) Q +iy;
     up=&g_gauge_field[ix][0];
-    p0add(&tmpr, sp, up, phase_0);
+    p0add_bispinor(&tmpr, sp, up, -0.5*phase_0);
 
     /******************************* direction -0 *********************************/
     iy=g_idn[ix][0];
-    sm  = (spinor *) Q +iy;
+    sm  = (bispinor *) Q +iy;
     um=&g_gauge_field[iy][0];
-    m0add(&tmpr, sm, um, phase_0);
+    m0add_bispinor(&tmpr, sm, um,-0.5*phase_0);
 
     /******************************* direction +1 *********************************/
     iy=g_iup[ix][1];
-    sp = (spinor *) Q +iy;
+    sp = (bispinor *) Q +iy;
     up=&g_gauge_field[ix][1];
-    p1add(&tmpr, sp, up, phase_1);
+    p1add_bispinor(&tmpr, sp, up,-0.5*phase_1);
 
     /******************************* direction -1 *********************************/
     iy=g_idn[ix][1];
-    sm = (spinor *) Q +iy;
+    sm = (bispinor *) Q +iy;
     um=&g_gauge_field[iy][1];
-    m1add(&tmpr, sm, um, phase_1);
+    m1add_bispinor(&tmpr, sm, um,-0.5*phase_1);
 
     /******************************* direction +2 *********************************/
     iy=g_iup[ix][2];
-    sp = (spinor *) Q +iy;
+    sp = (bispinor *) Q +iy;
     up=&g_gauge_field[ix][2];
-    p2add(&tmpr, sp, up, phase_2);
+    p2add_bispinor(&tmpr, sp, up,-0.5*phase_2);
 
     /******************************* direction -2 *********************************/
     iy=g_idn[ix][2];
-    sm = (spinor *) Q +iy;
+    sm = (bispinor *) Q +iy;
     um=&g_gauge_field[iy][2];
-    m2add(&tmpr, sm, um, phase_2);
+    m2add_bispinor(&tmpr, sm, um,-0.5*phase_2);
 
     /******************************* direction +3 *********************************/
     iy=g_iup[ix][3];
-    sp = (spinor *) Q +iy;
+    sp = (bispinor *) Q +iy;
     up=&g_gauge_field[ix][3];
-    p3add(&tmpr, sp, up, phase_3);
+    p3add_bispinor(&tmpr, sp, up,-0.5*phase_3);
 
     /******************************* direction -3 *********************************/
     iy=g_idn[ix][3];
-    sm = (spinor *) Q +iy;
+    sm = (bispinor *) Q +iy;
     um=&g_gauge_field[iy][3];
-    m3addandstore(rr, sm, um, phase_3, &tmpr);
+    m3addandstore_bispinor(rr, sm, um,-0.5*phase_3, &tmpr);
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
-
-#endif
-
-void D_psi_prec(spinor * const P, spinor * const Q){
-
-  /* todo: do preconditioning */
-  spinorPrecWS *ws=(spinorPrecWS*)g_precWS;
-  static _Complex double alpha = -1.0;
-
-  alpha = -0.5;
-  spinorPrecondition(P,Q,ws,T,L,alpha,0,1);
-  D_psi(g_spinor_field[DUM_MATRIX],P);
-  alpha = -0.5;
-  spinorPrecondition(P,g_spinor_field[DUM_MATRIX],ws,T,L,alpha,0,1);
-}
-
-/* apply the Dirac operator to the block local spinor field s */
-/* and store the result in block local spinor field rr        */
-/* for block blk                                              */
-/* the block local gauge field is assumed to be in the order  */
-/* that is needed int local_D, which means also that it is a  */
-/* double copy                                                */
-
-void Block_D_psi(block * blk, spinor * const rr, spinor * const s) {
-  int i;
-  spinor *r = rr;
-  spinor *t = s;
-  su3 * u = blk->u;
-  int * idx = blk->idx;
-  static _Complex double rhoa, rhob;
-  spinor tmpr;
-#if (defined BGL && defined XLC)
-  __alignx(16,s);
-#endif
-  if(blk_gauge_eo) {
-    init_blocks_gaugefield();
-  }
-  rhoa = 1.0 + g_mu * I;
-  rhob = conj(rhoa);
-
-  /* set the boundary term to zero */
-  _spinor_null(rr[blk->volume]);
-  _spinor_null(s[blk->volume]);
-
-  for(i = 0; i < blk->volume; i++) {
-#if (defined BGL && defined XLC)
-    _bgl_load_rs0(t->s0);
-    _bgl_load_rs1(t->s1);
-    _bgl_load_rs2(t->s2);
-    _bgl_load_rs3(t->s3);
-    _bgl_vector_cmplx_mul_rs(rhoa);
-#else
-    _complex_times_vector(tmpr.s0, rhoa, t->s0);
-    _complex_times_vector(tmpr.s1, rhoa, t->s1);
-    _complex_times_vector(tmpr.s2, rhob, t->s2);
-    _complex_times_vector(tmpr.s3, rhob, t->s3);
-#endif
-
-    local_H(r, s, u, idx, &tmpr);
-
-    r++;
-    t++;
-    idx += 8;
-    u += 8;
-  }
-  return;
-}
-
-/* Apply Hopping Matrix to a even(odd) spinor */
-void Block_H_psi(block * blk, spinor * const rr, spinor * const s, const int eo) {
-  int i;
-  spinor *r = rr;
-  su3 * u = blk->u;
-  int * eoidx = blk->evenidx;
-  spinor tmpr;
-
-  if(!blk_gauge_eo) {
-    init_blocks_eo_gaugefield();
-  }
-
-  /* for OE */
-  if(eo == 1) {
-    u = blk->u + blk->volume*8/2;
-    eoidx = blk->oddidx;
-  }
-
-  /* set the boundary term to zero */
-  _spinor_null(rr[blk->volume/2]);
-  _spinor_null(s[blk->volume/2]);
-  
-  for(i = 0; i < blk->volume/2; i++) {
-#if (defined BGL && defined XLC)
-    _spinor_null(tmpr);
-    _bgl_load_rs0(tmpr.s0);
-    _bgl_load_rs1(tmpr.s1);
-    _bgl_load_rs2(tmpr.s2);
-    _bgl_load_rs3(tmpr.s3);
-#else
-    _spinor_null(tmpr);
-#endif
-
-    local_H(r, s, u, eoidx, &tmpr);
-
-    r++;
-    eoidx += 8;
-    u += 8;
-  }
-  return;
-}
-
-/* direction +t */
-void boundary_D_0(spinor * const r, spinor * const s, su3 * const u) {
-
-  static su3_vector chi, psi;
-
-  _vector_add(psi,s->s0,s->s2);
-
-  _su3_multiply(chi,(*u),psi);
-
-  _complex_times_vector(r->s0, phase_0, chi);
-  _vector_assign(r->s2,r->s0);
-
-  _vector_add(psi,s->s1,s->s3);
-
-  _su3_multiply(chi,(*u),psi);
-
-  _complex_times_vector(r->s1, phase_0, chi);
-  _vector_assign(r->s3, r->s1);
-
-  return;
-}
-
-/* direction -t */
-void boundary_D_1(spinor * const r, spinor * const s, su3 * restrict u) {
-
-  static su3_vector chi, psi;
-
-  _vector_sub(psi, s->s0, s->s2);
-
-  _su3_inverse_multiply(chi, (*u), psi);
-
-  _complexcjg_times_vector(r->s0, phase_0, chi);
-  _vector_minus_assign(r->s2, r->s0);
-
-  _vector_sub(psi,s->s1,s->s3);
-
-  _su3_inverse_multiply(chi,(*u),psi);
-
-  _complexcjg_times_vector(r->s1,phase_0,chi);
-  _vector_minus_assign(r->s3, r->s1);
-
-  return;
-}
-
-/* direction +x */
-void boundary_D_2(spinor * const r, spinor * const s, su3 * restrict u) {
-
-  static su3_vector chi, psi;
-
-  _vector_i_add(psi,s->s0,s->s3);
-
-  _su3_multiply(chi,(*u),psi);
-
-  _complex_times_vector(r->s0, phase_1, chi);
-  _vector_null(r->s3);
-  _vector_i_sub_assign(r->s3, r->s0);
-
-  _vector_i_add(psi,s->s1,s->s2);
-
-  _su3_multiply(chi,(*u),psi);
-
-  _complex_times_vector(r->s1, phase_1, chi);
-  _vector_null(r->s2);
-  _vector_i_sub_assign(r->s2, r->s1);
-
-  return;
-}
-
-/* direction -x */
-void boundary_D_3(spinor * const r, spinor * const s, su3 * restrict u) {
-
-  static su3_vector chi, psi;
-
-  _vector_i_sub(psi,s->s0,s->s3);
-
-  _su3_inverse_multiply(chi,(*u),psi);
-
-  _complexcjg_times_vector(r->s0, phase_1, chi);
-  _vector_null(r->s3);
-  _vector_i_add_assign(r->s3, r->s0);
-
-  _vector_i_sub(psi,s->s1,s->s2);
-
-  _su3_inverse_multiply(chi,(*u),psi);
-
-  _complexcjg_times_vector(r->s1, phase_1, chi);
-  _vector_null(r->s2);
-  _vector_i_add_assign(r->s2, r->s1);
-
-  return;
-}
-
-/* direction +y */
-void boundary_D_4(spinor * const r, spinor * const s, su3 * restrict u) {
-
-  static su3_vector chi, psi;
-
-  _vector_add(psi,s->s0,s->s3);
-
-  _su3_multiply(chi,(*u),psi);
-
-  _complex_times_vector(r->s0, phase_2, chi);
-  _vector_assign(r->s3, r->s0);
-
-  _vector_sub(psi,s->s1,s->s2);
-
-  _su3_multiply(chi,(*u),psi);
-
-  _complex_times_vector(r->s1, phase_2, chi);
-  _vector_minus_assign(r->s2, r->s1);
-
-  return;
-}
-
-/* direction -y */
-void boundary_D_5(spinor * const r, spinor * const s, su3 * restrict u) {
-
-  static su3_vector chi, psi;
-
-  _vector_sub(psi,s->s0,s->s3);
-
-  _su3_inverse_multiply(chi,(*u),psi);
-
-  _complexcjg_times_vector(r->s0, phase_2, chi);
-  _vector_minus_assign(r->s3, r->s0);
-
-  _vector_add(psi,s->s1,s->s2);
-
-  _su3_inverse_multiply(chi,(*u),psi);
-
-  _complexcjg_times_vector(r->s1, phase_2, chi);
-  _vector_assign(r->s2, r->s1);
-
-
-  return;
-}
-
-/* direction +z */
-void boundary_D_6(spinor * const r, spinor * const s, su3 * restrict u) {
-
-  static su3_vector chi, psi;
-
-  _vector_i_add(psi,s->s0,s->s2);
-
-  _su3_multiply(chi,(*u),psi);
-
-  _complex_times_vector(r->s0, phase_3, chi);
-  _vector_null(r->s2);
-  _vector_i_sub_assign(r->s2, r->s0);
-
-  _vector_i_sub(psi,s->s1,s->s3);
-
-  _su3_multiply(chi,(*u),psi);
-
-  _complex_times_vector(r->s1, phase_3, chi);
-  _vector_null(r->s3);
-  _vector_i_add_assign(r->s3, r->s1);
-
-  return;
-}
-
-/* direction -z */
-void boundary_D_7(spinor * const r, spinor * const s, su3 * restrict u) {
-
-  static su3_vector chi, psi;
-
-  _vector_i_sub(psi,s->s0,s->s2);
-
-  _su3_inverse_multiply(chi,(*u),psi);
-
-  _complexcjg_times_vector(r->s0, phase_3, chi);
-  _vector_null(r->s2);
-  _vector_i_add_assign(r->s2, r->s0);
-
-  _vector_i_add(psi,s->s1,s->s3);
-
-  _su3_inverse_multiply(chi,(*u),psi);
-
-  _complexcjg_times_vector(r->s1, phase_3, chi);
-  _vector_null(r->s3);
-  _vector_i_sub_assign(r->s3, r->s1);
-
-  return;
-}
-
diff --git a/operator/D_psi.h b/operator/D_psi.h
index a5e802c07..179aaf423 100644
--- a/operator/D_psi.h
+++ b/operator/D_psi.h
@@ -21,13 +21,35 @@
 #ifndef _D_PSI_H
 #define _D_PSI_H
 
+void generic_exchange(void *field_in, int bytes_per_site);
+
 #include "block.h"
 
+//This works with tm and tm+clover 
 void D_psi(spinor * const P, spinor * const Q);
+void D_psi_bispinor(bispinor * const P, bispinor * const Q);
+
+void D_psi_32(spinor32 * const P, spinor32 * const Q);
 void D_psi_prec(spinor * const P, spinor * const Q);
+
+//works for tm and tm+clover
 void Block_D_psi(block * blk, spinor * const rr, spinor * const s);
 void Block_H_psi(block * blk, spinor * const rr, spinor * const s, const int eo);
 
+void Block_D_psi_32(block * blk, spinor32 * const rr, spinor32 * const s);
+void Block_H_psi_32(block * blk, spinor32 * const rr, spinor32 * const s, const int eo);
+
+//c_sw=0
+void Block_Dtm_psi(block * blk, spinor * const rr, spinor * const s);
+//c_sw > 0
+void Block_Dsw_psi(block * blk, spinor * const rr, spinor * const s);
+
+//c_sw=0
+void Block_Dtm_psi_32(block * blk, spinor32 * const rr, spinor32 * const s);
+//c_sw > 0
+void Block_Dsw_psi_32(block * blk, spinor32 * const rr, spinor32 * const s);
+
+
 void boundary_D_0(spinor * const r, spinor * const s, su3 *u);
 void boundary_D_1(spinor * const r, spinor * const s, su3 *u);
 void boundary_D_2(spinor * const r, spinor * const s, su3 *u);
diff --git a/operator/D_psi_BSM.c b/operator/D_psi_BSM.c
index d624c39ea..b19632874 100644
--- a/operator/D_psi_BSM.c
+++ b/operator/D_psi_BSM.c
@@ -25,9 +25,8 @@
  * Action of a Dirac operator (Frezzotti-Rossi BSM toy model) on a bispinor field
  *
  *******************************************************************************/
-
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
@@ -37,7 +36,7 @@
 #include "su3.h"
 #include "sse.h"
 #include "boundary.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include "xchange/xchange.h"
 #endif
 #include "update_backward_gauge.h"
@@ -61,11 +60,11 @@
  */
 
 static inline void Fadd(bispinor * const out, const bispinor * const in, const scalar * const phi, const double c, const double sign) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static spinor tmp;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
   
@@ -139,11 +138,11 @@ static inline void Fadd(bispinor * const out, const bispinor * const in, const s
 static inline void bispinor_times_phase_times_u(bispinor * restrict const us, const _Complex double phase,
 						su3 const * restrict const u, bispinor const * restrict const s)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -177,11 +176,11 @@ static inline void bispinor_times_phase_times_u(bispinor * restrict const us, co
 static inline void bispinor_times_phase_times_inverse_u(bispinor * restrict const us, const _Complex double phase,
 							su3 const * restrict const u, bispinor const * restrict const s)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -217,11 +216,11 @@ static inline void p0add(bispinor * restrict const tmpr , bispinor const * restr
                          const double phaseF, const scalar * const phi, const scalar * const phip,
                          const double sign) {
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static bispinor us;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -253,11 +252,11 @@ static inline void p1add(bispinor * restrict const tmpr, bispinor const * restri
                          su3 const * restrict const u, const int inv, const _Complex double phase,
                          const double phaseF, const scalar * const phi, const scalar * const phip, 
                          const double sign) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static bispinor us;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -289,11 +288,11 @@ static inline void p2add(bispinor * restrict const tmpr, bispinor const * restri
                          su3 const * restrict const u, const int inv, const _Complex double phase,
                          const double phaseF, const scalar * const phi, const scalar * const phip, 
                          const double sign) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static bispinor us;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -325,11 +324,11 @@ static inline void p3add(bispinor * restrict const tmpr, bispinor const * restri
                          su3 const * restrict const u, const int inv, const _Complex double phase,
                          const double phaseF, const scalar * const phi, const scalar * const phip, 
                          const double sign) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static bispinor us;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -357,6 +356,53 @@ static inline void p3add(bispinor * restrict const tmpr, bispinor const * restri
   return;
 }
 
+static inline void tm3_add(bispinor * const out, const bispinor * const in, const double sign)
+{
+  /* out += s * i\gamma_5 \tau_3 * mu03_BSM * in, with s = -1 if sign < 0 and s = +1 otherwise
+   *   sign > 0 for D     + i\gamma_5\tau_3
+   *   sign < 0 for D_dag - i\gamma_5\tau_3
+   */
+  double s = +1.;
+  if(sign < 0) s = -1.; /* only the sign of the argument is used; sign == 0 counts as positive */
+
+  /* out_up += s * i \gamma_5 \mu3 * in_up  (components s2, s3 enter with the opposite sign) */
+  _vector_add_i_mul(out->sp_up.s0,  s*mu03_BSM, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_up.s1,  s*mu03_BSM, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_up.s2, -s*mu03_BSM, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_up.s3, -s*mu03_BSM, in->sp_up.s3);
+
+
+  /* out_dn += -s * i \gamma_5 \mu3 * in_dn  (every coefficient negated relative to the up block) */
+  _vector_add_i_mul(out->sp_dn.s0, -s*mu03_BSM, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_dn.s1, -s*mu03_BSM, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_dn.s2,  s*mu03_BSM, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_dn.s3,  s*mu03_BSM, in->sp_dn.s3);
+
+}
+static inline void tm1_add(bispinor * const out, const bispinor * const in, const double sign)
+{
+  /* out += s * i\gamma_5 \tau_1 * mu01_BSM * in, with s = -1 if sign < 0 and s = +1 otherwise
+   *   sign > 0 for D     + i\gamma_5\tau_1
+   *   sign < 0 for D_dag - i\gamma_5\tau_1
+   */
+  double s = +1.;
+  if(sign < 0) s = -1.; /* only the sign of the argument is used; sign == 0 counts as positive */
+
+  /* out_up += s * i \gamma_5 \mu1 * in_dn  (flavour-mixing: up output is fed from the dn input) */
+  _vector_add_i_mul(out->sp_up.s0,  s*mu01_BSM, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_up.s1,  s*mu01_BSM, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_up.s2, -s*mu01_BSM, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_up.s3, -s*mu01_BSM, in->sp_dn.s3);
+
+
+  /* out_dn += s * i \gamma_5 \mu1 * in_up  (same coefficients as above, dn output fed from up input) */
+  _vector_add_i_mul(out->sp_dn.s0,  s*mu01_BSM, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_dn.s1,  s*mu01_BSM, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_dn.s2, -s*mu01_BSM, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_dn.s3, -s*mu01_BSM, in->sp_up.s3);
+
+}
+
 
 /* D_psi_BSM acts on bispinor fields */
 void D_psi_BSM(bispinor * const P, bispinor * const Q){
@@ -371,11 +417,11 @@ void D_psi_BSM(bispinor * const P, bispinor * const Q){
     update_backward_gauge(g_gauge_field);
   }
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
   generic_exchange(Q, sizeof(bispinor));
 #endif
         
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -393,7 +439,7 @@ void D_psi_BSM(bispinor * const P, bispinor * const Q){
 
     /************************ loop over all lattice sites *************************/
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
     for (ix=0;ix<VOLUME;ix++)
@@ -445,6 +491,15 @@ void D_psi_BSM(bispinor * const P, bispinor * const Q){
 	  Fadd(rr, s, phim[mu], 0.25*rho_BSM, +1.);
 	}
 
+        /* tmpr+=i\gamma_5\tau_1 mu0 *Q */
+        if( fabs(mu01_BSM) > 1.e-10 )
+          tm1_add(rr, s, 1);
+
+        /* tmpr+=i\gamma_5\tau_3 mu0 *Q */
+        if( fabs(mu03_BSM) > 1.e-10 )
+          tm3_add(rr, s, 1);
+
+
 
 	// the hopping part:
 	// tmpr += +-1/2 \sum_\mu (\gamma_\mu -+ \rho_BSM/2*F(x) -+ \rho_BSM/2*F(x+-\mu)*U_{+-\mu}(x)*Q(x+-\mu)
@@ -496,7 +551,7 @@ void D_psi_BSM(bispinor * const P, bispinor * const Q){
 	um=&g_gauge_field[iy][3];
 	p3add(rr, sm, um, 1, -0.5*phase_3, 0.5*rho_BSM, phi, phim[3], +1.);
       }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
@@ -516,11 +571,11 @@ void D_psi_dagger_BSM(bispinor * const P, bispinor * const Q){
     update_backward_gauge(g_gauge_field);
   }
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
   generic_exchange(Q, sizeof(bispinor));
 #endif
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -538,7 +593,7 @@ void D_psi_dagger_BSM(bispinor * const P, bispinor * const Q){
     
     /************************ loop over all lattice sites *************************/
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
     for (ix = 0; ix < VOLUME; ix++) {
@@ -586,6 +641,15 @@ void D_psi_dagger_BSM(bispinor * const P, bispinor * const Q){
         Fadd(rr, s, phip[mu], 0.25*rho_BSM, -1.);
         Fadd(rr, s, phim[mu], 0.25*rho_BSM, -1.);
       }
+
+      /* tmpr+=i\gamma_5\tau_1 mu0 *Q */
+      if( fabs(mu01_BSM) > 1.e-10 )
+        tm1_add(rr, s, 1);
+
+      /* tmpr+=i\gamma_5\tau_3 mu0 *Q */
+      if( fabs(mu03_BSM) > 1.e-10 )
+        tm3_add(rr, s, 1);
+
       
       // the hopping part:
       // tmpr += +-1/2 \sum_\mu (-\gamma_\mu -+ \rho_BSM/2*Fbar(x) -+ \rho_BSM/2*Fbar(x+-\mu)*U_{+-\mu}(x)*Q(x+-\mu)
@@ -637,7 +701,7 @@ void D_psi_dagger_BSM(bispinor * const P, bispinor * const Q){
       um=&g_gauge_field[iy][3];
       p3add(rr, sm, um, 1, 0.5*phase_3, -0.5*rho_BSM, phi, phim[3], -1.);
     }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
diff --git a/operator/D_psi_BSM2b.c b/operator/D_psi_BSM2b.c
index 6b2186ceb..9cbbc15f0 100644
--- a/operator/D_psi_BSM2b.c
+++ b/operator/D_psi_BSM2b.c
@@ -28,9 +28,9 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
-
+#ifdef TM_USE_BSM
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
@@ -38,7 +38,7 @@
 #include "su3.h"
 #include "sse.h"
 #include "boundary.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include "xchange/xchange.h"
 #endif
 #include "update_backward_gauge.h"
@@ -63,12 +63,59 @@
  * sign = -1 -> Fbaradd
  */
 
+static inline void tm3_add(bispinor * const out, const bispinor * const in, const double sign)
+{  
+  /* out += s * i\gamma_5 \tau_3 * mu03_BSM * in, with s = -1 if sign < 0 and s = +1 otherwise
+   *   sign > 0 for D     + i\gamma_5\tau_3
+   *   sign < 0 for D_dag - i\gamma_5\tau_3
+   */
+  double s = +1.;
+  if(sign < 0) s = -1.; // only the sign of the argument is used; sign == 0 counts as positive
+  
+  // out_up += s * i \gamma_5 \mu3 * in_up  (components s2, s3 enter with the opposite sign)
+  _vector_add_i_mul(out->sp_up.s0,  s*mu03_BSM, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_up.s1,  s*mu03_BSM, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_up.s2, -s*mu03_BSM, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_up.s3, -s*mu03_BSM, in->sp_up.s3);
+  
+  
+  // out_dn += -s * i \gamma_5 \mu3 * in_dn  (every coefficient negated relative to the up block)
+  _vector_add_i_mul(out->sp_dn.s0, -s*mu03_BSM, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_dn.s1, -s*mu03_BSM, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_dn.s2,  s*mu03_BSM, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_dn.s3,  s*mu03_BSM, in->sp_dn.s3);
+  
+}
+static inline void tm1_add(bispinor * const out, const bispinor * const in, const double sign)
+{  
+  /* out += s * i\gamma_5 \tau_1 * mu01_BSM * in, with s = -1 if sign < 0 and s = +1 otherwise
+   *   sign > 0 for D     + i\gamma_5\tau_1
+   *   sign < 0 for D_dag - i\gamma_5\tau_1
+   */
+  double s = +1.;
+  if(sign < 0) s = -1.; // only the sign of the argument is used; sign == 0 counts as positive
+  
+  // out_up += s * i \gamma_5 \mu1 * in_dn  (flavour-mixing: up output is fed from the dn input)
+  _vector_add_i_mul(out->sp_up.s0,  s*mu01_BSM, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_up.s1,  s*mu01_BSM, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_up.s2, -s*mu01_BSM, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_up.s3, -s*mu01_BSM, in->sp_dn.s3);
+  
+  
+  // out_dn += s * i \gamma_5 \mu1 * in_up  (same coefficients as above, dn output fed from up input)
+  _vector_add_i_mul(out->sp_dn.s0,  s*mu01_BSM, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_dn.s1,  s*mu01_BSM, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_dn.s2, -s*mu01_BSM, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_dn.s3, -s*mu01_BSM, in->sp_up.s3);
+  
+}
+
 static inline void Fadd(bispinor * const out, const bispinor * const in, const scalar * const phi, const double c, const double sign) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static spinor tmp;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
   
@@ -142,11 +189,11 @@ static inline void Fadd(bispinor * const out, const bispinor * const in, const s
 static inline void bispinor_times_phase_times_u(bispinor * const us, const _Complex double phase,
             su3 const * restrict const u, bispinor const * const s)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -180,11 +227,11 @@ static inline void bispinor_times_phase_times_u(bispinor * const us, const _Comp
 static inline void bispinor_times_phase_times_inverse_u(bispinor * const us, const _Complex double phase,
               su3 const * restrict const u, bispinor const * const s)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -342,11 +389,11 @@ void D_psi_BSM2b(bispinor * const P, bispinor * const Q){
     update_backward_gauge(g_gauge_field);
   }
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
   generic_exchange(Q, sizeof(bispinor));
 #endif
         
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -366,7 +413,7 @@ void D_psi_BSM2b(bispinor * const P, bispinor * const Q){
 
     /************************ loop over all lattice sites *************************/
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix=0;ix<VOLUME;ix++)
@@ -492,9 +539,17 @@ void D_psi_BSM2b(bispinor * const P, bispinor * const Q){
     
     Fadd2hop(rr, spm2, phim[3], &stmp, -0.125*rho_BSM, +1.0, &uu, upm, upm2, phase_33, HOP_DN );
     p3add(rr, spm, &stmp, upm, HOP_DN, -0.5*phase_3);
+   
+   // tmpr+=i\gamma_5\tau_1 mu1 *Q 
+    if( fabs(mu01_BSM) > 1.e-10 )
+        tm1_add(rr, s, 1);
+    
+   // tmpr+=i\gamma_5\tau_3 mu3 *Q 
+    if( fabs(mu03_BSM) > 1.e-10 )
+        tm3_add(rr, s, 1);
 
   } 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
@@ -511,11 +566,11 @@ void D_psi_dagger_BSM2b(bispinor * const P, bispinor * const Q){
     update_backward_gauge(g_gauge_field);
   }
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
   generic_exchange(Q, sizeof(bispinor));
 #endif
         
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -535,7 +590,7 @@ void D_psi_dagger_BSM2b(bispinor * const P, bispinor * const Q){
 
     /************************ loop over all lattice sites *************************/
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix=0;ix<VOLUME;ix++)
@@ -661,9 +716,17 @@ void D_psi_dagger_BSM2b(bispinor * const P, bispinor * const Q){
     
     Fadd2hop(rr, spm2, phim[3], &stmp, -0.125*rho_BSM, -1.0, &uu, upm, upm2, phase_33, HOP_DN );
     p3add(rr, spm, &stmp, upm, HOP_DN, 0.5*phase_3);
-
+    
+   // tmpr-=i\gamma_5\tau_1 mu1 *Q  (sign=-1: daggered term)
+    if( fabs(mu01_BSM) > 1.e-10 )
+        tm1_add(rr, s, -1);
+   
+   // tmpr+=-i\gamma_5\tau_3 mu3 *Q 
+    if( fabs(mu03_BSM) > 1.e-10 )
+        tm3_add(rr, s, -1);
+    
   } 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
@@ -681,4 +744,4 @@ void Q2_psi_BSM2b(bispinor * const P, bispinor * const Q){
   }
 
 }
-
+#endif
diff --git a/operator/D_psi_BSM2f.c b/operator/D_psi_BSM2f.c
index ca7bfeff9..66ff1bdc4 100644
--- a/operator/D_psi_BSM2f.c
+++ b/operator/D_psi_BSM2f.c
@@ -26,11 +26,11 @@
  * with a scalar field coupling.
  *
  *******************************************************************************/
-
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
+#ifdef TM_USE_BSM
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
@@ -39,7 +39,7 @@
 #include "su3spinor.h"
 #include "sse.h"
 #include "boundary.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include "xchange/xchange.h"
 #endif
 #include "update_backward_gauge.h"
@@ -47,7 +47,9 @@
 #include "operator/D_psi_BSM2f.h"
 #include "solver/dirac_operator_eigenvectors.h"
 #include "buffers/utils.h"
+#if defined TM_USE_MPI
 #include "buffers/utils_nonblocking.h"
+#endif
 #include "linalg_eo.h"
 #include "fatal_error.h"
 
@@ -78,6 +80,9 @@ static bispinor *v2m1;
 static bispinor *v2m2;
 static bispinor *v2m3;
 static bispinor *v2m4;
+
+static bispinor *tempor;
+
 void init_D_psi_BSM2f(){
 
      vm1 =(bispinor *)calloc(VOLUMEPLUSRAND,sizeof(bispinor));
@@ -93,6 +98,7 @@ void init_D_psi_BSM2f(){
      v2m3=(bispinor *)calloc(VOLUMEPLUSRAND,sizeof(bispinor));
      v2m4=(bispinor *)calloc(VOLUMEPLUSRAND,sizeof(bispinor));
 
+     tempor=(bispinor *)calloc(VOLUMEPLUSRAND,sizeof(bispinor));
 }
 void free_D_psi_BSM2f(){
      free(vm1);
@@ -107,7 +113,57 @@ void free_D_psi_BSM2f(){
      free(v2m2);
      free(v2m3);
      free(v2m4);
+
+     free(tempor);
 }
+
+static inline void tm3_add(bispinor * const out, const bispinor * const in, const double sign)
+{
+  /* Accumulate the tau_3 twisted-mass term: out += sgn * i\gamma_5 \tau_3 mu03_BSM * in.
+   * Only the sign of `sign` is used: a negative value selects the D^dagger form
+   * (-i\gamma_5\tau_3), any other value the D form (+i\gamma_5\tau_3).
+   * The term is flavour diagonal: out_up reads in_up and out_dn reads in_dn. */
+  const double sgn = (sign < 0) ? -1. : +1.;
+
+  /* out_up += sgn * i \gamma_5 mu3 * in_up
+   * (spin components s2,s3 enter with the opposite sign to s0,s1) */
+  _vector_add_i_mul(out->sp_up.s0,  sgn*mu03_BSM, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_up.s1,  sgn*mu03_BSM, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_up.s2, -sgn*mu03_BSM, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_up.s3, -sgn*mu03_BSM, in->sp_up.s3);
+
+  /* out_dn -= sgn * i \gamma_5 mu3 * in_dn
+   * (every sign is flipped relative to the up flavour) */
+  _vector_add_i_mul(out->sp_dn.s0, -sgn*mu03_BSM, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_dn.s1, -sgn*mu03_BSM, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_dn.s2,  sgn*mu03_BSM, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_dn.s3,  sgn*mu03_BSM, in->sp_dn.s3);
+
+}
+static inline void tm1_add(bispinor * const out, const bispinor * const in, const double sign)
+{
+  /* Accumulate the tau_1 twisted-mass term: out += sgn * i\gamma_5 \tau_1 mu01_BSM * in.
+   * Only the sign of `sign` is used: a negative value selects the D^dagger form
+   * (-i\gamma_5\tau_1), any other value the D form (+i\gamma_5\tau_1).
+   * The flavours are crossed: out_up reads in_dn and out_dn reads in_up. */
+  const double sgn = (sign < 0) ? -1. : +1.;
+
+  /* out_up += sgn * i \gamma_5 mu1 * in_dn
+   * (spin components s2,s3 enter with the opposite sign to s0,s1) */
+  _vector_add_i_mul(out->sp_up.s0,  sgn*mu01_BSM, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_up.s1,  sgn*mu01_BSM, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_up.s2, -sgn*mu01_BSM, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_up.s3, -sgn*mu01_BSM, in->sp_dn.s3);
+
+  /* out_dn += sgn * i \gamma_5 mu1 * in_up
+   * (same spin-sign pattern as for the up flavour) */
+  _vector_add_i_mul(out->sp_dn.s0,  sgn*mu01_BSM, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_dn.s1,  sgn*mu01_BSM, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_dn.s2, -sgn*mu01_BSM, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_dn.s3, -sgn*mu01_BSM, in->sp_up.s3);
+
+}
+
 static inline void Fadd(bispinor * const out, const bispinor * const in, const scalar * const phi, const double c, const double sign) {
   static spinor tmp;
   double s = +1.;
@@ -348,7 +404,7 @@ void D_psi_BSM2f(bispinor * const P, bispinor * const Q){
   scalar phim[4][4];                   // phi_i(x-mu) = phim[mu][i]
   bispinor ALIGN stmp2;
 
-
+#if defined TM_USE_MPI
   MPI_Status  statuses[8];
   MPI_Request *request;
   request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
@@ -359,7 +415,7 @@ void D_psi_BSM2f(bispinor * const P, bispinor * const Q){
   generic_exchange_direction_nonblocking(Q, sizeof(bispinor), XUP, request, &count);
   generic_exchange_direction_nonblocking(Q, sizeof(bispinor), YUP, request, &count);
   generic_exchange_direction_nonblocking(Q, sizeof(bispinor), ZUP, request, &count);
-
+#endif
 //  computing backward
   for (ix=0;ix<VOLUME;ix++)
   {
@@ -393,7 +449,7 @@ void D_psi_BSM2f(bispinor * const P, bispinor * const Q){
     upm = &g_gauge_field[ix][ZUP];
     padd(rr4, s, upm, HOP_DN, 0.5*phase_3);
   }
-  
+ #if defined TM_USE_MPI 
   MPI_Waitall( count, request, statuses);
 
 //gathering backward
@@ -403,7 +459,7 @@ void D_psi_BSM2f(bispinor * const P, bispinor * const Q){
   generic_exchange_direction_nonblocking(vm3, sizeof(bispinor), XDOWN, request, &count);
   generic_exchange_direction_nonblocking(vm2, sizeof(bispinor), YDOWN, request, &count);
   generic_exchange_direction_nonblocking(vm1, sizeof(bispinor), ZDOWN, request, &count);
-
+#endif
 //computing forward
   for (ix=0;ix<VOLUME;ix++)
   {
@@ -440,9 +496,9 @@ void D_psi_BSM2f(bispinor * const P, bispinor * const Q){
     padd(rr3, spm,  upm, HOP_UP, 0.5*phase_3);
 
   }
-
+#if defined TM_USE_MPI
   MPI_Waitall( count, request, statuses);
-
+#endif
 // join
   for (ix=0; ix<VOLUME; ++ix){
 
@@ -486,12 +542,13 @@ void D_psi_BSM2f(bispinor * const P, bispinor * const Q){
 
 
 //start gathering forward
+#if defined TM_USE_MPI
   count=0;
   generic_exchange_direction_nonblocking(vp1, sizeof(bispinor), TUP, request, &count);
   generic_exchange_direction_nonblocking(vp2, sizeof(bispinor), XUP, request, &count);
   generic_exchange_direction_nonblocking(vp3, sizeof(bispinor), YUP, request, &count);
   generic_exchange_direction_nonblocking(vp4, sizeof(bispinor), ZUP, request, &count);
-
+#endif
 
 //start computing backward
   for (ix=0;ix<VOLUME;ix++)
@@ -546,6 +603,7 @@ void D_psi_BSM2f(bispinor * const P, bispinor * const Q){
     Fadd( rrs3, &stmp2, phi, -0.125*rho_BSM, +1. );
 
   }
+#if defined TM_USE_MPI
   MPI_Waitall( count, request, statuses);
 
 //gathering backward
@@ -554,7 +612,7 @@ void D_psi_BSM2f(bispinor * const P, bispinor * const Q){
   generic_exchange_direction_nonblocking(v2m2, sizeof(bispinor), XDOWN, request, &count);
   generic_exchange_direction_nonblocking(v2m3, sizeof(bispinor), YDOWN, request, &count);
   generic_exchange_direction_nonblocking(v2m4, sizeof(bispinor), ZDOWN, request, &count);
-
+#endif
 //computing forward
   for (ix=0;ix<VOLUME;ix++)
   {
@@ -599,8 +657,9 @@ void D_psi_BSM2f(bispinor * const P, bispinor * const Q){
     Fadd( rr, &stmp2, phip[ZUP], -0.125*rho_BSM, +1. );
   }
 
+#if defined TM_USE_MPI
   MPI_Waitall( count, request, statuses);
-
+#endif
 //join
 
  for (ix=0; ix<VOLUME; ++ix){
@@ -645,9 +704,18 @@ void D_psi_BSM2f(bispinor * const P, bispinor * const Q){
       Fadd(rr, s, phim[mu], 0.125*rho_BSM, +1. );
    }
 
+   // tmpr+=i\gamma_5\tau_1 mu1 *Q
+    if( fabs(mu01_BSM) > 1.e-10 )
+        tm1_add(rr, s, 1);
+    
+   // tmpr+=i\gamma_5\tau_3 mu3 *Q
+    if( fabs(mu03_BSM) > 1.e-10 )
+        tm3_add(rr, s, 1);
 
   } // end volume loop
+#if defined TM_USE_MPI
   free(request);
+#endif
 
 }
 
@@ -687,6 +755,7 @@ void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q){
   scalar phim[4][4];                   // phi_i(x-mu) = phim[mu][i]
   bispinor ALIGN stmp2;
 
+#if defined TM_USE_MPI
   MPI_Status  statuses[8];
   MPI_Request *request;
   request=( MPI_Request *) malloc(sizeof(MPI_Request)*8);
@@ -698,6 +767,7 @@ void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q){
   generic_exchange_direction_nonblocking(Q, sizeof(bispinor), YUP, request, &count);
   generic_exchange_direction_nonblocking(Q, sizeof(bispinor), ZUP, request, &count);
 
+#endif
 //  computing backward
   for (ix=0;ix<VOLUME;ix++)
   {
@@ -731,7 +801,7 @@ void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q){
     upm = &g_gauge_field[ix][ZUP];
     padd(rr4, s, upm, HOP_DN, 0.5*phase_3);
   }
-
+#if defined TM_USE_MPI
   MPI_Waitall( count, request, statuses);
 
 //gathering backward
@@ -741,7 +811,7 @@ void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q){
   generic_exchange_direction_nonblocking(vm3, sizeof(bispinor), XDOWN, request, &count);
   generic_exchange_direction_nonblocking(vm2, sizeof(bispinor), YDOWN, request, &count);
   generic_exchange_direction_nonblocking(vm1, sizeof(bispinor), ZDOWN, request, &count);
-
+#endif
 //computing forward
   for (ix=0;ix<VOLUME;ix++)
   {
@@ -778,9 +848,9 @@ void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q){
     padd(rr3, spm,  upm, HOP_UP, 0.5*phase_3);
 
   }
-
+#if defined TM_USE_MPI
   MPI_Waitall( count, request, statuses);
-
+#endif
 // join
   for (ix=0; ix<VOLUME; ++ix){
 
@@ -828,12 +898,13 @@ void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q){
 
 
 //start gathering forward
+#if defined TM_USE_MPI
   count=0;
   generic_exchange_direction_nonblocking(vp1, sizeof(bispinor), TUP, request, &count);
   generic_exchange_direction_nonblocking(vp2, sizeof(bispinor), XUP, request, &count);
   generic_exchange_direction_nonblocking(vp3, sizeof(bispinor), YUP, request, &count);
   generic_exchange_direction_nonblocking(vp4, sizeof(bispinor), ZUP, request, &count);
-
+#endif
 //start computing backward
   for (ix=0;ix<VOLUME;ix++)
   {
@@ -887,6 +958,7 @@ void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q){
     Fadd( rrs3, &stmp2, phi, -0.125*rho_BSM, -1. );
 
   }
+#if defined TM_USE_MPI
   MPI_Waitall( count, request, statuses);
 
 //gathering backward
@@ -896,7 +968,7 @@ void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q){
   generic_exchange_direction_nonblocking(v2m2, sizeof(bispinor), XDOWN, request, &count);
   generic_exchange_direction_nonblocking(v2m3, sizeof(bispinor), YDOWN, request, &count);
   generic_exchange_direction_nonblocking(v2m4, sizeof(bispinor), ZDOWN, request, &count);
-
+#endif
 //computing forward
   for (ix=0;ix<VOLUME;ix++)
   {
@@ -941,8 +1013,9 @@ void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q){
     Fadd( rr, &stmp2, phip[ZUP], -0.125*rho_BSM, -1. );
   }
 
+#if defined TM_USE_MPI
   MPI_Waitall( count, request, statuses);
-
+#endif
 
 //join
 
@@ -988,16 +1061,27 @@ void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q){
        Fadd(rr, s, phim[mu], 0.125*rho_BSM, -1. );
     }
 
+   // tmpr-=i\gamma_5\tau_1 mu1 *Q  (sign=-1: daggered term)
+    if( fabs(mu01_BSM) > 1.e-10 )
+        tm1_add(rr, s, -1);
+    
+   // tmpr-=i\gamma_5\tau_3 mu3 *Q  (sign=-1: daggered term)
+    if( fabs(mu03_BSM) > 1.e-10 )
+        tm3_add(rr, s, -1);
 
   } // end volume loop
+//  for (ix=0; ix<VOLUME; ++ix){  
+//  }
+#if defined TM_USE_MPI
   free(request);
+#endif
 }
 /* Q2_psi_BSM2f acts on bispinor fields */
 void Q2_psi_BSM2f(bispinor * const P, bispinor * const Q){
 
   /* TODO: the use of [3] has to be changed to avoid future conflicts */
-  D_psi_dagger_BSM2f(g_bispinor_field[3] , Q);
-  D_psi_BSM2f(P, g_bispinor_field[3]);
+  D_psi_dagger_BSM2f(tempor , Q);
+  D_psi_BSM2f(P, tempor);
   // only use these cycles if the m0_BSM parameter is really nonzero...
   if( fabs(m0_BSM) > 1.e-10 ){
     /* Q and P are spinor, not bispinor ==> made a cast */
@@ -1005,3 +1089,4 @@ void Q2_psi_BSM2f(bispinor * const P, bispinor * const Q){
   }
 
 }
+#endif
diff --git a/operator/D_psi_BSM2f.h b/operator/D_psi_BSM2f.h
index df619bc35..dce1b6429 100644
--- a/operator/D_psi_BSM2f.h
+++ b/operator/D_psi_BSM2f.h
@@ -24,6 +24,8 @@
 
 //#include "block.h"
 
+void init_D_psi_BSM2f(void);
+void free_D_psi_BSM2f(void);
 void D_psi_BSM2f(bispinor * const P, bispinor * const Q);
 void D_psi_dagger_BSM2f(bispinor * const P, bispinor * const Q);
 void Q2_psi_BSM2f(bispinor * const P, bispinor * const Q);
diff --git a/operator/D_psi_BSM2m.c b/operator/D_psi_BSM2m.c
index 726c3f2b6..5531b78ab 100644
--- a/operator/D_psi_BSM2m.c
+++ b/operator/D_psi_BSM2m.c
@@ -25,11 +25,11 @@
  * Action of a Dirac operator (Frezzotti-Rossi BSM toy model) on a bispinor field
  *
  *******************************************************************************/
-
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
+#ifdef TM_USE_BSM
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
@@ -37,7 +37,7 @@
 #include "su3.h"
 #include "sse.h"
 #include "boundary.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include "xchange/xchange.h"
 #endif
 #include "update_backward_gauge.h"
@@ -47,12 +47,60 @@
 #include "buffers/utils.h"
 #include "linalg_eo.h"
 
+static inline void tm3_add(bispinor * const out, const bispinor * const in, const double sign)
+{
+  /* Accumulate the tau_3 twisted-mass term: out += sgn * i\gamma_5 \tau_3 mu03_BSM * in.
+   * Only the sign of `sign` is used: a negative value selects the D^dagger form
+   * (-i\gamma_5\tau_3), any other value the D form (+i\gamma_5\tau_3).
+   * The term is flavour diagonal: out_up reads in_up and out_dn reads in_dn. */
+  const double sgn = (sign < 0) ? -1. : +1.;
+
+  /* out_up += sgn * i \gamma_5 mu3 * in_up
+   * (spin components s2,s3 enter with the opposite sign to s0,s1) */
+  _vector_add_i_mul(out->sp_up.s0,  sgn*mu03_BSM, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_up.s1,  sgn*mu03_BSM, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_up.s2, -sgn*mu03_BSM, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_up.s3, -sgn*mu03_BSM, in->sp_up.s3);
+
+  /* out_dn -= sgn * i \gamma_5 mu3 * in_dn
+   * (every sign is flipped relative to the up flavour) */
+  _vector_add_i_mul(out->sp_dn.s0, -sgn*mu03_BSM, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_dn.s1, -sgn*mu03_BSM, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_dn.s2,  sgn*mu03_BSM, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_dn.s3,  sgn*mu03_BSM, in->sp_dn.s3);
+
+}
+static inline void tm1_add(bispinor * const out, const bispinor * const in, const double sign)
+{
+  /* Accumulate the tau_1 twisted-mass term: out += sgn * i\gamma_5 \tau_1 mu01_BSM * in.
+   * Only the sign of `sign` is used: a negative value selects the D^dagger form
+   * (-i\gamma_5\tau_1), any other value the D form (+i\gamma_5\tau_1).
+   * The flavours are crossed: out_up reads in_dn and out_dn reads in_up. */
+  const double sgn = (sign < 0) ? -1. : +1.;
+
+  /* out_up += sgn * i \gamma_5 mu1 * in_dn
+   * (spin components s2,s3 enter with the opposite sign to s0,s1) */
+  _vector_add_i_mul(out->sp_up.s0,  sgn*mu01_BSM, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_up.s1,  sgn*mu01_BSM, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_up.s2, -sgn*mu01_BSM, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_up.s3, -sgn*mu01_BSM, in->sp_dn.s3);
+
+  /* out_dn += sgn * i \gamma_5 mu1 * in_up
+   * (same spin-sign pattern as for the up flavour) */
+  _vector_add_i_mul(out->sp_dn.s0,  sgn*mu01_BSM, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_dn.s1,  sgn*mu01_BSM, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_dn.s2, -sgn*mu01_BSM, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_dn.s3, -sgn*mu01_BSM, in->sp_up.s3);
+
+}
+
+
 void Fadd(bispinor * const out, const bispinor * const in, const scalar * const phi, const double c, const double sign) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static spinor tmp;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
   
@@ -126,11 +174,11 @@ void Fadd(bispinor * const out, const bispinor * const in, const scalar * const
 static inline void bispinor_times_phase_times_u(bispinor * restrict const us, const _Complex double phase,
 						su3 const * restrict const u, bispinor const * restrict const s)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -164,11 +212,11 @@ static inline void bispinor_times_phase_times_u(bispinor * restrict const us, co
 static inline void bispinor_times_phase_times_inverse_u(bispinor * restrict const us, const _Complex double phase,
 							su3 const * restrict const u, bispinor const * restrict const s)
 {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -208,11 +256,11 @@ static inline void bispinor_times_phase_times_inverse_u(bispinor * restrict cons
 static inline void p0add_gamma(bispinor * restrict const tmpr , bispinor const * restrict const s,
                          su3 const * restrict const u, const int inv, const _Complex double phase) {
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static bispinor us;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -241,11 +289,11 @@ static inline void p0add_gamma(bispinor * restrict const tmpr , bispinor const *
 
 static inline void p1add_gamma(bispinor * restrict const tmpr, bispinor const * restrict const s,
                          su3 const * restrict const u, const int inv, const _Complex double phase) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static bispinor us;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -272,11 +320,11 @@ static inline void p1add_gamma(bispinor * restrict const tmpr, bispinor const *
 
 static inline void p2add_gamma(bispinor * restrict const tmpr, bispinor const * restrict const s,
                          su3 const * restrict const u, const int inv, const _Complex double phase) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static bispinor us;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -302,11 +350,11 @@ static inline void p2add_gamma(bispinor * restrict const tmpr, bispinor const *
 
 static inline void p3add_gamma(bispinor * restrict const tmpr, bispinor const * restrict const s,
                          su3 const * restrict const u, const int inv, const _Complex double phase) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static bispinor us;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -335,11 +383,11 @@ static inline void padd_F(bispinor * restrict const tmpr , bispinor const * rest
                          su3 const * restrict const u, const int inv, const _Complex double phase,
                          const double phaseF, const scalar * const phi,
                          const double sign) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static bispinor us;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -371,11 +419,11 @@ void D_psi_BSM2m(bispinor * const P, bispinor * const Q){
     update_backward_gauge(g_gauge_field);
   }
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
   generic_exchange(Q, sizeof(bispinor));
 #endif
         
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -396,7 +444,7 @@ void D_psi_BSM2m(bispinor * const P, bispinor * const Q){
     c_phase_33=conj(phase_33);
 
 /************************ loop over all lattice sites *************************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
     for (ix=0;ix<VOLUME;ix++)
@@ -548,9 +596,16 @@ void D_psi_BSM2m(bispinor * const P, bispinor * const Q){
          _su3d_times_su3d(tempuu,*um,*up);uu=&tempuu;
         padd_F(rr, sm, uu, 0, c_phase_33, -0.125*rho_BSM, phim[3], +1.);
  
-        
+  	 // tmpr+=i\gamma_5\tau_1 mu1 *Q 
+        if( fabs(mu01_BSM) > 1.e-10 )
+            tm1_add(rr, s, 1);
+    
+       // tmpr+=i\gamma_5\tau_3 mu3 *Q 
+        if( fabs(mu03_BSM) > 1.e-10 )
+            tm3_add(rr, s, 1);
+
       }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
@@ -568,11 +623,11 @@ void D_psi_dagger_BSM2m(bispinor * const P, bispinor * const Q){
     update_backward_gauge(g_gauge_field);
   }
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
   generic_exchange(Q, sizeof(bispinor));
 #endif
         
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -595,7 +650,7 @@ void D_psi_dagger_BSM2m(bispinor * const P, bispinor * const Q){
 
     /************************ loop over all lattice sites *************************/
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
     for (ix=0;ix<VOLUME;ix++)
@@ -746,9 +801,18 @@ void D_psi_dagger_BSM2m(bispinor * const P, bispinor * const Q){
         sm = (bispinor *) Q +iy;
         _su3d_times_su3d(tempuu,*um,*up);uu=&tempuu;
         padd_F(rr, sm, uu, 0, c_phase_33, -0.125*rho_BSM, phim[3], -1.);
+        
+       // tmpr-=i\gamma_5\tau_1 mu1 *Q  (sign=-1: daggered term)
+        if( fabs(mu01_BSM) > 1.e-10 )
+            tm1_add(rr, s, -1);
+    
+       // tmpr-=i\gamma_5\tau_3 mu3 *Q  (sign=-1: daggered term)
+        if( fabs(mu03_BSM) > 1.e-10 )
+            tm3_add(rr, s, -1);
 
     }
-#ifdef OMP
+
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
@@ -765,4 +829,4 @@ void Q2_psi_BSM2m(bispinor * const P, bispinor * const Q){
   }
 
 }
-
+#endif
diff --git a/operator/D_psi_BSM3.c b/operator/D_psi_BSM3.c
new file mode 100644
index 000000000..552e923b4
--- /dev/null
+++ b/operator/D_psi_BSM3.c
@@ -0,0 +1,1295 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach,
+ * 2014 Mario Schroeck
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.	If not, see <http://www.gnu.org/licenses/>.
+ *
+ *******************************************************************************/
+
+/*******************************************************************************
+ *
+ * Action of a Dirac operator (Frezzotti-Rossi BSM toy model) on a bispinor field
+ *
+ *******************************************************************************/
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+
+#ifdef TM_USE_BSM
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "sse.h"
+#include "boundary.h"
+#ifdef TM_USE_MPI
+# include "xchange/xchange.h"
+#endif
+#include "update_backward_gauge.h"
+#include "block.h"
+#include "operator/D_psi_BSM.h"
+#include "operator/D_psi_BSM3.h"
+#include "solver/dirac_operator_eigenvectors.h"
+#include "buffers/utils.h"
+#include "linalg_eo.h"
+
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
+
+
+static bispinor *tempor;
+
+void init_D_psi_BSM3(){
+     /* Allocate the zero-filled scratch field `tempor` (VOLUMEPLUSRAND bispinors); free_D_psi_BSM3() releases it. */
+     /* NOTE(review): the calloc result is not checked here -- confirm how allocation failure should be reported. */
+     tempor=(bispinor *)calloc(VOLUMEPLUSRAND,sizeof(bispinor));
+}
+void free_D_psi_BSM3(){ /* release the scratch field `tempor`; safe to call more than once */
+     free(tempor);
+     tempor = NULL; /* no dangling pointer: a repeated call becomes free(NULL), which is a no-op */
+}
+
+static inline void tm3_add(bispinor * const out, const bispinor * const in, const double sign)
+{
+  /* Accumulate the tau_3 twisted-mass term: out += sgn * i\gamma_5 \tau_3 mu03_BSM * in.
+   * Only the sign of `sign` is used: a negative value selects the D^dagger form
+   * (-i\gamma_5\tau_3), any other value the D form (+i\gamma_5\tau_3).
+   * The term is flavour diagonal: out_up reads in_up and out_dn reads in_dn. */
+  const double sgn = (sign < 0) ? -1. : 1.;
+
+  /* out_up += sgn * i \gamma_5 mu3 * in_up
+   * (spin components s2,s3 enter with the opposite sign to s0,s1) */
+  _vector_add_i_mul(out->sp_up.s0,  sgn*mu03_BSM, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_up.s1,  sgn*mu03_BSM, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_up.s2, -sgn*mu03_BSM, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_up.s3, -sgn*mu03_BSM, in->sp_up.s3);
+
+  /* out_dn -= sgn * i \gamma_5 mu3 * in_dn
+   * (every sign is flipped relative to the up flavour) */
+  _vector_add_i_mul(out->sp_dn.s0, -sgn*mu03_BSM, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_dn.s1, -sgn*mu03_BSM, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_dn.s2,  sgn*mu03_BSM, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_dn.s3,  sgn*mu03_BSM, in->sp_dn.s3);
+}
+static inline void tm1_add(bispinor * const out, const bispinor * const in, const double sign)
+{
+  /* Accumulate the tau_1 twisted-mass term: out += sgn * i\gamma_5 \tau_1 mu01_BSM * in.
+   * Only the sign of `sign` is used: a negative value selects the D^dagger form
+   * (-i\gamma_5\tau_1), any other value the D form (+i\gamma_5\tau_1).
+   * The flavours are crossed: out_up reads in_dn and out_dn reads in_up. */
+  const double sgn = (sign < 0) ? -1. : 1.;
+
+  /* out_up += sgn * i \gamma_5 mu1 * in_dn
+   * (spin components s2,s3 enter with the opposite sign to s0,s1) */
+  _vector_add_i_mul(out->sp_up.s0,  sgn*mu01_BSM, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_up.s1,  sgn*mu01_BSM, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_up.s2, -sgn*mu01_BSM, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_up.s3, -sgn*mu01_BSM, in->sp_dn.s3);
+
+  /* out_dn += sgn * i \gamma_5 mu1 * in_up
+   * (same spin-sign pattern as for the up flavour) */
+  _vector_add_i_mul(out->sp_dn.s0,  sgn*mu01_BSM, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_dn.s1,  sgn*mu01_BSM, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_dn.s2, -sgn*mu01_BSM, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_dn.s3, -sgn*mu01_BSM, in->sp_up.s3);
+}
+
+
+/* Fabsadd: out(x) += c * |phi(y)|^2 * in(x)
+ * with |phi(y)|^2 = \phi_0(y)^2 + \phi_1(y)^2 + \phi_2(y)^2 + \phi_3(y)^2.
+ * The operation is site-local; the caller selects the sites through the pointers it passes.
+ * out: bispinor that is accumulated into, out += Fabs*in
+ * in:  the input bispinor at site x
+ * phi: pointer to the four scalars phi[0],...,phi[3] at site y, y = x or x+-\mu
+ * c:   constant double prefactor
+ */
+
+static inline void Fabsadd(bispinor * const out, const bispinor * const in, const scalar * const phi, const double c) {
+  /* one real weight shared by both flavours and all four spin components */
+  const double weight = c*(phi[0]*phi[0]+phi[1]*phi[1]+phi[2]*phi[2]+phi[3]*phi[3]);
+
+  /* up flavour:
+   * out_up += weight * in_up */
+  _vector_add_mul(out->sp_up.s0, weight, in->sp_up.s0);
+  _vector_add_mul(out->sp_up.s1, weight, in->sp_up.s1);
+  _vector_add_mul(out->sp_up.s2, weight, in->sp_up.s2);
+  _vector_add_mul(out->sp_up.s3, weight, in->sp_up.s3);
+
+  /* down flavour: out_dn += weight * in_dn */
+  _vector_add_mul(out->sp_dn.s0, weight, in->sp_dn.s0);
+  _vector_add_mul(out->sp_dn.s1, weight, in->sp_dn.s1);
+  _vector_add_mul(out->sp_dn.s2, weight, in->sp_dn.s2);
+  _vector_add_mul(out->sp_dn.s3, weight, in->sp_dn.s3);
+
+}
+
+
+
+/* operation out(x) += F(y)*in(x)
+ * F(y) := [ \phi_0(y) + i \gamma_5 \tau^j \phi_j(y) ] * c
+ * this operator acts locally on a site x, pass pointers accordingly.
+ * out: the resulting bispinor, out += F*in
+ * in:  the input bispinor at site x
+ * phi: pointer to the four scalars phi0,...,phi3 at site y, y = x or x+-\mu
+ * c:    constant double
+ *
+ * sign = +1 -> Fadd
+ * sign = -1 -> Fbaradd
+ */
+
+static inline void Fadd(bispinor * const out, const bispinor * const in, const scalar * const phi, const double c, const double sign) {
+#ifdef TM_USE_OMP
+#define static
+#endif
+  static spinor tmp; /* scratch spinor; `static` is defined away under TM_USE_OMP, so it is then a per-call automatic */
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* only the sign of `sign` is used: s = -1. when sign < 0, otherwise +1. */
+  const double s = (sign < 0) ? -1. : 1.;
+
+  // flavour 1: assemble tmp from `in`, then accumulate c*tmp into out_up
+  // tmp_up = \phi_0 * in_up
+  _vector_mul(tmp.s0, phi[0], in->sp_up.s0);
+  _vector_mul(tmp.s1, phi[0], in->sp_up.s1);
+  _vector_mul(tmp.s2, phi[0], in->sp_up.s2);
+  _vector_mul(tmp.s3, phi[0], in->sp_up.s3);
+  
+  // tmp_up += s * i \gamma_5 \phi_1 * in_dn   (the sign flip on s2,s3 is the \gamma_5 factor)
+  _vector_add_i_mul(tmp.s0,  s*phi[1], in->sp_dn.s0);
+  _vector_add_i_mul(tmp.s1,  s*phi[1], in->sp_dn.s1);
+  _vector_add_i_mul(tmp.s2, -s*phi[1], in->sp_dn.s2);
+  _vector_add_i_mul(tmp.s3, -s*phi[1], in->sp_dn.s3);
+  
+  // tmp_up += s * \gamma_5 \phi_2 * in_dn   (real coefficient, hence _vector_add_mul)
+  _vector_add_mul(tmp.s0,  s*phi[2], in->sp_dn.s0);
+  _vector_add_mul(tmp.s1,  s*phi[2], in->sp_dn.s1);
+  _vector_add_mul(tmp.s2, -s*phi[2], in->sp_dn.s2);
+  _vector_add_mul(tmp.s3, -s*phi[2], in->sp_dn.s3);
+  
+  // tmp_up += s * i \gamma_5 \phi_3 * in_up
+  _vector_add_i_mul(tmp.s0,  s*phi[3], in->sp_up.s0);
+  _vector_add_i_mul(tmp.s1,  s*phi[3], in->sp_up.s1);
+  _vector_add_i_mul(tmp.s2, -s*phi[3], in->sp_up.s2);
+  _vector_add_i_mul(tmp.s3, -s*phi[3], in->sp_up.s3);
+  
+  // out_up += c * tmp;
+  _vector_add_mul(out->sp_up.s0,c,tmp.s0);
+  _vector_add_mul(out->sp_up.s1,c,tmp.s1);
+  _vector_add_mul(out->sp_up.s2,c,tmp.s2);
+  _vector_add_mul(out->sp_up.s3,c,tmp.s3);
+  
+  // tmp is overwritten from here on and reused for the second flavour
+  // flavour 2: same construction, accumulated into out_dn
+  // tmp_dn = \phi_0 * in_dn
+  _vector_mul(tmp.s0, phi[0], in->sp_dn.s0);
+  _vector_mul(tmp.s1, phi[0], in->sp_dn.s1);
+  _vector_mul(tmp.s2, phi[0], in->sp_dn.s2);
+  _vector_mul(tmp.s3, phi[0], in->sp_dn.s3);
+  
+  // tmp_dn += s * i \gamma_5 \phi_1 * in_up
+  _vector_add_i_mul(tmp.s0,  s*phi[1], in->sp_up.s0);
+  _vector_add_i_mul(tmp.s1,  s*phi[1], in->sp_up.s1);
+  _vector_add_i_mul(tmp.s2, -s*phi[1], in->sp_up.s2);
+  _vector_add_i_mul(tmp.s3, -s*phi[1], in->sp_up.s3);
+  
+  // tmp_dn -= s * \gamma_5 \phi_2 * in_up   (opposite sign to the flavour-1 \phi_2 term)
+  _vector_add_mul(tmp.s0, -s*phi[2], in->sp_up.s0);
+  _vector_add_mul(tmp.s1, -s*phi[2], in->sp_up.s1);
+  _vector_add_mul(tmp.s2,  s*phi[2], in->sp_up.s2);
+  _vector_add_mul(tmp.s3,  s*phi[2], in->sp_up.s3);
+  
+  // tmp_dn -= s * i \gamma_5 \phi_3 * in_dn   (opposite sign to the flavour-1 \phi_3 term)
+  _vector_add_i_mul(tmp.s0, -s*phi[3], in->sp_dn.s0);
+  _vector_add_i_mul(tmp.s1, -s*phi[3], in->sp_dn.s1);
+  _vector_add_i_mul(tmp.s2,  s*phi[3], in->sp_dn.s2);
+  _vector_add_i_mul(tmp.s3,  s*phi[3], in->sp_dn.s3);
+  
+  // out_dn += c * tmp;
+  _vector_add_mul(out->sp_dn.s0,c,tmp.s0);
+  _vector_add_mul(out->sp_dn.s1,c,tmp.s1);
+  _vector_add_mul(out->sp_dn.s2,c,tmp.s2);
+  _vector_add_mul(out->sp_dn.s3,c,tmp.s3);
+}
+
+static inline void bispinor_times_phase_times_u(bispinor * restrict const us, const _Complex double phase,
+                                                su3 const * restrict const u, bispinor const * restrict const s)
+{
+#ifdef TM_USE_OMP
+#define static
+#endif
+  static su3_vector u_chi; /* scratch colour vector; automatic under TM_USE_OMP, static otherwise */
+#ifdef TM_USE_OMP
+#undef static
+#endif
+
+  /* up flavour: each spin component of s goes through _su3_multiply with *u, then is scaled by phase into us */
+  _su3_multiply(u_chi, (*u), s->sp_up.s0);
+  _complex_times_vector(us->sp_up.s0, phase, u_chi);
+
+  _su3_multiply(u_chi, (*u), s->sp_up.s1);
+  _complex_times_vector(us->sp_up.s1, phase, u_chi);
+
+  _su3_multiply(u_chi, (*u), s->sp_up.s2);
+  _complex_times_vector(us->sp_up.s2, phase, u_chi);
+
+  _su3_multiply(u_chi, (*u), s->sp_up.s3);
+  _complex_times_vector(us->sp_up.s3, phase, u_chi);
+
+  /* down flavour: identical treatment */
+  _su3_multiply(u_chi, (*u), s->sp_dn.s0);
+  _complex_times_vector(us->sp_dn.s0, phase, u_chi);
+
+  _su3_multiply(u_chi, (*u), s->sp_dn.s1);
+  _complex_times_vector(us->sp_dn.s1, phase, u_chi);
+
+  _su3_multiply(u_chi, (*u), s->sp_dn.s2);
+  _complex_times_vector(us->sp_dn.s2, phase, u_chi);
+
+  _su3_multiply(u_chi, (*u), s->sp_dn.s3);
+  _complex_times_vector(us->sp_dn.s3, phase, u_chi);
+}
+
+
+static inline void bispinor_times_phase_times_inverse_u(bispinor * restrict const us, const _Complex double phase,
+                                                        su3 const * restrict const u, bispinor const * restrict const s)
+{
+#ifdef TM_USE_OMP
+#define static
+#endif
+  static su3_vector u_chi; /* scratch colour vector; automatic under TM_USE_OMP, static otherwise */
+#ifdef TM_USE_OMP
+#undef static
+#endif
+
+  /* up flavour: _su3_inverse_multiply with *u, then _complexcjg_times_vector (presumably conj(phase)) into us */
+  _su3_inverse_multiply(u_chi, (*u), s->sp_up.s0);
+  _complexcjg_times_vector(us->sp_up.s0, phase, u_chi);
+
+  _su3_inverse_multiply(u_chi, (*u), s->sp_up.s1);
+  _complexcjg_times_vector(us->sp_up.s1, phase, u_chi);
+
+  _su3_inverse_multiply(u_chi, (*u), s->sp_up.s2);
+  _complexcjg_times_vector(us->sp_up.s2, phase, u_chi);
+
+  _su3_inverse_multiply(u_chi, (*u), s->sp_up.s3);
+  _complexcjg_times_vector(us->sp_up.s3, phase, u_chi);
+
+  /* down flavour: identical treatment */
+  _su3_inverse_multiply(u_chi, (*u), s->sp_dn.s0);
+  _complexcjg_times_vector(us->sp_dn.s0, phase, u_chi);
+
+  _su3_inverse_multiply(u_chi, (*u), s->sp_dn.s1);
+  _complexcjg_times_vector(us->sp_dn.s1, phase, u_chi);
+
+  _su3_inverse_multiply(u_chi, (*u), s->sp_dn.s2);
+  _complexcjg_times_vector(us->sp_dn.s2, phase, u_chi);
+
+  _su3_inverse_multiply(u_chi, (*u), s->sp_dn.s3);
+  _complexcjg_times_vector(us->sp_dn.s3, phase, u_chi);
+}
+
+static inline void bispinor_times_real_times_inverse_u(bispinor * restrict const us, const double realnum,
+                                                        su3 const * restrict const u, bispinor const * restrict const s)
+{
+  /* Computes us = realnum * (inverse link) * s for each of the eight colour vectors of the
+   * bispinor (s0..s3 of sp_up and sp_dn); every component of us is overwritten.
+   * Each component is cleared with _vector_null and then accumulated with _vector_add_mul,
+   * i.e. 0 + realnum * chi.  The macros are defined outside this chunk; "inverse" is taken
+   * from the name _su3_inverse_multiply -- confirm against its definition.
+   * chi is automatic scratch (never static): it is rewritten before every read, and
+   * automatic storage keeps the helper reentrant and safe inside OpenMP parallel regions. */
+  su3_vector chi;
+
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s0);
+  _vector_null( us->sp_up.s0 );
+  _vector_add_mul( us->sp_up.s0, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s1);
+  _vector_null( us->sp_up.s1 );
+  _vector_add_mul( us->sp_up.s1, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s2);
+  _vector_null( us->sp_up.s2 );
+  _vector_add_mul( us->sp_up.s2, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s3);
+  _vector_null( us->sp_up.s3 );
+  _vector_add_mul( us->sp_up.s3, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s0);
+  _vector_null( us->sp_dn.s0 );
+  _vector_add_mul( us->sp_dn.s0, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s1);
+  _vector_null( us->sp_dn.s1 );
+  _vector_add_mul( us->sp_dn.s1, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s2);
+  _vector_null( us->sp_dn.s2 );
+  _vector_add_mul( us->sp_dn.s2, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s3);
+  _vector_null( us->sp_dn.s3 );
+  _vector_add_mul( us->sp_dn.s3, realnum, chi );
+
+  return;
+}
+
+
+// padd_chitildebreak: chitilde-breaking hopping contribution of one neighbour site, i.e. the term
+// -rho/4 * F * U * psi:  tmpr += phaseF * F(phi) * us + phaseF * F(phip) * us (via Fadd), with us = phase*u*s.
+// tmpr, s, phi and phip are plain per-site objects (vectors in spinor, colour and flavour space); the caller picks the sites.
+// The callers in this file pass phase = -0.5*phase_mu and phaseF = 0.5*rho_BSM, which together give the factor -1/4.
+static inline void padd_chitildebreak(bispinor * restrict const tmpr , bispinor const * restrict const s,
+                         su3 const * restrict const u, const int inv, const _Complex double phase,
+                         const double phaseF, const scalar * const phi, const scalar * const phip,
+                         const double sign) {
+  // Link-transported neighbour spinor.  Automatic (never static) storage keeps this helper
+  // reentrant and safe inside OpenMP parallel regions; the transport call below overwrites
+  // every component of us before Fadd reads it.
+  bispinor us;
+
+  // us = phase*u*s for inv == 0, otherwise the inverse-link variant of the same product
+  if (inv == 0) {
+    bispinor_times_phase_times_u(&us, phase, u, s);
+  } else {
+    bispinor_times_phase_times_inverse_u(&us, phase, u, s);
+  }
+
+  // FIXME: signs of terms local and non-local in phi should be different
+  // tmpr += F*us, once with the scalar field phi and once with phip; sign is forwarded to Fadd
+  Fadd(tmpr, &us, phi,  phaseF, sign);
+  Fadd(tmpr, &us, phip, phaseF, sign);
+
+  // tmpr += b*us  -- NOTE(review): marker only, no code implements this term here
+
+  return;
+}
+// p0add_wilsonclover: direction-0 hopping contribution of one neighbour site, accumulated into tmpr.
+// Under TM_USE_OMP the keyword `static` is defined away below, so the scratch vectors are automatics there.
+static inline void p0add_wilsonclover( bispinor * restrict const tmpr , bispinor const * restrict const sp,
+                         su3 const * restrict const u, const int inv, const _Complex double phase,
+                         const double sign) {
+#ifdef TM_USE_OMP
+#define static
+#endif
+// FIXME: the sign convention behind sign_gamma is unverified -- check and fix it once and for all.
+  const int sign_gamma = (inv==1) ? -sign : sign ; // double converted to int; negated when inv==1
+  static su3_vector halfwilson1; // spin combination taken from sp->sp_up
+  static su3_vector halfwilson2; // spin combination taken from sp->sp_dn
+  static su3_vector chi;         // scratch for the link multiplication
+  static su3_vector results1;    // link- and phase-multiplied halfwilson1, goes into tmpr->sp_up
+  static su3_vector results2;    // link- and phase-multiplied halfwilson2, goes into tmpr->sp_dn
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  _vector_null( halfwilson1 ); // NOTE(review): looks redundant, both are rewritten in either branch -- confirm
+  _vector_null( halfwilson2 );
+  if(sign_gamma == 1){
+//Branch sign_gamma == 1, first pair of spin components:
+//reduce the four spin components to two by combining s0 and s2 with _vector_add
+//(halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_add(     halfwilson1, sp->sp_up.s0, sp->sp_up.s2);
+    _vector_add(     halfwilson2, sp->sp_dn.s0, sp->sp_dn.s2);
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: the same result is added to both s0 and s2 of each flavour
+    _vector_add_assign( tmpr->sp_up.s0, results1);
+    _vector_add_assign( tmpr->sp_up.s2, results1);
+    _vector_add_assign( tmpr->sp_dn.s0, results2);
+    _vector_add_assign( tmpr->sp_dn.s2, results2);
+
+//Branch sign_gamma == 1, second pair of spin components:
+//reduce the four spin components to two by combining s1 and s3 with _vector_add,
+//then expand by adding the result to both s1 and s3
+    _vector_add(     halfwilson1, sp->sp_up.s1, sp->sp_up.s3 );
+    _vector_add(     halfwilson2, sp->sp_dn.s1, sp->sp_dn.s3 );
+
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+    _vector_add_assign( tmpr->sp_up.s1, results1);
+    _vector_add_assign( tmpr->sp_up.s3, results1);
+    _vector_add_assign( tmpr->sp_dn.s1, results2);
+    _vector_add_assign( tmpr->sp_dn.s3, results2);
+  }//end of if sign_gamma==1
+  else{
+//Branch sign_gamma != 1, first pair of spin components:
+//reduce the four spin components to two by assigning s0 and then
+//subtracting s2 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(     halfwilson1, sp->sp_up.s0);
+    _vector_assign(     halfwilson2, sp->sp_dn.s0);
+    _vector_sub_assign( halfwilson1, sp->sp_up.s2);
+    _vector_sub_assign( halfwilson2, sp->sp_dn.s2);
+//multiply the reduced vectors by the gauge link (inverse-link macros when inv == 1)
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: add the result to s0 and subtract it from s2
+    _vector_add_assign( tmpr->sp_up.s0, results1);
+    _vector_sub_assign( tmpr->sp_up.s2, results1);
+    _vector_add_assign( tmpr->sp_dn.s0, results2);
+    _vector_sub_assign( tmpr->sp_dn.s2, results2);
+
+//Branch sign_gamma != 1, second pair of spin components:
+//reduce the four spin components to two by assigning s1 and then
+//subtracting s3 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(     halfwilson1, sp->sp_up.s1);
+    _vector_assign(     halfwilson2, sp->sp_dn.s1);
+    _vector_sub_assign( halfwilson1, sp->sp_up.s3);
+    _vector_sub_assign( halfwilson2, sp->sp_dn.s3);
+//multiply the reduced vectors by the gauge link (inverse-link macros when inv == 1)
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: add the result to s1 and subtract it from s3
+    _vector_add_assign( tmpr->sp_up.s1, results1);
+    _vector_sub_assign( tmpr->sp_up.s3, results1);
+    _vector_add_assign( tmpr->sp_dn.s1, results2);
+    _vector_sub_assign( tmpr->sp_dn.s3, results2);
+  }
+  return;
+      
+}
+static inline void p1add_wilsonclover( bispinor * restrict const tmpr , bispinor const * restrict const sp,
+                         su3 const * restrict const u, const int inv, const _Complex double phase,
+                         const double sign) { // direction-1 hopping term of one neighbour, added into tmpr
+#ifdef TM_USE_OMP
+#define static
+#endif
+  const int sign_gamma = (inv==1) ? -sign : sign ; // double converted to int; negated when inv==1
+  static su3_vector halfwilson1; // spin combination taken from sp->sp_up
+  static su3_vector halfwilson2; // spin combination taken from sp->sp_dn
+  static su3_vector chi;         // scratch for the link multiplication
+  static su3_vector results1;    // link- and phase-multiplied halfwilson1, goes into tmpr->sp_up
+  static su3_vector results2;    // link- and phase-multiplied halfwilson2, goes into tmpr->sp_dn
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  _vector_null( halfwilson1 ); // NOTE(review): looks redundant, both are rewritten in either branch -- confirm
+  _vector_null( halfwilson2 );
+  if(sign_gamma == 1){
+//Branch sign_gamma == 1, first pair of spin components:
+//reduce the four spin components to two by assigning s0 and then applying
+//_vector_add_i_assign with s3 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(     halfwilson1, sp->sp_up.s0);
+    _vector_assign(     halfwilson2, sp->sp_dn.s0);
+    _vector_add_i_assign( halfwilson1, sp->sp_up.s3);
+    _vector_add_i_assign( halfwilson2, sp->sp_dn.s3);
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: add the result to s0 and apply _vector_sub_i_assign to s3
+    _vector_add_assign(   tmpr->sp_up.s0, results1);
+    _vector_sub_i_assign( tmpr->sp_up.s3, results1);
+    _vector_add_assign(   tmpr->sp_dn.s0, results2);
+    _vector_sub_i_assign( tmpr->sp_dn.s3, results2);
+
+//Branch sign_gamma == 1, second pair of spin components:
+//reduce the four spin components to two by assigning s1 and then applying
+//_vector_add_i_assign with s2; expand with add (s1) and _vector_sub_i_assign (s2)
+    _vector_assign(     halfwilson1, sp->sp_up.s1);
+    _vector_assign(     halfwilson2, sp->sp_dn.s1);
+    _vector_add_i_assign( halfwilson1, sp->sp_up.s2);
+    _vector_add_i_assign( halfwilson2, sp->sp_dn.s2);
+
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+    _vector_add_assign( tmpr->sp_up.s1, results1);
+    _vector_sub_i_assign( tmpr->sp_up.s2, results1);
+    _vector_add_assign( tmpr->sp_dn.s1, results2);
+    _vector_sub_i_assign( tmpr->sp_dn.s2, results2);
+  }//end of if sign_gamma==1
+  else{
+//Branch sign_gamma != 1, first pair of spin components:
+//reduce the four spin components to two by assigning s0 and then applying
+//_vector_sub_i_assign with s3 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(       halfwilson1, sp->sp_up.s0);
+    _vector_assign(       halfwilson2, sp->sp_dn.s0);
+    _vector_sub_i_assign( halfwilson1, sp->sp_up.s3);
+    _vector_sub_i_assign( halfwilson2, sp->sp_dn.s3);
+//multiply the reduced vectors by the gauge link (inverse-link macros when inv == 1)
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: add the result to s0 and apply _vector_add_i_assign to s3
+    _vector_add_assign(   tmpr->sp_up.s0, results1);
+    _vector_add_i_assign( tmpr->sp_up.s3, results1);
+    _vector_add_assign(   tmpr->sp_dn.s0, results2);
+    _vector_add_i_assign( tmpr->sp_dn.s3, results2);
+
+//Branch sign_gamma != 1, second pair of spin components:
+//reduce the four spin components to two by assigning s1 and then applying
+//_vector_sub_i_assign with s2 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(     halfwilson1, sp->sp_up.s1);
+    _vector_assign(     halfwilson2, sp->sp_dn.s1);
+    _vector_sub_i_assign( halfwilson1, sp->sp_up.s2);
+    _vector_sub_i_assign( halfwilson2, sp->sp_dn.s2);
+//multiply the reduced vectors by the gauge link (inverse-link macros when inv == 1)
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: add the result to s1 and apply _vector_add_i_assign to s2
+    _vector_add_assign(   tmpr->sp_up.s1, results1);
+    _vector_add_i_assign( tmpr->sp_up.s2, results1);
+    _vector_add_assign(   tmpr->sp_dn.s1, results2);
+    _vector_add_i_assign( tmpr->sp_dn.s2, results2);
+  }
+// Under TM_USE_OMP the keyword `static` is defined away above, so the scratch vectors are automatics there.
+
+  return;
+
+}
+// p2add_wilsonclover: direction-2 hopping contribution of one neighbour site, accumulated into tmpr.
+// Under TM_USE_OMP the keyword `static` is defined away below, so the scratch vectors are automatics there.
+static inline void p2add_wilsonclover( bispinor * restrict const tmpr , bispinor const * restrict const sp,
+                         su3 const * restrict const u, const int inv, const _Complex double phase,
+                         const double sign) {
+#ifdef TM_USE_OMP
+#define static
+#endif
+  const int sign_gamma = (inv==1) ? -sign : sign ; // double converted to int; negated when inv==1
+  static su3_vector halfwilson1; // spin combination taken from sp->sp_up
+  static su3_vector halfwilson2; // spin combination taken from sp->sp_dn
+  static su3_vector chi;         // scratch for the link multiplication
+  static su3_vector results1;    // link- and phase-multiplied halfwilson1, goes into tmpr->sp_up
+  static su3_vector results2;    // link- and phase-multiplied halfwilson2, goes into tmpr->sp_dn
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  _vector_null( halfwilson1 ); // NOTE(review): looks redundant, both are rewritten in either branch -- confirm
+  _vector_null( halfwilson2 );
+  if(sign_gamma == 1){
+//Branch sign_gamma == 1, first pair of spin components:
+//reduce the four spin components to two by assigning s0 and then
+//adding s3 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(     halfwilson1, sp->sp_up.s0);
+    _vector_assign(     halfwilson2, sp->sp_dn.s0);
+    _vector_add_assign( halfwilson1, sp->sp_up.s3);
+    _vector_add_assign( halfwilson2, sp->sp_dn.s3);
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: the same result is added to both s0 and s3 of each flavour
+    _vector_add_assign(   tmpr->sp_up.s0, results1);
+    _vector_add_assign(   tmpr->sp_up.s3, results1);
+    _vector_add_assign(   tmpr->sp_dn.s0, results2);
+    _vector_add_assign(   tmpr->sp_dn.s3, results2);
+
+//Branch sign_gamma == 1, second pair of spin components:
+//reduce the four spin components to two by assigning s1 and then
+//subtracting s2; expand by adding to s1 and subtracting from s2
+    _vector_assign(     halfwilson1, sp->sp_up.s1);
+    _vector_assign(     halfwilson2, sp->sp_dn.s1);
+    _vector_sub_assign( halfwilson1, sp->sp_up.s2);
+    _vector_sub_assign( halfwilson2, sp->sp_dn.s2);
+
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+    _vector_add_assign( tmpr->sp_up.s1, results1);
+    _vector_sub_assign( tmpr->sp_up.s2, results1);
+    _vector_add_assign( tmpr->sp_dn.s1, results2);
+    _vector_sub_assign( tmpr->sp_dn.s2, results2);
+  }//end of if sign_gamma==1
+  else{
+//Branch sign_gamma != 1, first pair of spin components:
+//reduce the four spin components to two by assigning s0 and then
+//subtracting s3 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(       halfwilson1, sp->sp_up.s0);
+    _vector_assign(       halfwilson2, sp->sp_dn.s0);
+    _vector_sub_assign(   halfwilson1, sp->sp_up.s3);
+    _vector_sub_assign(   halfwilson2, sp->sp_dn.s3);
+//multiply the reduced vectors by the gauge link (inverse-link macros when inv == 1)
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: add the result to s0 and subtract it from s3
+    _vector_add_assign(   tmpr->sp_up.s0, results1);
+    _vector_sub_assign(   tmpr->sp_up.s3, results1);
+    _vector_add_assign(   tmpr->sp_dn.s0, results2);
+    _vector_sub_assign(   tmpr->sp_dn.s3, results2);
+
+//Branch sign_gamma != 1, second pair of spin components:
+//reduce the four spin components to two by assigning s1 and then
+//adding s2 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(     halfwilson1, sp->sp_up.s1);
+    _vector_assign(     halfwilson2, sp->sp_dn.s1);
+    _vector_add_assign( halfwilson1, sp->sp_up.s2);
+    _vector_add_assign( halfwilson2, sp->sp_dn.s2);
+//multiply the reduced vectors by the gauge link (inverse-link macros when inv == 1)
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: the same result is added to both s1 and s2 of each flavour
+    _vector_add_assign(   tmpr->sp_up.s1, results1);
+    _vector_add_assign(   tmpr->sp_up.s2, results1);
+    _vector_add_assign(   tmpr->sp_dn.s1, results2);
+    _vector_add_assign(   tmpr->sp_dn.s2, results2);
+  }
+
+  return;
+
+}
+// p3add_wilsonclover: direction-3 hopping contribution of one neighbour site, accumulated into tmpr.
+// sp is the neighbour bispinor and u the link; inv == 1 selects the inverse-link macros and negates sign_gamma.
+// Under TM_USE_OMP the keyword `static` is defined away below, so the scratch vectors are automatics there.
+static inline void p3add_wilsonclover( bispinor * restrict const tmpr , bispinor const * restrict const sp,
+                         su3 const * restrict const u, const int inv, const _Complex double phase,
+                         const double sign) {
+#ifdef TM_USE_OMP
+#define static
+#endif
+  const int sign_gamma = (inv==1) ? -sign : sign ; // double converted to int; negated when inv==1
+  static su3_vector halfwilson1; // spin combination taken from sp->sp_up
+  static su3_vector halfwilson2; // spin combination taken from sp->sp_dn
+  static su3_vector chi;         // scratch for the link multiplication
+  static su3_vector results1;    // link- and phase-multiplied halfwilson1, goes into tmpr->sp_up
+  static su3_vector results2;    // link- and phase-multiplied halfwilson2, goes into tmpr->sp_dn
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  if(sign_gamma == 1){
+//Branch sign_gamma == 1, first pair of spin components:
+//reduce the four spin components to two by assigning s0 and then applying
+//_vector_add_i_assign with s2 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(       halfwilson1, sp->sp_up.s0);
+    _vector_assign(       halfwilson2, sp->sp_dn.s0);
+    _vector_add_i_assign( halfwilson1, sp->sp_up.s2);
+    _vector_add_i_assign( halfwilson2, sp->sp_dn.s2);
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: add the result to s0 and apply _vector_sub_i_assign to s2
+    _vector_add_assign(     tmpr->sp_up.s0, results1);
+    _vector_add_assign(     tmpr->sp_dn.s0, results2);
+    _vector_sub_i_assign(   tmpr->sp_up.s2, results1);
+    _vector_sub_i_assign(   tmpr->sp_dn.s2, results2);
+
+//Branch sign_gamma == 1, second pair of spin components:
+//reduce the four spin components to two by assigning s1 and then applying
+//_vector_sub_i_assign with s3; expand with add (s1) and _vector_add_i_assign (s3)
+    _vector_assign(       halfwilson1, sp->sp_up.s1);
+    _vector_assign(       halfwilson2, sp->sp_dn.s1);
+    _vector_sub_i_assign( halfwilson1, sp->sp_up.s3);
+    _vector_sub_i_assign( halfwilson2, sp->sp_dn.s3);
+
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+    _vector_add_assign(   tmpr->sp_up.s1, results1);
+    _vector_add_i_assign( tmpr->sp_up.s3, results1);
+    _vector_add_assign(   tmpr->sp_dn.s1, results2);
+    _vector_add_i_assign( tmpr->sp_dn.s3, results2);
+  }//end of if sign_gamma==1
+  else{
+//Branch sign_gamma != 1, first pair of spin components:
+//reduce the four spin components to two by assigning s0 and then applying
+//_vector_sub_i_assign with s2 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(         halfwilson1, sp->sp_up.s0);
+    _vector_assign(         halfwilson2, sp->sp_dn.s0);
+    _vector_sub_i_assign(   halfwilson1, sp->sp_up.s2);
+    _vector_sub_i_assign(   halfwilson2, sp->sp_dn.s2);
+//multiply the reduced vectors by the gauge link (inverse-link macros when inv == 1)
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: add the result to s0 and apply _vector_add_i_assign to s2
+    _vector_add_assign(     tmpr->sp_up.s0, results1);
+    _vector_add_i_assign(   tmpr->sp_up.s2, results1);
+    _vector_add_assign(     tmpr->sp_dn.s0, results2);
+    _vector_add_i_assign(   tmpr->sp_dn.s2, results2);
+
+//Branch sign_gamma != 1, second pair of spin components:
+//reduce the four spin components to two by assigning s1 and then applying
+//_vector_add_i_assign with s3 (halfwilson1 from sp_up, halfwilson2 from sp_dn)
+    _vector_assign(       halfwilson1, sp->sp_up.s1);
+    _vector_assign(       halfwilson2, sp->sp_dn.s1);
+    _vector_add_i_assign( halfwilson1, sp->sp_up.s3);
+    _vector_add_i_assign( halfwilson2, sp->sp_dn.s3);
+//multiply the reduced vectors by the gauge link (inverse-link macros when inv == 1)
+    if(inv == 1){
+      _su3_inverse_multiply(chi, (*u), halfwilson1);
+      _complexcjg_times_vector(results1, phase, chi);
+      _su3_inverse_multiply(chi, (*u), halfwilson2);
+      _complexcjg_times_vector(results2, phase, chi);
+    }
+    else{
+      _su3_multiply(chi, (*u), halfwilson1);
+      _complex_times_vector(results1, phase, chi);
+      _su3_multiply(chi, (*u), halfwilson2);
+      _complex_times_vector(results2, phase, chi);
+    }
+//expand it: add the result to s1 and apply _vector_sub_i_assign to s3
+    _vector_add_assign(       tmpr->sp_up.s1, results1);
+    _vector_sub_i_assign(     tmpr->sp_up.s3, results1);
+    _vector_add_assign(       tmpr->sp_dn.s1, results2);
+    _vector_sub_i_assign(     tmpr->sp_dn.s3, results2);
+  }
+
+  return;
+
+}
+
+/* D_psi_BSM3: writes the operator applied to Q into P, site by site, on
+ * bispinor fields; P and Q must be different fields (the program exits if
+ * they are the same).  Version meant for production: it uses two different
+ * gauge fields, the smeared one (g_smeared_gauge_field) for the fermionic
+ * kinetic term and the unsmeared one (g_gauge_field) for the chitilde
+ * breaking terms.  Q is exchanged first when TM_USE_MPI is defined. */
+void D_psi_BSM3(bispinor * const P, bispinor * const Q){
+  if(P==Q){
+    printf("Error in D_psi_BSM (D_psi_BSM.c):\n");
+    printf("Arguments must be different bispinor fields\n");
+    printf("Program aborted\n");
+    exit(1);
+  }
+#ifdef TM_USE_MPI
+  generic_exchange(Q, sizeof(bispinor));
+#endif
+
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+
+    int ix,iy;                       /* x, x+-\mu */
+    su3 * restrict up,* restrict um; /* U_\mu(x), U_\mu(x-\mu) */
+    bispinor * restrict rr;          /* P(x) */
+    bispinor const * restrict s;     /* Q(x) */
+    bispinor const * restrict sp;    /* Q(x+\mu) */
+    bispinor const * restrict sm;    /* Q(x-\mu) */
+    scalar phi[4];                   /* phi_i(x) */
+    scalar phip[4][4];               /* phi_i(x+mu) = phip[mu][i] */
+    scalar phim[4][4];               /* phi_i(x-mu) = phim[mu][i] */
+    /* rho1,2 = 1 +- i*mu03_BSM: local factors, only used when csw_BSM <= 0 */
+    _Complex double rho1, rho2;
+    rho1 = (1. +  mu03_BSM * I);
+    rho2 = conj(rho1);
+
+
+
+
+    /************************ loop over all lattice sites *************************/
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+    for (ix=0;ix<VOLUME;ix++)
+      {
+        rr = P + ix;
+        s  = Q + ix;
+
+        /* prefetch scalar fields at x and at the eight neighbours x+-mu */
+        phi[0] = g_scalar_field[0][ix];
+        phi[1] = g_scalar_field[1][ix];
+        phi[2] = g_scalar_field[2][ix];
+        phi[3] = g_scalar_field[3][ix];
+
+        for( int mu=0; mu<4; mu++ )
+          {
+            phip[mu][0] = g_scalar_field[0][g_iup[ix][mu]];
+            phip[mu][1] = g_scalar_field[1][g_iup[ix][mu]];
+            phip[mu][2] = g_scalar_field[2][g_iup[ix][mu]];
+            phip[mu][3] = g_scalar_field[3][g_iup[ix][mu]];
+
+            phim[mu][0] = g_scalar_field[0][g_idn[ix][mu]];
+            phim[mu][1] = g_scalar_field[1][g_idn[ix][mu]];
+            phim[mu][2] = g_scalar_field[2][g_idn[ix][mu]];
+            phim[mu][3] = g_scalar_field[3][g_idn[ix][mu]];
+          }
+
+
+        /* the local part (not local in phi) */
+
+        _spinor_null(rr->sp_up);
+        _spinor_null(rr->sp_dn);
+
+        // FIXME check normalisation of twisted mass and csw terms
+        // Note that here what is implemented is i*1/2*c_sw*sigma_mu,nu F_mu,nu
+        // thus we need to apply an additional factor of 1/2 when preparing the
+        // clover term; this is done when we initialize the clover term in
+        // operator.c for executable invert and in contractions_BSM.c for exec
+        // contractions_BSM
+        if(csw_BSM > 0) {
+         (assign_mul_one_sw_pm_imu_site_lexic)(ix, &(rr->sp_up), &(s->sp_up), +mu03_BSM);
+         (assign_mul_one_sw_pm_imu_site_lexic)(ix, &(rr->sp_dn), &(s->sp_dn), -mu03_BSM);
+        }
+        else {
+          _complex_times_vector(rr->sp_up.s0, rho1, s->sp_up.s0);
+          _complex_times_vector(rr->sp_up.s1, rho1, s->sp_up.s1);
+          _complex_times_vector(rr->sp_up.s2, rho2, s->sp_up.s2);
+          _complex_times_vector(rr->sp_up.s3, rho2, s->sp_up.s3);
+
+          _complex_times_vector(rr->sp_dn.s0, rho2, s->sp_dn.s0);
+          _complex_times_vector(rr->sp_dn.s1, rho2, s->sp_dn.s1);
+          _complex_times_vector(rr->sp_dn.s2, rho1, s->sp_dn.s2);
+          _complex_times_vector(rr->sp_dn.s3, rho1, s->sp_dn.s3);
+
+        }
+
+        // FIXME split r-term into two pieces (perhaps), apply 3.0r here, 1r via assign_mul_one_sw_pm_imu_site_lexic
+        // because that is (1+icsw/2*sum_mu>nu sigma_munu F_munu )
+        // tmpr += (3.0*r_BSM+m0_BSM)*s
+        // This part takes care of the constant factor coming from the
+        // second derivative in the wilson term 4r0 delta_xy
+        // + the additional wilson bare mass m_0
+        // Here we already took a factor of 1. into account in the local
+        // term above, thus we only have to add a factor of 3.0
+        /* tmpr += (3.0*r_BSM+m0_BSM)*s  -- NOTE: the literal 3 below hard-wires r_BSM = 1 */
+        _vector_add_mul(rr->sp_up.s0, (3+m0_BSM), s->sp_up.s0);
+        _vector_add_mul(rr->sp_up.s1, (3+m0_BSM), s->sp_up.s1);
+        _vector_add_mul(rr->sp_up.s2, (3+m0_BSM), s->sp_up.s2);
+        _vector_add_mul(rr->sp_up.s3, (3+m0_BSM), s->sp_up.s3);
+
+        _vector_add_mul(rr->sp_dn.s0, (3+m0_BSM), s->sp_dn.s0);
+        _vector_add_mul(rr->sp_dn.s1, (3+m0_BSM), s->sp_dn.s1);
+        _vector_add_mul(rr->sp_dn.s2, (3+m0_BSM), s->sp_dn.s2);
+        _vector_add_mul(rr->sp_dn.s3, (3+m0_BSM), s->sp_dn.s3);
+
+
+        /* tmpr += (\eta_BSM+2*\rho_BSM) * F(x)*Q(x) */
+        Fadd(rr, s, phi, eta_BSM+2.0*rho_BSM, +1.);
+
+        /* tmpr += \sum_\mu (\rho_BSM/4) * F(x+-\mu)*Q */
+        for( int mu=0; mu<4; mu++ ) {
+          Fadd(rr, s, phip[mu], 0.25*rho_BSM, +1.);
+          Fadd(rr, s, phim[mu], 0.25*rho_BSM, +1.);
+        }
+        Fabsadd(rr,s,phi,c5phi_BSM);
+
+        /* tmpr+=i\gamma_5\tau_1 mu0 *Q */
+        if( fabs(mu01_BSM) > 1.e-10 ) {
+          tm1_add(rr, s, 1);
+        }
+        /* No separate i\gamma_5\tau_3 mu0 term is added at this point:
+         * mu03_BSM already entered the local part above, through rho1/rho2
+         * or through assign_mul_one_sw_pm_imu_site_lexic. */
+
+        /* the hopping part:
+         * tmpr += +1/2 \sum_\mu (1-gamma_\mu - \rho_BSM/2*F(x) - \rho_BSM/2*F(x+-\mu))*U_{+-\mu}(x)*Q(x+-\mu)
+         ******************************* direction +0 *********************************/
+        iy=g_iup[ix][0];
+        sp = Q + iy;
+        up=&g_smeared_gauge_field[ix][0];
+        p0add_wilsonclover(rr, sp, up, 0, -0.5*phase_0, 1);
+        up=&g_gauge_field[ix][0];
+        padd_chitildebreak(rr, sp, up, 0, -0.5*phase_0, 0.5*rho_BSM, phi, phip[0], +1.);
+
+        /******************************* direction -0 *********************************/
+        iy=g_idn[ix][0];
+        sm = Q + iy;
+        um=&g_smeared_gauge_field[iy][0];
+        p0add_wilsonclover(rr, sm, um, 1, -0.5*phase_0, 1);
+        um=&g_gauge_field[iy][0];
+        padd_chitildebreak(rr, sm, um, 1, -0.5*phase_0, 0.5*rho_BSM, phi, phim[0], +1.);
+
+        /******************************* direction +1 *********************************/
+        iy=g_iup[ix][1];
+        sp = Q + iy;
+        up=&g_smeared_gauge_field[ix][1];
+        p1add_wilsonclover(rr, sp, up, 0, -0.5*phase_1, 1);
+        up=&g_gauge_field[ix][1];
+        padd_chitildebreak(rr, sp, up, 0, -0.5*phase_1, 0.5*rho_BSM, phi, phip[1], +1.);
+
+
+        /******************************* direction -1 *********************************/
+        iy=g_idn[ix][1];
+        sm = Q + iy;
+        um=&g_smeared_gauge_field[iy][1];
+        p1add_wilsonclover(rr, sm, um, 1, -0.5*phase_1, 1);
+        um=&g_gauge_field[iy][1];
+        padd_chitildebreak(rr, sm, um, 1, -0.5*phase_1, 0.5*rho_BSM, phi, phim[1], +1.);
+
+
+        /******************************* direction +2 *********************************/
+        iy=g_iup[ix][2];
+        sp = Q + iy;
+        up=&g_smeared_gauge_field[ix][2];
+        p2add_wilsonclover(rr, sp, up, 0, -0.5*phase_2, 1);
+        up=&g_gauge_field[ix][2];
+        padd_chitildebreak(rr, sp, up, 0, -0.5*phase_2, 0.5*rho_BSM, phi, phip[2], +1.);
+
+
+        /******************************* direction -2 *********************************/
+        iy=g_idn[ix][2];
+        sm = Q + iy;
+        um=&g_smeared_gauge_field[iy][2];
+        p2add_wilsonclover(rr, sm, um, 1, -0.5*phase_2, 1);
+        um=&g_gauge_field[iy][2];
+        padd_chitildebreak(rr, sm, um, 1, -0.5*phase_2, 0.5*rho_BSM, phi, phim[2], +1.);
+
+        /******************************* direction +3 *********************************/
+        iy=g_iup[ix][3];
+        sp = Q + iy;
+        up=&g_smeared_gauge_field[ix][3];
+        p3add_wilsonclover(rr, sp, up, 0, -0.5*phase_3, 1);
+        up=&g_gauge_field[ix][3];
+        padd_chitildebreak(rr, sp, up, 0, -0.5*phase_3, 0.5*rho_BSM, phi, phip[3], +1.);
+
+        /******************************* direction -3 *********************************/
+        iy=g_idn[ix][3];
+        sm = Q + iy;
+        um=&g_smeared_gauge_field[iy][3];
+        p3add_wilsonclover(rr, sm, um, 1, -0.5*phase_3, 1);
+        um=&g_gauge_field[iy][3];
+        padd_chitildebreak(rr, sm, um, 1, -0.5*phase_3, 0.5*rho_BSM, phi, phim[3], +1.);
+      }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+}
+
+
+
+
+/* D_psi_dagger_BSM3: fills P from Q on bispinor fields, calling every signed helper with sign -1 */
+void D_psi_dagger_BSM3(bispinor * const P, bispinor * const Q){
+  if(P==Q){
+    printf("Error in %s (%s):\n", __func__, __FILE__);
+    printf("Arguments must be different bispinor fields\n");
+    printf("Program aborted\n");
+    exit(1);
+  }
+#ifdef TM_USE_MPI
+  generic_exchange(Q, sizeof(bispinor));
+#endif
+
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+
+    int ix,iy;                       // x, x+-\mu
+    su3 * restrict up,* restrict um; // U_\mu(x), U_\mu(x-\mu)
+    bispinor * restrict rr;          // P(x)
+    bispinor const * restrict s;     // Q(x)
+    bispinor const * restrict sp;    // Q(x+\mu)
+    bispinor const * restrict sm;    // Q(x-\mu)
+    scalar phi[4];                   // phi_i(x)
+    scalar phip[4][4];               // phi_i(x+mu) = phip[mu][i]
+    scalar phim[4][4];               // phi_i(x-mu) = phim[mu][i]
+    // factors of the local term when csw_BSM <= 0: rho1 = 1 + i*mu03_BSM, rho2 = conj(rho1)
+    const _Complex double rho1 = ( 1. +  mu03_BSM * I);
+    const _Complex double rho2 = conj(rho1);
+
+
+    /************************ loop over all lattice sites *************************/
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+    for (ix = 0; ix < VOLUME; ix++) {
+      rr = (bispinor *) P + ix;
+      s  = (bispinor *) Q + ix;
+
+      // prefetch the scalar fields at x and at its eight neighbours x+-\mu
+      phi[0] = g_scalar_field[0][ix];
+      phi[1] = g_scalar_field[1][ix];
+      phi[2] = g_scalar_field[2][ix];
+      phi[3] = g_scalar_field[3][ix];
+
+      for(int mu = 0; mu < 4; mu++ ) {
+        phip[mu][0] = g_scalar_field[0][g_iup[ix][mu]];
+        phip[mu][1] = g_scalar_field[1][g_iup[ix][mu]];
+        phip[mu][2] = g_scalar_field[2][g_iup[ix][mu]];
+        phip[mu][3] = g_scalar_field[3][g_iup[ix][mu]];
+
+        phim[mu][0] = g_scalar_field[0][g_idn[ix][mu]];
+        phim[mu][1] = g_scalar_field[1][g_idn[ix][mu]];
+        phim[mu][2] = g_scalar_field[2][g_idn[ix][mu]];
+        phim[mu][3] = g_scalar_field[3][g_idn[ix][mu]];
+      }
+
+      // the local part (not local in phi)
+
+      _spinor_null(rr->sp_up);
+      _spinor_null(rr->sp_dn);
+
+      if(csw_BSM > 0) {
+        (assign_mul_one_sw_pm_imu_site_lexic)(ix, &(rr->sp_up), &(s->sp_up), -mu03_BSM);
+        (assign_mul_one_sw_pm_imu_site_lexic)(ix, &(rr->sp_dn), &(s->sp_dn), +mu03_BSM);
+      }
+      else {
+        _complex_times_vector(rr->sp_up.s0, rho2, s->sp_up.s0);
+        _complex_times_vector(rr->sp_up.s1, rho2, s->sp_up.s1);
+        _complex_times_vector(rr->sp_up.s2, rho1, s->sp_up.s2);
+        _complex_times_vector(rr->sp_up.s3, rho1, s->sp_up.s3);
+
+        _complex_times_vector(rr->sp_dn.s0, rho1, s->sp_dn.s0);
+        _complex_times_vector(rr->sp_dn.s1, rho1, s->sp_dn.s1);
+        _complex_times_vector(rr->sp_dn.s2, rho2, s->sp_dn.s2);
+        _complex_times_vector(rr->sp_dn.s3, rho2, s->sp_dn.s3);
+
+      }
+
+
+      // tmpr += (3+m0_BSM)*s
+      // This is the constant diagonal contribution that accompanies the
+      // hopping terms (second derivative in the Wilson term, 4*r0*delta_xy)
+      // plus the additional Wilson bare mass m0_BSM.
+      // A factor of 1 is already contained in the local term above,
+      // so only a factor of 3 remains to be added here.
+      _vector_add_mul(rr->sp_up.s0, (3+m0_BSM), s->sp_up.s0);
+      _vector_add_mul(rr->sp_up.s1, (3+m0_BSM), s->sp_up.s1);
+      _vector_add_mul(rr->sp_up.s2, (3+m0_BSM), s->sp_up.s2);
+      _vector_add_mul(rr->sp_up.s3, (3+m0_BSM), s->sp_up.s3);
+
+      _vector_add_mul(rr->sp_dn.s0, (3+m0_BSM), s->sp_dn.s0);
+      _vector_add_mul(rr->sp_dn.s1, (3+m0_BSM), s->sp_dn.s1);
+      _vector_add_mul(rr->sp_dn.s2, (3+m0_BSM), s->sp_dn.s2);
+      _vector_add_mul(rr->sp_dn.s3, (3+m0_BSM), s->sp_dn.s3);
+
+      // tmpr += (\eta_BSM+2*\rho_BSM) * Fbar(x)*Q(x)
+      Fadd(rr, s, phi, eta_BSM+2.0*rho_BSM, -1.);
+
+      // tmpr += \sum_\mu (\rho_BSM/4) * Fbar(x+-\mu)*Q(x)
+      for(int mu = 0; mu < 4; mu++) {
+        Fadd(rr, s, phip[mu], 0.25*rho_BSM, -1.);
+        Fadd(rr, s, phim[mu], 0.25*rho_BSM, -1.);
+      }
+
+      // tmpr += c5phi_BSM \Phi^\dagger\Phi Q
+      Fabsadd(rr,s,phi,c5phi_BSM);
+
+      // tm1_add with sign -1, skipped when mu01_BSM is numerically zero
+      if( fabs(mu01_BSM) > 1.e-10 ) {
+        tm1_add(rr, s, -1);
+      }
+      // the i\gamma_5\tau_3 mu03_BSM term is not added separately here:
+      // mu03_BSM already enters the local term above (rho1/rho2, or the
+      // -+mu03_BSM argument of assign_mul_one_sw_pm_imu_site_lexic)
+
+      // the hopping part, for each direction +-mu: one p<mu>add_wilsonclover call on the
+      // g_smeared_gauge_field link, then padd_chitildebreak on the g_gauge_field link
+      /******************************* direction +0 *********************************/
+      iy=g_iup[ix][0];
+      sp = (bispinor *) Q +iy;
+      up=&g_smeared_gauge_field[ix][0];
+      p0add_wilsonclover(rr, sp, up, 0, -0.5*phase_0, -1);
+      up=&g_gauge_field[ix][0];
+      padd_chitildebreak(rr, sp, up, 0, -0.5*phase_0, 0.5*rho_BSM, phi, phip[0], -1.);
+
+      /******************************* direction -0 *********************************/
+      iy=g_idn[ix][0];
+      sm = (bispinor *) Q +iy;
+      um=&g_smeared_gauge_field[iy][0];
+      p0add_wilsonclover(rr, sm, um, 1, -0.5*phase_0, -1);
+      um=&g_gauge_field[iy][0];
+      padd_chitildebreak(rr, sm, um, 1, -0.5*phase_0, 0.5*rho_BSM, phi, phim[0], -1.);
+      /******************************* direction +1 *********************************/
+      iy=g_iup[ix][1];
+      sp = (bispinor *) Q +iy;
+      up=&g_smeared_gauge_field[ix][1];
+      p1add_wilsonclover(rr, sp, up, 0, -0.5*phase_1, -1);
+      up=&g_gauge_field[ix][1];
+      padd_chitildebreak(rr, sp, up, 0, -0.5*phase_1, 0.5*rho_BSM, phi, phip[1], -1.);
+
+      /******************************* direction -1 *********************************/
+      iy=g_idn[ix][1];
+      sm = (bispinor *) Q +iy;
+      um=&g_smeared_gauge_field[iy][1];
+      p1add_wilsonclover(rr, sm, um, 1, -0.5*phase_1, -1);
+      um=&g_gauge_field[iy][1];
+      padd_chitildebreak(rr, sm, um, 1, -0.5*phase_1, 0.5*rho_BSM, phi, phim[1], -1.);
+
+      /******************************* direction +2 *********************************/
+      iy=g_iup[ix][2];
+      sp = (bispinor *) Q +iy;
+      up=&g_smeared_gauge_field[ix][2];
+      p2add_wilsonclover(rr, sp, up, 0, -0.5*phase_2, -1);
+      up=&g_gauge_field[ix][2];
+      padd_chitildebreak(rr, sp, up, 0, -0.5*phase_2, 0.5*rho_BSM, phi, phip[2], -1.);
+
+      /******************************* direction -2 *********************************/
+      iy=g_idn[ix][2];
+      sm = (bispinor *) Q +iy;
+      um=&g_smeared_gauge_field[iy][2];
+      p2add_wilsonclover(rr, sm, um, 1, -0.5*phase_2, -1);
+      um=&g_gauge_field[iy][2];
+      padd_chitildebreak(rr, sm, um, 1, -0.5*phase_2, 0.5*rho_BSM, phi, phim[2], -1.);
+
+      /******************************* direction +3 *********************************/
+      iy=g_iup[ix][3];
+      sp = (bispinor *) Q +iy;
+      up=&g_smeared_gauge_field[ix][3];
+      p3add_wilsonclover(rr, sp, up, 0, -0.5*phase_3, -1);
+      up=&g_gauge_field[ix][3];
+      padd_chitildebreak(rr, sp, up, 0, -0.5*phase_3, 0.5*rho_BSM, phi, phip[3], -1.);
+
+      /******************************* direction -3 *********************************/
+      iy=g_idn[ix][3];
+      sm = (bispinor *) Q +iy;
+      um=&g_smeared_gauge_field[iy][3];
+      p3add_wilsonclover(rr, sm, um, 1, -0.5*phase_3, -1);
+      um=&g_gauge_field[iy][3];
+      padd_chitildebreak(rr, sm, um, 1, -0.5*phase_3, 0.5*rho_BSM, phi, phim[3], -1.);
+    }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+}
+
+/* Q2_psi_BSM3: P = D_psi_BSM3( D_psi_dagger_BSM3( Q ) ), acting on bispinor fields */
+void Q2_psi_BSM3(bispinor * const P, bispinor * const Q){
+
+  D_psi_dagger_BSM3(tempor , Q);
+  D_psi_BSM3(P, tempor);
+  /* tempor is a work field declared outside this function; it holds the intermediate result */
+  /* D_psi_dagger_BSM3 aborts when its two arguments coincide, so Q must not be tempor */
+
+}
+#endif
diff --git a/operator/D_psi_BSM3.h b/operator/D_psi_BSM3.h
new file mode 100644
index 000000000..65e9099fd
--- /dev/null
+++ b/operator/D_psi_BSM3.h
@@ -0,0 +1,34 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _D_PSI_BSM3_H
+#define _D_PSI_BSM3_H
+
+/* NOTE(review): nothing is included here, so bispinor must be declared by the including file */
+void init_D_psi_BSM3(void);
+void free_D_psi_BSM3(void);
+/* Operators on bispinor fields: P receives the result, Q is the input field */
+void D_psi_BSM3_test(bispinor * const P, bispinor * const Q); /* also declared in D_psi_BSM3_test.h */
+void D_psi_BSM3(bispinor * const P, bispinor * const Q);
+void D_psi_dagger_BSM3(bispinor * const P, bispinor * const Q);
+void Q2_psi_BSM3(bispinor * const P, bispinor * const Q); /* D_psi_dagger_BSM3 followed by D_psi_BSM3 */
+
+#endif
diff --git a/operator/D_psi_BSM3_test.c b/operator/D_psi_BSM3_test.c
new file mode 100644
index 000000000..fe4a5e307
--- /dev/null
+++ b/operator/D_psi_BSM3_test.c
@@ -0,0 +1,758 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach,
+ * 2014 Mario Schroeck
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.   If not, see <http://www.gnu.org/licenses/>.
+ *
+ *******************************************************************************/
+
+/*******************************************************************************
+ *
+ * Action of a Dirac operator (Frezzotti-Rossi BSM toy model) on a bispinor field
+ *
+ *******************************************************************************/
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+
+#ifdef TM_USE_BSM
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "sse.h"
+#include "boundary.h"
+#ifdef TM_USE_MPI
+# include "xchange/xchange.h"
+#endif
+#include "update_backward_gauge.h"
+#include "block.h"
+#include "operator/D_psi_BSM.h"
+#include "operator/D_psi_BSM3_test.h"
+#include "solver/dirac_operator_eigenvectors.h"
+#include "buffers/utils.h"
+#include "linalg_eo.h"
+
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
+
+
+/* us = phase * (u applied through _su3_multiply) * s, component by component.
+ *   us    : result bispinor; all eight su3 vectors (sp_up.s0..s3, sp_dn.s0..s3) are overwritten
+ *   phase : complex factor multiplying every component
+ *   u     : su3 link matrix
+ *   s     : input bispinor (only read) */
+static inline void bispinor_times_phase_times_u(bispinor * restrict const us, const _Complex double phase,
+                                                su3 const * restrict const u, bispinor const * restrict const s)
+{
+  /* scratch for u*component: automatic storage, always assigned by _su3_multiply before it is read */
+  su3_vector chi;
+
+  /* flavour sp_up */
+  _su3_multiply(chi, (*u), s->sp_up.s0);
+  _complex_times_vector(us->sp_up.s0, phase, chi);
+
+  _su3_multiply(chi, (*u), s->sp_up.s1);
+  _complex_times_vector(us->sp_up.s1, phase, chi);
+
+  _su3_multiply(chi, (*u), s->sp_up.s2);
+  _complex_times_vector(us->sp_up.s2, phase, chi);
+
+  _su3_multiply(chi, (*u), s->sp_up.s3);
+  _complex_times_vector(us->sp_up.s3, phase, chi);
+
+  /* flavour sp_dn */
+  _su3_multiply(chi, (*u), s->sp_dn.s0);
+  _complex_times_vector(us->sp_dn.s0, phase, chi);
+
+  _su3_multiply(chi, (*u), s->sp_dn.s1);
+  _complex_times_vector(us->sp_dn.s1, phase, chi);
+
+  _su3_multiply(chi, (*u), s->sp_dn.s2);
+  _complex_times_vector(us->sp_dn.s2, phase, chi);
+
+  _su3_multiply(chi, (*u), s->sp_dn.s3);
+  _complex_times_vector(us->sp_dn.s3, phase, chi);
+}
+
+
+
+
+/* Counterpart of bispinor_times_phase_times_u built from _su3_inverse_multiply and
+ * _complexcjg_times_vector (going by the macro names: inverse link, conjugated phase).
+ *   us    : result bispinor; all eight su3 vectors (sp_up.s0..s3, sp_dn.s0..s3) are overwritten
+ *   phase : complex factor handed to _complexcjg_times_vector for every component
+ *   u     : su3 link matrix;  s : input bispinor (only read) */
+static inline void bispinor_times_phase_times_inverse_u(bispinor * restrict const us, const _Complex double phase,
+                                                        su3 const * restrict const u, bispinor const * restrict const s)
+{
+  /* scratch: automatic storage, always assigned by _su3_inverse_multiply before it is read */
+  su3_vector chi;
+
+  /* flavour sp_up */
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s0);
+  _complexcjg_times_vector(us->sp_up.s0, phase, chi);
+
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s1);
+  _complexcjg_times_vector(us->sp_up.s1, phase, chi);
+
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s2);
+  _complexcjg_times_vector(us->sp_up.s2, phase, chi);
+
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s3);
+  _complexcjg_times_vector(us->sp_up.s3, phase, chi);
+
+  /* flavour sp_dn */
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s0);
+  _complexcjg_times_vector(us->sp_dn.s0, phase, chi);
+
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s1);
+  _complexcjg_times_vector(us->sp_dn.s1, phase, chi);
+
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s2);
+  _complexcjg_times_vector(us->sp_dn.s2, phase, chi);
+
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s3);
+  _complexcjg_times_vector(us->sp_dn.s3, phase, chi);
+}
+
+/* us = realnum * (u applied through _su3_inverse_multiply) * s, component by component.
+ *   us      : result bispinor; all eight su3 vectors are zeroed and then accumulated into
+ *   realnum : real factor multiplying every component
+ *   u       : su3 link matrix
+ *   s       : input bispinor (only read)
+ */
+static inline void bispinor_times_real_times_inverse_u(bispinor * restrict const us, const double realnum,
+                                                        su3 const * restrict const u, bispinor const * restrict const s)
+{
+  /* scratch: automatic storage, always assigned by _su3_inverse_multiply before it is read */
+  su3_vector chi;
+
+  /* flavour sp_up */
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s0);
+  _vector_null( us->sp_up.s0 );
+  _vector_add_mul( us->sp_up.s0, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s1);
+  _vector_null( us->sp_up.s1 );
+  _vector_add_mul( us->sp_up.s1, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s2);
+  _vector_null( us->sp_up.s2 );
+  _vector_add_mul( us->sp_up.s2, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_up.s3);
+  _vector_null( us->sp_up.s3 );
+  _vector_add_mul( us->sp_up.s3, realnum, chi );
+
+  /* flavour sp_dn */
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s0);
+  _vector_null( us->sp_dn.s0 );
+  _vector_add_mul( us->sp_dn.s0, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s1);
+  _vector_null( us->sp_dn.s1 );
+  _vector_add_mul( us->sp_dn.s1, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s2);
+  _vector_null( us->sp_dn.s2 );
+  _vector_add_mul( us->sp_dn.s2, realnum, chi );
+
+  _su3_inverse_multiply(chi, (*u), s->sp_dn.s3);
+  _vector_null( us->sp_dn.s3 );
+  _vector_add_mul( us->sp_dn.s3, realnum, chi );
+}
+
+
+
+
+/* out += s * i \gamma_5 \tau_3 mu03_BSM * in, where s = -1 if sign < 0 and s = +1 otherwise
+ * (sign > 0: the D + i\gamma_5\tau_3 term, sign < 0: the D_dag - i\gamma_5\tau_3 term).
+ * The sp_up flavour gets +s and the sp_dn flavour -s; within a flavour the coefficient
+ * flips sign between spin components s0,s1 and s2,s3. */
+static inline void tm3_add(bispinor * const out, const bispinor * const in, const double sign)
+{
+  /* signed coefficient s*mu03_BSM; only the sign of the argument is used */
+  const double coeff = (sign < 0) ? -mu03_BSM : mu03_BSM;
+
+  /* out_up += s * i \gamma_5 mu03_BSM * in_up */
+  _vector_add_i_mul(out->sp_up.s0,  coeff, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_up.s1,  coeff, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_up.s2, -coeff, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_up.s3, -coeff, in->sp_up.s3);
+
+  /* out_dn -= s * i \gamma_5 mu03_BSM * in_dn */
+  _vector_add_i_mul(out->sp_dn.s0, -coeff, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_dn.s1, -coeff, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_dn.s2,  coeff, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_dn.s3,  coeff, in->sp_dn.s3);
+
+}
+/* out += s * i \gamma_5 \tau_1 mu01_BSM * in, where s = -1 if sign < 0 and s = +1 otherwise
+ * (sign > 0: the D + i\gamma_5\tau_1 term, sign < 0: the D_dag - i\gamma_5\tau_1 term).
+ * The two flavours are exchanged: out_up is fed from in_dn and out_dn from in_up;
+ * the coefficient flips sign between spin components s0,s1 and s2,s3. */
+static inline void tm1_add(bispinor * const out, const bispinor * const in, const double sign)
+{
+  /* signed coefficient s*mu01_BSM; only the sign of the argument is used */
+  const double coeff = (sign < 0) ? -mu01_BSM : mu01_BSM;
+
+  /* out_up += s * i \gamma_5 mu01_BSM * in_dn */
+  _vector_add_i_mul(out->sp_up.s0,  coeff, in->sp_dn.s0);
+  _vector_add_i_mul(out->sp_up.s1,  coeff, in->sp_dn.s1);
+  _vector_add_i_mul(out->sp_up.s2, -coeff, in->sp_dn.s2);
+  _vector_add_i_mul(out->sp_up.s3, -coeff, in->sp_dn.s3);
+
+  /* out_dn += s * i \gamma_5 mu01_BSM * in_up */
+  _vector_add_i_mul(out->sp_dn.s0,  coeff, in->sp_up.s0);
+  _vector_add_i_mul(out->sp_dn.s1,  coeff, in->sp_up.s1);
+  _vector_add_i_mul(out->sp_dn.s2, -coeff, in->sp_up.s2);
+  _vector_add_i_mul(out->sp_dn.s3, -coeff, in->sp_up.s3);
+
+}
+
+
+
+/* out(x) += Fabs(y) * in(x), with
+ *   Fabs(y) = c * ( phi_0(y)^2 + phi_1(y)^2 + phi_2(y)^2 + phi_3(y)^2 ).
+ * Acts on a single site x; the caller passes the pointers for that site.
+ *   out : bispinor that is updated in place
+ *   in  : input bispinor at site x
+ *   phi : the four scalars phi[0..3] at site y, y = x or x+-\mu
+ *   c   : real constant
+ */
+
+static inline void Fabsadd(bispinor * const out, const bispinor * const in, const scalar * const phi, const double c) {
+  const double weight = c*(phi[0]*phi[0]+phi[1]*phi[1]+phi[2]*phi[2]+phi[3]*phi[3]);
+
+  // flavour 2 (sp_dn), every spin component scaled by the same real weight:
+  // out_dn += weight * in_dn
+  _vector_add_mul(out->sp_dn.s0, weight, in->sp_dn.s0);
+  _vector_add_mul(out->sp_dn.s1, weight, in->sp_dn.s1);
+  _vector_add_mul(out->sp_dn.s2, weight, in->sp_dn.s2);
+  _vector_add_mul(out->sp_dn.s3, weight, in->sp_dn.s3);
+
+  // flavour 1 (sp_up):
+  // out_up += weight * in_up
+  _vector_add_mul(out->sp_up.s0, weight, in->sp_up.s0);
+  _vector_add_mul(out->sp_up.s1, weight, in->sp_up.s1);
+  _vector_add_mul(out->sp_up.s2, weight, in->sp_up.s2);
+  _vector_add_mul(out->sp_up.s3, weight, in->sp_up.s3);
+
+}
+
+
+
+/* operation out(x) += F(y)*in(x)
+ * F(y) := [ \phi_0(y) + i \gamma_5 \tau^j \phi_j(y) ] * c
+ * this operator acts locally on a site x, pass pointers accordingly.
+ * out: the resulting bispinor, out += F*in
+ * in:  the input bispinor at site x
+ * phi: pointer to the four scalars phi0,...,phi3 at site y, y = x or x+-\mu
+ * c:    constant double
+ *
+ * sign = +1 -> Fadd
+ * sign = -1 -> Fbaradd
+ */
+
+static inline void Fadd(bispinor * const out, const bispinor * const in, const scalar * const phi, const double c, const double sign) {
+  /* Scratch spinor, fully overwritten by the _vector_mul calls below before it is read. */
+  /* Automatic storage keeps the helper reentrant without redefining the static keyword. */
+  spinor tmp;
+
+  /* only the sign of the argument matters: s = -1 (Fbar) for sign < 0, otherwise +1 (F) */
+  const double s = (sign < 0) ? -1. : 1.;
+
+  /* The +/- pattern inside each group of four lines is the \gamma_5 named in the comments: */
+  /* one sign on spin components s0, s1 and the opposite sign on s2, s3. */
+
+  // flavour 1:
+  // tmp_up = \phi_0 * in_up
+  _vector_mul(tmp.s0, phi[0], in->sp_up.s0);
+  _vector_mul(tmp.s1, phi[0], in->sp_up.s1);
+  _vector_mul(tmp.s2, phi[0], in->sp_up.s2);
+  _vector_mul(tmp.s3, phi[0], in->sp_up.s3);
+
+  // tmp_up += s * i \gamma_5 \phi_1 * in_dn
+  _vector_add_i_mul(tmp.s0,  s*phi[1], in->sp_dn.s0);
+  _vector_add_i_mul(tmp.s1,  s*phi[1], in->sp_dn.s1);
+  _vector_add_i_mul(tmp.s2, -s*phi[1], in->sp_dn.s2);
+  _vector_add_i_mul(tmp.s3, -s*phi[1], in->sp_dn.s3);
+
+  // tmp_up += s * \gamma_5 \phi_2 * in_dn
+  _vector_add_mul(tmp.s0,  s*phi[2], in->sp_dn.s0);
+  _vector_add_mul(tmp.s1,  s*phi[2], in->sp_dn.s1);
+  _vector_add_mul(tmp.s2, -s*phi[2], in->sp_dn.s2);
+  _vector_add_mul(tmp.s3, -s*phi[2], in->sp_dn.s3);
+
+  // tmp_up += s * i \gamma_5 \phi_3 * in_up
+  _vector_add_i_mul(tmp.s0,  s*phi[3], in->sp_up.s0);
+  _vector_add_i_mul(tmp.s1,  s*phi[3], in->sp_up.s1);
+  _vector_add_i_mul(tmp.s2, -s*phi[3], in->sp_up.s2);
+  _vector_add_i_mul(tmp.s3, -s*phi[3], in->sp_up.s3);
+
+  // out_up += c * tmp;
+  _vector_add_mul(out->sp_up.s0,c,tmp.s0);
+  _vector_add_mul(out->sp_up.s1,c,tmp.s1);
+  _vector_add_mul(out->sp_up.s2,c,tmp.s2);
+  _vector_add_mul(out->sp_up.s3,c,tmp.s3);
+
+
+  // flavour 2:
+  // tmp_dn = \phi_0 * in_dn
+  _vector_mul(tmp.s0, phi[0], in->sp_dn.s0);
+  _vector_mul(tmp.s1, phi[0], in->sp_dn.s1);
+  _vector_mul(tmp.s2, phi[0], in->sp_dn.s2);
+  _vector_mul(tmp.s3, phi[0], in->sp_dn.s3);
+
+  // tmp_dn += s * i \gamma_5 \phi_1 * in_up
+  _vector_add_i_mul(tmp.s0,  s*phi[1], in->sp_up.s0);
+  _vector_add_i_mul(tmp.s1,  s*phi[1], in->sp_up.s1);
+  _vector_add_i_mul(tmp.s2, -s*phi[1], in->sp_up.s2);
+  _vector_add_i_mul(tmp.s3, -s*phi[1], in->sp_up.s3);
+
+  // tmp_dn -= s * \gamma_5 \phi_2 * in_up
+  _vector_add_mul(tmp.s0, -s*phi[2], in->sp_up.s0);
+  _vector_add_mul(tmp.s1, -s*phi[2], in->sp_up.s1);
+  _vector_add_mul(tmp.s2,  s*phi[2], in->sp_up.s2);
+  _vector_add_mul(tmp.s3,  s*phi[2], in->sp_up.s3);
+
+  // tmp_dn -= s * i \gamma_5 \phi_3 * in_dn
+  _vector_add_i_mul(tmp.s0, -s*phi[3], in->sp_dn.s0);
+  _vector_add_i_mul(tmp.s1, -s*phi[3], in->sp_dn.s1);
+  _vector_add_i_mul(tmp.s2,  s*phi[3], in->sp_dn.s2);
+  _vector_add_i_mul(tmp.s3,  s*phi[3], in->sp_dn.s3);
+
+  // out_dn += c * tmp;
+  _vector_add_mul(out->sp_dn.s0,c,tmp.s0);
+  _vector_add_mul(out->sp_dn.s1,c,tmp.s1);
+  _vector_add_mul(out->sp_dn.s2,c,tmp.s2);
+  _vector_add_mul(out->sp_dn.s3,c,tmp.s3);
+}
+
+
+/* Hopping contribution in direction 0, accumulated into tmpr.
+ *   tmpr   : bispinor updated in place
+ *   s, u   : neighbour bispinor and the su3 link applied to it
+ *   inv    : 0 -> us is built by bispinor_times_phase_times_u,
+ *            otherwise by bispinor_times_phase_times_inverse_u
+ *   phase  : complex factor handed to that helper
+ *   phaseF : real weight given to Fadd for both scalar sets phi and phip
+ *   sign   : forwarded to Fadd unchanged
+ */
+static inline void p0add(bispinor * restrict const tmpr , bispinor const * restrict const s,
+                         su3 const * restrict const u, const int inv, const _Complex double phase,
+                         const double phaseF, const scalar * const phi, const scalar * const phip,
+                         const double sign) {
+  bispinor us; // scratch, completely written by the helper call below; automatic, so calls are reentrant
+
+  // us = phase*u*s (inv == 0) or its inverse-link counterpart (inv != 0)
+  if( inv ){
+    bispinor_times_phase_times_inverse_u(&us, phase, u, s);
+  }
+  else{
+    bispinor_times_phase_times_u(&us, phase, u, s);
+  }
+
+  // tmpr += r0_BSM*us
+  _vector_add_mul(tmpr->sp_up.s0, r0_BSM, us.sp_up.s0);
+  _vector_add_mul(tmpr->sp_up.s1, r0_BSM, us.sp_up.s1);
+  _vector_add_mul(tmpr->sp_up.s2, r0_BSM, us.sp_up.s2);
+  _vector_add_mul(tmpr->sp_up.s3, r0_BSM, us.sp_up.s3);
+  _vector_add_mul(tmpr->sp_dn.s0, r0_BSM, us.sp_dn.s0);
+  _vector_add_mul(tmpr->sp_dn.s1, r0_BSM, us.sp_dn.s1);
+  _vector_add_mul(tmpr->sp_dn.s2, r0_BSM, us.sp_dn.s2);
+  _vector_add_mul(tmpr->sp_dn.s3, r0_BSM, us.sp_dn.s3);
+
+  // \gamma_0*us (s0<->s2, s1<->s3): added for inv != 0, subtracted for inv == 0
+  if ( inv ){
+    _vector_add_assign(tmpr->sp_up.s0, us.sp_up.s2);
+    _vector_add_assign(tmpr->sp_up.s1, us.sp_up.s3);
+    _vector_add_assign(tmpr->sp_up.s2, us.sp_up.s0);
+    _vector_add_assign(tmpr->sp_up.s3, us.sp_up.s1);
+
+    _vector_add_assign(tmpr->sp_dn.s0, us.sp_dn.s2);
+    _vector_add_assign(tmpr->sp_dn.s1, us.sp_dn.s3);
+    _vector_add_assign(tmpr->sp_dn.s2, us.sp_dn.s0);
+    _vector_add_assign(tmpr->sp_dn.s3, us.sp_dn.s1);
+  }
+  else{
+    _vector_sub_assign(tmpr->sp_up.s0, us.sp_up.s2);
+    _vector_sub_assign(tmpr->sp_up.s1, us.sp_up.s3);
+    _vector_sub_assign(tmpr->sp_up.s2, us.sp_up.s0);
+    _vector_sub_assign(tmpr->sp_up.s3, us.sp_up.s1);
+
+    _vector_sub_assign(tmpr->sp_dn.s0, us.sp_dn.s2);
+    _vector_sub_assign(tmpr->sp_dn.s1, us.sp_dn.s3);
+    _vector_sub_assign(tmpr->sp_dn.s2, us.sp_dn.s0);
+    _vector_sub_assign(tmpr->sp_dn.s3, us.sp_dn.s1);
+  }
+
+  // tmpr += phaseF * ( F(phi) + F(phip) ) * us
+  Fadd(tmpr, &us, phi,  phaseF, sign);
+  Fadd(tmpr, &us, phip, phaseF, sign);
+}
+
+/* Hopping contribution in direction 1, accumulated into tmpr.
+ *   tmpr   : bispinor updated in place
+ *   s, u   : neighbour bispinor and the su3 link applied to it
+ *   inv    : 0 -> us is built by bispinor_times_phase_times_u,
+ *            otherwise by bispinor_times_phase_times_inverse_u
+ *   phase  : complex factor handed to that helper
+ *   phaseF : real weight given to Fadd for both scalar sets phi and phip
+ *   sign   : forwarded to Fadd unchanged
+ */
+static inline void p1add(bispinor * restrict const tmpr, bispinor const * restrict const s,
+                         su3 const * restrict const u, const int inv, const _Complex double phase,
+                         const double phaseF, const scalar * const phi, const scalar * const phip,
+                         const double sign) {
+  bispinor us; // scratch, completely written by the helper call below; automatic, so calls are reentrant
+
+  // us = phase*u*s (inv == 0) or its inverse-link counterpart (inv != 0)
+  if( inv ){
+    bispinor_times_phase_times_inverse_u(&us, phase, u, s);
+  }
+  else{
+    bispinor_times_phase_times_u(&us, phase, u, s);
+  }
+
+  // tmpr += r0_BSM*us
+  _vector_add_mul(tmpr->sp_up.s0, r0_BSM, us.sp_up.s0);
+  _vector_add_mul(tmpr->sp_up.s1, r0_BSM, us.sp_up.s1);
+  _vector_add_mul(tmpr->sp_up.s2, r0_BSM, us.sp_up.s2);
+  _vector_add_mul(tmpr->sp_up.s3, r0_BSM, us.sp_up.s3);
+  _vector_add_mul(tmpr->sp_dn.s0, r0_BSM, us.sp_dn.s0);
+  _vector_add_mul(tmpr->sp_dn.s1, r0_BSM, us.sp_dn.s1);
+  _vector_add_mul(tmpr->sp_dn.s2, r0_BSM, us.sp_dn.s2);
+  _vector_add_mul(tmpr->sp_dn.s3, r0_BSM, us.sp_dn.s3);
+
+  // \gamma_1*us (s0<->s3, s1<->s2, with factors of i): added for inv != 0, subtracted for inv == 0
+  if (inv){
+    _vector_i_add_assign(tmpr->sp_up.s0, us.sp_up.s3);
+    _vector_i_add_assign(tmpr->sp_up.s1, us.sp_up.s2);
+    _vector_i_sub_assign(tmpr->sp_up.s2, us.sp_up.s1);
+    _vector_i_sub_assign(tmpr->sp_up.s3, us.sp_up.s0);
+
+    _vector_i_add_assign(tmpr->sp_dn.s0, us.sp_dn.s3);
+    _vector_i_add_assign(tmpr->sp_dn.s1, us.sp_dn.s2);
+    _vector_i_sub_assign(tmpr->sp_dn.s2, us.sp_dn.s1);
+    _vector_i_sub_assign(tmpr->sp_dn.s3, us.sp_dn.s0);
+  }
+  else {
+    _vector_i_sub_assign(tmpr->sp_up.s0, us.sp_up.s3);
+    _vector_i_sub_assign(tmpr->sp_up.s1, us.sp_up.s2);
+    _vector_i_add_assign(tmpr->sp_up.s2, us.sp_up.s1);
+    _vector_i_add_assign(tmpr->sp_up.s3, us.sp_up.s0);
+
+    _vector_i_sub_assign(tmpr->sp_dn.s0, us.sp_dn.s3);
+    _vector_i_sub_assign(tmpr->sp_dn.s1, us.sp_dn.s2);
+    _vector_i_add_assign(tmpr->sp_dn.s2, us.sp_dn.s1);
+    _vector_i_add_assign(tmpr->sp_dn.s3, us.sp_dn.s0);
+  }
+
+  // tmpr += phaseF * ( F(phi) + F(phip) ) * us
+  Fadd(tmpr, &us, phi,  phaseF, sign);
+  Fadd(tmpr, &us, phip, phaseF, sign);
+}
+
+/* Hopping contribution in direction 2, accumulated into tmpr.
+ *   tmpr   : bispinor updated in place
+ *   s, u   : neighbour bispinor and the su3 link applied to it
+ *   inv    : 0 -> us is built by bispinor_times_phase_times_u,
+ *            otherwise by bispinor_times_phase_times_inverse_u
+ *   phase  : complex factor handed to that helper
+ *   phaseF : real weight given to Fadd for both scalar sets phi and phip
+ *   sign   : forwarded to Fadd unchanged
+ */
+static inline void p2add(bispinor * restrict const tmpr, bispinor const * restrict const s,
+                         su3 const * restrict const u, const int inv, const _Complex double phase,
+                         const double phaseF, const scalar * const phi, const scalar * const phip,
+                         const double sign) {
+  bispinor us; // scratch, completely written by the helper call below; automatic, so calls are reentrant
+  // us = phase*u*s (inv == 0) or its inverse-link counterpart (inv != 0)
+  if( inv ){
+    bispinor_times_phase_times_inverse_u(&us, phase, u, s);
+  }
+  else{
+    bispinor_times_phase_times_u(&us, phase, u, s);
+  }
+
+  // tmpr += r0_BSM*us
+  _vector_add_mul(tmpr->sp_up.s0, r0_BSM, us.sp_up.s0);
+  _vector_add_mul(tmpr->sp_up.s1, r0_BSM, us.sp_up.s1);
+  _vector_add_mul(tmpr->sp_up.s2, r0_BSM, us.sp_up.s2);
+  _vector_add_mul(tmpr->sp_up.s3, r0_BSM, us.sp_up.s3);
+  _vector_add_mul(tmpr->sp_dn.s0, r0_BSM, us.sp_dn.s0);
+  _vector_add_mul(tmpr->sp_dn.s1, r0_BSM, us.sp_dn.s1);
+  _vector_add_mul(tmpr->sp_dn.s2, r0_BSM, us.sp_dn.s2);
+  _vector_add_mul(tmpr->sp_dn.s3, r0_BSM, us.sp_dn.s3);
+
+  // \gamma_2*us (s0<->s3, s1<->s2): added for inv != 0, subtracted for inv == 0
+  if (inv){
+    _vector_add_assign(tmpr->sp_up.s0, us.sp_up.s3);
+    _vector_sub_assign(tmpr->sp_up.s1, us.sp_up.s2);
+    _vector_sub_assign(tmpr->sp_up.s2, us.sp_up.s1);
+    _vector_add_assign(tmpr->sp_up.s3, us.sp_up.s0);
+
+    _vector_add_assign(tmpr->sp_dn.s0, us.sp_dn.s3);
+    _vector_sub_assign(tmpr->sp_dn.s1, us.sp_dn.s2);
+    _vector_sub_assign(tmpr->sp_dn.s2, us.sp_dn.s1);
+    _vector_add_assign(tmpr->sp_dn.s3, us.sp_dn.s0);
+  }
+  else{
+    _vector_sub_assign(tmpr->sp_up.s0, us.sp_up.s3);
+    _vector_add_assign(tmpr->sp_up.s1, us.sp_up.s2);
+    _vector_add_assign(tmpr->sp_up.s2, us.sp_up.s1);
+    _vector_sub_assign(tmpr->sp_up.s3, us.sp_up.s0);
+
+    _vector_sub_assign(tmpr->sp_dn.s0, us.sp_dn.s3);
+    _vector_add_assign(tmpr->sp_dn.s1, us.sp_dn.s2);
+    _vector_add_assign(tmpr->sp_dn.s2, us.sp_dn.s1);
+    _vector_sub_assign(tmpr->sp_dn.s3, us.sp_dn.s0);
+  }
+  // tmpr += phaseF * ( F(phi) + F(phip) ) * us
+  Fadd(tmpr, &us, phi,  phaseF, sign);
+  Fadd(tmpr, &us, phip, phaseF, sign);
+}
+
+/* Hopping contribution in direction 3, accumulated into tmpr.
+ *   tmpr   : bispinor updated in place
+ *   s, u   : neighbour bispinor and the su3 link applied to it
+ *   inv    : 0 -> us is built by bispinor_times_phase_times_u,
+ *            otherwise by bispinor_times_phase_times_inverse_u
+ *   phase  : complex factor handed to that helper
+ *   phaseF : real weight given to Fadd for both scalar sets phi and phip
+ *   sign   : forwarded to Fadd unchanged
+ */
+static inline void p3add(bispinor * restrict const tmpr, bispinor const * restrict const s,
+                         su3 const * restrict const u, const int inv, const _Complex double phase,
+                         const double phaseF, const scalar * const phi, const scalar * const phip,
+                         const double sign) {
+  bispinor us; // scratch, completely written by the helper call below; automatic, so calls are reentrant
+  // us = phase*u*s (inv == 0) or its inverse-link counterpart (inv != 0)
+  if( inv ){
+    bispinor_times_phase_times_inverse_u(&us, phase, u, s);
+  }
+  else{
+    bispinor_times_phase_times_u(&us, phase, u, s);
+  }
+
+  // tmpr += r0_BSM*us
+  _vector_add_mul(tmpr->sp_up.s0, r0_BSM, us.sp_up.s0);
+  _vector_add_mul(tmpr->sp_up.s1, r0_BSM, us.sp_up.s1);
+  _vector_add_mul(tmpr->sp_up.s2, r0_BSM, us.sp_up.s2);
+  _vector_add_mul(tmpr->sp_up.s3, r0_BSM, us.sp_up.s3);
+  _vector_add_mul(tmpr->sp_dn.s0, r0_BSM, us.sp_dn.s0);
+  _vector_add_mul(tmpr->sp_dn.s1, r0_BSM, us.sp_dn.s1);
+  _vector_add_mul(tmpr->sp_dn.s2, r0_BSM, us.sp_dn.s2);
+  _vector_add_mul(tmpr->sp_dn.s3, r0_BSM, us.sp_dn.s3);
+
+  // \gamma_3*us (s0<->s2, s1<->s3, with factors of i): added for inv != 0, subtracted for inv == 0
+  if (inv){
+    _vector_i_add_assign(tmpr->sp_up.s0, us.sp_up.s2);
+    _vector_i_sub_assign(tmpr->sp_up.s1, us.sp_up.s3);
+    _vector_i_sub_assign(tmpr->sp_up.s2, us.sp_up.s0);
+    _vector_i_add_assign(tmpr->sp_up.s3, us.sp_up.s1);
+
+    _vector_i_add_assign(tmpr->sp_dn.s0, us.sp_dn.s2);
+    _vector_i_sub_assign(tmpr->sp_dn.s1, us.sp_dn.s3);
+    _vector_i_sub_assign(tmpr->sp_dn.s2, us.sp_dn.s0);
+    _vector_i_add_assign(tmpr->sp_dn.s3, us.sp_dn.s1);
+  }
+  else {
+    _vector_i_sub_assign(tmpr->sp_up.s0, us.sp_up.s2);
+    _vector_i_add_assign(tmpr->sp_up.s1, us.sp_up.s3);
+    _vector_i_add_assign(tmpr->sp_up.s2, us.sp_up.s0);
+    _vector_i_sub_assign(tmpr->sp_up.s3, us.sp_up.s1);
+
+    _vector_i_sub_assign(tmpr->sp_dn.s0, us.sp_dn.s2);
+    _vector_i_add_assign(tmpr->sp_dn.s1, us.sp_dn.s3);
+    _vector_i_add_assign(tmpr->sp_dn.s2, us.sp_dn.s0);
+    _vector_i_sub_assign(tmpr->sp_dn.s3, us.sp_dn.s1);
+  }
+
+  // tmpr += phaseF * ( F(phi) + F(phip) ) * us
+  Fadd(tmpr, &us, phi,  phaseF, sign);
+  Fadd(tmpr, &us, phip, phaseF, sign);
+}
+
+
+
+/**********************************************
+ * D_psi_BSM3_test acts on bispinor fields.   *
+ * Test version only: it provides a version   *
+ * that works with both r0_BSM=0 and          *
+ * r0_BSM=1; it is therefore not optimal and  *
+ * is used for testing purposes only.         *
+ *********************************************/
+void D_psi_BSM3_test(bispinor * const P, bispinor * const Q){
+  if(P==Q){
+    printf("Error in %s (%s):\n", __func__, __FILE__);
+    printf("Arguments must be different bispinor fields\n");
+    printf("Program aborted\n");
+    exit(1);
+  }
+
+  /* P(x) is assembled site by site from Q(x) and from Q at the eight neighbours of x */
+#ifdef TM_USE_MPI
+  generic_exchange(Q, sizeof(bispinor));
+#endif
+
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+
+    int ix,iy;                       // x, x+-\mu
+    su3 * restrict up,* restrict um; // U_\mu(x), U_\mu(x-\mu)
+    bispinor * restrict rr;          // P(x)
+    bispinor const * restrict s;     // Q(x)
+    bispinor const * restrict sp;    // Q(x+\mu)
+    bispinor const * restrict sm;    // Q(x-\mu)
+    scalar phi[4];                   // phi_i(x)
+    scalar phip[4][4];               // phi_i(x+mu) = phip[mu][i]
+    scalar phim[4][4];               // phi_i(x-mu) = phim[mu][i]
+    /* all of the above are declared inside the parallel region, i.e. one copy per thread */
+
+
+
+    /************************ loop over all lattice sites *************************/
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+    for (ix=0;ix<VOLUME;ix++)
+    {
+      rr = (bispinor *) P + ix;
+      s  = (bispinor *) Q + ix;
+
+      /* prefetch the scalar fields at x and at its eight neighbours x+-\mu */
+      phi[0] = g_scalar_field[0][ix];
+      phi[1] = g_scalar_field[1][ix];
+      phi[2] = g_scalar_field[2][ix];
+      phi[3] = g_scalar_field[3][ix];
+
+      for( int mu=0; mu<4; mu++ )
+      {
+        phip[mu][0] = g_scalar_field[0][g_iup[ix][mu]];
+        phip[mu][1] = g_scalar_field[1][g_iup[ix][mu]];
+        phip[mu][2] = g_scalar_field[2][g_iup[ix][mu]];
+        phip[mu][3] = g_scalar_field[3][g_iup[ix][mu]];
+
+        phim[mu][0] = g_scalar_field[0][g_idn[ix][mu]];
+        phim[mu][1] = g_scalar_field[1][g_idn[ix][mu]];
+        phim[mu][2] = g_scalar_field[2][g_idn[ix][mu]];
+        phim[mu][3] = g_scalar_field[3][g_idn[ix][mu]];
+      }
+
+      /* the local part (not local in phi) */
+
+      _spinor_null(rr->sp_up);
+      _spinor_null(rr->sp_dn);
+
+      /* tmpr += (4*r0_BSM+m0_BSM)*s */
+      _vector_add_mul(rr->sp_up.s0, 4*r0_BSM+m0_BSM, s->sp_up.s0);
+      _vector_add_mul(rr->sp_up.s1, 4*r0_BSM+m0_BSM, s->sp_up.s1);
+      _vector_add_mul(rr->sp_up.s2, 4*r0_BSM+m0_BSM, s->sp_up.s2);
+      _vector_add_mul(rr->sp_up.s3, 4*r0_BSM+m0_BSM, s->sp_up.s3);
+
+      _vector_add_mul(rr->sp_dn.s0, 4*r0_BSM+m0_BSM, s->sp_dn.s0);
+      _vector_add_mul(rr->sp_dn.s1, 4*r0_BSM+m0_BSM, s->sp_dn.s1);
+      _vector_add_mul(rr->sp_dn.s2, 4*r0_BSM+m0_BSM, s->sp_dn.s2);
+      _vector_add_mul(rr->sp_dn.s3, 4*r0_BSM+m0_BSM, s->sp_dn.s3);
+
+
+
+      /* tmpr += (\eta_BSM+2*\rho_BSM) * F(x)*Q(x) */
+      Fadd(rr, s, phi, eta_BSM+2.0*rho_BSM, +1.);
+
+      /* tmpr += \sum_\mu (\rho_BSM/4) * F(x+-\mu)*Q(x) */
+      for( int mu=0; mu<4; mu++ ) {
+        Fadd(rr, s, phip[mu], 0.25*rho_BSM, +1.);
+        Fadd(rr, s, phim[mu], 0.25*rho_BSM, +1.);
+      }
+      Fabsadd(rr,s,phi,c5phi_BSM);
+
+      /* tmpr += i\gamma_5\tau_1 mu01_BSM * Q, skipped when mu01_BSM is numerically zero */
+      if( fabs(mu01_BSM) > 1.e-10 ) {
+        tm1_add(rr, s, 1);
+      }
+      /* tmpr += i\gamma_5\tau_3 mu03_BSM * Q, skipped when mu03_BSM is numerically zero */
+      if( fabs(mu03_BSM) > 1.e-10 ) {
+        tm3_add(rr, s, 1);
+      }
+      /* the hopping part: one p<mu>add call per direction and orientation, each given
+       * phase -0.5*phase_mu, F-weight 0.5*rho_BSM, phi(x) and phi at the neighbour */
+      /******************************* direction +0 *********************************/
+      iy=g_iup[ix][0];
+      sp = (bispinor *) Q +iy;
+
+      up=&g_gauge_field[ix][0];
+      p0add(rr, sp, up, 0, -0.5*phase_0, 0.5*rho_BSM, phi, phip[0], +1.);
+
+      /******************************* direction -0 *********************************/
+
+      iy=g_idn[ix][0];
+      sm = (bispinor *) Q +iy;
+      um=&g_gauge_field[iy][0];
+      p0add(rr, sm, um, 1, -0.5*phase_0, 0.5*rho_BSM, phi, phim[0], +1.);
+
+      /******************************* direction +1 *********************************/
+      iy=g_iup[ix][1];
+      sp = (bispinor *) Q +iy;
+      up=&g_gauge_field[ix][1];
+      p1add(rr, sp, up, 0, -0.5*phase_1, 0.5*rho_BSM, phi, phip[1], +1.);
+
+
+      /******************************* direction -1 *********************************/
+      iy=g_idn[ix][1];
+      sm = (bispinor *) Q +iy;
+      um=&g_gauge_field[iy][1];
+      p1add(rr, sm, um, 1, -0.5*phase_1, 0.5*rho_BSM, phi, phim[1], +1.);
+
+      /******************************* direction +2 *********************************/
+      iy=g_iup[ix][2];
+      sp = (bispinor *) Q +iy;
+      up=&g_gauge_field[ix][2];
+      p2add(rr, sp, up, 0, -0.5*phase_2, 0.5*rho_BSM, phi, phip[2], +1.);
+
+      /******************************* direction -2 *********************************/
+      iy=g_idn[ix][2];
+      sm = (bispinor *) Q +iy;
+      um=&g_gauge_field[iy][2];
+      p2add(rr, sm, um, 1, -0.5*phase_2, 0.5*rho_BSM, phi, phim[2], +1.);
+
+
+      /******************************* direction +3 *********************************/
+      iy=g_iup[ix][3];
+      sp = (bispinor *) Q +iy;
+      up=&g_gauge_field[ix][3];
+      p3add(rr, sp, up, 0, -0.5*phase_3, 0.5*rho_BSM, phi, phip[3], +1.);
+
+      /******************************* direction -3 *********************************/
+      iy=g_idn[ix][3];
+      sm = (bispinor *) Q +iy;
+      um=&g_gauge_field[iy][3];
+      p3add(rr, sm, um, 1, -0.5*phase_3, 0.5*rho_BSM, phi, phim[3], +1.);
+
+    }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+}
+#endif
diff --git a/operator/D_psi_BSM3_test.h b/operator/D_psi_BSM3_test.h
new file mode 100644
index 000000000..fe1c56f1a
--- /dev/null
+++ b/operator/D_psi_BSM3_test.h
@@ -0,0 +1,28 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _D_PSI_BSM3_TEST_H
+#define _D_PSI_BSM3_TEST_H
+/* Prototype of the BSM3 test operator; both arguments are bispinor fields. */
+#include "su3.h" /* for bispinor, so this header can be included on its own */
+void D_psi_BSM3_test(bispinor * const P, bispinor * const Q);
+
+#endif
diff --git a/operator/D_psi_body.c b/operator/D_psi_body.c
new file mode 100644
index 000000000..225d0803b
--- /dev/null
+++ b/operator/D_psi_body.c
@@ -0,0 +1,375 @@
+static inline void _PTSWITCH(p0add)(_PTSWITCH(spinor) * restrict const tmpr , _PTSWITCH(spinor) const * restrict const s, 
+				    _PSWITCH(su3) const * restrict const u, const _C_TYPE phase) {
+#ifdef TM_USE_OMP /* OpenMP build: `static` expands to nothing, so chi/psi are automatic (presumably to keep threads from sharing them) */
+#define static
+#endif
+  static _PTSWITCH(su3_vector) chi, psi;
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* "+0" hopping term: (s0,s2) of s via _vector_add, then _su3_multiply with *u; macro meanings are read from their names */
+  _vector_add(psi,s->s0, s->s2);
+  _su3_multiply(chi, (*u), psi);
+  /* scaled by phase, the result is accumulated into tmpr->s0 and tmpr->s2 */
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s0, psi);
+  _vector_add_assign(tmpr->s2, psi);
+  /* same steps for the (s1,s3) pair, accumulated into tmpr->s1 and tmpr->s3 */
+  _vector_add(psi, s->s1, s->s3);
+  _su3_multiply(chi, (*u), psi);
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s1, psi);
+  _vector_add_assign(tmpr->s3, psi);
+
+  return;
+}
+
+
+static inline void _PTSWITCH(m0add)(_PTSWITCH(spinor) * restrict const tmpr, _PTSWITCH(spinor) const * restrict const s, 
+				    _PSWITCH(su3) const * restrict const u, const _C_TYPE phase) {
+#ifdef TM_USE_OMP /* OpenMP build: `static` expands to nothing, so chi/psi are automatic (presumably to keep threads from sharing them) */
+#define static
+#endif
+  static _PTSWITCH(su3_vector) chi, psi;
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* "-0" hopping term: (s0,s2) of s via _vector_sub, then _su3_inverse_multiply with *u; macro meanings are read from their names */
+  _vector_sub(psi, s->s0, s->s2);
+  _su3_inverse_multiply(chi, (*u), psi);
+  /* scaled with _complexcjg_times_vector, the result is added to tmpr->s0 and subtracted from tmpr->s2 */
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s0, psi);
+  _vector_sub_assign(tmpr->s2, psi);
+  /* same steps for the (s1,s3) pair: added to tmpr->s1, subtracted from tmpr->s3 */
+  _vector_sub(psi, s->s1, s->s3);
+  _su3_inverse_multiply(chi, (*u), psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s1, psi);
+  _vector_sub_assign(tmpr->s3, psi);
+
+  return;
+}
+
+static inline void _PTSWITCH(p1add)(_PTSWITCH(spinor) * restrict const tmpr, _PTSWITCH(spinor) const * restrict const s, 
+				    _PSWITCH(su3) const * restrict const u, const _C_TYPE phase) {
+#ifdef TM_USE_OMP /* OpenMP build: `static` expands to nothing, so chi/psi are automatic (presumably to keep threads from sharing them) */
+#define static
+#endif
+  static _PTSWITCH(su3_vector) chi, psi;
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* "+1" hopping term: (s0,s3) of s via _vector_i_add, then _su3_multiply with *u; macro meanings are read from their names */
+  _vector_i_add(psi,s->s0,s->s3);
+  _su3_multiply(chi,(*u),psi);
+  /* scaled by phase, the result is added to tmpr->s0 and applied to tmpr->s3 with _vector_i_sub_assign */
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s0, psi);
+  _vector_i_sub_assign(tmpr->s3, psi);
+  /* same steps for the (s1,s2) pair: added to tmpr->s1, _vector_i_sub_assign on tmpr->s2 */
+  _vector_i_add(psi, s->s1, s->s2);
+  _su3_multiply(chi, (*u), psi);
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s1, psi);
+  _vector_i_sub_assign(tmpr->s2, psi);
+
+  return;
+}
+
+static inline void _PTSWITCH(m1add)(_PTSWITCH(spinor) * restrict const tmpr, _PTSWITCH(spinor) const * restrict const s, 
+				    _PSWITCH(su3) const * restrict const u, const _C_TYPE phase) {
+#ifdef TM_USE_OMP /* OpenMP build: `static` expands to nothing, so chi/psi are automatic (presumably to keep threads from sharing them) */
+#define static
+#endif
+  static _PTSWITCH(su3_vector) chi, psi;
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* "-1" hopping term: (s0,s3) of s via _vector_i_sub, then _su3_inverse_multiply with *u; macro meanings are read from their names */
+  _vector_i_sub(psi,s->s0, s->s3);
+  _su3_inverse_multiply(chi,(*u), psi);
+  /* scaled with _complexcjg_times_vector, the result is added to tmpr->s0 and applied to tmpr->s3 with _vector_i_add_assign */
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s0, psi);
+  _vector_i_add_assign(tmpr->s3, psi);
+  /* same steps for the (s1,s2) pair: added to tmpr->s1, _vector_i_add_assign on tmpr->s2 */
+  _vector_i_sub(psi, s->s1, s->s2);
+  _su3_inverse_multiply(chi, (*u), psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s1, psi);
+  _vector_i_add_assign(tmpr->s2, psi);
+
+  return;
+}
+
+static inline void _PTSWITCH(p2add)(_PTSWITCH(spinor) * restrict const tmpr, _PTSWITCH(spinor) const * restrict const s, 
+				    _PSWITCH(su3) const * restrict const u, const _C_TYPE phase) {
+#ifdef TM_USE_OMP /* OpenMP build: `static` expands to nothing, so chi/psi are automatic (presumably to keep threads from sharing them) */
+#define static
+#endif
+  static _PTSWITCH(su3_vector) chi, psi;
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* "+2" hopping term: (s0,s3) of s via _vector_add, then _su3_multiply with *u; macro meanings are read from their names */
+  _vector_add(psi,s->s0,s->s3);
+  _su3_multiply(chi, (*u), psi);
+  /* scaled by phase, the result is accumulated into tmpr->s0 and tmpr->s3 */
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s0, psi);
+  _vector_add_assign(tmpr->s3, psi);
+  /* second pair is (s1,s2) via _vector_sub: added to tmpr->s1, subtracted from tmpr->s2 */
+  _vector_sub(psi,s->s1,s->s2);
+  _su3_multiply(chi, (*u), psi);
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s1, psi);
+  _vector_sub_assign(tmpr->s2, psi);
+
+
+  return;
+}
+
+static inline void _PTSWITCH(m2add)(_PTSWITCH(spinor) * restrict const tmpr, _PTSWITCH(spinor) const * restrict const s, 
+				    _PSWITCH(su3) const * restrict const u, const _C_TYPE phase) {
+#ifdef TM_USE_OMP /* OpenMP build: `static` expands to nothing, so chi/psi are automatic (presumably to keep threads from sharing them) */
+#define static
+#endif
+  static _PTSWITCH(su3_vector) chi, psi;
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* "-2" hopping term: (s0,s3) of s via _vector_sub, then _su3_inverse_multiply with *u; macro meanings are read from their names */
+  _vector_sub(psi, s->s0, s->s3);
+  _su3_inverse_multiply(chi, (*u), psi);
+  /* scaled with _complexcjg_times_vector, the result is added to tmpr->s0 and subtracted from tmpr->s3 */
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s0, psi);
+  _vector_sub_assign(tmpr->s3, psi);
+  /* second pair is (s1,s2) via _vector_add: accumulated into tmpr->s1 and tmpr->s2 */
+  _vector_add(psi, s->s1, s->s2);
+  _su3_inverse_multiply(chi, (*u),psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s1, psi);
+  _vector_add_assign(tmpr->s2, psi);
+
+  return;
+}
+
+static inline void _PTSWITCH(p3add)(_PTSWITCH(spinor) * restrict const tmpr, _PTSWITCH(spinor) const * restrict const s, 
+				    _PSWITCH(su3) const * restrict const u, const _C_TYPE phase) {
+#ifdef TM_USE_OMP /* OpenMP build: `static` expands to nothing, so chi/psi are automatic (presumably to keep threads from sharing them) */
+#define static
+#endif
+  static _PTSWITCH(su3_vector) chi, psi;
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* "+3" hopping term: (s0,s2) of s via _vector_i_add, then _su3_multiply with *u; macro meanings are read from their names */
+  _vector_i_add(psi, s->s0, s->s2);
+  _su3_multiply(chi, (*u), psi);
+  /* scaled by phase, the result is added to tmpr->s0 and applied to tmpr->s2 with _vector_i_sub_assign */
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s0, psi);
+  _vector_i_sub_assign(tmpr->s2, psi);
+  /* second pair is (s1,s3) via _vector_i_sub: added to tmpr->s1, _vector_i_add_assign on tmpr->s3 */
+  _vector_i_sub(psi,s->s1, s->s3);
+  _su3_multiply(chi, (*u), psi);
+
+  _complex_times_vector(psi, phase, chi);
+  _vector_add_assign(tmpr->s1, psi);
+  _vector_i_add_assign(tmpr->s3, psi);
+
+  return;
+}
+
+static inline void _PTSWITCH(m3addandstore)(_PTSWITCH(spinor) * restrict const r, _PTSWITCH(spinor) const * restrict const s, 
+					    _PSWITCH(su3) const * restrict const u, const _C_TYPE phase,
+					    _PTSWITCH(spinor) const * restrict const tmpr) {
+#ifdef TM_USE_OMP /* OpenMP build: `static` expands to nothing, so chi/psi are automatic (presumably to keep threads from sharing them) */
+#define static
+#endif
+  static _PTSWITCH(su3_vector) chi, psi;
+#ifdef TM_USE_OMP
+#undef static
+#endif
+  /* "-3" hopping term; unlike the other terms it only reads the accumulator tmpr and writes the combined result to r */
+  _vector_i_sub(psi,s->s0, s->s2);
+  _su3_inverse_multiply(chi, (*u), psi);
+  /* r->s0 and r->s2 are formed from tmpr->s0 / tmpr->s2 and the scaled vector (_vector_add, _vector_i_add) */
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add(r->s0, tmpr->s0, psi);
+  _vector_i_add(r->s2, tmpr->s2, psi);
+  /* second pair is (s1,s3) via _vector_i_add: r->s1 uses _vector_add, r->s3 uses _vector_i_sub */
+  _vector_i_add(psi, s->s1, s->s3);
+  _su3_inverse_multiply(chi, (*u), psi);
+
+  _complexcjg_times_vector(psi, phase, chi);
+  _vector_add(r->s1, tmpr->s1, psi);
+  _vector_i_sub(r->s3, tmpr->s3, psi);
+
+  return;
+}
+
+/* Hopping part only: adds the eight neighbour terms to tmpr and stores the sum in rr; tmpr is not initialised here. */
+static inline void _PSWITCH(local_H)(_PTSWITCH(spinor) * const rr, _PTSWITCH(spinor) const * const s, 
+				      _PSWITCH(su3) const * restrict u, 
+				      int * _idx, _PTSWITCH(spinor) * const restrict tmpr) {
+  // convert phases to _C_TYPE locally
+  _C_TYPE ALIGN32 phase_0l = (_C_TYPE) phase_0;
+  _C_TYPE ALIGN32 phase_1l = (_C_TYPE) phase_1;
+  _C_TYPE ALIGN32 phase_2l = (_C_TYPE) phase_2;
+  _C_TYPE ALIGN32 phase_3l = (_C_TYPE) phase_3;  
+  /* local cursor over the caller's index array; each entry is used as an offset into s */
+  int * idx = _idx;
+  /* u and idx advance together: one link and one index per direction, in the order +0,-0,+1,-1,+2,-2,+3,-3 */
+  /****** direction +0 ******/
+  _PTSWITCH(p0add)(tmpr, s + (*idx), u, phase_0l);
+  u++;
+  idx++;
+  /****** direction -0 ******/
+  _PTSWITCH(m0add)(tmpr, s + (*idx), u, phase_0l);
+  u++;
+  idx++;
+  /****** direction +1 ******/
+  _PTSWITCH(p1add)(tmpr, s + (*idx), u, phase_1l);
+  u++;
+  idx++;
+  /****** direction -1 ******/
+  _PTSWITCH(m1add)(tmpr, s + (*idx), u, phase_1l);
+  u++;
+  idx++;
+  /****** direction +2 ******/
+  _PTSWITCH(p2add)(tmpr, s + (*idx), u, phase_2l);
+  u++;
+  idx++;
+  /****** direction -2 ******/
+  _PTSWITCH(m2add)(tmpr, s + (*idx), u, phase_2l);
+  u++;
+  idx++;
+  /****** direction +3 ******/
+  _PTSWITCH(p3add)(tmpr, s + (*idx), u, phase_3l);
+  u++;
+  idx++;
+  /****** direction -3 ******/
+  _PTSWITCH(m3addandstore)(rr, s + (*idx), u, phase_3l, tmpr);
+
+  return;
+}
+
+void _PSWITCH(D_psi)(_PTSWITCH(spinor) * const P, _PTSWITCH(spinor) * const Q){ /* writes P[ix] from Q for every site: site-local term plus eight hopping terms; P and Q must differ */
+  if(P==Q){
+    printf("Error in D_psi (operator.c):\n");
+    printf("Arguments must be different spinor fields\n");
+    printf("Program aborted\n");
+    exit(1);
+  }
+  // convert the global phases to _C_TYPE once, before the parallel region
+  _C_TYPE ALIGN32 phase_0l = (_C_TYPE) phase_0;
+  _C_TYPE ALIGN32 phase_1l = (_C_TYPE) phase_1;
+  _C_TYPE ALIGN32 phase_2l = (_C_TYPE) phase_2;
+  _C_TYPE ALIGN32 phase_3l = (_C_TYPE) phase_3;  
+  /* with _GAUGE_COPY, update_backward_gauge is called whenever the update flag is set */
+#ifdef _GAUGE_COPY
+  if(_PSWITCH(g_update_gauge_copy)) {
+    _PSWITCH(update_backward_gauge)(_PSWITCH(g_gauge_field));
+  }
+#endif
+# if defined TM_USE_MPI
+  _PTSWITCH(xchange_lexicfield)(Q);
+# endif
+
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* with TM_USE_OMP everything declared from here on lives inside the parallel region */
+  int ix,iy;
+  _PSWITCH(su3) * restrict up,* restrict um;
+  _PTSWITCH(spinor) * restrict rr; 
+  _PTSWITCH(spinor) const * restrict s;
+  _PTSWITCH(spinor) const * restrict sp;
+  _PTSWITCH(spinor) const * restrict sm;
+  _C_TYPE rho1, rho2;
+  _PTSWITCH(spinor) tmpr;
+  /* rho1 = 1 + i*g_mu, rho2 = its complex conjugate */
+  rho1 = (_F_TYPE)1. + (_F_TYPE) g_mu * I;
+  rho2 = conj(rho1);
+
+  /************************ loop over all lattice sites *************************/
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (ix = 0; ix < VOLUME; ix++) {
+    rr  = (_PTSWITCH(spinor) *) P +ix;
+    s  = (_PTSWITCH(spinor) *) Q +ix;
+    /* site-local term into tmpr: the sw routine when g_c_sw > 0, otherwise rho1 on s0,s1 and rho2 on s2,s3 */
+    if(g_c_sw > 0) {
+      _PSWITCH(assign_mul_one_sw_pm_imu_site_lexic)(ix, &tmpr, s, (_F_TYPE) g_mu);
+    }
+    else {
+      _complex_times_vector(tmpr.s0, rho1, s->s0);
+      _complex_times_vector(tmpr.s1, rho1, s->s1);
+      _complex_times_vector(tmpr.s2, rho2, s->s2);
+      _complex_times_vector(tmpr.s3, rho2, s->s3);
+    }
+    /* hopping terms: forward directions take the link stored at ix, backward ones the link stored at the neighbour iy */
+    /******************************* direction +0 *********************************/
+    iy=g_iup[ix][0];
+    sp = (_PTSWITCH(spinor) *) Q +iy;
+    up=&_PSWITCH(g_gauge_field)[ix][0];
+    _PTSWITCH(p0add)(&tmpr, sp, up, phase_0l);
+
+    /******************************* direction -0 *********************************/
+    iy=g_idn[ix][0];
+    sm  = (_PTSWITCH(spinor) *) Q +iy;
+    um=&_PSWITCH(g_gauge_field)[iy][0];
+    _PTSWITCH(m0add)(&tmpr, sm, um, phase_0l);
+
+    /******************************* direction +1 *********************************/
+    iy=g_iup[ix][1];
+    sp = (_PTSWITCH(spinor) *) Q +iy;
+    up=&_PSWITCH(g_gauge_field)[ix][1];
+    _PTSWITCH(p1add)(&tmpr, sp, up, phase_1l);
+
+    /******************************* direction -1 *********************************/
+    iy=g_idn[ix][1];
+    sm = (_PTSWITCH(spinor) *) Q +iy;
+    um=&_PSWITCH(g_gauge_field)[iy][1];
+    _PTSWITCH(m1add)(&tmpr, sm, um, phase_1l);
+
+    /******************************* direction +2 *********************************/
+    iy=g_iup[ix][2];
+    sp = (_PTSWITCH(spinor) *) Q +iy;
+    up=&_PSWITCH(g_gauge_field)[ix][2];
+    _PTSWITCH(p2add)(&tmpr, sp, up, phase_2l);
+
+    /******************************* direction -2 *********************************/
+    iy=g_idn[ix][2];
+    sm = (_PTSWITCH(spinor) *) Q +iy;
+    um=&_PSWITCH(g_gauge_field)[iy][2];
+    _PTSWITCH(m2add)(&tmpr, sm, um, phase_2l);
+
+    /******************************* direction +3 *********************************/
+    iy=g_iup[ix][3];
+    sp = (_PTSWITCH(spinor) *) Q +iy;
+    up=&_PSWITCH(g_gauge_field)[ix][3];
+    _PTSWITCH(p3add)(&tmpr, sp, up, phase_3l);
+
+    /******************************* direction -3 *********************************/
+    iy=g_idn[ix][3];
+    sm = (_PTSWITCH(spinor) *) Q +iy;
+    um=&_PSWITCH(g_gauge_field)[iy][3];
+    _PTSWITCH(m3addandstore)(rr, sm, um, phase_3l, &tmpr); /* last term also writes the total to rr, i.e. P[ix] */
+  }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+}
diff --git a/operator/Dov_psi.c b/operator/Dov_psi.c
index 68231dbce..f2064a57f 100644
--- a/operator/Dov_psi.c
+++ b/operator/Dov_psi.c
@@ -38,7 +38,7 @@
  *************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
diff --git a/operator/Hopping_Matrix.c b/operator/Hopping_Matrix.c
index d75d2e781..1f95e335b 100644
--- a/operator/Hopping_Matrix.c
+++ b/operator/Hopping_Matrix.c
@@ -47,16 +47,16 @@
  ****************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
-#ifdef OMP
+#ifdef TM_USE_OMP
 #include <omp.h>
 #endif
 #include "global.h"
 #include "su3.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  include "xchange/xchange.h"
 #endif
 #include "boundary.h"
@@ -91,7 +91,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k) {
   }
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
   su3 * restrict u0 ALIGN;
@@ -99,7 +99,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k) {
 
 #  include "operator/halfspinor_body.c"
 
-#  ifdef OMP
+#  ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #  endif
   return;
@@ -138,18 +138,18 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k) {
   }
 #    endif
 
-#    if (defined MPI && !(defined _NO_COMM))
+#    if (defined TM_USE_MPI && !(defined _NO_COMM))
   xchange_field(k, ieo);
 #    endif
 
-#    ifdef OMP
+#    ifdef TM_USE_OMP
 #      pragma omp parallel
   {
 #    endif
 
 #    include "operator/hopping_body_dbl.c"
 
-#    ifdef OMP
+#    ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #    endif
   return;
diff --git a/operator/Hopping_Matrix_32.c b/operator/Hopping_Matrix_32.c
new file mode 100644
index 000000000..e80e5c736
--- /dev/null
+++ b/operator/Hopping_Matrix_32.c
@@ -0,0 +1,145 @@
+/**********************************************************************
+ * Copyright (C) 2013 Florian Burger
+ * derived from Hopping_Matrix.c 
+ * Copyright (C) 2001 Martin Luescher
+ *               2002 Martin Hasenbusch
+ *               2003, 2004, 2005, 2006, 2007, 2008 Carsten Urbach
+ *
+ * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach
+ *
+ * This file is based on an implementation of the Dirac operator 
+ * written by Martin Luescher, modified by Martin Hasenbusch in 2002 
+ * and modified and extended by Carsten Urbach from 2003-2008
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Hopping_Matrix is the conventional Wilson 
+ * hopping matrix
+ *
+ * \kappa\sum_{\pm\mu}(r+\gamma_\mu)U_{x,\mu}
+ *
+ * for ieo = 0 this is M_{eo}, for ieo = 1
+ * it is M_{oe}
+ *
+ * l is the output, k the input field
+ *
+ *  Structure of top level preprocessor directives 
+ *
+ * - defining _USE_HALFSPINOR implies that we also use
+ *   a "gauge copy"
+ *
+ * - such that we are checking for the _USE_GAUGECOPY feature separately in the 
+ *   ELSE branch of the "if defined _USE_HALFSPINOR" statement
+ *
+ ****************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+
+// work-around for missing single precision implementation of inline SSE
+#ifdef SSE
+#define REDEFSSE
+#undef SSE
+#endif
+/* each REDEF* marker records that the macro was set; the section at the end of this file defines it again */
+#ifdef SSE2
+#define REDEFSSE2
+#undef SSE2
+#endif
+
+#ifdef SSE3
+#define REDEFSSE3
+#undef SSE3
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#ifdef TM_USE_OMP
+#include <omp.h>
+#endif
+#include "global.h"
+#include "su3.h"
+#ifdef TM_USE_MPI
+#  include "xchange/xchange.h"
+#endif
+#include "boundary.h"
+#include "init/init_dirac_halfspinor.h"
+#include "update_backward_gauge.h"
+#ifdef SPI
+#  include"DirectPut.h"
+#endif
+#include "operator/Hopping_Matrix_32.h"
+
+#if defined _USE_HALFSPINOR
+#  include "operator/halfspinor_hopping_32.h"
+#endif
+
+
+#if (defined BGQ && defined XLC)
+#    include "bgq.h"
+#    include "bgq2.h"
+#    include "xlc_prefetch.h"
+#endif
+
+void Hopping_Matrix_32_orphaned(const int ieo, spinor32 * const l, spinor32 * const k) { /* kernel without its own "omp parallel"; Hopping_Matrix_32 supplies the region */
+#if defined _USE_HALFSPINOR
+  #ifdef _GAUGE_COPY
+    if(g_update_gauge_copy_32) {
+      update_backward_gauge_32_orphaned(g_gauge_field_32);   
+    }
+  #endif
+  /* NOTE(review): u0 is declared here only for OpenMP builds; presumably the included body declares it otherwise - confirm */
+  #ifdef TM_USE_OMP
+    su3_32 * restrict u0 ALIGN32;
+  #endif
+  /* the kernel itself is textually included from halfspinor_body_32.c */
+  #  include "operator/halfspinor_body_32.c"
+#else
+   printf("Error: Single precision Matrix only implemented with HALFSPINOR\n");
+   exit(200);
+#endif  
+}
+
+
+void Hopping_Matrix_32(const int ieo, spinor32 * const l, spinor32 * const k) { /* entry point: wraps the orphaned kernel in a parallel region when TM_USE_OMP is set */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  Hopping_Matrix_32_orphaned(ieo,l,k); /* with TM_USE_OMP this call sits inside the parallel region opened above */
+#ifdef TM_USE_OMP
+  }
+#endif
+  return;
+}
+
+#ifdef REDEFSSE /* undo the work-around from the top of this file */
+#undef REDEFSSE
+#define SSE
+#endif
+/* NOTE(review): the macros come back with an empty replacement list, whatever they expanded to before */
+#ifdef REDEFSSE2
+#undef REDEFSSE2
+#define SSE2
+#endif
+
+#ifdef REDEFSSE3
+#undef REDEFSSE3
+#define SSE3
+#endif
+
diff --git a/operator/Hopping_Matrix_32.h b/operator/Hopping_Matrix_32.h
new file mode 100644
index 000000000..610ac67a1
--- /dev/null
+++ b/operator/Hopping_Matrix_32.h
@@ -0,0 +1,33 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _HOPPING_MATRIX32_H
+#  define _HOPPING_MATRIX32_H
+/* Presumably values for the ieo argument: Hopping_Matrix_32.c documents ieo = 0 as M_{eo} and ieo = 1 as M_{oe}. */
+#  define EO 0
+#  define OE 1
+#  define OO 1
+#  define EE 0
+
+#  include "su3.h"
+/* l is the output field, k the input field (per the header comment of Hopping_Matrix_32.c). */
+void Hopping_Matrix_32_orphaned(const int ieo, spinor32 * const l, spinor32 * const k);
+void Hopping_Matrix_32(const int ieo, spinor32 * const l, spinor32 * const k);
+
+#endif
diff --git a/operator/Hopping_Matrix_32_nocom.c b/operator/Hopping_Matrix_32_nocom.c
new file mode 100644
index 000000000..463ab2f17
--- /dev/null
+++ b/operator/Hopping_Matrix_32_nocom.c
@@ -0,0 +1,54 @@
+/***********************************************************************
+ * Copyright (C) 2013 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+
+// work-around for missing single precision implementation of inline SSE
+#ifdef SSE
+#define REDEFSSE
+#undef SSE
+#endif
+/* the REDEF* markers are consumed by the restore section at the end of Hopping_Matrix_32.c, which this file includes last */
+#ifdef SSE2
+#define REDEFSSE2
+#undef SSE2
+#endif
+
+#ifdef SSE3
+#define REDEFSSE3
+#undef SSE3
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "global.h"
+#include "xchange/xchange.h"
+#include "su3.h"
+#include "sse.h"
+#include "boundary.h"
+#include "operator/Hopping_Matrix_32.h"
+
+#define Hopping_Matrix_32 Hopping_Matrix_32_nocom
+#define Hopping_Matrix_32_orphaned Hopping_Matrix_32_orphaned_nocom
+#define _NO_COMM 1
+/* compile the regular source again: the renames above turn its two functions into the _nocom variants, with _NO_COMM set */
+#include "Hopping_Matrix_32.c"
diff --git a/operator/Hopping_Matrix_nocom.c b/operator/Hopping_Matrix_nocom.c
index 028a26630..fc9d2f5ab 100644
--- a/operator/Hopping_Matrix_nocom.c
+++ b/operator/Hopping_Matrix_nocom.c
@@ -36,7 +36,7 @@
  ******************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/operator/M_psi.c b/operator/M_psi.c
index 68e325937..b0a62f228 100644
--- a/operator/M_psi.c
+++ b/operator/M_psi.c
@@ -39,7 +39,7 @@
  *****************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
@@ -49,7 +49,7 @@
 #include "su3.h"
 #include "sse.h"
 #include "boundary.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include "xchange/xchange.h"
 #endif
 #include "update_backward_gauge.h"
@@ -57,7 +57,7 @@
 #include "operator/D_psi.h"
 #include "solver/dirac_operator_eigenvectors.h"
 
-
+#ifdef TM_USE_BSM
 #include "init/init_scalar_field.h"
 
 
@@ -69,7 +69,7 @@ void scalarderivatives(_Complex double * drvsc){
   /**  questa instruzione l'ho messa nel MAIN:
   drvsc = malloc(18*VOLUMEPLUSRAND*sizeof(_Complex double)); **/
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix=0;ix<VOLUME;ix++)
@@ -80,7 +80,7 @@ void scalarderivatives(_Complex double * drvsc){
 
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix=0;ix<VOLUME;ix++)
@@ -125,12 +125,12 @@ void scalarderivatives(_Complex double * drvsc){
 
 static inline void nohopp(spinor * restrict const tmpr, spinor const * restrict const s, spinor const * restrict const t, _Complex double const * restrict const xs, int row) {
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
   static _Complex double fact1, fact2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -175,12 +175,12 @@ static inline void nohopp(spinor * restrict const tmpr, spinor const * restrict
 
 static inline void pp0add(spinor * restrict const tmpr , spinor const * restrict const s, spinor const * restrict const t, su3 const * restrict const u, const _Complex double phase, _Complex double const * restrict const xs, int row) {
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
   static _Complex double fact1, fact2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -231,12 +231,12 @@ static inline void pp0add(spinor * restrict const tmpr , spinor const * restrict
 }
 
 static inline void mm0add(spinor * restrict const tmpr, spinor const * restrict const s, spinor const * restrict const t, su3 const * restrict const u, const _Complex double phase, _Complex double const * restrict const xs, int row) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
   static _Complex double fact1, fact2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -287,12 +287,12 @@ static inline void mm0add(spinor * restrict const tmpr, spinor const * restrict
 }
 
 static inline void pp1add(spinor * restrict const tmpr, spinor const * restrict const s, spinor const * restrict const t, su3 const * restrict const u, const _Complex double phase, _Complex double const * restrict const xs, int row) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
   static _Complex double fact1, fact2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -343,12 +343,12 @@ static inline void pp1add(spinor * restrict const tmpr, spinor const * restrict
 }
 
 static inline void mm1add(spinor * restrict const tmpr, spinor const * restrict const s, spinor const * restrict const t, su3 const * restrict const u, const _Complex double phase, _Complex double const * restrict const xs, int row) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
   static _Complex double fact1, fact2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -399,12 +399,12 @@ static inline void mm1add(spinor * restrict const tmpr, spinor const * restrict
 }
 
 static inline void pp2add(spinor * restrict const tmpr, spinor const * restrict const s, spinor const * restrict const t, su3 const * restrict const u, const _Complex double phase, _Complex double const * restrict const xs, int row) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
   static _Complex double fact1, fact2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -455,12 +455,12 @@ static inline void pp2add(spinor * restrict const tmpr, spinor const * restrict
 }
 
 static inline void mm2add(spinor * restrict const tmpr, spinor const * restrict const s, spinor const * restrict const t, su3 const * restrict const u, const _Complex double phase, _Complex double const * restrict const xs, int row) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
   static _Complex double fact1, fact2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -511,12 +511,12 @@ static inline void mm2add(spinor * restrict const tmpr, spinor const * restrict
 }
 
 static inline void pp3add(spinor * restrict const tmpr, spinor const * restrict const s, spinor const * restrict const t, su3 const * restrict const u, const _Complex double phase, _Complex double const * restrict const xs, int row) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
   static _Complex double fact1, fact2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -567,12 +567,12 @@ static inline void pp3add(spinor * restrict const tmpr, spinor const * restrict
 }
 
 static inline void mm3addandstore(spinor * restrict const r, spinor const * restrict const s, spinor const * restrict const t, su3 const * restrict const u, const _Complex double phase, spinor const * restrict const tmpr, _Complex double const * restrict const xs, int row) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #define static
 #endif
   static su3_vector chi, psi;
   static _Complex double fact1, fact2;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
@@ -648,7 +648,7 @@ static inline void mm3addandstore(spinor * restrict const r, spinor const * rest
   /**  xchange_lexicfield(drvsc);  THIS CAN BE DANGEROUS ?!?   **/
 # endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -673,7 +673,7 @@ static inline void mm3addandstore(spinor * restrict const r, spinor const * rest
 
   /************************ loop over all lattice sites *******************/
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix=0;ix<VOLUME;ix++)
@@ -780,7 +780,7 @@ static inline void mm3addandstore(spinor * restrict const r, spinor const * rest
   }
 
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
@@ -788,3 +788,4 @@ static inline void mm3addandstore(spinor * restrict const r, spinor const * rest
 
 
 
+#endif
diff --git a/operator/Makefile.in b/operator/Makefile.in
index d306684c4..c39a5d1ed 100644
--- a/operator/Makefile.in
+++ b/operator/Makefile.in
@@ -31,10 +31,11 @@ COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS}
 
 LIBRARIES = liboperator
 liboperator_TARGETS = clover_accumulate_deriv clover_deriv clovertm_operators clover_leaf \
-  tm_operators_nd clover_term clover_invert clover_det
+  tm_operators_nd tm_operators_nd_32 clover_term clover_invert clover_det \
+	clovertm_operators_32
 
-liboperator_STARGETS = Hopping_Matrix_nocom tm_times_Hopping_Matrix Hopping_Matrix \
-	tm_operators tm_sub_Hopping_Matrix D_psi D_psi_BSM D_psi_BSM2m D_psi_BSM2b D_psi_BSM2f M_psi Dov_psi Dov_proj
+liboperator_STARGETS = Hopping_Matrix_nocom tm_times_Hopping_Matrix Hopping_Matrix Hopping_Matrix_32 Hopping_Matrix_32_nocom \
+	tm_operators tm_operators_32 tm_sub_Hopping_Matrix D_psi D_psi_BSM D_psi_BSM2m D_psi_BSM2b D_psi_BSM2f D_psi_BSM3 D_psi_BSM3_test M_psi Dov_psi Dov_proj
 
 liboperator_OBJECTS = $(addsuffix .o, ${liboperator_TARGETS})
 liboperator_SOBJECTS = $(addsuffix .o, ${liboperator_STARGETS})
@@ -57,10 +58,10 @@ profile all-profile: all
 include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
-${liboperator_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${liboperator_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${OPTARGS} -c $<
 
-${liboperator_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${liboperator_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${SOPTARGS} -c $<
 
 # rule to make liboperator
diff --git a/operator/assign_mul_one_sw_pm_imu_inv_block_body.c b/operator/assign_mul_one_sw_pm_imu_inv_block_body.c
new file mode 100644
index 000000000..b212efe72
--- /dev/null
+++ b/operator/assign_mul_one_sw_pm_imu_inv_block_body.c
@@ -0,0 +1,254 @@
+void _PSWITCH(assign_mul_one_sw_pm_imu)(const int ieo, /* applies (1 + T + i*mu*g5), clover blocks read from sw, to l and stores the result in k */
+					_PTSWITCH(spinor) * const k, _PTSWITCH(spinor) * const l, /* k: output field, l: input field */
+					const _F_TYPE mu) { /* ieo == 0 selects index offset 0, anything else (VOLUME+RAND)/2 */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+    _PTSWITCH(su3_vector) ALIGN chi, psi1, psi2; // scratch vectors; declared inside the parallel region when TM_USE_OMP is set
+    int ix;   // lexicographic site index looked up in g_eo2lexic
+    int ioff; // first index of the range selected by ieo
+    const _PSWITCH(su3) *w1, *w2, *w3; // cursors into sw[ix][0..2][.]
+    _PTSWITCH(spinor) *r;       // output spinor of the current site
+    const _PTSWITCH(spinor) *s; // input spinor of the current site
+  
+    if(ieo == 0) {
+      ioff = 0;
+    } 
+    else {
+      ioff = (VOLUME+RAND)/2;
+    }
+    /**************** loop over the VOLUME/2 sites of the range selected by ieo ****************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+    for(unsigned icx = ioff; icx < (VOLUME/2+ioff); icx++) { // NOTE(review): icx is unsigned while ioff is int
+      ix = g_eo2lexic[icx];
+    
+      r = k + icx-ioff; // k and l are indexed from 0 within the selected range
+      s = l + icx-ioff;
+
+      // upper two spin components first
+      w1=&_PSWITCH(sw)[ix][0][0];
+      w2=w1+2; /*&sw[ix][1][0];*/
+      w3=w1+4; /*&sw[ix][2][0];*/
+      _su3_multiply(psi1,*w1,(*s).s0); 
+      _su3_multiply(chi,*w2,(*s).s1);
+      _vector_add_assign(psi1,chi);
+      _su3_inverse_multiply(psi2,*w2,(*s).s0); // w2 is reused through the inverse-multiply macro (presumably its adjoint; confirm in su3.h)
+      _su3_multiply(chi,*w3,(*s).s1);
+      _vector_add_assign(psi2,chi); 
+
+      // add in the twisted mass term (plus in the upper components)
+      _vector_add_i_mul(psi1, mu, (*s).s0);
+      _vector_add_i_mul(psi2, mu, (*s).s1);
+
+      _vector_assign((*r).s0, psi1);
+      _vector_assign((*r).s1, psi2);
+
+      // now lower two spin components
+      w1++; /*=&sw[ix][0][1];*/
+      w2++; /*=&sw[ix][1][1];*/
+      w3++; /*=&sw[ix][2][1];*/
+      _su3_multiply(psi1,*w1,(*s).s2); 
+      _su3_multiply(chi,*w2,(*s).s3);
+      _vector_add_assign(psi1,chi); 
+      _su3_inverse_multiply(psi2,*w2,(*s).s2); 
+      _su3_multiply(chi,*w3,(*s).s3);
+      _vector_add_assign(psi2,chi); 
+
+      // add in the twisted mass term (minus from g5 in the lower components)
+      _vector_add_i_mul(psi1, -mu, (*s).s2);
+      _vector_add_i_mul(psi2, -mu, (*s).s3);
+
+      _vector_assign((*r).s2, psi1);
+      _vector_assign((*r).s3, psi2);
+    }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+  return;
+}
+
+/**************************************************************
+ * assign_mul_one_sw_pm_imu_block applies (1 + T + imug5) to
+ * a block spinor l and stores the result in the block spinor k
+ *
+ * it is assumed that the clover leaf is computed and stored
+ * in sw[VOLUME][3][2]
+ * the corresponding routine can be found in clover_leaf.c
+ *
+ **************************************************************/
+
+
+void _PSWITCH(assign_mul_one_sw_pm_imu_block)(const int ieo, 
+					      _PTSWITCH(spinor) * const k, _PTSWITCH(spinor) * const l,
+					      const _F_TYPE mu, block *blk) {
+  /* Applies (1 + T + i*mu*g5) site by site, via assign_mul_one_sw_pm_imu_site_lexic, to
+   * the spinors in l and stores the results in k. Only sites of block blk whose coordinate
+   * sum has parity ieo are visited; k and l advance by one spinor per visited site.
+   *
+   *   ieo  parity (0 or 1) of the sites to process
+   *   k    output spinors, one per visited site
+   *   l    input spinors, one per visited site
+   *   mu   twisted-mass coefficient handed to the per-site routine
+   *   blk  block descriptor providing the block extents and the block coordinate
+   */
+  _PTSWITCH(spinor) *r = k;      // next output spinor
+  _PTSWITCH(spinor) *s = l;      // next input spinor
+  _PTSWITCH(spinor)  ALIGN tmpr; // per-site result before it is handed to assign()
+
+  // block size
+  const int dT = blk->BT;
+  const int dX = blk->BLX;
+  const int dY = blk->BLY;
+  const int dZ = blk->BLZ;
+
+  // constant shifts: block coordinate on the local mpi process times the block size
+  const int sT = blk->mpilocal_coordinate[0]*dT;
+  const int sX = blk->mpilocal_coordinate[1]*dX;
+  const int sY = blk->mpilocal_coordinate[2]*dY;
+  const int sZ = blk->mpilocal_coordinate[3]*dZ;
+
+  for(int it = 0; it < dT; it++)
+    for(int ix = 0; ix < dX; ix++)
+      for(int iy = 0; iy < dY; iy++)
+        for(int iz = 0; iz < dZ; iz++)
+          {
+            // coordinates of the site w.r.t. the current process
+            const int t = it + sT;
+            const int x = ix + sX;
+            const int y = iy + sY;
+            const int z = iz + sZ;
+
+            // lexicographic index of the block site w.r.t. the local mpi process
+            const int lx = g_ipt[t][x][y][z];
+            const int jeo = (t+x+y+z)%2;
+
+            if( ieo == jeo) {
+              // pass the mu argument, not the global g_mu, so that the caller
+              // controls the twisted mass just as in assign_mul_one_sw_pm_imu
+              _PSWITCH(assign_mul_one_sw_pm_imu_site_lexic)(lx, &tmpr, s, mu);
+              _PSWITCH(assign)(r, &tmpr, 1);
+              r++;
+              s++;
+            }
+          }
+
+  return;
+}
+
+
+
+void _PSWITCH(assign_mul_one_sw_pm_imu_inv)(const int ieo, /* multiplies l by the blocks stored in sw_inv and stores the result in k */
+					    _PTSWITCH(spinor) * const k, _PTSWITCH(spinor) * const l, /* k: output field, l: input field */
+					    const _F_TYPE mu) { /* NOTE(review): ieo and mu are not referenced anywhere in this body */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+    _PTSWITCH(su3_vector) ALIGN psi, chi, phi1, phi3; // scratch vectors; declared inside the parallel region when TM_USE_OMP is set
+    const _PSWITCH(su3) *w1, *w2, *w3, *w4; // cursors into sw_inv[icx][0..3][.]
+    const _PTSWITCH(spinor) *rn; // input spinor of the current site
+    _PTSWITCH(spinor) *s;        // output spinor of the current site
+
+    /******************** loop over VOLUME/2 sites, indexed directly by icx ********************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+    for(int icx = 0; icx < (VOLUME/2); icx++) {
+
+      rn = l + icx;
+      s = k + icx;
+      _vector_assign(phi1,(*rn).s0); // copies of s0/s2: they are read again after (*s).s0/(*s).s2 are written,
+      _vector_assign(phi3,(*rn).s2); // which matters if k and l refer to the same field
+
+      w1=&_PSWITCH(sw_inv)[icx][0][0];
+      w2=w1+2;  /* &sw_inv[icx][1][0]; */
+      w3=w1+4;  /* &sw_inv[icx][2][0]; */
+      w4=w1+6;  /* &sw_inv[icx][3][0]; */
+      _su3_multiply(psi,*w1,phi1); 
+      _su3_multiply(chi,*w2,(*rn).s1);
+      _vector_add((*s).s0,psi,chi);
+      _su3_multiply(psi,*w4,phi1); 
+      _su3_multiply(chi,*w3,(*rn).s1);
+      _vector_add((*s).s1,psi,chi);
+
+      w1++; /* &sw_inv[icx][0][1]; */
+      w2++; /* &sw_inv[icx][1][1]; */
+      w3++; /* &sw_inv[icx][2][1]; */
+      w4++; /* &sw_inv[icx][3][1]; */
+      _su3_multiply(psi,*w1,phi3); 
+      _su3_multiply(chi,*w2,(*rn).s3);
+      _vector_add((*s).s2,psi,chi);
+      _su3_multiply(psi,*w4,phi3); 
+      _su3_multiply(chi,*w3,(*rn).s3);
+      _vector_add((*s).s3,psi,chi);
+
+      /******************************** end of loop *********************************/
+    }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+  return;
+}
+
+
+
+void _PSWITCH(assign_mul_one_sw_pm_imu_inv_block)(const int ieo, /* block variant: multiplies l by the sw_inv blocks of the visited sites, result in k */
+						  _PTSWITCH(spinor) * const k, _PTSWITCH(spinor) * const l, /* k: output, l: input, one spinor per visited site */
+						  const _F_TYPE mu, block *blk) { /* NOTE(review): mu is not referenced anywhere in this body */
+  int i,it,ix,iy,iz, lxeo; // it..iz: block-local coordinate plus block offset; i: g_ipt index; lxeo: g_lexic2eo[i]
+  _PTSWITCH(su3_vector) ALIGN psi, chi, phi1, phi3;
+  const _PSWITCH(su3) *w1, *w2, *w3, *w4; // cursors into sw_inv[lxeo][0..3][.]
+  const _PTSWITCH(spinor) *rn; // next input spinor
+  _PTSWITCH(spinor) *s;        // next output spinor
+
+  rn = l; 
+  s  = k;
+
+  for(int t = 0; t < blk->BT; t++) {
+    it = t + blk->mpilocal_coordinate[0]*blk->BT;
+    for(int x = 0; x < blk->BLX; x++) {
+      ix = x +  blk->mpilocal_coordinate[1]*blk->BLX;
+      for(int y = 0; y < blk->BLY; y++) {
+        iy = y +  blk->mpilocal_coordinate[2]*blk->BLY;
+        for(int z = 0; z < blk->BLZ; z++) {
+          iz = z +  blk->mpilocal_coordinate[3]*blk->BLZ;
+          i = g_ipt[it][ix][iy][iz];
+          lxeo = g_lexic2eo[i]; 
+          if((t+x+y+z)%2 == ieo) { // NOTE(review): parity of the block-local coordinates; equals that of it+ix+iy+iz only if the block offsets sum to an even number
+            _vector_assign(phi1, (*rn).s0); // copies of s0/s2: they are read again after (*s).s0/(*s).s2 are written,
+            _vector_assign(phi3, (*rn).s2); // which matters if k and l refer to the same field
+            
+            w1 = &_PSWITCH(sw_inv)[lxeo][0][0];
+            w2 = w1 + 2;  /* &sw_inv[lxeo][1][0]; */
+            w3 = w1 + 4;  /* &sw_inv[lxeo][2][0]; */
+            w4 = w1 + 6;  /* &sw_inv[lxeo][3][0]; */
+            
+            _su3_multiply(psi, *w1, phi1); 
+            _su3_multiply(chi, *w2, (*rn).s1);
+            _vector_add((*s).s0, psi, chi);
+            _su3_multiply(psi, *w4, phi1); 
+            _su3_multiply(chi, *w3, (*rn).s1);
+            _vector_add((*s).s1, psi, chi);
+            
+            w1++; /* &sw_inv[lxeo][0][1]; */
+            w2++; /* &sw_inv[lxeo][1][1]; */
+            w3++; /* &sw_inv[lxeo][2][1]; */
+            w4++; /* &sw_inv[lxeo][3][1]; */
+            _su3_multiply(psi, *w1, phi3); 
+            _su3_multiply(chi, *w2, (*rn).s3);
+            _vector_add((*s).s2, psi, chi);
+            _su3_multiply(psi, *w4, phi3); 
+            _su3_multiply(chi, *w3, (*rn).s3);
+            _vector_add((*s).s3, psi, chi);
+            rn++; // input and output advance only on visited sites
+            s++;
+          }
+        }
+      }
+    }
+  }
+  return;
+}
diff --git a/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c b/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
new file mode 100644
index 000000000..918a01ec2
--- /dev/null
+++ b/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
@@ -0,0 +1,44 @@
+void _PSWITCH(assign_mul_one_sw_pm_imu_site_lexic)(const int ix, /* single-site version: ix indexes sw directly */
+						    _PTSWITCH(spinor) * const k, const _PTSWITCH(spinor) * const l, /* k: output spinor, l: input spinor */
+						    const _F_TYPE mu) { /* mu: added as +i*mu on s0/s1 and as -i*mu on s2/s3 */
+
+  _PTSWITCH(su3_vector) ALIGN chi, psi1, psi2; // scratch vectors
+  const _PSWITCH(su3) *w1, *w2, *w3; // cursors into sw[ix][0..2][.]
+
+  // upper two spin components first
+  w1 = &_PSWITCH(sw)[ix][0][0];
+  w2 = w1 + 2; /*&sw[ix][1][0];*/
+  w3 = w1 + 4; /*&sw[ix][2][0];*/
+  _su3_multiply(psi1, *w1, (*l).s0);
+  _su3_multiply(chi, *w2, (*l).s1);
+  _vector_add_assign(psi1, chi);
+  _su3_inverse_multiply(psi2, *w2, (*l).s0); // w2 is reused through the inverse-multiply macro (presumably its adjoint; confirm in su3.h)
+  _su3_multiply(chi, *w3, (*l).s1);
+  _vector_add_assign(psi2, chi);
+
+  // add in the twisted mass term (plus in the upper components)
+  _vector_add_i_mul(psi1, mu, (*l).s0);
+  _vector_add_i_mul(psi2, mu, (*l).s1);
+
+  _vector_assign((*k).s0, psi1);
+  _vector_assign((*k).s1, psi2);
+
+  // now lower two spin components
+  w1++; /*=&sw[ix][0][1];*/
+  w2++; /*=&sw[ix][1][1];*/
+  w3++; /*=&sw[ix][2][1];*/
+  _su3_multiply(psi1, *w1, (*l).s2);
+  _su3_multiply(chi, *w2, (*l).s3);
+  _vector_add_assign(psi1, chi);
+  _su3_inverse_multiply(psi2, *w2, (*l).s2);
+  _su3_multiply(chi, *w3, (*l).s3);
+  _vector_add_assign(psi2, chi);
+
+  // add in the twisted mass term (minus from g5 in the lower components)
+  _vector_add_i_mul(psi1, -mu, (*l).s2);
+  _vector_add_i_mul(psi2, -mu, (*l).s3);
+
+  _vector_assign((*k).s2, psi1);
+  _vector_assign((*k).s3, psi2);
+  return;
+}
diff --git a/operator/clover_accumulate_deriv.c b/operator/clover_accumulate_deriv.c
index 05eba8f75..f96e9f43d 100644
--- a/operator/clover_accumulate_deriv.c
+++ b/operator/clover_accumulate_deriv.c
@@ -21,7 +21,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #ifdef SSE
 # undef SSE
@@ -39,10 +39,10 @@
 #include <math.h>
 #include <errno.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -57,7 +57,7 @@
 
 void sw_all(hamiltonian_field_t * const hf, const double kappa, 
 	    const double c_sw) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -69,7 +69,7 @@ void sw_all(hamiltonian_field_t * const hf, const double kappa,
   su3 ALIGN v1,v2,vv1,vv2,plaq;
   su3 ALIGN vis[4][4];
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(x = 0; x < VOLUME; x++) {
@@ -200,7 +200,7 @@ void sw_all(hamiltonian_field_t * const hf, const double kappa,
       }
     }
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
diff --git a/operator/clover_deriv.c b/operator/clover_deriv.c
index 47f9f77de..d814db193 100644
--- a/operator/clover_deriv.c
+++ b/operator/clover_deriv.c
@@ -21,7 +21,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #ifdef SSE
 # undef SSE
@@ -39,10 +39,10 @@
 #include <math.h>
 #include <errno.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -70,7 +70,7 @@
 // this function depends on mu
 
 void sw_deriv(const int ieo, const double mu) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -89,15 +89,15 @@ void sw_deriv(const int ieo, const double mu) {
   }
   if(fabs(mu) > 0.) fac = 0.5;
 
-#ifndef OMP
+#ifndef TM_USE_OMP
   icy = 0;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
-#ifdef OMP
+#ifdef TM_USE_OMP
     icy = icx - ioff;
 #endif
     x = g_eo2lexic[icx];
@@ -143,18 +143,18 @@ void sw_deriv(const int ieo, const double mu) {
       _su3_refac_acc(swp[x][2], fac, lswp[2]);
       _su3_refac_acc(swp[x][3], fac, lswp[3]);
     }
-#ifndef OMP
+#ifndef TM_USE_OMP
     ++icy;
 #endif
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
 }
 
 void sw_deriv_nd(const int ieo) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -173,15 +173,15 @@ void sw_deriv_nd(const int ieo) {
     ioff = (VOLUME+RAND)/2;
   }
 
-#ifndef OMP
+#ifndef TM_USE_OMP
   icy = 0;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
-#ifdef OMP
+#ifdef TM_USE_OMP
     icy = icx - ioff;
 #endif
     x = g_eo2lexic[icx];
@@ -232,11 +232,11 @@ void sw_deriv_nd(const int ieo) {
     _su3_refac_acc(swp[x][1], fac, lswp[1]);
     _su3_refac_acc(swp[x][2], fac, lswp[2]);
     _su3_refac_acc(swp[x][3], fac, lswp[3]);
-#ifndef OMP
+#ifndef TM_USE_OMP
     ++icy;
 #endif
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
@@ -249,9 +249,9 @@ void sw_deriv_nd(const int ieo) {
 // result is again stored in swm and swp                 
 // includes a gamma5 multiplication for kk
 
-void sw_spinor(const int ieo, const spinor * const kk, const spinor * const ll, 
-	       const double fac) {
-#ifdef OMP
+void sw_spinor_eo(const int ieo, const spinor * const kk, const spinor * const ll, 
+		  const double fac) {
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -272,7 +272,7 @@ void sw_spinor(const int ieo, const spinor * const kk, const spinor * const ll,
   }
   /************************ loop over half of the lattice sites ***********/
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif  
   for(icx = ioff; icx < (VOLUME/2+ioff); icx++) {
@@ -301,7 +301,64 @@ void sw_spinor(const int ieo, const spinor * const kk, const spinor * const ll,
     _su3_minus_su3(lswm[2],u2,v2);
     _su3_minus_su3(lswm[3],u3,v3);
     
-    /* add up to swm[0] and swp[0] */
+    /* add up to swm[x] and swp[x] */
+    _su3_refac_acc(swm[x][0], fac, lswm[0]);
+    _su3_refac_acc(swm[x][1], fac, lswm[1]);
+    _su3_refac_acc(swm[x][2], fac, lswm[2]);
+    _su3_refac_acc(swm[x][3], fac, lswm[3]);
+    _su3_refac_acc(swp[x][0], fac, lswp[0]);
+    _su3_refac_acc(swp[x][1], fac, lswp[1]);
+    _su3_refac_acc(swp[x][2], fac, lswp[2]);
+    _su3_refac_acc(swp[x][3], fac, lswp[3]);
+  }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+  return;
+}
+
+void sw_spinor(const spinor * const kk, const spinor * const ll, 
+	       const double fac) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+
+  int x;
+  const spinor *r,*s;
+  su3 ALIGN v0,v1,v2,v3;
+  su3 ALIGN u0,u1,u2,u3;
+  su3 ALIGN lswp[4],lswm[4];
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif  
+  for(x = 0; x < VOLUME; x++) {
+    r = kk + x;
+    s = ll + x;
+    
+    _vector_tensor_vector(v0,(*r).s0,(*s).s0);
+    _vector_tensor_vector(v1,(*r).s0,(*s).s1);
+    _vector_tensor_vector(v2,(*r).s1,(*s).s1);
+    _vector_tensor_vector(v3,(*r).s1,(*s).s0);
+    // mvector takes g5 into account
+    _mvector_tensor_vector(u0,(*r).s2,(*s).s2);
+    _mvector_tensor_vector(u1,(*r).s2,(*s).s3);
+    _mvector_tensor_vector(u2,(*r).s3,(*s).s3);
+    _mvector_tensor_vector(u3,(*r).s3,(*s).s2);
+    
+    /* compute the insertion matrix */
+    _su3_plus_su3(lswp[0],u0,v0);
+    _su3_plus_su3(lswp[1],u1,v1);
+    _su3_plus_su3(lswp[2],u2,v2);
+    _su3_plus_su3(lswp[3],u3,v3);
+
+    _su3_minus_su3(lswm[0],u0,v0);
+    _su3_minus_su3(lswm[1],u1,v1);
+    _su3_minus_su3(lswm[2],u2,v2);
+    _su3_minus_su3(lswm[3],u3,v3);
+    
+    /* add up to swm[x] and swp[x] */
     _su3_refac_acc(swm[x][0], fac, lswm[0]);
     _su3_refac_acc(swm[x][1], fac, lswm[1]);
     _su3_refac_acc(swm[x][2], fac, lswm[2]);
@@ -311,7 +368,7 @@ void sw_spinor(const int ieo, const spinor * const kk, const spinor * const ll,
     _su3_refac_acc(swp[x][2], fac, lswp[2]);
     _su3_refac_acc(swp[x][3], fac, lswp[3]);
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
diff --git a/operator/clover_det.c b/operator/clover_det.c
index f66915f7f..8951b7689 100644
--- a/operator/clover_det.c
+++ b/operator/clover_det.c
@@ -21,7 +21,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #ifdef SSE
 # undef SSE
@@ -39,10 +39,10 @@
 #include <math.h>
 #include <errno.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -114,11 +114,11 @@ void six_det(_Complex double* const rval, _Complex double a[6][6])
 
 double sw_trace(const int ieo, const double mu) {
   double ALIGN res = 0.0;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
@@ -141,7 +141,7 @@ double sw_trace(const int ieo, const double mu) {
     ioff=(VOLUME+RAND)/2;
   }
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
@@ -169,7 +169,7 @@ double sw_trace(const int ieo, const double mu) {
   }
   kc=ks+kc;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = kc;
   } /* OpenMP parallel closing brace */
 
@@ -180,7 +180,7 @@ double sw_trace(const int ieo, const double mu) {
   res=kc;
 #endif
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   return(mres);
 #else
@@ -201,11 +201,11 @@ double sw_trace(const int ieo, const double mu) {
 
 double sw_trace_nd(const int ieo, const double mu, const double eps) {
   double ALIGN res = 0.0;
-#ifdef MPI
+#ifdef TM_USE_MPI
   double ALIGN mres;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
@@ -228,7 +228,7 @@ double sw_trace_nd(const int ieo, const double mu, const double eps) {
     ioff=(VOLUME+RAND)/2;
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
@@ -259,7 +259,7 @@ double sw_trace_nd(const int ieo, const double mu, const double eps) {
   }
   kc=ks+kc;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
   g_omp_acc_re[thread_num] = kc;
   } /* OpenMP parallel closing brace */
 
@@ -270,7 +270,7 @@ double sw_trace_nd(const int ieo, const double mu, const double eps) {
   res=kc;
 #endif
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   return(mres);
 #else
diff --git a/operator/clover_invert.c b/operator/clover_invert.c
index b9a241bb6..8f9827b23 100644
--- a/operator/clover_invert.c
+++ b/operator/clover_invert.c
@@ -3,6 +3,7 @@
  * Copyright (C) 1995 Ulli Wolff, Stefan Sint
  *               2001,2005 Martin Hasenbusch
  *               2011,2012 Carsten Urbach
+ *               2017      Bartosz Kostrzewa
  *
  * This file is part of tmLQCD.
  *
@@ -21,7 +22,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #ifdef SSE
 # undef SSE
@@ -39,10 +40,10 @@
 #include <math.h>
 #include <errno.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -76,6 +77,8 @@
   !______________________________________________________________!
 */
 
+// for debugging purposes, the print statements can be enabled
+// #define CLOVER_INVERT_DEBUG
 
 /* six_invert and six_det are called from multiple threads, they are thus
  * made thread-safe by removing the static keywords but they are NOT
@@ -165,7 +168,7 @@ void six_invert(int* ifail ,_Complex double a[6][6])
 // - is stored in sw_inv[VOLUME/2-(VOLUME-1)]
 
 void sw_invert(const int ieo, const double mu) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -182,15 +185,15 @@ void sw_invert(const int ieo, const double mu) {
     ioff=(VOLUME+RAND)/2;
   }
 
-#ifndef OMP
+#ifndef TM_USE_OMP
   icy=0;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
-#ifdef OMP
+#ifdef TM_USE_OMP
     icy = icx - ioff;
 #endif
     x = g_eo2lexic[icx];
@@ -246,11 +249,176 @@ void sw_invert(const int ieo, const double mu) {
 	get_3x3_block_matrix(&sw_inv[icy+VOLUME/2][3][i], a, 3, 0);
       }
     }
-#ifndef OMP
+#ifndef TM_USE_OMP
     ++icy;
 #endif
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+  return;
+}
+
+/* This function computes 
+ * 
+ *   \bar\epsilon / ( (1 + Tee)^2 + \bar\mu^2 - \bar\epsilon^2 )
+ * 
+ * for use in the QPhiX packing routine for the non-degenerate
+ * clover doublet
+ *
+ * sw_inv should contain 
+ *   1 / ( (1 + Tee)^2 + \bar\mu^2 - \bar\epsilon^2 ) 
+ * the last VOLUME/2 elements (which should not be relevant at this stage) 
+ * of sw_inv will be overwritten
+ */ 
+
+void sw_invert_epsbar(const double epsbar) { // writes epsbar * sw_inv[icx] into sw_inv[icx + VOLUME/2] for icx in [0, VOLUME/2)
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  int icy, i; // icy: destination index in the upper half of sw_inv; i: index of the last sw_inv dimension (0, 1)
+  _Complex double ALIGN a[6][6]; // 6x6 scratch matrix assembled from four 3x3 blocks
+
+#ifndef TM_USE_OMP
+  icy=VOLUME/2; // serial build: running counter, incremented once per site below
+#endif
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int icx = 0; icx < VOLUME/2; icx++) {
+#ifdef TM_USE_OMP
+    icy = icx + VOLUME/2; // OpenMP build: same value as the serial counter, derived from icx
+#endif
+    for(i = 0; i < 2; i++) {
+      // extract 1/((1+Tee)^2 + \bar\mu^2 - \bar\epsilon^2)
+      populate_6x6_matrix(a, &sw_inv[icx][0][i], 0, 0);
+      populate_6x6_matrix(a, &sw_inv[icx][1][i], 0, 3);
+      populate_6x6_matrix(a, &sw_inv[icx][2][i], 3, 3);
+      populate_6x6_matrix(a, &sw_inv[icx][3][i], 3, 0);
+
+      // scale by epsbar
+      scale_real_6x6(a, epsbar);
+
+#ifdef CLOVER_INVERT_DEBUG
+      if(icx==0) print_6x6(a, "sw_invert_epsbar epsilon*sw_inv");
+#endif
+
+      /*  and write the result into the last VOLUME/2 elements of sw_inv */
+      get_3x3_block_matrix(&sw_inv[icy][0][i], a, 0, 0);
+      get_3x3_block_matrix(&sw_inv[icy][1][i], a, 0, 3);
+      get_3x3_block_matrix(&sw_inv[icy][2][i], a, 3, 3);
+      get_3x3_block_matrix(&sw_inv[icy][3][i], a, 3, 0);
+    }
+#ifndef TM_USE_OMP
+    icy++;
+#endif
+  }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+  return;
+}
+
+/* This function computes 
+ * 
+ *   (1 + Tee - I*\bar\mu\gamma_5\tau3 ) / ( (1 + Tee)^2 + \bar\mu^2 - \bar\eps^2 )
+ * 
+ * for use in the QPhiX packing routine for the non-degenerate
+ * clover doublet
+ *
+ * sw should be populated with (1+Tee) and 
+ * sw_inv should contain 1 / ( (1 + Tee)^2 + \bar\mu^2 - \bar\eps^2 )
+ *
+ * !! all elements of sw_inv will be overwritten !!
+ */ 
+
+void sw_invert_mubar(const double mubar) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  int err=0;
+  int icy, i, x;
+  su3 ALIGN v;
+  _Complex double ALIGN a[6][6];
+  _Complex double ALIGN b[6][6];
+  _Complex double ALIGN c[6][6];
+  _Complex double ALIGN d[6][6];
+
+#ifndef TM_USE_OMP
+  icy=VOLUME/2;
+#endif
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int icx = 0; icx < VOLUME/2; icx++) {
+#ifdef TM_USE_OMP
+    icy = icx + VOLUME/2;
+#endif
+    x = g_eo2lexic[icx];
+    
+    for(i = 0; i < 2; i++) {
+      // extract (1+Tee)
+      populate_6x6_matrix(a, &sw[x][0][i], 0, 0);
+      populate_6x6_matrix(a, &sw[x][1][i], 0, 3);
+      _su3_dagger(v, sw[x][1][i]); 
+      populate_6x6_matrix(a, &v, 3, 0);
+      populate_6x6_matrix(a, &sw[x][2][i], 3, 3);
+
+      copy_6x6(b,a);
+
+      // we add the twisted quark masses for both the 'up' and the 'down' flavour
+      // (note that this is the inverse, so -mu is associated with 'up')
+      // the i index denotes the halfspinor block and thus implements gamma5
+      add_tm(a, -(i==0?1.0:-1.0)*mubar);
+      add_tm(b, +(i==0?1.0:-1.0)*mubar);
+
+#ifdef CLOVER_INVERT_DEBUG
+      if(icx==0) {
+        print_6x6(a,"sw_invert_mubar sw_up");
+        print_6x6(b,"sw_invert_mubar sw_dn");
+      }
+#endif
+  
+      // extract 1/((1+Tee)^2 + \bar\mu^2 - \bar\eps^2)
+      populate_6x6_matrix(c, &sw_inv[icx][0][i], 0, 0);
+      populate_6x6_matrix(c, &sw_inv[icx][1][i], 0, 3);
+      populate_6x6_matrix(c, &sw_inv[icx][2][i], 3, 3);
+      populate_6x6_matrix(c, &sw_inv[icx][3][i], 3, 0);
+  
+      // multiply the two together and store in d
+      mult_6x6(d, a, c);
+  
+      /*  and write the result into sw_inv */
+      get_3x3_block_matrix(&sw_inv[icx][0][i], d, 0, 0);
+      get_3x3_block_matrix(&sw_inv[icx][1][i], d, 0, 3);
+      get_3x3_block_matrix(&sw_inv[icx][2][i], d, 3, 3);
+      get_3x3_block_matrix(&sw_inv[icx][3][i], d, 3, 0);
+
+#ifdef CLOVER_INVERT_DEBUG
+      if(icx==0) print_6x6(d,"sw_invert_mubar sw_inv_up");
+#endif
+
+      // and the same for the 'down'
+      mult_6x6(d, b, c);
+  
+      get_3x3_block_matrix(&sw_inv[icy][0][i], d, 0, 0);
+      get_3x3_block_matrix(&sw_inv[icy][1][i], d, 0, 3);
+      get_3x3_block_matrix(&sw_inv[icy][2][i], d, 3, 3);
+      get_3x3_block_matrix(&sw_inv[icy][3][i], d, 3, 0);
+
+#ifdef CLOVER_INVERT_DEBUG
+      if(icx==0) print_6x6(d,"sw_invert_mubar sw_inv_dn");
+#endif
+    }
+#ifndef TM_USE_OMP
+    icy++;
+#endif
+  }
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
@@ -258,7 +426,7 @@ void sw_invert(const int ieo, const double mu) {
 
 // This function computes
 //
-// 1/((1+T)^2 + barmu^2 - bareps^2)^{-1}
+// 1/((1+T)^2 + \bar\mu^2 - \bar\epsilon^2)
 //
 // for all even x,
 // which is stored in sw_inv[0-(VOLUME/2-1)]
@@ -270,7 +438,7 @@ void sw_invert(const int ieo, const double mu) {
 // must be done elsewhere because of flavour structure
 
 void sw_invert_nd(const double mshift) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -279,7 +447,7 @@ void sw_invert_nd(const double mshift) {
   su3 ALIGN v;
   _Complex double ALIGN a[6][6], b[6][6];
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(int icx = 0; icx < (VOLUME/2); icx++) {
@@ -292,6 +460,10 @@ void sw_invert_nd(const double mshift) {
       populate_6x6_matrix(a, &v, 3, 0);
       populate_6x6_matrix(a, &sw[x][2][i], 3, 3);
 
+#ifdef CLOVER_INVERT_DEBUG
+      if(icx==0) print_6x6(a, "sw_invert_nd sw");
+#endif
+
       // compute (1+T)^2 and store in b
       mult_6x6(b, a, a);
       // we add the mass shift term, which is a real number
@@ -305,6 +477,10 @@ void sw_invert_nd(const double mshift) {
 	err = 0;
       }
 
+#ifdef CLOVER_INVERT_DEBUG
+      if(icx==0) print_6x6(b, "sw_invert_nd sw_inv");
+#endif
+
       /*  copy "a" back to sw_inv */
       get_3x3_block_matrix(&sw_inv[icx][0][i], b, 0, 0);
       get_3x3_block_matrix(&sw_inv[icx][1][i], b, 0, 3);
@@ -312,7 +488,7 @@ void sw_invert_nd(const double mshift) {
       get_3x3_block_matrix(&sw_inv[icx][3][i], b, 3, 0);
     }
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
diff --git a/operator/clover_leaf.c b/operator/clover_leaf.c
index 41df662cb..1692984b8 100644
--- a/operator/clover_leaf.c
+++ b/operator/clover_leaf.c
@@ -21,7 +21,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #ifdef SSE
 # undef SSE
@@ -39,10 +39,10 @@
 #include <math.h>
 #include <errno.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -98,12 +98,43 @@ void copy_6x6(_Complex double a[6][6], const _Complex double b[6][6]) {
   return;
 }
 
+void scale_real_6x6(_Complex double a[6][6], const double scale){ // in place: every entry of a is multiplied by the real factor scale
+  for(int i = 0; i < 6; i++) {
+    for(int j = 0; j < 6; j++) {
+      a[i][j] *= scale;
+    }
+  }
+  return;
+}
 
+void scale_cplx_6x6(_Complex double a[6][6], const _Complex double scale){ // in place: every entry of a is multiplied by the complex factor scale
+  for(int i = 0; i < 6; i++) {
+    for(int j = 0; j < 6; j++) {
+      a[i][j] *= scale;
+    }
+  }
+  return;
+}
 
+void one_6x6(_Complex double a[6][6]){ // overwrites a with the 6x6 identity matrix
+  memset((void*)(&a[0][0]), 0, 36*sizeof(_Complex double)); // clear all 6*6 = 36 entries
+  for(int i = 0; i < 6; i++){
+    a[i][i] = 1.0; // then set the diagonal
+  }
+}
 
-
-
-
+void print_6x6(_Complex double a[6][6], const char * const text){ // debug helper: prints text, then a row by row
+  if(g_proc_id==0){ // output is produced only where g_proc_id is 0
+    printf("%s\n",text);
+    for(int i = 0; i < 6; i++){
+      for(int j = 0; j < 6; j++){
+        printf("(%+.2e %+.2e) ", creal(a[i][j]), cimag(a[i][j])); // one "(real imag)" pair per entry
+      }
+      printf("\n");
+    }
+    printf("\n");
+  }
+}
 
 su3 * _swp;
 
diff --git a/operator/clover_leaf.h b/operator/clover_leaf.h
index 44db299cf..f869360d1 100644
--- a/operator/clover_leaf.h
+++ b/operator/clover_leaf.h
@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2005 Martin Hasenbusch
  *               2011 Carsten Urbach
+ *               2017 Bartosz Kostrzewa
  *
  * This file is part of tmLQCD.
  *
@@ -32,9 +33,12 @@ double sw_trace(const int ieo, const double mu);
 double sw_trace_nd(const int ieo, const double mu, const double eps);
 void sw_invert(const int ieo, const double mu);
 void sw_invert_nd(const double mshift);
+void sw_invert_epsbar(const double epsbar);
+void sw_invert_mubar(const double mubar);
 void sw_deriv(const int ieo, const double mu);
 void sw_deriv_nd(const int ieo);
-void sw_spinor(const int ieo, const spinor * const kk, const spinor * const ll, const double fac);
+void sw_spinor_eo(const int ieo, const spinor * const kk, const spinor * const ll, const double fac);
+void sw_spinor(const spinor * const kk, const spinor * const ll, const double fac);
 void sw_all(hamiltonian_field_t * const hf, const double kappa, const double c_sw);
 int init_swpm(const int V);
 
@@ -42,5 +46,9 @@ void mult_6x6(_Complex double a[6][6], _Complex double b[6][6], _Complex double
 void add_6x6(_Complex double a[6][6], _Complex double b[6][6], _Complex double d[6][6]);
 void sub_6x6(_Complex double a[6][6], _Complex double b[6][6], _Complex double d[6][6]);
 void copy_6x6(_Complex double a[6][6], const _Complex double b[6][6]);
+void scale_real_6x6(_Complex double a[6][6], const double scale);
+void scale_cplx_6x6(_Complex double a[6][6], const _Complex double scale);
+void one_6x6(_Complex double a[6][6]);
+void print_6x6(_Complex double a[6][6], const char * const text );
 
 #endif
diff --git a/operator/clover_term.c b/operator/clover_term.c
index 02273048d..d6d3bfc63 100644
--- a/operator/clover_term.c
+++ b/operator/clover_term.c
@@ -21,7 +21,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #ifdef SSE
 # undef SSE
@@ -39,10 +39,10 @@
 #include <math.h>
 #include <errno.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -86,7 +86,7 @@
 // suppressing space-time indices
 
 void sw_term(const su3 ** const gf, const double kappa, const double c_sw) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -109,7 +109,7 @@ void sw_term(const su3 ** const gf, const double kappa, const double c_sw) {
         |  | |  |
         |__| |__| k  */
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(x = 0; x < VOLUME; x++) {
@@ -197,7 +197,7 @@ void sw_term(const su3 ** const gf, const double kappa, const double c_sw) {
     _itimes_su3_plus_su3(aux,magnetic[3],electric[3]);
     _su3_refac_acc(sw[x][2][1],ka_csw_8,aux);
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
diff --git a/operator/clovertm_operators.c b/operator/clovertm_operators.c
index 0a7fa9181..35d0a3ea4 100644
--- a/operator/clovertm_operators.c
+++ b/operator/clovertm_operators.c
@@ -20,38 +20,150 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
+#ifdef SSE
+# undef SSE
+#endif
+#ifdef SSE2
+# undef SSE2
+#endif
+#ifdef SSE3
+# undef SSE3
+#endif
+
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
 #include <errno.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
 #include "su3.h"
 #include "sse.h"
+#include "gamma.h"
 #include "linalg_eo.h"
 #include "operator/Hopping_Matrix.h"
+#include "operator/tm_operators.h"
+#include "operator/Hopping_Matrix_32.h"
+
 #include "tm_operators.h"
-#include "operator/clovertm_operators.h"
+#include "tm_operators_32.h"
 
+#include "operator/clovertm_operators.h"
+#include "operator/D_psi.h"
 
 su3 *** sw;
 su3 *** sw_inv;
 
+/******************************************************************************
+ *
+ * assign_mul_one_sw_pm_imu_site_lexic applies (1 + T + imug5) to spinor l
+ * at a lexic site ix and stores it in k at the same site.
+ * l and k are pointers to the spinors at this site.
+ * it is assumed that the clover leaf is computed and stored
+ * in sw[VOLUME][3][2]
+ * the corresponding routine can be found in clover_leaf.c
+ * A.Abdel-Rehim
+ *****************************************************************************/
+
+#define _F_TYPE double /* first inclusion of the body file: floating-point type is double */
+#define _PTSWITCH(s) s /* identity: the name s is passed through unchanged */
+#define _PSWITCH(s) s /* identity: the name s is passed through unchanged */
+/* NOTE(review): the body file is not visible here; presumably it uses the three switches above to pick types and names */
+#include"assign_mul_one_sw_pm_imu_site_lexic_body.c"
+
+#undef _F_TYPE /* drop the three switches so they can be redefined for the second inclusion */
+#undef _PSWITCH
+#undef _PTSWITCH
+
+#define _F_TYPE float /* second inclusion of the same body file: floating-point type is float */
+#define _PTSWITCH(s) s ## 32 /* token-pastes the suffix 32 onto s */
+#define _PSWITCH(s) s ## _32 /* token-pastes the suffix _32 onto s */
+
+#include"assign_mul_one_sw_pm_imu_site_lexic_body.c"
+
+#undef _F_TYPE /* leave none of the template switches defined for the rest of this file */
+#undef _PSWITCH
+#undef _PTSWITCH
+
+/*****************************************************
+ * Full operator Q=gamma5*M acting on a spinor stored
+ * in even-odd ordering: inputs are Even/Odd, results go to Even_new/Odd_new.
+ * A. Abdel-Rehim
+ ****************************************************/
+void Qsw_full(spinor * const Even_new, spinor * const Odd_new,
+              spinor * const Even, spinor * const Odd) {
+  /* g_spinor_field[DUM_MATRIX] serves as scratch for the hopping term of each half */
+  /* Even sites */
+  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], Odd); /* hopping term taken from the odd input */
+  assign_mul_one_sw_pm_imu(EE, Even_new, Even, +g_mu); /* presumably the site-diagonal (1 + T + i*mu*gamma5) term */
+  assign_add_mul_r(Even_new, g_spinor_field[DUM_MATRIX], -1., VOLUME/2); /* presumably Even_new += (-1) * hopping term */
+  gamma5(Even_new,Even_new,VOLUME/2); /* gamma5 with the same spinor as both arguments, over VOLUME/2 sites */
+
+  /* Odd sites */
+  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], Even); /* NOTE(review): Even is read here after Even_new was written; presumably they must not alias */
+  assign_mul_one_sw_pm_imu(OO, Odd_new, Odd, +g_mu); /* same diagonal term, odd half */
+  assign_add_mul_r(Odd_new, g_spinor_field[DUM_MATRIX], -1., VOLUME/2); /* presumably Odd_new += (-1) * hopping term */
+  gamma5(Odd_new,Odd_new,VOLUME/2); /* gamma5 with the same spinor as both arguments, over VOLUME/2 sites */
+}
+
+/**************************************************
+ * Full operator acting on spinors stored in
+ * lexicographic order.
+ * A.Abdel-Rehim
+ **************************************************/
+
+// Msw_full_plus_psi is simply D_psi, so only the minus variant is defined here
+
+void Msw_full_minus_psi(spinor * const l, spinor * const k)
+{
+  g_mu = -g_mu; /* flip the sign of the externally defined g_mu for the duration of the call */
+  D_psi(l, k); /* presumably applies the operator to k and stores the result in l */
+  g_mu = -g_mu; /* restore g_mu to its original sign */
+}
+
+void Qsw_full_plus_psi(spinor * const l, spinor * const k) /* D_psi followed by gamma5, g_mu left as is */
+{
+  D_psi(l, k); /* presumably applies the operator to k and stores the result in l */
+  gamma5(l, l, VOLUME); /* gamma5 with l as both arguments, over VOLUME sites */
+}
+
+void Qsw_full_minus_psi(spinor * const l, spinor * const k) /* D_psi with the sign of g_mu flipped, followed by gamma5 */
+{
+  g_mu = -g_mu; /* flip the sign of the externally defined g_mu for the D_psi call */
+  D_psi(l, k); /* presumably applies the operator to k and stores the result in l */
+  g_mu = -g_mu; /* restore g_mu before the gamma5 call */
+  gamma5(l, l, VOLUME); /* gamma5 with l as both arguments, over VOLUME sites */
+}
+
+void Qsw_full_pm_psi(spinor * const l, spinor * const k) /* chains the flipped-g_mu and the unflipped D_psi+gamma5 steps */
+{
+  g_mu = -g_mu; /* first step runs with the sign of g_mu flipped */
+  D_psi(l, k); /* l is used as intermediate storage here */
+  gamma5(g_spinor_field[DUM_MATRIX], l, VOLUME); /* intermediate result is placed in the DUM_MATRIX scratch field */
+  g_mu = -g_mu; /* restore g_mu for the second step */
+  D_psi(l, g_spinor_field[DUM_MATRIX]); /* second D_psi call, reading the scratch field */
+  gamma5(l, l, VOLUME); /* final gamma5 with l as both arguments, over VOLUME sites */
+}
+//******************************************************************
+
+su3_32 *** sw_32;
+su3_32 *** sw_inv_32;
+
 void clover_gamma5(const int ieo, 
-		   spinor * const l, const spinor * const k, const spinor * const j,
-		   const double mu);
+                   spinor * const l, const spinor * const k, const spinor * const j,
+                   const double mu);
 void clover(const int ieo, 
-	    spinor * const l, const spinor * const k, const spinor * const j,
-	    const double mu);
+            spinor * const l, const spinor * const k, const spinor * const j,
+            const double mu);
 
 void Msw_full(spinor * const Even_new, spinor * const Odd_new, 
-	      spinor * const Even, spinor * const Odd) {
+              spinor * const Even, spinor * const Odd) {
+
   /* Even sites */
   Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], Odd);
   assign_mul_one_sw_pm_imu(EE, Even_new, Even, +g_mu);
@@ -173,149 +285,149 @@ void H_eo_sw_inv_psi(spinor * const l, spinor * const k, const int ieo, const in
  **********************************************************/
 
 void clover_inv(spinor * const l, const int tau3sign, const double mu) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
-  int icy;
-  su3_vector ALIGN psi, chi, phi1, phi3;
-  int ioff = 0;
-  const su3 *w1, *w2, *w3, *w4;
-  spinor *rn;
-
-  if(tau3sign < 0 && fabs(mu) > 0) {
-    ioff = VOLUME/2;
-  }
+    int icy;
+    su3_vector ALIGN psi, chi, phi1, phi3;
+    int ioff = 0;
+    const su3 *w1, *w2, *w3, *w4;
+    spinor *rn;
+
+    if(tau3sign < 0 && fabs(mu) > 0) {
+      ioff = VOLUME/2;
+    }
 
-#ifndef OMP
-  icy = ioff;
+#ifndef TM_USE_OMP
+    icy = ioff;
 #endif
-  /************************ loop over all lattice sites *************************/
-#ifdef OMP
+    /************************ loop over all lattice sites *************************/
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for(int icx = 0; icx < (VOLUME/2); icx++) {
-#ifdef OMP
-    icy = ioff + icx;
+    for(int icx = 0; icx < (VOLUME/2); icx++) {
+#ifdef TM_USE_OMP
+      icy = ioff + icx;
 #endif
 
-    rn = l + icx;
-    _vector_assign(phi1,(*rn).s0);
-    _vector_assign(phi3,(*rn).s2);
-
-    w1=&sw_inv[icy][0][0];
-    w2=w1+2;  /* &sw_inv[icy][1][0]; */
-    w3=w1+4;  /* &sw_inv[icy][2][0]; */
-    w4=w1+6;  /* &sw_inv[icy][3][0]; */
-    _su3_multiply(psi,*w1,phi1); 
-    _su3_multiply(chi,*w2,(*rn).s1);
-    _vector_add((*rn).s0,psi,chi);
-    _su3_multiply(psi,*w4,phi1); 
-    _su3_multiply(chi,*w3,(*rn).s1);
-    _vector_add((*rn).s1,psi,chi);
-
-    w1++; /* &sw_inv[icy][0][1]; */
-    w2++; /* &sw_inv[icy][1][1]; */
-    w3++; /* &sw_inv[icy][2][1]; */
-    w4++; /* &sw_inv[icy][3][1]; */
-    _su3_multiply(psi,*w1,phi3); 
-    _su3_multiply(chi,*w2,(*rn).s3);
-    _vector_add((*rn).s2,psi,chi);
-    _su3_multiply(psi,*w4,phi3); 
-    _su3_multiply(chi,*w3,(*rn).s3);
-    _vector_add((*rn).s3,psi,chi);
-
-#ifndef OMP
-    ++icy;
+      rn = l + icx;
+      _vector_assign(phi1,(*rn).s0);
+      _vector_assign(phi3,(*rn).s2);
+
+      w1=&sw_inv[icy][0][0];
+      w2=w1+2;  /* &sw_inv[icy][1][0]; */
+      w3=w1+4;  /* &sw_inv[icy][2][0]; */
+      w4=w1+6;  /* &sw_inv[icy][3][0]; */
+      _su3_multiply(psi,*w1,phi1); 
+      _su3_multiply(chi,*w2,(*rn).s1);
+      _vector_add((*rn).s0,psi,chi);
+      _su3_multiply(psi,*w4,phi1); 
+      _su3_multiply(chi,*w3,(*rn).s1);
+      _vector_add((*rn).s1,psi,chi);
+
+      w1++; /* &sw_inv[icy][0][1]; */
+      w2++; /* &sw_inv[icy][1][1]; */
+      w3++; /* &sw_inv[icy][2][1]; */
+      w4++; /* &sw_inv[icy][3][1]; */
+      _su3_multiply(psi,*w1,phi3); 
+      _su3_multiply(chi,*w2,(*rn).s3);
+      _vector_add((*rn).s2,psi,chi);
+      _su3_multiply(psi,*w4,phi3); 
+      _su3_multiply(chi,*w3,(*rn).s3);
+      _vector_add((*rn).s3,psi,chi);
+
+#ifndef TM_USE_OMP
+      ++icy;
 #endif
 
-    /******************************** end of loop *********************************/
-  }
-#ifdef OMP
+      /******************************** end of loop *********************************/
+    }
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
 }
 
 void clover_inv_nd(const int ieo, spinor * const l_c, spinor * const l_s) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
-  int icy;
-  su3_vector ALIGN psi, chi, phi1, phi3;
-  int ioff = 0;
-  const su3 *w1, *w2, *w3, *w4;
-  spinor *rn_s, *rn_c;
+    int icy;
+    su3_vector ALIGN psi, chi, phi1, phi3;
+    int ioff = 0;
+    const su3 *w1, *w2, *w3, *w4;
+    spinor *rn_s, *rn_c;
 
 
-  if(ieo == 1) ioff = VOLUME/2;
+    if(ieo == 1) ioff = VOLUME/2;
 
-#ifndef OMP
-  icy = ioff;
+#ifndef TM_USE_OMP
+    icy = ioff;
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for(unsigned int icx = 0; icx < (VOLUME/2); icx++) {
-#ifdef OMP
-    icy = ioff + icx;
+    for(unsigned int icx = 0; icx < (VOLUME/2); icx++) {
+#ifdef TM_USE_OMP
+      icy = ioff + icx;
 #endif
 
-    rn_s = l_s + icx;
-    rn_c = l_c + icx;
-    _vector_assign(phi1,(*rn_s).s0);
-
-    w1=&sw_inv[icy][0][0];
-    w2=w1+2;  /* &sw_inv[icy][1][0]; */
-    w3=w1+4;  /* &sw_inv[icy][2][0]; */
-    w4=w1+6;  /* &sw_inv[icy][3][0]; */
-    _su3_multiply(psi, *w1, phi1); 
-    _su3_multiply(chi, *w2, (*rn_s).s1);
-    _vector_add((*rn_s).s0, psi,chi);
-    _su3_multiply(psi, *w4, phi1); 
-    _su3_multiply(chi, *w3, (*rn_s).s1);
-    _vector_add((*rn_s).s1, psi, chi);
-
-    _vector_assign(phi1,(*rn_c).s0);
-
-    _su3_multiply(psi, *w1, phi1); 
-    _su3_multiply(chi, *w2, (*rn_c).s1);
-    _vector_add((*rn_c).s0, psi,chi);
-    _su3_multiply(psi, *w4, phi1); 
-    _su3_multiply(chi, *w3, (*rn_c).s1);
-    _vector_add((*rn_c).s1, psi, chi);
-
-    _vector_assign(phi3,(*rn_s).s2);
-
-    w1++; /* &sw_inv[icy][0][1]; */
-    w2++; /* &sw_inv[icy][1][1]; */
-    w3++; /* &sw_inv[icy][2][1]; */
-    w4++; /* &sw_inv[icy][3][1]; */
-    _su3_multiply(psi, *w1, phi3); 
-    _su3_multiply(chi, *w2, (*rn_s).s3);
-    _vector_add((*rn_s).s2, psi, chi);
-    _su3_multiply(psi, *w4, phi3); 
-    _su3_multiply(chi, *w3, (*rn_s).s3);
-    _vector_add((*rn_s).s3, psi, chi);
-
-    _vector_assign(phi3,(*rn_c).s2);
-
-    _su3_multiply(psi, *w1, phi3); 
-    _su3_multiply(chi, *w2, (*rn_c).s3);
-    _vector_add((*rn_c).s2, psi, chi);
-    _su3_multiply(psi, *w4, phi3); 
-    _su3_multiply(chi, *w3, (*rn_c).s3);
-    _vector_add((*rn_c).s3, psi, chi);
-
-#ifndef OMP
-    ++icy;
+      rn_s = l_s + icx;
+      rn_c = l_c + icx;
+      _vector_assign(phi1,(*rn_s).s0);
+
+      w1=&sw_inv[icy][0][0];
+      w2=w1+2;  /* &sw_inv[icy][1][0]; */
+      w3=w1+4;  /* &sw_inv[icy][2][0]; */
+      w4=w1+6;  /* &sw_inv[icy][3][0]; */
+      _su3_multiply(psi, *w1, phi1); 
+      _su3_multiply(chi, *w2, (*rn_s).s1);
+      _vector_add((*rn_s).s0, psi,chi);
+      _su3_multiply(psi, *w4, phi1); 
+      _su3_multiply(chi, *w3, (*rn_s).s1);
+      _vector_add((*rn_s).s1, psi, chi);
+
+      _vector_assign(phi1,(*rn_c).s0);
+
+      _su3_multiply(psi, *w1, phi1); 
+      _su3_multiply(chi, *w2, (*rn_c).s1);
+      _vector_add((*rn_c).s0, psi,chi);
+      _su3_multiply(psi, *w4, phi1); 
+      _su3_multiply(chi, *w3, (*rn_c).s1);
+      _vector_add((*rn_c).s1, psi, chi);
+
+      _vector_assign(phi3,(*rn_s).s2);
+
+      w1++; /* &sw_inv[icy][0][1]; */
+      w2++; /* &sw_inv[icy][1][1]; */
+      w3++; /* &sw_inv[icy][2][1]; */
+      w4++; /* &sw_inv[icy][3][1]; */
+      _su3_multiply(psi, *w1, phi3); 
+      _su3_multiply(chi, *w2, (*rn_s).s3);
+      _vector_add((*rn_s).s2, psi, chi);
+      _su3_multiply(psi, *w4, phi3); 
+      _su3_multiply(chi, *w3, (*rn_s).s3);
+      _vector_add((*rn_s).s3, psi, chi);
+
+      _vector_assign(phi3,(*rn_c).s2);
+
+      _su3_multiply(psi, *w1, phi3); 
+      _su3_multiply(chi, *w2, (*rn_c).s3);
+      _vector_add((*rn_c).s2, psi, chi);
+      _su3_multiply(psi, *w4, phi3); 
+      _su3_multiply(chi, *w3, (*rn_c).s3);
+      _vector_add((*rn_c).s3, psi, chi);
+
+#ifndef TM_USE_OMP
+      ++icy;
 #endif
 
-    /******************************** end of loop *********************************/
-  }
-#ifdef OMP
+      /******************************** end of loop *********************************/
+    }
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
@@ -334,75 +446,76 @@ void clover_inv_nd(const int ieo, spinor * const l_c, spinor * const l_s) {
  **************************************************************/
 
 void clover_gamma5(const int ieo, 
-		   spinor * const l, const spinor * const k, const spinor * const j,
-		   const double mu) {
-#ifdef OMP
+                   spinor * const l, const spinor * const k, const spinor * const j,
+                   const double mu) {
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
-  su3_vector ALIGN chi, psi1, psi2;
-  int ix;
-  int ioff,icx;
-  const su3 *w1,*w2,*w3;
-  spinor *r;
-  const spinor *s,*t;
-
-  if(ieo == 0) {
-    ioff = 0;
-  } 
-  else {
-    ioff = (VOLUME+RAND)/2;
-  }
+    su3_vector ALIGN chi, psi1, psi2;
+    int ix;
+    int ioff,icx;
+    const su3 *w1,*w2,*w3;
+    spinor *r;
+    const spinor *s,*t;
+
+    if(ieo == 0) {
+      ioff = 0;
+    } 
+    else {
+      ioff = (VOLUME+RAND)/2;
+    }
 
-/************************ loop over all lattice sites *************************/
-#ifdef OMP
+    /************************ loop over all lattice sites *************************/
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for(icx = ioff; icx < (VOLUME/2+ioff); icx++) {
-    ix = g_eo2lexic[icx];
+    for(icx = ioff; icx < (VOLUME/2+ioff); icx++) {
+      ix = g_eo2lexic[icx];
     
-    r = l + icx-ioff;
-    s = k + icx-ioff;
-    t = j + icx-ioff;
+      r = l + icx-ioff;
+      s = k + icx-ioff;
+      t = j + icx-ioff;
     
-    w1=&sw[ix][0][0];
-    w2=w1+2; /*&sw[ix][1][0];*/
-    w3=w1+4; /*&sw[ix][2][0];*/
-    _su3_multiply(psi1,*w1,(*s).s0); 
-    _su3_multiply(chi,*w2,(*s).s1);
-    _vector_add_assign(psi1,chi);
-    _su3_inverse_multiply(psi2,*w2,(*s).s0); 
-    _su3_multiply(chi,*w3,(*s).s1);
-    _vector_add_assign(psi2,chi); 
-    // add in the twisted mass term (plus in the upper components)
-    _vector_add_i_mul(psi1, mu, (*s).s0);
-    _vector_add_i_mul(psi2, mu, (*s).s1);
-
-    _vector_sub((*r).s0,psi1,(*t).s0);
-    _vector_sub((*r).s1,psi2,(*t).s1);
+      w1=&sw[ix][0][0];
+      w2=w1+2; /*&sw[ix][1][0];*/
+      w3=w1+4; /*&sw[ix][2][0];*/
+      _su3_multiply(psi1,*w1,(*s).s0); 
+      _su3_multiply(chi,*w2,(*s).s1);
+      _vector_add_assign(psi1,chi);
+      _su3_inverse_multiply(psi2,*w2,(*s).s0); 
+      _su3_multiply(chi,*w3,(*s).s1);
+      _vector_add_assign(psi2,chi); 
+      // add in the twisted mass term (plus in the upper components)
+      _vector_add_i_mul(psi1, mu, (*s).s0);
+      _vector_add_i_mul(psi2, mu, (*s).s1);
+
+      _vector_sub((*r).s0,psi1,(*t).s0);
+      _vector_sub((*r).s1,psi2,(*t).s1);
     
-    w1++; /*=&sw[ix][0][1];*/
-    w2++; /*=&sw[ix][1][1];*/
-    w3++; /*=&sw[ix][2][1];*/
-    _su3_multiply(psi1,*w1,(*s).s2); _su3_multiply(chi,*w2,(*s).s3);
-    _vector_add_assign(psi1,chi); 
-    _su3_inverse_multiply(psi2,*w2,(*s).s2); _su3_multiply(chi,*w3,(*s).s3);
-    _vector_add_assign(psi2,chi); 
-    // add in the twisted mass term (minus from g5 in the lower components)
-    _vector_add_i_mul(psi1, -mu, (*s).s2);
-    _vector_add_i_mul(psi2, -mu, (*s).s3);
-
-    /**************** multiply with  gamma5 included ******************************/
-    _vector_sub((*r).s2,(*t).s2,psi1);
-    _vector_sub((*r).s3,(*t).s3,psi2);
-    /******************************** end of loop *********************************/
-  }
-#ifdef OMP
+      w1++; /*=&sw[ix][0][1];*/
+      w2++; /*=&sw[ix][1][1];*/
+      w3++; /*=&sw[ix][2][1];*/
+      _su3_multiply(psi1,*w1,(*s).s2); _su3_multiply(chi,*w2,(*s).s3);
+      _vector_add_assign(psi1,chi); 
+      _su3_inverse_multiply(psi2,*w2,(*s).s2); _su3_multiply(chi,*w3,(*s).s3);
+      _vector_add_assign(psi2,chi); 
+      // add in the twisted mass term (minus from g5 in the lower components)
+      _vector_add_i_mul(psi1, -mu, (*s).s2);
+      _vector_add_i_mul(psi2, -mu, (*s).s3);
+
+      /**************** multiply with  gamma5 included ******************************/
+      _vector_sub((*r).s2,(*t).s2,psi1);
+      _vector_sub((*r).s3,(*t).s3,psi2);
+      /******************************** end of loop *********************************/
+    }
+#ifdef TM_USE_OMP
   } /* OMP closing brace */
 #endif
   return;
 }
 
+
 /**************************************************************
  *
  * clover applies (1 + T + imug5) to spinor k, 
@@ -416,72 +529,72 @@ void clover_gamma5(const int ieo,
 
 
 void clover(const int ieo, 
-	    spinor * const l, const spinor * const k, const spinor * const j,
-	    const double mu) {
-#ifdef OMP
+            spinor * const l, const spinor * const k, const spinor * const j,
+            const double mu) {
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
-  su3_vector ALIGN chi, psi1, psi2;
-  int ix;
-  int ioff;
-  const su3 *w1,*w2,*w3;
-  spinor *r;
-  const spinor *s,*t;
+    su3_vector ALIGN chi, psi1, psi2;
+    int ix;
+    int ioff;
+    const su3 *w1,*w2,*w3;
+    spinor *r;
+    const spinor *s,*t;
   
-  if(ieo == 0) {
-    ioff = 0;
-  } 
-  else {
-    ioff = (VOLUME+RAND)/2;
-  }
-#ifdef OMP
+    if(ieo == 0) {
+      ioff = 0;
+    } 
+    else {
+      ioff = (VOLUME+RAND)/2;
+    }
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
-    ix = g_eo2lexic[icx];
+    for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
+      ix = g_eo2lexic[icx];
     
-    r = l + icx-ioff;
-    s = k + icx-ioff;
-    t = j + icx-ioff;
-
-    // upper two spin components first
-    w1=&sw[ix][0][0];
-    w2=w1+2; /*&sw[ix][1][0];*/
-    w3=w1+4; /*&sw[ix][2][0];*/
-    _su3_multiply(psi1,*w1,(*s).s0); 
-    _su3_multiply(chi,*w2,(*s).s1);
-    _vector_add_assign(psi1,chi);
-    _su3_inverse_multiply(psi2,*w2,(*s).s0); 
-    _su3_multiply(chi,*w3,(*s).s1);
-    _vector_add_assign(psi2,chi); 
-
-    // add in the twisted mass term (plus in the upper components)
-    _vector_add_i_mul(psi1, mu, (*s).s0);
-    _vector_add_i_mul(psi2, mu, (*s).s1);
-
-    _vector_sub((*r).s0,psi1,(*t).s0);
-    _vector_sub((*r).s1,psi2,(*t).s1);
-
-    // now lower to spin components
-    w1++; /*=&sw[ix][0][1];*/
-    w2++; /*=&sw[ix][1][1];*/
-    w3++; /*=&sw[ix][2][1];*/
-    _su3_multiply(psi1,*w1,(*s).s2); 
-    _su3_multiply(chi,*w2,(*s).s3);
-    _vector_add_assign(psi1,chi); 
-    _su3_inverse_multiply(psi2,*w2,(*s).s2); 
-    _su3_multiply(chi,*w3,(*s).s3);
-    _vector_add_assign(psi2,chi); 
-
-    // add in the twisted mass term (minus from g5 in the lower components)
-    _vector_add_i_mul(psi1, -mu, (*s).s2);
-    _vector_add_i_mul(psi2, -mu, (*s).s3);
-
-    _vector_sub((*r).s2,psi1,(*t).s2);
-    _vector_sub((*r).s3,psi2,(*t).s3);
-  }
-#ifdef OMP
+      r = l + icx-ioff;
+      s = k + icx-ioff;
+      t = j + icx-ioff;
+
+      // upper two spin components first
+      w1=&sw[ix][0][0];
+      w2=w1+2; /*&sw[ix][1][0];*/
+      w3=w1+4; /*&sw[ix][2][0];*/
+      _su3_multiply(psi1,*w1,(*s).s0); 
+      _su3_multiply(chi,*w2,(*s).s1);
+      _vector_add_assign(psi1,chi);
+      _su3_inverse_multiply(psi2,*w2,(*s).s0); 
+      _su3_multiply(chi,*w3,(*s).s1);
+      _vector_add_assign(psi2,chi); 
+
+      // add in the twisted mass term (plus in the upper components)
+      _vector_add_i_mul(psi1, mu, (*s).s0);
+      _vector_add_i_mul(psi2, mu, (*s).s1);
+
+      _vector_sub((*r).s0,psi1,(*t).s0);
+      _vector_sub((*r).s1,psi2,(*t).s1);
+
+      // now the lower two spin components
+      w1++; /*=&sw[ix][0][1];*/
+      w2++; /*=&sw[ix][1][1];*/
+      w3++; /*=&sw[ix][2][1];*/
+      _su3_multiply(psi1,*w1,(*s).s2); 
+      _su3_multiply(chi,*w2,(*s).s3);
+      _vector_add_assign(psi1,chi); 
+      _su3_inverse_multiply(psi2,*w2,(*s).s2); 
+      _su3_multiply(chi,*w3,(*s).s3);
+      _vector_add_assign(psi2,chi); 
+
+      // add in the twisted mass term (minus from g5 in the lower components)
+      _vector_add_i_mul(psi1, -mu, (*s).s2);
+      _vector_add_i_mul(psi2, -mu, (*s).s3);
+
+      _vector_sub((*r).s2,psi1,(*t).s2);
+      _vector_sub((*r).s3,psi2,(*t).s3);
+    }
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
@@ -499,238 +612,238 @@ void clover(const int ieo,
  **************************************************************/
 
 void clover_nd(const int ieo, 
-	       spinor * const l_c, spinor * const l_s, 
-	       const spinor * const k_c, const spinor * const k_s, 
-	       const spinor * const j_c, const spinor * const j_s,
-	       const double mubar, const double epsbar) {
-#ifdef OMP
+               spinor * const l_c, spinor * const l_s, 
+               const spinor * const k_c, const spinor * const k_s, 
+               const spinor * const j_c, const spinor * const j_s,
+               const double mubar, const double epsbar) {
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
-  su3_vector ALIGN chi, psi1, psi2;
-  int ix;
-  int ioff;
-  const su3 *w1,*w2,*w3;
-  spinor *r_s, *r_c;
-  const spinor *s_s, *s_c, *t_s, *t_c;
+    su3_vector ALIGN chi, psi1, psi2;
+    int ix;
+    int ioff;
+    const su3 *w1,*w2,*w3;
+    spinor *r_s, *r_c;
+    const spinor *s_s, *s_c, *t_s, *t_c;
   
-  if(ieo == 0) {
-    ioff = 0;
-  } 
-  else {
-    ioff = (VOLUME+RAND)/2;
-  }
-  /************************ loop over all lattice sites *************************/
-#ifdef OMP
+    if(ieo == 0) {
+      ioff = 0;
+    } 
+    else {
+      ioff = (VOLUME+RAND)/2;
+    }
+    /************************ loop over all lattice sites *************************/
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
-    ix = g_eo2lexic[icx];
+    for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
+      ix = g_eo2lexic[icx];
     
-    r_s = l_s + icx-ioff;
-    r_c = l_c + icx-ioff;
-    s_s = k_s + icx-ioff;
-    s_c = k_c + icx-ioff;
-    t_s = j_s + icx-ioff;
-    t_c = j_c + icx-ioff;
-
-    // upper two spin components first
-    w1=&sw[ix][0][0];
-    w2=w1+2; /*&sw[ix][1][0];*/
-    w3=w1+4; /*&sw[ix][2][0];*/
-    _su3_multiply(psi1, *w1, (*s_s).s0); 
-    _su3_multiply(chi, *w2, (*s_s).s1);
-    _vector_add_assign(psi1, chi);
-    _su3_inverse_multiply(psi2, *w2, (*s_s).s0); 
-    _su3_multiply(chi, *w3, (*s_s).s1);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (plus in the upper components)
-    _vector_add_i_mul(psi1, mubar, (*s_s).s0);
-    _vector_add_i_mul(psi2, mubar, (*s_s).s1);
-
-    _vector_add_mul(psi1, epsbar, (*s_c).s0);
-    _vector_add_mul(psi2, epsbar, (*s_c).s1);
-
-    _vector_sub((*r_s).s0, psi1, (*t_s).s0);
-    _vector_sub((*r_s).s1, psi2, (*t_s).s1);
-
-    _su3_multiply(psi1, *w1, (*s_c).s0); 
-    _su3_multiply(chi, *w2, (*s_c).s1);
-    _vector_add_assign(psi1, chi);
-    _su3_inverse_multiply(psi2, *w2, (*s_c).s0); 
-    _su3_multiply(chi, *w3, (*s_c).s1);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (plus in the upper components)
-    _vector_add_i_mul(psi1, -mubar, (*s_c).s0);
-    _vector_add_i_mul(psi2, -mubar, (*s_c).s1);
-
-    _vector_add_mul(psi1, epsbar, (*s_s).s0);
-    _vector_add_mul(psi2, epsbar, (*s_s).s1);
-
-    _vector_sub((*r_c).s0, psi1, (*t_c).s0);
-    _vector_sub((*r_c).s1, psi2, (*t_c).s1);
-
-
-    // now lower to spin components
-    w1++; /*=&sw[ix][0][1];*/
-    w2++; /*=&sw[ix][1][1];*/
-    w3++; /*=&sw[ix][2][1];*/
-    _su3_multiply(psi1, *w1, (*s_s).s2); 
-    _su3_multiply(chi, *w2, (*s_s).s3);
-    _vector_add_assign(psi1, chi); 
-    _su3_inverse_multiply(psi2, *w2, (*s_s).s2); 
-    _su3_multiply(chi, *w3, (*s_s).s3);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (minus from g5 in the lower components)
-    _vector_add_i_mul(psi1, -mubar, (*s_s).s2);
-    _vector_add_i_mul(psi2, -mubar, (*s_s).s3);
-
-    _vector_add_mul(psi1, epsbar, (*s_c).s2);
-    _vector_add_mul(psi2, epsbar, (*s_c).s3);
-
-    _vector_sub((*r_s).s2,psi1,(*t_s).s2);
-    _vector_sub((*r_s).s3,psi2,(*t_s).s3);
-
-    _su3_multiply(psi1, *w1, (*s_c).s2); 
-    _su3_multiply(chi, *w2, (*s_c).s3);
-    _vector_add_assign(psi1, chi); 
-    _su3_inverse_multiply(psi2, *w2, (*s_c).s2); 
-    _su3_multiply(chi, *w3, (*s_c).s3);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (minus from g5 in the lower components)
-    _vector_add_i_mul(psi1, mubar, (*s_c).s2);
-    _vector_add_i_mul(psi2, mubar, (*s_c).s3);
-
-    _vector_add_mul(psi1, epsbar, (*s_s).s2);
-    _vector_add_mul(psi2, epsbar, (*s_s).s3);
-
-    _vector_sub((*r_c).s2, psi1, (*t_c).s2);
-    _vector_sub((*r_c).s3, psi2, (*t_c).s3);
-  }
-#ifdef OMP
+      r_s = l_s + icx-ioff;
+      r_c = l_c + icx-ioff;
+      s_s = k_s + icx-ioff;
+      s_c = k_c + icx-ioff;
+      t_s = j_s + icx-ioff;
+      t_c = j_c + icx-ioff;
+
+      // upper two spin components first
+      w1=&sw[ix][0][0];
+      w2=w1+2; /*&sw[ix][1][0];*/
+      w3=w1+4; /*&sw[ix][2][0];*/
+      _su3_multiply(psi1, *w1, (*s_s).s0); 
+      _su3_multiply(chi, *w2, (*s_s).s1);
+      _vector_add_assign(psi1, chi);
+      _su3_inverse_multiply(psi2, *w2, (*s_s).s0); 
+      _su3_multiply(chi, *w3, (*s_s).s1);
+      _vector_add_assign(psi2, chi); 
+
+      // add in the twisted mass term (plus in the upper components)
+      _vector_add_i_mul(psi1, mubar, (*s_s).s0);
+      _vector_add_i_mul(psi2, mubar, (*s_s).s1);
+
+      _vector_add_mul(psi1, epsbar, (*s_c).s0);
+      _vector_add_mul(psi2, epsbar, (*s_c).s1);
+
+      _vector_sub((*r_s).s0, psi1, (*t_s).s0);
+      _vector_sub((*r_s).s1, psi2, (*t_s).s1);
+
+      _su3_multiply(psi1, *w1, (*s_c).s0); 
+      _su3_multiply(chi, *w2, (*s_c).s1);
+      _vector_add_assign(psi1, chi);
+      _su3_inverse_multiply(psi2, *w2, (*s_c).s0); 
+      _su3_multiply(chi, *w3, (*s_c).s1);
+      _vector_add_assign(psi2, chi); 
+
+      // add in the twisted mass term for the _c spinor (opposite sign: minus in the upper components)
+      _vector_add_i_mul(psi1, -mubar, (*s_c).s0);
+      _vector_add_i_mul(psi2, -mubar, (*s_c).s1);
+
+      _vector_add_mul(psi1, epsbar, (*s_s).s0);
+      _vector_add_mul(psi2, epsbar, (*s_s).s1);
+
+      _vector_sub((*r_c).s0, psi1, (*t_c).s0);
+      _vector_sub((*r_c).s1, psi2, (*t_c).s1);
+
+
+      // now the lower two spin components
+      w1++; /*=&sw[ix][0][1];*/
+      w2++; /*=&sw[ix][1][1];*/
+      w3++; /*=&sw[ix][2][1];*/
+      _su3_multiply(psi1, *w1, (*s_s).s2); 
+      _su3_multiply(chi, *w2, (*s_s).s3);
+      _vector_add_assign(psi1, chi); 
+      _su3_inverse_multiply(psi2, *w2, (*s_s).s2); 
+      _su3_multiply(chi, *w3, (*s_s).s3);
+      _vector_add_assign(psi2, chi); 
+
+      // add in the twisted mass term (minus from g5 in the lower components)
+      _vector_add_i_mul(psi1, -mubar, (*s_s).s2);
+      _vector_add_i_mul(psi2, -mubar, (*s_s).s3);
+
+      _vector_add_mul(psi1, epsbar, (*s_c).s2);
+      _vector_add_mul(psi2, epsbar, (*s_c).s3);
+
+      _vector_sub((*r_s).s2,psi1,(*t_s).s2);
+      _vector_sub((*r_s).s3,psi2,(*t_s).s3);
+
+      _su3_multiply(psi1, *w1, (*s_c).s2); 
+      _su3_multiply(chi, *w2, (*s_c).s3);
+      _vector_add_assign(psi1, chi); 
+      _su3_inverse_multiply(psi2, *w2, (*s_c).s2); 
+      _su3_multiply(chi, *w3, (*s_c).s3);
+      _vector_add_assign(psi2, chi); 
+
+      // add in the twisted mass term for the _c spinor (opposite sign: plus in the lower components)
+      _vector_add_i_mul(psi1, mubar, (*s_c).s2);
+      _vector_add_i_mul(psi2, mubar, (*s_c).s3);
+
+      _vector_add_mul(psi1, epsbar, (*s_s).s2);
+      _vector_add_mul(psi2, epsbar, (*s_s).s3);
+
+      _vector_sub((*r_c).s2, psi1, (*t_c).s2);
+      _vector_sub((*r_c).s3, psi2, (*t_c).s3);
+    }
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
 }
 
 void clover_gamma5_nd(const int ieo, 
-		      spinor * const l_c, spinor * const l_s, 
-		      const spinor * const k_c, const spinor * const k_s, 
-		      const spinor * const j_c, const spinor * const j_s,
-		      const double mubar, const double epsbar) {
-#ifdef OMP
+                      spinor * const l_c, spinor * const l_s, 
+                      const spinor * const k_c, const spinor * const k_s, 
+                      const spinor * const j_c, const spinor * const j_s,
+                      const double mubar, const double epsbar) {
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
-  su3_vector ALIGN chi, psi1, psi2;
-  int ix;
-  int ioff;
-  const su3 *w1,*w2,*w3;
-  spinor *r_s, *r_c;
-  const spinor *s_s, *s_c, *t_s, *t_c;
+    su3_vector ALIGN chi, psi1, psi2;
+    int ix;
+    int ioff;
+    const su3 *w1,*w2,*w3;
+    spinor *r_s, *r_c;
+    const spinor *s_s, *s_c, *t_s, *t_c;
   
-  if(ieo == 0) {
-    ioff = 0;
-  } 
-  else {
-    ioff = (VOLUME+RAND)/2;
-  }
-  /************************ loop over all lattice sites *************************/
-#ifdef OMP
+    if(ieo == 0) {
+      ioff = 0;
+    } 
+    else {
+      ioff = (VOLUME+RAND)/2;
+    }
+    /************************ loop over all lattice sites *************************/
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
-    ix = g_eo2lexic[icx];
+    for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
+      ix = g_eo2lexic[icx];
     
-    r_s = l_s + icx-ioff;
-    r_c = l_c + icx-ioff;
-    s_s = k_s + icx-ioff;
-    s_c = k_c + icx-ioff;
-    t_s = j_s + icx-ioff;
-    t_c = j_c + icx-ioff;
-
-    // upper two spin components first
-    w1=&sw[ix][0][0];
-    w2=w1+2; /*&sw[ix][1][0];*/
-    w3=w1+4; /*&sw[ix][2][0];*/
-    _su3_multiply(psi1, *w1, (*s_s).s0); 
-    _su3_multiply(chi, *w2, (*s_s).s1);
-    _vector_add_assign(psi1, chi);
-    _su3_inverse_multiply(psi2, *w2, (*s_s).s0); 
-    _su3_multiply(chi, *w3, (*s_s).s1);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (plus in the upper components)
-    _vector_add_i_mul(psi1, mubar, (*s_s).s0);
-    _vector_add_i_mul(psi2, mubar, (*s_s).s1);
-
-    _vector_add_mul(psi1, epsbar, (*s_c).s0);
-    _vector_add_mul(psi2, epsbar, (*s_c).s1);
-
-    _vector_sub((*r_s).s0, psi1, (*t_s).s0);
-    _vector_sub((*r_s).s1, psi2, (*t_s).s1);
-
-    _su3_multiply(psi1, *w1, (*s_c).s0); 
-    _su3_multiply(chi, *w2, (*s_c).s1);
-    _vector_add_assign(psi1, chi);
-    _su3_inverse_multiply(psi2, *w2, (*s_c).s0); 
-    _su3_multiply(chi, *w3, (*s_c).s1);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (plus in the upper components)
-    _vector_add_i_mul(psi1, -mubar, (*s_c).s0);
-    _vector_add_i_mul(psi2, -mubar, (*s_c).s1);
-
-    _vector_add_mul(psi1, epsbar, (*s_s).s0);
-    _vector_add_mul(psi2, epsbar, (*s_s).s1);
-
-    _vector_sub((*r_c).s0, psi1, (*t_c).s0);
-    _vector_sub((*r_c).s1, psi2, (*t_c).s1);
-
-
-    // now lower to spin components
-    w1++; /*=&sw[ix][0][1];*/
-    w2++; /*=&sw[ix][1][1];*/
-    w3++; /*=&sw[ix][2][1];*/
-    _su3_multiply(psi1, *w1, (*s_s).s2); 
-    _su3_multiply(chi, *w2, (*s_s).s3);
-    _vector_add_assign(psi1, chi); 
-    _su3_inverse_multiply(psi2, *w2, (*s_s).s2); 
-    _su3_multiply(chi, *w3, (*s_s).s3);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (minus from g5 in the lower components)
-    _vector_add_i_mul(psi1, -mubar, (*s_s).s2);
-    _vector_add_i_mul(psi2, -mubar, (*s_s).s3);
-
-    _vector_add_mul(psi1, epsbar, (*s_c).s2);
-    _vector_add_mul(psi2, epsbar, (*s_c).s3);
-
-    _vector_sub((*r_s).s2, (*t_s).s2, psi1);
-    _vector_sub((*r_s).s3, (*t_s).s3, psi2);
-
-    _su3_multiply(psi1, *w1, (*s_c).s2); 
-    _su3_multiply(chi, *w2, (*s_c).s3);
-    _vector_add_assign(psi1, chi); 
-    _su3_inverse_multiply(psi2, *w2, (*s_c).s2); 
-    _su3_multiply(chi, *w3, (*s_c).s3);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (minus from g5 in the lower components)
-    _vector_add_i_mul(psi1, mubar, (*s_c).s2);
-    _vector_add_i_mul(psi2, mubar, (*s_c).s3);
-
-    _vector_add_mul(psi1, epsbar, (*s_s).s2);
-    _vector_add_mul(psi2, epsbar, (*s_s).s3);
-
-    _vector_sub((*r_c).s2, (*t_c).s2, psi1);
-    _vector_sub((*r_c).s3, (*t_c).s3, psi2);
-  }
-#ifdef OMP
+      r_s = l_s + icx-ioff;
+      r_c = l_c + icx-ioff;
+      s_s = k_s + icx-ioff;
+      s_c = k_c + icx-ioff;
+      t_s = j_s + icx-ioff;
+      t_c = j_c + icx-ioff;
+
+      // upper two spin components first
+      w1=&sw[ix][0][0];
+      w2=w1+2; /*&sw[ix][1][0];*/
+      w3=w1+4; /*&sw[ix][2][0];*/
+      _su3_multiply(psi1, *w1, (*s_s).s0); 
+      _su3_multiply(chi, *w2, (*s_s).s1);
+      _vector_add_assign(psi1, chi);
+      _su3_inverse_multiply(psi2, *w2, (*s_s).s0); 
+      _su3_multiply(chi, *w3, (*s_s).s1);
+      _vector_add_assign(psi2, chi); 
+
+      // add in the twisted mass term (plus in the upper components)
+      _vector_add_i_mul(psi1, mubar, (*s_s).s0);
+      _vector_add_i_mul(psi2, mubar, (*s_s).s1);
+
+      _vector_add_mul(psi1, epsbar, (*s_c).s0);
+      _vector_add_mul(psi2, epsbar, (*s_c).s1);
+
+      _vector_sub((*r_s).s0, psi1, (*t_s).s0);
+      _vector_sub((*r_s).s1, psi2, (*t_s).s1);
+
+      _su3_multiply(psi1, *w1, (*s_c).s0); 
+      _su3_multiply(chi, *w2, (*s_c).s1);
+      _vector_add_assign(psi1, chi);
+      _su3_inverse_multiply(psi2, *w2, (*s_c).s0); 
+      _su3_multiply(chi, *w3, (*s_c).s1);
+      _vector_add_assign(psi2, chi); 
+
+      // add in the twisted mass term for the _c spinor (opposite sign: minus in the upper components)
+      _vector_add_i_mul(psi1, -mubar, (*s_c).s0);
+      _vector_add_i_mul(psi2, -mubar, (*s_c).s1);
+
+      _vector_add_mul(psi1, epsbar, (*s_s).s0);
+      _vector_add_mul(psi2, epsbar, (*s_s).s1);
+
+      _vector_sub((*r_c).s0, psi1, (*t_c).s0);
+      _vector_sub((*r_c).s1, psi2, (*t_c).s1);
+
+
+      // now the lower two spin components
+      w1++; /*=&sw[ix][0][1];*/
+      w2++; /*=&sw[ix][1][1];*/
+      w3++; /*=&sw[ix][2][1];*/
+      _su3_multiply(psi1, *w1, (*s_s).s2); 
+      _su3_multiply(chi, *w2, (*s_s).s3);
+      _vector_add_assign(psi1, chi); 
+      _su3_inverse_multiply(psi2, *w2, (*s_s).s2); 
+      _su3_multiply(chi, *w3, (*s_s).s3);
+      _vector_add_assign(psi2, chi); 
+
+      // add in the twisted mass term (minus from g5 in the lower components)
+      _vector_add_i_mul(psi1, -mubar, (*s_s).s2);
+      _vector_add_i_mul(psi2, -mubar, (*s_s).s3);
+
+      _vector_add_mul(psi1, epsbar, (*s_c).s2);
+      _vector_add_mul(psi2, epsbar, (*s_c).s3);
+
+      _vector_sub((*r_s).s2, (*t_s).s2, psi1);
+      _vector_sub((*r_s).s3, (*t_s).s3, psi2);
+
+      _su3_multiply(psi1, *w1, (*s_c).s2); 
+      _su3_multiply(chi, *w2, (*s_c).s3);
+      _vector_add_assign(psi1, chi); 
+      _su3_inverse_multiply(psi2, *w2, (*s_c).s2); 
+      _su3_multiply(chi, *w3, (*s_c).s3);
+      _vector_add_assign(psi2, chi); 
+
+      // add in the twisted mass term for the _c spinor (opposite sign: plus in the lower components)
+      _vector_add_i_mul(psi1, mubar, (*s_c).s2);
+      _vector_add_i_mul(psi2, mubar, (*s_c).s3);
+
+      _vector_add_mul(psi1, epsbar, (*s_s).s2);
+      _vector_add_mul(psi2, epsbar, (*s_s).s3);
+
+      _vector_sub((*r_c).s2, (*t_c).s2, psi1);
+      _vector_sub((*r_c).s3, (*t_c).s3, psi2);
+    }
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
@@ -749,78 +862,88 @@ void clover_gamma5_nd(const int ieo,
  **************************************************************/
 
 
-void assign_mul_one_sw_pm_imu(const int ieo, 
-			      spinor * const k, const spinor * const l,
-			      const double mu) {
-#ifdef OMP
+
+/***************************************************
+ * Mee_sw_psi: application of Mee (1+T+i*mu*gamma5) to spinor
+ * l, storing the result in spinor k (k = output, l = input).
+ * Only even sites are computed; mu is the function argument, not the global g_mu.
+ * A. Abdel-Rehim
+ **************************************************/
+
+void Mee_sw_psi(spinor * const k, spinor * const l, const double mu) {
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
-  su3_vector ALIGN chi, psi1, psi2;
-  int ix;
-  int ioff;
-  const su3 *w1, *w2, *w3;
-  spinor *r;
-  const spinor *s;
+    su3_vector ALIGN chi, psi1, psi2;
+    int ix;
+    // no parity offset variable (ioff) is needed, see the note below
+    const su3 *w1, *w2, *w3;
+    spinor *r;
+    const spinor *s;
   
-  if(ieo == 0) {
-    ioff = 0;
-  } 
-  else {
-    ioff = (VOLUME+RAND)/2;
-  }
-  /************************ loop over all lattice sites *************************/
-#ifdef OMP
+    // This routine takes no parity argument: the even-odd index icx always
+    // starts at 0 and covers VOLUME/2 sites, so no offset into g_eo2lexic
+    // is needed. A parity-dependent variant would start at
+    // (VOLUME+RAND)/2 for the other half; that case is not handled
+    // here.
+    //
+    /************************ loop over even lattice sites *************************/
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for(unsigned icx = ioff; icx < (VOLUME/2+ioff); icx++) {
-    ix = g_eo2lexic[icx];
+    // icx indexes both the even-odd site table g_eo2lexic and the spinor arrays k and l
+    for(unsigned icx = 0; icx < VOLUME/2; icx++) {
+      ix = g_eo2lexic[icx];
     
-    r = k + icx-ioff;
-    s = l + icx-ioff;
-
-    // upper two spin components first
-    w1=&sw[ix][0][0];
-    w2=w1+2; /*&sw[ix][1][0];*/
-    w3=w1+4; /*&sw[ix][2][0];*/
-    _su3_multiply(psi1,*w1,(*s).s0); 
-    _su3_multiply(chi,*w2,(*s).s1);
-    _vector_add_assign(psi1,chi);
-    _su3_inverse_multiply(psi2,*w2,(*s).s0); 
-    _su3_multiply(chi,*w3,(*s).s1);
-    _vector_add_assign(psi2,chi); 
-
-    // add in the twisted mass term (plus in the upper components)
-    _vector_add_i_mul(psi1, mu, (*s).s0);
-    _vector_add_i_mul(psi2, mu, (*s).s1);
-
-    _vector_assign((*r).s0, psi1);
-    _vector_assign((*r).s1, psi2);
-
-    // now lower to spin components
-    w1++; /*=&sw[ix][0][1];*/
-    w2++; /*=&sw[ix][1][1];*/
-    w3++; /*=&sw[ix][2][1];*/
-    _su3_multiply(psi1,*w1,(*s).s2); 
-    _su3_multiply(chi,*w2,(*s).s3);
-    _vector_add_assign(psi1,chi); 
-    _su3_inverse_multiply(psi2,*w2,(*s).s2); 
-    _su3_multiply(chi,*w3,(*s).s3);
-    _vector_add_assign(psi2,chi); 
-
-    // add in the twisted mass term (minus from g5 in the lower components)
-    _vector_add_i_mul(psi1, -mu, (*s).s2);
-    _vector_add_i_mul(psi2, -mu, (*s).s3);
-
-    _vector_assign((*r).s2, psi1);
-    _vector_assign((*r).s3, psi2);
-  }
-#ifdef OMP
+      // the spinor arrays are addressed directly by icx:
+      // r is the destination site spinor (in k), s the source site spinor (in l)
+
+      r = k + icx;
+      s = l + icx;
+      // upper two spin components first; the clover term is read at the site index ix
+      w1=&sw[ix][0][0];
+      w2=w1+2; /*&sw[ix][1][0];*/
+      w3=w1+4; /*&sw[ix][2][0];*/
+      _su3_multiply(psi1,*w1,(*s).s0); 
+      _su3_multiply(chi,*w2,(*s).s1);
+      _vector_add_assign(psi1,chi);
+      _su3_inverse_multiply(psi2,*w2,(*s).s0); 
+      _su3_multiply(chi,*w3,(*s).s1);
+      _vector_add_assign(psi2,chi); 
+
+      // add in the twisted mass term (plus in the upper components)
+      _vector_add_i_mul(psi1, mu, (*s).s0);
+      _vector_add_i_mul(psi2, mu, (*s).s1);
+
+      _vector_assign((*r).s0, psi1);
+      _vector_assign((*r).s1, psi2);
+
+      // now the lower two spin components
+      w1++; /*=&sw[ix][0][1];*/
+      w2++; /*=&sw[ix][1][1];*/
+      w3++; /*=&sw[ix][2][1];*/
+      _su3_multiply(psi1,*w1,(*s).s2); 
+      _su3_multiply(chi,*w2,(*s).s3);
+      _vector_add_assign(psi1,chi); 
+      _su3_inverse_multiply(psi2,*w2,(*s).s2); 
+      _su3_multiply(chi,*w3,(*s).s3);
+      _vector_add_assign(psi2,chi); 
+
+      // add in the twisted mass term (minus from g5 in the lower components)
+      _vector_add_i_mul(psi1, -mu, (*s).s2);
+      _vector_add_i_mul(psi2, -mu, (*s).s3);
+
+      _vector_assign((*r).s2, psi1);
+      _vector_assign((*r).s3, psi2);
+    }
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
 }
 
+
 /**************************************************************
  *
  * assign_mul_one_sw_pm_imu_eps applies 
@@ -835,171 +958,189 @@ void assign_mul_one_sw_pm_imu(const int ieo,
 
 
 void assign_mul_one_sw_pm_imu_eps(const int ieo, 
-				  spinor * const k_s, spinor * const k_c, 
-				  const spinor * const l_s, const spinor * const l_c,
-				  const double mu, const double eps) {
-#ifdef OMP
+                                  spinor * const k_s, spinor * const k_c, 
+                                  const spinor * const l_s, const spinor * const l_c,
+                                  const double mu, const double eps) {
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
-  su3_vector ALIGN chi, psi1, psi2;
-  int ix;
-  int ioff;
-  const su3 *w1, *w2, *w3;
-  spinor *r_s, *r_c;
-  const spinor *s_s, *s_c;
+    su3_vector ALIGN chi, psi1, psi2;
+    int ix;
+    int ioff;
+    const su3 *w1, *w2, *w3;
+    spinor *r_s, *r_c;
+    const spinor *s_s, *s_c;
   
-  if(ieo == 0) {
-    ioff = 0;
-  } 
-  else {
-    ioff = (VOLUME+RAND)/2;
-  }
-  /************************ loop over all lattice sites *************************/
-#ifdef OMP
+    if(ieo == 0) {
+      ioff = 0;
+    } 
+    else {
+      ioff = (VOLUME+RAND)/2;
+    }
+    /* loop over the VOLUME/2 sites selected by ieo; per site: k_s = sw*l_s +/- i*mu*l_s + eps*l_c and k_c = sw*l_c -/+ i*mu*l_c + eps*l_s (first sign: upper two spin components, second sign: lower two) */
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
-    ix = g_eo2lexic[icx];
+    for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
+      ix = g_eo2lexic[icx];
     
-    r_s = k_s + icx-ioff;
-    r_c = k_c + icx-ioff;
-    s_s = l_s + icx-ioff;
-    s_c = l_c + icx-ioff;
-
-    // upper two spin components first
-    w1=&sw[ix][0][0];
-    w2=w1+2; /*&sw[ix][1][0];*/
-    w3=w1+4; /*&sw[ix][2][0];*/
-    _su3_multiply(psi1, *w1, (*s_s).s0); 
-    _su3_multiply(chi, *w2, (*s_s).s1);
-    _vector_add_assign(psi1, chi);
-    _su3_inverse_multiply(psi2, *w2, (*s_s).s0); 
-    _su3_multiply(chi, *w3, (*s_s).s1);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (plus in the upper components)
-    _vector_add_i_mul(psi1, mu, (*s_s).s0);
-    _vector_add_i_mul(psi2, mu, (*s_s).s1);
-
-    _vector_add_mul(psi1, eps, (*s_c).s0);
-    _vector_add_mul(psi2, eps, (*s_c).s1);
-
-    _vector_assign((*r_s).s0, psi1);
-    _vector_assign((*r_s).s1, psi2);
-
-    _su3_multiply(psi1, *w1, (*s_c).s0); 
-    _su3_multiply(chi, *w2, (*s_c).s1);
-    _vector_add_assign(psi1, chi);
-    _su3_inverse_multiply(psi2, *w2, (*s_c).s0); 
-    _su3_multiply(chi, *w3, (*s_c).s1);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (plus in the upper components)
-    _vector_add_i_mul(psi1, -mu, (*s_c).s0);
-    _vector_add_i_mul(psi2, -mu, (*s_c).s1);
-
-    _vector_add_mul(psi1, eps, (*s_s).s0);
-    _vector_add_mul(psi2, eps, (*s_s).s1);
-
-    _vector_assign((*r_c).s0, psi1);
-    _vector_assign((*r_c).s1, psi2);
-
-    // now lower two spin components
-    w1++; /*=&sw[ix][0][1];*/
-    w2++; /*=&sw[ix][1][1];*/
-    w3++; /*=&sw[ix][2][1];*/
-    _su3_multiply(psi1, *w1, (*s_s).s2); 
-    _su3_multiply(chi, *w2, (*s_s).s3);
-    _vector_add_assign(psi1, chi); 
-    _su3_inverse_multiply(psi2, *w2, (*s_s).s2); 
-    _su3_multiply(chi, *w3, (*s_s).s3);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (minus from g5 in the lower components)
-    _vector_add_i_mul(psi1, -mu, (*s_s).s2);
-    _vector_add_i_mul(psi2, -mu, (*s_s).s3);
-
-    _vector_add_mul(psi1, eps, (*s_c).s2);
-    _vector_add_mul(psi2, eps, (*s_c).s3);
-
-    _vector_assign((*r_s).s2, psi1);
-    _vector_assign((*r_s).s3, psi2);
-
-    _su3_multiply(psi1, *w1, (*s_c).s2); 
-    _su3_multiply(chi, *w2, (*s_c).s3);
-    _vector_add_assign(psi1, chi); 
-    _su3_inverse_multiply(psi2, *w2, (*s_c).s2); 
-    _su3_multiply(chi, *w3, (*s_c).s3);
-    _vector_add_assign(psi2, chi); 
-
-    // add in the twisted mass term (minus from g5 in the lower components)
-    _vector_add_i_mul(psi1, mu, (*s_c).s2);
-    _vector_add_i_mul(psi2, mu, (*s_c).s3);
-
-    _vector_add_mul(psi1, eps, (*s_s).s2);
-    _vector_add_mul(psi2, eps, (*s_s).s3);
-
-    _vector_assign((*r_c).s2, psi1);
-    _vector_assign((*r_c).s3, psi2);
+      r_s = k_s + icx-ioff;
+      r_c = k_c + icx-ioff;
+      s_s = l_s + icx-ioff;
+      s_c = l_c + icx-ioff;
+
+      // upper two spin components first (s field, then c field with the same sw block)
+      w1=&sw[ix][0][0];
+      w2=w1+2; /*&sw[ix][1][0];*/
+      w3=w1+4; /*&sw[ix][2][0];*/
+      _su3_multiply(psi1, *w1, (*s_s).s0); 
+      _su3_multiply(chi, *w2, (*s_s).s1);
+      _vector_add_assign(psi1, chi);
+      _su3_inverse_multiply(psi2, *w2, (*s_s).s0); 
+      _su3_multiply(chi, *w3, (*s_s).s1);
+      _vector_add_assign(psi2, chi); 
+
+      // add in the twisted mass term (plus in the upper components)
+      _vector_add_i_mul(psi1, mu, (*s_s).s0);
+      _vector_add_i_mul(psi2, mu, (*s_s).s1);
+
+      _vector_add_mul(psi1, eps, (*s_c).s0);
+      _vector_add_mul(psi2, eps, (*s_c).s1);
+
+      _vector_assign((*r_s).s0, psi1);
+      _vector_assign((*r_s).s1, psi2);
+
+      _su3_multiply(psi1, *w1, (*s_c).s0); 
+      _su3_multiply(chi, *w2, (*s_c).s1);
+      _vector_add_assign(psi1, chi);
+      _su3_inverse_multiply(psi2, *w2, (*s_c).s0); 
+      _su3_multiply(chi, *w3, (*s_c).s1);
+      _vector_add_assign(psi2, chi); 
+
+      // twisted mass term for the c field: opposite sign to the s field, i.e. -mu in the upper components
+      _vector_add_i_mul(psi1, -mu, (*s_c).s0);
+      _vector_add_i_mul(psi2, -mu, (*s_c).s1);
+
+      _vector_add_mul(psi1, eps, (*s_s).s0);
+      _vector_add_mul(psi2, eps, (*s_s).s1);
+
+      _vector_assign((*r_c).s0, psi1);
+      _vector_assign((*r_c).s1, psi2);
+
+      // now lower two spin components
+      w1++; /*=&sw[ix][0][1];*/
+      w2++; /*=&sw[ix][1][1];*/
+      w3++; /*=&sw[ix][2][1];*/
+      _su3_multiply(psi1, *w1, (*s_s).s2); 
+      _su3_multiply(chi, *w2, (*s_s).s3);
+      _vector_add_assign(psi1, chi); 
+      _su3_inverse_multiply(psi2, *w2, (*s_s).s2); 
+      _su3_multiply(chi, *w3, (*s_s).s3);
+      _vector_add_assign(psi2, chi); 
+
+      // add in the twisted mass term (minus from g5 in the lower components)
+      _vector_add_i_mul(psi1, -mu, (*s_s).s2);
+      _vector_add_i_mul(psi2, -mu, (*s_s).s3);
+
+      _vector_add_mul(psi1, eps, (*s_c).s2);
+      _vector_add_mul(psi2, eps, (*s_c).s3);
+
+      _vector_assign((*r_s).s2, psi1);
+      _vector_assign((*r_s).s3, psi2);
+
+      _su3_multiply(psi1, *w1, (*s_c).s2); 
+      _su3_multiply(chi, *w2, (*s_c).s3);
+      _vector_add_assign(psi1, chi); 
+      _su3_inverse_multiply(psi2, *w2, (*s_c).s2); 
+      _su3_multiply(chi, *w3, (*s_c).s3);
+      _vector_add_assign(psi2, chi); 
+
+      // twisted mass term for the c field, lower components: opposite sign to the s field here, i.e. +mu
+      _vector_add_i_mul(psi1, mu, (*s_c).s2);
+      _vector_add_i_mul(psi2, mu, (*s_c).s3);
+
+      _vector_add_mul(psi1, eps, (*s_s).s2);
+      _vector_add_mul(psi2, eps, (*s_s).s3);
+
+      _vector_assign((*r_c).s2, psi1);
+      _vector_assign((*r_c).s3, psi2);
 
-  }
-#ifdef OMP
+    }
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
 }
 
 
+#define _F_TYPE double
+#define _PSWITCH(s) s
+#define _PTSWITCH(s) s
+/* First inclusion of the body file: scalar type double, _PSWITCH/_PTSWITCH leave identifiers unchanged. */
+#include "assign_mul_one_sw_pm_imu_inv_block_body.c"
+/* The three macros are undefined again so they can be redefined for the second inclusion below. */
+#undef _F_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
+/* Second inclusion: scalar type float; _PSWITCH appends _32 and _PTSWITCH appends 32 to the identifier they wrap. */
+#define _F_TYPE float
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
+/* NOTE(review): presumably this emits assign_mul_one_sw_pm_imu_inv_block_32 - confirm against the body file. */
+#include "assign_mul_one_sw_pm_imu_inv_block_body.c"
 
-void assign_mul_one_sw_pm_imu_inv(const int ieo, 
-				  spinor * const k, const spinor * const l,
-				  const double mu) {
-#ifdef OMP
+#undef _F_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
+
+
+void Mee_sw_inv_psi(spinor * const k, spinor * const l, const double mu) {
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
-  su3_vector ALIGN psi, chi, phi1, phi3;
-  const su3 *w1, *w2, *w3, *w4;
-  const spinor *rn;
-  spinor *s;
+    su3_vector ALIGN psi, chi, phi1, phi3;
+    const su3 *w1, *w2, *w3, *w4;
+    const spinor *rn;
+    spinor *s;
 
-  /************************ loop over all lattice sites *************************/
-#ifdef OMP
+    /************************ loop over all lattice sites *************************/
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for(int icx = 0; icx < (VOLUME/2); icx++) {
-
-    rn = l + icx;
-    s = k + icx;
-    _vector_assign(phi1,(*rn).s0);
-    _vector_assign(phi3,(*rn).s2);
-
-    w1=&sw_inv[icx][0][0];
-    w2=w1+2;  /* &sw_inv[icx][1][0]; */
-    w3=w1+4;  /* &sw_inv[icx][2][0]; */
-    w4=w1+6;  /* &sw_inv[icx][3][0]; */
-    _su3_multiply(psi,*w1,phi1); 
-    _su3_multiply(chi,*w2,(*rn).s1);
-    _vector_add((*s).s0,psi,chi);
-    _su3_multiply(psi,*w4,phi1); 
-    _su3_multiply(chi,*w3,(*rn).s1);
-    _vector_add((*s).s1,psi,chi);
-
-    w1++; /* &sw_inv[icx][0][1]; */
-    w2++; /* &sw_inv[icx][1][1]; */
-    w3++; /* &sw_inv[icx][2][1]; */
-    w4++; /* &sw_inv[icx][3][1]; */
-    _su3_multiply(psi,*w1,phi3); 
-    _su3_multiply(chi,*w2,(*rn).s3);
-    _vector_add((*s).s2,psi,chi);
-    _su3_multiply(psi,*w4,phi3); 
-    _su3_multiply(chi,*w3,(*rn).s3);
-    _vector_add((*s).s3,psi,chi);
-
-    /******************************** end of loop *********************************/
-  }
-#ifdef OMP
+    for(int icx = 0; icx < (VOLUME/2); icx++) {
+
+      rn = l + icx;
+      s = k + icx;
+      _vector_assign(phi1,(*rn).s0);
+      _vector_assign(phi3,(*rn).s2);
+
+      w1=&sw_inv[icx][0][0];
+      w2=w1+2;  /* &sw_inv[icx][1][0]; */
+      w3=w1+4;  /* &sw_inv[icx][2][0]; */
+      w4=w1+6;  /* &sw_inv[icx][3][0]; */
+      _su3_multiply(psi,*w1,phi1); 
+      _su3_multiply(chi,*w2,(*rn).s1);
+      _vector_add((*s).s0,psi,chi);
+      _su3_multiply(psi,*w4,phi1); 
+      _su3_multiply(chi,*w3,(*rn).s1);
+      _vector_add((*s).s1,psi,chi);
+
+      w1++; /* &sw_inv[icx][0][1]; */
+      w2++; /* &sw_inv[icx][1][1]; */
+      w3++; /* &sw_inv[icx][2][1]; */
+      w4++; /* &sw_inv[icx][3][1]; */
+      _su3_multiply(psi,*w1,phi3); 
+      _su3_multiply(chi,*w2,(*rn).s3);
+      _vector_add((*s).s2,psi,chi);
+      _su3_multiply(psi,*w4,phi3); 
+      _su3_multiply(chi,*w3,(*rn).s3);
+      _vector_add((*s).s3,psi,chi);
+
+      /******************************** end of loop *********************************/
+    }
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
   return;
@@ -1015,9 +1156,13 @@ void assign_mul_one_sw_pm_imu_inv(const int ieo,
 su3 ** sw1, ** sw_inv1;
 su3 * _sw, *_sw_inv;
 
+su3_32 ** sw1_32, ** sw_inv1_32; /* single-precision row tables: init_sw_fields lets sw_32[0] / sw_inv_32[0] point here (3 resp. 4 entries per site) */
+su3_32 * _sw_32, *_sw_inv_32; /* raw calloc'ed single-precision storage; sw_32[0][0] / sw_inv_32[0][0] are the aligned start addresses inside it */
+
 void init_sw_fields() {
   int V = VOLUME;
   su3 * tmp;
+  su3_32 * tmp_32;
   static int sw_init = 0;
 
   if(!sw_init) {
@@ -1050,19 +1195,110 @@ void init_sw_fields() {
     tmp = sw[0][0];
     for(int i = 0; i < V; i++) {
       for(int j = 0; j < 3; j++) {
-	sw[i][j] = tmp;
-	tmp = tmp+2;
+        sw[i][j] = tmp;
+        tmp = tmp+2;
       }
     }
     
     tmp = sw_inv[0][0];
     for(int i = 0; i < V; i++) {
       for(int j = 0; j < 4; j++) {
-	sw_inv[i][j] = tmp;
-	tmp = tmp+2;
+        sw_inv[i][j] = tmp;
+        tmp = tmp+2;
       }
     }
+    
+    /* 32 bit fields */
+    if((void*)(sw_32 = (su3_32***)calloc(V, sizeof(su3_32**))) == NULL) {
+      fprintf (stderr, "sw (32 bit) malloc err\n"); 
+    }
+    if((void*)(sw_inv_32 = (su3_32***)calloc(V, sizeof(su3_32**))) == NULL) {
+      fprintf (stderr, "sw_inv (32 bit) malloc err\n"); 
+    }    
+    if((void*)(sw1_32 = (su3_32**)calloc(3*V, sizeof(su3_32*))) == NULL) {
+      fprintf (stderr, "sw1 (32 bit) malloc err\n"); 
+    }    
+    if((void*)(sw_inv1_32 = (su3_32**)calloc(4*V, sizeof(su3_32*))) == NULL) {
+      fprintf (stderr, "sw_inv1 (32 bit) malloc err\n"); 
+    }    
+    if((void*)(_sw_32 = (su3_32*)calloc(3*2*V+1, sizeof(su3_32))) == NULL) {
+      fprintf (stderr, "_sw (32 bit) malloc err\n"); 
+    }    
+    if((void*)(_sw_inv_32 = (su3_32*)calloc(4*2*V+1, sizeof(su3_32))) == NULL) {
+      fprintf (stderr, "_sw_inv (32 bit) malloc err\n"); 
+    } 
+        
+    sw_32[0] = sw1_32;
+    sw_inv_32[0] = sw_inv1_32;
+    for(int i = 1; i < V; i++) {
+      sw_32[i] = sw_32[i-1]+3;
+      sw_inv_32[i] = sw_inv_32[i-1]+4;
+    }
+    sw_32[0][0] = (su3_32*)(((unsigned long int)(_sw_32)+ALIGN_BASE32)&~ALIGN_BASE32);
+    sw_inv_32[0][0] = (su3_32*)(((unsigned long int)(_sw_inv_32)+ALIGN_BASE32)&~ALIGN_BASE32);
+    tmp_32 = sw_32[0][0];
+    for(int i = 0; i < V; i++) {
+      for(int j = 0; j < 3; j++) {
+	sw_32[i][j] = tmp_32;
+	tmp_32 = tmp_32+2;
+      }
+    }
+    
+    tmp_32 = sw_inv_32[0][0];
+    for(int i = 0; i < V; i++) {
+      for(int j = 0; j < 4; j++) {
+	sw_inv_32[i][j] = tmp_32;
+	tmp_32 = tmp_32+2;
+      }
+    }
+        
+    
+    
     sw_init = 1;
   }
   return;
 }
+
+
+void copy_32_sw_fields(){
+  /* Fill the single-precision fields sw_32 and sw_inv_32 by converting every entry of the double-precision sw and sw_inv. Both 32 bit fields must already be allocated (init_sw_fields does that). */
+  int V = VOLUME;
+  /* sw: for each of the V sites, 3 blocks of 2 su3 matrices, converted element by element */
+  for(int i = 0; i < V; i++) {
+      for(int j = 0; j < 3; j++) {
+	for(int k = 0; k < 2; k++) {
+	  sw_32[i][j][k].c00 = (_Complex float) sw[i][j][k].c00;
+	  sw_32[i][j][k].c01 = (_Complex float) sw[i][j][k].c01;
+	  sw_32[i][j][k].c02 = (_Complex float) sw[i][j][k].c02;
+	  
+	  sw_32[i][j][k].c10 = (_Complex float) sw[i][j][k].c10;
+	  sw_32[i][j][k].c11 = (_Complex float) sw[i][j][k].c11;
+	  sw_32[i][j][k].c12 = (_Complex float) sw[i][j][k].c12;    
+
+	  sw_32[i][j][k].c20 = (_Complex float) sw[i][j][k].c20;
+	  sw_32[i][j][k].c21 = (_Complex float) sw[i][j][k].c21;
+	  sw_32[i][j][k].c22 = (_Complex float) sw[i][j][k].c22; 
+	}
+      }
+    }
+  /* sw_inv: same conversion, but with 4 blocks of 2 su3 matrices per site */
+  for(int i = 0; i < V; i++) {
+      for(int j = 0; j < 4; j++) {
+	for(int k = 0; k < 2; k++) {
+	  sw_inv_32[i][j][k].c00 = (_Complex float) sw_inv[i][j][k].c00;
+	  sw_inv_32[i][j][k].c01 = (_Complex float) sw_inv[i][j][k].c01;
+	  sw_inv_32[i][j][k].c02 = (_Complex float) sw_inv[i][j][k].c02;
+	  
+	  sw_inv_32[i][j][k].c10 = (_Complex float) sw_inv[i][j][k].c10;
+	  sw_inv_32[i][j][k].c11 = (_Complex float) sw_inv[i][j][k].c11;
+	  sw_inv_32[i][j][k].c12 = (_Complex float) sw_inv[i][j][k].c12;    
+
+	  sw_inv_32[i][j][k].c20 = (_Complex float) sw_inv[i][j][k].c20;
+	  sw_inv_32[i][j][k].c21 = (_Complex float) sw_inv[i][j][k].c21;
+	  sw_inv_32[i][j][k].c22 = (_Complex float) sw_inv[i][j][k].c22; 	  
+	}
+      }
+    }
+}
+
+
diff --git a/operator/clovertm_operators.h b/operator/clovertm_operators.h
index 184577699..80a65d2ce 100644
--- a/operator/clovertm_operators.h
+++ b/operator/clovertm_operators.h
@@ -24,15 +24,38 @@
 #define _CLOVERTM_OPERATORS_H
 
 #include "su3.h"
+#include "block.h"
 
 extern su3 *** sw;
 extern su3 *** sw_inv;
+extern su3_32 *** sw_32;
+extern su3_32 *** sw_inv_32;
 extern su3 ** swm, ** swp;
 
-void assign_mul_one_sw_pm_imu(const int ieo, spinor * const k, const spinor * const l, const double mu);
-void assign_mul_one_sw_pm_imu_inv(const int ieo, spinor * const k, const spinor * const l, const double mu);
+void assign_mul_one_sw_pm_imu_site_lexic(const int ix, spinor * const k,  const spinor * const l, const double mu);
+void assign_mul_one_sw_pm_imu_site_lexic_32(const int ix, spinor32 * const k,  const spinor32 * const l, const float mu);
+/* Operators taking separate Even/Odd fields or whole spinor fields; NOTE(review): presumably these act on the full lattice - confirm against the definitions. */
+void Qsw_full(spinor * const Even_new, spinor * const Odd_new,
+              spinor * const Even, spinor * const Odd);
+void Qsw_full_plus_psi(spinor * const l, spinor * const k);
+void Qsw_full_minus_psi(spinor * const l, spinor * const k);
+void Qsw_full_pm_psi(spinor * const l, spinor * const k);
+void Msw_full_minus_psi(spinor * const l, spinor * const k);
+/* Clover-term routines: the _32 variants take spinor32 fields and a float mu, the _block variants take an additional block pointer. */
+void assign_mul_one_sw_pm_imu(const int ieo, spinor * const k, spinor * const l, const double mu);
+void assign_mul_one_sw_pm_imu_32(const int ieo, spinor32 * const k, spinor32 * const l, const float mu);
+void assign_mul_one_sw_pm_imu_block(const int ieo, spinor * const k, spinor * const l, const double mu, block *blk);
+void assign_mul_one_sw_pm_imu_block_32(const int ieo, spinor32 * const k, spinor32 * const l, const float mu, block *blk);
+void assign_mul_one_sw_pm_imu_inv(const int ieo, spinor * const k, spinor * const l, const double mu);
+void assign_mul_one_sw_pm_imu_inv_32(const int ieo, spinor32 * const k, spinor32 * const l, const float mu);
+void assign_mul_one_sw_pm_imu_inv_block(const int ieo, spinor * const k, spinor * const l, const double mu, block *blk);
+void assign_mul_one_sw_pm_imu_inv_block_32(const int ieo, spinor32 * const k, spinor32 * const l, const float mu, block *blk);
+/* For both Mee routines the first argument is the destination and the second the source; the definition of Mee_sw_psi names them (k, l), not (l, k) as written here. */
+void Mee_sw_psi(spinor * const l, spinor * const k, const double mu);
+void Mee_sw_inv_psi(spinor * const k, spinor * const l, const double mu);
 void Msw_full(spinor * const Even_new, spinor * const Odd_new, 
 	      spinor * const Even, spinor * const Odd);
+
 void clover_inv(spinor * const l, const int tau3sign, const double mu);
 void Qsw_psi(spinor * const l, spinor * const k);
 void Qsw_plus_psi(spinor * const l, spinor * const k);
@@ -44,12 +67,14 @@ void Msw_plus_psi(spinor * const l, spinor * const k);
 void Msw_minus_psi(spinor * const l, spinor * const k);
 void H_eo_sw_inv_psi(spinor * const l, spinor * const k, const int ieo, const int tau3sign, const double mu);
 void init_sw_fields();
+void copy_32_sw_fields(); /* converts the double-precision sw and sw_inv fields into sw_32 and sw_inv_32 */
 
 void clover_nd(const int ieo, 
 	       spinor * const l_s, spinor * const l_c, 
 	       const spinor * const k_s, const spinor * const k_c, 
 	       const spinor * const j_s, const spinor * const j_c,
 	       const double mubar, const double epsbar);
+
 void clover_gamma5_nd(const int ieo, 
 		      spinor * const l_s, spinor * const l_c, 
 		      const spinor * const k_s, const spinor * const k_c, 
diff --git a/operator/clovertm_operators_32.c b/operator/clovertm_operators_32.c
new file mode 100644
index 000000000..f96cdfa6a
--- /dev/null
+++ b/operator/clovertm_operators_32.c
@@ -0,0 +1,595 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2005 Martin Hasenbusch
+ *               2011 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+
+// work-around for missing single precision implementation of inline SSE
+#ifdef SSE
+#define REDEFSSE
+#undef SSE
+#endif
+
+#ifdef SSE2
+#define REDEFSSE2
+#undef SSE2
+#endif
+
+#ifdef SSE3
+#define REDEFSSE3
+#undef SSE3
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <errno.h>
+#include <time.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#include "global.h"
+#include "su3.h"
+#include "sse.h"
+#include "linalg_eo.h"
+#include "operator/Hopping_Matrix.h"
+#include "operator/Hopping_Matrix_32.h"
+
+#include "tm_operators.h"
+#include "tm_operators_32.h"
+
+#include "operator/clovertm_operators.h"
+#include "operator/clovertm_operators_32.h"
+
+
+void Qsw_pm_psi_32(spinor32 * const l, spinor32 * const k) { /* single precision: l = \hat Q_{+} \hat Q_{-} k; g_spinor_field32[0] and [1] are overwritten as scratch */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* \hat Q_{-}: applied to k, result left in g_spinor_field32[0]; all steps use the _orphaned variants so they share this one parallel region */
+  Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], k);
+  clover_inv_32_orphaned(g_spinor_field32[1], -1, g_mu);
+  Hopping_Matrix_32_orphaned(OE, g_spinor_field32[0], g_spinor_field32[1]);
+  clover_gamma5_32_orphaned(OO, g_spinor_field32[0], k, g_spinor_field32[0], -(g_mu + g_mu3));
+  /* \hat Q_{+}: applied to g_spinor_field32[0]; l doubles as intermediate storage before receiving the final result */
+  Hopping_Matrix_32_orphaned(EO, l, g_spinor_field32[0]);
+  clover_inv_32_orphaned(l, +1, g_mu); 
+  Hopping_Matrix_32_orphaned(OE, g_spinor_field32[1], l);
+  clover_gamma5_32_orphaned(OO, l, g_spinor_field32[0], g_spinor_field32[1], +(g_mu + g_mu3));
+#ifdef TM_USE_OMP
+  } /* OpenMP parallel closing brace */
+#endif
+}
+
+void clover_inv_32_orphaned(spinor32 * const l, const int tau3sign, const double mu) {
+  int icy;
+  su3_vector32 ALIGN32 psi, chi, phi1, phi3;
+  int ioff = 0;
+  const su3_32 *w1, *w2, *w3, *w4;
+  spinor32 *rn;
+  /* Multiplies the VOLUME/2 site spinors of l in place by the matching sw_inv_32 blocks. With TM_USE_OMP the loop is a bare "omp for", so the caller must already be inside a parallel region (see clover_inv_32). */
+  if(tau3sign < 0 && fabs(mu) > 0) {
+    ioff = VOLUME/2; /* negative tau3sign with non-zero mu: read the second half of sw_inv_32 */
+  }
+  /* icy is the site index into sw_inv_32: incremented per iteration without OpenMP, recomputed from icx with OpenMP */
+#ifndef TM_USE_OMP
+  icy = ioff;
+#endif
+  /************************ loop over VOLUME/2 lattice sites *************************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int icx = 0; icx < (VOLUME/2); icx++) {
+#ifdef TM_USE_OMP
+    icy = ioff + icx;
+#endif
+    /* s0 and s2 are copied first because they are overwritten before their second use below */
+    rn = l + icx;
+    _vector_assign(phi1,(*rn).s0);
+    _vector_assign(phi3,(*rn).s2);
+    /* upper two spin components */
+    w1=&sw_inv_32[icy][0][0];
+    w2=w1+2;  /* &sw_inv_32[icy][1][0]; */
+    w3=w1+4;  /* &sw_inv_32[icy][2][0]; */
+    w4=w1+6;  /* &sw_inv_32[icy][3][0]; */
+    _su3_multiply(psi,*w1,phi1); 
+    _su3_multiply(chi,*w2,(*rn).s1);
+    _vector_add((*rn).s0,psi,chi);
+    _su3_multiply(psi,*w4,phi1); 
+    _su3_multiply(chi,*w3,(*rn).s1);
+    _vector_add((*rn).s1,psi,chi);
+    /* lower two spin components: second matrix of each block */
+    w1++; /* &sw_inv_32[icy][0][1]; */
+    w2++; /* &sw_inv_32[icy][1][1]; */
+    w3++; /* &sw_inv_32[icy][2][1]; */
+    w4++; /* &sw_inv_32[icy][3][1]; */
+    _su3_multiply(psi,*w1,phi3); 
+    _su3_multiply(chi,*w2,(*rn).s3);
+    _vector_add((*rn).s2,psi,chi);
+    _su3_multiply(psi,*w4,phi3); 
+    _su3_multiply(chi,*w3,(*rn).s3);
+    _vector_add((*rn).s3,psi,chi);
+
+#ifndef TM_USE_OMP
+    ++icy;
+#endif
+
+    /******************************** end of loop *********************************/
+  }
+}
+
+void clover_inv_32(spinor32 * const l, const int tau3sign, const double mu) { /* stand-alone entry point for clover_inv_32_orphaned */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  clover_inv_32_orphaned(l,tau3sign,mu); /* with TM_USE_OMP this call sits inside the parallel region opened above */
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+  return;
+}
+
+void clover_inv_nd_32_orphaned(const int ieo, spinor32 * const l_c, spinor32 * const l_s) {
+  int icy;
+  su3_vector32 ALIGN psi, chi, phi1, phi3; /* NOTE(review): clover_inv_32_orphaned and clover_gamma5_32_orphaned use ALIGN32 for su3_vector32 - confirm ALIGN is intended here */
+  int ioff = 0;
+  const su3_32 *w1, *w2, *w3, *w4;
+  spinor32 *rn_s, *rn_c;
+  /* Multiplies the VOLUME/2 site spinors of l_s and of l_c in place by the same sw_inv_32 blocks. */
+  /* With TM_USE_OMP the loop is a bare "omp for": call only from inside a parallel region (see clover_inv_nd_32). */
+  if(ieo == 1) ioff = VOLUME/2;
+  /* ieo == 1 reads the second half of sw_inv_32; icy is the site index into sw_inv_32 */
+#ifndef TM_USE_OMP
+  icy = ioff;
+#endif
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(unsigned int icx = 0; icx < (VOLUME/2); icx++) {
+#ifdef TM_USE_OMP
+    icy = ioff + icx;
+#endif
+    /* upper two spin components of l_s; s0 is copied because it is overwritten before its second use */
+    rn_s = l_s + icx;
+    rn_c = l_c + icx;
+    _vector_assign(phi1,(*rn_s).s0);
+
+    w1=&sw_inv_32[icy][0][0];
+    w2=w1+2;  /* &sw_inv_32[icy][1][0]; */
+    w3=w1+4;  /* &sw_inv_32[icy][2][0]; */
+    w4=w1+6;  /* &sw_inv_32[icy][3][0]; */
+    _su3_multiply(psi, *w1, phi1); 
+    _su3_multiply(chi, *w2, (*rn_s).s1);
+    _vector_add((*rn_s).s0, psi,chi);
+    _su3_multiply(psi, *w4, phi1); 
+    _su3_multiply(chi, *w3, (*rn_s).s1);
+    _vector_add((*rn_s).s1, psi, chi);
+    /* upper two spin components of l_c, same matrices */
+    _vector_assign(phi1,(*rn_c).s0);
+
+    _su3_multiply(psi, *w1, phi1); 
+    _su3_multiply(chi, *w2, (*rn_c).s1);
+    _vector_add((*rn_c).s0, psi,chi);
+    _su3_multiply(psi, *w4, phi1); 
+    _su3_multiply(chi, *w3, (*rn_c).s1);
+    _vector_add((*rn_c).s1, psi, chi);
+    /* lower two spin components of l_s: second matrix of each block */
+    _vector_assign(phi3,(*rn_s).s2);
+
+    w1++; /* &sw_inv_32[icy][0][1]; */
+    w2++; /* &sw_inv_32[icy][1][1]; */
+    w3++; /* &sw_inv_32[icy][2][1]; */
+    w4++; /* &sw_inv_32[icy][3][1]; */
+    _su3_multiply(psi, *w1, phi3); 
+    _su3_multiply(chi, *w2, (*rn_s).s3);
+    _vector_add((*rn_s).s2, psi, chi);
+    _su3_multiply(psi, *w4, phi3); 
+    _su3_multiply(chi, *w3, (*rn_s).s3);
+    _vector_add((*rn_s).s3, psi, chi);
+    /* lower two spin components of l_c, same matrices */
+    _vector_assign(phi3,(*rn_c).s2);
+
+    _su3_multiply(psi, *w1, phi3); 
+    _su3_multiply(chi, *w2, (*rn_c).s3);
+    _vector_add((*rn_c).s2, psi, chi);
+    _su3_multiply(psi, *w4, phi3); 
+    _su3_multiply(chi, *w3, (*rn_c).s3);
+    _vector_add((*rn_c).s3, psi, chi);
+
+#ifndef TM_USE_OMP
+    ++icy;
+#endif
+
+    /******************************** end of loop *********************************/
+  }
+  return;
+}
+
+void clover_inv_nd_32(const int ieo, spinor32 * const l_c, spinor32 * const l_s) { /* stand-alone entry point for clover_inv_nd_32_orphaned */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  clover_inv_nd_32_orphaned(ieo,l_c,l_s); /* with TM_USE_OMP this call sits inside the parallel region opened above */
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+  return;
+}
+
+void clover_gamma5_32_orphaned(const int ieo, 
+		   spinor32 * const l, const spinor32 * const k, const spinor32 * const j,
+		   const double mu) {
+  const float mu32 = (float)mu; /* converted once so that all four spin components use the same single-precision mass */
+  su3_vector32 ALIGN32 chi, psi1, psi2;
+  int ix;
+  int ioff,icx;
+  const su3_32 *w1,*w2,*w3;
+  spinor32 *r;
+  const spinor32 *s,*t;
+  /* Per site: apply the sw_32 blocks plus the twisted mass term to k, combine with j and store in l (upper components: result - j, lower components: j - result). With TM_USE_OMP the loop is a bare "omp for": call only from inside a parallel region (see clover_gamma5_32). */
+  if(ieo == 0) {
+    ioff = 0;
+  } 
+  else {
+    ioff = (VOLUME+RAND)/2;
+  }
+  /* ioff selects the range of g_eo2lexic that is traversed; l, k and j themselves are indexed from 0 */
+/************************ loop over VOLUME/2 lattice sites *************************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(icx = ioff; icx < (VOLUME/2+ioff); icx++) {
+    ix = g_eo2lexic[icx];
+    
+    r = l + icx-ioff;
+    s = k + icx-ioff;
+    t = j + icx-ioff;
+    
+    w1=&sw_32[ix][0][0];
+    w2=w1+2; /*&sw[ix][1][0];*/
+    w3=w1+4; /*&sw[ix][2][0];*/
+    _su3_multiply(psi1,*w1,(*s).s0); 
+    _su3_multiply(chi,*w2,(*s).s1);
+    _vector_add_assign(psi1,chi);
+    _su3_inverse_multiply(psi2,*w2,(*s).s0); 
+    _su3_multiply(chi,*w3,(*s).s1);
+    _vector_add_assign(psi2,chi); 
+    // add in the twisted mass term (plus in the upper components)
+    _vector_add_i_mul(psi1, mu32, (*s).s0);
+    _vector_add_i_mul(psi2, mu32, (*s).s1);
+
+    _vector_sub((*r).s0,psi1,(*t).s0);
+    _vector_sub((*r).s1,psi2,(*t).s1);
+    
+    w1++; /*=&sw[ix][0][1];*/
+    w2++; /*=&sw[ix][1][1];*/
+    w3++; /*=&sw[ix][2][1];*/
+    _su3_multiply(psi1,*w1,(*s).s2); _su3_multiply(chi,*w2,(*s).s3);
+    _vector_add_assign(psi1,chi); 
+    _su3_inverse_multiply(psi2,*w2,(*s).s2); _su3_multiply(chi,*w3,(*s).s3);
+    _vector_add_assign(psi2,chi); 
+    // add in the twisted mass term (minus from g5 in the lower components); same float value as above, not the double mu
+    _vector_add_i_mul(psi1, -mu32, (*s).s2);
+    _vector_add_i_mul(psi2, -mu32, (*s).s3);
+
+    /**************** multiply with  gamma5 included ******************************/
+    _vector_sub((*r).s2,(*t).s2,psi1);
+    _vector_sub((*r).s3,(*t).s3,psi2);
+    /******************************** end of loop *********************************/
+  }
+}
+
+void clover_gamma5_32(const int ieo, 
+		   spinor32 * const l, const spinor32 * const k, const spinor32 * const j,
+		   const double mu) { /* stand-alone entry point for clover_gamma5_32_orphaned */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  clover_gamma5_32_orphaned(ieo,l,k,j,mu); /* with TM_USE_OMP this call sits inside the parallel region opened above */
+#ifdef TM_USE_OMP
+  } /* OMP closing brace */
+#endif
+  return;
+}
+/* Two-field (_s/_c) clover kernel. "Orphaned": it contains an "omp for" but no parallel region, so with TM_USE_OMP the caller has to provide the region (see clover_gamma5_nd_32). */
+void clover_gamma5_nd_32_orphaned(const int ieo, 
+          spinor32 * const l_c, spinor32 * const l_s, 
+          const spinor32 * const k_c, const spinor32 * const k_s, 
+          const spinor32 * const j_c, const spinor32 * const j_s,
+          const float mubar, const float epsbar) {
+  su3_vector32 ALIGN chi, psi1, psi2;
+  int ix;
+  int ioff;
+  const su3_32 *w1,*w2,*w3;
+  spinor32 *r_s, *r_c;
+  const spinor32 *s_s, *s_c, *t_s, *t_c;
+  // ioff is the first index into g_eo2lexic for this ieo; the spinor arrays themselves are indexed from 0
+  if(ieo == 0) {
+    ioff = 0;
+  } 
+  else {
+    ioff = (VOLUME+RAND)/2;
+  }
+  /*********** loop over the VOLUME/2 sites selected by ieo (offset ioff) ************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
+    ix = g_eo2lexic[icx];
+    // r_*: output site (l), s_*: input site (k), t_*: site of the field j that is combined with the result
+    r_s = l_s + icx-ioff;
+    r_c = l_c + icx-ioff;
+    s_s = k_s + icx-ioff;
+    s_c = k_c + icx-ioff;
+    t_s = j_s + icx-ioff;
+    t_c = j_c + icx-ioff;
+    // block form used four times below: psi1 = w1*a + w2*b, psi2 = (inverse-multiply by w2)*a + w3*b
+    // upper two spin components first
+    w1=&sw_32[ix][0][0];
+    w2=w1+2; /*&sw_32[ix][1][0];*/
+    w3=w1+4; /*&sw_32[ix][2][0];*/
+    _su3_multiply(psi1, *w1, (*s_s).s0); 
+    _su3_multiply(chi, *w2, (*s_s).s1);
+    _vector_add_assign(psi1, chi);
+    _su3_inverse_multiply(psi2, *w2, (*s_s).s0); 
+    _su3_multiply(chi, *w3, (*s_s).s1);
+    _vector_add_assign(psi2, chi); 
+
+    // add in the twisted mass term (plus in the upper components)
+    _vector_add_i_mul(psi1, mubar, (*s_s).s0);
+    _vector_add_i_mul(psi2, mubar, (*s_s).s1);
+    // epsbar couples in the other field (_c into _s)
+    _vector_add_mul(psi1, epsbar, (*s_c).s0);
+    _vector_add_mul(psi2, epsbar, (*s_c).s1);
+    // upper components: result = (computed term) - j
+    _vector_sub((*r_s).s0, psi1, (*t_s).s0);
+    _vector_sub((*r_s).s1, psi2, (*t_s).s1);
+    // same again for the _c field
+    _su3_multiply(psi1, *w1, (*s_c).s0); 
+    _su3_multiply(chi, *w2, (*s_c).s1);
+    _vector_add_assign(psi1, chi);
+    _su3_inverse_multiply(psi2, *w2, (*s_c).s0); 
+    _su3_multiply(chi, *w3, (*s_c).s1);
+    _vector_add_assign(psi2, chi); 
+
+    // twisted mass term for the _c field: opposite sign of mubar to the _s field above
+    _vector_add_i_mul(psi1, -mubar, (*s_c).s0);
+    _vector_add_i_mul(psi2, -mubar, (*s_c).s1);
+
+    _vector_add_mul(psi1, epsbar, (*s_s).s0);
+    _vector_add_mul(psi2, epsbar, (*s_s).s1);
+
+    _vector_sub((*r_c).s0, psi1, (*t_c).s0);
+    _vector_sub((*r_c).s1, psi2, (*t_c).s1);
+
+
+    // now lower two spin components
+    w1++; /*=&sw_32[ix][0][1];*/
+    w2++; /*=&sw_32[ix][1][1];*/
+    w3++; /*=&sw_32[ix][2][1];*/
+    _su3_multiply(psi1, *w1, (*s_s).s2); 
+    _su3_multiply(chi, *w2, (*s_s).s3);
+    _vector_add_assign(psi1, chi); 
+    _su3_inverse_multiply(psi2, *w2, (*s_s).s2); 
+    _su3_multiply(chi, *w3, (*s_s).s3);
+    _vector_add_assign(psi2, chi); 
+
+    // add in the twisted mass term (minus from g5 in the lower components)
+    _vector_add_i_mul(psi1, -mubar, (*s_s).s2);
+    _vector_add_i_mul(psi2, -mubar, (*s_s).s3);
+
+    _vector_add_mul(psi1, epsbar, (*s_c).s2);
+    _vector_add_mul(psi2, epsbar, (*s_c).s3);
+    // lower components: operands swapped, result = j - (computed term)
+    _vector_sub((*r_s).s2, (*t_s).s2, psi1);
+    _vector_sub((*r_s).s3, (*t_s).s3, psi2);
+
+    _su3_multiply(psi1, *w1, (*s_c).s2); 
+    _su3_multiply(chi, *w2, (*s_c).s3);
+    _vector_add_assign(psi1, chi); 
+    _su3_inverse_multiply(psi2, *w2, (*s_c).s2); 
+    _su3_multiply(chi, *w3, (*s_c).s3);
+    _vector_add_assign(psi2, chi); 
+
+    // twisted mass term for the _c field in the lower components: +mubar, opposite to the _s field
+    _vector_add_i_mul(psi1, mubar, (*s_c).s2);
+    _vector_add_i_mul(psi2, mubar, (*s_c).s3);
+
+    _vector_add_mul(psi1, epsbar, (*s_s).s2);
+    _vector_add_mul(psi2, epsbar, (*s_s).s3);
+
+    _vector_sub((*r_c).s2, (*t_c).s2, psi1);
+    _vector_sub((*r_c).s3, (*t_c).s3, psi2);
+  }
+}
+/* Wrapper for clover_gamma5_nd_32_orphaned: opens the OpenMP parallel region when TM_USE_OMP is defined; note the argument order (_c before _s for l, k and j). */
+void clover_gamma5_nd_32(const int ieo, 
+          spinor32 * const l_c, spinor32 * const l_s, 
+          const spinor32 * const k_c, const spinor32 * const k_s, 
+          const spinor32 * const j_c, const spinor32 * const j_s,
+          const float mubar, const float epsbar) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  clover_gamma5_nd_32_orphaned(ieo,l_c,l_s,k_c,k_s,j_c,j_s,mubar,epsbar); /* forwarded unchanged */
+#ifdef TM_USE_OMP
+  } /* OpenMP parallel closing brace */
+#endif
+}
+
+
+
+/**************************************************************
+ *
+ * assign_mul_one_sw_pm_imu_eps_32 applies
+ * (1 + T + i*mu*gamma5*tau3 + eps*tau1) to the spinor pair
+ * (l_s, l_c) and stores the result in (k_s, k_c)
+ *
+ * it is assumed that the clover leaf is computed and stored
+ * in sw_32[VOLUME][3][2]
+ * the corresponding routine can be found in clover_leaf.c
+ *
+ **************************************************************/
+
+void assign_mul_one_sw_pm_imu_eps_32_orphaned(const int ieo, 
+          spinor32 * const k_s, spinor32 * const k_c, 
+          const spinor32 * const l_s, const spinor32 * const l_c,
+          const float mu, const float eps) {
+  su3_vector32 ALIGN chi, psi1, psi2;
+  int ix;
+  int ioff;
+  const su3_32 *w1, *w2, *w3;
+  spinor32 *r_s, *r_c;
+  const spinor32 *s_s, *s_c;
+  // ioff is the first index into g_eo2lexic for this ieo; the spinor arrays themselves are indexed from 0
+  if(ieo == 0) {
+    ioff = 0;
+  } 
+  else {
+    ioff = (VOLUME+RAND)/2;
+  }
+  /*********** loop over the VOLUME/2 sites selected by ieo (offset ioff) ************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
+    ix = g_eo2lexic[icx];
+    // r_*: output site (k), s_*: input site (l) -- note that k is the output in this routine
+    r_s = k_s + icx-ioff;
+    r_c = k_c + icx-ioff;
+    s_s = l_s + icx-ioff;
+    s_c = l_c + icx-ioff;
+    // block form used four times below: psi1 = w1*a + w2*b, psi2 = (inverse-multiply by w2)*a + w3*b
+    // upper two spin components first
+    w1=&sw_32[ix][0][0];
+    w2=w1+2; /*&sw_32[ix][1][0];*/
+    w3=w1+4; /*&sw_32[ix][2][0];*/
+    _su3_multiply(psi1, *w1, (*s_s).s0); 
+    _su3_multiply(chi, *w2, (*s_s).s1);
+    _vector_add_assign(psi1, chi);
+    _su3_inverse_multiply(psi2, *w2, (*s_s).s0); 
+    _su3_multiply(chi, *w3, (*s_s).s1);
+    _vector_add_assign(psi2, chi); 
+
+    // add in the twisted mass term (plus in the upper components)
+    _vector_add_i_mul(psi1, mu, (*s_s).s0);
+    _vector_add_i_mul(psi2, mu, (*s_s).s1);
+    // eps couples in the other field (_c into _s)
+    _vector_add_mul(psi1, eps, (*s_c).s0);
+    _vector_add_mul(psi2, eps, (*s_c).s1);
+    // store the result for the _s field
+    _vector_assign((*r_s).s0, psi1);
+    _vector_assign((*r_s).s1, psi2);
+    // same again for the _c field
+    _su3_multiply(psi1, *w1, (*s_c).s0); 
+    _su3_multiply(chi, *w2, (*s_c).s1);
+    _vector_add_assign(psi1, chi);
+    _su3_inverse_multiply(psi2, *w2, (*s_c).s0); 
+    _su3_multiply(chi, *w3, (*s_c).s1);
+    _vector_add_assign(psi2, chi); 
+
+    // twisted mass term for the _c field: opposite sign of mu to the _s field above
+    _vector_add_i_mul(psi1, -mu, (*s_c).s0);
+    _vector_add_i_mul(psi2, -mu, (*s_c).s1);
+
+    _vector_add_mul(psi1, eps, (*s_s).s0);
+    _vector_add_mul(psi2, eps, (*s_s).s1);
+
+    _vector_assign((*r_c).s0, psi1);
+    _vector_assign((*r_c).s1, psi2);
+
+    // now lower two spin components
+    w1++; /*=&sw_32[ix][0][1];*/
+    w2++; /*=&sw_32[ix][1][1];*/
+    w3++; /*=&sw_32[ix][2][1];*/
+    _su3_multiply(psi1, *w1, (*s_s).s2); 
+    _su3_multiply(chi, *w2, (*s_s).s3);
+    _vector_add_assign(psi1, chi); 
+    _su3_inverse_multiply(psi2, *w2, (*s_s).s2); 
+    _su3_multiply(chi, *w3, (*s_s).s3);
+    _vector_add_assign(psi2, chi); 
+
+    // add in the twisted mass term (minus from g5 in the lower components)
+    _vector_add_i_mul(psi1, -mu, (*s_s).s2);
+    _vector_add_i_mul(psi2, -mu, (*s_s).s3);
+
+    _vector_add_mul(psi1, eps, (*s_c).s2);
+    _vector_add_mul(psi2, eps, (*s_c).s3);
+
+    _vector_assign((*r_s).s2, psi1);
+    _vector_assign((*r_s).s3, psi2);
+
+    _su3_multiply(psi1, *w1, (*s_c).s2); 
+    _su3_multiply(chi, *w2, (*s_c).s3);
+    _vector_add_assign(psi1, chi); 
+    _su3_inverse_multiply(psi2, *w2, (*s_c).s2); 
+    _su3_multiply(chi, *w3, (*s_c).s3);
+    _vector_add_assign(psi2, chi); 
+
+    // twisted mass term for the _c field in the lower components: +mu, opposite to the _s field
+    _vector_add_i_mul(psi1, mu, (*s_c).s2);
+    _vector_add_i_mul(psi2, mu, (*s_c).s3);
+
+    _vector_add_mul(psi1, eps, (*s_s).s2);
+    _vector_add_mul(psi2, eps, (*s_s).s3);
+
+    _vector_assign((*r_c).s2, psi1);
+    _vector_assign((*r_c).s3, psi2);
+
+  }
+}
+/* Wrapper for assign_mul_one_sw_pm_imu_eps_32_orphaned: opens the OpenMP parallel region when TM_USE_OMP is defined and forwards all arguments unchanged. */
+void assign_mul_one_sw_pm_imu_eps_32(const int ieo, 
+          spinor32 * const k_s, spinor32 * const k_c, 
+          const spinor32 * const l_s, const spinor32 * const l_c,
+          const float mu, const float eps) {
+  #ifdef TM_USE_OMP
+  #pragma omp parallel
+  {
+  #endif
+  assign_mul_one_sw_pm_imu_eps_32_orphaned(ieo,k_s,k_c,l_s,l_c,mu,eps);
+  #ifdef TM_USE_OMP
+  } /* OpenMP parallel closing brace */
+  #endif
+}
+/* Re-define SSE/SSE2/SSE3 when the matching REDEFSSE* marker is set; presumably the markers are set where these macros were undefined earlier in this file (not visible here) -- TODO confirm. */
+#ifdef REDEFSSE
+#undef REDEFSSE
+#define SSE
+#endif
+
+#ifdef REDEFSSE2
+#undef REDEFSSE2
+#define SSE2
+#endif
+
+#ifdef REDEFSSE3
+#undef REDEFSSE3
+#define SSE3
+#endif
diff --git a/operator/clovertm_operators_32.h b/operator/clovertm_operators_32.h
new file mode 100644
index 000000000..fcf2b17eb
--- /dev/null
+++ b/operator/clovertm_operators_32.h
@@ -0,0 +1,68 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2005 Martin Hasenbusch
+ *               2009 Carsten Urbach
+ *               2012 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _CLOVERTM_OPERATORS_32_H
+#define _CLOVERTM_OPERATORS_32_H
+/* NOTE(review): spinor32 is used below but only su3.h is included; presumably su3.h or the includer declares it -- confirm. */
+#include "su3.h"
+/* clover-term arrays, defined in another translation unit; the su3_32 variants are presumably the single-precision copies */
+extern su3 *** sw;
+extern su3 *** sw_inv;
+extern su3_32 *** sw_32;
+extern su3_32 *** sw_inv_32;
+extern su3 ** swm, ** swp;
+/* single-field operators; each *_orphaned variant has the same signature as the function without the suffix */
+void clover_inv_32_orphaned(spinor32 * const l, const int tau3sign, const double mu);
+void clover_inv_32(spinor32 * const l, const int tau3sign, const double mu);
+void Qsw_pm_psi_32(spinor32 * const l, spinor32 * const k);
+void clover_gamma5_32_orphaned(const int ieo, 
+		   spinor32 * const l, const spinor32 * const k, const spinor32 * const j,
+		   const double mu);
+void clover_gamma5_32(const int ieo, 
+		   spinor32 * const l, const spinor32 * const k, const spinor32 * const j,
+		   const double mu);
+/* two-field (_s/_c) operators. NOTE: the argument order is (_s, _c) here but (_c, _s) for clover_gamma5_nd_32 below */
+void assign_mul_one_sw_pm_imu_eps_32(const int ieo, 
+          spinor32 * const k_s, spinor32 * const k_c, 
+          const spinor32 * const l_s, const spinor32 * const l_c,
+          const float mu, const float eps);
+void assign_mul_one_sw_pm_imu_eps_32_orphaned(const int ieo, 
+          spinor32 * const k_s, spinor32 * const k_c, 
+          const spinor32 * const l_s, const spinor32 * const l_c,
+          const float mu, const float eps);
+/* NOTE(review): the two-field routines take float mass parameters while clover_gamma5_32 above takes double mu */
+void clover_gamma5_nd_32(const int ieo,
+          spinor32 * const l_c, spinor32 * const l_s,
+          const spinor32 * const k_c, const spinor32 * const k_s,
+          const spinor32 * const j_c, const spinor32 * const j_s,
+          const float mubar, const float epsbar);
+void clover_gamma5_nd_32_orphaned(const int ieo,
+          spinor32 * const l_c, spinor32 * const l_s,
+          const spinor32 * const k_c, const spinor32 * const k_s,
+          const spinor32 * const j_c, const spinor32 * const j_s,
+          const float mubar, const float epsbar);
+
+void clover_inv_nd_32(const int ieo, spinor32 * const l_c, spinor32 * const l_s);
+void clover_inv_nd_32_orphaned(const int ieo, spinor32 * const l_c, spinor32 * const l_s);
+
+#endif /* _CLOVERTM_OPERATORS_32_H */
+
diff --git a/operator/halfspinor_bg_dbl.c b/operator/halfspinor_bg_dbl.c
index c052eeeee..7d73c6e24 100644
--- a/operator/halfspinor_bg_dbl.c
+++ b/operator/halfspinor_bg_dbl.c
@@ -119,7 +119,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
       /************************ end of loop ************************/
     }
 
-#    if (defined MPI && !defined _NO_COMM)
+#    if (defined TM_USE_MPI && !defined _NO_COMM)
     xchange_halffield32(); 
 #    endif
     s = l;
@@ -233,7 +233,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
       /************************ end of loop ************************/
     }
 
-#    if (defined MPI && !defined _NO_COMM)
+#    if (defined TM_USE_MPI && !defined _NO_COMM)
     xchange_halffield(); 
 #    endif
     s = l;
diff --git a/operator/halfspinor_bgq_dbl.c b/operator/halfspinor_bgq_dbl.c
index 0eb5516ef..77b927476 100644
--- a/operator/halfspinor_bgq_dbl.c
+++ b/operator/halfspinor_bgq_dbl.c
@@ -111,7 +111,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
       /************************ end of loop ************************/
     }
 
-#    if (defined MPI && !defined _NO_COMM)
+#    if (defined TM_USE_MPI && !defined _NO_COMM)
     xchange_halffield32(); 
 #    endif
     s = l;
@@ -226,7 +226,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
 
     }
 
-#    if (defined MPI && !defined _NO_COMM)
+#    if (defined TM_USE_MPI && !defined _NO_COMM)
     xchange_halffield(); 
 #    endif
     s = l;
diff --git a/operator/halfspinor_body.c b/operator/halfspinor_body.c
index 69e8d55f5..2bdfbb67f 100644
--- a/operator/halfspinor_body.c
+++ b/operator/halfspinor_body.c
@@ -49,7 +49,7 @@ __alignx(32, s);
 #pragma pomp inst begin(hoppingmatrix)
 #endif
 
-#ifndef OMP  
+#ifndef TM_USE_OMP  
 s = k;
 _prefetch_spinor(s);
 if(ieo == 0) {
@@ -73,13 +73,13 @@ g_sloppy_precision = 0;
 if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
   phi32 = NBPointer32[ieo];
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #else
   ix=0;
 #endif
   for(unsigned int i = 0; i < (VOLUME)/2; i++){
-#ifdef OMP
+#ifdef TM_USE_OMP
     U=u0+i*4;
     s=k+i;
     ix=i*8;
@@ -111,18 +111,18 @@ if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
     
     _hop_z_m_pre32();
     
-#ifndef OMP
+#ifndef TM_USE_OMP
     s++;
     ix++;
 #endif
   }
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp single
   {
 #endif
     
-#    if (defined MPI && !defined _NO_COMM)
+#    if (defined TM_USE_MPI && !defined _NO_COMM)
 #      ifdef SPI
 
      // Initialize the barrier, resetting the hardware.
@@ -150,11 +150,11 @@ if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
 #      endif
 #    endif
     
-#ifdef OMP
+#ifdef TM_USE_OMP
   }
 #endif
   
-#ifndef OMP
+#ifndef TM_USE_OMP
   s = l;
   if(ieo == 0) {
     U = g_gauge_field_copy[1][0];
@@ -173,13 +173,13 @@ if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
   
   phi32 = NBPointer32[2 + ieo];
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #else
   ix = 0;
 #endif
   for(unsigned int i = 0; i < (VOLUME)/2; i++){
-#ifdef OMP
+#ifdef TM_USE_OMP
     ix=i*8;
     s=l+i;
     U=u0+i*4;
@@ -221,7 +221,7 @@ if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
     _hop_store_post(s);
 #endif
     
-#ifndef OMP
+#ifndef TM_USE_OMP
     U++;
     ix++;
     s++;
@@ -231,13 +231,13 @@ if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
  else {
    phi = NBPointer[ieo];
    
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #else
    ix=0;
 #endif
    for(unsigned int i = 0; i < (VOLUME)/2; i++){
-#ifdef OMP
+#ifdef TM_USE_OMP
      s=k+i;
      _prefetch_spinor(s);
      ix=i*8;
@@ -272,18 +272,18 @@ if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
      
      _hop_z_m_pre();
      
-#ifndef OMP
+#ifndef TM_USE_OMP
      s++;            
      ix++;
 #endif
    }
    
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp single
    {
 #endif
      
-#    if (defined MPI && !defined _NO_COMM)
+#    if (defined TM_USE_MPI && !defined _NO_COMM)
 #      ifdef SPI
 
      // Initialize the barrier, resetting the hardware.
@@ -312,11 +312,11 @@ if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
 #      endif // SPI
 #    endif
      
-#ifdef OMP
+#ifdef TM_USE_OMP
    }
 #endif
    
-#ifndef OMP
+#ifndef TM_USE_OMP
    s = l;
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
@@ -336,14 +336,14 @@ if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    
    phi = NBPointer[2 + ieo];
    
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #else
    ix = 0;
 #endif
    /* #pragma ivdep */
    for(unsigned int i = 0; i < (VOLUME)/2; i++){
-#ifdef OMP
+#ifdef TM_USE_OMP
      ix=i*8;
      U=u0+i*4;
      _prefetch_su3(U);
@@ -387,7 +387,7 @@ if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
      _hop_store_post(s);
 #endif
      
-#ifndef OMP
+#ifndef TM_USE_OMP
      U++;
      ix++;
      s++;
diff --git a/operator/halfspinor_body_32.c b/operator/halfspinor_body_32.c
new file mode 100644
index 000000000..e7aecb26a
--- /dev/null
+++ b/operator/halfspinor_body_32.c
@@ -0,0 +1,228 @@
+/**********************************************************************
+ * single precision version Copyright (C) 2013 Florian Burger
+ * based on halfspinor_body.c by Carsten Urbach
+ *
+ * This file is based on an implementation of the Dirac operator 
+ * written by Martin Luescher, modified by Martin Hasenbusch in 2002 
+ * this is a new version based on the aforementioned implementations
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ **********************************************************************/
+
+/* Fragment without a function header: it uses ieo, l, k, ka0..ka3 (and u0 under TM_USE_OMP) that are not declared here, so it is presumably #included into a function body that supplies them. */
+int ix;
+su3_32 * restrict U ALIGN32;
+spinor32 * restrict s ALIGN32;
+halfspinor32 * restrict * phi2 ALIGN32;
+_declare_hregs();
+
+#ifdef XLC
+# pragma disjoint(*l, *k)
+# pragma disjoint(*k, *U)
+# pragma disjoint(*l, *U)
+# pragma disjoint(*U, *s)
+# pragma disjoint(*k, *s)
+# pragma disjoint(*l, *s)
+__alignx(16, l);
+__alignx(16, k);
+__alignx(16, U);
+__alignx(16, s);
+#endif 
+
+// single-precision local copies of ka0..ka3; the _hop_*32 macros refer to these ka*_32 names
+_Complex float ALIGN32 ka0_32 = (_Complex float) ka0;
+_Complex float ALIGN32 ka1_32 = (_Complex float) ka1;
+_Complex float ALIGN32 ka2_32 = (_Complex float) ka2;
+_Complex float ALIGN32 ka3_32 = (_Complex float) ka3;
+
+#ifndef TM_USE_OMP  
+s = k;
+_prefetch_spinor_32(s);
+if(ieo == 0) {
+  U = g_gauge_field_copy_32[0][0];
+ }
+ else {
+   U = g_gauge_field_copy_32[1][0];
+ }
+_prefetch_su3_32(U);
+#else
+if(ieo == 0) {
+  u0 = g_gauge_field_copy_32[0][0];
+ }
+ else {
+   u0 = g_gauge_field_copy_32[1][0];
+ }
+#endif
+/* without OpenMP s and U are running pointers; with OpenMP only the base u0 is set and s, U are recomputed per iteration */
+  phi2 = NBPointer32[ieo];
+  
+#ifdef TM_USE_OMP
+#pragma omp for
+#else
+  ix=0;
+#endif
+  for(unsigned int i = 0; i < (VOLUME)/2; i++){ /* per site: 8 slots phi2[ix] (ix = 8*i .. 8*i+7) and 4 links starting at U */
+#ifdef TM_USE_OMP
+    U=u0+i*4;
+    s=k+i;
+    ix=i*8;
+#endif
+    _hop_t_p_pre32();
+    U++;
+    ix++;
+    
+    _hop_t_m_pre32();
+    ix++;
+    
+    _hop_x_p_pre32();
+    U++;
+    ix++;
+    
+    _hop_x_m_pre32();
+    ix++;
+    
+    _hop_y_p_pre32();
+    U++;
+    ix++;
+    
+    _hop_y_m_pre32();
+    ix++;
+    
+    _hop_z_p_pre32();
+    U++;
+    ix++;
+    
+    _hop_z_m_pre32();
+    /* the last ix step and the s step are only needed for the running pointers of the non-OpenMP path */
+#ifndef TM_USE_OMP
+    s++;
+    ix++;
+#endif
+  }
+  
+#ifdef TM_USE_OMP
+#pragma omp single
+  {
+#endif
+    /* one thread performs the exchange; "omp single" without nowait ends with an implicit barrier for the other threads */
+#    if (defined TM_USE_MPI && !defined _NO_COMM)
+#      ifdef SPI
+
+     // Initialize the barrier, resetting the hardware.
+     int rc = MUSPI_GIBarrierInit ( &GIBarrier, 0 /*comm world class route*/  );
+     if(rc) {
+       printf("MUSPI_GIBarrierInit returned rc = %d\n", rc);
+       exit(__LINE__);
+     }
+     // reset the recv counter 
+     recvCounter = totalMessageSize/2;
+     global_barrier(); // make sure everybody is set recv counter
+
+     //#pragma omp for nowait
+     for (unsigned int j = 0; j < spi_num_dirs; j++) {
+       descCount[ j ] =
+	 msg_InjFifoInject ( injFifoHandle,
+			     j,
+			     &SPIDescriptors32[j]);
+     }
+     // wait for receive completion
+     while ( recvCounter > 0 );
+     _bgq_msync();
+#      else
+    xchange_halffield32(); 
+#      endif
+#    endif
+    
+#ifdef TM_USE_OMP
+  }
+#endif
+ 
+#ifndef TM_USE_OMP
+  s = l;
+  if(ieo == 0) {
+    U = g_gauge_field_copy_32[1][0];
+  }
+  else {
+    U = g_gauge_field_copy_32[0][0];
+  }
+#else
+  if(ieo == 0) {
+    u0 = g_gauge_field_copy_32[1][0];
+  }
+  else {
+    u0 = g_gauge_field_copy_32[0][0];
+  }
+#endif
+  /* second pass: writes to l; gauge-field copy index is the opposite of the first pass (ieo==0 -> [1]) and the table is NBPointer32[2 + ieo] */
+  phi2 = NBPointer32[2 + ieo];
+  
+#ifdef TM_USE_OMP
+#pragma omp for
+#else
+  ix = 0;
+#endif
+  for(unsigned int i = 0; i < (VOLUME)/2; i++){ /* per site: read the 8 slots phi2[ix], accumulate, store one spinor to s */
+#ifdef TM_USE_OMP
+    ix=i*8;
+    s=l+i;
+    U=u0+i*4;
+#endif
+#ifdef _TM_SUB_HOP
+     pn=p+i;
+#endif
+    _hop_t_p_post32();
+    ix++;
+    
+    _hop_t_m_post32();
+    ix++;
+    U++;
+    
+    _hop_x_p_post32();
+    ix++;
+    
+    _hop_x_m_post32();
+    U++;
+    ix++;
+    
+    _hop_y_p_post32();
+    ix++;
+    
+    _hop_y_m_post32();
+    U++;
+    ix++;
+    
+    _hop_z_p_post32();
+    ix++;
+    
+    _hop_z_m_post32();
+    /* exactly one of the three store variants is compiled in, chosen by _MUL_G5_CMPLX / _TM_SUB_HOP */
+#ifdef _MUL_G5_CMPLX
+    _hop_mul_g5_cmplx_and_store32(s);
+#elif defined _TM_SUB_HOP
+     _g5_cmplx_sub_hop_and_g5store32(s);
+#else
+    _hop_store_post32(s);
+#endif
+    
+#ifndef TM_USE_OMP
+    U++;
+    ix++;
+    s++;
+#endif
+  }
+ 
+
diff --git a/operator/halfspinor_hopping_32.h b/operator/halfspinor_hopping_32.h
new file mode 100644
index 000000000..697fa1496
--- /dev/null
+++ b/operator/halfspinor_hopping_32.h
@@ -0,0 +1,408 @@
+/**********************************************************************
+ *
+ * Copyright (C) 2013  Florian Burger
+ *
+ * A 32-bit version of the Half-spinor implementation by Carsten Urbach
+ *
+ * This file is based on an implementation of the Dirac operator 
+ * written by Martin Luescher, modified by Martin Hasenbusch in 2002 
+ * and modified and extended by Carsten Urbach from 2003-2008
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ **********************************************************************/
+
+#ifndef _HALFSPINOR_HOPPING32_H
+#define _HALFSPINOR_HOPPING32_H
+
+#if (defined BGQ && defined XLC)
+/* BG/Q (XLC) "pre" macros: each combines spin components of *s for one direction and stores the result in phi2[ix]. Only _hop_t_p_pre32 loads rs0..rs7 from s; all other macros here reuse those registers, so it must run first for each site. */
+#define _hop_t_p_pre32()							\
+  _vec_load_32(rs0, rs1, s->s0);						\
+  _vec_load16_32(rs2, rs3, s->s1, rtmp);					\
+  _vec_load_32(rs4, rs5, s->s2);						\
+  _vec_load16_32(rs6, rs7, s->s3, rtmp);					\
+  _prefetch_spinor_32(s+1);						\
+  _prefetch_su3_32(U+1);							\
+  _vec_add(r0, r1, rs0, rs1, rs4, rs5);					\
+  _vec_add(r2, r3, rs2, rs3, rs6, rs7);					\
+  _vec_su3_multiply_double2c_32(U);					\
+  rtmp = vec_ld2(0, (float*) &ka0_32);					\
+  _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp);		\
+  _vec_store_halfspinor_32(phi2[ix]->s0, r0, r1, r2);
+
+
+/* the *_m_pre32 variants only combine and store: no su3 or ka*_32 multiplication at this stage */
+#define _hop_t_m_pre32()						\
+  _vec_sub(r0, r1, rs0, rs1, rs4, rs5);				\
+  _vec_sub(r2, r3, rs2, rs3, rs6, rs7);				\
+  _vec_store_32(phi2[ix]->s0, r0, r1);				\
+  _vec_store16_32(phi2[ix]->s1, r2, r3, U0);
+
+
+#define _hop_x_p_pre32()						\
+  _prefetch_su3_32(U+1);						\
+  _vec_i_mul_add(r0, r1, rs0, rs1, rs6, rs7, U0);		\
+  _vec_i_mul_add(r2, r3, rs2, rs3, rs4, rs5, U0);		\
+  rtmp = vec_ld2(0, (float*) &ka1_32);				\
+  _vec_su3_multiply_double2c_32(U);				\
+  _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp);	\
+  _vec_store_halfspinor_32(phi2[ix]->s0, r0, r1, r2);
+
+
+#define _hop_x_m_pre32()					\
+  _vec_i_mul_sub(r0, r1, rs0, rs1, rs6, rs7, U0);	\
+  _vec_i_mul_sub(r2, r3, rs2, rs3, rs4, rs5, U0);	\
+  _vec_store_32(phi2[ix]->s0, r0, r1);			\
+  _vec_store16_32(phi2[ix]->s1, r2, r3, U0);
+
+
+#define _hop_y_p_pre32()						\
+  _prefetch_su3_32(U+1);						\
+  _vec_add(r0, r1, rs0, rs1, rs6, rs7);				\
+  _vec_sub(r2, r3, rs2, rs3, rs4, rs5);				\
+  rtmp = vec_ld2(0, (float*) &ka2_32);				\
+  _vec_su3_multiply_double2c_32(U);				\
+  _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp);	\
+  _vec_store_halfspinor_32(phi2[ix]->s0, r0, r1, r2);
+
+
+
+#define _hop_y_m_pre32()				\
+  _vec_sub(r0, r1, rs0, rs1, rs6, rs7);		\
+  _vec_add(r2, r3, rs2, rs3, rs4, rs5);		\
+  _vec_store_32(phi2[ix]->s0, r0, r1);		\
+  _vec_store16_32(phi2[ix]->s1, r2, r3, U0);
+
+  
+#define _hop_z_p_pre32()						\
+  _prefetch_su3_32(U+1);						\
+  _vec_i_mul_add(r0, r1, rs0, rs1, rs4, rs5, U0);		\
+  _vec_i_mul_sub(r2, r3, rs2, rs3, rs6, rs7, U0);		\
+  rtmp = vec_ld2(0, (float*) &ka3_32);				\
+  _vec_su3_multiply_double2c_32(U);				\
+  _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp);	\
+  _vec_store_halfspinor_32(phi2[ix]->s0, r0, r1, r2);
+
+
+#define _hop_z_m_pre32()					\
+  _vec_i_mul_sub(r0, r1, rs0, rs1, rs4, rs5, U0);	\
+  _vec_i_mul_add(r2, r3, rs2, rs3, rs6, rs7, U0);	\
+  _vec_store_32(phi2[ix]->s0, r0, r1);			\
+  _vec_store16_32(phi2[ix]->s1, r2, r3, U0);
+
+/* BG/Q (XLC) "post" macros: _hop_t_p_post32 initialises the accumulators rs0..rs11 from phi2[ix]; every later macro adds its direction's contribution, so the t_p macro must run first for each site. */
+#define _hop_t_p_post32()				\
+  _vec_load_halfspinor_32(rs0, rs1, rs2, phi2[ix]->s0);	\
+  _vec_unfuse(rs0, rs1, rs2, rs3, rs4, rs5);		\
+  rs6 = rs0; rs7 = rs1; rs8 = rs2;			\
+  rs9 = rs3; rs10= rs4; rs11= rs5;
+
+/* the *_m_post32 variants apply the inverse su3 multiply and the "cmplxcg" (presumably complex-conjugate) ka*_32 factor before accumulating */
+#define _hop_t_m_post32()						\
+  _prefetch_su3_32(U+1);							\
+  _vec_load_32(r0, r1, phi2[ix]->s0);					\
+  _vec_load16_32(r2, r3, phi2[ix]->s1, rtmp);				\
+  rtmp = vec_ld2(0, (float*) &ka0_32);					\
+  _vec_su3_inverse_multiply_double2c_32(U);				\
+  _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp);		\
+  _vec_unfuse(r0, r1, r2, r3, r4, r5);					\
+  _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
+  _vec_sub_double2(rs6, rs7, rs8, rs9, rs10, rs11, r0, r1, r2, r3, r4, r5);
+
+
+#define _hop_x_p_post32()						\
+  _vec_load_halfspinor_32(r0, r1, r2, phi2[ix]->s0);			\
+  _vec_unfuse(r0, r1, r2, r3, r4, r5);				\
+  _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
+  _vec_i_mul_sub2(rs6, rs7, rs8, r3, r4, r5, U0);			\
+  _vec_i_mul_sub2(rs9, rs10, rs11, r0, r1, r2, U1);
+  
+
+#define _hop_x_m_post32()						\
+  _prefetch_su3_32(U+1);							\
+  _vec_load_32(r0, r1, phi2[ix]->s0);					\
+  _vec_load16_32(r2, r3, phi2[ix]->s1, rtmp);				\
+  rtmp = vec_ld2(0, (float*) &ka1_32);					\
+  _vec_su3_inverse_multiply_double2c_32(U);				\
+  _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp);		\
+  _vec_unfuse(r0, r1, r2, r3, r4, r5);					\
+  _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
+  _vec_i_mul_add_double2(rs9, rs10, rs11, rs6, rs7, rs8, r0, r1, r2, r3, r4, r5, U0);
+
+
+
+#define _hop_y_p_post32()						\
+  _vec_load_halfspinor_32(r0, r1, r2, phi2[ix]->s0);			\
+  _vec_unfuse(r0, r1, r2, r3, r4, r5);				\
+  _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5);	\
+  _vec_sub2(rs6, rs7, rs8, r3, r4, r5);					\
+  _vec_add2(rs9, rs10, rs11, r0, r1, r2);
+
+
+#define _hop_y_m_post32()						\
+  _prefetch_su3_32(U+1);							\
+  _vec_load_32(r0, r1, phi2[ix]->s0);					\
+  _vec_load16_32(r2, r3, phi2[ix]->s1, rtmp);				\
+  rtmp = vec_ld2(0, (float*) &ka2_32);					\
+  _vec_su3_inverse_multiply_double2c_32(U);				\
+  _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp);		\
+  _vec_unfuse(r0, r1, r2, r3, r4, r5);					\
+  _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
+  _vec_add2(rs6, rs7, rs8, r3, r4, r5);					\
+  _vec_sub2(rs9, rs10, rs11, r0, r1, r2);
+
+
+#define _hop_z_p_post32()						\
+  _vec_load_halfspinor_32(r0, r1, r2, phi2[ix]->s0);			\
+  _vec_unfuse(r0, r1, r2, r3, r4, r5);				\
+  _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
+  _vec_i_mul_sub2(rs6, rs7, rs8, r0, r1, r2, U0);			\
+  _vec_i_mul_add2(rs9, rs10, rs11, r3, r4, r5, U1);
+
+
+#define _hop_z_m_post32()						\
+  _prefetch_su3_32(U+1);							\
+  _vec_load_32(r0, r1, phi2[ix]->s0);					\
+  _vec_load16_32(r2, r3, phi2[ix]->s1, rtmp);				\
+  rtmp = vec_ld2(0, (float*) &ka3_32);					\
+  _vec_su3_inverse_multiply_double2c_32(U);				\
+  _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp);		\
+  _vec_unfuse(r0, r1, r2, r3, r4, r5);					\
+  _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
+  _vec_i_mul_add2(rs6, rs7, rs8, r0, r1, r2, U0);			\
+  _vec_i_mul_sub2(rs9, rs10, rs11, r3, r4, r5, U1);
+
+  
+  
+  
+//end new versions
+  
+  
+  
+  
+/* BG/Q (XLC) store macros: write the accumulators rs0..rs11 to the spinor (res). They use cf (and pn) which _declare_hregs does not declare, so the including code has to provide them. */
+#define _hop_mul_g5_cmplx_and_store32(res)					\
+  _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, rs0, rs1, rs2, rs3, rs4, rs5, cf); \
+  _vec_cmplxcg_mul_double2(r6, r7, r8, r9, r10, r11, rs6, rs7, rs8, rs9, rs10, rs11, cf); \
+  _vec_store2_32((res)->s0, r0, r1, r2);					\
+  _vec_store2_32((res)->s1, r3, r4, r5);					\
+  _vec_store2_32((res)->s2, r6, r7, r8);					\
+  _vec_store2_32((res)->s3, r9, r10, r11);
+/* NOTE(review): the macro below uses _vec_cmplx_mul_double2c_32 for the upper half but _vec_cmplxcg_mul_double2c (no _32 suffix) for the lower half -- confirm this is intended. */
+#define _g5_cmplx_sub_hop_and_g5store32(res)					\
+  _vec_load_halfspinor_32(r3, r4, r5, pn->s0);				\
+  _vec_cmplx_mul_double2c_32(r0, r1, r2, r3, r4, r5, cf);			\
+  _vec_unfuse(r0, r1, r2, r3, r4, r5);					\
+  _vec_sub_double2(r0, r3, r1, r4, r2, r5, rs0, rs1, rs2, rs3, rs4, rs5); \
+  _vec_store2_32((res)->s0, r0, r3, r1);					\
+  _vec_store2_32((res)->s1, r4, r2, r5);					\
+  _vec_load_halfspinor_32(r3, r4, r5, pn->s2);				\
+  _vec_cmplxcg_mul_double2c(r0, r1, r2, r3, r4, r5, cf);		\
+  _vec_unfuse(r0, r1, r2, r3, r4, r5);					\
+  _vec_sub_double2(rs6, rs7, rs8, rs9, rs10, rs11, r0, r3, r1, r4, r2, r5); \
+  _vec_store2_32((res)->s2, rs6, rs7, rs8);					\
+  _vec_store2_32((res)->s3, rs9, rs10, rs11);
+
+#define _hop_store_post32(res)		\
+  _vec_store2_32((res)->s0, rs0, rs1, rs2);	\
+  _vec_store2_32((res)->s1, rs3, rs4, rs5);	\
+  _vec_store2_32((res)->s2, rs6, rs7, rs8);	\
+  _vec_store2_32((res)->s3, rs9, rs10, rs11);
+
+/* declares every register variable named by the BG/Q macros in this header; expands to declarations, so it must appear where declarations are allowed */
+#define _declare_hregs()						\
+  vector4double ALIGN r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;	\
+  vector4double ALIGN rs0, rs1, rs2, rs3, rs4, rs5, rs6, rs7, rs8, rs9, rs10, rs11; \
+  vector4double ALIGN U0, U1, U2, U3, U4, U6, U7;			\
+  vector4double ALIGN rtmp;
+
+#else
+/* Generic C branch: the prefetch macros are redefined to expand to nothing. NOTE: this also overrides the non-_32 names _prefetch_spinor and _prefetch_halfspinor for everything after this header in the including translation unit. */
+#ifdef _prefetch_spinor
+# undef _prefetch_spinor
+#endif
+#define _prefetch_spinor(s)
+#ifdef _prefetch_halfspinor
+# undef _prefetch_halfspinor
+#endif
+#define _prefetch_halfspinor(hs)
+#ifdef _prefetch_spinor_32
+# undef _prefetch_spinor_32
+#endif
+#define _prefetch_spinor_32(s)
+#ifdef _prefetch_su3_32
+# undef _prefetch_su3_32
+#endif
+#define _prefetch_su3_32(U)
+
+/* Generic "pre" macros: each writes one half spinor to phi2[ix]. Only _hop_t_p_pre32 copies *s into rs; all other macros here read rs, so it must run first for each site. Multi-statement macros: use only as full statements. */
+#define _hop_t_p_pre32()				\
+  _vector_assign(rs.s0, s->s0);				\
+  _vector_assign(rs.s1, s->s1);				\
+  _vector_assign(rs.s2, s->s2);				\
+  _vector_assign(rs.s3, s->s3);				\
+  _vector_add(psi, rs.s0, rs.s2);			\
+  _su3_multiply(chi,(*U),psi);				\
+  _complex_times_vector(phi2[ix]->s0, ka0_32, chi);	\
+  _vector_add(psi, rs.s1, rs.s3);			\
+  _su3_multiply(chi,(*U),psi);				\
+  _complex_times_vector(phi2[ix]->s1, ka0_32, chi);
+/* the *_m_pre32 variants store the combination directly: no su3 or ka*_32 multiplication at this stage */
+#define _hop_t_m_pre32()				\
+  _vector_sub(phi2[ix]->s0, rs.s0, rs.s2);		\
+  _vector_sub(phi2[ix]->s1, rs.s1, rs.s3);
+
+#define _hop_x_p_pre32()				\
+  _vector_i_add(psi, rs.s0, rs.s3);			\
+  _su3_multiply(chi, (*U), psi);			\
+  _complex_times_vector(phi2[ix]->s0, ka1_32, chi);	\
+  _vector_i_add(psi, rs.s1, rs.s2);			\
+  _su3_multiply(chi, (*U), psi);			\
+  _complex_times_vector(phi2[ix]->s1, ka1_32, chi);
+
+#define _hop_x_m_pre32()				\
+  _vector_i_sub(phi2[ix]->s0, rs.s0, rs.s3);		\
+  _vector_i_sub(phi2[ix]->s1, rs.s1, rs.s2);
+
+#define _hop_y_p_pre32()				\
+  _vector_add(psi, rs.s0, rs.s3);			\
+  _su3_multiply(chi,(*U),psi);				\
+  _complex_times_vector(phi2[ix]->s0, ka2_32, chi);	\
+  _vector_sub(psi, rs.s1, rs.s2);			\
+  _su3_multiply(chi,(*U),psi);				\
+  _complex_times_vector(phi2[ix]->s1, ka2_32, chi);
+
+#define _hop_y_m_pre32()			\
+  _vector_sub(phi2[ix]->s0, rs.s0, rs.s3);	\
+  _vector_add(phi2[ix]->s1, rs.s1, rs.s2);
+
+#define _hop_z_p_pre32()				\
+  _vector_i_add(psi, rs.s0, rs.s2);			\
+  _su3_multiply(chi, (*U), psi);			\
+  _complex_times_vector(phi2[ix]->s0, ka3_32, chi);	\
+  _vector_i_sub(psi, rs.s1, rs.s3);			\
+  _su3_multiply(chi,(*U),psi);				\
+  _complex_times_vector(phi2[ix]->s1, ka3_32, chi);
+
+#define _hop_z_m_pre32()			\
+  _vector_i_sub(phi2[ix]->s0, rs.s0, rs.s2);	\
+  _vector_i_add(phi2[ix]->s1, rs.s1, rs.s3);
+/* Generic "post" macros: _hop_t_p_post32 initialises rs from phi2[ix]; each later macro accumulates its direction into rs, so the t_p macro must run first for each site. Invoke as a statement, e.g. _hop_t_p_post32(); */
+#define _hop_t_p_post32()			\
+  _vector_assign(rs.s0, phi2[ix]->s0);		\
+  _vector_assign(rs.s2, phi2[ix]->s0);		\
+  _vector_assign(rs.s1, phi2[ix]->s1);		\
+  _vector_assign(rs.s3, phi2[ix]->s1);
+/* the *_m_post32 variants apply the inverse su3 multiply and the "complexcjg" ka*_32 factor before accumulating */
+#define _hop_t_m_post32()			\
+  _vector_assign(psi, phi2[ix]->s0);		\
+  _su3_inverse_multiply(chi,(*U), psi);		\
+  _complexcjg_times_vector(psi,ka0_32,chi);	\
+  _vector_add_assign(rs.s0, psi);		\
+  _vector_sub_assign(rs.s2, psi);		\
+  _vector_assign(psi, phi2[ix]->s1);		\
+  _su3_inverse_multiply(chi,(*U), psi);		\
+  _complexcjg_times_vector(psi,ka0_32,chi);	\
+  _vector_add_assign(rs.s1, psi);		\
+  _vector_sub_assign(rs.s3, psi);
+
+#define _hop_x_p_post32()				\
+  _vector_add_assign(rs.s0, phi2[ix]->s0);		\
+  _vector_i_sub_assign(rs.s3, phi2[ix]->s0);		\
+  _vector_add_assign(rs.s1, phi2[ix]->s1);		\
+  _vector_i_sub_assign(rs.s2, phi2[ix]->s1);
+
+#define _hop_x_m_post32()			\
+  _vector_assign(psi, phi2[ix]->s0);		\
+  _su3_inverse_multiply(chi,(*U), psi);		\
+  _complexcjg_times_vector(psi,ka1_32,chi);	\
+  _vector_add_assign(rs.s0, psi);		\
+  _vector_i_add_assign(rs.s3, psi);		\
+  _vector_assign(psi, phi2[ix]->s1);		\
+  _su3_inverse_multiply(chi,(*U), psi);		\
+  _complexcjg_times_vector(psi,ka1_32,chi);	\
+  _vector_add_assign(rs.s1, psi);		\
+  _vector_i_add_assign(rs.s2, psi);
+
+#define _hop_y_p_post32()			\
+  _vector_add_assign(rs.s0, phi2[ix]->s0);	\
+  _vector_add_assign(rs.s3, phi2[ix]->s0);	\
+  _vector_add_assign(rs.s1, phi2[ix]->s1);	\
+  _vector_sub_assign(rs.s2, phi2[ix]->s1);
+
+#define _hop_y_m_post32()			\
+  _vector_assign(psi, phi2[ix]->s0);		\
+  _su3_inverse_multiply(chi,(*U), psi);		\
+  _complexcjg_times_vector(psi,ka2_32,chi);	\
+  _vector_add_assign(rs.s0, psi);		\
+  _vector_sub_assign(rs.s3, psi);		\
+  _vector_assign(psi, phi2[ix]->s1);		\
+  _su3_inverse_multiply(chi, (*U), psi);	\
+  _complexcjg_times_vector(psi,ka2_32,chi);	\
+  _vector_add_assign(rs.s1, psi);		\
+  _vector_add_assign(rs.s2, psi);
+
+#define _hop_z_p_post32()			\
+  _vector_add_assign(rs.s0, phi2[ix]->s0);	\
+  _vector_i_sub_assign(rs.s2, phi2[ix]->s0);	\
+  _vector_add_assign(rs.s1, phi2[ix]->s1);	\
+  _vector_i_add_assign(rs.s3, phi2[ix]->s1);
+
+#define _hop_z_m_post32()			\
+  _vector_assign(psi, phi2[ix]->s0);		\
+  _su3_inverse_multiply(chi,(*U), psi);		\
+  _complexcjg_times_vector(psi,ka3_32,chi);	\
+  _vector_add_assign(rs.s0, psi);		\
+  _vector_i_add_assign(rs.s2, psi);		\
+  _vector_assign(psi, phi2[ix]->s1);		\
+  _su3_inverse_multiply(chi,(*U), psi);		\
+  _complexcjg_times_vector(psi,ka3_32,chi);	\
+  _vector_add_assign(rs.s1, psi);		\
+  _vector_i_sub_assign(rs.s3, psi);
+/* Generic store macros: write the accumulated spinor rs to (res). They use cfactor (and pn), which _declare_hregs does not declare, so the including code has to provide them. */
+#define _hop_mul_g5_cmplx_and_store32(res)			\
+  _complex_times_vector((res)->s0, cfactor, rs.s0);		\
+  _complex_times_vector((res)->s1, cfactor, rs.s1);		\
+  _complexcjg_times_vector((res)->s2, cfactor, rs.s2);	\
+  _complexcjg_times_vector((res)->s3, cfactor, rs.s3);
+
+#define _g5_cmplx_sub_hop_and_g5store32(res)		\
+  _complex_times_vector(psi, cfactor, pn->s0);		\
+  _vector_sub((res)->s0, psi, rs.s0);			\
+  _complex_times_vector(psi2, cfactor, pn->s1);		\
+  _vector_sub((res)->s1, psi2, rs.s1);			\
+  _complexcjg_times_vector(psi, cfactor, pn->s2);	\
+  _vector_sub((res)->s2, rs.s2, psi);			\
+  _complexcjg_times_vector(psi2, cfactor, pn->s3);	\
+  _vector_sub((res)->s3, rs.s3, psi2);
+
+/* res is parenthesised like in the two macros above so that an expression argument (e.g. s+1) binds correctly */
+#define _hop_store_post32(res)		\
+  _vector_assign((res)->s0, rs.s0);	\
+  _vector_assign((res)->s1, rs.s1);	\
+  _vector_assign((res)->s2, rs.s2);	\
+  _vector_assign((res)->s3, rs.s3);
+
+/* working variables named by the generic macros; chi2 is declared but not referenced by any macro in this branch */
+#define _declare_hregs()				\
+  spinor32 ALIGN32 rs;					\
+  su3_vector32 ALIGN32 psi, chi, psi2, chi2;
+
+#endif
+
+#endif
+
diff --git a/operator/halfspinor_sse_dbl.c b/operator/halfspinor_sse_dbl.c
index 354ec1a02..624f68eab 100644
--- a/operator/halfspinor_sse_dbl.c
+++ b/operator/halfspinor_sse_dbl.c
@@ -34,7 +34,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
   }
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
 {
   su3 * restrict U0 ALIGN;
@@ -50,7 +50,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
 #pragma pomp inst begin(hoppingmatrix)
 #endif
 
-#ifndef OMP
+#ifndef TM_USE_OMP
   /* We will run through the source vector now */
   /* instead of the solution vector            */
   s = k;
@@ -74,13 +74,13 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
   phi = NBPointer[ieo];
 
   /**************** loop over all lattice sites ******************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #else
   ix = 0;
 #endif
   for(int i = 0; i < (VOLUME)/2; i++){
-#ifdef OMP
+#ifdef TM_USE_OMP
     s = k+i;
     _prefetch_spinor(s);
     U = U0+i*4;
@@ -119,24 +119,24 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
 
     /*********************** direction -3 ************************/
     _hop_z_m_pre();
-#ifndef OMP
+#ifndef TM_USE_OMP
     ix++;
     s++;
 #endif
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp single
 {
 #endif
-#    if (defined MPI && !defined _NO_COMM)
+#    if (defined TM_USE_MPI && !defined _NO_COMM)
   xchange_halffield(); 
 #    endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 }
 #endif
 
-#ifndef OMP
+#ifndef TM_USE_OMP
   s = l;
   if(ieo == 0) {
     U = g_gauge_field_copy[1][0];
@@ -157,13 +157,13 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
 
   
   /* Now we sum up and expand to a full spinor */
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #else
   ix = 0;
 #endif
   for(int i = 0; i < (VOLUME)/2; i++){
-#ifdef OMP
+#ifdef TM_USE_OMP
     U = U0 + i*4;
     _prefetch_su3(U);
     ix = i*8;
@@ -201,7 +201,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
     ix++;
     /*********************** direction -3 ************************/
     _hop_z_m_post();
-#ifndef OMP
+#ifndef TM_USE_OMP
     ix++;
     U++;
     s++;
@@ -211,7 +211,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
 #pragma pomp inst end(hoppingmatrix)
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* omp parallel closing bracket */
 #endif
 }
diff --git a/operator/hopping_bg_dbl.c b/operator/hopping_bg_dbl.c
index 498feb91a..fd65e1708 100644
--- a/operator/hopping_bg_dbl.c
+++ b/operator/hopping_bg_dbl.c
@@ -48,7 +48,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
   }
 #endif
 
-#    if (defined MPI && !(defined _NO_COMM))
+#    if (defined TM_USE_MPI && !(defined _NO_COMM))
   xchange_field(k, ieo);
 #    endif
 
diff --git a/operator/hopping_body_dbl.c b/operator/hopping_body_dbl.c
index ba7a43473..574908fed 100644
--- a/operator/hopping_body_dbl.c
+++ b/operator/hopping_body_dbl.c
@@ -44,7 +44,7 @@
     ioff = (VOLUME+RAND)/2;
   }
 
-#ifndef OMP
+#ifndef TM_USE_OMP
   hi = &g_hi[16*ioff];
 
 #  if ((defined _GAUGE_COPY))
@@ -58,11 +58,11 @@
 #endif
 
   /**************** loop over all lattice sites ******************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #  pragma omp for
 #endif
   for(int icx = ioff; icx < (VOLUME/2+ioff); icx++){
-#ifdef OMP
+#ifdef TM_USE_OMP
     hi = &g_hi[16*icx];
 #  if ((defined _GAUGE_COPY))
     up=&g_gauge_field_copy[icx][0];
@@ -159,7 +159,7 @@
     _hop_z_p();
 
     /*********************** direction -3 ************************/
-#ifndef OMP
+#ifndef TM_USE_OMP
 #  if ((defined _GAUGE_COPY))
     up=um+1;
 #  else
diff --git a/operator/hopping_sgl.c b/operator/hopping_sgl.c
index 843fd6a1b..59de53290 100644
--- a/operator/hopping_sgl.c
+++ b/operator/hopping_sgl.c
@@ -44,7 +44,7 @@ void Hopping_Matrix(int ieo, spinor32 * const l, spinor32 * const k){
 #endif
 
   /* for parallelization */
-#    if (defined MPI && !(defined _NO_COMM))
+#    if (defined TM_USE_MPI && !(defined _NO_COMM))
   xchange_field(k, ieo);
 #    endif
 
diff --git a/operator/hopping_sse_dbl.c b/operator/hopping_sse_dbl.c
index bb9eb70c4..735a2ee5a 100644
--- a/operator/hopping_sse_dbl.c
+++ b/operator/hopping_sse_dbl.c
@@ -50,7 +50,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
   spinor * restrict sm;
   spinor * restrict rn;
 
-# if (defined MPI)
+# if (defined TM_USE_MPI)
 #  ifdef PARALLELX
 #   define  REQC 4
 #  elif defined PARALLELXY
@@ -79,7 +79,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
   for(x0=0;x0<T;x0++){
 
     /* start the communication of the timslice borders (non-blocking send and receive)*/
-#    if (defined MPI && !defined _NO_COMM)
+#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_field_open(k, ieo, x0, requests, status);
 #    endif
     
@@ -152,7 +152,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
 
        
    /* complete the communication of the timslice borders (and wait) */
-#if (defined MPI && !defined _NO_COMM)
+#if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_field_close(requests, status, REQC); /*    MPI_Waitall */
 #endif
 
diff --git a/operator/hopping_sse_sgl.c b/operator/hopping_sse_sgl.c
index 4bca9c22c..b15e0334b 100644
--- a/operator/hopping_sse_sgl.c
+++ b/operator/hopping_sse_sgl.c
@@ -45,7 +45,7 @@ void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
   }
 #endif
 
-#    if (defined MPI && !defined _NO_COMM)
+#    if (defined TM_USE_MPI && !defined _NO_COMM)
   xchange_field(k, ieo);
 #    endif
 
diff --git a/operator/mul_one_pm_imu_inv_body.c b/operator/mul_one_pm_imu_inv_body.c
new file mode 100644
index 000000000..4f4198558
--- /dev/null
+++ b/operator/mul_one_pm_imu_inv_body.c
@@ -0,0 +1,80 @@
+void _PSWITCH(mul_one_pm_imu_inv)(_PTSWITCH(spinor) * const l, const double _sign, const int N){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  _C_TYPE ALIGN z,w;
+  int ix;
+  double sign=-1.; 
+  _PTSWITCH(spinor) *r;
+  /* In place on the first N spinors of l: s0,s1 are scaled by z and s2,s3 by w = conj(z). */
+  _PTSWITCH(su3_vector) ALIGN phi1;
+  /* Name and types come from _PSWITCH/_PTSWITCH/_C_TYPE/_F_TYPE, set by the including file. */
+  _F_TYPE ALIGN nrm = 1./(1.+g_mu*g_mu);
+
+  if(_sign < 0.){
+    sign = 1.; 
+  }
+  /* z = (1 + i*sign*g_mu)/(1 + g_mu^2), with sign = +1 for _sign < 0 and -1 otherwise */
+  z = nrm + (sign * nrm * g_mu) * I;
+  w = conj(z);
+  /************ loop over all lattice sites ************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(ix = 0; ix < N; ix++){
+    r=l + ix;
+    /* Multiply the spinorfield with the inverse of 1+imu\gamma_5 */
+    _complex_times_vector(phi1, z, r->s0);
+    _vector_assign(r->s0, phi1);
+    _complex_times_vector(phi1, z, r->s1);
+    _vector_assign(r->s1, phi1);
+    _complex_times_vector(phi1, w, r->s2);
+    _vector_assign(r->s2, phi1);
+    _complex_times_vector(phi1, w, r->s3);
+    _vector_assign(r->s3, phi1);
+  }
+
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+
+}
+/* Out-of-place variant: reads the first N spinors of k and writes the scaled result to l. */
+void _PSWITCH(assign_mul_one_pm_imu_inv)(_PTSWITCH(spinor) * const l, _PTSWITCH(spinor) * const k, 
+					 const double _sign, const int N){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  _C_TYPE z,w;
+  int ix;
+  double sign=-1.; 
+  _PTSWITCH(spinor) *r, *s;
+  _F_TYPE nrm = (_F_TYPE) 1./(1.+g_mu*g_mu);
+
+  if(_sign < 0.){
+    sign = 1.; 
+  }
+  /* z = (1 + i*sign*g_mu)/(1 + g_mu^2), with sign = +1 for _sign < 0 and -1 otherwise */
+  z = nrm + (_F_TYPE) (sign * nrm * g_mu) * I;
+  w = conj(z);
+
+  /************ loop over all lattice sites ************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(ix = 0; ix < N; ix++){
+    r=k+ix;
+    s=l+ix;
+    /* Multiply the spinorfield with the inverse of 1+imu\gamma_5 */
+    _complex_times_vector(s->s0, z, r->s0);
+    _complex_times_vector(s->s1, z, r->s1);
+    _complex_times_vector(s->s2, w, r->s2);
+    _complex_times_vector(s->s3, w, r->s3);
+  }
+
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+}
diff --git a/operator/mul_one_pm_imu_sub_mul_body.c b/operator/mul_one_pm_imu_sub_mul_body.c
new file mode 100644
index 000000000..f88cf04d2
--- /dev/null
+++ b/operator/mul_one_pm_imu_sub_mul_body.c
@@ -0,0 +1,48 @@
+void _PSWITCH(mul_one_pm_imu_sub_mul)(_PTSWITCH(spinor) * const l, _PTSWITCH(spinor) * const k, 
+				      _PTSWITCH(spinor) * const j, const double _sign, const int N){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* Computes l = (1 + i*sign*g_mu*gamma5) k - j on the first N sites, with  */
+  /* sign = -1 when _sign < 0 and +1 otherwise. Name and types are selected  */
+  /* by the _PSWITCH/_PTSWITCH/_C_TYPE macros set by the including file.     */
+  _C_TYPE z,w;
+  int ix;
+  double sign=1.;
+  _PTSWITCH(spinor) *r, *s, *t;
+  /* Declared unconditionally: the loop below always uses phi1..phi4, so an  */
+  /* SSE2/SSE3 guard here would break the build whenever either is defined.  */
+  _PTSWITCH(su3_vector) ALIGN phi1, phi2, phi3, phi4;
+
+  if(_sign < 0.){
+    sign = -1.;
+  }
+
+  z = 1. + (sign * g_mu) * I;
+  w = conj(z);
+  /************ loop over all lattice sites ************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(ix = 0; ix < N; ix++){
+    r = k+ix;
+    s = j+ix;
+    t = l+ix;
+    /* Multiply the spinorfield with 1+imu\gamma_5 */
+    _complex_times_vector(phi1, z, r->s0);
+    _complex_times_vector(phi2, z, r->s1);
+    _complex_times_vector(phi3, w, r->s2);
+    _complex_times_vector(phi4, w, r->s3);
+    /* Subtract s and store the result in t */
+    _vector_sub(t->s0, phi1, s->s0);
+    _vector_sub(t->s1, phi2, s->s1);
+    _vector_sub(t->s2, phi3, s->s2);
+    _vector_sub(t->s3, phi4, s->s3);
+  }
+
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+}
+
diff --git a/operator/tm_operators.c b/operator/tm_operators.c
index 07de6fefb..7c6e1a375 100644
--- a/operator/tm_operators.c
+++ b/operator/tm_operators.c
@@ -25,7 +25,13 @@
  **************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
+#endif
+#ifdef SSE2
+# undef SSE2
+#endif
+#ifdef SSE3
+# undef SSE3
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -65,6 +71,7 @@ const int predist=2;
  *
  ******************************************/
 void mul_one_pm_imu_inv(spinor * const l, const double _sign, const int N);
+void mul_one_pm_imu_inv_32(spinor32 * const l, const double _sign, const int N);
 void mul_one_pm_imu(spinor * const l, const double _sign);
 /******************************************
  * mul_one_pm_imu_sub_mul_gamma5 computes
@@ -93,11 +100,20 @@ void mul_one_sub_mul_gamma5(spinor * const l, spinor * const k,
  ******************************************/
 void mul_one_pm_imu_sub_mul(spinor * const l, spinor * const k,
 			    spinor * const j, const double _sign, const int N);
+void mul_one_pm_imu_sub_mul_32(spinor32 * const l, spinor32 * const k,
+			       spinor32 * const j, const double _sign, const int N);
+
 void tm_sub_H_eo_gamma5(spinor* const l, spinor * const p, spinor * const k,
 			const int ieo, const double _sign);
 
 /* external functions */
 
+/* Full (even and odd) operator: applies D_psi to Q, then gamma5() in place on all VOLUME sites of P. */
+void Q_psi(spinor * const P, spinor * const Q) {
+  D_psi(P, Q);
+  gamma5(P, P, VOLUME);
+}
+
 void M_full(spinor * const Even_new, spinor * const Odd_new, 
 	    spinor * const Even, spinor * const Odd) {
   /* Even sites */
@@ -293,6 +309,19 @@ void Mtm_minus_sym_psi_nocom(spinor * const l, spinor * const k) {
   diff(l, k, g_spinor_field[DUM_MATRIX], VOLUME/2);
 }
 
+void Mtm_plus_sym_dagg_psi(spinor * const l, spinor * const k){
+  /* l doubles as scratch; g_spinor_field[DUM_MATRIX] and [DUM_MATRIX+1] are overwritten. */
+  gamma5(l, k, VOLUME/2);
+  mul_one_pm_imu_inv(l, -1., VOLUME/2);
+  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], l);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], -1., VOLUME/2);
+  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
+  gamma5(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX], VOLUME/2);
+  /* NOTE(review): k is read again here after l was overwritten, so l must not alias k. */
+  diff(l, k, g_spinor_field[DUM_MATRIX+1], VOLUME/2);
+}
+
+
 /******************************************
  *
  * This is the implementation of
@@ -358,6 +387,16 @@ void Q_pm_psi(spinor * const l, spinor * const k)
   gamma5(l, l, VOLUME);
 }
 
+void D_dagg_psi(spinor * const l, spinor * const k)
+{ /* gamma5, then D_psi with g_mu negated, then gamma5 again; g_spinor_field[DUM_MATRIX] is scratch */
+  gamma5(l, k, VOLUME); 
+  g_mu = -g_mu; /* flip the global g_mu for the D_psi call only */
+  D_psi(g_spinor_field[DUM_MATRIX], l);
+  gamma5(l, g_spinor_field[DUM_MATRIX], VOLUME);
+  g_mu = -g_mu; /* restore the caller's g_mu */
+}
+
+
 
 /* the "full" operators */
 void Q_pm_psi_prec(spinor * const l, spinor * const k)
@@ -429,6 +468,13 @@ void Q_minus_psi(spinor * const l, spinor * const k)
   gamma5(l, l, VOLUME);
 }
 
+void M_minus_psi(spinor * const l, spinor * const k)
+{ /* l = D_psi(k) evaluated with the sign of the global g_mu flipped */
+  g_mu = -g_mu;
+  D_psi(l, k);
+  g_mu = -g_mu; /* restore the caller's g_mu */
+}
+
 /* This is the version for the gpu (Florian Burger)*/
 void Q_minus_psi_gpu(spinor * const l, spinor * const k)
 {
@@ -513,89 +559,57 @@ void tm_sub_H_eo_gamma5(spinor* const l, spinor * const p, spinor * const k,
  * can find comments above at the declaration 
  *
  **********************************************/
+#define _F_TYPE double
+#define _C_TYPE _Complex double
+#define _PSWITCH(s) s
+#define _PTSWITCH(s) s
 
-void mul_one_pm_imu_inv(spinor * const l, const double _sign, const int N){
-#ifdef OMP
-#pragma omp parallel
-  {
-#endif
-  _Complex double ALIGN z,w;
-  int ix;
-  double sign=-1.; 
-  spinor *r;
+#include "mul_one_pm_imu_inv_body.c"
 
-  su3_vector ALIGN phi1;
+#undef _F_TYPE
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
 
-  double ALIGN nrm = 1./(1.+g_mu*g_mu);
+#define _F_TYPE float
+#define _C_TYPE _Complex float
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
 
-  if(_sign < 0.){
-    sign = 1.; 
-  }
+#include "mul_one_pm_imu_inv_body.c"
 
-  z = nrm + (sign * nrm * g_mu) * I;
-  w = conj(z);
-  /************ loop over all lattice sites ************/
-#ifdef OMP
-#pragma omp for
-#endif
-  for(ix = 0; ix < N; ix++){
-    r=l + ix;
-    /* Multiply the spinorfield with the inverse of 1+imu\gamma_5 */
-#if ( defined SSE2 || defined SSE3 )
-    _prefetch_spinor((r+predist)); 
-    _sse_load_up(r->s0);
-    _sse_vector_cmplx_mul(z);
-    _sse_store_nt_up(r->s0);
-    _sse_load_up(r->s1);
-    _sse_vector_cmplx_mul_two();
-    _sse_store_nt_up(r->s1);
-    _sse_load_up(r->s2);
-    _sse_vector_cmplx_mul(w);
-    _sse_store_nt_up(r->s2);
-    _sse_load_up(r->s3);
-    _sse_vector_cmplx_mul_two();
-    _sse_store_nt_up(r->s3);
-#else
-    _complex_times_vector(phi1, z, r->s0);
-    _vector_assign(r->s0, phi1);
-    _complex_times_vector(phi1, z, r->s1);
-    _vector_assign(r->s1, phi1);
-    _complex_times_vector(phi1, w, r->s2);
-    _vector_assign(r->s2, phi1);
-    _complex_times_vector(phi1, w, r->s3);
-    _vector_assign(r->s3, phi1);
-#endif
-  }
+#undef _F_TYPE
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
 
-#ifdef OMP
-  } /* OpenMP closing brace */
-#endif
-
-}
 
-void assign_mul_one_pm_imu_inv(spinor * const l, spinor * const k, const double _sign, const int N){
-#ifdef OMP
+void Mee_inv_psi(spinor * const l, spinor * const k, const double mu){
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
   _Complex double z,w;
   int ix;
-  double sign=-1.; 
+  //double sign=-1.; 
   spinor *r, *s;
-  double nrm = 1./(1.+g_mu*g_mu);
+  //double nrm = 1./(1.+g_mu*g_mu);
+  double nrm = 1./(1.+mu*mu);
 
-  if(_sign < 0.){
-    sign = 1.; 
-  }
+  //if(_sign < 0.){
+  //  sign = 1.; 
+  //}
 
-  z = nrm + (sign * nrm * g_mu) * I;
+  //z = nrm + (sign * nrm * g_mu) * I;
+  z = nrm - (nrm * mu) * I;
   w = conj(z);
 
   /************ loop over all lattice sites ************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
-  for(ix = 0; ix < N; ix++){
+  //for(ix = 0; ix < N; ix++){
+  for(ix = 0; ix < VOLUME/2; ix++){
     r=k+ix;
     s=l+ix;
     /* Multiply the spinorfield with the inverse of 1+imu\gamma_5 */
@@ -605,13 +619,13 @@ void assign_mul_one_pm_imu_inv(spinor * const l, spinor * const k, const double
     _complex_times_vector(s->s3, w, r->s3);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
 
 void mul_one_pm_imu(spinor * const l, const double _sign){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -630,7 +644,7 @@ void mul_one_pm_imu(spinor * const l, const double _sign){
   w = conj(z);
 
   /************ loop over all lattice sites ************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(ix = 0; ix < (VOLUME/2); ix++){
@@ -646,14 +660,14 @@ void mul_one_pm_imu(spinor * const l, const double _sign){
     _vector_assign(r->s3, phi1);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
 }
 
 void assign_mul_one_pm_imu(spinor * const l, spinor * const k, const double _sign, const int N){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -670,7 +684,7 @@ void assign_mul_one_pm_imu(spinor * const l, spinor * const k, const double _sig
   w = conj(z);
 
   /************ loop over all lattice sites ************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(ix = 0; ix < N; ix++){
@@ -700,21 +714,80 @@ void assign_mul_one_pm_imu(spinor * const l, spinor * const k, const double _sig
     _complex_times_vector(s->s3, w, r->s3);
 #endif
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
 
+
+void Mee_psi(spinor * const l, spinor * const k, const double mu){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  _Complex double z,w;
+  int ix;
+  /* mu is passed in explicitly; neither the global g_mu nor a _sign argument is used here. */
+  spinor *r, *s;
+
+  /* Reads VOLUME/2 spinors from k and writes them to l; in the non-SSE branch */
+  /* s0,s1 are scaled by z and s2,s3 by w = conj(z).                           */
+  /* NOTE(review): SSE2/SSE3 are #undef'ed near the top of this file, so the SSE branch looks dead - confirm. */
+
+  /* z = 1 + i*mu */
+  z = 1. + mu * I;
+  w = conj(z);
+
+  /************ loop over all lattice sites ************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(ix = 0; ix < VOLUME/2; ix++){
+    s=l+ix;
+    r=k+ix;
+
+    /* Multiply the spinorfield with 1+imu\gamma_5 */
+#if ( defined SSE2 || defined SSE3 )
+    _prefetch_spinor((r+predist));
+    _prefetch_spinor((s+predist));
+    _sse_load_up(r->s0);
+    _sse_vector_cmplx_mul(z);
+    _sse_store_nt_up(s->s0);
+    _sse_load_up(r->s1);
+    _sse_vector_cmplx_mul_two();
+    _sse_store_nt_up(s->s1);
+    _sse_load_up(r->s2);
+    _sse_vector_cmplx_mul(w);
+    _sse_store_nt_up(s->s2);
+    _sse_load_up(r->s3);
+    _sse_vector_cmplx_mul_two();
+    _sse_store_nt_up(s->s3);
+#else
+    _complex_times_vector(s->s0, z, r->s0);
+    _complex_times_vector(s->s1, z, r->s1);
+    _complex_times_vector(s->s2, w, r->s2);
+    _complex_times_vector(s->s3, w, r->s3);
+#endif
+  }
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+}
+
+
+
+
+
 void mul_one_sub_mul_gamma5(spinor * const l, spinor * const k, 
 				   spinor * const j){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
   spinor *r, *s, *t;
 
   /************ loop over all lattice sites ************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(int ix = 0; ix < (VOLUME/2); ++ix)
@@ -731,7 +804,7 @@ void mul_one_sub_mul_gamma5(spinor * const l, spinor * const k,
     _vector_sub(t->s3, s->s3, r->s3);  
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
@@ -739,7 +812,7 @@ void mul_one_sub_mul_gamma5(spinor * const l, spinor * const k,
 
 void mul_one_pm_imu_sub_mul_gamma5(spinor * const l, spinor * const k, 
 				   spinor * const j, const double _sign){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -758,7 +831,7 @@ void mul_one_pm_imu_sub_mul_gamma5(spinor * const l, spinor * const k,
   w = conj(z);
   
   /************ loop over all lattice sites ************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(ix = 0; ix < (VOLUME/2); ix++){
@@ -779,81 +852,31 @@ void mul_one_pm_imu_sub_mul_gamma5(spinor * const l, spinor * const k,
     _vector_sub(t->s3, s->s3, phi4);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 }
 
-void mul_one_pm_imu_sub_mul(spinor * const l, spinor * const k, 
-			    spinor * const j, const double _sign, const int N){
-#ifdef OMP
-#pragma omp parallel
-  {
-#endif
-  _Complex double z,w;
-  int ix;
-  double sign=1.;
-  spinor *r, *s, *t;
-
-#if (!defined SSE2 && !defined SSE3)
+#define _C_TYPE _Complex double
+#define _F_TYPE double
+#define _PSWITCH(s) s
+#define _PTSWITCH(s) s
 
-  su3_vector ALIGN phi1, phi2, phi3, phi4;
-  
-#endif
+#include "mul_one_pm_imu_sub_mul_body.c"
 
-  if(_sign < 0.){
-    sign = -1.;
-  }
+#undef _C_TYPE
+#undef _F_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
 
-  z = 1. + (sign * g_mu) * I;
-  w = conj(z);
-  /************ loop over all lattice sites ************/
-#ifdef OMP
-#pragma omp for
-#endif
-  for(ix = 0; ix < N; ix++){
-    r = k+ix;
-    s = j+ix;
-    t = l+ix;
-    /* Multiply the spinorfield with 1+imu\gamma_5 */
-#if (defined SSE2 || defined SSE3)
-    _prefetch_spinor((r+predist));
-    _prefetch_spinor((s+predist));
-    _sse_load_up(r->s0);
-    _sse_vector_cmplx_mul(z);
-    _sse_load(s->s0);
-    _sse_vector_sub_up();
-    _sse_store_nt_up(t->s0);
-    _sse_load_up(r->s1);
-    _sse_vector_cmplx_mul_two();
-    _sse_load(s->s1);
-    _sse_vector_sub_up();
-    _sse_store_nt_up(t->s1);
-    _sse_load_up(r->s2);
-    _sse_vector_cmplx_mul(w);
-    _sse_load(s->s2);
-    _sse_vector_sub_up();
-    _sse_store_nt_up(t->s2);
-    _sse_load_up(r->s3);
-    _sse_vector_cmplx_mul_two();
-    _sse_load(s->s3);
-    _sse_vector_sub_up();
-    _sse_store_nt_up(t->s3);
-#else
-    _complex_times_vector(phi1, z, r->s0);
-    _complex_times_vector(phi2, z, r->s1);
-    _complex_times_vector(phi3, w, r->s2);
-    _complex_times_vector(phi4, w, r->s3);
-    /* Subtract s and store the result in t */
-    _vector_sub(t->s0, phi1, s->s0);
-    _vector_sub(t->s1, phi2, s->s1);
-    _vector_sub(t->s2, phi3, s->s2);
-    _vector_sub(t->s3, phi4, s->s3);
-#endif
-  }
+#define _C_TYPE _Complex float
+#define _F_TYPE float
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
 
-#ifdef OMP
-  } /* OpenMP closing brace */
-#endif
-}
+#include "mul_one_pm_imu_sub_mul_body.c"
 
+#undef _C_TYPE
+#undef _F_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
diff --git a/operator/tm_operators.h b/operator/tm_operators.h
index e1ceff78e..0ce7f723d 100644
--- a/operator/tm_operators.h
+++ b/operator/tm_operators.h
@@ -23,6 +23,7 @@
 #include "su3.h"
 
 /* This is the full matrix multiplication */
+void Q_psi(spinor * const P, spinor * const Q);
 void M_full(spinor * const Even_new, spinor * const Odd_new, 
 	    spinor * const Even, spinor * const Odd);
 void Q_full(spinor * const Even_new, spinor * const Odd_new, 
@@ -40,26 +41,37 @@ void Qtm_pm_psi(spinor * const l, spinor * const k);
 void Qtm_pm_psi_nocom(spinor * const l, spinor * const k);
 void H_eo_tm_inv_psi(spinor * const l, spinor * const k, const int ieo, const double sign);
 void mul_one_pm_imu_inv(spinor * const l, const double _sign, const int N);
+void mul_one_pm_imu_inv_32(spinor32 * const l, const double _sign, const int N);
 void assign_mul_one_pm_imu_inv(spinor * const l, spinor * const k, const double _sign, const int N);
+void assign_mul_one_pm_imu_inv_32(spinor32 * const l, spinor32 * const k, const double _sign, const int N);
 void assign_mul_one_pm_imu(spinor * const l, spinor * const k, const double _sign, const int N);
+void Mee_inv_psi(spinor * const l, spinor * const k, const double mu);
+/* Mee_inv_psi / Mee_psi take mu explicitly (not g_mu) and always act on VOLUME/2 sites. */
+void Mee_psi(spinor * const l, spinor * const k, const double mu);
 void mul_one_pm_imu(spinor * const l, const double _sign);
 void mul_one_pm_imu_sub_mul(spinor * const l, spinor * const k,
 			    spinor * const j, const double _sign, const int N);
+void mul_one_pm_imu_sub_mul_32(spinor32 * const l, spinor32 * const k,
+			       spinor32 * const j, const double _sign, const int N);
 
 void Qtm_plus_sym_psi(spinor * const l, spinor * const k);
 void Qtm_plus_sym_psi_nocom(spinor * const l, spinor * const k);
 void Qtm_minus_sym_psi(spinor * const l, spinor * const k);
 void Mtm_plus_sym_psi(spinor * const l, spinor * const k);
+void Mtm_plus_sym_dagg_psi(spinor * const l, spinor * const k);
 void Mtm_minus_sym_psi(spinor * const l, spinor * const k);
 void Mtm_plus_sym_psi_nocom(spinor * const l, spinor * const k);
 void Mtm_minus_sym_psi_nocom(spinor * const l, spinor * const k);
 void Qtm_pm_sym_psi(spinor * const l, spinor * const k);
 
+
+void D_dagg_psi(spinor * const l, spinor * const k);
 void Q_pm_psi(spinor * const l, spinor * const k);
 void Q_pm_psi_prec(spinor * const l, spinor * const k);
 void Q_pm_psi_gpu(spinor * const l, spinor * const k);
 void Q_pm_psi2(spinor * const l, spinor * const k);
 void Q_minus_psi(spinor * const l, spinor * const k);
+void M_minus_psi(spinor * const l, spinor * const k);
 void Q_minus_psi_gpu(spinor * const l, spinor * const k);
 void Q_plus_psi(spinor * const l, spinor * const k);
 
diff --git a/operator/tm_operators_32.c b/operator/tm_operators_32.c
new file mode 100644
index 000000000..1dee170be
--- /dev/null
+++ b/operator/tm_operators_32.c
@@ -0,0 +1,150 @@
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include "global.h"
+#include "su3.h"
+#include "operator/Hopping_Matrix.h"
+#include "operator/Hopping_Matrix_32.h"
+#include "linalg_eo.h"
+#include "gamma.h"
+#include "operator/D_psi.h"
+#include "tm_operators_32.h"
+
+
+/* Note that most 32 bit functions make use of orphaned directives!
+   In order to take advantage of threads, they must be called from within
+   a parallel section, and care must be taken that no nested parallelism
+   is generated inside those sections through further parallel regions. */
+
+void mul_one_pm_imu_inv_32_orphaned(spinor32 * const l, const float _sign, const int N){
+  _Complex float ALIGN z,w;
+  int ix;
+  float sign=-1.; 
+  spinor32 *r;
+  /* In place on the first N spinors of l: s0,s1 are scaled by z and s2,s3 by w = conj(z). */
+  su3_vector32 ALIGN phi1;
+  /* No "omp parallel" here: the "omp for" below only shares work if the caller opened a region. */
+  double ALIGN nrm = 1./(1.+g_mu*g_mu);
+
+  if(_sign < 0.){
+    sign = 1.; 
+  }
+  /* z = (1 + i*sign*g_mu)/(1 + g_mu^2), evaluated with the double nrm and stored as _Complex float */
+  z = nrm + (sign * nrm * g_mu) * I;
+  w = conj(z);
+  /************ loop over all lattice sites ************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(ix = 0; ix < N; ix++){
+    r=l + ix;
+    /* Multiply the spinorfield with the inverse of 1+imu\gamma_5 */
+    _complex_times_vector(phi1, z, r->s0);
+    _vector_assign(r->s0, phi1);
+    _complex_times_vector(phi1, z, r->s1);
+    _vector_assign(r->s1, phi1);
+    _complex_times_vector(phi1, w, r->s2);
+    _vector_assign(r->s2, phi1);
+    _complex_times_vector(phi1, w, r->s3);
+    _vector_assign(r->s3, phi1);
+  }
+}
+
+void mul_one_pm_imu_sub_mul_gamma5_32_orphaned(spinor32 * const l, spinor32 * const k, 
+				   spinor32 * const j, const float _sign){
+  _Complex float z,w;
+  int ix;
+  float sign=1.;
+  spinor32 *r, *s, *t;
+  /* l = gamma5 * ((1 + i*sign*g_mu*gamma5) k - j) on VOLUME/2 sites; sign = -1 iff _sign < 0. */
+  su3_vector32 ALIGN phi1, phi2, phi3, phi4;
+  /* No "omp parallel" here: the "omp for" below only shares work if the caller opened a region. */
+  if(_sign < 0.){
+    sign = -1.;
+  }
+
+  z = 1. + (sign * g_mu) * I;
+  w = conj(z);
+  
+  /************ loop over all lattice sites ************/
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(ix = 0; ix < (VOLUME/2); ix++){
+    r = k+ix;
+    s = j+ix;
+    t = l+ix;
+    /* Multiply the spinorfield with 1+imu\gamma_5 */
+    _complex_times_vector(phi1, z, r->s0);
+    _complex_times_vector(phi2, z, r->s1);
+    _complex_times_vector(phi3, w, r->s2);
+    _complex_times_vector(phi4, w, r->s3);
+    /* Subtract s and store the result in t */
+    /* multiply with  gamma5 included by    */
+    /* reversed order of s and phi3|4       */
+    _vector_sub(t->s0, phi1, s->s0);
+    _vector_sub(t->s1, phi2, s->s1);
+    _vector_sub(t->s2, s->s2, phi3);
+    _vector_sub(t->s3, s->s3, phi4);
+  }
+}
+
+void Qtm_pm_psi_32(spinor32 * const l, spinor32 * const k){
+  /* Q_{-}: applied to k, result left in g_spinor_field32[0]; g_spinor_field32[0] and [1] are scratch */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif  
+  Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], k);
+  mul_one_pm_imu_inv_32_orphaned(g_spinor_field32[1], -1., VOLUME/2);
+  Hopping_Matrix_32_orphaned(OE, g_spinor_field32[0], g_spinor_field32[1]);
+  mul_one_pm_imu_sub_mul_gamma5_32_orphaned(g_spinor_field32[0], k, g_spinor_field32[0], -1.);
+  /* Q_{+}: applied to g_spinor_field32[0], final result written to l */
+  Hopping_Matrix_32_orphaned(EO, l, g_spinor_field32[0]);
+  mul_one_pm_imu_inv_32_orphaned(l, +1., VOLUME/2);
+  Hopping_Matrix_32_orphaned(OE, g_spinor_field32[1], l);
+  mul_one_pm_imu_sub_mul_gamma5_32_orphaned(l, g_spinor_field32[0], g_spinor_field32[1], +1.);
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif  
+}
+
+void gamma5_32_orphaned(spinor32 * const l, spinor32 * const k, const int V){
+  int ix; /* loops over V sites; the bare "omp for" needs an enclosing parallel region to share work */
+  spinor32 *r,*s; /* r walks the output l, s the input k */
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for (ix = 0; ix < V; ix++){
+    r=l+ix;
+    s=k+ix;
+    _vector_assign((*r).s0,(*s).s0);
+    _vector_assign((*r).s1,(*s).s1);
+    _vector_minus_assign((*r).s2,(*s).s2); /* presumably r = -s for the lower two components - confirm */
+    _vector_minus_assign((*r).s3,(*s).s3);
+  }
+}
+
+void gamma5_32(spinor32 * const l, spinor32 * const k, const int V){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  gamma5_32_orphaned(l,k,V); /* wrapper: supplies the parallel region the orphaned "omp for" needs */
+#ifdef TM_USE_OMP
+  } /*OpenMP closing brace */
+#endif
+}
+
+void Q_pm_psi_32(spinor32 * const l, spinor32 * const k)
+{ /* two D_psi_32 + gamma5_32 passes on VOLUME sites; g_spinor_field32[0] is overwritten as scratch */
+  g_mu = -g_mu; /* first pass runs with the sign of the global g_mu flipped */
+  D_psi_32(l, k);
+  gamma5_32(g_spinor_field32[0], l, VOLUME);
+  g_mu = -g_mu; /* restore g_mu; second pass uses the original sign */
+  D_psi_32(l, g_spinor_field32[0]);
+  gamma5_32(l, l, VOLUME);
+}
+
diff --git a/operator/tm_operators_32.h b/operator/tm_operators_32.h
new file mode 100644
index 000000000..84ba67800
--- /dev/null
+++ b/operator/tm_operators_32.h
@@ -0,0 +1,12 @@
+
+#ifndef _TM_OPERATORS_32_H
+#define _TM_OPERATORS_32_H
+#include "su3.h" /* for spinor32, as tm_operators.h does; removes the dependence on include order */
+void mul_one_pm_imu_inv_32_orphaned(spinor32 * const l, const float _sign, const int N);
+void mul_one_pm_imu_sub_mul_gamma5_32_orphaned(spinor32 * const l, spinor32 * const k, spinor32 * const j, const float _sign);
+void Qtm_pm_psi_32(spinor32 * const l, spinor32 * const k);
+void Q_pm_psi_32(spinor32 * const l, spinor32 * const k);
+void gamma5_32_orphaned(spinor32 * const l, spinor32 * const k, const int V);
+void gamma5_32(spinor32 * const l, spinor32 * const k, const int V);
+/* *_orphaned functions contain a bare "omp for": call them from inside an "omp parallel" region. */
+#endif
diff --git a/operator/tm_operators_nd.c b/operator/tm_operators_nd.c
index dc931b3a6..5cdc4f385 100644
--- a/operator/tm_operators_nd.c
+++ b/operator/tm_operators_nd.c
@@ -26,7 +26,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -37,6 +37,7 @@
 #include "phmc.h"
 #include "gamma.h"
 #include "linalg_eo.h"
+#include "operator/D_psi.h"
 #include "operator/tm_operators.h"
 #include "operator/clovertm_operators.h"
 #include "operator/tm_operators_nd.h"
@@ -52,6 +53,74 @@ void M_oo_sub_g5_ndpsi(spinor * const l_s, spinor * const l_c,
 
 /* external functions */
 
+
+/******************************************
+ *
+ * This is the implementation of
+ *
+ *  M_full_ndpsi = D_w I_f + i gamma5 mubar tau3 - epsbar tau1
+ *  the full operator done for testing purpose
+ ******************************************/
+void M_full_ndpsi(spinor * const Even_new_s, spinor * const Odd_new_s, 
+                  spinor * const Even_new_c, spinor * const Odd_new_c, 
+                  spinor * const Even_s, spinor * const Odd_s,
+                  spinor * const Even_c, spinor * const Odd_c) {
+  /* g_mu is set to +g_mubar for the first M_full call, to -g_mubar for the second, then restored. */
+  double mu = g_mu;
+  g_mu = g_mubar;
+  M_full(Even_new_s, Odd_new_s, Even_s, Odd_s);
+  /* flavour mixing: add -g_epsbar times the other flavour's input to each output half */
+  assign_add_mul_r(Even_new_s, Even_c, -g_epsbar, VOLUME/2);
+  assign_add_mul_r(Odd_new_s, Odd_c, -g_epsbar, VOLUME/2);
+  
+  g_mu = -g_mu;
+  M_full(Even_new_c, Odd_new_c, Even_c, Odd_c);
+  
+  assign_add_mul_r(Even_new_c, Even_s, -g_epsbar, VOLUME/2);
+  assign_add_mul_r(Odd_new_c, Odd_s, -g_epsbar, VOLUME/2);
+
+  g_mu = mu;
+}
+
+void Msw_full_ndpsi(spinor * const Even_new_s, spinor * const Odd_new_s, 
+                    spinor * const Even_new_c, spinor * const Odd_new_c, 
+                    spinor * const Even_s, spinor * const Odd_s,
+                    spinor * const Even_c, spinor * const Odd_c) {
+  /* Same structure as M_full_ndpsi, with Msw_full in place of M_full; g_mu is restored on exit. */
+  double mu = g_mu;
+  g_mu = g_mubar;
+  Msw_full(Even_new_s, Odd_new_s, Even_s, Odd_s);
+  /* flavour mixing: add -g_epsbar times the other flavour's input to each output half */
+  assign_add_mul_r(Even_new_s, Even_c, -g_epsbar, VOLUME/2);
+  assign_add_mul_r(Odd_new_s, Odd_c, -g_epsbar, VOLUME/2);
+  
+  g_mu = -g_mu;
+  Msw_full(Even_new_c, Odd_new_c, Even_c, Odd_c);
+  
+  assign_add_mul_r(Even_new_c, Even_s, -g_epsbar, VOLUME/2);
+  assign_add_mul_r(Odd_new_c, Odd_s, -g_epsbar, VOLUME/2);
+
+  g_mu = mu;
+}
+
+// full VOLUME operator; it uses D_psi which works with tm and tm+clover
+void D_ndpsi(spinor * const l_strange, spinor * const l_charm,
+             spinor * const k_strange, spinor * const k_charm) {
+  /* g_mu is set to +g_mubar for the strange D_psi call, -g_mubar for charm, then restored. */
+  double mu = g_mu;
+  g_mu = g_mubar;
+  D_psi(l_strange,k_strange);
+  /* flavour mixing: add -g_epsbar times the other flavour's input on all VOLUME sites */
+  assign_add_mul_r(l_strange, k_charm, -g_epsbar, VOLUME);
+  
+  g_mu = -g_mu;
+  D_psi(l_charm,k_charm);
+  
+  assign_add_mul_r(l_charm, k_strange, -g_epsbar, VOLUME);
+
+  g_mu = mu;
+}
+
 /******************************************
  *
  * This is the implementation of
@@ -110,6 +179,63 @@ void Qsw_ndpsi(spinor * const l_strange, spinor * const l_charm,
   return;
 }
 
+/******************************************
+ *
+ * This is the implementation of 
+ *
+ *  Q{tm,sw}_tau1_ndpsi_{add,sub}_Ishift =  ( M +/- I z_k )
+ *
+ *  with M = Qhat(2x2) tau_1   and z_k = sqrt(g_shift)
+ *
+ *
+ *  needed in the evaluation of the heatbath when 
+ *  the Rational approximation is used
+ *
+ *
+ * For details, see documentation and comments of the
+ * above mentioned routines
+ *
+ * k_charm and k_strange are the input fields
+ * l_* the output fields
+ *
+ * it acts only on the odd part or only
+ * on a half spinor
+ ******************************************/
+
+
+void Qtm_tau1_ndpsi_add_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange, spinor * const k_charm) {
+  /* "add" variant of the banner above: the constant handed to the sub_const
+   * routine is -I*sqrt(g_shift); Cpol is fixed to 1 and invev to phmc_invmaxev. */
+  const _Complex double z = -I*sqrt(g_shift);
+  Q_tau1_sub_const_ndpsi(l_strange, l_charm, k_strange, k_charm, z, 1., phmc_invmaxev);
+}
+
+void Qtm_tau1_ndpsi_sub_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange, spinor * const k_charm) {
+  /* "sub" variant of the banner above: the constant handed to the sub_const
+   * routine is +I*sqrt(g_shift); Cpol is fixed to 1 and invev to phmc_invmaxev. */
+  const _Complex double z = I*sqrt(g_shift);
+  Q_tau1_sub_const_ndpsi(l_strange, l_charm, k_strange, k_charm, z, 1., phmc_invmaxev);
+}
+
+void Qsw_tau1_ndpsi_add_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange, spinor * const k_charm) {
+  /* "add" variant of the banner above, routed through the Qsw routine: the constant
+   * handed over is -I*sqrt(g_shift); Cpol is fixed to 1 and invev to phmc_invmaxev. */
+  const _Complex double z = -I*sqrt(g_shift);
+  Qsw_tau1_sub_const_ndpsi(l_strange, l_charm, k_strange, k_charm, z, 1., phmc_invmaxev);
+}
+
+void Qsw_tau1_ndpsi_sub_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange, spinor * const k_charm) {
+  /* "sub" variant of the banner above, routed through the Qsw routine: the constant
+   * handed over is +I*sqrt(g_shift); Cpol is fixed to 1 and invev to phmc_invmaxev. */
+  const _Complex double z = I*sqrt(g_shift);
+  Qsw_tau1_sub_const_ndpsi(l_strange, l_charm, k_strange, k_charm, z, 1., phmc_invmaxev);
+}
+
+
 /******************************************
  *
  * This is the implementation of
@@ -237,6 +363,14 @@ void Qtm_pm_ndpsi(spinor * const l_strange, spinor * const l_charm,
   return;
 }
 
+void Qtm_pm_ndpsi_shift(spinor * const l_strange, spinor * const l_charm,
+                       spinor * const k_strange, spinor * const k_charm) {
+  /* Qtm_pm_ndpsi, then assign_add_mul_r(l, k, g_shift) per flavour (presumably l += g_shift*k) */
+  Qtm_pm_ndpsi(l_strange, l_charm, k_strange, k_charm);
+  assign_add_mul_r(l_strange, k_strange, g_shift, VOLUME/2);
+  assign_add_mul_r(l_charm, k_charm, g_shift, VOLUME/2);
+}
+
 void Qsw_pm_ndpsi(spinor * const l_strange, spinor * const l_charm,
 		  spinor * const k_strange, spinor * const k_charm) {
 
@@ -284,6 +418,15 @@ void Qsw_pm_ndpsi(spinor * const l_strange, spinor * const l_charm,
   return;
 }
 
+void Qsw_pm_ndpsi_shift(spinor * const l_strange, spinor * const l_charm,
+                       spinor * const k_strange, spinor * const k_charm) {
+  /* Qsw_pm_ndpsi, then assign_add_mul_r(l, k, g_shift) for each flavour over
+   * VOLUME/2 sites (presumably l += g_shift*k; the helper is defined elsewhere). */
+  Qsw_pm_ndpsi(l_strange, l_charm, k_strange, k_charm);
+
+  assign_add_mul_r(l_strange, k_strange, g_shift, VOLUME/2);
+  assign_add_mul_r(l_charm, k_charm, g_shift, VOLUME/2);
+}
 
 
 /******************************************
@@ -343,7 +486,7 @@ void Q_tau1_sub_const_ndpsi(spinor * const l_strange, spinor * const l_charm,
   /* by the constant  phmc_Cpol  */
   /* which renders the polynomial in monomials  */
   /* identical to the polynomial a la clenshaw */;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for private(r) private(s) private(phi1)
 #endif
   for(int ix = 0; ix < (VOLUME/2); ix++){
@@ -411,7 +554,7 @@ void Qsw_tau1_sub_const_ndpsi(spinor * const l_strange, spinor * const l_charm,
   /* by the constant  phmc_Cpol  */
   /* which renders the polynomial in monomials  */
   /* identical to the polynomial a la clenshaw */;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for private(r) private(s) private(phi1)
 #endif
   for(int ix = 0; ix < (VOLUME/2); ix++){
@@ -598,7 +741,7 @@ void mul_one_pm_itau2(spinor * const p, spinor * const q,
 
 void mul_one_pm_iconst(spinor * const l, spinor * const k, 
 		       const double mu_, const int sign_) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -611,7 +754,7 @@ void mul_one_pm_iconst(spinor * const l, spinor * const k,
   }
 
   /************ loop over all lattice sites ************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(unsigned int ix = 0; ix < (VOLUME/2); ++ix){
@@ -628,7 +771,7 @@ void mul_one_pm_iconst(spinor * const l, spinor * const k,
     _vector_assign(r->s3, phi1);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
@@ -639,7 +782,7 @@ void mul_one_pm_iconst(spinor * const l, spinor * const k,
 void M_ee_inv_ndpsi(spinor * const l_s, spinor * const l_c, 
 		    spinor * const k_s, spinor * const k_c,
 		    const double mu, const double eps) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -647,7 +790,7 @@ void M_ee_inv_ndpsi(spinor * const l_s, spinor * const l_c,
   spinor *r_s, *r_c, *s_s, *s_c;
   su3_vector ALIGN phi1, phi2;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(unsigned int ix = 0; ix < (VOLUME/2); ++ix){
@@ -686,7 +829,7 @@ void M_ee_inv_ndpsi(spinor * const l_s, spinor * const l_c,
 
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
@@ -699,14 +842,14 @@ void M_oo_sub_g5_ndpsi(spinor * const l_s, spinor * const l_c,
 		       spinor * const k_s, spinor * const k_c,
 		       spinor * const j_s, spinor * const j_c,
 		       const double mu, const double eps) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
   spinor *r_s, *r_c, *s_s, *s_c, *t_s, *t_c;
   su3_vector ALIGN phi1, phi2;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(unsigned int ix = 0; ix < (VOLUME/2); ++ix){
@@ -746,7 +889,7 @@ void M_oo_sub_g5_ndpsi(spinor * const l_s, spinor * const l_c,
     _vector_sub(r_c->s3, t_c->s3, phi2);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
@@ -818,7 +961,7 @@ void Qtm_pm_sub_const_nrm_psi(spinor * const l, spinor * const k,
 
 
   /************ loop over all lattice sites ************/
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for private(ix) private(r) private(s) private(phi1)
 #endif
   for(ix = 0; ix < (VOLUME/2); ix++){
diff --git a/operator/tm_operators_nd.h b/operator/tm_operators_nd.h
index 347f326a5..138e9b93b 100644
--- a/operator/tm_operators_nd.h
+++ b/operator/tm_operators_nd.h
@@ -23,53 +23,82 @@
 #define _TM_OPERATTORS_ND_H
 
 void mul_one_pm_itau2(spinor * const p, spinor * const q,
-		      spinor * const r, spinor * const s,
-		      const double sign, const int N);
+                      spinor * const r, spinor * const s,
+                      const double sign, const int N);
+
+void M_full_ndpsi(spinor * const Even_new_s, spinor * const Odd_new_s, 
+                  spinor * const Even_new_c, spinor * const Odd_new_c, 
+                  spinor * const Even_s, spinor * const Odd_s,
+                  spinor * const Even_c, spinor * const Odd_c);
+
+void Msw_full_ndpsi(spinor * const Even_new_s, spinor * const Odd_new_s, 
+                    spinor * const Even_new_c, spinor * const Odd_new_c, 
+                    spinor * const Even_s, spinor * const Odd_s,
+                    spinor * const Even_c, spinor * const Odd_c);
+
+//This works with tm and tm+clover 
+void D_ndpsi(spinor * const l_strange, spinor * const l_charm,
+             spinor * const k_strange,  spinor * const k_charm);
 
 void Qtm_ndpsi(spinor * const l_strange, spinor * const l_charm,
-	       spinor * const k_strange,  spinor * const k_charm);
+               spinor * const k_strange,  spinor * const k_charm);
 void Qsw_ndpsi(spinor * const l_strange, spinor * const l_charm,
-	       spinor * const k_strange, spinor * const k_charm);
+               spinor * const k_strange, spinor * const k_charm);
+
+void Qtm_tau1_ndpsi_add_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange,  spinor * const k_charm);
+void Qtm_tau1_ndpsi_sub_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange,  spinor * const k_charm);
+void Qsw_tau1_ndpsi_add_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange,  spinor * const k_charm);
+void Qsw_tau1_ndpsi_sub_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange,  spinor * const k_charm);
+
 
 void Qtm_dagger_ndpsi(spinor * const l_strange, spinor * const l_charm,
-		      spinor * const k_strange, spinor * const k_charm);
+                      spinor * const k_strange, spinor * const k_charm);
 void Qsw_dagger_ndpsi(spinor * const l_strange, spinor * const l_charm,
-		      spinor * const k_strange, spinor * const k_charm);
+                      spinor * const k_strange, spinor * const k_charm);
 
 void Qtm_pm_ndpsi(spinor * const l_strange, spinor * const l_charm,
                   spinor * const k_strange, spinor * const k_charm);
+void Qtm_pm_ndpsi_shift(spinor * const l_strange, spinor * const l_charm,
+                        spinor * const k_strange, spinor * const k_charm);
+
 void Qsw_pm_ndpsi(spinor * const l_strange, spinor * const l_charm,
-		  spinor * const k_strange, spinor * const k_charm);
+                  spinor * const k_strange, spinor * const k_charm);
+void Qsw_pm_ndpsi_shift(spinor * const l_strange, spinor * const l_charm,
+                        spinor * const k_strange, spinor * const k_charm);
 
 void Qtm_pm_ndbipsi(bispinor * const bisp_l, bispinor * const bisp_k);
 void Qsw_pm_ndbipsi(bispinor * const bisp_l, bispinor * const bisp_k);
 
 void Q_tau1_sub_const_ndpsi(spinor * const l_strange, spinor * const l_charm,
-			    spinor * const k_strange, spinor * const k_charm, 
-			    const _Complex double z, const double Cpol, const double invev);
+                            spinor * const k_strange, spinor * const k_charm, 
+                            const _Complex double z, const double Cpol, const double invev);
 void Qsw_tau1_sub_const_ndpsi(spinor * const l_strange, spinor * const l_charm,
-			      spinor * const k_strange, spinor * const k_charm, 
-			      const _Complex double z, const double Cpol, const double invev);
+                              spinor * const k_strange, spinor * const k_charm, 
+                              const _Complex double z, const double Cpol, const double invev);
 
 void H_eo_tm_ndpsi(spinor * const l_strange, spinor * const l_charm, 
-             spinor * const k_strange, spinor * const k_charm, 
-	     const int ieo);
+                   spinor * const k_strange, spinor * const k_charm, 
+                   const int ieo);
 void H_eo_sw_ndpsi(spinor * const l_strange, spinor * const l_charm, 
-		   spinor * const k_strange, spinor * const k_charm);
+                   spinor * const k_strange, spinor * const k_charm);
 
 
 void M_ee_inv_ndpsi(spinor * const l_strange, spinor * const l_charm, 
-		    spinor * const k_strange, spinor * const k_charm,
-		    const double mu, const double eps);
+                    spinor * const k_strange, spinor * const k_charm,
+                    const double mu, const double eps);
 
 void Msw_ee_inv_ndpsi(spinor * const l_strange, spinor * const l_charm, 
-		      spinor * const k_strange, spinor * const k_charm);
+                      spinor * const k_strange, spinor * const k_charm);
 
 void Q_test_epsilon(spinor * const l_strange, spinor * const l_charm,
                     spinor * const k_strange, spinor * const k_charm);
 
 void Qtau1_P_ndpsi(spinor * const l_strange, spinor * const l_charm,
-		spinor * const k_strange, spinor * const k_charm);
+                   spinor * const k_strange, spinor * const k_charm);
 
 void Qtm_pm_Ptm_pm_psi(spinor * const l, spinor * const k);
 
diff --git a/operator/tm_operators_nd_32.c b/operator/tm_operators_nd_32.c
new file mode 100644
index 000000000..ee5e36460
--- /dev/null
+++ b/operator/tm_operators_nd_32.c
@@ -0,0 +1,336 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Florian Burger
+ * based on the corresponding 64 bit operators in tm_operators_nd.c
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *                                                            
+ * This file contains operators for twisted mass Wilson QCD   
+ * to construct a multiplication with a non-degenerate        
+ * flavour matrix                                             
+ *                                                            
+ *                                                            
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "operator/Hopping_Matrix_32.h"
+#include "phmc.h"
+#include "gamma.h"
+#include "linalg_eo.h"
+#include "operator/tm_operators_32.h"
+#include "operator/tm_operators_nd.h"
+#include "operator/clovertm_operators_32.h"
+#include "operator/D_psi.h"
+#include "tm_operators_nd_32.h"
+
+
+
+void sub_epsbar_tau1_32(spinor32 * const l_strange, spinor32 * const l_charm , spinor32 * const k_strange, spinor32 * const k_charm){
+  mul_r_32(g_spinor_field32[2], (float) g_epsbar, k_strange , VOLUME); /* scratch [2]: k_strange with factor g_epsbar (presumably R = c*S) */
+  mul_r_32(g_spinor_field32[3], (float) g_epsbar, k_charm, VOLUME); /* scratch [3]: k_charm with factor g_epsbar */
+  diff_32(l_strange, l_strange, g_spinor_field32[3], VOLUME); /* cross-wise pairing: l_strange is combined with the k_charm product */
+  diff_32(l_charm, l_charm, g_spinor_field32[2], VOLUME);  /* ... and l_charm with the k_strange product (presumably l = l - scratch) */
+}
+
+
+void Q_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm, spinor32 * const k_strange, spinor32 * const k_charm)
+{
+  /* Full-VOLUME single-precision operator built from two D_psi_32 passes; overwrites the
+   * scratch fields g_spinor_field32[0..3] and flips g_mu temporarily (restored each time). */
+
+  // first pass, labelled D_h^{dagger} by the author; tau^1 is realised by exchanging s<->c
+  D_psi_32(l_strange, k_charm);
+  g_mu = -g_mu;
+  D_psi_32(l_charm, k_strange);
+  g_mu = -g_mu;
+
+  sub_epsbar_tau1_32(l_strange, l_charm, k_charm, k_strange);
+
+  gamma5_32(g_spinor_field32[0], l_strange, VOLUME);
+  gamma5_32(g_spinor_field32[1], l_charm, VOLUME);
+
+  // second pass, labelled D_h; again tau^1 by s<->c, now reading scratch fields [1] and [0]
+  D_psi_32(l_strange, g_spinor_field32[1]);
+  g_mu = -g_mu;
+  D_psi_32(l_charm, g_spinor_field32[0]);
+  g_mu = -g_mu;
+  sub_epsbar_tau1_32(l_strange, l_charm, g_spinor_field32[1], g_spinor_field32[0]);
+
+  gamma5_32(l_strange, l_strange, VOLUME);
+  gamma5_32(l_charm, l_charm, VOLUME);
+
+  /* Normalisation by the max. eigenvalue; phmc_invmaxev enters twice because of D Ddag. */
+  /* The cast wraps the whole product: "(float) a*b" rounds only a, then multiplies in double. */
+  const float invmaxev_sq = (float)(phmc_invmaxev*phmc_invmaxev);
+  mul_r_32(l_charm, invmaxev_sq, l_charm, VOLUME);
+  mul_r_32(l_strange, invmaxev_sq, l_strange, VOLUME);
+}
+
+// l_ and k_ may be the same spinors. "Orphaned": holds a bare "omp for", meant to run inside a caller's omp parallel region.
+void M_ee_inv_ndpsi_32_orphaned(spinor32 * const l_s, spinor32 * const l_c, 
+		    spinor32 * const k_s, spinor32 * const k_c,
+		    const float mu, const float eps) {
+  float nrm = 1./(1.+ mu*mu - eps*eps);
+  spinor32 *r_s, *r_c, *s_s, *s_c;
+  su3_vector32 ALIGN32 phi1, phi2;
+  /* per site, per spin component (macro names taken at face value): l = nrm * ((1 -/+ i*mu) * k + eps * k_other) */
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(unsigned int ix = 0; ix < (VOLUME/2); ++ix){
+    r_s = l_s + ix;
+    r_c = l_c + ix;
+    s_s = k_s + ix;
+    s_c = k_c + ix;
+    /* s0: the _s input gets (1 - i*mu), the _c input gets (1 + i*mu); both pick up eps times the other */
+    _complex_times_vector(phi1, (1. - mu * I), s_s->s0);
+    _vector_add_mul(phi1, eps, s_c->s0);
+    _complex_times_vector(phi2, (1. + mu * I), s_c->s0);
+    _vector_add_mul(phi2, eps, s_s->s0);
+    _vector_mul(r_s->s0, nrm, phi1);
+    _vector_mul(r_c->s0, nrm, phi2);
+    /* s1: same factors as s0 */
+    _complex_times_vector(phi1, (1. - mu * I), s_s->s1);
+    _vector_add_mul(phi1, eps, s_c->s1);
+    _complex_times_vector(phi2, (1. + mu * I), s_c->s1);
+    _vector_add_mul(phi2, eps, s_s->s1);
+    _vector_mul(r_s->s1, nrm, phi1);
+    _vector_mul(r_c->s1, nrm, phi2);
+    /* s2: sign of the mu term flipped relative to s0/s1 */
+    _complex_times_vector(phi1, (1. + mu * I), s_s->s2);
+    _vector_add_mul(phi1, eps, s_c->s2);
+    _complex_times_vector(phi2, (1. - mu * I), s_c->s2);
+    _vector_add_mul(phi2, eps, s_s->s2);
+    _vector_mul(r_s->s2, nrm, phi1);
+    _vector_mul(r_c->s2, nrm, phi2);
+    /* s3: same factors as s2 */
+    _complex_times_vector(phi1, (1. + mu * I), s_s->s3);
+    _vector_add_mul(phi1, eps, s_c->s3);
+    _complex_times_vector(phi2, (1. - mu * I), s_c->s3);
+    _vector_add_mul(phi2, eps, s_s->s3);
+    _vector_mul(r_s->s3, nrm, phi1);
+    _vector_mul(r_c->s3, nrm, phi2);
+
+  }
+}
+
+void M_ee_inv_ndpsi_32(spinor32 * const l_s, spinor32 * const l_c,
+                       spinor32 * const k_s, spinor32 * const k_c,
+                       const float mu, const float eps) {
+  /* Stand-alone entry point: with TM_USE_OMP it opens the parallel region
+   * around the "omp for" loop inside M_ee_inv_ndpsi_32_orphaned; without it
+   * this is a plain forwarding call. */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+#endif
+  {
+    M_ee_inv_ndpsi_32_orphaned(l_s, l_c, k_s, k_c, mu, eps);
+  }
+}
+
+// l_ and k_ may be the same spinors. "Orphaned": holds a bare "omp for", meant to run inside a caller's omp parallel region.
+void M_oo_sub_g5_ndpsi_32_orphaned(spinor32 * const l_s, spinor32 * const l_c, 
+		       spinor32 * const k_s, spinor32 * const k_c,
+		       spinor32 * const j_s, spinor32 * const j_c,
+		       const float mu, const float eps) {
+  spinor32 *r_s, *r_c, *s_s, *s_c, *t_s, *t_c;
+  su3_vector32 ALIGN32 phi1, phi2;
+  /* per site: phi = (1 -/+ i*mu) * k + eps * k_other; l = phi - j for s0,s1 and l = j - phi for s2,s3 (macro names at face value) */
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(unsigned int ix = 0; ix < (VOLUME/2); ++ix){
+    r_s = l_s + ix;
+    r_c = l_c + ix;
+    s_s = k_s + ix;
+    s_c = k_c + ix;
+    t_s = j_s + ix;
+    t_c = j_c + ix;
+    /* s0: _s input gets (1 - i*mu), _c input gets (1 + i*mu); result is phi minus j */
+    _complex_times_vector(phi1, (1. - mu * I), s_s->s0);
+    _vector_add_mul(phi1, eps, s_c->s0);
+    _complex_times_vector(phi2, (1. + mu * I), s_c->s0);
+    _vector_add_mul(phi2, eps, s_s->s0);
+    _vector_sub(r_s->s0, phi1, t_s->s0);
+    _vector_sub(r_c->s0, phi2, t_c->s0);
+    /* s1: same as s0 */
+    _complex_times_vector(phi1, (1. - mu * I), s_s->s1);
+    _vector_add_mul(phi1, eps, s_c->s1);
+    _complex_times_vector(phi2, (1. + mu * I), s_c->s1);
+    _vector_add_mul(phi2, eps, s_s->s1);
+    _vector_sub(r_s->s1, phi1, t_s->s1);
+    _vector_sub(r_c->s1, phi2, t_c->s1);
+    /* s2: sign of the mu term flipped, and the operands of the subtraction are swapped (j minus phi) */
+    _complex_times_vector(phi1, (1. + mu * I), s_s->s2);
+    _vector_add_mul(phi1, eps, s_c->s2);
+    _complex_times_vector(phi2, (1. - mu * I), s_c->s2);
+    _vector_add_mul(phi2, eps, s_s->s2);
+    _vector_sub(r_s->s2, t_s->s2, phi1);
+    _vector_sub(r_c->s2, t_c->s2, phi2);
+    /* s3: same as s2 */
+    _complex_times_vector(phi1, (1. + mu * I), s_s->s3);
+    _vector_add_mul(phi1, eps, s_c->s3);
+    _complex_times_vector(phi2, (1. - mu * I), s_c->s3);
+    _vector_add_mul(phi2, eps, s_s->s3);
+    _vector_sub(r_s->s3, t_s->s3, phi1);
+    _vector_sub(r_c->s3, t_c->s3, phi2);
+  }
+}
+
+void M_oo_sub_g5_ndpsi_32(spinor32 * const l_s, spinor32 * const l_c,
+                          spinor32 * const k_s, spinor32 * const k_c,
+                          spinor32 * const j_s, spinor32 * const j_c,
+                          const float mu, const float eps) {
+  /* Stand-alone entry point: with TM_USE_OMP it opens the parallel region
+   * around the "omp for" loop inside M_oo_sub_g5_ndpsi_32_orphaned; without
+   * it this is a plain forwarding call. */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+#endif
+  {
+    M_oo_sub_g5_ndpsi_32_orphaned(l_s, l_c, k_s, k_c, j_s, j_c, mu, eps);
+  }
+}
+
+void Qtm_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm,
+                     spinor32 * const k_strange, spinor32 * const k_charm){
+  /* Single-precision even-odd operator on VOLUME/2 sites; g_spinor_field32[0..5] are used as scratch. */
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* first the  Qhat(2x2)^dagger  PART */
+  /* Here the  M_oe Mee^-1 M_eo  implementation  */
+  Hopping_Matrix_32_orphaned(EO, g_spinor_field32[0], k_charm);
+  Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], k_strange);
+
+  M_ee_inv_ndpsi_32_orphaned(g_spinor_field32[2], g_spinor_field32[3],
+                             g_spinor_field32[0], g_spinor_field32[1],
+                             (float) g_mubar, (float) g_epsbar);
+
+  Hopping_Matrix_32_orphaned(OE, g_spinor_field32[0], g_spinor_field32[2]);
+  Hopping_Matrix_32_orphaned(OE, g_spinor_field32[1], g_spinor_field32[3]);
+
+  /* Here the M_oo  implementation  */
+  M_oo_sub_g5_ndpsi_32_orphaned(g_spinor_field32[2], g_spinor_field32[3], k_charm, k_strange,
+                                g_spinor_field32[0], g_spinor_field32[1],
+                                (float)(-g_mubar), (float)(-g_epsbar));
+  /* We have to reassign as follows to avoid overwriting. */
+  /* Recall in fact that   Q^hat = tau_1 Q tau_1 ; hence the s and c fields are exchanged below, */
+  /* and then the  Qhat(2x2)  PART */
+
+  /* Here the  M_oe Mee^-1 M_eo  implementation  */
+  Hopping_Matrix_32_orphaned(EO, g_spinor_field32[0], g_spinor_field32[3]);
+  Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], g_spinor_field32[2]);
+
+  M_ee_inv_ndpsi_32_orphaned(g_spinor_field32[5], g_spinor_field32[4],
+                             g_spinor_field32[1], g_spinor_field32[0],
+                             (float)(-g_mubar), (float)g_epsbar);
+
+  Hopping_Matrix_32_orphaned(OE, l_strange, g_spinor_field32[4]);
+  Hopping_Matrix_32_orphaned(OE, l_charm, g_spinor_field32[5]);
+
+  /* Here the M_oo  implementation; l_strange/l_charm serve as both the output and the j_ input */
+  M_oo_sub_g5_ndpsi_32_orphaned(l_strange, l_charm, g_spinor_field32[3], g_spinor_field32[2],
+                                l_strange, l_charm, (float)(-g_mubar), (float)(-g_epsbar));
+  /* At the end, the normalisation by the max. eigenvalue: phmc_invmaxev enters twice (D Ddag). */
+  /* The cast wraps the whole product: "(float) a*b" rounds only a, then multiplies in double. */
+  mul_r_32_orphaned(l_charm, (float)(phmc_invmaxev*phmc_invmaxev), l_charm, VOLUME/2);
+  mul_r_32_orphaned(l_strange, (float)(phmc_invmaxev*phmc_invmaxev), l_strange, VOLUME/2);
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+}
+
+void Qtm_pm_ndpsi_shift_32(spinor32 * const l_strange, spinor32 * const l_charm,
+                           spinor32 * const k_strange, spinor32 * const k_charm){
+  /* Qtm_pm_ndpsi_32, then assign_add_mul_r_32(l, k, (float)g_shift) per flavour (presumably l += g_shift*k) */
+  Qtm_pm_ndpsi_32(l_strange, l_charm, k_strange, k_charm);
+  assign_add_mul_r_32(l_strange, k_strange, (float)g_shift, VOLUME/2);
+  assign_add_mul_r_32(l_charm, k_charm, (float)g_shift, VOLUME/2);
+}
+
+void Qsw_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm,
+      spinor32 * const k_strange, spinor32 * const k_charm) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* Clover (sw) single-precision operator on VOLUME/2 sites; g_spinor_field32[0..5] are scratch. First the Qhat(2x2)^dagger part. */
+  /* Here the  M_oe Mee^-1 M_eo  implementation: EO hop, EE term and its inverse, OE hop */
+  Hopping_Matrix_32_orphaned(EO, g_spinor_field32[0], k_charm);
+  Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], k_strange);
+
+  assign_mul_one_sw_pm_imu_eps_32_orphaned(EE, g_spinor_field32[2], g_spinor_field32[3], 
+             g_spinor_field32[0], g_spinor_field32[1], -g_mubar, g_epsbar);
+  clover_inv_nd_32_orphaned(EE, g_spinor_field32[2], g_spinor_field32[3]);
+
+  Hopping_Matrix_32_orphaned(OE, g_spinor_field32[0], g_spinor_field32[2]);
+  Hopping_Matrix_32_orphaned(OE, g_spinor_field32[1], g_spinor_field32[3]);
+
+  // Here the M_oo implementation; the result lands in scratch fields [2] and [3]
+  clover_gamma5_nd_32_orphaned(OO, g_spinor_field32[2], g_spinor_field32[3], 
+         k_charm, k_strange,
+         g_spinor_field32[0], g_spinor_field32[1],
+         -g_mubar, -g_epsbar);
+
+  // and then the  Qhat(2x2)  PART 
+  // Recall in fact that   Q^hat = tau_1 Q tau_1  
+  // Here the  M_oe Mee^-1 M_eo  implementation  
+  // the re-ordering in s and c components is due to tau_1
+  Hopping_Matrix_32_orphaned(EO, g_spinor_field32[0], g_spinor_field32[3]);
+  Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], g_spinor_field32[2]);
+
+  assign_mul_one_sw_pm_imu_eps_32_orphaned(EE, g_spinor_field32[4], g_spinor_field32[5], 
+             g_spinor_field32[1], g_spinor_field32[0], g_mubar, g_epsbar);
+  clover_inv_nd_32_orphaned(EE, g_spinor_field32[4], g_spinor_field32[5]);
+
+  Hopping_Matrix_32_orphaned(OE, g_spinor_field32[0], g_spinor_field32[5]);
+  Hopping_Matrix_32_orphaned(OE, g_spinor_field32[1], g_spinor_field32[4]);
+  /* final M_oo step writes the outputs; note the (l_charm, l_strange) argument order */
+  clover_gamma5_nd_32_orphaned(OO, l_charm, l_strange,
+         g_spinor_field32[2], g_spinor_field32[3],
+         g_spinor_field32[1], g_spinor_field32[0],
+         g_mubar, -g_epsbar);
+
+  /* At the end, the normalisation by the max. eigenvalue  */ 
+  /* Twice  phmc_invmaxev  since we consider here  D Ddag; the double product is converted at the call */
+  mul_r_32_orphaned(l_charm, phmc_invmaxev*phmc_invmaxev, l_charm, VOLUME/2);
+  mul_r_32_orphaned(l_strange, phmc_invmaxev*phmc_invmaxev, l_strange, VOLUME/2);
+
+#ifdef TM_USE_OMP /* OpenMP parallel closing brace */
+  }
+#endif
+
+  return;
+}
+
+void Qsw_pm_ndpsi_shift_32(spinor32* const l_strange, spinor32 * const l_charm,
+                           spinor32 * const k_strange, spinor32 * const k_charm){
+  /* Qsw_pm_ndpsi_32, then assign_add_mul_r_32(l, k, (float)g_shift) per flavour (presumably l += g_shift*k) */
+  Qsw_pm_ndpsi_32(l_strange, l_charm, k_strange, k_charm);
+  assign_add_mul_r_32(l_strange, k_strange, (float)g_shift, VOLUME/2);
+  assign_add_mul_r_32(l_charm, k_charm, (float)g_shift, VOLUME/2);
+}
+
+
diff --git a/operator/tm_operators_nd_32.h b/operator/tm_operators_nd_32.h
new file mode 100644
index 000000000..c9833bed6
--- /dev/null
+++ b/operator/tm_operators_nd_32.h
@@ -0,0 +1,33 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _TM_OPERATORS_ND_32_H
+#define _TM_OPERATORS_ND_32_H
+
+void Q_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm, spinor32 * const k_strange, spinor32 * const k_charm);
+/* twisted-mass variant on VOLUME/2 sites; the _shift form additionally applies g_shift to the k_ inputs */
+void Qtm_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm,
+		  spinor32 * const k_strange, spinor32 * const k_charm);
+void Qtm_pm_ndpsi_shift_32(spinor32 * const l_strange, spinor32 * const l_charm, spinor32 * const k_strange, spinor32 * const k_charm);
+/* clover (sw) variant on VOLUME/2 sites; the _shift form additionally applies g_shift to the k_ inputs */
+void Qsw_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm,
+      spinor32 * const k_strange, spinor32 * const k_charm);
+void Qsw_pm_ndpsi_shift_32(spinor32 * const l_strange, spinor32 * const l_charm, spinor32 * const k_strange, spinor32 * const k_charm);
+#endif
diff --git a/operator/tm_sub_Hopping_Matrix.c b/operator/tm_sub_Hopping_Matrix.c
index dd96ca221..f00c60821 100644
--- a/operator/tm_sub_Hopping_Matrix.c
+++ b/operator/tm_sub_Hopping_Matrix.c
@@ -28,11 +28,11 @@
  ****************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
-#ifdef OMP
+#ifdef TM_USE_OMP
 #include <omp.h>
 #endif
 #include <complex.h>
@@ -41,7 +41,7 @@
 #ifdef BGQ
 #  include"DirectPut.h"
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  include "xchange/xchange.h"
 #endif
 #include "boundary.h"
@@ -79,7 +79,7 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor * const l, spinor * const p, sp
   }
 #  endif
   
-#  ifdef OMP
+#  ifdef TM_USE_OMP
 #  pragma omp parallel
   {
     su3 * restrict u0 ALIGN;
@@ -96,7 +96,7 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor * const l, spinor * const p, sp
 #  endif
 #  include "operator/halfspinor_body.c"
 #  undef _TM_SUB_HOP    
-#  ifdef OMP
+#  ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #  endif
   return;
@@ -130,11 +130,11 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor * const l, spinor * p, spinor *
   }
 #  endif
 
-#  if (defined MPI)
+#  if (defined TM_USE_MPI)
   xchange_field(k, ieo);
 #  endif
   
-#  ifdef OMP
+#  ifdef TM_USE_OMP
 #    pragma omp parallel
   {
 #  endif
@@ -149,7 +149,7 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor * const l, spinor * p, spinor *
 #  endif
 #  include "operator/hopping_body_dbl.c"
 #  undef _TM_SUB_HOP
-#  ifdef OMP
+#  ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #  endif
   return;
diff --git a/operator/tm_times_Hopping_Matrix.c b/operator/tm_times_Hopping_Matrix.c
index 183bf737b..e2edd2775 100644
--- a/operator/tm_times_Hopping_Matrix.c
+++ b/operator/tm_times_Hopping_Matrix.c
@@ -28,11 +28,11 @@
  ****************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
-#ifdef OMP
+#ifdef TM_USE_OMP
 #include <omp.h>
 #endif
 #include <complex.h>
@@ -41,7 +41,7 @@
 #ifdef BGQ
 #  include"DirectPut.h"
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  include "xchange/xchange.h"
 #endif
 #include "boundary.h"
@@ -78,7 +78,7 @@ void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k,
   }
 #  endif
   
-#  ifdef OMP
+#  ifdef TM_USE_OMP
 #  pragma omp parallel
   {
     su3 * restrict u0 ALIGN;
@@ -93,7 +93,7 @@ void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k,
 #  endif
 #  include "operator/halfspinor_body.c"
 #  undef _MUL_G5_CMPLX    
-#  ifdef OMP
+#  ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #  endif
   return;
@@ -126,11 +126,11 @@ void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k,
   }
 #  endif
 
-#  if (defined MPI)
+#  if (defined TM_USE_MPI)
   xchange_field(k, ieo);
 #  endif
   
-#  ifdef OMP
+#  ifdef TM_USE_OMP
 #    pragma omp parallel
   {
 #  endif
@@ -143,7 +143,7 @@ void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k,
 #  endif
 #  include "operator/hopping_body_dbl.c"
 #  undef _MUL_G5_CMPLX
-#  ifdef OMP
+#  ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #  endif
   return;
diff --git a/operator_types.h b/operator_types.h
new file mode 100644
index 000000000..de10bfe53
--- /dev/null
+++ b/operator_types.h
@@ -0,0 +1,40 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2009 Carsten Urbach
+ *               2017 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef OPERATOR_TYPES_H
+#define OPERATOR_TYPES_H
+
+typedef enum op_type_t {
+  /* operator type identifiers; the values are the consecutive integers 0..10 */
+  TMWILSON = 0,
+  OVERLAP = 1,
+  WILSON = 2,
+  DBTMWILSON = 3,
+  CLOVER = 4,
+  DBCLOVER = 5,
+  BSM = 6,
+  BSM2b = 7,
+  BSM2m = 8,
+  BSM2f = 9,
+  BSM3 = 10
+} op_type_t;
+
+#endif // OPERATOR_TYPES_H
diff --git a/overrelaxation.c b/overrelaxation.c
index 7813c27af..cbf3a3d78 100644
--- a/overrelaxation.c
+++ b/overrelaxation.c
@@ -70,7 +70,7 @@
 *
 */
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/phmc.c b/phmc.c
index a27f869c0..50b41e43c 100644
--- a/phmc.c
+++ b/phmc.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -98,7 +98,7 @@ void init_phmc() {
       printf("PHMC: Ev-max = %e \n", phmc_cheb_evmax);
       printf("PHMC: Ev-min = %e \n", phmc_cheb_evmin); 
     }
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Finalize();
 #endif
     exit(0);
@@ -152,7 +152,7 @@ void init_phmc() {
       fclose(Const);
     } else {
       fprintf(stderr, "File %s is missing! Aborting...\n", filename_const);
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Finalize();
 #endif
       exit(6);
@@ -177,7 +177,7 @@ void init_phmc() {
     if (fgets(title, 100, roots) == NULL)
     {
       fprintf(stderr, "Error in reading %s! Aborting...\n", filename_phmc_root);
-      #ifdef MPI
+      #ifdef TM_USE_MPI
          MPI_Finalize();
       #endif
       exit(6);
@@ -191,7 +191,7 @@ void init_phmc() {
   }
   else {
     fprintf(stderr, "File %s is missing! Aborting...\n", filename_phmc_root);
-#ifdef MPI
+#ifdef TM_USE_MPI
     MPI_Finalize();
 #endif
     exit(6);
@@ -235,7 +235,7 @@ void phmc_compute_ev(const int trajectory_counter,
 	   mnl->name, trajectory_counter, temp2);
   }
   if(g_proc_id == 0) {
-    if(temp2 > 1.) {
+    if(temp2 > mnl->EVMax) {
       fprintf(stderr, "\nWarning: largest eigenvalue for monomial %s larger than upper bound!\n\n", mnl->name);
     }
     if(temp < mnl->EVMin) {
@@ -243,7 +243,7 @@ void phmc_compute_ev(const int trajectory_counter,
     }
     countfile = fopen(phmcfilename, "a");
     fprintf(countfile, "%.8d %1.5e %1.5e %1.5e %1.5e\n", 
-	    trajectory_counter, temp, temp2, mnl->EVMin, 1.);
+	    trajectory_counter, temp, temp2, mnl->EVMin, mnl->EVMax);
     fclose(countfile);
   }
   etime = gettime();
diff --git a/prepare_source.c b/prepare_source.c
index 824ea3d6d..3490674fa 100644
--- a/prepare_source.c
+++ b/prepare_source.c
@@ -19,14 +19,14 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
 #include <assert.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -37,6 +37,7 @@
 #include "solver/solver.h"
 #include "start.h"
 #include "ranlxd.h"
+#include "ranlxs.h"
 #include "su3.h"
 #include "operator.h"
 #include "linalg_eo.h"
@@ -46,26 +47,22 @@
 
 void prepare_source(const int nstore, const int isample, const int ix, const int op_id, 
                     const int read_source_flag,
-                    const int source_location) {
+                    const int source_location, const unsigned int seed) {
 
   FILE * ifs = NULL;
   int is = ix / 3, ic = ix %3, err = 0, rstat=0, t = 0;
   operator * optr = &operator_list[op_id];
-  char source_filename[100];
+  char source_filename[400];
   int source_type = SourceInfo.type;
-  static int nstore_ = -1;
-  static int isample_ = -1;
-  static int ix_ = -1;
-  static int op_id_ = -1;
-
+  float u;
   SourceInfo.nstore = nstore;
   SourceInfo.sample = isample;
   SourceInfo.ix = ix;
 
-  if(optr->type != DBTMWILSON && optr->type != DBCLOVER && optr->type != BSM && optr->type != BSM2b && optr->type != BSM2m && optr->type != BSM2f ) {
+  if(optr->type != DBTMWILSON && optr->type != DBCLOVER && optr->type != BSM && optr->type != BSM2b && optr->type != BSM2m && optr->type != BSM2f && optr->type != BSM3 ) {
     SourceInfo.no_flavours = 1;
     /* no volume sources */
-    if(source_type != 1) {
+    if(source_type == SRC_TYPE_POINT || source_type == SRC_TYPE_TS) {
       /* either "Don't read inversion source from file" or                    */
       /* "Don't read inversion source from file, but save the one generated" */
       if (read_source_flag == 0 || read_source_flag == 2) {
@@ -79,24 +76,26 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
       /* "Read inversion source from file" */
       else {
         if (SourceInfo.splitted) {
-	  /* timeslice needs to be put into filename */
-	  if(SourceInfo.automaticTS) {
-	    /* automatic timeslice detection */
-	    if(g_proc_id == 0) {
-	      for(t = 0; t < g_nproc_t*T; t++) {
-		sprintf(source_filename, "%s.%.4d.%.2d.%.2d", SourceInfo.basename, nstore, t, ix);
-		if( (ifs = fopen(source_filename, "r")) != NULL) {
-		  fclose(ifs);
-		  break;
-		}
-	      }
-	    }
-#ifdef MPI
-	    MPI_Bcast(&t, 1, MPI_INT, 0, MPI_COMM_WORLD);
+          /* timeslice needs to be put into filename */
+          if(SourceInfo.automaticTS) {
+            /* automatic timeslice detection */
+            if(g_proc_id == 0) {
+              for(t = 0; t < g_nproc_t*T; t++) {
+                if(T_global > 99) sprintf(source_filename, "%s.%.4d.%.3d.%.2d", SourceInfo.basename, nstore, t, ix);
+                else sprintf(source_filename, "%s.%.4d.%.2d.%.2d", SourceInfo.basename, nstore, t, ix);
+                if( (ifs = fopen(source_filename, "r")) != NULL) {
+                  fclose(ifs);
+                  break;
+                }
+              }
+            }
+#ifdef TM_USE_MPI
+            MPI_Bcast(&t, 1, MPI_INT, 0, MPI_COMM_WORLD);
 #endif
-	    SourceInfo.t = t;
-	  }
-          sprintf(source_filename, "%s.%.4d.%.2d.%.2d", SourceInfo.basename, nstore, SourceInfo.t, ix);
+            SourceInfo.t = t;
+          }
+          if(T_global > 99) sprintf(source_filename, "%s.%.4d.%.3d.%.2d", SourceInfo.basename, nstore, SourceInfo.t, ix);
+          else sprintf(source_filename, "%s.%.4d.%.2d.%.2d", SourceInfo.basename, nstore, SourceInfo.t, ix);
           if (g_cart_id == 0) {
             printf("# Trying to read source from %s\n", source_filename);
           }
@@ -115,13 +114,15 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
         }
       }
       if (PropInfo.splitted) {
-        sprintf(source_filename, "%s.%.4d.%.2d.%.2d.inverted", PropInfo.basename, nstore, SourceInfo.t, ix);
+        if(T_global > 99) sprintf(source_filename, "%s.%.4d.%.3d.%.2d.inverted", PropInfo.basename, nstore, SourceInfo.t, ix);
+        else sprintf(source_filename, "%s.%.4d.%.2d.%.2d.inverted", PropInfo.basename, nstore, SourceInfo.t, ix);
       }
       else {
-        sprintf(source_filename, "%s.%.4d.%.2d.inverted", PropInfo.basename, nstore, SourceInfo.t);
+        if(T_global > 99) sprintf(source_filename, "%s.%.4d.%.3d.inverted", PropInfo.basename, nstore, SourceInfo.t);
+        else sprintf(source_filename, "%s.%.4d.%.2d.inverted", PropInfo.basename, nstore, SourceInfo.t);
       }
     }
-    else if(source_type == 1) {
+    else if(source_type == SRC_TYPE_VOL) {
       /* Volume sources */
       if(read_source_flag == 0 || read_source_flag == 2) {
         if(g_proc_id == 0 && g_debug_level > 0) {
@@ -142,6 +143,78 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
       }
       sprintf(source_filename, "%s.%.4d.%.5d.inverted", PropInfo.basename, nstore, isample);
     }
+    else if(source_type == SRC_TYPE_PION_TS) {
+      // If a pion timeslice source has already been inverted for the current sample and gauge configuration,
+      // we would like to re-use the same timeslice, which we ensure with the loop below. The reason for doing 
+      // this is that we cannot guarantee that the call to ranlxs below is reproducible.
+      // Note: source_generation_pion_only reinitialises the RNG with a systematically chosen seed and thus does
+      // not suffer from this problem when called below.
+      if(SourceInfo.automaticTS) {
+        int found = 0;
+        if(g_proc_id == 0 && !PropInfo.splitted) {
+          for(t = 0; t < g_nproc_t*T; t++) {
+            sprintf(source_filename, "%s.%.4d.%.5d.%.2d.inverted", SourceInfo.basename, nstore, isample, t);
+            if( (ifs = fopen(source_filename, "r")) != NULL) {
+              fclose(ifs);
+              found = 1;
+              break;
+            }
+          }
+        }
+        // choose timeslice randomly
+        if(PropInfo.splitted || !found) {
+          ranlxs(&u, 1);
+          t = (int)(u*g_nproc_t*T);
+        }
+#ifdef TM_USE_MPI
+        MPI_Bcast(&t, 1, MPI_INT, 0, MPI_COMM_WORLD);
+#endif
+        SourceInfo.t = t;
+      }
+      if(g_proc_id == 0 && g_debug_level > 0) {
+        printf("# Preparing 1 flavour Pion TimeSlice at t = %d source\n", SourceInfo.t);
+      }
+      source_generation_pion_only(g_spinor_field[0], g_spinor_field[1], SourceInfo.t, isample, nstore, seed);
+      sprintf(source_filename, "%s.%.4d.%.5d.%.2d.inverted", PropInfo.basename, nstore, isample, SourceInfo.t);
+    }
+    else if(source_type == SRC_TYPE_GEN_PION_TS) {
+      // Generalised Pion full time slice sources
+      if(SourceInfo.automaticTS) {
+        // automatic timeslice detection based on an existing forward propagator
+        if(g_proc_id == 0) {
+          for(t = 0; t < g_nproc_t*T; t++) {
+            sprintf(source_filename, "%s.%.4d.%.5d.%.2d.inverted", SourceInfo.basename, nstore, isample, t);
+            if( (ifs = fopen(source_filename, "r")) != NULL) {
+              fclose(ifs);
+              break;
+            }
+          }
+        }
+#ifdef TM_USE_MPI
+        MPI_Bcast(&t, 1, MPI_INT, 0, MPI_COMM_WORLD);
+#endif
+        SourceInfo.t = t;
+      }
+
+      if(g_proc_id == 0 && g_debug_level > 0) {
+        printf("# Preparing 1 flavour Generalised Pion TimeSlice at T/2 + t = %d source\n", SourceInfo.t+(g_nproc_t*T)/2);
+      }
+
+      sprintf(source_filename, "%s.%.4d.%.5d.%.2d.inverted", SourceInfo.basename, nstore, isample, SourceInfo.t);
+      rstat = read_spinor(g_spinor_field[2], g_spinor_field[3], source_filename, 0);
+      if(rstat) {
+        fprintf(stderr, "Error reading file %s in prepare_source.c.\nUnable to proceed, aborting....\n", source_filename);
+        exit(-1);
+      }
+      extended_pion_source(g_spinor_field[0], g_spinor_field[1], g_spinor_field[2], g_spinor_field[3], 
+                           SourceInfo.t, (g_nproc_t*T)/2, 0., 0., 0.);
+      sprintf(source_filename, "%s.%.4d.%.5d.%.2d.inverted", PropInfo.basename, nstore, isample, SourceInfo.t);
+      // if the generalised pion propagator is to be written to the same file as the source, splitting must be disabled
+      if( strcmp(PropInfo.basename,SourceInfo.basename) == 0 ) PropInfo.splitted = 0;
+    }
+    else { 
+      fprintf(stderr, "# source type %d not implemented yet.\nCannot proceed, aborting...\n", source_type);
+    }
     optr->sr0 = g_spinor_field[0];
     optr->sr1 = g_spinor_field[1];
     optr->prop0 = g_spinor_field[2];
@@ -151,7 +224,7 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
     /* If the solver is _not_ CG we might read in */
     /* here some better guess                     */
     /* This also works for re-iteration           */
-    if (optr->solver != CG && optr->solver != PCG) {
+    if (optr->solver != CG && optr->solver != PCG && optr->solver != MIXEDCG && optr->solver != RGMIXEDCG) {
       ifs = fopen(source_filename, "r");
       if (ifs != NULL) {
         if (g_cart_id == 0) {
@@ -195,21 +268,24 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
   }
   else { /* for the ND 2 flavour twisted operator and BSM(2) */
     SourceInfo.no_flavours = 2;
+    int tindex=source_location/(LX*g_nproc_x*LY*g_nproc_y*LZ*g_nproc_z);
+    SourceInfo.t = tindex;
     zero_spinor_field(g_spinor_field[0], VOLUME/2);
     zero_spinor_field(g_spinor_field[1], VOLUME/2);
-    if(source_type != 1) {
+    if(source_type == SRC_TYPE_POINT || source_type == SRC_TYPE_TS) {
       if(read_source_flag == 0 || read_source_flag == 2) {
         if(source_location == 0) {
           source_spinor_field(g_spinor_field[2], g_spinor_field[3], is, ic);
         }
         else {
           source_spinor_field_point_from_file(g_spinor_field[2], g_spinor_field[3], 
-					      is, ic, source_location);
+                                              is, ic, source_location);
         }
       }
       else {
         if(SourceInfo.splitted) {
-          sprintf(source_filename, "%s.%.4d.%.2d.%.2d", SourceInfo.basename, nstore, SourceInfo.t, ix);
+          if(T_global > 99) sprintf(source_filename, "%s.%.4d.%.3d.%.2d", SourceInfo.basename, nstore, SourceInfo.t, ix);
+          else sprintf(source_filename, "%s.%.4d.%.2d.%.2d", SourceInfo.basename, nstore, SourceInfo.t, ix);
         }
         else {
           sprintf(source_filename,"%s", SourceInfo.basename);
@@ -219,7 +295,7 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
         }
         if(read_spinor(g_spinor_field[2], g_spinor_field[3], source_filename, 0) != 0) {
           fprintf(stderr, "Error reading source! Aborting...\n");
-#ifdef MPI
+#ifdef TM_USE_MPI
           MPI_Abort(MPI_COMM_WORLD, 1);
           MPI_Finalize();
 #endif
@@ -227,7 +303,7 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
         }
       }
     }
-    else if(source_type == 1) {
+    else if(source_type == SRC_TYPE_VOL) {
       /* Volume sources */
       if(g_proc_id == 0 && g_debug_level > 0) {
         printf("# Preparing 2 flavour volume source\n");
@@ -237,7 +313,7 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
       gaussian_volume_source(g_spinor_field[2], g_spinor_field[3],
                              isample, nstore, 2);
     }
-    if( optr->type != BSM && optr->type != BSM2b && optr->type != BSM2m && optr->type != BSM2f) {
+    if( optr->type != BSM && optr->type != BSM2b && optr->type != BSM2m && optr->type != BSM2f && optr->type != BSM3) {
       mul_one_pm_itau2(g_spinor_field[4], g_spinor_field[6], g_spinor_field[0], g_spinor_field[2], +1., VOLUME/2);
       mul_one_pm_itau2(g_spinor_field[5], g_spinor_field[7], g_spinor_field[1], g_spinor_field[3], +1., VOLUME/2);
       assign(g_spinor_field[0], g_spinor_field[4], VOLUME/2);
@@ -255,9 +331,5 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
     optr->prop2 = g_spinor_field[6];
     optr->prop3 = g_spinor_field[7];
   }
-  nstore_ = nstore;
-  isample_ = isample;
-  ix_ = ix;
-  op_id_ = op_id;
   return;
 }
diff --git a/prepare_source.h b/prepare_source.h
index 3b903ab01..eecf61d2d 100644
--- a/prepare_source.h
+++ b/prepare_source.h
@@ -22,7 +22,7 @@
 #define _PREPARE_SOURCE_H
 
 void prepare_source(const int nstore, const int isample, const int ix, const int op_id, 
-		    const int read_source_flag,
-		    const int source_location);
+		    const int read_source_flag, const int source_location,
+                    const unsigned int seed);
 
 #endif
diff --git a/prop_io_test.c b/prop_io_test.c
index dfcc21f85..a94555032 100644
--- a/prop_io_test.c
+++ b/prop_io_test.c
@@ -34,10 +34,10 @@
 #include <time.h>
 #include <string.h>
 #include <signal.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -48,7 +48,7 @@
 #include "start.h"
 /*#include "eigenvalues.h"*/
 #include "measure_gauge_action.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include "xchange/xchange.h"
 #endif
 #include <io/utils.h>
@@ -125,9 +125,9 @@ int main(int argc, char *argv[])
   verbose = 0;
   g_use_clover_flag = 0;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
-#  ifdef OMP
+#  ifdef TM_USE_OMP
   int mpi_thread_provided;
   MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
 #  else
@@ -148,7 +148,7 @@ int main(int argc, char *argv[])
     exit(-1);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   init_openmp();
 #endif
 
@@ -211,7 +211,7 @@ int main(int argc, char *argv[])
 
   finalize_solver(temp_field,1);
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   free_omp_accumulators();
 #endif
   free_blocks();
@@ -224,7 +224,7 @@ int main(int argc, char *argv[])
   free_chi_spinor_field();
   free(filename);
   free(input_filename);
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
 #endif
diff --git a/qphix_base_classes.hpp b/qphix_base_classes.hpp
new file mode 100644
index 000000000..26015e3a2
--- /dev/null
+++ b/qphix_base_classes.hpp
@@ -0,0 +1,771 @@
+// Copyright © 2017 Martin Ueding <dev@martin-ueding.de>
+// Licensed under the [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause).
+
+// Due to github issue #404, the helper functions to apply the full QPhiX operator
+// are currently disabled because they conflict with the new interfaces in QPhiX
+// itself. If required, these should be rewritten to use these interfaces
+// rather than the base classes in qphix_base_classes.hpp
+
+// This file should be deprecated or updated to provide any functionality
+// not covered by QPhiX itself.
+
+/**
+  \file Additions to QPhiX that are only needed for tmLQCD.
+
+  In the original QPhiX, there are only Wilson fermions and Wilson clover
+  fermions. The Dslash operators have a different call signature (the latter
+  requiring a clover term), so there is no common base class. With the addition
+  of Wilson twisted mass (Mario) and Wilson twisted clover (Peter), there are
+  now two instances of the Dslash that have the same signature. In order to
+  write a more general even-odd source preparation and solution reconstruction
+  code, a common base class for non-clover and clover is desired. In order to
+  leave the QPhiX code untouched (for now), this code lives here in tmLQCD.
+  */
+
+#pragma once
+
+#include <qphix/blas_new_c.h>
+#include <qphix/clover_dslash_def.h>
+#include <qphix/dslash_def.h>
+#include <qphix/geometry.h>
+#include <qphix/tm_clov_dslash_def.h>
+#include <qphix/tm_dslash_def.h>
+
+#include <cassert>
+
+namespace tmlqcd {
+
+namespace {
+size_t constexpr re = 0;  // index of the real part in the [re/im] dimension of the blocks
+size_t constexpr im = 1;  // index of the imaginary part in the [re/im] dimension of the blocks
+int const n_blas_simt = 1;  // passed as the last argument to QPhiX::zeroSpinor in this file
+
+// The even checkerboard is given by ( ((x + y + z + t) & 1) == 0 ) -> cb0 is even
+int constexpr cb_even = 0;
+int constexpr cb_odd = 1;
+}  // namespace
+
+/**
+  Complex multiply-accumulate: adds the product of two complex numbers to an accumulator.
+
+  Computes \f$ (r + \mathrm i i) += (a + \mathrm i b) * (c + \mathrm i d) \f$, updating `r_out` and `i_out` in place.
+  */
+template <typename FT>
+void cplx_mul_acc(FT &r_out, FT &i_out, FT const &a, FT const &b, FT const &c, FT const &d) {
+  r_out += a * c - b * d;  // real part of the product
+  i_out += a * d + b * c;  // imaginary part of the product
+}
+
+/**
+  Wrapper for the clover multiplication function.
+
+  The `struct` is needed in order to allow for partial template specialization in the `Clover`
+  parameter. This primary template only declares `multiply`; the specializations define it.
+
+  \tparam Clover Type of clover block to use, must be a type from Geometry such that there exists a
+  specialization for it.
+  */
+template <typename FT, int veclen, int soalen, bool compress12, typename Clover>
+struct InnerCloverProduct {
+  /**
+  Multiplies the clover term for a single lattice site to a spinor.
+
+  This function is intended to be used in a loop over all lattice sites. The caller is expected
+  to have figured out all the correct indices. There are template specializations for the two
+  different types of clover term that are used in QPhiX.
+
+  \param[out] out Output spinor block. It is assumed to be zeroed properly; the function will just
+  accumulate values into that output variable. Use \ref QPhiX::zeroSpinor for that.
+  \param[in] in Input spinor block.
+  \param[in] clover Single clover block that contains the lattice site of the spinor.
+  \param[in] xi SIMD index for the arrays with length `soalen`, as in the spinors.
+  \param[in] veclen_idx SIMD index for the arrays with length `veclen`, as in the clover term.
+  */
+  static void multiply(
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &out,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &in,
+      Clover const &clover, int const xi, int const veclen_idx);
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+struct InnerCloverProduct<FT, veclen, soalen, compress12,
+                          typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock> {
+  static void multiply(
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &spinor_out,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &spinor_in,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock const &clov_block,
+      int const xi, int const veclen_idx) {
+    // The clover term is block-diagonal in spin. Therefore we need
+    // to iterate over the two blocks of spin.
+    for (auto s_block : {0, 1}) {
+      // Pick the packed diagonal and off-diagonal (triangular) storage of this spin block.
+      auto const &diag_in = s_block == 0 ? clov_block.diag1 : clov_block.diag2;
+      auto const &off_diag_in = s_block == 0 ? clov_block.off_diag1 : clov_block.off_diag2;
+      // Input two-spinor component.
+      for (auto two_s_in : {0, 1}) {
+        // Reconstruct four spinor index.
+        auto const four_s_in = 2 * s_block + two_s_in;
+        // Output two-spinor component.
+        for (auto two_s_out : {0, 1}) {
+          // Reconstruct four spinor index.
+          auto const four_s_out = 2 * s_block + two_s_out;
+          // Input color.
+          for (auto c_in : {0, 1, 2}) {
+            // Spin-color index (0, ..., 5).
+            auto const sc_in = 3 * two_s_in + c_in;
+            // Output color.
+            for (auto c_out : {0, 1, 2}) {
+              // Spin-color index (0, ..., 5).
+              auto const sc_out = 3 * two_s_out + c_out;
+
+              // Packed 6x6 block: `diag` holds the diagonal entries (used here as purely
+              // real), `off_diag` the entries with row > column at index
+              // row * (row - 1) / 2 + column. See `clover_term` in `dslash_common.cc` of
+              // `qphix-codegen` for the index manipulations done here.
+
+              // Entries with row < column are taken as the complex conjugate of the mirrored
+              // stored entry. Using separate loops over the actual indices is probably
+              // faster than the branching in the innermost loop.
+
+              if (sc_out == sc_in) {
+                cplx_mul_acc(spinor_out[c_out][four_s_out][re][xi],
+                             spinor_out[c_out][four_s_out][im][xi], diag_in[sc_in][veclen_idx],
+                             QPhiX::rep<FT,double>(0.0), spinor_in[c_in][four_s_in][re][xi],
+                             spinor_in[c_in][four_s_in][im][xi]);
+              } else if (sc_out < sc_in) {
+                auto const idx15 = sc_in * (sc_in - 1) / 2 + sc_out;
+                cplx_mul_acc(
+                    spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
+                    off_diag_in[idx15][re][veclen_idx],
+                    // Negate via a round-trip through double (maybe add negation to QPhiX::half?).
+                    QPhiX::rep<FT,double>(-QPhiX::rep<double,FT>(off_diag_in[idx15][im][veclen_idx])),
+                    spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
+              } else {
+                auto const idx15 = sc_out * (sc_out - 1) / 2 + sc_in;
+                cplx_mul_acc(
+                    spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
+                    off_diag_in[idx15][re][veclen_idx], off_diag_in[idx15][im][veclen_idx],
+                    spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+struct InnerCloverProduct<
+    FT, veclen, soalen, compress12,
+    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock> {
+  static void multiply(
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &spinor_out,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &spinor_in,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock const &clov_block,
+      int const xi, int const veclen_idx) {
+    // The clover term is block-diagonal in spin. Therefore we need
+    // to iterate over the two blocks of spin.
+    for (auto s_block : {0, 1}) {
+      // Full matrix of this spin block, indexed below as [sc_out][sc_in][re/im][veclen_idx].
+      auto const &block_in = s_block == 0 ? clov_block.block1 : clov_block.block2;
+      // Input two-spinor component.
+      for (auto two_s_in : {0, 1}) {
+        // Reconstruct four spinor index.
+        auto const four_s_in = 2 * s_block + two_s_in;
+        // Output two-spinor component.
+        for (auto two_s_out : {0, 1}) {
+          // Reconstruct four spinor index.
+          auto const four_s_out = 2 * s_block + two_s_out;
+          // Input color.
+          for (auto c_in : {0, 1, 2}) {
+            // Spin-color index (0, ..., 5).
+            auto const sc_in = 3 * two_s_in + c_in;
+            // Output color.
+            for (auto c_out : {0, 1, 2}) {
+              // Spin-color index (0, ..., 5).
+              auto const sc_out = 3 * two_s_out + c_out;
+
+              cplx_mul_acc(
+                  spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
+                  block_in[sc_out][sc_in][re][veclen_idx], block_in[sc_out][sc_in][im][veclen_idx],
+                  spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/**
+  Multiplies a checkerboarded QPhiX Clover term with a checkerboarded QPhiX spinor.
+
+  Padding is taken care of. A test case for (a copy of) this function exists in QPhiX.
+
+  If the preprocessor macro `PRINT_MAPPING` is defined, it will print out the mapping of `(x, y, z,
+  t)` coordinates to block indices. Also it will check that each block is accessed the proper number
+  of times, that is `soalen` for spinors and `veclen` for clover blocks.
+
+  \param[out] out Output spinor (array of spinor blocks); it is zeroed here before accumulating
+  \param[in] in Input spinor (array of spinor blocks)
+  \param[in] clover Array of clover blocks, indexed by the clover block index computed per site
+  \param[in] geom Geometry object holding the dimension of clover and spinor
+  */
+template <typename FT, int veclen, int soalen, bool compress12, typename Clover>
+void clover_product(
+    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock *const out,
+    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const *const in,
+    Clover *clover, ::QPhiX::Geometry<FT, veclen, soalen, compress12> &geom) {
+  ::QPhiX::zeroSpinor<FT, veclen, soalen, compress12>(out, geom, n_blas_simt);
+
+#ifdef PRINT_MAPPING  // NOTE(review): uses std::vector/cout/setw; this file does not include them
+  std::vector<int> spin_touches(geom.getPxyz() * geom.Nt(), 0);
+  std::vector<int> clover_touches(geom.getPxyz() * geom.Nt() * soalen / veclen, 0);
+
+  std::cout << std::setw(3) << "x" << std::setw(3) << "y" << std::setw(3) << "z" << std::setw(3)
+            << "t"
+            << ":" << std::setw(5) << "spin" << std::setw(5) << "clov"
+            << "\n";
+#endif
+
+  // Iterate over all sites; the block indices are derived per site below.
+  for (int t = 0; t < geom.Nt(); ++t) {
+    for (int z = 0; z < geom.Nz(); ++z) {
+      for (int y = 0; y < geom.Ny(); ++y) {
+        for (int x = 0; x < geom.Nxh(); ++x) {
+          // First element in the current XY plane at desired Z and T.
+          auto const xyBase = t * geom.getPxyz() + z * geom.getPxy();
+          // Index of the SoA along the X direction.
+          auto const xb = x / soalen;
+          // Index within the SoA.
+          auto const xi = x % soalen;
+          // Global spin block index.
+          auto const spin_block_idx = xb + geom.Nxh() / soalen * y + xyBase;
+          // Global clover/gauge block index.
+          auto const clov_block_idx =
+              xb + (y / geom.nGY()) * geom.Nxh() / soalen + xyBase / geom.nGY();
+          // Index of the SoA structure within the current tile.
+          // auto const tile = (geom.Nxh() / soalen * y + xyBase) % geom.nGY();
+          auto const tile = y % geom.nGY();
+          // Vector index for clover/gauge. The SoA index only runs to
+          // `soalen`, this index needs to run to `veclen`, that is across the
+          // various SoA within the tile.
+          auto const veclen_idx = soalen * tile + xi;
+
+#ifdef PRINT_MAPPING
+          ++spin_touches[spin_block_idx];
+          ++clover_touches[clov_block_idx];
+
+          std::cout << std::setw(3) << x << std::setw(3) << y << std::setw(3) << z << std::setw(3)
+                    << t << ":" << std::setw(5) << spin_block_idx << std::setw(5) << clov_block_idx
+                    << "\n";
+#endif
+
+          assert(xi + xb * soalen == x);
+
+          // References to the objects at desired block.
+          auto const &clov_block = clover[clov_block_idx];
+          auto const &spinor_in = in[spin_block_idx];
+          auto &spinor_out = out[spin_block_idx];
+
+          InnerCloverProduct<FT, veclen, soalen, compress12, Clover>::multiply(
+              spinor_out, spinor_in, clov_block, xi, veclen_idx);
+        }
+      }
+    }
+  }
+
+#ifdef PRINT_MAPPING
+  std::cout << std::flush;
+
+  // Make sure that each block got touched the correct number of times.
+  for (int i = 0; i != spin_touches.size(); ++i) {
+    if (spin_touches[i] != soalen) {
+      std::cout << "Spin missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
+                << spin_touches[i] << " times instead of " << soalen << "\n";
+    }
+  }
+
+  for (int i = 0; i != clover_touches.size(); ++i) {
+    if (clover_touches[i] != veclen) {
+      std::cout << "Clover missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
+                << clover_touches[i] << " times instead of " << veclen << "\n";
+    }
+  }
+
+  std::cout << std::flush;
+#endif
+}
+
+/**
+  Abstract base class for all single-flavor Dslash variants.
+
+  There are four Dslash operators which are implemented in QPhiX:
+
+  - Wilson
+  - Wilson clover
+  - Wilson twisted mass
+  - Wilson clover with twisted mass
+
+  Each of these has the actual Dslash operation and a so-called “achimbdpsi” operation. These act
+  on four-spinors given a gauge field. This base class provides a uniform interface to all four
+  kinds.
+
+  This code should eventually be migrated into the QPhiX repository. Currently these classes are
+  mere delegators. In the QPhiX repository, the actual classes there should be used as concrete
+  classes.
+  */
+template <typename FT, int veclen, int soalen, bool compress12>
+class Dslash {
+ public:
+  typedef ::QPhiX::Geometry<FT, veclen, soalen, compress12> Geom;
+  typedef typename Geom::FourSpinorBlock Spinor;
+  typedef typename Geom::SU3MatrixBlock SU3MatrixBlock;
+
+  explicit Dslash(Geom *geom, double const t_boundary_, double const aniso_coeff_S_,
+                  double const aniso_coeff_T_, double const mass_, bool use_tbc_[4] = nullptr,
+                  double tbc_phases_[4][2] = nullptr)
+      : geom(geom), t_boundary(t_boundary_), aniso_coeff_S(aniso_coeff_S_),
+        aniso_coeff_T(aniso_coeff_T_), mass(mass_) {}
+
+  /// Virtual so that derived operators are destroyed correctly through a `Dslash` pointer.
+  virtual ~Dslash() = default;
+
+  /**
+    Computes \f$ \psi_\mathrm o = A_\mathrm{oo} \chi_\mathrm o \f$.
+
+    The matrix \f$ A_\mathrm{oo} \f$ is implementation dependent: it can be the mass factor \f$
+    \alpha = 4 + m \f$ for plain Wilson or something more complicated for twisted mass.
+
+    \param[out] out Output spinor \f$ \psi \f$.
+    \param[in] in Input spinor \f$ \chi \f$.
+    \param[in] isign,cb Sign selector and checkerboard index; their use is implementation dependent.
+    */
+  virtual void A_chi(Spinor *const out, Spinor const *const in, int const isign, int const cb) = 0;
+
+  /**
+    Computes \f$ \psi_\mathrm e = A_\mathrm{ee}^{-1} \chi_\mathrm e \f$.
+
+    \param[out] out Output spinor \f$ \psi \f$.
+    \param[in] in Input spinor \f$ \chi \f$.
+    */
+  virtual void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
+                         int const cb) = 0;
+
+  /**
+    Forwarder for the `dslash`.
+
+    This will call the `dslash` function of the respective QPhiX dslash class. There is a subtle
+    difference between the Wilson and all other cases. The Wilson dslash is just the hopping matrix,
+    just the operator \f$ D \f$. For every other case (clover, twisted mass, twisted mass clover),
+    the `dslash` member function will compute \f$ A^{-1} D \f$. In the Wilson case, this \f$ A =
+    \alpha = 4 + m = 1/(2 \kappa) \f$. Since that is _not_ included in the Wilson `dslash`, you will
+    obtain different results when using WilsonDslash::dslash and WilsonTMDslash::dslash with \f$
+    \mu = 0 \f$.
+
+    \todo Make this member function `const`. For this the member function in
+    QPhiX that is called internally must be marked `const` as well.
+    */
+  virtual void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+                      int const isign, int const cb) = 0;
+
+  /**
+    Always plain Wilson dslash.
+
+    In contrast to the \ref dslash member function which just forwards the implementation of QPhiX,
+    this will always give you the “naked” plain Wilson dslash without any factors of \f$ A^{-1} \f$
+    applied.
+    */
+  virtual void plain_dslash(Spinor *const res, const Spinor *const psi,
+                            const SU3MatrixBlock *const u, int const isign, int const cb) {
+    // XXX Perhaps rather implement this with an instance of the WilsonDslash instead?
+    // Default: `dslash` yields A^{-1} D for the dressed variants, so multiply the result by A.
+    auto tmp = QPhiX::makeFourSpinorHandle(*geom);
+    dslash(tmp.get(), psi, u, isign, cb);
+    A_chi(res, tmp.get(), isign, cb);
+  }
+
+  /**
+    Always “dressed” dslash.
+
+    This computes \f$ A^{-1} D \f$ for all variants. In the Wilson case, this will give \f$
+    \alpha^{-1} D \f$.
+    */
+  virtual void A_inv_dslash(Spinor *const res, const Spinor *const psi,
+                            const SU3MatrixBlock *const u, int const isign, int const cb) {
+    dslash(res, psi, u, isign, cb);
+  }
+
+  /**
+    Forwarder for the `achimbdpsi`.
+
+    \todo Make this member function `const`. For this the member function in QPhiX that is called
+    internally must be marked `const` as well.
+    */
+  virtual void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                          const SU3MatrixBlock *const u, double const alpha, double const beta,
+                          int const isign, int const cb) = 0;
+
+  /**
+    Prepares the sources on the odd checkerboard.
+
+    This computes
+    \f[
+        \tilde b_o = \frac 12 D_{oe} M_{ee}^{-1} b_e + b_o \,.
+    \f]
+
+    \param[out] tilde_b_odd Prepared source
+    \param[in] b_even Source (right hand side) on the even lattice sites
+    \param[in] b_odd Source on the odd lattice sites
+    \param[in] u Gauge field on the odd lattice sites
+    */
+  virtual void prepare_source(Spinor *const tilde_b_odd, Spinor const *const b_even,
+                              Spinor const *const b_odd, SU3MatrixBlock const *const u);
+
+  /**
+    Reconstructs the solution on the even lattice sites.
+
+    This computes
+    \f[
+        x_e = M_{ee}^{-1} \left( b_e + \frac 12 D_{eo} x_o \right) \,.
+    \f]
+
+    \param[out] x_even Solution on the even lattice sites
+    \param[in] b_even Source (right hand side) on the even lattice sites
+    \param[in] x_odd Solution on the odd lattice sites
+    \param[in] u Gauge field on the even lattice sites
+    */
+  virtual void reconstruct_solution(Spinor *const x_even, Spinor const *const b_even,
+                                    Spinor const *const x_odd, SU3MatrixBlock const *const u);
+
+  Geom *getGeometry() const { return geom; }
+
+ private:
+  Geom *const geom;
+
+  double const t_boundary;
+  double const aniso_coeff_S;
+  double const aniso_coeff_T;
+  double const mass;
+};
+
+/// Plain Wilson operator: forwards to `::QPhiX::Dslash`; \f$ A \f$ is the scalar `mass_factor_alpha`.
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  using Spinor = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock;
+  using SU3MatrixBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock;
+
+  WilsonDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_, double const t_boundary_,
+               double const aniso_coeff_S_, double const aniso_coeff_T_, double const mass_,
+               bool use_tbc_[4] = nullptr, double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_), mass_factor_beta(1.0 / (4.0 * mass_factor_alpha)) {}
+
+  /// Scales `in` by `mass_factor_alpha` with `::QPhiX::axy`; sign and checkerboard are ignored.
+  void A_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
+             int const cb_ignored) override {
+    ::QPhiX::axy(mass_factor_alpha, in, out, upstream_dslash.getGeometry(), /* n_blas_simt */ 1);
+  }
+
+  /// Scales `in` by `1.0 / mass_factor_alpha` (one BLAS thread); sign and checkerboard are ignored.
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
+                 int const cb_ignored) override {
+    ::QPhiX::axy(1.0 / mass_factor_alpha, in, out, upstream_dslash.getGeometry(), 1);
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, isign, cb);  // straight forward to the QPhiX operator
+  }
+
+  void plain_dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+                    int const isign, int const cb) override {
+    dslash(res, psi, u, isign, cb);  // no factor of A^{-1} is applied by `dslash` in this class
+  }
+
+  void A_inv_dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+                    int const isign, int const cb) override {
+    auto hopped = QPhiX::makeFourSpinorHandle(upstream_dslash.getGeometry());
+    dslash(hopped.get(), psi, u, isign, cb);
+    A_inv_chi(res, hopped.get(), isign, cb);
+  }
+
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, alpha, beta, isign, cb);
+  }
+
+ private:
+  ::QPhiX::Dslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;  ///< 4.0 + mass
+  double const mass_factor_beta;   ///< 1.0 / (4.0 * mass_factor_alpha)
+};
+
+/// Twisted-mass Wilson operator: forwards to `::QPhiX::TMDslash`; A and A^{-1} use helper_A_chi.
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonTMDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  using Spinor = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock;
+  using SU3MatrixBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock;
+
+  WilsonTMDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_, double const t_boundary_,
+                 double const aniso_coeff_S_, double const aniso_coeff_T_, double const mass_,
+                 double const twisted_mass_, bool use_tbc_[4] = nullptr,
+                 double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, mass_, twisted_mass_,
+                        use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_), mass_factor_beta(0.25),
+        derived_mu(twisted_mass_ / mass_factor_alpha),
+        derived_mu_inv(mass_factor_alpha /
+                       (mass_factor_alpha * mass_factor_alpha + twisted_mass_ * twisted_mass_)) {}
+
+  void A_chi(Spinor *const out, Spinor const *const in, int const isign,
+             int const cb_ignored) override {
+    helper_A_chi(out, in, -derived_mu * isign, mass_factor_alpha);  // checkerboard is not needed
+  }
+
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
+                 int const cb_ignored) override {
+    helper_A_chi(out, in, derived_mu * isign, derived_mu_inv);  // opposite sign, inverse prefactor
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, isign, cb);  // straight forward to the QPhiX operator
+  }
+
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, alpha, beta, isign, cb);
+  }
+
+ private:
+  void helper_A_chi(Spinor *const out, Spinor const *const in, double const factor_a,
+                    double const factor_b);
+
+  ::QPhiX::TMDslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;  ///< 4.0 + mass
+  double const mass_factor_beta;   ///< fixed to 0.25
+  double const derived_mu;         ///< twisted_mass / mass_factor_alpha
+  double const derived_mu_inv;     ///< mass_factor_alpha / (mass_factor_alpha^2 + twisted_mass^2)
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonClovDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  using Spinor = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock;
+  using SU3MatrixBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock;
+  using CloverBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock;
+
+  WilsonClovDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_,
+                   double const t_boundary_, double const aniso_coeff_S_,
+                   double const aniso_coeff_T_, double const mass_,
+                   CloverBlock *const (&clover_)[2], CloverBlock *const (&inv_clover_)[2],
+                   bool use_tbc_[4] = nullptr, double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_), mass_factor_beta(1.0 / (4.0 * mass_factor_alpha)) {
+    for (int cb = 0; cb < 2; ++cb) {
+      clover[cb] = clover_[cb];
+      inv_clover[cb] = inv_clover_[cb];
+    }
+  }
+
+  void A_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
+             int const cb) override {
+    clover_product(out, in, clover[cb], upstream_dslash.getGeometry());  // clover term of `cb`
+  }
+
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
+                 int const cb) override {
+    clover_product(out, in, inv_clover[cb], upstream_dslash.getGeometry());  // inverse term
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, inv_clover[cb], isign, cb);
+  }
+
+  /// Note: `alpha` and `beta` are not forwarded; `mass_factor_beta` is passed upstream instead.
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, clover[cb], mass_factor_beta, isign, cb);
+  }
+
+ private:
+  ::QPhiX::ClovDslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;
+  double const mass_factor_beta;
+
+  /**
+    Pointers to the clover term, one per checkerboard (hence the array dimension).
+
+    This class has to provide `dslash` and `achimbdpsi` member functions with the prescribed
+    argument list, which does not contain the clover term, so that users of these classes do not
+    have to differentiate between non-clover and clover variants. The clover term is therefore
+    kept as a member, which means that a new operator has to be constructed whenever the pointers
+    to the clover field need to change. Separate pointers are kept for the fields on the even and
+    odd checkerboards.
+    */
+  CloverBlock *clover[2];
+
+  /// See \ref clover.
+  CloverBlock *inv_clover[2];
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonClovTMDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  using Spinor = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock;
+  using SU3MatrixBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock;
+  using FullCloverBlock =
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock;
+  using CloverBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock;
+
+  WilsonClovTMDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_,
+                     double const t_boundary_, double const aniso_coeff_S_,
+                     double const aniso_coeff_T_, double const mass_, double const twisted_mass_,
+                     CloverBlock *const (&clover_)[2],
+                     FullCloverBlock *const (&inv_clover_)[2][2], bool use_tbc_[4] = nullptr,
+                     double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_), mass_factor_beta(0.25),
+        derived_mu(twisted_mass_ / mass_factor_alpha),
+        derived_mu_inv(mass_factor_alpha /
+                       (mass_factor_alpha * mass_factor_alpha + twisted_mass_ * twisted_mass_)) {
+    for (int cb = 0; cb < 2; ++cb) {
+      clover[cb] = clover_[cb];
+      for (int fl = 0; fl < 2; ++fl) {
+        inv_clover[cb][fl] = inv_clover_[cb][fl];
+      }
+    }
+  }
+
+  /// Applies only the clover term of checkerboard `cb`; `isign` is not used yet (see TODO).
+  void A_chi(Spinor *const out, Spinor const *const in, int const isign, int const cb) override {
+    clover_product(out, in, clover[cb], upstream_dslash.getGeometry());
+    // TODO: add twisted mass here
+  }
+
+  /// Applies the stored inverse clover field: flavour index 1 for `isign == -1`, otherwise 0.
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
+                 int const cb) override {
+    int const fl = (isign == -1) ? 1 : 0;
+    clover_product(out, in, inv_clover[cb][fl], upstream_dslash.getGeometry());
+  }
+
+  /// Forwards to the QPhiX operator, handing over both inverse clover fields of `cb`.
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, (const FullCloverBlock **)inv_clover[cb], isign, cb);
+  }
+
+  /// Note: `alpha` and `beta` are not forwarded; `mass_factor_beta` is passed upstream instead.
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, clover[cb], mass_factor_beta, isign, cb);
+  }
+
+ private:
+  ::QPhiX::TMClovDslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;
+  double const mass_factor_beta;
+  double const derived_mu;
+  double const derived_mu_inv;
+
+  /// Clover term, one pointer per checkerboard.
+  CloverBlock *clover[2];
+
+  /* Inverse clover term. For twisted clover there are two fields per checkerboard, differing in
+   * the sign of the twisted quark mass: the outer index is the checkerboard, the inner one can be
+   * thought of as a flavour index.
+   */
+  FullCloverBlock *inv_clover[2][2];
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+void WilsonTMDslash<FT, veclen, soalen, compress12>::helper_A_chi(Spinor *const out,
+                                                                  Spinor const *const in,
+                                                                  double const factor_a,
+                                                                  double const factor_b) {
+  // Site-wise out = factor_b * (1 - i * factor_a * g5) * in; g5 is +1 / -1 on spin blocks 0 / 1.
+  auto const nVecs = upstream_dslash.getGeometry().nVecs();
+  auto const Pxy = upstream_dslash.getGeometry().getPxy();
+  auto const Pxyz = upstream_dslash.getGeometry().getPxyz();
+
+  // x advances by one SoA vector per step: the innermost loop already covers all `soalen` lanes.
+  for (uint64_t t = 0; t < T; t++)
+    for (uint64_t x = 0; x < LX / 2; x += soalen)
+      for (uint64_t y = 0; y < LY; y++)
+        for (uint64_t z = 0; z < LZ; z++) {
+          uint64_t const SIMD_vector = x / soalen;
+          uint64_t const qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
+          for (int color = 0; color < 3; ++color) {
+            for (int spin_block = 0; spin_block < 2; ++spin_block) {
+              // Implement the $\gamma_5$ structure.
+              auto const signed_factor_a = factor_a * (spin_block == 0 ? 1.0 : -1.0);
+              for (int half_spin = 0; half_spin < 2; ++half_spin) {
+                auto const four_spin = 2 * spin_block + half_spin;
+                auto &out_bcs = out[qphix_idx][color][four_spin];
+                auto const &in_bcs = in[qphix_idx][color][four_spin];
+                for (int v = 0; v < soalen; ++v) {
+                  // Read both parts before writing so that `out == in` is handled correctly.
+                  auto const in_re = in_bcs[re][v];
+                  auto const in_im = in_bcs[im][v];
+                  out_bcs[re][v] = factor_b * (in_re + signed_factor_a * in_im);
+                  out_bcs[im][v] = factor_b * (in_im - signed_factor_a * in_re);
+                }
+              }
+            }
+          }
+        }  // volume
+}
+
+template <typename FT, int veclen, int soalen, bool compress12>
+void Dslash<FT, veclen, soalen, compress12>::prepare_source(Spinor *const tilde_b_odd,
+                                                            Spinor const *const b_even,
+                                                            Spinor const *const b_odd,
+                                                            SU3MatrixBlock const *const u) {
+  auto Mee_be = QPhiX::makeFourSpinorHandle(*geom);
+  WilsonDslash<FT, veclen, soalen, compress12> wilson_dslash(geom, t_boundary, aniso_coeff_S,
+                                                             aniso_coeff_T, mass);
+  // Mee_be = M_ee^{-1} b_e
+  A_inv_chi(Mee_be.get(), b_even, 1, cb_even);
+  // tilde_b_odd = D_oe Mee_be (hopping term only, hence the plain Wilson operator)
+  wilson_dslash.dslash(tilde_b_odd, Mee_be.get(), u, 1, cb_odd);
+  // tilde_b_odd = 0.5 * tilde_b_odd + b_odd (aypx(a, x, y): y = a * y + x). The odd-site source is
+  // added here, not the even-site temporary. FIXME Perhaps use a variable number of BLAS threads.
+  QPhiX::aypx(0.5, b_odd, tilde_b_odd, *geom, 1);
+}
+
+template <typename FT, int veclen, int soalen, bool compress12>
+void Dslash<FT, veclen, soalen, compress12>::reconstruct_solution(Spinor *const x_even,
+                                                                  Spinor const *const b_even,
+                                                                  Spinor const *const x_odd,
+                                                                  SU3MatrixBlock const *const u) {
+  auto rhs_even = QPhiX::makeFourSpinorHandle(*geom);
+  WilsonDslash<FT, veclen, soalen, compress12> wilson_dslash(geom, t_boundary, aniso_coeff_S,
+                                                             aniso_coeff_T, mass);
+  // rhs_even = 0.5 * (hopping term applied to x_odd) + b_even, then x_even = A^{-1} rhs_even.
+  wilson_dslash.dslash(rhs_even.get(), x_odd, u, 1, cb_even);
+  QPhiX::aypx(0.5, b_even, rhs_even.get(), *geom, 1);
+  A_inv_chi(x_even, rhs_even.get(), 1, cb_even);
+}
+}
diff --git a/qphix_interface.cpp b/qphix_interface.cpp
new file mode 100644
index 000000000..83f6cbef8
--- /dev/null
+++ b/qphix_interface.cpp
@@ -0,0 +1,2193 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Peter Labus
+ *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#include "qphix_interface.h"
+#include "qphix_interface.hpp"
+#include "qphix_interface_utils.hpp"
+#include "qphix_types.h"
+#include "qphix_veclen.h"
+
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+
+extern "C" {
+#ifdef HAVE_CONFIG_H
+#include "tmlqcd_config.h"
+#endif
+#include "boundary.h"
+#include "geometry_eo.h"
+#include "gettime.h"
+#include "global.h"
+#include "struct_accessors.h"
+#include "linalg/convert_eo_to_lexic.h"
+#include "linalg/diff.h"
+#include "linalg/square_norm.h"
+#include "linalg/square_norm.h"
+#include "misc_types.h"
+// for the normalisation of the heavy doublet when running
+// RHMC
+#include "phmc.h"
+#include "start.h"
+#include "operator/clover_leaf.h"
+#include "operator/clovertm_operators.h"
+#include "operator_types.h"
+#include "operator/Hopping_Matrix.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_types.h"
+#include "solver/solver.h"
+#include "solver/solver_field.h"
+#include "solver/solver_params.h"
+#include "xchange/xchange_gauge.h"
+}
+#ifdef TM_USE_OMP
+#include <omp.h>
+#endif
+#include <qphix/blas_new_c.h>
+#include <qphix/clover.h>
+#include <qphix/invbicgstab.h>
+#include <qphix/invcg.h>
+#include <qphix/inv_richardson_multiprec.h>
+#include <qphix/inv_dummy_hermtest.h>
+#include <qphix/minvcg.h>
+#include <qphix/ndtm_reuse_operator.h>
+#include <qphix/ndtm_reuse_operator_clover.h>
+#include <qphix/print_utils.h>
+#include <qphix/qphix_config.h>
+#include <qphix/twisted_mass.h>
+#include <qphix/twisted_mass_clover.h>
+#include <qphix/wilson.h>
+#include <cfloat>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <cmath>
+
+using namespace tmlqcd;
+
+tm_QPhiXParams_t qphix_input;
+// The tuning parameters below are copied from the tm_QPhiXParams_t argument of _initQphix.
+int By;
+int Bz;
+int NCores;
+int Sy;
+int Sz;
+int PadXY;
+int PadXYZ;
+int MinCt;
+int N_simt;  // set to Sy * Sz in _initQphix
+bool compress12;  // set in _initQphix: true when c12 is 12 (a request of 8 is mapped to 12)
+QphixPrec_t qphix_precision;
+QphixPrec_t qphix_inner_precision;
+
+// Filled in by _initQphix. The first three arrays are indexed in x, y, z, t order.
+int subLattSize[4];  // local lattice extents
+int lattSize[4];     // global extents (local extent times number of processes per direction)
+int qmp_geom[4];     // number of processes per direction, as declared to QMP
+int qmp_tm_map[4];   // dimension map handed to QMP_declare_logical_topology_map
+
+// angles for boundary phases, values come from read_input
+extern double X0, X1, X2, X3;
+
+bool use_tbc[4];  // per direction: boundary angle exceeds DBL_EPSILON in magnitude
+double tbc_phases[4][2];  // per direction: {re, im} of -phase / g_kappa
+// we always use twisted boundary conditions, which means that we are always
+// periodic in time and any possible anti-periodicity is implemented via
+// the phase
+double constexpr t_boundary = 1.0;
+
+template <typename T>
+struct rsdTarget {
+  static const double value;  // no generic definition here; specialised per type below
+};
+// NOTE(review): presumably a precision-dependent residual target -- confirm at the point of use.
+template <>
+const double rsdTarget<QPhiX::half>::value = 1.0e-3;
+
+template <>
+const double rsdTarget<float>::value = 1.0e-8;
+
+
+/**
+ * Copies the tmLQCD lattice geometry, boundary phases and the QPhiX tuning parameters into the
+ * file-scope variables of this interface and, with QMP comms enabled, declares the logical QMP
+ * topology (only on the first call).
+ *
+ * \param argc,argv Not referenced in the function body.
+ * \param params Blocking, threading and padding parameters (By, Bz, NCores, Sy, Sz, PadXY,
+ *               PadXYZ, MinCt).
+ * \param c12 Gauge compression request: 12 sets `compress12`, 8 is replaced by 12 after a
+ *            notice, any other value leaves `compress12` false.
+ * \param precision_ Stored in `qphix_precision`.
+ * \param inner_precision_ Stored in `qphix_inner_precision`.
+ */
+void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPrec_t precision_, QphixPrec_t inner_precision_) {
+  static bool qmp_topo_initialised = false;
+
+  // Local Lattice Size
+  subLattSize[0] = LX;
+  subLattSize[1] = LY;
+  subLattSize[2] = LZ;
+  subLattSize[3] = T;
+
+  // Global Lattice Size
+  lattSize[0] = LX * g_nproc_x;
+  lattSize[1] = LY * g_nproc_y;
+  lattSize[2] = LZ * g_nproc_z;
+  lattSize[3] = T * g_nproc_t;
+
+  // extract twisted boundary conditions: direction `dim` (x, y, z, t) takes its angle and phase
+  // from the tmLQCD quantities with index 1, 2, 3, 0 respectively
+  double const bc_angle[4] = {X1, X2, X3, X0};
+  double const *const bc_phase[4] = {(double *)(&phase_1), (double *)(&phase_2),
+                                     (double *)(&phase_3), (double *)(&phase_0)};
+  for (int dim = 0; dim < 4; dim++) {
+    use_tbc[dim] = (fabs(bc_angle[dim]) > DBL_EPSILON);
+    // real and imaginary part, sign-flipped and rescaled by 1 / g_kappa
+    for (int reim = 0; reim < 2; reim++) {
+      tbc_phases[dim][reim] = -bc_phase[dim][reim] / g_kappa;
+    }
+  }
+
+  // QPhiX blocking, threading and padding parameters
+  By = params.By;
+  Bz = params.Bz;
+  NCores = params.NCores;
+  Sy = params.Sy;
+  Sz = params.Sz;
+  PadXY = params.PadXY;
+  PadXYZ = params.PadXYZ;
+  MinCt = params.MinCt;
+  N_simt = Sy * Sz;
+  if (c12 == 8) {
+    QPhiX::masterPrintf(
+        "# INFO QphiX: 8-parameter gauge compression not supported, using two row compression "
+        "instead!\n");
+    c12 = 12;
+  }
+  compress12 = (c12 == 12);
+  qphix_precision = precision_;
+  qphix_inner_precision = inner_precision_;
+
+#ifdef QPHIX_QMP_COMMS
+  // Declare the logical topology
+  if (!qmp_topo_initialised) {
+    // the QMP topology is the one implied by the number of processes in each
+    // dimension as required by QPHIX ( x fastest to t slowest running )
+    qmp_geom[0] = g_nproc_x;
+    qmp_geom[1] = g_nproc_y;
+    qmp_geom[2] = g_nproc_z;
+    qmp_geom[3] = g_nproc_t;
+
+    // in order for the topologies to agree between tmLQCD and QPhiX, the dimensions need to be
+    // permuted
+    // since Z is fastest in tmLQCD and X is second-slowest
+    qmp_tm_map[0] = 2;
+    qmp_tm_map[1] = 1;
+    qmp_tm_map[2] = 0;
+    qmp_tm_map[3] = 3;
+    if (QMP_declare_logical_topology_map(qmp_geom, 4, qmp_tm_map, 4) != QMP_SUCCESS) {
+      QMP_error("Failed to declare QMP Logical Topology\n");
+      abort();
+    }
+    // longish test to check if the logical coordinates are correctly mapped
+    if (g_debug_level >= 5) {
+      for (int proc = 0; proc < g_nproc; proc++) {
+        if (proc == g_proc_id) {
+          const int coordinates[4] = {g_proc_coords[1], g_proc_coords[2], g_proc_coords[3],
+                                      g_proc_coords[0]};
+          int id = QMP_get_node_number_from(coordinates);
+          int *qmp_coords = QMP_get_logical_coordinates_from(id);
+          fflush(stdout);
+          printf("QMP id: %3d x:%3d y:%3d z:%3d t:%3d\n", id, qmp_coords[0], qmp_coords[1],
+                 qmp_coords[2], qmp_coords[3]);
+          printf("MPI id: %3d x:%3d y:%3d z:%3d t:%3d\n\n", g_proc_id, g_proc_coords[1],
+                 g_proc_coords[2], g_proc_coords[3], g_proc_coords[0]);
+          free(qmp_coords);
+          fflush(stdout);
+        }
+        // reached by every process once per iteration; only the process whose turn it is prints
+        MPI_Barrier(MPI_COMM_WORLD);
+      }
+    }
+    qmp_topo_initialised = true;
+  }
+#endif
+
+#ifdef QPHIX_QPX_SOURCE
+  if (thread_bind) {
+    QPhiX::setThreadAffinity(NCores_user, Sy_user * Sz_user);
+  }
+  QPhiX::reportAffinity();
+#endif
+}
+
+void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPrec_t precision_){
+  _initQphix(argc, argv, params, c12, precision_, precision_);  // inner precision := precision_
+}
+
+// Finalize the QPhiX library (the body is empty, so this currently does nothing)
+void _endQphix() {}
+
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_clover_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::CloverBlock *qphix_clover, int cb,
+    bool inverse, bool fl_offdiag = false) {
+  const double startTime = gettime();
+
+  /* the spin-colour clover term in sw_term and the corresponding inverse
+   * in sw_inv are stored in the tmLQCD gamma basis.
+   * When we translate spinors to QPhiX, we apply a transformation V to the tmLQCD
+   * spinor and then apply the same transformation to the output spinor
+   * ( we have V^dagger = V and V*V = 1 )
+   * Thus, in order to translate the clover field, we need to copy
+   *   (1+T)' = V*(1+T)*V, where T is the spin-colour clover-term
+   * This way, the clover term will be in the correct gamma basis.
+   *
+   * The tmLQCD clover term is stored in half-spinor blocks of colour matrices
+   * for which we need to work out what (1+T)'=V*(1+T)*V implies.
+   * Below, each sAB represents one 3x3 colour matrix
+   *
+   *                +s33 -s32    0    0
+   *  T' = V*T*V =  -s23 +s22    0    0
+   *                   0    0 +s11 -s10
+   *                   0    0 -s01 +s00
+   *
+   * Such that the half-spinor blocks are inverted and within these, the ordering is
+   * reversed. Note that the off-diagonal 3x3 colour blocks are hermitian conjugate to
+   * each other and this is preserved by the transformation.
+   *
+   * The QPhiX (Wilson) clover term is stored as 12 reals on the diagonal
+   * in two 6-element vectors, one for each half-spinor spin pair
+   * and two sets of off-diagonal complex components.
+   *
+   * In addition, colour matrices are transposed in QPhiX.
+   *
+   * The tmLQCD clover term is stored as:
+   *
+   *      s00 s01
+   *          s11
+   * T =          s22 s23
+   *                  s33
+   *
+   * with indexing
+   *
+   *     sw[0][0] sw[1][0]
+   *              sw[2][0]
+   *                       sw[0][1] sw[1][1]
+   *                                sw[2][1]
+   *
+   * The inverse has four su3 blocks instead and is indexed
+   *     sw_inv[0][0] sw_inv[1][0]
+   *     sw_inv[3][0] sw_inv[2][0]
+   *                               sw_inv[0][1] sw_inv[1][1]
+   *                               sw_inv[3][1] sw_inv[2][1]
+   *
+   * where blocks sw_inv[3][0] and sw_inv[3][1] are relevant only when mu > 0
+   * 
+   * There is a special case for the non-degenerate twisted clover operator. The flavour-off-diagonal
+   * components of the inverse clover term do not have an imaginary part on the spin-colour diagonal.
+   * They can thus be stored as CloverBlock, which is done in the QPhiX implementation
+   * of the ND tmclover operator.
+   * 
+   * As a hack, this inverse is prepared by sw_invert_epsbar and placed into the last
+   * VOLUME/2 sites of sw_inv. Reading from there is triggered by the boolean
+   * fl_offdiag.
+   */
+
+  // rescale to get clover term (or its inverse) in the physical normalisation
+  // rather than the kappa normalisation
+  const double scale = inverse ? 2.0 * g_kappa : 1.0 / (2.0 * g_kappa);
+  su3 ***tm_clover = inverse ? sw_inv : sw;
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto ngy = geom.nGY();
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // packer for Wilson clover (real diagonal + complex upper-triangular)
+  /* for the index in the off_diagN arrays, we map to an index in the su3 struct
+  * keeping in mind complex conjugation
+  * The off-diagonal in QPhiX is stored as follows:
+  *
+  * 0 1 3 6 10
+  *   2 4 7 11
+  *     5 8 12
+  *       9 13
+  *         14
+  *
+  * which we are going to map to su3 in blocks
+  *
+  *     0* 1*
+  *        2*
+  *
+  * 3   4  5
+  * 6   7  8
+  * 10 11 12
+  *
+  *   9* 13*
+  *      14*
+  *
+  * where the asterisk indicates complex conjugation. As a linear array then,
+  * these mappings are:
+  *
+  */
+  const int od_su3_offsets[15] = {Nz,
+                                  2 * Nz,            //     0 1
+                                  Nc * Nz + 2 * Nz,  //       2
+
+                                  0,
+                                  Nz,
+                                  2 * Nz,  // 3  4  5
+                                  Nc * Nz,
+                                  Nc * Nz + Nz,
+                                  Nc * Nz + 2 * Nz,  // 6  7  8
+
+                                  Nz,  //     9
+
+                                  2 * Nc * Nz,
+                                  2 * Nc * Nz + Nz,
+                                  2 * Nc * Nz + 2 * Nz,  // 10 11 12
+
+                                  2 * Nz,
+                                  Nc * Nz + 2 * Nz};  // 13 14
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
+
+          for (int64_t x_soa = 0; x_soa < SOALEN; x_soa++) {
+            int64_t xx = (y % ngy) * SOALEN + x_soa;
+            int64_t q_cb_x_coord = x_soa + v * SOALEN;
+            int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
+
+            // the inverse of the clover term is in even-odd ordering
+            // while the clover term itself is lexicographically ordered
+            // for the special case of the nd tmclover operator, the inverse of the flavour off-diagonal
+            // components is stored in the last VOLUME/2 elements of sw_inv
+            int64_t tm_idx =
+                (inverse ? g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]] : g_ipt[t][tm_x_coord][y][z]) +
+                ( (inverse && fl_offdiag) ? VOLUME/2 : 0 );
+
+            int b_idx;
+
+            //             we begin with the diagonal elements in CloverBlock
+            for (int d = 0; d < 6; d++) {
+              //               choose the block in sw which corresponds to the block in T'
+              b_idx = d < 3 ? 2 : 0;
+              //               get the right colour components
+              qphix_clover[block].diag1[d][xx] =
+                QPhiX::rep<FT, double>(
+                  *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                    (Nc * Nz + Nz) * (d % 3)) *
+                  scale
+                );
+
+              qphix_clover[block].diag2[d][xx] =
+                QPhiX::rep<FT, double>(
+                  *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                    (Nc * Nz + Nz) * (d % 3)) *
+                  scale
+                );
+            }
+
+            b_idx = 2;  // s33 and s11
+            for (int od : {0, 1, 2}) {
+              for (int reim : {0, 1}) {
+                qphix_clover[block].off_diag1[od][reim][xx] =
+                  QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale
+                  );
+
+                qphix_clover[block].off_diag2[od][reim][xx] =
+                  QPhiX::rep<FT, double>( 
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale 
+                  ); 
+              }
+            }
+
+            b_idx = 1;  // s32 and s10
+            for (int od : {3, 4, 5, 6, 7, 8, 10, 11, 12}) {
+              for (int reim : {0, 1}) {
+                qphix_clover[block].off_diag1[od][reim][xx] = 
+                  QPhiX::rep<FT, double>(
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                      od_su3_offsets[od] + reim) *
+                    (-scale) 
+                  );
+
+                qphix_clover[block].off_diag2[od][reim][xx] =
+                  QPhiX::rep<FT, double>(
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                      od_su3_offsets[od] + reim) *
+                    (-scale)
+                  );
+              }
+            }
+
+            b_idx = 0;  // s22 and s00
+            for (int od : {9, 13, 14}) {
+              for (int reim : {0, 1}) {
+                qphix_clover[block].off_diag1[od][reim][xx] =
+                  QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale
+                  );
+
+                qphix_clover[block].off_diag2[od][reim][xx] =
+                  QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale
+                  );
+              }
+            }
+
+          }  // x_soa
+        }    // for(v)
+      }      // for(y)
+    }        // for(z)
+  }          // for(t)
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf(
+        "# QPHIX-interface: time spent in reorder_clover_to_QPhiX (CloverBlock): %f secs\n",
+        diffTime);
+  }
+}
+
+// Pack the tmLQCD clover term (inverse == false) or its inverse (inverse == true) on
+// checkerboard cb into the QPhiX FullCloverBlock arrays qphix_clover[0] and qphix_clover[1]
+// (one per flavour). geom supplies the QPhiX layout parameters. The source field is the
+// global sw or sw_inv; g_kappa and g_mu are read for the normalisation and the twisted mass.
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_clover_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FullCloverBlock *qphix_clover[2],
+    int cb, bool inverse) {
+  const double startTime = gettime();
+
+  /* The spin-colour clover term in sw and the corresponding inverse in sw_inv
+   * are stored in the tmLQCD gamma basis.
+   * When we translate spinors to QPhiX, we apply a transformation V to the tmLQCD
+   * spinor and then apply the same transformation to the output spinor
+   * ( we have V^dagger = V and V*V = 1 ).
+   * Thus, in order to translate the clover field, we need to copy
+   *   (1+T)' = V*(1+T)*V, where T is the spin-colour clover-term.
+   * This way, the clover term will be in the correct gamma basis.
+   *
+   * The tmLQCD clover term is stored in half-spinor blocks of colour matrices
+   * for which we need to work out what (1+T)'=V*(1+T)*V implies.
+   * Below, each sAB represents one 3x3 colour matrix:
+   *
+   *                +s33 -s32    0    0
+   *  T' = V*T*V =  -s23 +s22    0    0
+   *                   0    0 +s11 -s10
+   *                   0    0 -s01 +s00
+   *
+   * Thus the half-spinor blocks are swapped and, within these, the ordering is
+   * reversed. Note that the off-diagonal 3x3 colour blocks are hermitian conjugate to
+   * each other and this is preserved by the transformation.
+   *
+   * The QPhiX (tmclover) clover term and its inverse are stored as a pair of full
+   * 6x6 complex matrices which are multiplied with the spinor in exactly the same way
+   * as in tmLQCD.
+   *
+   * The tmLQCD clover term is stored as:
+   *      s00 s01
+   *          s11
+   * T =          s22 s23
+   *                  s33
+   * with indexing
+   *     sw[0][0] sw[1][0]
+   *              sw[2][0]
+   *                       sw[0][1] sw[1][1]
+   *                                sw[2][1]
+   *
+   * The inverse has four su3 blocks instead and is indexed
+   *     sw_inv[0][0] sw_inv[1][0]
+   *     sw_inv[3][0] sw_inv[2][0]
+   *                               sw_inv[0][1] sw_inv[1][1]
+   *                               sw_inv[3][1] sw_inv[2][1]
+   * where blocks sw_inv[3][0] and sw_inv[3][1] are relevant only when mu > 0.
+   */
+
+  // rescale to get clover term (or its inverse) in the physical normalisation
+  // rather than the kappa normalisation
+  const double scale = inverse ? 2.0 * g_kappa : 1.0 / (2.0 * g_kappa);
+  su3 ***tm_clover = inverse ? sw_inv : sw;
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;  // not referenced below
+  const int Nc = 3;
+  const int Nz = 2;
+
+  const double amu = g_mu / (2.0 * g_kappa);  // twisted mass, rescaled by 1/(2*kappa)
+
+  // Geometric parameters for QPhiX data layout
+  const auto ngy = geom.nGY();
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
+
+          for (int64_t x_soa = 0; x_soa < SOALEN; x_soa++) {
+            int64_t xx = (y % ngy) * SOALEN + x_soa;
+            int64_t q_cb_x_coord = x_soa + v * SOALEN;
+            int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
+
+            // the inverse of the clover term is in even-odd ordering
+            // while the clover term itself is lexicographically ordered
+            int64_t tm_idx =
+                inverse ? g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]] : g_ipt[t][tm_x_coord][y][z];
+
+            for (int fl : {0, 1}) {  // flavour index
+              if (inverse && fl == 1) {
+                // the inverse clover term for the second flavour is stored at an offset
+                tm_idx += VOLUME / 2;
+              }
+              for (int q_hs : {0, 1}) {  // QPhiX half-spinor block (block1 / block2)
+                auto &hs_block =
+                    ((q_hs == 0) ? qphix_clover[fl][block].block1 : qphix_clover[fl][block].block2);
+                for (int q_sc1 = 0; q_sc1 < 6; q_sc1++) {
+                  for (int q_sc2 = 0; q_sc2 < 6; q_sc2++) {
+                    const int q_s1 = q_sc1 / 3;  // spin index inside the half-spinor (row)
+                    const int q_s2 = q_sc2 / 3;  // spin index inside the half-spinor (column)
+                    const int q_c1 = q_sc1 % 3;  // colour index (row)
+                    const int q_c2 = q_sc2 % 3;  // colour index (column)
+
+                    // swap the half-spinor blocks as required by V*T*V
+                    const int t_hs = 1 - q_hs;
+                    // the indices inside the half-spinor are also inverted
+                    // (which transposes them, of course)
+                    const int t_s1 = 1 - q_s1;
+                    const int t_s2 = 1 - q_s2;
+                    // carry out the mapping from T' to T, keeping in mind that for the inverse
+                    // there are four blocks also on the tmLQCD side, otherwise there are just three
+                    const int t_b_idx = t_s1 + t_s2 + ((inverse && t_s1 == 1 && t_s2 == 0) ? 2 : 0);
+                    for (int reim : {0, 1}) {
+                      hs_block[q_sc1][q_sc2][reim][xx] =
+                        QPhiX::rep<FT,double>(
+                          scale *
+                              // off-diagonal (odd-numbered) blocks change sign
+                              (t_b_idx & 1 ? (-1.0) : 1.0) *
+                              // if not doing the inverse and in the bottom-left block, need to
+                              // complex conjugate
+                              ((!inverse && (t_s1 == 1 && t_s2 == 0) && reim == 1) ? -1.0 : 1.0) *
+                              *(reinterpret_cast<double const *const>(
+                                    &(tm_clover[tm_idx][t_b_idx][t_hs].c00)) +
+                                // if not doing the inverse and in the bottom-left block,
+                                // transpose in colour because we're actually reading out of
+                                // the top-right block (sw stores only three blocks)
+                                Nz * ((!inverse && (t_s1 == 1 && t_s2 == 0)) ? Nc * q_c2 + q_c1
+                                                                             : Nc * q_c1 + q_c2) +
+                                reim) +
+                          // in the QPhiX gamma basis, the twisted quark mass enters with the
+                          // opposite sign for consistency; it is added to the imaginary part
+                          // of the diagonal only when packing the clover term itself
+                          ((!inverse && q_sc1 == q_sc2 && q_hs == 0 && reim == 1)
+                               ? -amu * (1 - 2 * fl)
+                               : 0) +
+                          ((!inverse && q_sc1 == q_sc2 && q_hs == 1 && reim == 1)
+                               ? amu * (1 - 2 * fl)
+                               : 0)
+                        );
+                    }
+                  }  // q_sc2
+                }    // q_sc1
+              }      // q_hs
+            }        // fl
+
+          }  // x_soa
+        }    // for(v)
+      }      // for(y)
+    }        // for(z)
+  }          // for(t)
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf(
+        "# QPHIX-interface: time spent in reorder_clover_to_QPhiX (FullCloverBlock): %f secs\n",
+        diffTime);
+  }
+}
+
+// Pack the global tmLQCD gauge field g_gauge_field into the QPhiX SU3MatrixBlock layout,
+// filling the even (qphix_gauge_cb0) and odd (qphix_gauge_cb1) checkerboards in one sweep.
+// geom supplies the QPhiX layout parameters. With compress12 == true only the first two
+// values of QPhiX's outer colour index are stored.
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_gauge_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::SU3MatrixBlock *qphix_gauge_cb0,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::SU3MatrixBlock *qphix_gauge_cb1) {
+  const double startTime = gettime();
+
+  // Number of elements in color & complex
+  // Here c1 is QPhiX's outer color, and c2 the inner one
+  const int Nc1 = compress12 ? 2 : 3;
+  const int Nc2 = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto ngy = geom.nGY();
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // This is needed to translate between the different
+  // orderings of the direction index "\mu" in tmLQCD
+  // and QPhiX, respectively.
+  // In QPhiX, the Dirac operator is applied in the order
+  //   -+x -> -+y -> -+z -> -+t
+  // while tmLQCD does
+  //   -+t -> -+x -> -+y -> -+z
+  // same as the lattice ordering.
+  // The mapping between the application dimensions is thus:
+  //  tmlqcd_dim(t(0) -> x(1) -> y(2) -> z(3)) = qphix_dim( t(3) -> x(0) -> y(1) -> z(2) )
+  const int change_dim[4] = {1, 2, 3, 0};
+
+  // xchange_gauge presumably refreshes the boundary links of the (global) tmLQCD gauge
+  // field; the backward links below are read from neighbouring sites through g_idn
+  xchange_gauge(g_gauge_field);
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          const int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
+
+          for (int dim = 0; dim < 4; dim++) {     // dimension == QPhiX \mu
+            const int tm_dim = change_dim[dim];   // the same dimension in tmLQCD numbering
+            for (int c1 = 0; c1 < Nc1; c1++) {    // QPhiX convention color 1 (runs up to 2 or 3)
+              for (int c2 = 0; c2 < Nc2; c2++) {  // QPhiX convention color 2 (always runs up to 3)
+                for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
+                  const int64_t xx = (y % ngy) * SOALEN + x_soa;
+                  const int64_t q_cb_x_coord = x_soa + v * SOALEN;
+                  // full-lattice x coordinate and site index of the even (cb0) and odd (cb1) site
+                  const int64_t tm_x_coord_cb0 = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ 0);
+                  const int64_t tm_x_coord_cb1 = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ 1);
+                  const int64_t site_cb0 = g_ipt[t][tm_x_coord_cb0][y][z];
+                  const int64_t site_cb1 = g_ipt[t][tm_x_coord_cb1][y][z];
+
+                  // backward (dir == 0, link taken from the g_idn neighbour) / forward (dir == 1)
+                  for (int dir = 0; dir < 2; dir++) {
+                    const int64_t tm_idx_cb0 = (dir == 0) ? g_idn[site_cb0][tm_dim] : site_cb0;
+                    const int64_t tm_idx_cb1 = (dir == 0) ? g_idn[site_cb1][tm_dim] : site_cb1;
+                    // \mu in QPhiX runs from 0..7 for all eight neighbouring links.
+                    // The ordering of the direction (backward/forward) is the same for
+                    // tmLQCD and QPhiX, but the ordering of the dimensions differs.
+                    const int q_mu = 2 * dim + dir;
+                    for (int reim = 0; reim < Nz; reim++) {
+                      // the two colour indices are swapped between the two layouts
+                      qphix_gauge_cb0[block][q_mu][c1][c2][reim][xx] = QPhiX::rep<FT, double>(
+                          su3_get_elem(&(g_gauge_field[tm_idx_cb0][tm_dim]), c2, c1, reim));
+                      qphix_gauge_cb1[block][q_mu][c1][c2][reim][xx] = QPhiX::rep<FT, double>(
+                          su3_get_elem(&(g_gauge_field[tm_idx_cb1][tm_dim]), c2, c1, reim));
+                    }  // reim
+                  }    // dir
+                }      // x_soa
+              }        // c2
+            }          // c1
+          }            // dim
+        }              // for(v)
+      }                // for(y)
+    }                  // for(z)
+  }                    // for(t)
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_gauge_to_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+// Reorder a tmLQCD even-odd spinor to a FourSpinorBlock QPhiX spinor on the given checkerboard.
+// tm_eo_spinor is addressed through g_lexic2eosub, cb (0 or 1) selects the checkerboard and
+// the gamma basis is changed on the fly (spin index reversed, signs taken from change_sign).
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_eo_spinor_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom, spinor const *const tm_eo_spinor,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FourSpinorBlock *qphix_spinor,
+    const int cb) {
+  const double startTime = gettime();
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // This is needed to translate between the different
+  // gamma bases tmLQCD and QPhiX are using
+  // (note, this is a 4x4 matrix with 4 non-zero elements)
+  const int change_sign[4] = {1, -1, -1, 1};
+  const int change_spin[4] = {3, 2, 1, 0};
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          // index of the FourSpinorBlock; it does not depend on colour, spin or x_soa
+          const int64_t q_ind = t * Pxyz + z * Pxy + y * nVecs + v;
+
+          for (int col = 0; col < Nc; col++) {
+            for (int q_spin = 0; q_spin < Ns; q_spin++) {
+              for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
+                const int64_t q_cb_x_coord = v * SOALEN + x_soa;
+                // when t+y+z is odd and we're on an odd (1) checkerboard OR
+                // when t+y+z is even and we're on an even (0) checkerboard
+                // the full x coordinate is 2*x_cb
+                // otherwise, it is 2*x_cb+1
+                const int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
+                // map the lexicographic site index to the index inside the even-odd spinor
+                const int64_t tm_eo_ind = g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]];
+
+                for (int reim = 0; reim < Nz; reim++) {
+                  qphix_spinor[q_ind][col][q_spin][reim][x_soa] = QPhiX::rep<FT, double>(
+                      change_sign[q_spin] * spinor_get_elem(&(tm_eo_spinor[tm_eo_ind]),
+                                                            change_spin[q_spin], col, reim));
+                }  // reim
+              }    // x_soa
+            }      // q_spin
+          }        // col
+        }          // for(v)
+      }            // for(y)
+    }              // for(z)
+  }                // for(t)
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_eo_spinor_to_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+// Reorder a FourSpinorBlock QPhiX spinor on checkerboard cb back to a tmLQCD even-odd spinor.
+// The gamma basis change applied when packing is undone (spin index reversed, signs taken
+// from change_sign) and every component is multiplied by normFac in double precision.
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_eo_spinor_from_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom, spinor* tm_eo_spinor,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FourSpinorBlock *qphix_spinor,
+    const int cb, double normFac = 1.0) {
+  const double startTime = gettime();
+
+  // Number of elements in spin & color
+  const int Ns = 4;
+  const int Nc = 3;
+
+  // Geometric parameters for QPhiX data layout
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // This is needed to translate between the different
+  // gamma bases tmLQCD and QPhiX are using
+  // (note, this is a 4x4 matrix with 4 non-zero elements)
+  const int change_sign[4] = {1, -1, -1, 1};
+  const int change_spin[4] = {3, 2, 1, 0};
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          // index of the FourSpinorBlock; it does not depend on colour, spin or x_soa
+          const int64_t q_ind = t * Pxyz + z * Pxy + y * nVecs + v;
+
+          for (int col = 0; col < Nc; col++) {
+            for (int q_spin = 0; q_spin < Ns; q_spin++) {
+              // sign from the basis change combined with the requested normalisation
+              const double factor = change_sign[q_spin] * normFac;
+
+              for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
+                const int64_t q_cb_x_coord = v * SOALEN + x_soa;
+                // the full x coordinate is 2*x_cb when the parity of t+y+z equals cb
+                // (odd/odd or even/even), otherwise it is 2*x_cb+1
+                const int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
+                // map the lexicographic site index to the index inside the even-odd spinor
+                const int64_t tm_eo_ind = g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]];
+
+                spinor_set_elem(
+                    &(tm_eo_spinor[tm_eo_ind]), change_spin[q_spin], col,
+                    factor * QPhiX::rep<double, FT>(qphix_spinor[q_ind][col][q_spin][0][x_soa]),
+                    factor * QPhiX::rep<double, FT>(qphix_spinor[q_ind][col][q_spin][1][x_soa]));
+              }  // x_soa
+            }    // q_spin
+          }      // col
+        }        // for(v)
+      }          // for(y)
+    }            // for(z)
+  }              // for(t)
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_eo_spinor_from_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+// Reorder a full tmLQCD spinor to a cb0 (even sites) and a cb1 (odd sites) QPhiX spinor,
+// changing from the tmLQCD to the QPhiX gamma basis on the fly.
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_spinor_to_QPhiX(QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+                             double const *tm_spinor, FT *qphix_spinor_cb0, FT *qphix_spinor_cb1) {
+  const double startTime = gettime();
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // This is needed to translate between the different
+  // gamma bases tmLQCD and QPhiX are using
+  const int change_sign[4] = {1, -1, -1, 1};
+  const int change_spin[4] = {3, 2, 1, 0};
+
+// This will loop over the entire lattice and calculate
+// the array and internal indices for both tmLQCD & QPhiX
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t x = 0; x < LX; x++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t z = 0; z < LZ; z++) {
+          // These are the QPhiX SIMD vector in checkerboarded x direction
+          // (up to LX/2) and the internal position inside the SIMD vector
+          const int64_t SIMD_vector = (x / 2) / SOALEN;
+          const int64_t x_internal = (x / 2) % SOALEN;
+
+          // Calculate the array index in tmLQCD & QPhiX,
+          // given a global lattice index (t,x,y,z)
+          const int64_t qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
+          const int64_t tm_idx = g_ipt[t][x][y][z];
+
+          // Calculate base point for every spinor field element (tmLQCD) or
+          // for every SIMD vector of spinors, a.k.a FourSpinorBlock (QPhiX),
+          // which depends on the checkerboard: odd sites -> cb1, even sites -> cb0
+          const double *in = tm_spinor + Ns * Nc * Nz * tm_idx;
+          FT *out = (((t + x + y + z) & 1) ? qphix_spinor_cb1 : qphix_spinor_cb0) +
+                    SOALEN * Nz * Nc * Ns * qphix_idx;
+          // Copy the internal elements, performing a gamma basis transformation
+          for (int spin = 0; spin < Ns; spin++) {  // QPhiX spin index
+            for (int color = 0; color < Nc; color++) {
+              for (int reim = 0; reim < Nz; reim++) {  // RE or IM
+                const int64_t qId =
+                    x_internal + reim * SOALEN + spin * SOALEN * Nz + color * SOALEN * Nz * Ns;
+                const int64_t tId = reim + color * Nz + change_spin[spin] * Nz * Nc;
+
+                out[qId] = QPhiX::rep<FT, double>(change_sign[spin] * in[tId]);
+              }  // reim
+            }    // color
+          }      // spin
+        }        // for(z)
+      }          // for(y)
+    }            // for(x)
+  }              // for(t), volume
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_spinor_to_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+// Reorder a cb0 (even sites) and a cb1 (odd sites) QPhiX spinor to a full tmLQCD spinor,
+// changing back to the tmLQCD gamma basis and multiplying every component by normFac.
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_spinor_from_QPhiX(QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+                               double *tm_spinor, FT const *qphix_spinor_cb0,
+                               FT const *qphix_spinor_cb1, double normFac = 1.0) {
+  const double startTime = gettime();
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // This is needed to translate between the different
+  // gamma bases tmLQCD and QPhiX are using
+  const int change_sign[4] = {1, -1, -1, 1};
+  const int change_spin[4] = {3, 2, 1, 0};
+
+// This will loop over the entire lattice and calculate
+// the array and internal indices for both tmLQCD & QPhiX
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t x = 0; x < LX; x++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t z = 0; z < LZ; z++) {
+          // These are the QPhiX SIMD vector in checkerboarded x direction
+          // (up to LX/2) and the internal position inside the SIMD vector
+          const int64_t SIMD_vector = (x / 2) / SOALEN;
+          const int64_t x_internal = (x / 2) % SOALEN;
+
+          // Calculate the array index in tmLQCD & QPhiX,
+          // given a global lattice index (t,x,y,z)
+          const int64_t qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
+          const int64_t tm_idx = g_ipt[t][x][y][z];
+
+          // Base pointers of the FourSpinorBlock (QPhiX) and of the spinor field element
+          // (tmLQCD); the QPhiX one depends on the checkerboard: odd -> cb1, even -> cb0
+          const FT *in = (((t + x + y + z) & 1) ? qphix_spinor_cb1 : qphix_spinor_cb0) +
+                         SOALEN * Nz * Nc * Ns * qphix_idx;
+          double *out = tm_spinor + Ns * Nc * Nz * tm_idx;
+          // Copy the internal elements, performing a gamma basis transformation
+          for (int spin = 0; spin < Ns; spin++) {  // tmLQCD spin index
+            for (int color = 0; color < Nc; color++) {
+              for (int reim = 0; reim < Nz; reim++) {  // RE or IM
+                const int64_t qId = x_internal + reim * SOALEN + change_spin[spin] * SOALEN * Nz +
+                                    color * SOALEN * Nz * Ns;
+                const int64_t tId = reim + color * Nz + spin * Nz * Nc;
+                // convert the QPhiX value to double first and scale afterwards, so that the
+                // product is never converted back to FT (as in reorder_eo_spinor_from_QPhiX)
+                out[tId] = normFac * change_sign[spin] * QPhiX::rep<double, FT>(in[qId]);
+              }  // reim
+            }    // color
+          }      // spin
+        }        // for(z)
+      }          // for(y)
+    }            // for(x)
+  }              // for(t), volume
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_spinor_from_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+// Pack all clover fields needed by the non-degenerate (ND) twisted clover operator: the clover
+// term on checkerboard cb and, on checkerboard 1-cb, the flavour off-diagonal and the full
+// inverse. With pack_inner == true the same fields are also packed for the inner geometry.
+template <typename FT, int V, int S, bool compress12,
+          typename FT_inner, int V_inner, int S_inner, bool compress12_inner>
+void pack_nd_clover(QPhiX::Geometry<FT, V, S, compress12> &geom,
+                   QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner> &geom_inner,
+                   typename QPhiX::Geometry<FT, V, S, compress12>::FullCloverBlock *full_invclov[2],
+                   typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock *invclov_odiag,
+                   typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock *clov,
+                   typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::FullCloverBlock *full_invclov_inner[2],
+                   typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock *invclov_odiag_inner,
+                   typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock *clov_inner,
+                   const int cb,
+                   bool pack_inner){
+  const double start = gettime();
+
+  reorder_clover_to_QPhiX(geom, clov, cb, false);
+  if(pack_inner){
+    reorder_clover_to_QPhiX(geom_inner, clov_inner, cb, false);
+  }
+
+  // recompute the inverse for epsbar and pack its flavour off-diagonal part
+  sw_invert_epsbar(g_epsbar);
+  reorder_clover_to_QPhiX(geom, invclov_odiag, 1-cb, true, true);
+  if(pack_inner){
+    reorder_clover_to_QPhiX(geom_inner, invclov_odiag_inner, 1-cb, true, true);
+  }
+
+  // no minus sign here, the difference in the sign of gamma5
+  // is taken care of internally
+  sw_invert_mubar(g_mubar);
+  reorder_clover_to_QPhiX(geom, full_invclov, 1-cb, true);
+  if(pack_inner){
+    reorder_clover_to_QPhiX(geom_inner, full_invclov_inner, 1-cb, true);
+  }
+
+  // NOTE(review): presumably leaves sw_inv in the state the rest of tmLQCD expects - confirm
+  sw_invert_nd(g_mubar*g_mubar-g_epsbar*g_epsbar);
+
+  if(g_debug_level > 1){
+    QPhiX::masterPrintf("# QPHIX-interface: ND TMClover clover-field packing took %.4lf seconds\n", gettime()-start);
+  }
+}
+
+// Due to github issue #404, the helper functions to apply the full QPhiX operator
+// are currently disabled because they conflict with the new interfaces in QPhiX
+// itself. If required, these should be rewritten to use these interfaces
+// rather than the base classes in qphix_base_classes.hpp
+
+// Apply the full QPhiX fermion matrix to checkerboarded tm spinors
+//template <typename FT, int V, int S, bool compress>
+//void Mfull_helper(spinor *Even_out, spinor *Odd_out, const spinor *Even_in, const spinor *Odd_in,
+//                  const op_type_t op_type) {
+//  // TODO: this should use handles for gauge and spinors because these are definitely temporary
+//  // objects
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::SU3MatrixBlock QGauge;
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::FourSpinorBlock QSpinor;
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::CloverBlock QClover;
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::FullCloverBlock QFullClover;
+//
+//  if (g_debug_level > 1) tmlqcd::printQphixDiagnostics(V, S, compress, V, S, compress);
+//
+//  double coeff_s = (FT)(1);
+//  double coeff_t = (FT)(1);
+//
+//  QPhiX::Geometry<FT, V, S, compress> geom(subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ,
+//                                           MinCt);
+//
+//  // Wilson mass
+//  double mass = 1 / (2.0 * g_kappa) - 4;
+//
+//  tmlqcd::Dslash<FT, V, S, compress> *polymorphic_dslash;
+//
+//  QGauge *u_packed[2];
+//  QSpinor *qphix_in[2];
+//  QSpinor *qphix_out[2];
+//
+//  QClover *clover[2];
+//  QClover *inv_clover[2];
+//
+//  QFullClover *inv_fullclover[2][2];
+//
+//  QSpinor *tmp_spinor = (QSpinor *)geom.allocCBFourSpinor();
+//  for (int cb : {0, 1}) {
+//    u_packed[cb] = (QGauge *)geom.allocCBGauge();
+//    qphix_in[cb] = (QSpinor *)geom.allocCBFourSpinor();
+//    qphix_out[cb] = (QSpinor *)geom.allocCBFourSpinor();
+//    clover[cb] = nullptr;
+//    inv_clover[cb] = nullptr;
+//    for (int fl : {0, 1}) {
+//      inv_fullclover[cb][fl] = nullptr;
+//    }
+//  }
+//  reorder_gauge_to_QPhiX(geom, u_packed[cb_even], u_packed[cb_odd]);
+//
+//  if (op_type == WILSON) {
+//    polymorphic_dslash = new tmlqcd::WilsonDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, use_tbc, tbc_phases);
+//  } else if (op_type == TMWILSON) {
+//    polymorphic_dslash = new tmlqcd::WilsonTMDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, -g_mu / (2.0 * g_kappa), use_tbc, tbc_phases);
+//  } else if (op_type == CLOVER && fabs(g_mu) <= DBL_EPSILON) {
+//    for (int cb : {0, 1}) {
+//      clover[cb] = (QClover *)geom.allocCBClov();
+//      inv_clover[cb] = (QClover *)geom.allocCBClov();
+//
+//      reorder_clover_to_QPhiX(geom, clover[cb], cb, false);
+//      sw_invert(cb, 0);
+//      reorder_clover_to_QPhiX(geom, inv_clover[cb], cb, true);
+//    }
+//
+//    polymorphic_dslash = new tmlqcd::WilsonClovDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, clover, inv_clover, use_tbc, tbc_phases);
+//
+//  } else if (op_type == CLOVER && fabs(g_mu) > DBL_EPSILON) {
+//    for (int cb : {0, 1}) {
+//      clover[cb] = (QClover *)geom.allocCBClov();
+//      for (int fl : {0, 1}) {
+//        inv_fullclover[cb][fl] = (QFullClover *)geom.allocCBFullClov();
+//      }
+//      reorder_clover_to_QPhiX(geom, clover[cb], cb, false);
+//      sw_invert(cb, g_mu);
+//      reorder_clover_to_QPhiX(geom, inv_fullclover[cb], cb, true);
+//    }
+//
+//    polymorphic_dslash = new tmlqcd::WilsonClovTMDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, -g_mu / (2.0 * g_kappa), clover,
+//        inv_fullclover, use_tbc, tbc_phases);
+//
+//  } else {
+//    QPhiX::masterPrintf("tmlqcd::Mfull_helper; No such operator type: %d\n", op_type);
+//    abort();
+//  }
+//
+////   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Even_in),
+////                              qphix_in[cb_even], cb_even);
+////   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Odd_in), qphix_in[cb_odd],
+////                              cb_odd);
+//  reorder_eo_spinor_to_QPhiX(geom, Even_in,
+//                             qphix_in[cb_even], cb_even);
+//  reorder_eo_spinor_to_QPhiX(geom, Odd_in, qphix_in[cb_odd],
+//                             cb_odd);
+//  // Apply QPhiX Mfull
+//  polymorphic_dslash->plain_dslash(qphix_out[cb_odd], qphix_in[cb_even], u_packed[cb_odd],
+//                                   /* isign == non-conjugate */ 1, cb_odd);
+//  polymorphic_dslash->plain_dslash(qphix_out[cb_even], qphix_in[cb_odd], u_packed[cb_even],
+//                                   /* isign == non-conjugate */ 1, cb_even);
+//  for (int cb : {0, 1}) {
+//    polymorphic_dslash->A_chi(tmp_spinor, qphix_in[cb], 1, cb);
+//    QPhiX::aypx(-0.5, tmp_spinor, qphix_out[cb], geom, 1);
+//  }
+//
+//  reorder_eo_spinor_from_QPhiX(geom, Even_out, qphix_out[cb_even],
+//                               cb_even, 2.0 * g_kappa);
+//  reorder_eo_spinor_from_QPhiX(geom, Odd_out, qphix_out[cb_odd], cb_odd,
+//                               2.0 * g_kappa);
+//
+//  geom.free(tmp_spinor);
+//  for (int cb : {0, 1}) {
+//    geom.free(u_packed[cb]);
+//    geom.free(qphix_in[cb]);
+//    geom.free(qphix_out[cb]);
+//    geom.free(clover[cb]);
+//    geom.free(inv_clover[cb]);
+//    for (int fl : {0, 1}) {
+//      geom.free(inv_fullclover[cb][fl]);
+//    }
+//  };
+//  delete (polymorphic_dslash);
+//}
+
+// Templated even-odd preconditioned solver using QPhiX Library
+template <typename FT, int V, int S, bool compress, 
+          typename FT_inner = FT, int V_inner = V, int S_inner = S, bool compress_inner = compress>
+int invert_eo_qphix_helper(std::vector< std::vector < spinor* > > &tmlqcd_odd_out, 
+                           std::vector< std::vector < spinor* > > &tmlqcd_odd_in,
+                           const double target_precision, const int max_iter, const int solver_flag,
+                           solver_params_t solver_params, const int num_flavour) {
+  // TODO: it would perhaps be beneficial to keep the fields resident
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::SU3MatrixBlock QGauge;
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::FourSpinorBlock QSpinor;
+  typedef typename QPhiX::FourSpinorHandle<FT, V, S, compress> QSpinorHandle;
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::CloverBlock QClover;
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::FullCloverBlock QFullClover;
+
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::SU3MatrixBlock QGauge_inner;
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::FourSpinorBlock QSpinor_inner;
+  typedef typename QPhiX::FourSpinorHandle<FT_inner, V_inner, S_inner, compress_inner> QSpinorHandle_inner;
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::CloverBlock QClover_inner;
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::FullCloverBlock QFullClover_inner;
+
+  /************************
+   *                      *
+   *    SETUP GEOMETRY    *
+   *                      *
+  ************************/
+
+  if (g_debug_level > 1) {
+    tmlqcd::printQphixDiagnostics(V, S, compress, V_inner, S_inner, compress_inner);
+  }
+
+  QPhiX::Geometry<FT, V, S, compress> geom(subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ,
+                                           MinCt);
+
+  // we always create the inner geometry, the overhead should be small...
+  QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner> geom_inner(
+      subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ, MinCt);
+
+  // Set number of BLAS threads by hand.
+  // In case someone implements the tune routines in QPhiX,
+  // this may be updated...
+  QPhiX::masterPrintf("# Setting number of BLAS threads...\n");
+  const int n_blas_simt = N_simt;
+  QPhiX::masterPrintf("# ...done.\n");
+
+  // Anisotropy coefficients
+  const double coeff_s = 1.0;
+  const double coeff_t = 1.0;
+
+  // The Wilson mass
+  const double mass = 1.0 / (2.0 * g_kappa) - 4.0;
+
+  // Set variables needed for the solve
+  bool verbose = g_debug_level > 2 ? true : false;
+  int niters = -1;
+  int niters2 = 0;
+  double rsd_final = -1.0;
+  uint64_t site_flops = 0;
+  uint64_t site_flops2 = 0;
+  uint64_t mv_apps = 0;
+  uint64_t mv_apps2 = 0;
+  
+  double start_time;
+  double end_time;
+  
+  // support for multi-shift solves via the length of the output vector,
+  // which counts the shifts on the outer index and the flavour on the inner index
+  const int num_shifts = tmlqcd_odd_out.size();
+  std::vector < double > shifts; shifts.resize( num_shifts );
+  std::vector <double> RsdTargetArr; RsdTargetArr.resize(num_shifts);
+  std::vector <double> RsdFinalArr; RsdFinalArr.resize(num_shifts);
+
+  double rescale = 0.5 / g_kappa;
+  // the inverse of M M^dag, as required for the HMC, comes with a factor of alpha^2
+  if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+    rescale *= rescale;
+  }
+
+  std::vector<QSpinorHandle> q_spinor_handles;
+
+  QGauge *u_packed[2] = {nullptr, nullptr};
+  QGauge_inner *u_packed_inner[2] = {nullptr, nullptr};
+  for (int cb : {0, 1}) {
+    u_packed[cb] = (QGauge *)geom.allocCBGauge();
+  }
+  // Reorder (global) input gauge field from tmLQCD to QPhiX
+  reorder_gauge_to_QPhiX(geom, u_packed[cb_even], u_packed[cb_odd]);
+  
+  // for mixed solvers, we also need the gauge field in the inner precision
+  if( solver_is_mixed(solver_flag) ){
+    for(int cb : {0, 1}) {
+      u_packed_inner[cb] = (QGauge_inner *)geom_inner.allocCBGauge();
+    }
+    reorder_gauge_to_QPhiX(geom_inner, u_packed_inner[cb_even], u_packed_inner[cb_odd]);
+  }
+
+  if (num_flavour == 1) {
+    constexpr int nf = 1;
+    std::vector < QSpinor* > qphix_in; qphix_in.resize( 1 );
+    std::vector < QSpinor* > qphix_out; qphix_out.resize( num_shifts );
+    QSpinor *qphix_buffer;
+
+    QClover *qphix_clover = nullptr;
+    QClover *qphix_inv_clover = nullptr;
+    
+    QClover_inner *qphix_clover_inner = nullptr;
+    QClover_inner *qphix_inv_clover_inner = nullptr;
+
+    QFullClover *qphix_inv_fullclover[2] = {nullptr, nullptr};
+    
+    QFullClover_inner *qphix_inv_fullclover_inner[2] = {nullptr, nullptr};
+
+    q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+    qphix_in[0] = q_spinor_handles.back().get();
+
+    for( int shift = 0; shift < num_shifts; shift++ ) {
+      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+      qphix_out[shift] = q_spinor_handles.back().get();
+    }
+
+    q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+    qphix_buffer = q_spinor_handles.back().get();
+
+    QPhiX::EvenOddLinearOperator<FT, V, S, compress> *FermionMatrixQPhiX = nullptr;
+    QPhiX::EvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner> *InnerFermionMatrixQPhiX = nullptr;
+    if ( ( fabs(g_mu) > DBL_EPSILON ) && g_c_sw > DBL_EPSILON) {  // TWISTED-MASS-CLOVER
+      qphix_clover = (QClover *)geom.allocCBClov();
+      for (int fl : {0, 1}) {
+        qphix_inv_fullclover[fl] = (QFullClover *)geom.allocCBFullClov(); 
+      }
+      reorder_clover_to_QPhiX(geom, qphix_clover, cb_odd, false);
+      reorder_clover_to_QPhiX(geom, qphix_inv_fullclover, cb_even, true);
+      
+      QPhiX::masterPrintf("# Creating QPhiX Twisted Clover Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddTMCloverOperator<FT, V, S, compress>(
+          u_packed, qphix_clover, qphix_inv_fullclover, &geom, t_boundary, coeff_s, coeff_t,
+          use_tbc, tbc_phases, -0.5*(g_mu3+g_mu)/g_kappa);
+      if( solver_is_mixed(solver_flag) ){
+        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        for( int fl : {0, 1} ){
+          qphix_inv_fullclover_inner[fl] = (QFullClover_inner *)geom_inner.allocCBFullClov();
+        }
+        reorder_clover_to_QPhiX(geom_inner, qphix_clover_inner, cb_odd, false);        
+        reorder_clover_to_QPhiX(geom_inner, qphix_inv_fullclover_inner, cb_even, true);
+        InnerFermionMatrixQPhiX = new QPhiX::EvenOddTMCloverOperator<FT_inner, V_inner, S_inner, compress_inner>(
+          u_packed_inner, qphix_clover_inner, qphix_inv_fullclover_inner, &geom_inner, t_boundary, coeff_s, coeff_t,
+          use_tbc, tbc_phases, -0.5*(g_mu3+g_mu)/g_kappa);
+      }
+      QPhiX::masterPrintf("# ...done.\n");
+    } else if ( fabs(g_mu) > DBL_EPSILON ) {  // TWISTED-MASS
+      const double TwistedMass = -g_mu / (2.0 * g_kappa);
+      QPhiX::masterPrintf("# Creating QPhiX Twisted Mass Wilson Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddTMWilsonOperator<FT, V, S, compress>(
+          mass, TwistedMass, u_packed, &geom, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      QPhiX::masterPrintf("# ...done.\n");
+      if( solver_is_mixed(solver_flag) ){
+        InnerFermionMatrixQPhiX = new QPhiX::EvenOddTMWilsonOperator<FT_inner, V_inner, S_inner, compress_inner>(
+            mass, TwistedMass, u_packed_inner, &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      }
+    } else if (g_c_sw > DBL_EPSILON) {  // WILSON CLOVER
+      qphix_clover = (QClover *)geom.allocCBClov();
+      qphix_inv_clover = (QClover *)geom.allocCBClov();
+
+      reorder_clover_to_QPhiX(geom, qphix_clover, cb_odd, false);
+      reorder_clover_to_QPhiX(geom, qphix_inv_clover, cb_even, true);
+
+      QPhiX::masterPrintf("# Creating QPhiX Wilson Clover Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddCloverOperator<FT, V, S, compress>(
+          u_packed, qphix_clover, qphix_inv_clover, &geom, t_boundary, coeff_s, coeff_t, use_tbc,
+          tbc_phases, -0.5*g_mu3/g_kappa);
+      if( solver_is_mixed(solver_flag) ){
+        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        qphix_inv_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        reorder_clover_to_QPhiX(geom_inner, qphix_clover_inner, cb_odd, false);        
+        reorder_clover_to_QPhiX(geom_inner, qphix_inv_clover_inner, cb_even, true);
+        InnerFermionMatrixQPhiX = new QPhiX::EvenOddCloverOperator<FT_inner, V_inner, S_inner, compress_inner>(
+          u_packed_inner, qphix_clover_inner, qphix_inv_clover_inner, &geom_inner, t_boundary, coeff_s, coeff_t,
+          use_tbc, tbc_phases, -0.5*g_mu3/g_kappa);
+      }
+      QPhiX::masterPrintf("# ...done.\n");
+
+    } else {  // WILSON
+      QPhiX::masterPrintf("# Creating QPhiX Wilson Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddWilsonOperator<FT, V, S, compress>(
+          mass, u_packed, &geom, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      if( solver_is_mixed(solver_flag) ){
+        InnerFermionMatrixQPhiX = new QPhiX::EvenOddWilsonOperator<FT_inner, V_inner, S_inner, compress_inner>(
+            mass, u_packed_inner, &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      }
+      QPhiX::masterPrintf("# ...done.\n");
+    }
+
+    // Create a Linear Solver Object
+    QPhiX::AbstractSolver<FT, V, S, compress> *SolverQPhiX = nullptr;
+    QPhiX::AbstractSolver<FT_inner, V_inner, S_inner, compress_inner> *InnerSolverQPhiX = nullptr;
+    QPhiX::AbstractMultiSolver<FT, V, S, compress, nf> *MultiSolverQPhiX = nullptr;
+    if( solver_flag == DUMMYHERMTEST ) {
+      QPhiX::masterPrintf("# QPHIX: Creating dummy solver for hermiticity test...\n");
+      SolverQPhiX =
+        new QPhiX::InvDummyHermTest<FT, V, S, compress,
+                                    typename QPhiX::EvenOddLinearOperator<FT, V, S, compress> >(
+              *FermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == CG) {
+      QPhiX::masterPrintf("# QPHIX: Creating CG solver...\n");
+      SolverQPhiX = new QPhiX::InvCG<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == BICGSTAB) {
+      QPhiX::masterPrintf("# QPHIX: Creating BiCGStab solver...\n");
+      SolverQPhiX = new QPhiX::InvBiCGStab<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter);
+    } else if ( solver_flag == MIXEDCG ) {
+      // TODO: probably need to adjust inner solver iterations here...
+      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision CG solver...\n");
+      InnerSolverQPhiX = new QPhiX::InvCG<FT_inner, V_inner, S_inner, compress_inner>(*InnerFermionMatrixQPhiX, max_iter);
+      const bool MMdag = true;
+      SolverQPhiX = new QPhiX::InvRichardsonMultiPrec<FT, V, S, compress, FT_inner, V_inner, S_inner, compress_inner, MMdag>(
+          *FermionMatrixQPhiX, *InnerSolverQPhiX, solver_params.mcg_delta, max_iter);
+    } else if (solver_flag == MIXEDBICGSTAB ) {
+      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision BICGCGSTAB solver...\n");
+      InnerSolverQPhiX = new QPhiX::InvBiCGStab<FT_inner, V_inner, S_inner, compress_inner>(*InnerFermionMatrixQPhiX, max_iter);
+      const bool MMdag = false;
+      SolverQPhiX = new QPhiX::InvRichardsonMultiPrec<FT, V, S, compress, FT_inner, V_inner, S_inner, compress_inner, MMdag>(
+          *FermionMatrixQPhiX, *InnerSolverQPhiX, solver_params.mcg_delta, max_iter);
+    } else if (solver_flag == CGMMS ) {
+      QPhiX::masterPrintf("# QPHIX: Creating multi-shift CG solver ...\n");
+      MultiSolverQPhiX = new QPhiX::MInvCG<FT, V, S, compress>( *FermionMatrixQPhiX, max_iter, num_shifts );
+    } else {
+      QPhiX::masterPrintf(" Solver not yet supported by QPhiX!\n");
+      QPhiX::masterPrintf(" Aborting...\n");
+      abort();
+    }
+    QPhiX::masterPrintf("# ...done.\n");
+
+//     reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(tmlqcd_odd_in[0][0]),
+//                                qphix_in[0], cb_odd);
+    reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_in[0][0],
+                               qphix_in[0], cb_odd);
+    QPhiX::masterPrintf("# Calling the solver...\n");
+
+    // Set the right precision for the QPhiX solver.
+    // We get target_precision externally, and it is given such that it is either
+    // already relative or absolute.
+    // Most QPhiX solvers allow setting absolute or relative residual
+    // by passing an appropriate flag, but this is not true for the multi-shift solver.
+    // As a result, we follow that solver and call ALL solvers with
+    // QPhiX::RELATIVE, which gives results consistent with tmLQCD in all cases. 
+    double rhs_norm2 = 1.0;
+    QPhiX::norm2Spinor(rhs_norm2, qphix_in[0], geom, n_blas_simt);
+    const double RsdTarget = sqrt(target_precision / rhs_norm2);
+
+    // Calling the solver
+    start_time = gettime();
+    if ( solver_flag == DUMMYHERMTEST ){
+      random_spinor_field_eo(tmlqcd_odd_out[0][0], 0, RN_GAUSS);
+      reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_out[0][0], qphix_buffer, cb_odd);      
+      for(int isign : {-1, 1} ){
+        (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps, isign,
+                       verbose, cb_odd, QPhiX::RELATIVE);
+      }
+      QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+    } else if (solver_flag == CG || solver_flag == MIXEDCG || solver_flag == RGMIXEDCG) {
+      // USING CG:
+      // We are solving
+      //   M M^dagger qphix_buffer = qphix_in_prepared
+      // here, that is, isign = -1 for the QPhiX CG solver.
+      (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps, -1,
+                     verbose, cb_odd, QPhiX::RELATIVE);
+      // After that, if required by the solution type, multiply with M^dagger:
+      //   qphix_out[0] = M^dagger ( M^dagger^-1 M^-1 ) qphix_in_prepared
+      if (solver_params.solution_type == TM_SOLUTION_M) {
+        (*FermionMatrixQPhiX)(qphix_out[0], qphix_buffer, /* conjugate */ -1);
+        mv_apps++;
+      } else {
+        QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    } else if (solver_flag == CGMMS ){
+      // TODO: handle the residuals properly
+      if(g_debug_level > 2 ) QPhiX::masterPrintf("# QPHIX CGMMS: shifts: \n");
+      for( int shift = 0; shift < num_shifts; shift++ ){
+        RsdTargetArr[shift] = RsdTarget;
+        RsdFinalArr[shift] = -1.0;
+        shifts[shift] = solver_params.shifts[shift]*solver_params.shifts[shift]/(4*g_kappa*g_kappa);
+        if(g_debug_level > 2 ) QPhiX::masterPrintf("# QPHIX CGMMS: shift[%d] = %.6e\n", shift, shifts[shift]);
+      }
+      if(g_debug_level > 2 ) QPhiX::masterPrintf("\n");
+      (*MultiSolverQPhiX)(qphix_out.data(), qphix_in[0], num_shifts, shifts.data(), 
+                          RsdTargetArr.data(), niters, RsdFinalArr.data(), site_flops, mv_apps, -1, verbose );
+      rsd_final = RsdFinalArr[0];
+    } else if (solver_flag == BICGSTAB || solver_flag == MIXEDBICGSTAB) {
+      (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps, 1,
+                     verbose, cb_odd, QPhiX::RELATIVE);
+      // for M^dagger^-1 M^-1 solution type, need to call BiCGstab twice
+      if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+        (*SolverQPhiX)(qphix_out[0], qphix_buffer, RsdTarget, niters2, rsd_final, site_flops, mv_apps2,
+                       -1, verbose, cb_odd, QPhiX::RELATIVE);
+      } else {
+        QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    }
+    end_time = gettime();
+    
+    for(int shift = 0; shift < num_shifts; shift++ ){
+      reorder_eo_spinor_from_QPhiX(geom, tmlqcd_odd_out[shift][0],
+                                   qphix_out[shift], cb_odd, rescale);
+    }
+
+    QPhiX::masterPrintf("# QPHIX: ...done.\n");
+    QPhiX::masterPrintf("# QPHIX: Cleaning up\n");
+    delete (FermionMatrixQPhiX);
+    delete (InnerFermionMatrixQPhiX);
+    delete (SolverQPhiX);
+    delete (InnerSolverQPhiX);
+    delete (MultiSolverQPhiX);
+    // on KNL, it seems that munmap is problematic, so we check for nullptr
+    if(qphix_clover) geom.free(qphix_clover);
+    if(qphix_inv_clover) geom.free(qphix_inv_clover);
+    if(qphix_clover_inner) geom_inner.free(qphix_clover_inner);
+    if(qphix_inv_clover_inner) geom_inner.free(qphix_inv_clover_inner);
+    for (int fl : {0, 1}) {
+      if(qphix_inv_fullclover[fl]) geom.free(qphix_inv_fullclover[fl]);
+      if(qphix_inv_fullclover_inner[fl]) geom_inner.free(qphix_inv_fullclover_inner[fl]);
+    }
+    QPhiX::masterPrintf("# QPHIX: ...done.\n\n");
+
+  } else if (num_flavour == 2) {
+    // for explicit template arguments
+    constexpr int nf = 2;
+
+    QSpinor *qphix_in[2];
+    std::vector < QSpinor** > qphix_out;
+    qphix_out.resize( num_shifts );
+    for( int shift = 0; shift < num_shifts; shift++ ){
+      qphix_out[shift] = new QSpinor*[2];
+      for (int fl : {0, 1}) {
+        q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+        qphix_out[shift][fl] = q_spinor_handles.back().get();
+      }
+    }
+    
+    QSpinor *qphix_buffer[2];
+    for (int fl : {0, 1}) {
+      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+      qphix_in[fl] = q_spinor_handles.back().get();
+      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+      qphix_buffer[fl] = q_spinor_handles.back().get();
+    }
+    
+    QClover *qphix_clover = nullptr;
+    QClover_inner *qphix_clover_inner = nullptr;
+    
+    QClover *qphix_invclov_odiag = nullptr;
+    QClover_inner *qphix_invclov_odiag_inner = nullptr;
+
+    QFullClover *qphix_inv_fullclover[2] = {nullptr, nullptr};
+    QFullClover_inner *qphix_inv_fullclover_inner[2] = {nullptr, nullptr};
+
+    QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> *TwoFlavFermionMatrixQPhiX = nullptr;
+    QPhiX::TwoFlavEvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner> *InnerTwoFlavFermionMatrixQPhiX = nullptr;
+    
+    if (g_c_sw > DBL_EPSILON) {  // DBCLOVER
+      qphix_clover = (QClover *)geom.allocCBClov();
+      qphix_invclov_odiag = (QClover *)geom.allocCBClov();
+      if( solver_is_mixed(solver_flag) ){
+        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        qphix_invclov_odiag_inner = (QClover_inner *)geom_inner.allocCBClov();
+      }
+
+      for (int fl : {0, 1}) {
+        qphix_inv_fullclover[fl] = (QFullClover *)geom.allocCBFullClov();
+        if( solver_is_mixed(solver_flag) ){
+          qphix_inv_fullclover_inner[fl] = (QFullClover_inner*)geom_inner.allocCBFullClov();
+        }
+      }
+
+      pack_nd_clover(geom, geom_inner, 
+                     qphix_inv_fullclover, qphix_invclov_odiag, qphix_clover,
+                     qphix_inv_fullclover_inner, qphix_invclov_odiag_inner, qphix_clover_inner,
+                     cb_odd,
+                     solver_is_mixed(solver_flag));
+
+      QPhiX::masterPrintf("# QPHIX: Creating two-flavour QPhiX Wilson Twisted Clover Fermion Matrix...\n");
+      TwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMCloverReuseOperator<FT, V, S, compress>(
+          -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, 
+          u_packed, qphix_clover, qphix_invclov_odiag, qphix_inv_fullclover,
+          &geom, t_boundary,
+          coeff_s, coeff_t, use_tbc, tbc_phases);
+      if( solver_is_mixed(solver_flag) ){
+        InnerTwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMCloverReuseOperator<FT_inner, V_inner, S_inner, compress_inner>(
+            -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, 
+            u_packed_inner, qphix_clover_inner, qphix_invclov_odiag_inner, qphix_inv_fullclover_inner,
+            &geom_inner, t_boundary,
+            coeff_s, coeff_t, use_tbc, tbc_phases);
+      }
+    } else {  // DBTMWILSON
+      QPhiX::masterPrintf("# QPHIX: Creating two-flavour QPhiX Wilson Twisted Mass Fermion Matrix...\n");
+      TwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMWilsonReuseOperator<FT, V, S, compress>(
+          mass, -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed, &geom, t_boundary,
+          coeff_s, coeff_t, use_tbc, tbc_phases);
+      if( solver_is_mixed(solver_flag) ){
+        InnerTwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMWilsonReuseOperator<
+                                               FT_inner, V_inner, S_inner, compress_inner>(
+          mass, -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed_inner, &geom_inner, t_boundary,
+          coeff_s, coeff_t, use_tbc, tbc_phases);
+      }
+    }
+
+    //
+    QPhiX::AbstractSolver<FT, V, S, compress, nf> *TwoFlavSolverQPhiX = nullptr;
+    QPhiX::AbstractSolver<FT_inner, V_inner, S_inner, compress_inner, nf> *InnerTwoFlavSolverQPhiX = nullptr;
+    QPhiX::AbstractMultiSolver<FT, V, S, compress, nf> *TwoFlavMultiSolverQPhiX = nullptr;
+    if( solver_flag == DUMMYHERMTEST ) {
+      QPhiX::masterPrintf("# QPHIX: Creating dummy solver for hermiticity test...\n");
+      TwoFlavSolverQPhiX =
+        new QPhiX::InvDummyHermTest<FT, V, S, compress,
+                                    typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+              *TwoFlavFermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == CG) {
+      QPhiX::masterPrintf("# QPHIX: Creating CG solver...\n");
+      TwoFlavSolverQPhiX =
+          new QPhiX::InvCG<FT, V, S, compress,
+                           typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+              *TwoFlavFermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == BICGSTAB) {
+      QPhiX::masterPrintf("# QPHIX: Creating BiCGstab solver...\n");
+      TwoFlavSolverQPhiX =
+          new QPhiX::InvBiCGStab<FT, V, S, compress,
+                                 typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+              *TwoFlavFermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == MIXEDCG) {
+      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision CG solver...\n");
+      InnerTwoFlavSolverQPhiX = new QPhiX::InvCG<
+          FT_inner, V_inner, S_inner, compress_inner,
+          typename QPhiX::TwoFlavEvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner> 
+        >(
+            *InnerTwoFlavFermionMatrixQPhiX, max_iter
+         );
+      const bool MMdag = true;
+      TwoFlavSolverQPhiX = new QPhiX::InvRichardsonMultiPrec
+        <
+          FT, V, S, compress, 
+          FT_inner, V_inner, S_inner, compress_inner,
+          MMdag, typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> 
+        >(
+            *TwoFlavFermionMatrixQPhiX,
+            *InnerTwoFlavSolverQPhiX,
+            solver_params.mcg_delta,
+            max_iter
+        );
+    } else if ( solver_flag == CGMMSND ) {
+      QPhiX::masterPrintf("# QPHIX: Creating multi-shift CG solver...\n");
+      TwoFlavMultiSolverQPhiX = new QPhiX::MInvCG<FT, V, S, compress, 
+                                                  typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress>
+                                                  >( *TwoFlavFermionMatrixQPhiX, max_iter, num_shifts );
+    } else {
+      QPhiX::masterPrintf(" Solver not yet supported by QPhiX!\n");
+      QPhiX::masterPrintf(" Aborting...\n");
+      abort();
+    }
+    QPhiX::masterPrintf("# QPHIX: ...done.\n");
+
+    for (int fl : {0, 1}) {
+//       reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(tmlqcd_odd_in[0][fl]),
+//                                  qphix_in[fl], cb_odd);
+      reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_in[0][fl],
+                                 qphix_in[fl], cb_odd);      
+    }
+
+    QPhiX::masterPrintf("# QPHIX: Calling the solver...\n");
+
+    // Set the right precision for the QPhiX solver.
+    // We get target_precision externally, and it is given such that it is either
+    // already relative or absolute.
+    // Most QPhiX solvers allow setting absolute or relative residual
+    // by passing an appropriate flag, but this is not true for the multi-shift solver.
+    // As a result, we follow that solver and call ALL solvers with
+    // QPhiX::RELATIVE, which gives results consistent with tmLQCD in all cases. 
+    double rhs_norm2 = 1.0;
+    QPhiX::norm2Spinor<FT, V, S, compress, nf>(rhs_norm2, qphix_in, geom, n_blas_simt);
+    const double RsdTarget = sqrt(target_precision / rhs_norm2);
+
+    // Calling the solver
+    start_time = gettime();
+    if ( solver_flag == DUMMYHERMTEST ){
+      for(int fl : {0, 1}){
+        random_spinor_field_eo(tmlqcd_odd_out[0][fl], 0, RN_GAUSS);
+        reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_out[0][fl], qphix_buffer[fl], cb_odd);      
+      }
+      for( int isign : {-1, 1} ){
+        (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops, mv_apps, isign,
+                       verbose, cb_odd, QPhiX::RELATIVE);
+      }
+      QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+    } else if (solver_flag == CG || solver_flag == MIXEDCG) {
+      // USING CG:
+      // We are solving
+      //   M M^dagger qphix_buffer = qphix_in_prepared
+      // here, that is, isign = -1 for the QPhiX CG solver.
+      (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops, mv_apps, -1,
+                     verbose, cb_odd, QPhiX::RELATIVE);
+      // After that, if required by the solution type, multiply with M^dagger:
+      //   qphix_out[0] = M^dagger M^dagger^-1 M^-1 qphix_in_prepared
+      if (solver_params.solution_type == TM_SOLUTION_M) {
+        (*TwoFlavFermionMatrixQPhiX)(qphix_out[0], qphix_buffer, /* conjugate */ -1);
+        mv_apps++;
+      } else {
+        QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    } else if (solver_flag == BICGSTAB || solver_flag == MIXEDBICGSTAB) {
+      (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops, mv_apps, 1,
+                            verbose, cb_odd, QPhiX::RELATIVE);
+      // for M^dagger^-1 M^-1 solution type, need to call BiCGstab twice
+      if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+        (*TwoFlavSolverQPhiX)(qphix_out[0], qphix_buffer, RsdTarget, niters2, rsd_final, site_flops, mv_apps2,
+                              -1, verbose, cb_odd, QPhiX::RELATIVE);
+      } else {
+        QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    } else if (solver_flag == CGMMSND ){
+      // TODO: handle the residuals properly
+      if(g_debug_level > 2 ) QPhiX::masterPrintf("# QPHIX CGMMSND: shifts: \n");
+      // tmLQCD weights the operator with 1/maxev in the RHMC relative to the shifts
+      // we will do this externally on the inverse (in monomial_solve) and thus need to weight
+      // the shifts by maxev^2
+      const double maxev_sq = (1.0/phmc_invmaxev)*(1.0/phmc_invmaxev);
+      for( int shift = 0; shift < num_shifts; shift++ ){
+        RsdTargetArr[shift] = RsdTarget;
+        RsdFinalArr[shift] = -1.0;
+        shifts[shift] = maxev_sq *
+                        solver_params.shifts[shift]*solver_params.shifts[shift] /
+                        (4*g_kappa*g_kappa); 
+        if(g_debug_level > 2 ) QPhiX::masterPrintf("# [%d] = %lf\n", shift, shifts[shift]);
+      }
+      if(g_debug_level > 2 ) QPhiX::masterPrintf("\n");
+      (*TwoFlavMultiSolverQPhiX)(qphix_out.data(), qphix_in, num_shifts, shifts.data(), 
+                          RsdTargetArr.data(), niters, RsdFinalArr.data(), site_flops, mv_apps, -1, verbose );
+      rsd_final = RsdFinalArr[0];
+    }
+    end_time = gettime();
+
+    for(int shift = 0; shift < num_shifts; shift++){
+      for (int fl : {0, 1}) {
+        reorder_eo_spinor_from_QPhiX(geom, tmlqcd_odd_out[shift][fl],
+                                    qphix_out[shift][fl], cb_odd, rescale);
+      }
+    }
+
+    delete TwoFlavFermionMatrixQPhiX;
+    delete InnerTwoFlavFermionMatrixQPhiX;
+    delete InnerTwoFlavSolverQPhiX;
+    delete TwoFlavMultiSolverQPhiX;
+    delete TwoFlavSolverQPhiX;
+    for( int shift = 0; shift < num_shifts; shift++ ){
+      delete[] qphix_out[shift];
+    }
+    
+    if(qphix_clover) geom.free(qphix_clover);
+    if(qphix_invclov_odiag) geom.free(qphix_invclov_odiag);
+    if(qphix_clover_inner) geom_inner.free(qphix_clover_inner);
+    if(qphix_invclov_odiag_inner) geom_inner.free(qphix_invclov_odiag_inner);
+    for (int fl : {0, 1}) {
+      if(qphix_inv_fullclover[fl]) geom.free(qphix_inv_fullclover[fl]);
+      if(qphix_inv_fullclover_inner[fl]) geom_inner.free(qphix_inv_fullclover_inner[fl]);
+    }    
+
+  } else { // if(num_flavour)
+    // complain, this number of flavours is not valid
+  } // if(num_flavour)
+
+  for (int cb : {0, 1}) {
+    if(u_packed[cb]) geom.free(u_packed[cb]);
+    if(u_packed_inner[cb]) geom_inner.free(u_packed_inner[cb]);
+  }
+
+  // FIXME: This should be called properly somewhere else
+  _endQphix();
+
+  QPhiX::masterPrintf("# ...done.\n\n");
+
+  uint64_t num_cb_sites = lattSize[0] / 2 * lattSize[1] * lattSize[2] * lattSize[3];
+  // FIXME: this needs to be adjusted depending on the operator used
+  uint64_t op_flops_per_site = 1320;
+  uint64_t total_flops = (site_flops + site_flops2 + 
+                          (2 * num_flavour * op_flops_per_site) * (mv_apps+mv_apps2) ) * num_cb_sites;
+  QPhiX::masterPrintf("# QPHIX: Solver Time = %g sec\n", (end_time - start_time));
+  QPhiX::masterPrintf("# QPHIX: Performance in GFLOPS = %g\n\n", 1.0e-9 * total_flops / (end_time - start_time));  
+
+  if( solver_is_mixed(solver_flag) ){
+    // the mixed solver reports the outer iterations, we would like to get
+    // some better total
+    niters = mv_apps / 2;
+    if( solver_flag == MIXEDBICGSTAB && solver_params.solution_type == TM_SOLUTION_M_MDAG ){
+      niters2 = mv_apps2 / 2;
+    }
+  }
+  // solver did not converge in maximum number of iterations
+  // FIXME: non-convergence does not work correctly yet
+  if( (niters+niters2) > max_iter ){
+    niters = -1;
+    niters2 = 0;
+  }
+  return (niters+niters2);
+}
+
+// Due to GitHub issue #404, the helper functions that apply the full QPhiX operator are
+// currently disabled: they conflict with the new interfaces in QPhiX itself. If they are
+// needed again, they should be rewritten on top of those interfaces rather than the base
+// classes in qphix_base_classes.hpp. Mfull_qphix below is therefore left commented out.
+
+// Template wrapper for the Dslash operator call-able from C code
+//void Mfull_qphix(spinor *Even_out, spinor *Odd_out, const spinor *Even_in, const spinor *Odd_in,
+//                 const op_type_t op_type) {
+//  tmlqcd::checkQphixInputParameters(qphix_input);
+//  // FIXME: two-row gauge compression and double precision hard-coded
+//  _initQphix(0, nullptr, qphix_input, 12, QPHIX_DOUBLE_PREC);
+//
+//  if (qphix_precision == QPHIX_DOUBLE_PREC) {
+//    if (QPHIX_SOALEN > VECLEN_DP) {
+//      QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
+//                          VECLEN_DP);
+//      abort();
+//    }
+//    QPhiX::masterPrintf("TESTING IN DOUBLE PRECISION \n");
+//    if (compress12) {
+//      Mfull_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                          op_type);
+//    } else {
+//      Mfull_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                           op_type);
+//    }
+//  } else if (qphix_precision == QPHIX_FLOAT_PREC) {
+//    if (QPHIX_SOALEN > VECLEN_SP) {
+//      QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d\n", QPHIX_SOALEN,
+//                          VECLEN_SP);
+//      abort();
+//    }
+//    QPhiX::masterPrintf("TESTING IN SINGLE PRECISION \n");
+//    if (compress12) {
+//      Mfull_helper<float, VECLEN_SP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                         op_type);
+//    } else {
+//      Mfull_helper<float, VECLEN_SP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                          op_type);
+//    }
+//  }
+//#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
+//  else if (qphix_precision == QPHIX_HALF_PREC) {
+//    if (QPHIX_SOALEN > VECLEN_HP) {
+//      QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d\n", QPHIX_SOALEN,
+//                          VECLEN_HP);
+//      abort();
+//    }
+//    QPhiX::masterPrintf("TESTING IN HALF PRECISION \n");
+//    if (compress12) {
+//      Mfull_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                               op_type);
+//    } else {
+//      Mfull_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                                op_type);
+//    }
+//  }
+//#endif
+//}
+
+// we have a unified interface for n-flavour inversions, but we need to provide wrappers
+// which can be called by the tmLQCD solver drivers for one and two-flavour inversions
+int invert_eo_qphix_oneflavour(spinor *Odd_out_1f, spinor *Odd_in_1f, const int max_iter,
+                               const double precision, const int solver_flag, const int rel_prec,
+                               const solver_params_t solver_params, const SloppyPrecision sloppy,
+                               const CompressionType compression) {
+  // Single-flavour, single-shift entry point: wrap the raw spinor pointers in the
+  // [shift][flavour] / [rhs][flavour] containers taken by the unified n-flavour
+  // driver and forward all solver settings unchanged.
+  const int num_flavour = 1;
+  const int num_shifts = 1;
+
+  // one right-hand side with one flavour component
+  std::vector<std::vector<spinor *> > Odd_in(1, std::vector<spinor *>(num_flavour, Odd_in_1f));
+  // one solution (single shift) with one flavour component
+  std::vector<std::vector<spinor *> > Odd_out(num_shifts,
+                                              std::vector<spinor *>(num_flavour, Odd_out_1f));
+
+  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
+                                         rel_prec, solver_params, sloppy, compression, num_flavour);
+}
+
+int invert_eo_qphix_oneflavour_mshift(spinor **Odd_out_1f, spinor *Odd_in_1f, const int max_iter,
+                                      const double precision, const int solver_flag, const int rel_prec,
+                                      const solver_params_t solver_params, const SloppyPrecision sloppy,
+                                      const CompressionType compression) {
+  // Single-flavour, multi-shift entry point. Odd_out_1f is indexed by shift; each entry
+  // becomes the only flavour component of the corresponding solution vector.
+  const int num_flavour = 1;
+  // even though the default is set to 1, guard against zeroes
+  int num_shifts = solver_params.no_shifts;
+  if (num_shifts == 0) {
+    num_shifts = 1;
+  }
+
+  // single right-hand side
+  std::vector<std::vector<spinor *> > Odd_in(1, std::vector<spinor *>(num_flavour, Odd_in_1f));
+  std::vector<std::vector<spinor *> > Odd_out(num_shifts);
+  for (int shift = 0; shift < num_shifts; ++shift) {
+    Odd_out[shift].assign(num_flavour, Odd_out_1f[shift]);
+  }
+
+  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
+                                         rel_prec, solver_params, sloppy, compression, num_flavour);
+}
+
+// Template wrapper for QPhiX solvers callable from C code, return number of iterations
+int invert_eo_qphix_twoflavour(spinor *Odd_out_s, spinor *Odd_out_c, spinor *Odd_in_s,
+                               spinor *Odd_in_c, const int max_iter, const double precision,
+                               const int solver_flag, const int rel_prec,
+                               const solver_params_t solver_params, const SloppyPrecision sloppy,
+                               const CompressionType compression) {
+  // Two-flavour, single-shift entry point: flavour index 0 carries the "_s" spinors and
+  // index 1 the "_c" spinors, for both the source and the solution.
+  const int num_flavour = 2;
+  const int num_shifts = 1;
+
+  std::vector<std::vector<spinor *> > Odd_in(1, std::vector<spinor *>(num_flavour));
+  std::vector<std::vector<spinor *> > Odd_out(num_shifts, std::vector<spinor *>(num_flavour));
+
+  // source
+  Odd_in[0][0] = Odd_in_s;
+  Odd_in[0][1] = Odd_in_c;
+  // solution
+  Odd_out[0][0] = Odd_out_s;
+  Odd_out[0][1] = Odd_out_c;
+
+  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
+                                         rel_prec, solver_params, sloppy, compression, num_flavour);
+}
+
+int invert_eo_qphix_twoflavour_mshift(spinor **Odd_out_s, spinor **Odd_out_c, spinor *Odd_in_s,
+                                      spinor *Odd_in_c, const int max_iter, const double precision,
+                                      const int solver_flag, const int rel_prec,
+                                      const solver_params_t solver_params, const SloppyPrecision sloppy,
+                                      const CompressionType compression) {
+  // Two-flavour, multi-shift entry point. Odd_out_s and Odd_out_c are indexed by shift
+  // and fill flavour slots 0 and 1 of each solution vector, respectively.
+  const int num_flavour = 2;
+  // even though the default is set to 1, guard against zeroes
+  int num_shifts = solver_params.no_shifts;
+  if (num_shifts == 0) {
+    num_shifts = 1;
+  }
+
+  std::vector<std::vector<spinor *> > Odd_in(1, std::vector<spinor *>(num_flavour));
+  Odd_in[0][0] = Odd_in_s;
+  Odd_in[0][1] = Odd_in_c;
+
+  std::vector<std::vector<spinor *> > Odd_out(num_shifts, std::vector<spinor *>(num_flavour));
+  for (int shift = 0; shift < num_shifts; ++shift) {
+    Odd_out[shift][0] = Odd_out_s[shift];
+    Odd_out[shift][1] = Odd_out_c[shift];
+  }
+
+  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
+                                         rel_prec, solver_params, sloppy, compression, num_flavour);
+}
+
+// C++ driver behind the C-callable wrappers: picks the QPhiX precision, returns the helper's result
+// the interface is prepared for multi-rhs solves, hence the double vector for the input
+int invert_eo_qphix_nflavour_mshift(std::vector< std::vector< spinor* > > &Odd_out, // indexed [shift][flavour]
+                                    std::vector< std::vector< spinor* > > &Odd_in, // indexed [rhs][flavour]; only rhs 0 enters the norm below
+                                    const double precision,
+                                    const int max_iter,
+                                    const int solver_flag, 
+                                    const int rel_prec, // 1: precision is multiplied by the source norm
+                                    solver_params_t solver_params,
+                                    const SloppyPrecision sloppy, const CompressionType compression,
+                                    const int num_flavour) {
+  tmlqcd::checkQphixInputParameters(qphix_input);
+  double target_precision = precision;
+  double src_norm = 0.0;
+  for (int f = 0; f < num_flavour; ++f) {
+    src_norm += square_norm(Odd_in[0][f], VOLUME / 2, 1);
+  }
+  // we use "precision_lambda" to determine if a system can be solved in half or float
+  // precision (when a fixed-precision solver is used)
+  double precision_lambda = target_precision / src_norm; // NOTE(review): src_norm == 0 is not guarded
+  if (rel_prec == 1) {
+    QPhiX::masterPrintf("# QPHIX: Using relative precision\n");
+    target_precision = precision * src_norm;
+    precision_lambda = precision;
+  }
+  QPhiX::masterPrintf("# QPHIX: precision_lambda: %g, target_precision: %g\n\n", precision_lambda,
+                      target_precision);
+
+  // mixed solvers require inner and outer precisions, which we specify explicitly here
+  if( solver_is_mixed(solver_flag) ){
+#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
+    if( sloppy == SLOPPY_HALF ){
+      if( QPHIX_SOALEN > VECLEN_DP || QPHIX_SOALEN > VECLEN_HP ){
+        QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d or the double prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_HP, VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE-HALF PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_HALF_PREC);
+      if(compress12) {
+        return invert_eo_qphix_helper<
+          double, VECLEN_DP, QPHIX_SOALEN, true, QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<
+          double, VECLEN_DP, QPHIX_SOALEN, false, QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else // this else binds to the SLOPPY_SINGLE test after #endif
+#else
+    if (sloppy == SLOPPY_HALF) {
+      QPhiX::masterPrintf("QPHIX interface: half precision not supported on this architecture!\n");
+      abort();
+    } else // this else binds to the SLOPPY_SINGLE test after #endif
+#endif
+    if( sloppy == SLOPPY_SINGLE ) {
+      if( QPHIX_SOALEN > VECLEN_DP || QPHIX_SOALEN > VECLEN_SP ){
+        QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d or the double prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_SP, VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE-SINGLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_FLOAT_PREC);
+      if(compress12) {
+        return invert_eo_qphix_helper<
+          double, VECLEN_DP, QPHIX_SOALEN, true, float, VECLEN_SP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<
+          double, VECLEN_DP, QPHIX_SOALEN, false, float, VECLEN_SP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else { // if(sloppy)
+      if( QPHIX_SOALEN > VECLEN_DP ){
+        QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE-DOUBLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_DOUBLE_PREC);
+      if(compress12) {
+        return invert_eo_qphix_helper<
+          double, VECLEN_DP, QPHIX_SOALEN, true>( // inner template arguments left at their defaults
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<
+          double, VECLEN_DP, QPHIX_SOALEN, false>( // inner template arguments left at their defaults
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } // if( sloppy )
+  } else { // if( solver_is_mixed )
+#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
+    if (sloppy == SLOPPY_HALF || precision_lambda >= rsdTarget<QPhiX::half>::value) {
+      if (QPHIX_SOALEN > VECLEN_HP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_HP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
+      QPhiX::masterPrintf("# USING HALF PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_HALF_PREC);
+  
+      if (compress12) {
+        return invert_eo_qphix_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else // this else binds to the single-precision test after #endif
+ #else
+    if (sloppy == SLOPPY_HALF) {
+      QPhiX::masterPrintf("QPHIX interface: half precision not supported on this architecture!\n");
+      abort();
+    } else // this else binds to the single-precision test after #endif
+ #endif
+    if (sloppy == SLOPPY_SINGLE || precision_lambda >= rsdTarget<float>::value) {
+      if (QPHIX_SOALEN > VECLEN_SP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_SP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
+      QPhiX::masterPrintf("# USING SINGLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_FLOAT_PREC);
+  
+      if (compress12) {
+        return invert_eo_qphix_helper<float, VECLEN_SP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<float, VECLEN_SP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else {
+      if (QPHIX_SOALEN > VECLEN_DP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC);
+  
+      if (compress12) {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    }  // if( sloppy || precision_lambda )
+  } // if( solver_is_mixed )
+  return -1; // not reached: every branch above either returns or aborts
+}
+
+void tmlqcd::checkQphixInputParameters(const tm_QPhiXParams_t &params) {
+  // Report the first invalid setting (checked in the order MinCt, By/Bz, thread count) and abort.
+  const char *error = nullptr;
+  if (params.MinCt == 0) {
+    error = "QPHIX Error: MinCt cannot be 0! Minimal value: 1. Aborting.\n";
+  } else if (params.By == 0 || params.Bz == 0) {
+    error = "QPHIX Error: By and Bz may not be 0! Minimal value: 1. Aborting.\n";
+  } else if (params.NCores * params.Sy * params.Sz != omp_num_threads) {
+    error = "QPHIX Error: NCores * Sy * Sz != ompnumthreads ! Aborting.\n";
+  }
+  if (error == nullptr) return;
+  QPhiX::masterPrintf("%s", error);
+  abort();
+}
+
+void tmlqcd::printQphixDiagnostics(int VECLEN, int SOALEN, bool compress, int VECLEN_inner, int SOALEN_inner, bool compress_inner) {
+  // Print the vector/SOA lengths passed in, followed by the QPhiX geometry settings held outside this function.
+  QPhiX::masterPrintf("# QphiX: VECLEN=%d SOALEN=%d VECLEN_inner=%d, SOALEN_inner=%d\n", VECLEN, SOALEN, VECLEN_inner, SOALEN_inner);
+  QPhiX::masterPrintf("# QphiX: Declared QMP Topology (xyzt):");
+  for (int dir = 0; dir < 4; ++dir) {
+    QPhiX::masterPrintf(" %d", qmp_geom[dir]);
+  }
+  QPhiX::masterPrintf("\n");
+  QPhiX::masterPrintf("# QphiX: Mapping of dimensions QMP -> tmLQCD (xyzt):");
+  for (int dir = 0; dir < 4; ++dir) {
+    QPhiX::masterPrintf(" %d->%d", dir, qmp_tm_map[dir]);
+  }
+  QPhiX::masterPrintf("\n");
+  QPhiX::masterPrintf("# QphiX: Global Lattice Size (xyzt) = ");
+  for (int dir = 0; dir < 4; ++dir) QPhiX::masterPrintf(" %d", lattSize[dir]);
+  QPhiX::masterPrintf("\n");
+  QPhiX::masterPrintf("# QphiX: Local Lattice Size (xyzt) = ");
+  for (int dir = 0; dir < 4; ++dir) QPhiX::masterPrintf(" %d", subLattSize[dir]);
+  QPhiX::masterPrintf("\n");
+
+  QPhiX::masterPrintf("# QphiX: Block Sizes: By= %d Bz=%d\n", By, Bz);
+  QPhiX::masterPrintf("# QphiX: Cores = %d\n", NCores);
+  QPhiX::masterPrintf("# QphiX: SMT Grid: Sy=%d Sz=%d\n", Sy, Sz);
+  QPhiX::masterPrintf("# QphiX: Pad Factors: PadXY=%d PadXYZ=%d\n", PadXY, PadXYZ);
+  QPhiX::masterPrintf("# QphiX: Threads_per_core = %d\n", N_simt);
+  QPhiX::masterPrintf("# QphiX: MinCt = %d\n", MinCt);
+
+  if (compress) {
+    QPhiX::masterPrintf("# QphiX: Using two-row gauge compression (compress12)\n");
+  }
+  if (compress_inner) {
+    QPhiX::masterPrintf("# QphiX: Inner solver using two-row gauge compression (compress12)\n");
+  }
+}
+
+void testSpinorPackers(spinor *Even_out, spinor *Odd_out, const spinor *const Even_in,
+                       const spinor *const Odd_in) {
+  // Round-trip test of the even/odd spinor packers: pack both checkerboards into the
+  // QPhiX layout, unpack them again into Even_out/Odd_out and print the summed
+  // square_norm of the differences to the inputs.
+  tmlqcd::checkQphixInputParameters(qphix_input);
+  // FIXME: two-row gauge compression and double precision hard-coded
+  _initQphix(0, nullptr, qphix_input, 12, QPHIX_DOUBLE_PREC);
+
+  // The geometry holds doubles, so it must use the double-precision vector length
+  // (VECLEN_DP), matching the QPHIX_DOUBLE_PREC initialisation above.
+  QPhiX::Geometry<double, VECLEN_DP, QPHIX_SOALEN, true> geom(subLattSize, By, Bz, NCores, Sy, Sz,
+                                                              PadXY, PadXYZ, MinCt);
+
+  auto qphix_cb_even = QPhiX::makeFourSpinorHandle(geom);
+  auto qphix_cb_odd = QPhiX::makeFourSpinorHandle(geom);
+
+  // scratch fields for the differences between output and input
+  spinor **tmp;
+  init_solver_field(&tmp, VOLUME / 2, 2);
+
+  // tmLQCD -> QPhiX
+  reorder_eo_spinor_to_QPhiX(geom, Even_in, qphix_cb_even.get(), cb_even);
+  reorder_eo_spinor_to_QPhiX(geom, Odd_in, qphix_cb_odd.get(), cb_odd);
+
+  // QPhiX -> tmLQCD, with a rescale factor of 1.0
+  reorder_eo_spinor_from_QPhiX(geom, Even_out, qphix_cb_even.get(), cb_even, 1.0);
+  reorder_eo_spinor_from_QPhiX(geom, Odd_out, qphix_cb_odd.get(), cb_odd, 1.0);
+
+  diff(tmp[0], Even_out, Even_in, VOLUME / 2);
+  diff(tmp[1], Odd_out, Odd_in, VOLUME / 2);
+  double l2norm = square_norm(tmp[0], VOLUME / 2, 1) + square_norm(tmp[1], VOLUME / 2, 1);
+  QPhiX::masterPrintf("QPHIX eo spinor packer back and forth difference L2 norm: %lf\n", l2norm);
+  finalize_solver(tmp, 2);
+}
diff --git a/qphix_interface.h b/qphix_interface.h
new file mode 100644
index 000000000..01b5318a4
--- /dev/null
+++ b/qphix_interface.h
@@ -0,0 +1,70 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifndef QPHIX_INTERFACE_H_
+#define QPHIX_INTERFACE_H_
+
+#include "global.h"
+#include "qphix_types.h"
+
+#ifdef __cplusplus /* If this is a C++ compiler, use C linkage */
+extern "C" {
+#endif
+
+#include "misc_types.h"
+#include "operator_types.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_params.h"
+#include "su3.h"
+
+int invert_eo_qphix_oneflavour(spinor* const Odd_out, spinor* const Odd_in, const int max_iter,
+                               const double precision, const int solver_flag, const int rel_prec,
+                               solver_params_t solver_params, const SloppyPrecision sloppy,
+                               const CompressionType compression); /* one flavour, single shift */
+/* One flavour, multi-shift: Odd_out[i] is the output spinor for shift i. */
+int invert_eo_qphix_oneflavour_mshift(spinor** Odd_out, spinor* const Odd_in, const int max_iter,
+                                      const double precision, const int solver_flag, const int rel_prec,
+                                      solver_params_t solver_params, const SloppyPrecision sloppy,
+                                      const CompressionType compression);
+/* Two flavours (the _s and _c spinors), single shift. */
+int invert_eo_qphix_twoflavour(spinor* Odd_out_s, spinor* Odd_out_c, spinor* Odd_in_s,
+                               spinor* Odd_in_c, const int max_iter, const double precision,
+                               const int solver_flag, const int rel_prec,
+                               solver_params_t solver_params, const SloppyPrecision sloppy,
+                               const CompressionType compression);
+/* Two flavours, multi-shift: Odd_out_s[i] and Odd_out_c[i] are the output spinors for shift i. */
+int invert_eo_qphix_twoflavour_mshift(spinor** Odd_out_s, spinor** Odd_out_c, spinor* Odd_in_s,
+                                      spinor* Odd_in_c, const int max_iter, const double precision,
+                                      const int solver_flag, const int rel_prec,
+                                      solver_params_t solver_params, const SloppyPrecision sloppy,
+                                      const CompressionType compression);
+/* NOTE(review): the only definition of Mfull_qphix visible in this patch is commented out. */
+void Mfull_qphix(spinor* Even_out, spinor* Odd_out, const spinor* Even_in, const spinor* Odd_in,
+                 const op_type_t op_type);
+/* Sends the inputs through the QPhiX spinor layout and back, then prints the difference norm. */
+void testSpinorPackers(spinor* Even_out, spinor* Odd_out, const spinor* const Even_in,
+                       const spinor* const Odd_in);
+
+#ifdef __cplusplus /* If this is a C++ compiler, end C linkage */
+}
+#endif
+#endif /* QPHIX_INTERFACE_H_ */
diff --git a/qphix_interface.hpp b/qphix_interface.hpp
new file mode 100644
index 000000000..b487eda66
--- /dev/null
+++ b/qphix_interface.hpp
@@ -0,0 +1,51 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#pragma once
+
+#include "global.h"
+#include "qphix_types.h"
+
+#ifdef __cplusplus /* If this is a C++ compiler, use C linkage */
+extern "C" {
+#endif
+
+#include "misc_types.h"
+#include "operator_types.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_params.h"
+#include "su3.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#include <vector>
+
+int invert_eo_qphix_nflavour_mshift(std::vector< std::vector< spinor* > > &Odd_out, // indexed [shift][flavour]
+                                    std::vector< std::vector< spinor* > > &Odd_in, // indexed [rhs][flavour]
+                                    const double precision,
+                                    const int max_iter,
+                                    const int solver_flag, 
+                                    const int rel_prec, // 1: precision is multiplied by the source norm
+                                    solver_params_t solver_params,
+                                    const SloppyPrecision sloppy, const CompressionType compression,
+                                    const int num_flavour); // number of flavour components per vector
\ No newline at end of file
diff --git a/qphix_interface_utils.hpp b/qphix_interface_utils.hpp
new file mode 100644
index 000000000..56d8afe56
--- /dev/null
+++ b/qphix_interface_utils.hpp
@@ -0,0 +1,33 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Peter Labus
+ *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#pragma once
+
+#include "qphix_types.h"
+
+namespace tmlqcd {
+// Prints a message and aborts if MinCt, By or Bz is 0 or NCores * Sy * Sz != omp_num_threads.
+void checkQphixInputParameters(const tm_QPhiXParams_t &params);
+void printQphixDiagnostics(int VECLEN, int SOALEN, bool compress, int VECLEN_inner, int SOALEN_inner, bool compress_inner);  // prints vector lengths and geometry settings via QPhiX::masterPrintf
+
+}  // namespace tmlqcd
diff --git a/qphix_test_Dslash.c b/qphix_test_Dslash.c
new file mode 100644
index 000000000..3197c3028
--- /dev/null
+++ b/qphix_test_Dslash.c
@@ -0,0 +1,430 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Peter Labus
+ *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include <tmlqcd_config.h>
+#endif
+#ifdef TM_USE_QPHIX
+#include <qphix/qphix_config.h>
+#endif
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#if (defined BGL && !defined BGP)
+#include <rts.h>
+#endif
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#ifdef HAVE_LIBLEMON
+#include <io/gauge.h>
+#include <io/params.h>
+#endif
+#endif
+#ifdef TM_USE_OMP
+#include <omp.h>
+#include "init/init_openmp.h"
+#endif
+#ifdef QPHIX_QMP_COMMS
+#include <qmp.h>
+#endif
+#include "boundary.h"
+#include "gamma.h"
+#include "geometry_eo.h"
+#include "gettime.h"
+#include "global.h"
+#include "init/init.h"
+#include "init/init.h"
+#include "invert_clover_eo.h"
+#include "invert_eo.h"
+#include "linalg/assign_add_mul_r.h"
+#include "linalg/convert_eo_to_lexic.h"
+#include "linalg/diff_and_square_norm.h"
+#include "linalg/square_norm.h"
+#include "mpi_init.h"
+#include "operator.h"
+#include "operator/D_psi.h"
+#include "operator/Hopping_Matrix.h"
+#include "operator/Hopping_Matrix_nocom.h"
+#include "operator/clover_leaf.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clovertm_operators.h"
+#include "operator/tm_operators.h"
+#include "prepare_source.h"
+#include "qphix_interface.h"
+#include "ranlxd.h"
+#include "read_input.h"
+#include "solver/cg_her.h"
+#include "solver/solver_field.h"
+#include "start.h"
+#include "su3.h"
+#include "su3adj.h"
+#include "test/check_geometry.h"
+#include "update_backward_gauge.h"
+#include "xchange/xchange.h"
+#include "struct_accessors.h"
+
+int check_xchange();  // NOTE(review): declared but not called anywhere in this file
+double compare_spinors(spinor* s1, spinor* s2);  // defined after main() in this file
+
+/* Test driver comparing the tmLQCD and QPhiX implementations of the Dirac operator.
+ * For every operator defined in the input file (test_Dslash.input) the same source is passed
+ * through op->applyM() and through Mfull_qphix(), and compare_spinors() measures the squared
+ * norm of the difference of the two results.
+ * Returns 0 if that difference is within the rounding tolerance for every operator and 1
+ * otherwise; a failed set-up step terminates the program with exit status 1. */
+int main(int argc, char* argv[]) {
+  int j;
+  int status = 0;
+
+  static double tm_t1, tm_t2, q_t1, q_t2;
+
+  DUM_DERI = 8;
+  DUM_MATRIX = DUM_DERI + 5;
+  NO_OF_SPINORFIELDS = DUM_MATRIX + 4;
+
+  /* Set the input file */
+  char input_filename[500];
+  snprintf(input_filename, sizeof(input_filename), "test_Dslash.input");
+
+  init_parallel_and_read_input(argc, argv, input_filename);
+  tmlqcd_mpi_init(argc, argv);
+  g_dbw2rand = 0;
+
+#ifdef _GAUGE_COPY
+  init_gauge_field(VOLUMEPLUSRAND, 1);
+#else
+  init_gauge_field(VOLUMEPLUSRAND, 0);
+#endif
+
+  init_geometry_indices(VOLUMEPLUSRAND);
+  j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
+    exit(1);  // a failed set-up must not look like a passed test
+  }
+
+  if (g_proc_id == 0) {
+    fprintf(stdout, "# The number of processes is %d \n", g_nproc);
+    printf("# The lattice size is %d x %d x %d x %d\n", (int)(T * g_nproc_t), (int)(LX * g_nproc_x),
+           (int)(LY * g_nproc_y), (int)(g_nproc_z * LZ));
+    printf("# The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY),
+           (int)LZ);
+    if (even_odd_flag) {
+      printf("# testing the even/odd preconditioned Dirac operator\n");
+    } else {
+      printf("# testing the standard Dirac operator\n");
+    }
+    fflush(stdout);
+  }
+
+  /* define the geometry */
+  geometry();
+
+#ifdef _USE_HALFSPINOR
+  j = init_dirac_halfspinor();
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
+    exit(1);
+  }
+  j = init_dirac_halfspinor32();
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for 32-Bit halfspinor fields! Aborting...\n");
+    exit(1);
+  }
+#if (defined _PERSISTENT)
+  init_xchange_halffield();
+#endif
+#endif
+
+  status = check_geometry();
+  if (status != 0) {
+    fprintf(stderr, "Checking if geometry failed. Unable to proceed.\nAborting....\n");
+    exit(1);
+  }
+
+  start_ranlux(1, 123456);
+  if (startoption == 0) {
+    unit_g_gauge_field();  // unit 3x3 colour matrices
+  } else {
+    random_gauge_field(1, g_gauge_field);
+  }
+
+#ifdef TM_USE_MPI
+  /*For parallelization: exchange the gaugefield */
+  xchange_gauge(g_gauge_field);
+#endif
+
+  g_update_gauge_copy = 1;
+#ifdef _GAUGE_COPY
+  update_backward_gauge(g_gauge_field);
+#endif
+
+  init_operators();
+
+  spinor** qphix_out_cb_spinors;
+  init_solver_field(&qphix_out_cb_spinors, VOLUME / 2, 2);
+
+  spinor** tmp;
+  init_solver_field(&tmp, VOLUME, 2);
+
+  double* difference_l2norm = calloc(no_operators, sizeof(double));
+  if (difference_l2norm == NULL && no_operators > 0) {
+    fprintf(stderr, "Not enough memory for the operator differences! Aborting...\n");
+    exit(1);
+  }
+
+  /* we will loop over the operators defined in the input file
+   * and first apply the tmLQCD operator to the test spinor, then
+   * the QPhiX operator and then compare */
+  for (int op_id = 0; op_id < no_operators; ++op_id) {
+    operator* op = &operator_list[op_id];
+    op_set_globals(op_id);
+    if (op->type == CLOVER || op->type == DBCLOVER) {
+      sw_term((const su3**)g_gauge_field, op->kappa, op->c_sw);
+      sw_invert(EE, op->mu);
+    }
+    boundary(g_kappa);
+    // check BC
+    if (g_proc_id == 0) {
+      printf("\nphase_0 = %f + I*%f\n", creal(phase_0), cimag(phase_0));
+      printf("phase_1 = %f + I*%f\n", creal(phase_1), cimag(phase_1));
+      printf("phase_2 = %f + I*%f\n", creal(phase_2), cimag(phase_2));
+      printf("phase_3 = %f + I*%f\n\n", creal(phase_3), cimag(phase_3));
+    }
+    /* depending on what has been set in the input file, this will create
+     * 1) a point source at source_location, spin/colour corresponding to index_start
+     * 2) a volume source
+     * 3) a time-slice source
+     * for the given operator */
+    prepare_source(0 /*nstore*/, 0 /*isample*/, index_start, op_id, 0 /*read_source_flag*/,
+                   source_location, 12345 /* seed */);
+
+#ifdef TM_USE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+
+    // reference result: tmLQCD operator, source (sr0, sr1) -> result (prop0, prop1)
+    tm_t1 = gettime();
+    op->applyM(op->prop0, op->prop1, op->sr0, op->sr1);
+    tm_t2 = gettime();
+
+#ifdef TM_USE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+    // same source through QPhiX; the result goes to the two halves of qphix_out_cb_spinors
+    q_t1 = gettime();
+    Mfull_qphix(qphix_out_cb_spinors[0], qphix_out_cb_spinors[1], op->sr0, op->sr1, op->type);
+    q_t2 = gettime();
+
+    double squarenorm = square_norm(op->sr0, VOLUME / 2, 1) + square_norm(op->sr1, VOLUME / 2, 1);
+    if (g_proc_id == 0) {
+      printf("  ||source||^2 = %e\n\n", squarenorm);
+      fflush(stdout);
+    }
+
+    // print L2-norm of result:
+    squarenorm = square_norm(op->prop0, VOLUME / 2, 1) + square_norm(op->prop1, VOLUME / 2, 1);
+    if (g_proc_id == 0) {
+      printf("\n\n");
+      printf("# -------------------------------------------- #\n\n");
+      printf("# Dslash 1 (tmLQCD) op_type=%d:\n", op->type);
+      printf("# ====================\n\n");
+      printf("  ||result_1||^2 = %.16e\n", squarenorm);
+      printf("  Time for MV mult: %e\n", tm_t2 - tm_t1);
+      fflush(stdout);
+    }
+
+    // print L2-norm of result:
+    squarenorm = square_norm(qphix_out_cb_spinors[0], VOLUME / 2, 1) +
+                 square_norm(qphix_out_cb_spinors[1], VOLUME / 2, 1);
+    if (g_proc_id == 0) {
+      printf("\n\n");
+      printf("# -------------------------------------------- #\n\n");
+      printf("# Dslash 2 (QPhiX) op_type=%d:\n", op->type);
+      printf("# ====================\n\n");
+      printf("  ||result_2||^2 = %.16e\n", squarenorm);
+      printf("  Time for MV mult: %e\n", q_t2 - q_t1);
+      fflush(stdout);
+    }
+
+    // bring both results into one full-volume field each: tmp[0] = tmLQCD, tmp[1] = QPhiX
+    convert_eo_to_lexic(tmp[0], op->prop0, op->prop1);
+    convert_eo_to_lexic(tmp[1], qphix_out_cb_spinors[0], qphix_out_cb_spinors[1]);
+
+    difference_l2norm[op_id] = compare_spinors(tmp[0], tmp[1]);
+
+  }  // for(op_id)
+
+  // the exit status tells the caller whether any operator exceeded the tolerance
+  int failed = 0;
+  for (int op_id = 0; op_id < no_operators; op_id++) {
+    if (g_proc_id == 0) {
+      printf("op_id: %d, |diff|^2 = %.16e\n", op_id, difference_l2norm[op_id]);
+    }
+    // check if the l2 norm of the difference is tolerable up to rounding
+    // (2.0 makes the threshold a double product instead of an int product that could overflow)
+    if (difference_l2norm[op_id] > 2.0 * g_nproc * VOLUME * DBL_EPSILON) {
+      failed = 1;
+    }
+  }
+
+  free(difference_l2norm);
+  finalize_solver(qphix_out_cb_spinors, 2);
+  finalize_solver(tmp, 2);
+#ifdef TM_USE_OMP
+  free_omp_accumulators();
+#endif
+  free_gauge_field();
+  free_geometry_indices();
+  free_spinor_field();
+  free_moment_field();
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Finalize();
+#endif
+  return (failed);
+}
+
+/* Compare two full-volume spinor fields; main() passes the tmLQCD result as s1 and the QPhiX
+ * result as s2. For a point source the non-zero elements of both fields are listed side by side.
+ * Then diff_and_square_norm(s1, s2, VOLUME) is called -- the listing that follows reads the
+ * difference back from s1, so s1 is expected to be overwritten -- and its value is returned. */
+double compare_spinors(spinor* s1, spinor* s2) {
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  int coords[4];
+  int x, y, z, t, id = 0;
+  // list non-zero elements in spinors, but only if the source type was a point source
+  // otherwise the output is overwhelming
+  if (SourceInfo.type == SRC_TYPE_POINT) {
+    if (g_proc_id == 0) printf("\n OUTPUT TMLQCD vs QPHIX SPINOR (tmlQCD format):\n");
+    if (g_proc_id == 0)
+      printf("g_proc_id | T=%3d LX=%3d LY=%3d LZ=%3d %26s", g_nproc_t * T, g_nproc_x * LX,
+             g_nproc_y * LY, g_nproc_z * LZ, " ");
+    if (g_proc_id == 0)
+      printf("T=%3d LX=%3d LY=%3d LZ=%3d \n", g_nproc_t * T, g_nproc_x * LX, g_nproc_y * LY,
+             g_nproc_z * LZ);
+    for (int t_global = 0; t_global < g_nproc_t * T; t_global++) {
+      coords[0] = t_global / T;
+      for (int x_global = 0; x_global < g_nproc_x * LX; x_global++) {
+        coords[1] = x_global / LX;
+        for (int y_global = 0; y_global < g_nproc_y * LY; y_global++) {
+          coords[2] = y_global / LY;
+          for (int z_global = 0; z_global < g_nproc_z * LZ; z_global++) {
+            coords[3] = z_global / LZ;
+#ifdef TM_USE_MPI
+            MPI_Cart_rank(g_cart_grid, coords, &id);
+#endif
+            if (g_proc_id == id) {
+              t = t_global - g_proc_coords[0] * T;
+              x = x_global - g_proc_coords[1] * LX;
+              y = y_global - g_proc_coords[2] * LY;
+              z = z_global - g_proc_coords[3] * LZ;
+              int idx = g_ipt[t][x][y][z];
+              for (int sc = 0; sc < 24; sc++) {
+                double e_tmlqcd = spinor_get_elem_linear(&s1[idx],sc/2,sc%2);
+                double e_qphix = spinor_get_elem_linear(&s2[idx],sc/2,sc%2);
+                if (fabs(e_tmlqcd) > 2 * DBL_EPSILON || fabs(e_qphix) > 2 * DBL_EPSILON) {
+                  fflush(stdout);
+                  printf("%9d | %5d %6d %6d %6d s%1d c%1d reim%1d : %+5lf %2s", g_proc_id, t_global,
+                         x_global, y_global, z_global, sc / 6, (sc / 2) % 3, sc % 2, e_tmlqcd ,
+                         " ");
+                  printf("%5d %6d %6d %6d s%1d c%1d reim%1d : %+5lf", t_global, x_global, y_global,
+                         z_global, sc / 6, (sc / 2) % 3, sc % 2, e_qphix);
+                  if (fabs(e_tmlqcd - e_qphix) > 2 * DBL_EPSILON) printf(" !!! ");
+                  printf("\n");
+                }
+              }
+            }
+#ifdef TM_USE_MPI
+            MPI_Barrier(MPI_COMM_WORLD);
+#endif
+          }  // z
+        }    // y
+      }      // x
+    }        // t
+  }          // if( SourceInfo.type == SRC_TYPE_POINT )
+
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  if (g_proc_id == 0) {
+    printf("\n");
+    printf("# Comparison tmLQCD vs QPhiX:\n");
+    printf("# ===========================\n\n");
+    printf("\n OUTPUT TMLQCD vs QPHIX SPINOR (tmlQCD format):\n");
+    printf("g_proc_id | T=%3d LX=%3d LY=%3d LZ=%3d \n", g_nproc_t * T, g_nproc_x * LX,
+           g_nproc_y * LY, g_nproc_z * LZ);
+  }
+  double squarenorm = diff_and_square_norm(s1, s2, VOLUME);
+
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  id = 0;
+  for (int t_global = 0; t_global < g_nproc_t * T; t_global++) {
+    coords[0] = t_global / T;
+    for (int x_global = 0; x_global < g_nproc_x * LX; x_global++) {
+      coords[1] = x_global / LX;
+      for (int y_global = 0; y_global < g_nproc_y * LY; y_global++) {
+        coords[2] = y_global / LY;
+        for (int z_global = 0; z_global < g_nproc_z * LZ; z_global++) {
+          coords[3] = z_global / LZ;
+#ifdef TM_USE_MPI
+          MPI_Cart_rank(g_cart_grid, coords, &id);
+#endif
+          if (g_proc_id == id) {
+            t = t_global - g_proc_coords[0] * T;
+            x = x_global - g_proc_coords[1] * LX;
+            y = y_global - g_proc_coords[2] * LY;
+            z = z_global - g_proc_coords[3] * LZ;
+            int idx = g_ipt[t][x][y][z];
+            for (int sc = 0; sc < 24; sc++) {
+              double e_diff = spinor_get_elem_linear(&s1[idx],sc/2,sc%2);
+              // when a volume source is used, these will be zero up to significant rounding
+              // we account for that by the scaling of DBL_EPSILON
+              if (fabs(e_diff) > 8 * 24 * DBL_EPSILON) {
+                fflush(stdout);
+                printf("%9d | %5d %6d %6d %6d s%1d c%1d reim%1d : %+5lf\n", g_proc_id, t_global,
+                       x_global, y_global, z_global, sc / 6, (sc / 2) % 3, sc % 2, e_diff);
+              }
+            }
+          }
+#ifdef TM_USE_MPI
+          MPI_Barrier(MPI_COMM_WORLD);
+#endif
+        }  // z
+      }    // y
+    }      // x
+  }        // t
+
+  if (g_proc_id == 0) {
+    printf("\n  ||result_1 - result_2||^2 = %e\n\n", squarenorm);
+    fflush(stdout);
+  }
+  return squarenorm;
+}
diff --git a/qphix_types.h b/qphix_types.h
new file mode 100644
index 000000000..7dd317844
--- /dev/null
+++ b/qphix_types.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Peter Labus
+ *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifndef QPHIX_TYPES_H
+#define QPHIX_TYPES_H
+
+typedef struct tm_QPhiXParams_t {  // NOTE(review): field meanings are inferred from the names -- confirm against QPhiX
+  int By;      // presumably the block size in Y
+  int Bz;      // presumably the block size in Z
+  int NCores;  // presumably the number of cores
+  int Sy;      // presumably the number of SMT threads in Y
+  int Sz;      // presumably the number of SMT threads in Z
+  int PadXY;   // presumably the padding applied in the XY plane
+  int PadXYZ;  // presumably the padding applied in the XYZ volume
+  int MinCt;   // presumably the minimum number of blocks in T
+  int soalen;  // presumably the structure-of-arrays length
+} tm_QPhiXParams_t;
+// Precision selector; the enumerators take the values 0, 1, 2 in the order float, half, double.
+typedef enum QphixPrec_t { QPHIX_FLOAT_PREC = 0, QPHIX_HALF_PREC, QPHIX_DOUBLE_PREC } QphixPrec_t;
+
+#endif  // QPHIX_TYPES_H
diff --git a/qphix_veclen.h b/qphix_veclen.h
new file mode 100644
index 000000000..1ca3358a9
--- /dev/null
+++ b/qphix_veclen.h
@@ -0,0 +1,60 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Peter Labus
+ *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifndef QPHIX_VECLEN_H
+#define QPHIX_VECLEN_H
+
+#include <qphix/qphix_config.h>
+
+#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))  // the only branch that also defines VECLEN_HP
+#define VECLEN_SP 16
+#define VECLEN_HP 16
+#define VECLEN_DP 8
+#endif
+// The remaining branches define VECLEN_SP and VECLEN_DP only (SP/HP/DP presumably = single/half/double precision).
+#if defined(QPHIX_AVX2_SOURCE)
+#define VECLEN_SP 8
+#define VECLEN_DP 4
+#endif
+
+#if defined(QPHIX_AVX_SOURCE)
+#define VECLEN_SP 8
+#define VECLEN_DP 4
+#endif
+
+#if defined(QPHIX_SSE_SOURCE)
+#define VECLEN_SP 4
+#define VECLEN_DP 2
+#endif
+
+#if defined(QPHIX_SCALAR_SOURCE)
+#define VECLEN_SP 1
+#define VECLEN_DP 1
+#endif
+// NOTE(review): independent #if blocks without #else/#error -- if no QPHIX_*_SOURCE macro is defined, VECLEN_* stay undefined.
+#if defined(QPHIX_QPX_SOURCE)
+#define VECLEN_SP 4
+#define VECLEN_DP 4
+#endif
+
+#endif  // QPHIX_VECLEN_H
diff --git a/quda_interface.c b/quda_interface.c
new file mode 100644
index 000000000..b4b3ef856
--- /dev/null
+++ b/quda_interface.c
@@ -0,0 +1,1311 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015       Mario Schroeck
+ *               2016, 2017 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ ***********************************************************************/
+/***********************************************************************
+*
+* File quda_interface.c
+*
+* Authors: Mario Schroeck <mario.schroeck@roma3.infn.it>
+*          Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
+* 
+* Last changes: 12/2017
+*
+*
+* Interface to QUDA for multi-GPU inverters
+*
+* The externally accessible functions are
+*
+*   void _initQuda()
+*     Initializes the QUDA library. Carries over the lattice size and the
+*     MPI process grid and thus must be called after initializing MPI.
+*     Currently it is called in init_operators() if optr->use_qudainverter
+*     flag is set.
+*     Memory for the QUDA gaugefield on the host is allocated but not filled
+*     yet (the latter is done in _loadGaugeQuda(), see below).
+*     Performance critical settings are done here and can be changed.
+*
+*   void _endQuda()
+*     Finalizes the QUDA library. Call before MPI_Finalize().
+*
+*   void _loadGaugeQuda()
+*     Copies and reorders the gaugefield on the host and copies it to the GPU.
+*     Must be called between last changes on the gaugefield (smearing etc.)
+*     and first call of the inverter. In particular, 'boundary(const double kappa)'
+*     must be called before if nontrivial boundary conditions are to be used since
+*     those will be applied directly to the gaugefield. Currently it is called just
+*     before the inversion is done (might result in wasted loads...).
+*     It checks whether the currently loaded gauge field corresponds to the gauge field
+*     about to be loaded and returns with a no-op if they agree.
+*
+*   void _loadCloverQuda()
+*     Wrapper for loadCloverQuda() which checks that the currently loaded gauge field
+*     and the clover field about to be constructed agree. If they do, the currently
+*     loaded clover field is reused.
+*
+*   void _setQudaMultigridParam()
+*     borrowed from QUDA multigrid_invert_test, sets up the input parameters
+*     for running the QUDA-MG implementation
+*
+*   The functions
+*
+*     int invert_eo_quda(...);
+*     int invert_doublet_eo_quda(...);
+*     void M_full_quda(...);
+*     void D_psi_quda(...);
+*
+*   mimic their tmLQCD counterparts in functionality as well as input and
+*   output parameters. The invert functions will check the parameters
+*   g_mu, g_c_sw to decide which QUDA operator to create.
+*
+*   To activate those, set "UseQudaInverter = yes" in the operator
+*   declaration of the input file. For details see the documentation.
+*
+*   The function
+*
+*     int invert_quda_direct(...);
+*
+*   provides a direct interface to the QUDA solver and is not accessible through
+*   the input file.
+*
+* Notes:
+*
+* Minimum QUDA version is 0.7.0 (see https://github.com/lattice/quda/issues/151 
+* and https://github.com/lattice/quda/issues/157).
+*
+*
+**************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include "quda_interface.h"
+#include "quda_types.h"
+#include "boundary.h"
+#include "linalg/convert_eo_to_lexic.h"
+#include "linalg/mul_r.h"
+#include "solver/solver.h"
+#include "solver/solver_field.h"
+#include "gettime.h"
+#include "boundary.h"
+#include "quda.h"
+#include "global.h"
+#include "operator.h"
+#include "tm_debug_printf.h"
+
+// nstore is generally like a gauge id, for measurements it identifies the gauge field
+// uniquely; it is only declared here (extern), not defined
+extern int nstore;
+
+double X0, X1, X2, X3;
+
+// define order of the spatial indices
+// default is LX-LY-LZ-T, see below def. of local lattice size, this is related to
+// the gamma basis transformation from tmLQCD -> UKQCD
+// for details see https://github.com/lattice/quda/issues/157
+#define USE_LZ_LY_LX_T 0
+// NOTE: MAX evaluates its arguments more than once, so they must be free of side effects
+#define MAX(a,b) ((a)>(b)?(a):(b))
+// state trackers, reset in _setDefaultQudaParam(); the gauge and clover ones are checked before (re)loading fields
+tm_QudaMGSetupState_t quda_mg_setup_state;
+tm_QudaGaugeState_t quda_gauge_state;
+tm_QudaCloverState_t quda_clover_state;
+
+// gauge and invert parameter structs; init. in _initQuda()
+QudaGaugeParam  gauge_param;
+QudaInvertParam inv_param;
+// params to pass to MG
+QudaMultigridParam quda_mg_param;
+QudaInvertParam inv_mg_param;
+void* quda_mg_preconditioner;
+
+// input params specific to tmLQCD QUDA interface
+tm_QudaParams_t quda_input;
+
+// pointers to the QUDA gaugefield on the host, one buffer per direction (allocated in _initQuda())
+double *gauge_quda[4];
+
+// pointer to a temp. spinor, used for reordering etc.; sized for 2*VOLUME*24 doubles in _initQuda()
+double *tempSpinor;
+  
+// function that maps coordinates in the communication grid to MPI ranks
+int commsMap(const int *coords, void *fdata) {
+  int mpi_rank = 0;  // stays 0 when built without MPI
+  // reorder the incoming coordinates into the order passed to g_cart_grid (fdata is not used)
+#if USE_LZ_LY_LX_T
+  int cart_coords[4] = {coords[3], coords[2], coords[1], coords[0]};
+#else
+  int cart_coords[4] = {coords[3], coords[0], coords[1], coords[2]};
+#endif
+
+#ifdef TM_USE_MPI
+  MPI_Cart_rank( g_cart_grid, cart_coords, &mpi_rank );
+#endif
+  return mpi_rank;
+}
+
+// set to 1 at the end of _initQuda(); _initQuda() and _endQuda() test it before doing any work
+static int quda_initialized = 0;
+// forward declarations of helpers whose definitions are not in this part of the file
+void _setQudaMultigridParam(QudaMultigridParam* mg_param);
+void _setOneFlavourSolverParam(const double kappa, const double c_sw, const double mu, 
+                               const int solver_type, const int even_odd,
+                               const double eps_sq, const int maxiter);
+
+// Fill the global gauge_param and inv_param structs with the defaults used by this interface.
+void _setDefaultQudaParam(void){
+  // start from a clean slate for the gauge / clover / MG set-up state trackers
+  reset_quda_gauge_state(&quda_gauge_state);
+  reset_quda_clover_state(&quda_clover_state);
+  reset_quda_mg_setup_state(&quda_mg_setup_state);
+  quda_mg_preconditioner = NULL;
+
+  // *** QUDA parameters begin here (sloppy prec. will be adjusted in invert)
+  const QudaPrecision host_prec    = QUDA_DOUBLE_PRECISION;
+  const QudaPrecision device_prec  = QUDA_DOUBLE_PRECISION;
+  const QudaPrecision sloppy_prec  = QUDA_SINGLE_PRECISION;
+  const QudaPrecision precond_prec = QUDA_HALF_PRECISION;
+  const QudaTune autotune = QUDA_TUNE_YES;
+
+  // *** the remainder should not be changed for this application
+
+  // ---------------- gauge_param ----------------
+  // local lattice size; only the order of the three spatial extents depends on USE_LZ_LY_LX_T
+#if USE_LZ_LY_LX_T
+  gauge_param.X[0] = LZ;
+  gauge_param.X[1] = LY;
+  gauge_param.X[2] = LX;
+#else
+  gauge_param.X[0] = LX;
+  gauge_param.X[1] = LY;
+  gauge_param.X[2] = LZ;
+#endif
+  gauge_param.X[3] = T;
+
+  gauge_param.type = QUDA_WILSON_LINKS;
+  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
+  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
+  gauge_param.anisotropy = 1.0;
+
+  gauge_param.cpu_prec = host_prec;
+  gauge_param.cuda_prec = device_prec;
+  gauge_param.cuda_prec_sloppy = sloppy_prec;
+  gauge_param.cuda_prec_refinement_sloppy = sloppy_prec;
+  gauge_param.cuda_prec_precondition = precond_prec;
+
+  // the same reconstruct setting (18) is used at every precision level
+  gauge_param.reconstruct = 18;
+  gauge_param.reconstruct_sloppy = 18;
+  gauge_param.reconstruct_refinement_sloppy = 18;
+  gauge_param.reconstruct_precondition = 18;
+
+  // For multi-GPU, ga_pad must be large enough to store a time-slice:
+  // use the largest of the four faces of the local half-volume
+  const int face_x = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
+  const int face_y = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
+  const int face_z = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
+  const int face_t = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
+  gauge_param.ga_pad = MAX( MAX(face_x, face_y), MAX(face_z, face_t) );
+
+  // ---------------- inv_param ----------------
+  inv_param.Ls = 1;
+  inv_param.dagger = QUDA_DAG_NO;
+  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
+  inv_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION;
+
+  inv_param.pipeline = 0;
+  inv_param.gcrNkrylov = 20;
+
+  inv_param.residual_type = (QudaResidualType)(QUDA_L2_RELATIVE_RESIDUAL);
+  inv_param.tol_hq = 0.1;
+  inv_param.reliable_delta = 1e-3; // ignored by multi-shift solver
+  inv_param.use_sloppy_partial_accumulator = 0;
+
+  // domain decomposition preconditioner parameters
+  inv_param.inv_type_precondition = QUDA_CG_INVERTER;
+  inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ;
+  inv_param.precondition_cycle = 1;
+  inv_param.tol_precondition = 1e-1;
+  inv_param.maxiter_precondition = 10;
+  inv_param.omega = 1.0;
+  // verbose preconditioner output only for g_debug_level >= 5, silent otherwise
+  inv_param.verbosity_precondition = ( g_debug_level >= 5 ) ? QUDA_VERBOSE : QUDA_SILENT;
+
+  inv_param.cpu_prec = host_prec;
+  inv_param.cuda_prec = device_prec;
+  inv_param.cuda_prec_sloppy = sloppy_prec;
+  inv_param.cuda_prec_refinement_sloppy = sloppy_prec;
+  inv_param.cuda_prec_precondition = precond_prec;
+
+  inv_param.clover_cpu_prec = host_prec;
+  inv_param.clover_cuda_prec = device_prec;
+  inv_param.clover_cuda_prec_sloppy = sloppy_prec;
+  inv_param.clover_cuda_prec_refinement_sloppy = sloppy_prec;
+  inv_param.clover_cuda_prec_precondition = precond_prec;
+
+  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
+  inv_param.gamma_basis = QUDA_CHIRAL_GAMMA_BASIS; // CHIRAL -> UKQCD does not seem to be supported right now...
+  inv_param.dirac_order = QUDA_DIRAC_ORDER;
+
+  inv_param.input_location = QUDA_CPU_FIELD_LOCATION;
+  inv_param.output_location = QUDA_CPU_FIELD_LOCATION;
+
+  inv_param.tune = autotune ? QUDA_TUNE_YES : QUDA_TUNE_NO;
+
+  inv_param.sp_pad = 0; // 24*24*24/2;
+  inv_param.cl_pad = 0; // 24*24*24/2;
+
+  // solver verbosity by g_debug_level: 0 silent, 1-2 summarize, 3-4 verbose, 5 and above debug;
+  // a negative level leaves inv_param.verbosity untouched
+  if( g_debug_level >= 5 ) {
+    inv_param.verbosity = QUDA_DEBUG_VERBOSE;
+  } else if( g_debug_level >= 3 ) {
+    inv_param.verbosity = QUDA_VERBOSE;
+  } else if( g_debug_level >= 1 ) {
+    inv_param.verbosity = QUDA_SUMMARIZE;
+  } else if( g_debug_level == 0 ) {
+    inv_param.verbosity = QUDA_SILENT;
+  }
+
+  // general verbosity
+  setVerbosityQuda( QUDA_SUMMARIZE, "# QUDA: ", stdout);
+}
+
+// Initialise QUDA once: set the default parameters, register the communication grid, allocate the
+// host gauge buffers gauge_quda[0..3] and tempSpinor, then call initQuda(). Exits with status -2
+// on an unsupported QUDA version or a failed allocation. Repeated calls return immediately.
+void _initQuda() {
+  if( quda_initialized ) return;
+
+  if( g_debug_level > 0 && g_proc_id == 0 )
+    printf("\n# QUDA: Detected QUDA version %d.%d.%d\n\n", QUDA_VERSION_MAJOR, QUDA_VERSION_MINOR, QUDA_VERSION_SUBMINOR);
+  if( QUDA_VERSION_MAJOR == 0 && QUDA_VERSION_MINOR < 7) {
+    fprintf(stderr, "Error: minimum QUDA version required is 0.7.0 (for support of chiral basis and removal of bug in mass normalization with preconditioning).\n");
+    exit(-2);
+  }
+
+  gauge_param = newQudaGaugeParam();
+  inv_param = newQudaInvertParam();
+  inv_mg_param = newQudaInvertParam();
+  quda_mg_param = newQudaMultigridParam();
+
+  _setDefaultQudaParam();
+
+  // declare the grid mapping used for communications in a multi-GPU grid
+#if USE_LZ_LY_LX_T
+  int grid[4] = {g_nproc_z, g_nproc_y, g_nproc_x, g_nproc_t};
+#else
+  int grid[4] = {g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t};
+#endif
+
+  initCommsGridQuda(4, grid, commsMap, NULL);
+
+  // alloc gauge_quda; sizes are computed in size_t so that VOLUME*18 cannot overflow a narrower type
+  size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
+  for (int dir = 0; dir < 4; dir++) {
+    gauge_quda[dir] = (double*) malloc((size_t)VOLUME*18*gSize);
+    if(gauge_quda[dir] == NULL) {
+      fprintf(stderr, "_initQuda: malloc for gauge_quda[dir] failed\n");
+      exit(-2);
+    }
+  }
+
+  // alloc space for a temp. spinor, used throughout this module
+  tempSpinor  = (double*)malloc( (size_t)2*VOLUME*24*sizeof(double) ); /* factor 2 for doublet */
+  if(tempSpinor == NULL) {
+    fprintf(stderr, "_initQuda: malloc for tempSpinor failed\n");
+    exit(-2);
+  }
+
+  // initialize the QUDA library
+#ifdef TM_USE_MPI
+  initQuda(-1); //sets device numbers automatically
+#else
+  // when running in 'subprocess' mode, the external program should have provided us with a unique
+  // id in the range 0 to (N-1), where N is the number of NVIDIA devices available (see wrapper/lib_wrapper.c)
+  if(subprocess_flag){
+    initQuda(g_external_id);
+  }else{
+    initQuda(0);  //scalar build without subprocess: use device 0
+  }
+#endif
+  quda_initialized = 1;
+}
+
+// Finalize the QUDA library and release the host buffers allocated in _initQuda().
+// Does nothing unless QUDA is initialized, so calling it twice is harmless.
+void _endQuda() {
+  if( !quda_initialized ) return;
+  if( quda_mg_preconditioner != NULL ) destroyMultigridQuda(quda_mg_preconditioner);
+  quda_mg_preconditioner = NULL;
+  freeGaugeQuda();
+  freeCloverQuda(); // this is safe even if there is no Clover field loaded, at least it was in QUDA v0.7.2
+  for( int dir = 0; dir < 4; dir++ ) free(gauge_quda[dir]);  // host gauge copies, previously leaked
+  free((void*)tempSpinor);
+  endQuda();
+  quda_initialized = 0;  // the buffers are gone: a later _initQuda() has to allocate them again
+}
+
+void _loadCloverQuda(QudaInvertParam* inv_param){
+  // nothing to reload when the recorded clover state already agrees with the recorded gauge state
+  if( check_quda_clover_state(&quda_clover_state, &quda_gauge_state) ){
+    if(g_proc_id==0 && g_debug_level > 0 ) printf("# QUDA: Clover field and inverse already loaded for gauge %d\n", quda_gauge_state.gauge_id);
+    return;
+  }
+  const double t_start = gettime();
+  freeCloverQuda();
+  reset_quda_clover_state(&quda_clover_state);
+  loadCloverQuda(NULL, NULL, inv_param);
+  set_quda_clover_state(&quda_clover_state, &quda_gauge_state);
+  if(g_proc_id==0 && g_debug_level > 0 ) printf("# QUDA: Time for loadCloverQuda: %.4e\n",gettime()-t_start);
+}
+
+void _loadGaugeQuda( const int compression ) {
+  // Copy g_gauge_field into gauge_quda in the site/direction order used below and pass it to loadGaugeQuda.
+  // If check_quda_gauge_state reports that the field for nstore is already loaded, return immediately.
+  if( check_quda_gauge_state(&quda_gauge_state, nstore) ){
+    return;
+  } else {
+    freeGaugeQuda();
+    reset_quda_gauge_state(&quda_gauge_state);
+  }
+
+  if( inv_param.verbosity > QUDA_SILENT ){
+    if(g_proc_id == 0) {
+      printf("# QUDA: Called _loadGaugeQuda\n");
+      if( compression == 18 ){
+        if( quda_input.fermionbc == TM_QUDA_THETABC ){
+          printf("# QUDA: Theta boundary conditions will be applied to gauge field\n");
+        } else if ( quda_input.fermionbc == TM_QUDA_APBC ){
+          printf("# QUDA: Temporal ABPC will be applied to gauge field\n");
+        }
+      }
+    }
+  }
+
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  _Complex double tmpcplx; // scratch for the phase multiplication; declared inside the parallel block when TM_USE_OMP is set
+
+  size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
+  // NOTE(review): gSize only scales the memcpy sizes; the boundary-condition code below indexes gauge_quda element-wise - confirm this is consistent for single precision
+  // now copy and reorder
+#ifdef TM_USE_OMP
+  #pragma omp for
+#endif
+  for( int x0=0; x0<T; x0++ ) // the omp for above has no collapse clause, so only this outermost loop is work-shared
+    for( int x1=0; x1<LX; x1++ )
+      for( int x2=0; x2<LY; x2++ )
+        for( int x3=0; x3<LZ; x3++ ) {
+#if USE_LZ_LY_LX_T
+          int j = x3 + LZ*x2 + LY*LZ*x1 + LX*LY*LZ*x0;
+          int tm_idx = x1 + LX*x2 + LY*LX*x3 + LZ*LY*LX*x0;
+#else
+          int j = x1 + LX*x2 + LY*LX*x3 + LZ*LY*LX*x0;
+          int tm_idx = x3 + LZ*x2 + LY*LZ*x1 + LX*LY*LZ*x0;
+#endif
+          int oddBit = (x0+x1+x2+x3) & 1; // site parity
+          int quda_idx = 18*(oddBit*VOLUME/2+j/2); // start of this site's 18 numbers; odd sites are stored VOLUME/2 sites after the even ones
+          // target direction 3 always receives source direction 0; the other three depend on USE_LZ_LY_LX_T
+#if USE_LZ_LY_LX_T
+          memcpy( &(gauge_quda[0][quda_idx]), &(g_gauge_field[tm_idx][3]), 18*gSize);
+          memcpy( &(gauge_quda[1][quda_idx]), &(g_gauge_field[tm_idx][2]), 18*gSize);
+          memcpy( &(gauge_quda[2][quda_idx]), &(g_gauge_field[tm_idx][1]), 18*gSize);
+          memcpy( &(gauge_quda[3][quda_idx]), &(g_gauge_field[tm_idx][0]), 18*gSize);
+#else
+          memcpy( &(gauge_quda[0][quda_idx]), &(g_gauge_field[tm_idx][1]), 18*gSize);
+          memcpy( &(gauge_quda[1][quda_idx]), &(g_gauge_field[tm_idx][2]), 18*gSize);
+          memcpy( &(gauge_quda[2][quda_idx]), &(g_gauge_field[tm_idx][3]), 18*gSize);
+          memcpy( &(gauge_quda[3][quda_idx]), &(g_gauge_field[tm_idx][0]), 18*gSize);
+#endif
+        if( compression == 18 ) { // presumably the NO_COMPRESSION case (18 numbers per link) - confirm
+          // apply boundary conditions
+          if ( quda_input.fermionbc == TM_QUDA_THETABC ){
+            for( int i=0; i<9; i++ ) { // 9 (re,im) pairs per link, each multiplied by -phase_mu/g_kappa
+              tmpcplx = gauge_quda[0][quda_idx+2*i] + I*gauge_quda[0][quda_idx+2*i+1];
+              tmpcplx *= -phase_1/g_kappa;
+              gauge_quda[0][quda_idx+2*i]   = creal(tmpcplx);
+              gauge_quda[0][quda_idx+2*i+1] = cimag(tmpcplx);
+
+              tmpcplx = gauge_quda[1][quda_idx+2*i] + I*gauge_quda[1][quda_idx+2*i+1];
+              tmpcplx *= -phase_2/g_kappa;
+              gauge_quda[1][quda_idx+2*i]   = creal(tmpcplx);
+              gauge_quda[1][quda_idx+2*i+1] = cimag(tmpcplx);
+
+              tmpcplx = gauge_quda[2][quda_idx+2*i] + I*gauge_quda[2][quda_idx+2*i+1];
+              tmpcplx *= -phase_3/g_kappa;
+              gauge_quda[2][quda_idx+2*i]   = creal(tmpcplx);
+              gauge_quda[2][quda_idx+2*i+1] = cimag(tmpcplx);
+
+              tmpcplx = gauge_quda[3][quda_idx+2*i] + I*gauge_quda[3][quda_idx+2*i+1];
+              tmpcplx *= -phase_0/g_kappa;
+              gauge_quda[3][quda_idx+2*i]   = creal(tmpcplx);
+              gauge_quda[3][quda_idx+2*i+1] = cimag(tmpcplx);
+            }
+          } else if ( quda_input.fermionbc == TM_QUDA_APBC && x0+g_proc_coords[0]*T == g_nproc_t*T-1 ) {
+            for( int i=0; i<18; i++ ) { // negate the direction-3 link on the last global time slice
+              gauge_quda[3][quda_idx+i]   = -gauge_quda[3][quda_idx+i];
+            }
+          } // quda_input.fermionbc
+        } // compression
+      } // volume loop
+#ifdef TM_USE_OMP
+  } // OpenMP parallel closing brace 
+#endif
+  // hand the reordered field to QUDA and record that the field for nstore is now loaded
+  loadGaugeQuda((void*)gauge_quda, &gauge_param);
+  set_quda_gauge_state(&quda_gauge_state, nstore);
+}
+
+
+// Reorder a spinor (doublet: the pair sp, sp2) in place from tm_idx site order to the parity-split order passed to QUDA; 'precision' is not used.
+void reorder_spinor_toQuda( double* sp, QudaPrecision precision, int doublet, double* sp2 ) {
+  double startTime = gettime();
+  // snapshot the input in tempSpinor (second flavour VOLUME sites later) because sp/sp2 are overwritten below
+  if( doublet ) {
+    memcpy( tempSpinor,           sp,  VOLUME*24*sizeof(double) );
+    memcpy( tempSpinor+VOLUME*24, sp2, VOLUME*24*sizeof(double) );
+  }
+  else {
+    memcpy( tempSpinor, sp, VOLUME*24*sizeof(double) );
+  }
+
+  // now copy and reorder from tempSpinor to spinor
+#ifdef TM_USE_OMP
+  #pragma omp parallel for
+#endif
+  for( int x0=0; x0<T; x0++ )
+    for( int x1=0; x1<LX; x1++ )
+      for( int x2=0; x2<LY; x2++ )
+        for( int x3=0; x3<LZ; x3++ ) {
+#if USE_LZ_LY_LX_T
+          int j = x3 + LZ*x2 + LY*LZ*x1 + LX*LY*LZ*x0;
+          int tm_idx = x1 + LX*x2 + LY*LX*x3 + LZ*LY*LX*x0;
+#else
+          int j = x1 + LX*x2 + LY*LX*x3 + LZ*LY*LX*x0;
+          int tm_idx   = x3 + LZ*x2 + LY*LZ*x1 + LX*LY*LZ*x0;
+#endif
+          int oddBit = (x0+x1+x2+x3) & 1; // site parity: odd sites are stored behind the even ones
+          // NOTE(review): the doublet targets use oddBit*VOLUME and sp2 adds VOLUME/2, so indices reach past VOLUME sites of sp and of sp2 - confirm the intended buffer layout
+          if( doublet ) {
+            memcpy( &(sp[24*(oddBit*VOLUME+j/2)]),          &(tempSpinor[24* tm_idx        ]), 24*sizeof(double));
+            memcpy( &(sp2[24*(oddBit*VOLUME+j/2+VOLUME/2)]), &(tempSpinor[24*(tm_idx+VOLUME)]), 24*sizeof(double));
+          }
+          else {
+            memcpy( &(sp[24*(oddBit*VOLUME/2+j/2)]), &(tempSpinor[24*tm_idx]), 24*sizeof(double));
+          }
+
+        }
+  // rank 0 reports the time spent (printed unconditionally, independent of g_debug_level)
+  double endTime = gettime();
+  double diffTime = endTime - startTime;
+  if(g_proc_id == 0)
+    printf("# QUDA: time spent in reorder_spinor_toQuda: %f secs\n", diffTime);
+}
+
+// Reorder a spinor (doublet: results go to sp and sp2) from the parity-split QUDA order back to tm_idx site order; 'precision' is not used.
+void reorder_spinor_fromQuda( double* sp, QudaPrecision precision, int doublet, double* sp2 ) {
+  double startTime = gettime();
+  // snapshot the QUDA-ordered data; NOTE(review): the doublet copy reads 2*VOLUME sites from sp alone, i.e. it assumes both flavours sit contiguously behind sp - confirm
+  if( doublet ) {
+    memcpy( tempSpinor, sp, 2*VOLUME*24*sizeof(double) );
+  }
+  else {
+    memcpy( tempSpinor, sp, VOLUME*24*sizeof(double) );
+  }
+
+  // now copy and reorder from tempSpinor to spinor
+#ifdef TM_USE_OMP
+  #pragma omp parallel for
+#endif
+  for( int x0=0; x0<T; x0++ )
+    for( int x1=0; x1<LX; x1++ )
+      for( int x2=0; x2<LY; x2++ )
+        for( int x3=0; x3<LZ; x3++ ) {
+#if USE_LZ_LY_LX_T
+          int j = x3 + LZ*x2 + LY*LZ*x1 + LX*LY*LZ*x0;
+          int tm_idx = x1 + LX*x2 + LY*LX*x3 + LZ*LY*LX*x0;
+#else
+          int j = x1 + LX*x2 + LY*LX*x3 + LZ*LY*LX*x0;
+          int tm_idx   = x3 + LZ*x2 + LY*LZ*x1 + LX*LY*LZ*x0;
+#endif
+          int oddBit = (x0+x1+x2+x3) & 1; // site parity
+          // tempSpinor is read at the parity-split index, sp/sp2 are written at tm_idx
+          if( doublet ) {
+            memcpy( &(sp[24* tm_idx]),  &(tempSpinor[24*(oddBit*VOLUME+j/2)         ]), 24*sizeof(double));
+            memcpy( &(sp2[24*(tm_idx)]), &(tempSpinor[24*(oddBit*VOLUME+j/2+VOLUME/2)]), 24*sizeof(double));
+          }
+          else {
+            memcpy( &(sp[24*tm_idx]), &(tempSpinor[24*(oddBit*VOLUME/2+j/2)]), 24*sizeof(double));
+          }
+        }
+  // rank 0 reports the time spent (printed unconditionally, independent of g_debug_level)
+  double endTime = gettime();
+  double diffTime = endTime - startTime;
+  if(g_proc_id == 0)
+    printf("# QUDA: time spent in reorder_spinor_fromQuda: %f secs\n", diffTime);
+}
+
+void set_boundary_conditions( CompressionType* compression ) {
+  // Decide whether gauge compression may be used and set the temporal boundary
+  // condition and the reconstruction types in gauge_param accordingly. *compression
+  // is overwritten with NO_COMPRESSION whenever compression has to be switched off.
+
+  // compression cannot be combined with theta boundary conditions
+  const int theta_bc = ( fabs(X1)>0.0 || fabs(X2)>0.0 || fabs(X3)>0.0 || (fabs(X0)!=0.0 && fabs(X0)!=1.0) );
+  if( theta_bc && *compression!=NO_COMPRESSION ) {
+    if(g_proc_id == 0) {
+      printf("\n# QUDA: WARNING you can't use compression %d with boundary conditions for fermion fields (t,x,y,z)*pi: (%f,%f,%f,%f) \n", *compression,X0,X1,X2,X3);
+      printf("# QUDA: disabling compression.\n\n");
+    }
+    *compression=NO_COMPRESSION;
+  }
+
+  // nor with (A)PBC that were forced through the input file
+  const int forced_bc = ( quda_input.fermionbc == TM_QUDA_APBC || quda_input.fermionbc == TM_QUDA_PBC );
+  if( forced_bc && *compression!=NO_COMPRESSION ){
+    if(g_proc_id == 0){
+      printf("# QUDA: WARNING forced (A)PBC were selected in the input file.\n");
+      printf("# QUDA: Disabling compression to make sure that these are not lost due to gauge compression.\n");
+    }
+    *compression=NO_COMPRESSION;
+  }
+
+  if( *compression==NO_COMPRESSION ) {
+    // theta BC or "hard-coded" (A)PBC: the BC will be applied to the gauge field itself
+    gauge_param.t_boundary = QUDA_PERIODIC_T;
+    gauge_param.reconstruct = 18;
+    gauge_param.reconstruct_sloppy = 18;
+    gauge_param.reconstruct_precondition = 18;
+    return;
+  }
+
+  // trivial BC: the temporal boundary flag carries the (anti-)periodicity, the links may be compressed
+  gauge_param.t_boundary = ( fabs(X0)>0.0 ? QUDA_ANTI_PERIODIC_T : QUDA_PERIODIC_T );
+  gauge_param.reconstruct = 12;
+  gauge_param.reconstruct_sloppy = *compression;
+  gauge_param.reconstruct_precondition = *compression;
+
+  if( g_debug_level > 0 && g_proc_id == 0 )
+    printf("\n# QUDA: WARNING using %d compression with trivial (A)PBC instead of theta-BC ((t,x,y,z)*pi: (%f,%f,%f,%f))! This works fine but the residual check on the host (CPU) will fail.\n",*compression,X0,X1,X2,X3);
+}
+
+void set_sloppy_prec( const SloppyPrecision sloppy_precision ) {
+  // translate the requested sloppy precision (single is the fallback) and store it in gauge_param and inv_param
+  QudaPrecision sloppy;
+  switch( sloppy_precision ) {
+    case SLOPPY_DOUBLE:
+      sloppy = QUDA_DOUBLE_PRECISION;
+      if(g_proc_id == 0) printf("# QUDA: Using double prec. as sloppy!\n");
+      break;
+    case SLOPPY_HALF:
+      sloppy = QUDA_HALF_PRECISION;
+      if(g_proc_id == 0) printf("# QUDA: Using half prec. as sloppy!\n");
+      break;
+    default:
+      sloppy = QUDA_SINGLE_PRECISION;
+      if(g_proc_id == 0) printf("# QUDA: Using single prec. as sloppy!\n");
+      break;
+  }
+  gauge_param.cuda_prec_sloppy = sloppy;
+  gauge_param.cuda_prec_refinement_sloppy = sloppy;
+  inv_param.cuda_prec_sloppy = sloppy;
+  inv_param.clover_cuda_prec_sloppy = sloppy;
+  inv_param.clover_cuda_prec_refinement_sloppy = sloppy;
+}
+
+int invert_quda_direct(double * const propagator, double * const source,
+                const int op_id) {
+  // Invert with QUDA using the parameters of operator_list[op_id]; returns the iteration count, or -1 if it reached optr->maxiter.
+  double atime, atotaltime = gettime();
+  void *spinorIn  = (void*)source; // source
+  void *spinorOut = (void*)propagator; // solution
+  
+  operator * optr = &operator_list[op_id];
+  // g_kappa is necessary for the gauge field to be correctly translated from tmLQCD to QUDA
+  g_kappa = optr->kappa;
+  g_c_sw = optr->c_sw;
+  g_mu = optr->mu;
+
+  boundary(optr->kappa);
+  
+  if ( g_relative_precision_flag )
+    inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
+  else
+    inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;
+  
+  inv_param.kappa = optr->kappa;
+
+  // figure out which BC to use (theta, trivial...)
+  set_boundary_conditions(&optr->compression_type); // may overwrite optr->compression_type with NO_COMPRESSION
+
+  // set the sloppy precision of the mixed prec solver
+  set_sloppy_prec(optr->sloppy_precision);
+ 
+  // load gauge after setting precision, this is a no-op if the current gauge field
+  // is already loaded
+  atime = gettime();
+  _loadGaugeQuda(optr->compression_type);
+  if(g_proc_id==0 && g_debug_level > 0 ) printf("# QUDA: Time for loadGaugeQuda: %.4e\n",gettime()-atime);
+
+  // this will also construct the clover field and its inverse, if required
+  // it will also run the MG setup
+  _setOneFlavourSolverParam(optr->kappa, 
+                            optr->c_sw, 
+                            optr->mu, 
+                            optr->solver,
+                            optr->even_odd_flag,
+                            optr->eps_sq,
+                            optr->maxiter);
+  
+  // reorder the source in place into QUDA ordering (it is not reordered back in this function)
+  reorder_spinor_toQuda( (double*)spinorIn, inv_param.cpu_prec, 0, NULL );
+
+  // perform the inversion
+  invertQuda(spinorOut, spinorIn, &inv_param);
+
+  if( inv_param.verbosity > QUDA_SILENT )
+    if(g_proc_id == 0)
+      printf("# QUDA: Done: %i iter / %g secs = %g Gflops\n",
+             inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs);
+
+  optr->iterations = inv_param.iter;
+
+  // reorder the solution back from QUDA ordering
+  reorder_spinor_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 0, NULL );
+  // propagator in usual normalisation, this is only necessary in invert_quda_direct
+  // since the rescaling is otherwise done in the operator inversion driver
+  mul_r((spinor*)spinorOut, (2*optr->kappa), (spinor*)spinorOut, VOLUME );
+
+  if( g_proc_id==0 && g_debug_level > 0 )
+    printf("# QUDA: Total time for invert_quda_direct: %.4e\n",gettime()-atotaltime); 
+  // -1 signals that the iteration limit was reached
+  if(optr->iterations >= optr->maxiter)
+    return(-1);
+
+  return(optr->iterations);
+}
+
+int invert_eo_quda(spinor * const Even_new, spinor * const Odd_new,
+                   spinor * const Even, spinor * const Odd,
+                   const double precision, const int max_iter,
+                   const int solver_flag, const int rel_prec,
+                   const int even_odd_flag, solver_params_t solver_params,
+                   SloppyPrecision sloppy_precision,
+                   CompressionType compression) {
+  // Invert with QUDA for a source given as (Even, Odd); the solution goes to (Even_new, Odd_new). Returns the iteration count, or -1 if it reached max_iter. solver_params is not used here.
+  spinor ** solver_field = NULL;
+  const int nr_sf = 2;
+  init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+  // solver_field[0] holds the lexicographic source, solver_field[1] receives the solution
+  convert_eo_to_lexic(solver_field[0],  Even, Odd);
+
+// this is not needed at present, but it would be if we wanted to use an initial guess
+//  convert_eo_to_lexic(solver_field[1], Even_new, Odd_new);
+
+  void *spinorIn  = (void*)solver_field[0]; // source
+  void *spinorOut = (void*)solver_field[1]; // solution
+
+  if ( rel_prec )
+    inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
+  else
+    inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;
+
+  inv_param.kappa = g_kappa;
+
+  // figure out which BC to use (theta, trivial...)
+  set_boundary_conditions(&compression); // may overwrite the local 'compression' with NO_COMPRESSION
+  // set the sloppy precision of the mixed prec solver
+  set_sloppy_prec(sloppy_precision);
+  
+  // load gauge after setting precision
+  _loadGaugeQuda(compression);
+
+  // this will also construct the clover field and its inverse, if required
+  // it will also run the MG setup
+  _setOneFlavourSolverParam(g_kappa,
+                            g_c_sw,
+                            g_mu,
+                            solver_flag,
+                            even_odd_flag,
+                            precision,
+                            max_iter);
+
+  // reorder the source in place into QUDA ordering
+  reorder_spinor_toQuda( (double*)spinorIn, inv_param.cpu_prec, 0, NULL );
+
+  // perform the inversion
+  invertQuda(spinorOut, spinorIn, &inv_param);
+
+
+  if( inv_param.verbosity > QUDA_SILENT )
+    if(g_proc_id == 0)
+      printf("# QUDA: Done: %i iter / %g secs = %g Gflops\n",
+             inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs);
+
+  // number of iterations reported by QUDA
+  int iteration = inv_param.iter;
+
+  // reorder spinor
+  // BaKo 20170901: not sure why the source was also re-ordered after inversion
+  // we leave that commented out for now
+  //reorder_spinor_fromQuda( (double*)spinorIn,  inv_param.cpu_prec, 0, NULL );
+  //convert_lexic_to_eo(Even,     Odd,     solver_field[0]);
+  
+  reorder_spinor_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 0, NULL );
+  convert_lexic_to_eo(Even_new, Odd_new, solver_field[1]);
+
+  finalize_solver(solver_field, nr_sf);
+  // -1 signals that the iteration limit was reached
+  if(iteration >= max_iter)
+    return(-1);
+
+  return(iteration);
+}
+
+int invert_doublet_eo_quda(spinor * const Even_new_s, spinor * const Odd_new_s,
+                           spinor * const Even_new_c, spinor * const Odd_new_c,
+                           spinor * const Even_s, spinor * const Odd_s,
+                           spinor * const Even_c, spinor * const Odd_c,
+                           const double precision, const int max_iter,
+                           const int solver_flag, const int rel_prec, const int even_odd_flag,
+                           const SloppyPrecision sloppy_precision,
+                           CompressionType compression) {
+  // Invert with QUDA for a flavour doublet given as even/odd halves (_s and _c); returns the iteration count, or -1 if it reached max_iter.
+  spinor ** solver_field = NULL;
+  const int nr_sf = 4;
+  init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+  // fields 0/1 hold the two lexicographic sources, fields 2/3 receive the two solutions
+  convert_eo_to_lexic(solver_field[0],   Even_s,  Odd_s);
+  convert_eo_to_lexic(solver_field[1],   Even_c,  Odd_c);
+  // this would only be necessary if we wanted to use an initial guess
+  //  convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], Even_new, Odd_new);
+
+  void *spinorIn    = (void*)solver_field[0]; // source
+  void *spinorIn_c  = (void*)solver_field[1]; // charme source
+  void *spinorOut   = (void*)solver_field[2]; // solution
+  void *spinorOut_c = (void*)solver_field[3]; // charme solution
+
+  if ( rel_prec )
+    inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
+  else
+    inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;
+
+  inv_param.kappa = g_kappa;
+
+  // IMPORTANT: use opposite TM mu-flavor since gamma5 -> -gamma5
+  inv_param.mu           = -g_mubar /2./g_kappa;
+  inv_param.epsilon      =  g_epsbar/2./g_kappa;
+  // FIXME: in principle, there is also QUDA_TWIST_DEG_DOUBLET
+  inv_param.twist_flavor =  QUDA_TWIST_NONDEG_DOUBLET; 
+  inv_param.Ls = 2;
+
+  // figure out which BC to use (theta, trivial...)
+  set_boundary_conditions(&compression); // may overwrite the local 'compression' with NO_COMPRESSION
+
+  // set the sloppy precision of the mixed prec solver
+  set_sloppy_prec(sloppy_precision);
+
+  // load gauge after setting precision
+   _loadGaugeQuda(compression);
+
+  // choose dslash type
+  if( g_c_sw > 0.0 ) {
+    inv_param.dslash_type = QUDA_TWISTED_CLOVER_DSLASH;
+    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; // FIXME: note sure if this is the correct PC type
+    inv_param.solution_type = QUDA_MAT_SOLUTION;
+    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
+    inv_param.clover_coeff = g_c_sw*g_kappa;
+    inv_param.compute_clover = 1;
+    inv_param.compute_clover_inverse = 1;
+  }
+  else {
+    inv_param.dslash_type = QUDA_TWISTED_MASS_DSLASH;
+    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN_ASYMMETRIC;
+    inv_param.solution_type = QUDA_MAT_SOLUTION;
+  }
+
+  // choose solver
+  if(solver_flag == BICGSTAB) {
+    if(g_proc_id == 0) {printf("# QUDA: Using BiCGstab!\n"); fflush(stdout);}
+    inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
+  }
+  else {
+    /* Here we invert the hermitean operator squared */
+    inv_param.inv_type = QUDA_CG_INVERTER;
+    if(g_proc_id == 0) { // NOTE(review): the line below prints g_mu although inv_param.mu was set from g_mubar above - confirm intended
+      printf("# QUDA: Using mixed precision CG!\n");
+      printf("# QUDA: mu = %.12f, kappa = %.12f\n", g_mu/2./g_kappa, g_kappa);
+      fflush(stdout);
+    }
+  }
+
+  if( even_odd_flag ) {
+    inv_param.solve_type = QUDA_NORMERR_PC_SOLVE;
+    if(g_proc_id == 0) printf("# QUDA: Using EO preconditioning!\n");
+  }
+  else {
+    inv_param.solve_type = QUDA_NORMERR_SOLVE;
+    if(g_proc_id == 0) printf("# QUDA: Not using EO preconditioning!\n");
+  }
+
+  inv_param.tol = sqrt(precision); // 'precision' is treated as a squared tolerance
+  inv_param.maxiter = max_iter;
+
+  if( g_c_sw > 0.0 ){
+    _loadCloverQuda(&inv_param);
+  }
+
+  // reorder both sources in place into QUDA ordering
+  reorder_spinor_toQuda( (double*)spinorIn,   inv_param.cpu_prec, 1, (double*)spinorIn_c );
+
+  // perform the inversion
+  invertQuda(spinorOut, spinorIn, &inv_param);
+
+  if( inv_param.verbosity > QUDA_SILENT )
+    if(g_proc_id == 0)
+      printf("# QUDA: Done: %i iter / %g secs = %g Gflops\n",
+             inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs);
+
+  // number of iterations reported by QUDA
+  int iteration = inv_param.iter;
+
+  // reorder spinor
+  // BaKo 20170901: not sure why the source was also re-ordered
+  // we leave it commented out for now
+  //reorder_spinor_fromQuda( (double*)spinorIn,    inv_param.cpu_prec, 1, (double*)spinorIn_c );
+  //convert_lexic_to_eo(Even_s,     Odd_s,     solver_field[0]);
+  //convert_lexic_to_eo(Even_c,     Odd_c,     solver_field[1]);
+
+  reorder_spinor_fromQuda( (double*)spinorOut,   inv_param.cpu_prec, 1, (double*)spinorOut_c );
+  convert_lexic_to_eo(Even_new_s, Odd_new_s, solver_field[2]);
+  convert_lexic_to_eo(Even_new_c, Odd_new_c, solver_field[3]);
+
+  finalize_solver(solver_field, nr_sf);
+  // -1 signals that the iteration limit was reached
+  if(iteration >= max_iter)
+    return(-1);
+
+  return(iteration);
+}
+
+// Apply MatQuda to the field given as (Even, Odd) halves; the result is returned as (Even_new, Odd_new).
+void M_full_quda(spinor * const Even_new, spinor * const Odd_new,  spinor * const Even, spinor * const Odd) {
+  void *lexIn  = (void*)g_spinor_field[DUM_DERI];   // lexicographic operand
+  void *lexOut = (void*)g_spinor_field[DUM_DERI+1]; // lexicographic result
+
+  inv_param.kappa = g_kappa;
+  // IMPORTANT: use opposite TM flavor since gamma5 -> -gamma5 (until LXLYLZT prob. resolved)
+  inv_param.mu = -g_mu;
+  inv_param.epsilon = 0.0;
+  inv_param.twist_flavor = QUDA_TWIST_SINGLET;
+  const int twoFlavours = ( inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ||
+                            inv_param.twist_flavor == QUDA_TWIST_DEG_DOUBLET );
+  inv_param.Ls = twoFlavours ? 2 : 1;
+
+  // (Even, Odd) -> lexicographic -> QUDA ordering
+  convert_eo_to_lexic( lexIn, Even, Odd );
+  reorder_spinor_toQuda( (double*)lexIn, inv_param.cpu_prec, 0, NULL );
+
+  // result = M * operand
+  inv_param.solution_type = QUDA_MAT_SOLUTION;
+  MatQuda( lexOut, lexIn, &inv_param);
+
+  // QUDA ordering -> lexicographic -> (Even_new, Odd_new)
+  reorder_spinor_fromQuda( (double*)lexOut, inv_param.cpu_prec, 0, NULL );
+  convert_lexic_to_eo( Even_new, Odd_new, lexOut );
+}
+
+// P = M Q via MatQuda for fields without even-odd splitting; Q is reordered in place for the call and reordered back afterwards.
+void D_psi_quda(spinor * const P, spinor * const Q) {
+  void *opnd   = (void*)Q;
+  void *result = (void*)P;
+
+  inv_param.kappa = g_kappa;
+  // IMPORTANT: use opposite TM flavor since gamma5 -> -gamma5 (until LXLYLZT prob. resolved)
+  inv_param.mu = -g_mu;
+  inv_param.epsilon = 0.0;
+  inv_param.twist_flavor = QUDA_TWIST_SINGLET;
+  const int twoFlavours = ( inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ||
+                            inv_param.twist_flavor == QUDA_TWIST_DEG_DOUBLET );
+  inv_param.Ls = twoFlavours ? 2 : 1;
+
+  // bring the operand into QUDA ordering (in place)
+  reorder_spinor_toQuda( (double*)opnd, inv_param.cpu_prec, 0, NULL );
+
+  // result = M * operand
+  inv_param.solution_type = QUDA_MAT_SOLUTION;
+  MatQuda( result, opnd, &inv_param);
+
+  // reorder the operand back and bring the result out of QUDA ordering
+  reorder_spinor_fromQuda( (double*)opnd,   inv_param.cpu_prec, 0, NULL );
+  reorder_spinor_fromQuda( (double*)result, inv_param.cpu_prec, 0, NULL );
+}
+
+void _setOneFlavourSolverParam(const double kappa, const double c_sw, const double mu, 
+                               const int solver_type, const int even_odd,
+                               const double eps_sq, const int maxiter) {
+  // Fill the global inv_param for a single-flavour solve; loads the clover field when c_sw > 0 and prepares the MG setup when MG preconditioning is selected.
+  inv_param.tol = sqrt(eps_sq);
+  inv_param.maxiter = maxiter;
+  inv_param.Ls = 1;
+
+  // choose dslash type
+  if( fabs(mu) > 0.0 && c_sw > 0.0 ) {
+    inv_param.twist_flavor = QUDA_TWIST_SINGLET;
+    inv_param.dslash_type = QUDA_TWISTED_CLOVER_DSLASH;
+    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
+    inv_param.solution_type = QUDA_MAT_SOLUTION;
+    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
+    // IMPORTANT: use opposite TM flavor since gamma5 -> -gamma5 (until LXLYLZT prob. resolved)
+    inv_param.mu = -mu/2./kappa;
+    inv_param.clover_coeff = c_sw*kappa;
+    inv_param.compute_clover_inverse = 1;
+    inv_param.compute_clover = 1;
+  }
+  else if( fabs(mu) > 0.0 ) {
+    inv_param.twist_flavor = QUDA_TWIST_SINGLET;
+    inv_param.dslash_type = QUDA_TWISTED_MASS_DSLASH;
+    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN_ASYMMETRIC;
+    inv_param.solution_type = QUDA_MAT_SOLUTION;
+    // IMPORTANT: use opposite TM flavor since gamma5 -> -gamma5 (until LXLYLZT prob. resolved)
+    inv_param.mu = -mu/2./kappa;
+  }
+  else if( c_sw > 0.0 ) {
+    inv_param.twist_flavor = QUDA_TWIST_NO;
+    inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
+    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
+    inv_param.solution_type = QUDA_MAT_SOLUTION;
+    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
+    inv_param.clover_coeff = c_sw*kappa;
+    inv_param.compute_clover_inverse = 1;
+    inv_param.compute_clover = 1;
+  }
+  else {
+    inv_param.twist_flavor = QUDA_TWIST_NO;
+    inv_param.dslash_type = QUDA_WILSON_DSLASH;
+    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
+    inv_param.solution_type = QUDA_MAT_SOLUTION;
+  }
+  
+  // choose solver
+  if( solver_type == BICGSTAB ) {
+    if(g_proc_id == 0) {printf("# QUDA: Using BiCGstab!\n"); fflush(stdout);}
+    inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
+  }
+  else if ( solver_type == MG ) {
+    if(g_proc_id == 0) {printf("# QUDA: Using MG!\n"); fflush(stdout);}
+    // coarsening does not support QUDA_MATPC_EVEN_EVEN_ASYMMETRIC
+    if( inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ) inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
+    inv_param.inv_type = QUDA_GCR_INVERTER;
+    inv_param.gcrNkrylov = 20;
+    inv_param.inv_type_precondition = QUDA_MG_INVERTER;
+    inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ;
+    inv_param.reliable_delta = 1e-5;
+    inv_param.precondition_cycle = 1;
+    inv_param.tol_precondition = 1e-1;
+    inv_param.maxiter_precondition = 1;
+    inv_param.omega = quda_input.mg_omega;
+  }
+  else {
+    /* Here we invert the hermitean operator squared */
+    inv_param.inv_type = QUDA_CG_INVERTER;
+    if(g_proc_id == 0) {
+      printf("# QUDA: Using mixed precision CG!\n");
+      fflush(stdout);
+    }
+  }
+
+  // direct or norm-op. solve
+  if( inv_param.inv_type == QUDA_CG_INVERTER ) {
+    if( even_odd ) {
+      inv_param.solve_type = QUDA_NORMERR_PC_SOLVE;
+      if(g_proc_id == 0) printf("# QUDA: Using EO preconditioning!\n");
+    }
+    else {
+      inv_param.solve_type = QUDA_NORMERR_SOLVE;
+      if(g_proc_id == 0) printf("# QUDA: Not using EO preconditioning!\n");
+    }
+  }
+  else {
+    if( even_odd ) {
+      inv_param.solve_type = QUDA_DIRECT_PC_SOLVE;
+      if(g_proc_id == 0) printf("# QUDA: Using EO preconditioning!\n");
+    }
+    else {
+      inv_param.solve_type = QUDA_DIRECT_SOLVE;
+      if(g_proc_id == 0) printf("# QUDA: Not using EO preconditioning!\n");
+    }
+  }
+
+  // load clover field if required, doing so in this odd place because we need
+  // basic stuff to be set in inv_param
+  if( c_sw > 0.0 ) {
+    _loadCloverQuda(&inv_param);
+  }
+
+  if( g_proc_id == 0){
+    printf("# QUDA: mu = %.12f, kappa = %.12f, csw = %.12f\n", mu/2./kappa, kappa, c_sw);
+  }
+  if(g_proc_id == 0 && g_debug_level > 3){
+    printf("------------- OUTER SOLVER InvertParam --------------\n");
+    printQudaInvertParam(&inv_param);
+    printf("----------------------------------------\n");
+  }
+  // NOTE(review): inv_type_precondition is only assigned in the MG branch above; after an earlier MG call it presumably still equals QUDA_MG_INVERTER for BiCGstab/CG - confirm
+    // run the MG setup if required
+  if( inv_param.inv_type_precondition == QUDA_MG_INVERTER ){
+    // we begin by setting the inverter params for the quda_mg_param struct equal to the outer inv_param
+    inv_mg_param = inv_param;
+    // when the preconditioner for the outer solver has already been set below, the line just
+    // above would set a preconditioner for the MG smoothers, which is not allowed
+    // so we set this to NULL explicitly
+    inv_mg_param.preconditioner = NULL;
+    quda_mg_param.invert_param = &inv_mg_param;
+    _setQudaMultigridParam(&quda_mg_param);
+    // three cases follow: build the setup from scratch (RESET), update an existing one (UPDATE), or reuse it unchanged
+    if( check_quda_mg_setup_state(&quda_mg_setup_state, &quda_gauge_state, &quda_input) == TM_QUDA_MG_SETUP_RESET ){
+      double atime = gettime();
+      if( quda_mg_preconditioner != NULL ){
+        if(g_proc_id==0){ printf("# QUDA: Destroying MG Preconditioner Setup\n"); fflush(stdout); }
+        destroyMultigridQuda(quda_mg_preconditioner);
+        reset_quda_mg_setup_state(&quda_mg_setup_state);
+        quda_mg_preconditioner = NULL;
+      }
+      if(g_proc_id==0){ printf("# QUDA: Performing MG Preconditioner Setup\n"); fflush(stdout); }
+      quda_mg_preconditioner = newMultigridQuda(&quda_mg_param);
+      inv_param.preconditioner = quda_mg_preconditioner; // only this branch assigns the outer solver's preconditioner
+      set_quda_mg_setup_state(&quda_mg_setup_state, &quda_gauge_state);
+      if(g_proc_id == 0 && g_debug_level > 0){
+        printf("# QUDA: MG Preconditioner Setup took %.3f seconds\n", gettime()-atime);
+        fflush(stdout);
+      }
+    } else if ( check_quda_mg_setup_state(&quda_mg_setup_state, &quda_gauge_state, &quda_input) == TM_QUDA_MG_SETUP_UPDATE )  {
+      if(g_proc_id==0 && g_debug_level > 0){ 
+        printf("# QUDA: Updating MG Preconditioner Setup for gauge %d\n", quda_gauge_state.gauge_id); fflush(stdout); 
+      }
+      double atime = gettime();
+      updateMultigridQuda(quda_mg_preconditioner, &quda_mg_param);
+      set_quda_mg_setup_state(&quda_mg_setup_state, &quda_gauge_state);
+      if(g_proc_id == 0 && g_debug_level > 0){
+        printf("# QUDA: MG Preconditioner Setup Update took %.3f seconds\n", gettime()-atime);
+        fflush(stdout);
+      }
+     } else {
+      if(g_proc_id==0 && g_debug_level > 0){ 
+        printf("# QUDA: Reusing MG Preconditioner Setup for gauge %d\n", quda_gauge_state.gauge_id); fflush(stdout); 
+      }
+    }
+  }
+  
+  if(g_proc_id == 0 && g_debug_level > 3 && inv_param.inv_type_precondition == QUDA_MG_INVERTER){
+    printf("--------------- MG InvertParam ------------------\n");
+    printQudaInvertParam(quda_mg_param.invert_param);
+    printf("---------------- MG MultigridParam ------------------------\n");
+    printQudaMultigridParam(&quda_mg_param);
+    printf("----------------------------------------\n");
+  }
+}
+
+void _setQudaMultigridParam(QudaMultigridParam* mg_param) {
+  QudaInvertParam *mg_inv_param = mg_param->invert_param;
+
+  // FIXME: we also want to do MG for the ND operator, perhaps
+  mg_inv_param->Ls = 1;
+  mg_inv_param->sp_pad = 0;
+  mg_inv_param->cl_pad = 0;
+
+  // in the MG, the residual type should always be relative,
+  // otherwisethe solver fails to converge
+  // in the outer solver, we are still free to choose
+  // absolute or relative
+  mg_inv_param->residual_type = QUDA_L2_RELATIVE_RESIDUAL;
+
+  mg_inv_param->preserve_source = QUDA_PRESERVE_SOURCE_NO;
+  // the MG internal Gamma basis is always DEGRAND_ROSSI
+  mg_inv_param->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+  mg_inv_param->dirac_order = QUDA_DIRAC_ORDER;
+  // just to be safe, we also set the input and output gamma basis again
+  inv_param.gamma_basis = QUDA_CHIRAL_GAMMA_BASIS; // CHIRAL -> UKQCD does not seem to be supported right now...
+
+  mg_inv_param->input_location = QUDA_CPU_FIELD_LOCATION;
+  mg_inv_param->output_location = QUDA_CPU_FIELD_LOCATION;
+  
+  // currently, only QUDA_DIRECT_SOLVE is supported for this, thus also QUDA_MAT_SOLUTION 
+  mg_inv_param->solve_type = QUDA_DIRECT_SOLVE;
+  mg_inv_param->solution_type = QUDA_MAT_SOLUTION;
+
+  mg_inv_param->dagger = QUDA_DAG_NO;
+
+  mg_param->setup_type = QUDA_NULL_VECTOR_SETUP;
+  mg_param->pre_orthonormalize = QUDA_BOOLEAN_NO;
+  mg_param->post_orthonormalize = QUDA_BOOLEAN_YES;
+
+  mg_param->n_level = quda_input.mg_n_level;
+  for (int level=0; level < mg_param->n_level; level++) {
+    mg_param->precision_null[level] = QUDA_HALF_PRECISION;
+    mg_param->setup_inv_type[level] = quda_input.mg_setup_inv_type;
+    // Kate says: experimental, leave at 1 (will be used for bootstrap-style setup later)
+    mg_param->num_setup_iter[level] = 1;
+    mg_param->setup_tol[level] = quda_input.mg_setup_tol;
+    mg_param->setup_maxiter[level] = quda_input.mg_setup_maxiter;
+    // If doing twisted mass, we can scale the twisted mass on the coarser grids
+    // which significantly increases speed of convergence as a result of making
+    // the coarsest grid solve a lot better conditioned.
+    // Dean Howarth has some RG arguments on why the coarse mass parameter should be
+    // rescaled for the coarse operator to be optimal.
+    if( fabs(mg_inv_param->mu) > 2*DBL_EPSILON ) {
+      mg_param->mu_factor[level] = quda_input.mg_mu_factor[level];
+      if( g_proc_id == 0 && g_debug_level >= 2 ){
+        printf("# QUDA: MG setting coarse mu scaling factor on level %d to %lf\n", level, mg_param->mu_factor[level]);
+      }
+    }
+    
+    for (int dim=0; dim<4; dim++) {
+      int extent;
+      switch(dim){
+        case 0:
+          extent = LX;
+          break;
+        case 1:
+          extent = LY;
+          break;
+        case 2:
+          extent = LZ;
+          break;
+        case 3:
+        default:
+          extent = T;
+          break;
+      }
+      // determine how many lattice sites remain at the current level
+      for(int k = level; k > 0; k--) {
+        extent = extent/mg_param->geo_block_size[k-1][dim];
+      }
+
+      if( level == (quda_input.mg_n_level-1) ){
+        // for the coarsest level, the block size is always set to 1
+        mg_param->geo_block_size[level][dim] = 1;
+      } else if( quda_input.mg_blocksize[level][dim] != 0 ){
+        // the block size for this level and dimension has been set non-zero in the input file
+        // we respect this no matter what
+        mg_param->geo_block_size[level][dim] = quda_input.mg_blocksize[level][dim];
+        // otherwise we employ our blocking algorithm
+      } else {
+        // on all levels, we try to use a block size of 4^4 and compute the
+        // number of fine or aggregate lattice sites on a given level,
+        // resulting in block sizes of:
+        // - 4 if the extent is larger or equal to 16 and
+        // - 2 otherwise
+        // When an extent is divisible by three and smaller than 16 and when we're
+        // not on the finest grid and when the user has explicitly enabled support 
+        // for these block lengths  (and therefore also adjusted QUDA to instantiate them), 
+        // we use a block length of 3.
+        // If aggregation using an even number of lattice points (if size 3 is disabled)
+        // is not possible or if the extent is 1 or some divisible only by some prime number
+        // other than 3 or 2, we use a block size of 1
+        int even_block_size = 4;
+        if( extent < 16 ) even_block_size = 2;
+     
+        // special treatment of size 24 lattice extents on the fine grid
+        if ( extent <= 24 && extent % 3 == 0 && quda_input.mg_enable_size_three_blocks ) {
+          mg_param->geo_block_size[level][dim] = 3;
+        } else if ( extent % even_block_size == 0 ) { 
+          mg_param->geo_block_size[level][dim] = even_block_size;
+        } else {
+          mg_param->geo_block_size[level][dim] = 1;
+        }
+      }
+      
+      // this output is only relevant on levels 0, 1, ..., n-2
+      if( level < (mg_param->n_level-1) && g_proc_id == 0 && g_debug_level >= 2 ) {
+        printf("# QUDA: MG level %d, extent of (xyzt) dim %d: %d\n", level, dim, extent);
+        printf("# QUDA: MG aggregation size set to: %d\n", mg_param->geo_block_size[level][dim]);
+        fflush(stdout);
+      }
+
+      // all lattice extents must be even after blocking on all levels
+      if( (extent / mg_param->geo_block_size[level][dim]) % 2 != 0 ){
+        tm_debug_printf(0, 0,
+                        "MG level %d, dim (xyzt) %d. Block size of %d would result "
+                        "in odd extent on level %d, aborting!\n"
+                        "Adjust your block sizes or parallelisation!\n",
+                        level, dim, mg_param->geo_block_size[level][dim]);
+        fflush(stdout);
+        fatal_error("Blocking error.\n", "_setQudaMultigridParam");
+      }
+
+    } // for( dim=0 to dim=3 ) (space-time dimensions)
+
+    mg_param->coarse_solver[level] = QUDA_GCR_INVERTER;
+    mg_param->coarse_solver_tol[level] = quda_input.mg_coarse_solver_tol;
+    mg_param->coarse_solver_maxiter[level] = quda_input.mg_coarse_solver_maxiter;
+    // spin block size on level zero will be reset to 2 below
+    mg_param->spin_block_size[level] = 1;
+    mg_param->n_vec[level] = quda_input.mg_n_vec[level];
+    mg_param->nu_pre[level] = quda_input.mg_nu_pre;
+    mg_param->nu_post[level] = quda_input.mg_nu_post;
+
+    mg_param->cycle_type[level] = QUDA_MG_CYCLE_RECURSIVE;
+    mg_param->location[level] = QUDA_CUDA_FIELD_LOCATION;
+    
+    mg_param->smoother[level] = QUDA_MR_INVERTER;
+    mg_param->smoother_tol[level] = quda_input.mg_smoother_tol;
+    // unless the Schwarz-alternating smoother is used, this should be 1
+    mg_param->smoother_schwarz_cycle[level] = 1;
+    // Kate says this should be EO always for performance
+    mg_param->smoother_solve_type[level] = QUDA_DIRECT_PC_SOLVE;
+    mg_param->smoother_schwarz_type[level] = QUDA_INVALID_SCHWARZ;
+   
+    // when the Schwarz-alternating smoother is used, this can be set to NO, otherwise it must be YES 
+    mg_param->global_reduction[level] = QUDA_BOOLEAN_YES;
+
+    // set to QUDA_MAT_SOLUTION to inject a full field into coarse grid
+    // set to QUDA_MATPC_SOLUTION to inject single parity field into coarse grid
+    // if we are using an outer even-odd preconditioned solve, then we
+    // use single parity injection into the coarse grid
+    mg_param->coarse_grid_solution_type[level] = inv_param.solve_type == QUDA_DIRECT_PC_SOLVE ? QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION;
+
+    mg_param->omega[level] = quda_input.mg_omega; // over/under relaxation factor
+
+    mg_param->location[level] = QUDA_CUDA_FIELD_LOCATION;
+  } // for(i=0 to n_level-1)
+
+  // only coarsen the spin on the first restriction
+  mg_param->spin_block_size[0] = 2;
+
+  mg_param->compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES;
+  mg_param->generate_all_levels = QUDA_BOOLEAN_YES;
+
+  mg_param->run_verify = quda_input.mg_run_verify;
+
+  // set file i/o parameters
+  strcpy(mg_param->vec_infile, "");
+  strcpy(mg_param->vec_outfile, "");
+
+  mg_inv_param->verbosity = QUDA_SUMMARIZE;
+  mg_inv_param->verbosity_precondition = QUDA_SUMMARIZE;;
+}
+
diff --git a/quda_interface.h b/quda_interface.h
new file mode 100644
index 000000000..e90e7de10
--- /dev/null
+++ b/quda_interface.h
@@ -0,0 +1,124 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+/***********************************************************************
+*
+* File quda_interface.h
+*
+* Author: Mario Schroeck <mario.schroeck@roma3.infn.it>
+* 
+* Last changes: 06/2015
+*
+*
+* Interface to QUDA for multi-GPU inverters
+*
+* The externally accessible functions are
+*
+*   void _initQuda()
+*     Initializes the QUDA library. Carries over the lattice size and the
+*     MPI process grid and thus must be called after initializing MPI.
+*     Currently it is called in init_operators() if optr->use_qudainverter
+*     flag is set.
+*     Memory for the QUDA gaugefield on the host is allocated but not filled
+*     yet (the latter is done in _loadGaugeQuda(), see below).
+*     Performance critical settings are done here and can be changed.
+*
+*   void _endQuda()
+*     Finalizes the QUDA library. Call before MPI_Finalize().
+*
+*   void _loadGaugeQuda()
+*     Copies and reorders the gaugefield on the host and copies it to the GPU.
+*     Must be called between last changes on the gaugefield (smearing etc.)
+*     and first call of the inverter. In particular, 'boundary(const double kappa)'
+*     must be called before if nontrivial boundary conditions are to be used since
+*     those will be applied directly to the gaugefield. Currently it is called just
+*     before the inversion is done (might result in wasted loads...).
+*
+*   The functions
+*
+*     int invert_eo_quda(...);
+*     int invert_doublet_eo_quda(...);
+*     void M_full_quda(...);
+*     void D_psi_quda(...);
+*
+*   mimic their tmLQCD counterparts in functionality as well as input and
+*   output parameters. The invert functions will check the parameters
+*   g_mu, g_c_sw to decide which QUDA operator to create.
+*
+*   To activate those, set "UseQudaInverter = yes" in the operator
+*   declaration of the input file. For details see the documentation.
+*
+*   The function
+*
+*     int invert_quda_direct(...);
+*
+*   provides a direct interface to the QUDA solver and is not accessible through
+*   the input file.
+*
+* Notes:
+*
+* Minimum QUDA version is 0.7.0 (see https://github.com/lattice/quda/issues/151 
+* and https://github.com/lattice/quda/issues/157).
+*
+*
+**************************************************************************/
+
+#ifndef QUDA_INTERFACE_H_
+#define QUDA_INTERFACE_H_
+#include "global.h"
+#include "su3.h"
+#include "solver/solver_params.h"
+
+
+// wrapper functions
+void _initQuda();
+void _endQuda();
+void _loadGaugeQuda();
+void _loadCloverQuda();
+
+// Direct line to the QUDA inverter, no messing about with even/odd reordering.
+// source and propagator  should be full VOLUME spinor fields
+// op_id                  index of the operator to be inverted (0 to N-1)
+int invert_quda_direct(double * const propgator, double * const source,
+                const int op_id);
+
+// to be called instead of tmLQCD functions to use the QUDA inverter
+int invert_eo_quda(spinor * const Even_new, spinor * const Odd_new,
+                   spinor * const Even, spinor * const Odd,
+                   const double precision, const int max_iter,
+                   const int solver_flag, const int rel_prec,
+                   const int even_odd_flag, solver_params_t solver_params,
+                   const SloppyPrecision sloppy_precision,
+                   CompressionType compression);
+
+int invert_doublet_eo_quda(spinor * const Even_new_s, spinor * const Odd_new_s,
+                           spinor * const Even_new_c, spinor * const Odd_new_c,
+                           spinor * const Even_s, spinor * const Odd_s,
+                           spinor * const Even_c, spinor * const Odd_c,
+                           const double precision, const int max_iter,
+                           const int solver_flag, const int rel_prec, const int even_odd_flag,
+                           const SloppyPrecision sloppy_precision,
+                           CompressionType compression);
+
+// apply the TM operator using QUDA
+void M_full_quda(spinor * const Even_new, spinor * const Odd_new,  spinor * const Even, spinor * const Odd);
+void D_psi_quda(spinor * const P, spinor * const Q);
+
+#endif /* QUDA_INTERFACE_H_ */
diff --git a/quda_solver_translate.h b/quda_solver_translate.h
new file mode 100644
index 000000000..5eab16233
--- /dev/null
+++ b/quda_solver_translate.h
@@ -0,0 +1,32 @@
+/***************************************************************************
+ * Copyright (C) 2017                               Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#ifndef QUDA_SOLVER_TRANSLATE_H
+#define QUDA_SOLVER_TRANSLATE_H
+
+#include "solver/solver_types.h"
+
+// Fallback definition used only when compiling without QUDA support, such that the
+// input file reader can be compiled; the values alias the BICGSTAB and CG constants.
+typedef enum QudaInverterType_s {
+ QUDA_BICGSTAB_INVERTER = BICGSTAB,
+ QUDA_CG_INVERTER = CG
+} QudaInverterType;
+
+#endif
diff --git a/quda_types.h b/quda_types.h
new file mode 100644
index 000000000..0496b176c
--- /dev/null
+++ b/quda_types.h
@@ -0,0 +1,188 @@
+/***********************************************************************
+ * Copyright (C) 2017 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *******************************************************************************/
+
+#ifndef TM_QUDA_TYPES_H
+#define TM_QUDA_TYPES_H
+
+#ifdef HAVE_CONFIG_H
+#  include<tmlqcd_config.h>
+#endif
+
+#ifdef TM_USE_QUDA
+#include <quda.h>
+#else
+// some definitions which are found in quda.h must be reproduced in case
+// we are compiling without it, such that tm_QudaParams_t can be
+// defined properly anyway
+#define QUDA_MAX_MG_LEVEL 4
+#define QUDA_BOOLEAN_YES 1
+#define QUDA_BOOLEAN_NO 0
+#include "quda_solver_translate.h"
+#endif
+
+#include "global.h"
+#include <float.h>
+#include <math.h>
+
+typedef enum tm_quda_ferm_bc_t { // type of tm_QudaParams_t.fermionbc (fermionic boundary conditions)
+  TM_QUDA_THETABC = 0, // presumably theta boundary conditions -- TODO confirm against usage
+  TM_QUDA_APBC,        // presumably anti-periodic boundary conditions -- TODO confirm
+  TM_QUDA_PBC          // presumably periodic boundary conditions -- TODO confirm
+} tm_quda_ferm_bc_t;
+
+/* tm_QudaParams_t provides an interface between the tmLQCD input file and the
+ * available QUDA parameters. At the moment, only the fermionic boundary conditions
+ * and the MG parameters are exposed like this, but a further refactoring might
+ * turn this into a complete representation of the possible input parameters */
+typedef struct tm_QudaParams_t {
+  tm_quda_ferm_bc_t fermionbc;
+
+  int               mg_n_level;                         // number of MG levels
+  int               mg_n_vec[QUDA_MAX_MG_LEVEL];        // one entry per level
+  int               mg_blocksize[QUDA_MAX_MG_LEVEL][4]; // [level][dim], 0 = choose automatically in the MG setup
+  double            mg_mu_factor[QUDA_MAX_MG_LEVEL];    // per-level scaling factor for the coarse mu
+  QudaInverterType  mg_setup_inv_type;
+  double            mg_setup_tol;
+  int               mg_setup_maxiter;
+  int               mg_coarse_solver_maxiter;
+  double            mg_coarse_solver_tol;
+  int               mg_nu_pre;
+  int               mg_nu_post;
+  double            mg_smoother_tol;
+  double            mg_omega;                           // over/under relaxation factor
+  int               mg_run_verify;
+  int               mg_enable_size_three_blocks;        // allow an aggregation block length of 3
+  double            mg_reset_setup_threshold;           // gauge_id distance beyond which the MG setup is redone
+} tm_QudaParams_t;
+
+typedef struct tm_QudaMGSetupState_t { // snapshot written by set_quda_mg_setup_state()
+  int gauge_id;    // gauge_id of the gauge state the snapshot was taken for
+  double c_sw;     // g_c_sw at the time of the snapshot
+  double kappa;    // g_kappa at the time of the snapshot
+  double mu;       // g_mu at the time of the snapshot
+  int initialised; // 1 after set_quda_mg_setup_state(), 0 after reset_quda_mg_setup_state()
+} tm_QudaMGSetupState_t;
+
+typedef struct tm_QudaCloverState_t { // snapshot written by set_quda_clover_state()
+  int gauge_id;  // gauge_id of the gauge state the snapshot was taken for
+  double c_sw;   // g_c_sw at the time of the snapshot
+  double kappa;  // g_kappa at the time of the snapshot
+  double mu;     // g_mu at the time of the snapshot
+  int loaded;    // 1 after set_quda_clover_state(), 0 after reset_quda_clover_state()
+} tm_QudaCloverState_t;
+
+typedef struct tm_QudaGaugeState_t { // written by set_quda_gauge_state()
+  int gauge_id;  // id passed to set_quda_gauge_state(), -1 after reset_quda_gauge_state()
+  int loaded;    // 1 after set_quda_gauge_state(), 0 after reset_quda_gauge_state()
+} tm_QudaGaugeState_t;
+
+typedef enum tm_QudaMGSetupState_enum_t { // values returned by check_quda_mg_setup_state()
+  TM_QUDA_MG_SETUP_RESET = -1, // not initialised, or gauge_id moved by more than the reset threshold
+  TM_QUDA_MG_SETUP_UPDATE,     // initialised, but gauge_id, c_sw, kappa or mu differ from the snapshot
+  TM_QUDA_MG_SETUP_REUSE       // gauge_id, c_sw, kappa and mu all match the snapshot
+} tm_QudaMGSetupState_enum_t; 
+
+
+static inline int check_quda_clover_state(const tm_QudaCloverState_t * const quda_clover_state,
+                                          const tm_QudaGaugeState_t * const quda_gauge_state){
+  // returns 1 only if the clover state is loaded for this gauge_id and c_sw, kappa, mu match the globals
+  if( !quda_clover_state->loaded || quda_clover_state->gauge_id != quda_gauge_state->gauge_id ) return 0;
+  return( (fabs(quda_clover_state->c_sw - g_c_sw) < 2*DBL_EPSILON) &&
+          (fabs(quda_clover_state->kappa - g_kappa) < 2*DBL_EPSILON) &&
+          (fabs(quda_clover_state->mu - g_mu) < 2*DBL_EPSILON) );
+}
+
+static inline void set_quda_clover_state(tm_QudaCloverState_t * const quda_clover_state,
+                                         const tm_QudaGaugeState_t * const quda_gauge_state){
+  quda_clover_state->loaded = 1;      // the snapshot below is valid from now on
+  quda_clover_state->mu = g_mu;       // operator parameters currently in use ...
+  quda_clover_state->kappa = g_kappa;
+  quda_clover_state->c_sw = g_c_sw;
+  quda_clover_state->gauge_id = quda_gauge_state->gauge_id; // ... and the gauge state they go with
+}
+
+static inline void reset_quda_clover_state(tm_QudaCloverState_t * const quda_clover_state){
+  quda_clover_state->gauge_id = -1;  // no gauge state associated any more
+  quda_clover_state->loaded = 0;     // check_quda_clover_state() fails until the state is set again
+  quda_clover_state->kappa = -1.0;   // all three recorded operator parameters are reset,
+  quda_clover_state->c_sw = -1.0;    // kappa included
+  quda_clover_state->mu = -1.0;
+}
+
+static inline int check_quda_gauge_state(const tm_QudaGaugeState_t * const quda_gauge_state,
+                                         const int gauge_id){
+  if( !quda_gauge_state->loaded ) return 0; // nothing loaded, so no id can match
+  return( quda_gauge_state->gauge_id == gauge_id );
+}
+
+static inline void set_quda_gauge_state(tm_QudaGaugeState_t * const quda_gauge_state,
+                                        const int gauge_id){
+  quda_gauge_state->loaded = 1;          // mark the state as loaded ...
+  quda_gauge_state->gauge_id = gauge_id; // ... and remember under which id
+}
+
+static inline void reset_quda_gauge_state(tm_QudaGaugeState_t * const quda_gauge_state){
+  quda_gauge_state->loaded = 0;    // check_quda_gauge_state() fails until the state is set again
+  quda_gauge_state->gauge_id = -1;
+}
+
+static inline int check_quda_mg_setup_state(const tm_QudaMGSetupState_t * const quda_mg_setup_state,
+                                            const tm_QudaGaugeState_t * const quda_gauge_state,
+                                            const tm_QudaParams_t * const quda_params){
+  // when the MG setup has not been initialised or when the "gauge_id" has changed by more
+  // than mg_reset_setup_threshold, we need to (re-)do the setup completely
+  if( (quda_mg_setup_state->initialised != 1) ||
+      ( fabs(quda_mg_setup_state->gauge_id - quda_gauge_state->gauge_id) > quda_params->mg_reset_setup_threshold ) ){
+    return TM_QUDA_MG_SETUP_RESET;
+  // in other cases, e.g., when the operator parameters change or if the gauge_id has "moved" only a little,
+  // we don't need to redo the setup, we can simply rebuild the coarse operators with the
+  // new parameters (within reason).
+  // Note that we use 2*DBL_EPSILON to have a little bit more wiggle room in case of badly
+  // implemented floating point or something like that...
+  // TODO: perhaps introduce thresholds also for c_sw, kappa and mu, which might need some
+  // more sophisticated logic tree...
+  } else if( ( fabs(quda_mg_setup_state->gauge_id - quda_gauge_state->gauge_id) < 2*DBL_EPSILON ) &&
+             ( fabs(quda_mg_setup_state->c_sw - g_c_sw) < 2*DBL_EPSILON) &&
+             ( fabs(quda_mg_setup_state->kappa - g_kappa) < 2*DBL_EPSILON) &&
+             ( fabs(quda_mg_setup_state->mu - g_mu) < 2*DBL_EPSILON) ){
+    return TM_QUDA_MG_SETUP_REUSE;
+  } else {
+    return TM_QUDA_MG_SETUP_UPDATE;
+  }
+}
+
+static inline void set_quda_mg_setup_state(tm_QudaMGSetupState_t * const quda_mg_setup_state,
+                                           const tm_QudaGaugeState_t * const quda_gauge_state){
+  quda_mg_setup_state->initialised = 1; // the snapshot below is valid from now on
+  quda_mg_setup_state->mu = g_mu;       // operator parameters currently in use ...
+  quda_mg_setup_state->kappa = g_kappa;
+  quda_mg_setup_state->c_sw = g_c_sw;
+  quda_mg_setup_state->gauge_id = quda_gauge_state->gauge_id; // ... and the gauge state they go with
+}
+
+static inline void reset_quda_mg_setup_state(tm_QudaMGSetupState_t * const quda_mg_setup_state){
+  quda_mg_setup_state->gauge_id = -1;   // no gauge state associated any more
+  quda_mg_setup_state->initialised = 0; // check_quda_mg_setup_state() returns TM_QUDA_MG_SETUP_RESET
+  quda_mg_setup_state->kappa = -1.0;    // all three recorded operator parameters are reset,
+  quda_mg_setup_state->c_sw = -1.0;     // kappa included
+  quda_mg_setup_state->mu = -1.0;
+}
+
+#endif // TM_QUDA_TYPES_H
diff --git a/ranlxd.c b/ranlxd.c
index 99615ca56..b089da0a5 100644
--- a/ranlxd.c
+++ b/ranlxd.c
@@ -42,7 +42,7 @@
 #define RANLXD_C
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <limits.h>
 #include <float.h>
diff --git a/ranlxs.c b/ranlxs.c
index 05f1310c4..86561497d 100644
--- a/ranlxs.c
+++ b/ranlxs.c
@@ -40,7 +40,7 @@
 #define RANLXS_C
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <limits.h>
 #include <float.h>
diff --git a/rational/Makefile.in b/rational/Makefile.in
index 225a9e5b7..22701c06a 100644
--- a/rational/Makefile.in
+++ b/rational/Makefile.in
@@ -58,10 +58,10 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-${librational_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${librational_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${OPTARGS} -c $<
 
-${librational_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${librational_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${SOPTARGS} -c $<
 
 # rule to make librational
diff --git a/rational/elliptic.c b/rational/elliptic.c
index 7d7c581cd..73da16a3d 100644
--- a/rational/elliptic.c
+++ b/rational/elliptic.c
@@ -54,7 +54,7 @@
 *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/rational/rational.c b/rational/rational.c
index 60bdcebb2..26faf3381 100644
--- a/rational/rational.c
+++ b/rational/rational.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/rational/zolotarev.c b/rational/zolotarev.c
index d86fb5065..b73910f49 100644
--- a/rational/zolotarev.c
+++ b/rational/zolotarev.c
@@ -50,7 +50,7 @@
 *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/read_input.h b/read_input.h
index 45d20f404..db51d68ed 100644
--- a/read_input.h
+++ b/read_input.h
@@ -92,6 +92,7 @@ extern "C"
   extern int bc_flag;
   extern int online_measurement_flag;
   extern int online_measurement_freq;
+  extern int restoresu3_flag;
   extern int reweighting_flag;
   extern int reweighting_samples; 
   extern int no_samples;
@@ -103,17 +104,44 @@ extern "C"
   extern int max_innersolver_it;
   extern double innersolver_precision;
   extern int device_num;
+
+  extern int max_mms_shifts;
+  extern int use_mixed_mms; 
   
   extern double innersolver_precision_abs;
   extern double innersolver_precision_rel;
   extern int innersolver_precision_check_abs;
   extern int innersolver_precision_check_rel;
+  extern int min_innersolver_it;  
 
+  extern double mixcg_innereps;
+  extern int mixcg_maxinnersolverit;
+  
   extern int omp_num_threads;
 
   extern int use_preconditioning;
 
+  extern int subprocess_flag;
+  extern int lowmem_flag; 
+
   extern int nblocks_t, nblocks_x, nblocks_y, nblocks_z;
+  extern double kappa_dflgen, mu_dflgen, kappa_dfl, mu_dfl, kappa_Msap, mu_Msap;
+
+  extern int mg_setup_iter;
+  extern int mg_coarse_setup_iter;
+  extern int mg_Nvec;
+  extern int mg_lvl;
+  extern int mg_blk[4];
+  extern int mg_mixed_prec;
+  extern int mg_setup_mu_set;
+  extern int mg_no_shifts;
+  extern double mg_mms_mass;
+  extern double mg_setup_mu;
+  extern double mg_cmu_factor;
+  extern double mg_dtau_update;
+  extern double mg_rho_update;
+  extern int mg_update_setup_iter;
+  extern int mg_omp_num_threads;
 
   extern int have_bsm_op;
 
diff --git a/read_input.l b/read_input.l
index 9b627016c..3f0901b6d 100644
--- a/read_input.l
+++ b/read_input.l
@@ -47,7 +47,8 @@ EQL {SPC}*={SPC}*
 
 %{
 #ifdef HAVE_CONFIG_H
-#  include<config.h>
+#  include<tmlqcd_config.h>
+
 #endif
 #include<stdlib.h>
 #include<stdio.h>
@@ -57,13 +58,18 @@ EQL {SPC}*={SPC}*
 #include"read_input.h"
 #include"default_input_values.h"
 #include"monomial/monomial.h"
-#include"measurements.h"
+#include"solver/solver_types.h"
+#include"meas/measurements.h"
 #include"integrator.h"
 #include"operator.h"
 #include"phmc.h"
 #include<io/params.h>
+#include "qphix_types.h"
+#include "quda_types.h"
+
+// some general parsing helpers
 
-inline void rmQuotes(char *str){
+static inline void rmQuotes(char *str){
   char* strsave=str;
 
   while(*str== ' ' || *str == '\t') str++;
@@ -76,6 +82,34 @@ inline void rmQuotes(char *str){
   *strsave='\0';
 }
 
+static inline void fltlist_tokenize(const char * const str, char * const paramname, const int paramname_length){
+  char error_message[200];
+  // Start tokenizing 'str' (fltlist_next_token continues via strtok(NULL, ...)) and copy the
+  // first token, the parameter name, into 'paramname' (truncated to paramname_length-1 chars).
+  // NOTE: strtok writes into the buffer behind 'str' despite the const qualifier.
+  char * token = strtok(str, "\n\t =,\\");
+  if( token == NULL ){
+    // 'paramname' has not been written at this point, so report the raw input instead
+    snprintf(error_message, 200, "Unable to parse '%s' in fltlist_tokenize!\n", str);
+    yy_fatal_error(error_message);
+  } else {
+    snprintf(paramname, paramname_length, "%s", token);
+  }
+}
+
+static inline double fltlist_next_token(int * const list_end){
+  // Returns the next list entry as a double; *list_end is set to 1 (and 0.0 returned) once no token is left.
+  double retval = 0.0; // stays 0.0 if the token cannot be converted, instead of being indeterminate
+  char * token = strtok(NULL," =,\t");
+  if( token == NULL ){
+    *list_end = 1;
+    return(0.0);
+  }
+  *list_end = 0;
+  sscanf(token,"%lf",&retval);
+  return(retval);
+}
+
   /* Name of the parsing routine */
 #define YY_DECL         int parse_config()
 #define YY_NO_UNPUT
@@ -166,6 +200,13 @@ inline void rmQuotes(char *str){
   int innersolver_precision_check_abs;
   int innersolver_precision_check_rel;
 
+  int max_mms_shifts;
+  int use_mixed_mms;
+  int min_innersolver_it;
+
+  double mixcg_innereps;
+  int mixcg_maxinnersolverit;
+
   int propagator_comparison;
   int nb_cores;
 
@@ -178,11 +219,60 @@ inline void rmQuotes(char *str){
 
   int dfl_field_iter;
   int dfl_poly_iter;
+  double kappa_dflgen;
+  double mu_dflgen;
+  double kappa_dfl;
+  double mu_dfl;
+  double kappa_Msap;
+  double mu_Msap;
 
   int use_preconditioning;
 
   int have_bsm_op;
+  /* macro control over read_input.l is poor. In order to support compilation without
+   * a given external library, we need to have these declared here as a dummy.
+   * Note that no parameters will actually be passed, the input file reader will
+   * terminate the program if one attempts to set the parameters but tmLQCD has
+   * been compiled without the interface in question. */
+#ifdef TM_USE_QUDA
+  extern tm_QudaParams_t quda_input;
+#else
+  tm_QudaParams_t quda_input;
+#endif
 
+#ifdef TM_USE_QPHIX
+  extern tm_QPhiXParams_t qphix_input;
+#else
+  tm_QPhiXParams_t qphix_input;
+#endif
+
+// specific parsing helpers
+  static inline void parse_quda_mg_blocksize(const char * const input, const int dim){
+    char paramname[100];
+    char error_message[200];
+    int list_end = 0;
+    int level = 0;
+    int blocksize = 0;
+    // first token of 'input' is the parameter name, entry k that follows goes to quda_input.mg_blocksize[k][dim]
+    fltlist_tokenize(input, paramname, 100);
+    blocksize = (int)fltlist_next_token(&list_end);
+    while( list_end != 1 ){
+      if( level >= (QUDA_MAX_MG_LEVEL-1) ){
+        snprintf(error_message, 200, "Exceeded maximum number of levels (%d-1) parsing %s!\n", QUDA_MAX_MG_LEVEL, paramname);
+        yy_fatal_error(error_message);
+      }
+      // store the value for this level; it was truncated to int by the cast
+      quda_input.mg_blocksize[level][dim] = blocksize;
+      if(myverbose){ 
+        printf("  %s, level %d set to %d line %d\n", paramname,
+                level, quda_input.mg_blocksize[level][dim], line_of_file);
+      }
+      // move on to the next level and fetch the next list entry, if any
+      level++;
+      blocksize = (int)fltlist_next_token(&list_end);
+    }
+  }
+  
 %}
 
 %option never-interactive
@@ -202,9 +292,11 @@ inline void rmQuotes(char *str){
 %x GAUGEINPUTFILE
 %x SCALARINPUTFILE
 %x GAUGERPREC
+%x SCALARRPREC
+%x SCALARWPREC
 %x GAUGEWPREC
 %x DSBLIOCHECK
-%x DFLSP
+%x DSBLSRCIOCHECK
 %x PRECON
 %x WRITECP
 %x CPINT
@@ -227,6 +319,7 @@ inline void rmQuotes(char *str){
 %X READSOURCE
 %x SOURCEFORMAT
 %x SOURCEFILE
+%x PROPFILE
 %x SOURCETS
 %x SOURCETYPE
 %x PROPSPLIT
@@ -260,6 +353,7 @@ inline void rmQuotes(char *str){
 %x PIONNORMMEAS
 %x PLOOP
 %x ORIENTEDPLAQUETTESMEAS
+%x GRADIENTFLOWMEAS
 
 %x REWEIGH
 %x REWSAMPLES
@@ -270,6 +364,12 @@ inline void rmQuotes(char *str){
 %x GPU
 %x INITGPU
 
+%x DEFLATION
+%x INITDEFLATION
+
+%x MULTIGRID
+%x INITMULTIGRID
+
 %x INITOPERATOR
 %x TMOP
 %x DBTMOP
@@ -288,6 +388,7 @@ inline void rmQuotes(char *str){
 %x DETMONOMIAL
 %x CLDETMONOMIAL
 %x CLDETRATMONOMIAL
+%x CLDETRATRWMONOMIAL
 %x GAUGEMONOMIAL
 %x NDPOLYMONOMIAL
 %x NDRATMONOMIAL
@@ -297,12 +398,16 @@ inline void rmQuotes(char *str){
 %x CLRATCORMONOMIAL
 %x NDCLRATMONOMIAL
 %x NDRATCORMONOMIAL
+%x NDDETRATMONOMIAL
+%x NDCLDETRATMONOMIAL
 %x NDCLRATCORMONOMIAL
 %x POLYMONOMIAL
 %x CLPOLYMONOMIAL
 %x MNAME
 %x MCSTR
 %x MSOLVER
+%x RATMSOLVER
+%x NDMSOLVER
 %x GTYPE
 
 %x COMMENT
@@ -323,12 +428,21 @@ inline void rmQuotes(char *str){
 
 %x PRECONDITIONING
 
+%x QUDAINVERTER
+%x QPHIXINVERTER
+%x COMPRESSION
 
+%x MIXCGEPS
+%x MIXCGIT
 
+%x LOWMEM
+%x SUBPROCESS 
 
+%x INITEXTERNALINVERTER
 
 %%
 ^SourceFilename{EQL}               BEGIN(SOURCEFILE);
+^PropagatorFilename{EQL}           BEGIN(PROPFILE);
 ^T{EQL}                            BEGIN(TT);
 ^L{EQL}                            BEGIN(LL);
 ^LX{EQL}                           BEGIN(LLX);
@@ -376,7 +490,11 @@ inline void rmQuotes(char *str){
 ^GMRESDRNrEv{EQL}                  BEGIN(GMRESDRNEV);
 ^GaugeConfigReadPrecision{EQL}     BEGIN(GAUGERPREC);
 ^GaugeConfigWritePrecision{EQL}    BEGIN(GAUGEWPREC);
+^ScalarConfigReadPrecision{EQL}    BEGIN(SCALARRPREC);
+^ScalarConfigWritePrecision{EQL}   BEGIN(SCALARWPREC);
 ^DisableIOChecks{EQL}              BEGIN(DSBLIOCHECK);
+^DisableGaugeIOChecks{EQL}         BEGIN(DSBLIOCHECK);
+^DisableSourceIOChecks{EQL}        BEGIN(DSBLSRCIOCHECK);
 ^ReproduceRandomNumbers{EQL}       BEGIN(REPRORND);
 ^UseSloppyPrecision{EQL}           BEGIN(SLOPPYPREC);
 ^UseStoutSmearing{EQL}             BEGIN(USESTOUT);
@@ -389,7 +507,6 @@ inline void rmQuotes(char *str){
 ^WritePropagatorFormat{EQL}        BEGIN(WRPROPFLAG);
 ^PropagatorType{EQL}               BEGIN(WRPROPFLAG);
 ^RanluxdLevel{EQL}                 BEGIN(RLXDLEVEL);
-^DeflationSubspaceDimension{EQL}   BEGIN(DFLSP);
 ^GCRPreconditioner{EQL}            BEGIN(PRECON);
 ^ComputeReweightingFactor{EQL}     BEGIN(REWEIGH);
 ^NoReweightingSamples{EQL}         BEGIN(REWSAMPLES);
@@ -398,6 +515,7 @@ inline void rmQuotes(char *str){
 ^NoSamples{EQL}                    BEGIN(NOSAMPLES);
 ^SplittedPropagator{EQL}           BEGIN(PROPSPLIT);
 ^UsePreconditioning{EQL}           BEGIN(PRECONDITIONING);
+^UseCompression{EQL}               BEGIN(COMPRESSION);
 
 ^BeginMeasurement{SPC}+            BEGIN(INITMEASUREMENT);
 ^ComputeModeNumber{EQL}            BEGIN(COMPUTEMN);
@@ -408,6 +526,8 @@ inline void rmQuotes(char *str){
 ^BeginInt                          BEGIN(INITINTEGRATOR);
 ^BeginOperator{SPC}+               BEGIN(INITOPERATOR);
 
+^BeginExternalInverter{SPC}+       BEGIN(INITEXTERNALINVERTER);
+
 ^PropagatorComparison{EQL}         BEGIN(PCOMP);
 ^NbCoresPerNode{EQL}               BEGIN(NBCORES);
 
@@ -422,10 +542,13 @@ inline void rmQuotes(char *str){
 ^DflPolyIter{EQL}                  BEGIN(DFLPOLYITER);
 
 ^BeginGPU                          BEGIN(INITGPU);
+^BeginDeflation		                 BEGIN(INITDEFLATION);
+^BeginDDalpha       	             BEGIN(INITMULTIGRID);
+^MixCGInnerEps{EQL}                BEGIN(MIXCGEPS);
+^MixCGMaxIter{EQL}                 BEGIN(MIXCGIT);
 
-
-
-
+^EnableLowmem{EQL}                 BEGIN(LOWMEM);
+^EnableSubprocess{EQL}             BEGIN(SUBPROCESS);
 
 
 <INITGPU>Init{SPC}* {
@@ -470,12 +593,253 @@ inline void rmQuotes(char *str){
       innersolver_precision_check_rel = a;
       if(myverbose) printf("  innersolver_precision_check_rel set to %d line %d\n", a, line_of_file);
   }
+  {SPC}*MinInnerSolverIterations{EQL}{DIGIT}+ {
+      sscanf(yytext, " %[2a-zA-Z] = %d", name, &a);
+      min_innersolver_it = a;
+      if(myverbose) printf("  min_innersolver_it set to %d line %d\n", a, line_of_file);
+  }
+  {SPC}*MaxMmsShifts{EQL}{DIGIT}+ {
+      sscanf(yytext, " %[2a-zA-Z] = %d", name, &a);
+      max_mms_shifts = a;
+      if(myverbose) printf("  max_mms_shifts set to %d line %d\n", a, line_of_file);
+  }
+  {SPC}*UseMixedMms{EQL}yes {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    use_mixed_mms = 1;
+    if(myverbose) printf("  Using mixed solver for smallest shifts\n");  }
+  {SPC}*UseMixedMms{EQL}no {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    use_mixed_mms = 0;
+    if(myverbose) printf("  Not using mixed solver for smallest shifts\n");
+  }
   EndGPUInit{SPC}* {
   if(myverbose) printf("GPU parsed in line %d\n\n", line_of_file);
   BEGIN(0);
   }
 }
 
+<INITDEFLATION>Init{SPC}* {
+ if(myverbose) printf("Initialising DEFLATION line %d\n", line_of_file); 
+ BEGIN(DEFLATION);
+ }
+<DEFLATION>{
+  {SPC}*DeflationSubspaceDimension{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %d", name, &a);
+    g_N_s = a;
+    if(myverbose) printf("Number of global approximate eigenvectors (deflation subspace dimension) set to %d line %d\n", a, line_of_file);
+  }
+  {SPC}*NiterMsap{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %d", name, &a);
+    NiterMsap = a;
+    if(myverbose) printf("NiterMsap for solver preconditioner set to %d line %d\n", a, line_of_file);
+  }
+  {SPC}*NcycleMsap{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %d", name, &a);
+    NcycleMsap = a;
+    if(myverbose) printf("NcycleMsap for solver preconditioner set to %d line %d\n", a, line_of_file);
+  }
+  {SPC}*KappaMsap{EQL}{FLT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    kappa_Msap = c;
+    if(myverbose) printf("kappa for Msap preconditioner set to %e line %d\n", c, line_of_file);
+  }
+  {SPC}*2KappaMuMsap{EQL}{FLT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    mu_Msap = c;
+    if(myverbose) printf("2KappaMu for Msap preconditioner set to %e line %d\n", c, line_of_file);
+  }
+  {SPC}*NiterMsapSubspace{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %d", name, &a);
+    NiterMsap_dflgen = a;
+    if(myverbose) printf("NiterMsapDfl for subspace generation set to %d line %d\n", a, line_of_file);
+  }
+  {SPC}*NcycleMsapSubspace{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %d", name, &a);
+    NcycleMsap_dflgen = a;
+    if(myverbose) printf("NcycleMsapDfl for subspace generation set to %d line %d\n", a, line_of_file);
+  }
+  {SPC}*NsmoothSubspace{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %d", name, &a);
+    NsmoothMsap_dflgen = a;
+    if(myverbose) printf("NsmoothMsapDfl for subspace generation set to %d line %d\n", a, line_of_file);
+  }
+  {SPC}*LittleGMRESMParameter{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %d", name, &a);
+    little_gmres_m_parameter = a;
+    if(myverbose) printf("LittleGMRESMParameter set to %d line %d\n", a, line_of_file);
+  }
+  {SPC}*LittleSolverLowPrecision{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    little_solver_low_prec = c;
+    if(myverbose) printf("Low precision for little Dirac solver set to %e line %d\n", c, line_of_file);
+  }
+  {SPC}*LittleSolverHighPrecision{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    little_solver_high_prec = c;
+    if(myverbose) printf("High precision for little Dirac solver set to %e line %d\n", c, line_of_file);
+  }
+  {SPC}*LittleSolverMaxIter{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %d", name, &a);
+    little_solver_max_iter = a;
+    if(myverbose) printf("LittleSolverMaxIter set to %d line %d\n", a, line_of_file);
+  }
+  {SPC}*LittleSolver{EQL}(gcr|mcr|mr|fgmres) {
+    little_solver = 0;  /* every accepted solver name stores the same value 0 here */
+    if(myverbose) printf("Littlesolver set to gcr in line %d (currently no other solver available)\n", line_of_file);
+  }
+  {SPC}*useLittleLittleD{EQL}yes {
+    usePL = 1;
+    if(myverbose) printf("UsePL set to true line %d\n", line_of_file);
+  }
+  {SPC}*useLittleLittleD{EQL}no {
+    usePL = 0;  /* "no" branch: the flag is cleared, so the log line must report false */
+    if(myverbose) printf("UsePL set to false line %d\n", line_of_file);
+  }
+  {SPC}*LittleEvenOdd{EQL}yes {
+    little_evenodd = 1;
+    if(myverbose) printf("LittleEvenOdd set to true line %d\n", line_of_file);
+  }
+  {SPC}*LittleEvenOdd{EQL}no {
+    little_evenodd = 0;  /* "no" branch: the flag is cleared, so the log line must report false */
+    if(myverbose) printf("LittleEvenOdd set to false line %d\n", line_of_file);
+  }
+  {SPC}*KappaSubspace{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    kappa_dflgen = c;
+    if(myverbose) printf("kappa for subspace generation set to %e line %d\n", c, line_of_file);
+  }
+  {SPC}*2KappaMuSubspace{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    mu_dflgen = c;
+    if(myverbose) printf("2KappaMu for subspace generation set to %e line %d\n", c, line_of_file);
+  }
+  {SPC}*kappa{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    kappa_dfl = c;
+    if(myverbose) printf("kappa for preconditioner set to %e line %d\n", c, line_of_file);
+  }
+  {SPC}*2KappaMu{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    mu_dfl = c;
+    if(myverbose) printf("2KappaMu for preconditioner set to %e line %d\n", c, line_of_file);
+  }
+  EndDeflationInit{SPC}* {
+  if(myverbose) printf("DEFLATION parsed in line %d\n\n", line_of_file);
+  BEGIN(0);
+  }
+}
+
+<INITMULTIGRID>AMG{SPC}* {
+#ifdef DDalphaAMG
+ if(myverbose) printf("Initialising DDalphaAMG line %d\n", line_of_file); 
+ BEGIN(MULTIGRID);
+#else
+ printf("ERROR line %d: DDalphaAMG library not included\n", line_of_file);
+ exit(1);
+#endif
+ }
+<MULTIGRID>{
+  {SPC}*MGSetupIter{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_setup_iter=a;
+    if(myverbose) printf("  MG_Setup_Iter set to %d line %d operator %d\n", mg_setup_iter, line_of_file, current_operator);
+  }
+  {SPC}*MGCoarseSetupIter{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_coarse_setup_iter=a;
+    if(myverbose) printf("  MG_Coarse_Setup_Iter set to %d line %d operator %d\n", mg_coarse_setup_iter, line_of_file, current_operator);
+  }
+  {SPC}*MGNumberOfVectors{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_Nvec=a;
+    if(myverbose) printf("  MGNumberOfVectors set to %d line %d operator %d\n", mg_Nvec, line_of_file, current_operator);
+  }
+  {SPC}*MGNumberOfLevels{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_lvl=a;
+    if(myverbose) printf("  MGNumberOfLevels set to %d line %d operator %d\n", mg_lvl, line_of_file, current_operator);
+  }
+  {SPC}*MGBlockX{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_blk[3]=a;
+    if(myverbose) printf("  MGBlockX set to %d line %d operator %d\n", mg_blk[3], line_of_file, current_operator);
+  }
+  {SPC}*MGBlockY{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_blk[2]=a;
+    if(myverbose) printf("  MGBlockY set to %d line %d operator %d\n", mg_blk[2], line_of_file, current_operator);
+  }
+  {SPC}*MGBlockZ{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_blk[1]=a;
+    if(myverbose) printf("  MGBlockZ set to %d line %d operator %d\n", mg_blk[1], line_of_file, current_operator);
+  }
+  {SPC}*MGBlockT{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_blk[0]=a;
+    if(myverbose) printf("  MGBlockT set to %d line %d operator %d\n", mg_blk[0], line_of_file, current_operator);
+  }
+  {SPC}*MGSetup2KappaMu{EQL}{FLT}+ {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    mg_setup_mu=c;
+    mg_setup_mu_set=1;
+    if(myverbose) printf("  MGSetup2KappaMu set to %f line %d operator %d\n", mg_setup_mu, line_of_file, current_operator);
+  }
+  {SPC}*MGCoarseMuFactor{EQL}{FLT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    mg_cmu_factor=c;
+    if(myverbose) printf("  MGCoarseMuFactor set to %f line %d operator %d\n", mg_cmu_factor, line_of_file, current_operator);
+  }
+  {SPC}*MGMixedPrecision{EQL}yes {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);  /* only name is filled: the lf conversion cannot match "yes", so c is not assigned */
+    mg_mixed_prec = 1;
+    if(myverbose) printf("  MGMixedPrecision set to YES line %d operator %d\n",  line_of_file, current_operator);
+  }
+  {SPC}*MGMixedPrecision{EQL}no {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);  /* only name is filled: the lf conversion cannot match "no", so c is not assigned */
+    mg_mixed_prec = 0;
+    if(myverbose) printf("  MGMixedPrecision set to NO line %d operator %d\n",  line_of_file, current_operator);
+  }
+  {SPC}*MGdtauUpdate{EQL}{FLT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    mg_dtau_update=c;
+    if(myverbose) printf("  MGdtauUpdate set to %f line %d operator %d\n", mg_dtau_update, line_of_file, current_operator);
+  }
+  {SPC}*MGrhoUpdate{EQL}{FLT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    mg_rho_update=c;
+    if(myverbose) printf("  MG_rho_Update set to %f line %d operator %d\n", mg_rho_update, line_of_file, current_operator);
+  }
+  {SPC}*MGUpdateSetupIter{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_update_setup_iter=a;
+    if(myverbose) printf("  MG_Update_Setup_Iter set to %d line %d operator %d\n", mg_update_setup_iter, line_of_file, current_operator);
+  }
+  {SPC}*MGOMPNumThreads{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_omp_num_threads=a;
+    if(myverbose) printf("  MG_omp_num_threads set to %d line %d operator %d\n", mg_omp_num_threads, line_of_file, current_operator);
+  }
+  {SPC}*MGNumberOfShifts{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_no_shifts=a;
+    // when the number of shifts is specified, mg_mms_mass must be set to zero!
+    mg_mms_mass=0;
+    if(myverbose) printf("  MG_Num_of_shifts set to %d line %d operator %d\n", mg_no_shifts, line_of_file, current_operator);
+  }
+  {SPC}*MGMMSMass{EQL}{FLT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    mg_mms_mass=c;
+    // when mg_mms_mass is specified, mg_no_shifts should be set to zero!
+    mg_no_shifts=0;
+    if(myverbose) printf("  MG_MMS_Mass set to %f line %d operator %d\n", mg_mms_mass, line_of_file, current_operator);
+  }
+  EndDDalphaAMG{SPC}* {
+  if(myverbose) printf("DDalphaAMG parsed in line %d\n\n", line_of_file);
+  BEGIN(0);
+  }
+}
+
 
 <INITOPERATOR>{TYPE} {
   current_operator++;
@@ -515,7 +879,11 @@ inline void rmQuotes(char *str){
   else if(strcmp(yytext, "BSM2f")==0) {
     optr->type = BSM2f;
     have_bsm_op = 1;
-  }  
+  } 
+  else if(strcmp(yytext, "BSM3") == 0){
+    optr->type = BSM3;
+    have_bsm_op = 1;
+  } 
   else {
     fprintf(stderr, "Unknown operator type %s in line %d\n", yytext, line_of_file);
     exit(1);
@@ -562,6 +930,16 @@ inline void rmQuotes(char *str){
     PropInfo.precision = 64;
     if(myverbose) printf("  PropagatorPrecision set to 64 line %d operator %d\n", line_of_file, current_operator);
   }
+  {SPC}*WritePropagator{EQL}yes {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    optr->write_prop_flag = 1;
+    if(myverbose) printf("  WritePropagator set to YES line %d operator %d\n",  line_of_file, current_operator);
+  }
+  {SPC}*WritePropagator{EQL}no {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    optr->write_prop_flag = 0;
+    if(myverbose) printf("  WritePropagator set to NO line %d operator %d\n",  line_of_file, current_operator);
+  }
   {SPC}*SolverPrecision{EQL}{FLT} {
     sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
     optr->eps_sq = c;
@@ -579,6 +957,11 @@ inline void rmQuotes(char *str){
     optr->rel_prec = 0;
     if(myverbose) printf("  SolverRelativePrecision set to NO line %d operator %d\n",  line_of_file, current_operator);
   }
+  {SPC}*mcgdelta{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z1] = %lf", name, &c);
+    (optr->solver_params).mcg_delta = c;
+    if(myverbose) printf("  mcg_delta set to %lf line %d operator %d\n", c, line_of_file, current_operator);
+  }
   {SPC}*EigCGnrhs{EQL}{DIGIT}+ {
     sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
     (optr->solver_params).eigcg_nrhs = a;
@@ -607,19 +990,18 @@ inline void rmQuotes(char *str){
   {SPC}*EigCGtolsq1{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z1] = %lf", name, &c);
     (optr->solver_params).eigcg_tolsq1 = c;
-    if(myverbose) printf("  EigCGtolsq1 set to %lf line %d operator %d\n", a, line_of_file, current_operator);
+    if(myverbose) printf("  EigCGtolsq1 set to %lf line %d operator %d\n", c, line_of_file, current_operator);
   }
   {SPC}*EigCGrestolsq{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     (optr->solver_params).eigcg_restolsq = c;
-    if(myverbose) printf("  EigCGrestolsq set to %lf line %d operator %d\n", a, line_of_file, current_operator);
+    if(myverbose) printf("  EigCGrestolsq set to %lf line %d operator %d\n", c, line_of_file, current_operator);
   }
   {SPC}*EigCGRandGuessOpt{EQL}{DIGIT}+ {
     sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
     (optr->solver_params).eigcg_rand_guess_opt = a;
     if(myverbose) printf("  EigCGrand_guess_opt set to %d line %d operator %d\n", a, line_of_file, current_operator);
-  }
-
+  }  
   {SPC}*ExtraMasses{EQL}{FLTLIST} {
     char * token = NULL;
     optr->no_extra_masses = 0;
@@ -697,21 +1079,192 @@ inline void rmQuotes(char *str){
     name_caller = YY_START; 
     BEGIN(BSMSOLVER);
   }
-  {SPC}*rho{EQL}{FLT} {
+  {SPC}*rhobsm{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     rho_BSM = c;
     if(myverbose != 0) printf("  BSM parameter rho set to %f\n", rho_BSM);
   }
-  {SPC}*eta{EQL}{FLT} {
+  {SPC}*etabsm{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     eta_BSM = c;
     if(myverbose != 0) printf("  BSM parameter eta set to %f\n", eta_BSM);
   }
-  {SPC}*m0{EQL}{FLT} {
+  {SPC}*m0bsm{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z0] = %lf", name, &c);
     m0_BSM = c;
     if(myverbose != 0) printf("  BSM parameter m0 set to %f\n", m0_BSM);
   }
+  {SPC}*mu01bsm{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z01] = %lf", name, &c);
+    mu01_BSM = c;
+    if(myverbose != 0) printf("  BSM parameter mu01 set to %f\n", mu01_BSM);
+  }
+  {SPC}*mu03bsm{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z0123] = %lf", name, &c);
+    mu03_BSM = c;
+    if(myverbose != 0) printf("  BSM parameter mu03 set to %f\n", mu03_BSM);
+  }
+  {SPC}*c5phi{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z0123] = %lf", name, &c);
+    c5phi_BSM = c;
+    if(myverbose != 0) printf("  BSM parameter c5phi set to %f\n", c5phi_BSM);
+  }
+  {SPC}*rbsm{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z0123] = %lf", name, &c);
+    r_BSM = c;
+    if(myverbose != 0) printf("  BSM parameter r_bsm set to %f\n", r_BSM);
+  }
+  {SPC}*cswbsm{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z0123] = %lf", name, &c);
+    csw_BSM = c;
+    if(myverbose != 0) printf("  BSM parameter csw_bsm set to %f\n", csw_BSM);
+  }
+  {SPC}*propagatorsonthefly{EQL}yes {
+    propagatorsonthefly_BSM = 1;  /* yes/no switch: nothing is scanned into a in this rule, so a must not be passed to printf */
+    if(myverbose != 0) printf("  BSM propagators are also computed for using them in the contractions  line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*propagatorsonthefly{EQL}no {
+    propagatorsonthefly_BSM = 0;
+    if(myverbose != 0) printf("  BSM propagators are read from the disk (default)  line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*smearingcorrelator{EQL}yes {
+    smearedcorrelator_BSM = 1;
+    if(myverbose != 0) printf("  In the BSM correlators smeared scalar fields are used (over the whole timeslice) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*smearingcorrelator{EQL}no {
+    smearedcorrelator_BSM = 0;
+    if(myverbose != 0) printf("  In the BSM smearing of the scalars in the correlators is switched off!! (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*densitydensity{EQL}yes {
+    if(myverbose) printf("  BSM density denstiy correlators are computed with nontrivial scalar line %d operator %d\n", line_of_file, current_operator);
+    densitydensity_BSM = 1;
+  }
+  {SPC}*densitydensity{EQL}no {
+    if(myverbose) printf("  BSM density denstiy correlators are not computed with nontrivial scalar line %d operator %d\n", line_of_file, current_operator);
+    densitydensity_BSM = 0;
+  }
+  {SPC}*densitydensitys0s0{EQL}yes {
+    densitydensity_s0s0_BSM = 1;
+    if(myverbose != 0) printf("  BSM density density s0s0,p0p0 are computed with trivial scalar fields line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*densitydensitys0s0{EQL}no {
+    densitydensity_s0s0_BSM = 0;
+    if(myverbose != 0) printf("  BSM density density s0s0,p0p0 are not computed with trivial scalar fields (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*densitydensitysxsx{EQL}yes {
+    densitydensity_sxsx_BSM = 1;
+    if(myverbose != 0) printf("  BSM density density ss and pp correlators with trivial scalar field are computed  line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*densitydensitysxsx{EQL}no {
+    densitydensity_sxsx_BSM = 0;
+    if(myverbose != 0) printf("  BSM density density ss and pp correlators with trivial scalar field are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*diraccurrentdensity{EQL}yes {
+    diraccurrentdensity_BSM = 1;
+    if(myverbose != 0) printf("  BSM naive (dirac) current  density correlators are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*diraccurrentdensity{EQL}no  {
+    diraccurrentdensity_BSM = 0;
+    if(myverbose != 0) printf("  BSM naive (dirac) current  density correlators are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*wilsoncurrentdensityprfirstline{EQL}yes {
+    wilsoncurrentdensitypr1_BSM = 1;
+    if(myverbose != 0) printf("  BSM wilson (pr first line) current times density correlators are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*wilsoncurrentdensityprfirstline{EQL}no {
+    wilsoncurrentdensitypr1_BSM = 0;
+    if(myverbose != 0) printf("  BSM wilson (pr first line) current times density correlators are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*wilsoncurrentdensityprsecondline{EQL}yes {
+    wilsoncurrentdensitypr2_BSM = 1;
+    if(myverbose != 0) printf("  BSM wilson (pr second line) current times density correlators are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*wilsoncurrentdensityprsecondline{EQL}no {
+    wilsoncurrentdensitypr2_BSM = 0;
+    if(myverbose != 0) printf("  BSM wilson (pr second line) current times density correlators are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*wilsoncurrentdensityplfirstline{EQL}yes {
+    wilsoncurrentdensitypl1_BSM = 1;
+    if(myverbose != 0) printf("  BSM wilson (pl first line) current times density correlators are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*wilsoncurrentdensityplfirstline{EQL}no {
+    wilsoncurrentdensitypl1_BSM = 0;
+    if(myverbose != 0) printf("  BSM wilson (pl first line) current times density correlators are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*wilsoncurrentdensityplsecondline{EQL}yes {
+    wilsoncurrentdensitypl2_BSM = 1;
+    if(myverbose != 0) printf("  BSM wilson (pl second line) current times density correlators are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*wilsoncurrentdensityplsecondline{EQL}no {
+    wilsoncurrentdensitypl2_BSM = 0;
+    if(myverbose != 0) printf("  BSM wilson (pl second line) current times density correlators are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*currentV3currentV3{EQL}yes {
+    vectorcurrentcurrent_BSM = 1;
+    if(myverbose != 0) printf("  BSM extended vector current current V3 correlators are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*currentV3currentV3{EQL}no {
+    vectorcurrentcurrent_BSM = 0;
+    if(myverbose != 0) printf("  BSM extended vector current current V3 correlators are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*currentA1currentA1{EQL}yes {
+    axialcurrentcurrent_BSM = 1;  /* enabling branch: the matching "no" rule is the one whose message is labelled as the default */
+    if(myverbose != 0) printf("  BSM extended axial current current A1 correlators are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*currentA1currentA1{EQL}no {
+    axialcurrentcurrent_BSM = 0;
+    if(myverbose != 0) printf("  BSM extended axial current current A1 correlators are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*vectorcurrentdensity{EQL}yes {
+    vectorcurrentdensity_BSM = 1;
+    if(myverbose != 0) printf("  BSM extended vector current times density correlators are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*vectorcurrentdensity{EQL}no {
+    vectorcurrentdensity_BSM = 0;
+    if(myverbose != 0) printf("  BSM extended vector current times density correlators are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*axialcurrentdensity{EQL}yes {
+    axialcurrentdensity_BSM = 1;
+    if(myverbose != 0) printf("  BSM extended axial current times density correlators are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*axialcurrentdensity{EQL}no {
+    axialcurrentdensity_BSM = 0;
+    if(myverbose != 0) printf("  BSM extended axial current times density correlators are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*pdensityvectordensity{EQL}yes {
+    pdensityvectordensity_BSM = 1;  /* switch on; message labels the input line number and operator index */
+    if(myverbose != 0) printf("  BSM psuedo density(0) (without scalar)  * psdensity(x) (with scalar) are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*pdensityvectordensity{EQL}no {
+    pdensityvectordensity_BSM = 0;  /* switch off; message labels the input line number and operator index */
+    if(myverbose != 0) printf("  BSM psuedo density(0) (without scalar)  * psdensity(x) (with scalar) are not computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*vectordensitydensity{EQL}yes {
+    vectordensitydensity_BSM = 1;
+    if(myverbose != 0) printf("  BSM extended vector density times density correlators are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*vectordensitydensity{EQL}no {
+    vectordensitydensity_BSM = 0;
+    if(myverbose != 0) printf("  BSM extended vector density times density correlators are not computed (this is the default) line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*giancarlocontraction{EQL}yes {
+    giancarlo_BSM = 1;
+    if(myverbose != 0) printf("  Contractions proposed by Giancarlo are computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*giancarlocontraction{EQL}no {
+    giancarlo_BSM = 0;
+    if(myverbose != 0) printf("  Contractions proposed by Giancarlo are not computed line %d operator %d\n", line_of_file, current_operator);
+  }
+  {SPC}*timesmearcorrelator{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    timesmearcorrelator_BSM = a;
+    if(myverbose) printf("  timesmearcorrelator is  set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*nscalarstep{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    optr->nscalarstep = a;
+    if(myverbose) printf("  nscalarstep set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
   {SPC}*npergauge{EQL}{DIGIT}+ {
     sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
     optr->npergauge = a;
@@ -733,8 +1286,7 @@ inline void rmQuotes(char *str){
   }
 }
 
-
-<WILSONOP,TMOP>{
+<WILSONOP,TMOP,CLOVEROP>{
   {SPC}*UseEvenOdd{EQL}yes {
     if(myverbose) printf("  Use even/odd preconditioning line %d operator %d\n", line_of_file, current_operator);
     optr->even_odd_flag = 1;
@@ -786,6 +1338,57 @@ inline void rmQuotes(char *str){
   }
 }
 
+<WILSONOP,TMOP,DBTMOP,CLOVEROP,DBCLOVEROP>{  
+  {SPC}*UseExternalInverter{EQL}quda {
+    if(myverbose) printf("  Use Quda inverter line %d operator %d\n", line_of_file, current_operator);
+    optr->external_inverter = QUDA_INVERTER;
+  }
+  {SPC}*UseExternalInverter{EQL}qphix {
+    if(myverbose) printf("  Use QPhiX inverter line %d operator %d\n", line_of_file, current_operator);
+    optr->external_inverter = QPHIX_INVERTER;
+  }
+  {SPC}*UseExternalInverter{EQL}no {
+    if(myverbose) printf("  Do not use external inverter line %d operator %d\n", line_of_file, current_operator);
+    optr->external_inverter = NO_EXT_INV;
+  }
+  {SPC}*UseSloppyPrecision{EQL}yes {
+    if(myverbose) printf("  Use use sloppy precision (single) in the inverter (if supported by the inverter) line %d operator %d\n", line_of_file, current_operator);
+    optr->sloppy_precision = SLOPPY_SINGLE;
+  }
+  {SPC}*UseSloppyPrecision{EQL}float {
+    if(myverbose) printf("  Use use sloppy precision (single) in the inverter (if supported by the inverter) line %d operator %d\n", line_of_file, current_operator);
+    optr->sloppy_precision = SLOPPY_SINGLE;
+  }
+  {SPC}*UseSloppyPrecision{EQL}single {
+    if(myverbose) printf("  Use use sloppy precision (single) in the inverter (if supported by the inverter) line %d operator %d\n", line_of_file, current_operator);
+    optr->sloppy_precision = SLOPPY_SINGLE;
+  }
+  {SPC}*UseSloppyPrecision{EQL}no {
+    if(myverbose) printf("  Do not use sloppy precision (double) in the inverter line %d operator %d\n", line_of_file, current_operator);
+    optr->sloppy_precision = SLOPPY_DOUBLE;  /* "no" selects SLOPPY_DOUBLE, same as the "double" keyword */
+  }
+  {SPC}*UseSloppyPrecision{EQL}double {
+    if(myverbose) printf("  Do not use sloppy precision (double) in the inverter line %d operator %d\n", line_of_file, current_operator);
+    optr->sloppy_precision = SLOPPY_DOUBLE;  /* "double" selects SLOPPY_DOUBLE, same as the "no" keyword */
+  }
+  {SPC}*UseSloppyPrecision{EQL}half {
+    if(myverbose) printf("  Use use sloppy precision (half) in the inverter (if supported by the inverter) line %d operator %d\n", line_of_file, current_operator);
+    optr->sloppy_precision = SLOPPY_HALF;
+  }
+  {SPC}*UseCompression{EQL}12 {
+    if(myverbose) printf("  Use 12 compression in the inverter (if supported) line %d operator %d\n", line_of_file, current_operator);
+    optr->compression_type = COMPRESSION_12;
+  }
+  {SPC}*UseCompression{EQL}8 {
+    if(myverbose) printf("  Use 8 compression in the inverter (if supported) line %d operator %d\n", line_of_file, current_operator);
+    optr->compression_type = COMPRESSION_8;
+  }
+  {SPC}*UseCompression{EQL}18 {
+    if(myverbose) printf("  Not using compression in the inverter line %d operator %d\n", line_of_file, current_operator);
+    optr->compression_type = NO_COMPRESSION;
+  }
+}
+
 <OVERLAPOP>{
   {SPC}*Solver{EQL} { 
    BEGIN(OVSOLVER); 
@@ -827,111 +1430,433 @@ inline void rmQuotes(char *str){
 
 <DBTMSOLVER,TMSOLVER,CSWSOLVER,BSMSOLVER>{
   cg {
-    optr->solver=1;
+    optr->solver=CG;
     if(myverbose) printf("  Solver set to CG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
+  bicgstab {
+    optr->solver=BICGSTAB;
+    if(myverbose) printf("  Solver set to BiCGstab line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+  mixedbicgstab {
+    optr->solver=MIXEDBICGSTAB;
+    if(myverbose) printf("  Solver set to MixedBiCGstab line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+  mixedcg {
+    optr->solver=MIXEDCG;
+    if(myverbose) printf("  Solver set to MixedCG line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+  rgmixedcg {
+    optr->solver=RGMIXEDCG;
+    if(myverbose) printf("  Solver set to RGMixedCG line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+  DDalphaAMG {
+#ifdef DDalphaAMG
+    optr->solver = MG;
+    if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+#else
+    printf("ERROR line %d operator %d: DDalphaAMG library not included\n", line_of_file, current_operator);
+    exit(1);
+#endif
+  }
+  dummyhermtest {
+    optr->solver=DUMMYHERMTEST;
+    if(myverbose) printf("  Solver set to DummyHermTest line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
 }
 
 <TMSOLVER>{
   mixedcg {
-    optr->solver=13;
+    optr->solver=MIXEDCG;
     if(myverbose) printf("  Solver set to MixedCG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
   bicgstab {
-    optr->solver=0;
+    optr->solver=BICGSTAB;
     if(myverbose) printf("  Solver set to BiCGstab line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
+  mixedbicgstab {
+    optr->solver=MIXEDBICGSTAB;
+    if(myverbose) printf("  Solver set to MixedBiCGstab line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+  bicg {
+    optr->solver=BICG;
+    if(myverbose) printf("  Solver set to BiCG line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
   pcg {
-    optr->solver=9;
+    optr->solver=PCG;
     if(myverbose) printf("  Solver set to PCG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
   gmres {
-    optr->solver=2;
+    optr->solver=GMRES;
     if(myverbose) printf("  Solver set to GMRES line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
   gcr {
-    optr->solver=7;
+    optr->solver=GCR;
     if(myverbose) printf("  Solver set to GCR line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
+  mcr {
+    optr->solver=MCR;
+    if(myverbose) printf("  Solver set to mCR line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+  cr {
+    optr->solver=CR;
+    if(myverbose) printf("  Solver set to CR line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
   gmresdr {
-    optr->solver=8;
+    optr->solver=GMRESDR;
     if(myverbose) printf("  Solver set to GMRES-DR line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
   cgs {
-    optr->solver=3;
+    optr->solver=CGS;
     if(myverbose) printf("  Solver set to CGS line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
   mr {
-    optr->solver=4;
+    optr->solver=MR;
     if(myverbose) printf("  Solver set to MR line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
   fgmres {
-    optr->solver=6;
+    optr->solver=FGMRES;
     if(myverbose) printf("  Solver set to FGMRES line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
   dflgcr {
-    optr->solver=10;
+    optr->solver=DFLGCR;
     g_dflgcr_flag = 1;
     if(myverbose) printf("  Solver set to DFL-GCR line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
   dflfgmres {
-    optr->solver=11;
+    optr->solver=DFLFGMRES;
     g_dflgcr_flag = 1;
     if(myverbose) printf("  Solver set to DFL-FGMRES line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
   cgmms {
-    optr->solver = 12;
+    optr->solver = CGMMS;
     if(myverbose) printf("  Solver set to CGMMS line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
   increigcg {
-    optr->solver = 15;
+    optr->solver = INCREIGCG;
     if(myverbose) printf("  Solver set to INCR-EIG-CG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
-
+  DDalphaAMG {
+#ifdef DDalphaAMG
+    optr->solver = MG;
+    if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+#else
+    printf("ERROR line %d operator %d: DDalphaAMG library not included\n", line_of_file, current_operator);
+    exit(1);
+#endif
+  }
 }
 
+<TMSOLVER,CSWSOLVER>{
+  mg {
+    if(myverbose) { printf("  Solver set to MG line %d operator %d\n", line_of_file, current_operator); }
+    (*optr).solver = MG;  /* record the choice on the operator being parsed */
+    BEGIN(name_caller);   /* resume in the start condition saved in name_caller */
+  }
+}
 
 <CSWSOLVER>{
+  bicgstab {
+    optr->solver = BICGSTAB;
+    if(myverbose) printf("  Solver set to BiCGstab line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+  mixedbicgstab {
+    optr->solver=MIXEDBICGSTAB;
+    if(myverbose) printf("  Solver set to MixedBiCGstab line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
   increigcg {
-    optr->solver = 15;
+    optr->solver = INCREIGCG;
     if(myverbose) printf("  Solver set to INCR-EIG-CG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
-
+  dflgcr {
+    optr->solver=DFLGCR;
+    g_dflgcr_flag = 1;
+    if(myverbose) printf("  Solver set to DFL-GCR line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+  dflfgmres {
+    optr->solver=DFLFGMRES;
+    g_dflgcr_flag = 1;
+    if(myverbose) printf("  Solver set to DFL-FGMRES line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+  mixedcg {
+    optr->solver=MIXEDCG;
+    if(myverbose) printf("  Solver set to MixedCG line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+  DDalphaAMG {
+#ifdef DDalphaAMG
+    optr->solver = MG;
+    if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+#else
+    printf("ERROR line %d operator %d: DDalphaAMG library not included\n", line_of_file, current_operator);
+    exit(1);
+#endif
+  }
 }
 
-
-
-
-
-
 <OVSOLVER>{
   sumr {
-    optr->solver = 13;
-    if(myverbose) printf("  Solver set to SUMR line %d operator %d\n", line_of_file, current_operator);
+    optr->solver = SUMR;
+    if(myverbose) printf("  Solver set to SUMR in line %d operator %d\n", line_of_file, current_operator);
     BEGIN(OVERLAPOP);
   }
   cg {
-    optr->solver = 1;
-    if(myverbose) printf("  Solver set to SUMR line %d operator %d\n", line_of_file, current_operator);
+    optr->solver = CG;
+    if(myverbose) printf("  Solver set to CG in line %d operator %d\n", line_of_file, current_operator);
     BEGIN(OVERLAPOP);
   }
 }
 
+<INITEXTERNALINVERTER>{TYPE} {
+  if(strcmp(yytext, "QPHIX")==0) {  /* yytext is the inverter keyword matched by {TYPE} */
+#ifdef TM_USE_QPHIX
+    if(myverbose) printf("Setting QPHIX external inverter parameters line %d\n", line_of_file);
+    BEGIN(QPHIXINVERTER);  /* subsequent input is matched by the QPHIXINVERTER rules */
+#else
+    printf("tmLQCD built without QPHIX support. Exiting!\n");
+    exit(1);
+#endif
+  } else if ( strcmp(yytext, "QUDA")==0) {
+#ifdef TM_USE_QUDA
+    if(myverbose) printf("Setting QUDA external inverter parameters line %d\n", line_of_file);
+    BEGIN(QUDAINVERTER);  /* subsequent input is matched by the QUDAINVERTER rules */
+#else
+    printf("tmLQCD built without QUDA support. Exiting!\n");
+    exit(1);
+#endif
+  } else {  /* any other keyword is rejected and parsing stops */
+    printf("Error on line %d, inverter '%s' is unknown. Exiting!\n", line_of_file, yytext);
+    exit(1);
+  }
+}
+<QUDAINVERTER>{
+  {SPC}*FermionBC{EQL}theta {
+    quda_input.fermionbc = TM_QUDA_THETABC;
+    if(myverbose) printf("  Force theta fermionic temporal boundary conditions for QUDA inversions, line %d\n", line_of_file);
+  }
+  {SPC}*FermionBC{EQL}apbc {
+    quda_input.fermionbc = TM_QUDA_APBC;
+    if(myverbose) printf("  Force anti-periodic temporal fermionic boundary conditions for QUDA inversions, line %d\n", line_of_file);
+  }
+  {SPC}*FermionBC{EQL}pbc {
+    quda_input.fermionbc = TM_QUDA_PBC;
+    if(myverbose) printf("  Force periodic temporal fermionic boundary conditions for QUDA inversions, line %d\n", line_of_file);
+  }
+  {SPC}*MGNumberOfLevels{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    quda_input.mg_n_level=a;
+    if(myverbose) printf("  MGNumberOfLevels set to %d line %d\n", quda_input.mg_n_level, line_of_file);
+  }
+  {SPC}*MGNumberOfVectors{EQL}{FLTLIST} {
+    char paramname[100];
+    char error_message[200];
+    int list_end = 0;
+    int level = 0;
+    /* Copy the list entries into quda_input.mg_n_vec[0], [1], ...; at most QUDA_MAX_MG_LEVEL-1 entries are accepted. */
+    fltlist_tokenize(yytext, paramname, 100);  /* presumably splits yytext into the parameter name and its value list - confirm in the helper */
+    int n_vec = (int)fltlist_next_token(&list_end);
+    while( list_end != 1 ){  /* the loop ends once fltlist_next_token has set list_end to 1 */
+      if( level >= (QUDA_MAX_MG_LEVEL-1) ){
+        snprintf(error_message, 200, "Exceeded maximum number of levels (%d-1) parsing %s!\n", QUDA_MAX_MG_LEVEL, paramname);
+        yy_fatal_error(error_message);
+      }
+      /* Only reached for level < QUDA_MAX_MG_LEVEL-1, assuming yy_fatal_error does not return (flex's default exits) - confirm. */
+      quda_input.mg_n_vec[level] = n_vec;
+      if(myverbose){ 
+        printf("  %s, level %d set to %d line %d\n", paramname,
+                level, quda_input.mg_n_vec[level], line_of_file);
+      }
+      level++;
+      n_vec = fltlist_next_token(&list_end);
+    }
+  }
+  {SPC}*MGBlockSizesX{EQL}{FLTLIST} {
+    // note the dimensions here are in QUDA order (XYZT!)
+    parse_quda_mg_blocksize(yytext, 0);
+  }
+  {SPC}*MGBlockSizesY{EQL}{FLTLIST} {
+    parse_quda_mg_blocksize(yytext, 1);
+  }
+  {SPC}*MGBlockSizesZ{EQL}{FLTLIST} {
+    parse_quda_mg_blocksize(yytext, 2);
+  }
+  {SPC}*MGBlockSizesT{EQL}{FLTLIST} {
+    parse_quda_mg_blocksize(yytext, 3);
+  }
+  {SPC}*MGCoarseMuFactor{EQL}{FLTLIST} {
+    char paramname[100];
+    char error_message[200];
+    int list_end = 0;
+    int level = 0;
+    /* Copy the list entries into quda_input.mg_mu_factor[0], [1], ...; at most QUDA_MAX_MG_LEVEL entries are accepted. */
+    fltlist_tokenize(yytext, paramname, 100);  /* presumably splits yytext into the parameter name and its value list - confirm in the helper */
+    double mu_factor = fltlist_next_token(&list_end);
+    while( list_end != 1 ){  /* the loop ends once fltlist_next_token has set list_end to 1 */
+      if( level >= QUDA_MAX_MG_LEVEL ){  /* '>=' so that index QUDA_MAX_MG_LEVEL itself is never written */
+        snprintf(error_message, 200, "Exceeded maximum number of levels (%d) parsing %s!\n", QUDA_MAX_MG_LEVEL, paramname);
+        yy_fatal_error(error_message);
+      }
+      /* NOTE(review): assumes mg_mu_factor holds QUDA_MAX_MG_LEVEL entries - confirm against the quda_input declaration. */
+      quda_input.mg_mu_factor[level] = mu_factor;
+      if(myverbose){ 
+        printf("  %s, level %d set to %lf line %d\n", paramname,
+                level, quda_input.mg_mu_factor[level], line_of_file);
+      }
+      level++;
+      mu_factor = fltlist_next_token(&list_end);
+    }
+  }
+  {SPC}*MGSetupSolver{EQL}cg {
+    quda_input.mg_setup_inv_type = QUDA_CG_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CG line %d\n", line_of_file);
+  }
+  {SPC}*MGSetupSolver{EQL}bicgstab {
+    quda_input.mg_setup_inv_type = QUDA_BICGSTAB_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to BiCGstab line %d\n", line_of_file);
+  }
+  {SPC}*MGSetupSolverTolerance{EQL}{FLT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    quda_input.mg_setup_tol=c;
+    if(myverbose) printf("  MGSetupSolverTolerance set to %f line %d\n", quda_input.mg_setup_tol, line_of_file);
+  }
+  {SPC}*MGSetupMaxSolverIterations{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    quda_input.mg_setup_maxiter=a;
+    if(myverbose) printf("  MGSetupMaxSolverIterations set to %d line %d\n", quda_input.mg_setup_maxiter, line_of_file);
+  }
+  {SPC}*MGCoarseSolverTolerance{EQL}{FLT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    quda_input.mg_coarse_solver_tol=c;
+    if(myverbose) printf("  MGCoarseSolverTolerance set to %f line %d\n", quda_input.mg_coarse_solver_tol, line_of_file);
+  }
+  {SPC}*MGCoarseMaxSolverIterations{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    quda_input.mg_coarse_solver_maxiter=a;
+    if(myverbose) printf("  MGCoarseMaxSolverIterations set to %d line %d\n", quda_input.mg_coarse_solver_maxiter, line_of_file);
+  }
+  {SPC}*MGSmootherTolerance{EQL}{FLT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    quda_input.mg_smoother_tol=c;
+    if(myverbose) printf("  MGSmootherTolerance set to %f line %d\n", quda_input.mg_smoother_tol, line_of_file);
+  }
+  {SPC}*MGSmootherPreIterations{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    quda_input.mg_nu_pre=a;
+    if(myverbose) printf("  MGSmootherPreIterations set to %d line %d\n", quda_input.mg_nu_pre, line_of_file);
+  }
+  {SPC}*MGSmootherPostIterations{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    quda_input.mg_nu_post=a;
+    if(myverbose) printf("  MGSmootherPostIterations set to %d line %d\n", quda_input.mg_nu_post, line_of_file);
+  }
+  {SPC}*MGOverUnderRelaxationFactor{EQL}{FLT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    quda_input.mg_omega=c;
+    if(myverbose) printf("  MGOverUnderRelaxationFactor set to %f line %d\n", quda_input.mg_omega, line_of_file);
+  }
+  {SPC}*MGRunVerify{EQL}yes {
+    quda_input.mg_run_verify = QUDA_BOOLEAN_YES;
+    if(myverbose) printf("  MGRunVerify set to YES in line %d\n", line_of_file);
+  }
+  {SPC}*MGRunVerify{EQL}no {
+    quda_input.mg_run_verify = QUDA_BOOLEAN_NO;
+    if(myverbose) printf("  MGRunVerify set to NO in line %d\n", line_of_file);
+  }
+  {SPC}*MGEnableSizeThreeBlocks{EQL}yes {
+    quda_input.mg_enable_size_three_blocks = 1;
+    if(myverbose) printf("  MGEnableSizeThreeBlocks set to YES in line %d\n", line_of_file);
+  }
+  {SPC}*MGEnableSizeThreeBlocks{EQL}no {
+    quda_input.mg_enable_size_three_blocks = 0;
+    if(myverbose) printf("  MGEnableSizeThreeBlocks set to NO in line %d\n", line_of_file);
+  }
+  {SPC}*MGResetSetupThreshold{EQL}{FLT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    quda_input.mg_reset_setup_threshold=c;
+    if(myverbose) printf("  MGResetSetupThreshold set to %f line %d\n", quda_input.mg_reset_setup_threshold, line_of_file);
+  }
+  ^EndExternalInverter{SPC}*  { 
+    if(myverbose) printf("QUDA external inverter parameters parsed on line %d\n\n", line_of_file);
+    BEGIN(0);
+  }
+}
+<QPHIXINVERTER>{
+  {SPC}*NCores{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    qphix_input.NCores = a;
+    if(myverbose) printf("  Ncores = %d, line %d\n", qphix_input.NCores, line_of_file );
+  }
+  {SPC}*By{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    qphix_input.By = a;
+    if(myverbose) printf("  By = %d, line %d\n", qphix_input.By, line_of_file );
+  }
+  {SPC}*Bz{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    qphix_input.Bz = a;
+    if(myverbose) printf("  Bz = %d, line %d\n", qphix_input.Bz, line_of_file );
+  }
+  {SPC}*Sy{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    qphix_input.Sy = a;
+    if(myverbose) printf("  Sy = %d, line %d\n", qphix_input.Sy, line_of_file );
+  }
+  {SPC}*Sz{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    qphix_input.Sz = a;
+    if(myverbose) printf("  Sz = %d, line %d\n", qphix_input.Sz, line_of_file );
+  }
+  {SPC}*PadXY{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    qphix_input.PadXY = a;
+    if(myverbose) printf("  PadXY = %d, line %d\n", qphix_input.PadXY, line_of_file );
+  }
+  {SPC}*PadXYZ{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    qphix_input.PadXYZ = a;
+    if(myverbose) printf("  PadXYZ = %d, line %d\n", qphix_input.PadXYZ, line_of_file );
+  }
+  {SPC}*MinCt{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    qphix_input.MinCt = a;
+    if(myverbose) printf("  MinCt = %d, line %d\n", qphix_input.MinCt, line_of_file );
+  }
+  ^EndExternalInverter{SPC}*  {
+    if(myverbose) printf("QPHIX external inverter parameters parsed on line %d\n\n", line_of_file);
+    BEGIN(0);
+  }
+}
+
 <INITMONOMIAL>{TYPE} {
   current_monomial++;
   mnl = &monomial_list[current_monomial];
@@ -948,6 +1873,10 @@ inline void rmQuotes(char *str){
     mnl->type = CLOVERDETRATIO;
     strcpy((*mnl).name, "CLOVERDETRATIO");
   }
+  else if(strcmp(yytext, "CLOVERDETRATIORW")==0) {
+    mnl->type = CLOVERDETRATIORW;
+    strcpy((*mnl).name, "CLOVERDETRATIORW");
+  }
   else if(strcmp(yytext, "DETRATIO")==0) {
     mnl->type = DETRATIO;
     strcpy((*mnl).name, "DETRATIO");
@@ -957,6 +1886,11 @@ inline void rmQuotes(char *str){
     strcpy((*mnl).name, "NDDETRATIO");
     g_running_phmc = 1;
   }
+  else if(strcmp(yytext, "NDCLOVERDETRATIO")==0) {
+    mnl->type = NDCLOVERDETRATIO;
+    strcpy((*mnl).name, "NDCLOVERDETRATIO");  /* the stored name repeats the type keyword, as in the sibling branches */
+    g_running_phmc = 1;
+  }
   else if(strcmp(yytext, "NDPOLY")==0) {
     mnl->type = NDPOLY;
     strcpy((*mnl).name, "NDPOLY");
@@ -1039,10 +1973,13 @@ inline void rmQuotes(char *str){
   else if(mnl->type == NDRAT) BEGIN(NDRATMONOMIAL);
   else if(mnl->type == RAT) BEGIN(RATMONOMIAL);
   else if(mnl->type == NDCLOVERRAT) BEGIN(NDCLRATMONOMIAL);
+  else if(mnl->type == NDDETRATIO) BEGIN(NDDETRATMONOMIAL);
   else if(mnl->type == CLOVERRAT) BEGIN(CLRATMONOMIAL);
   else if(mnl->type == NDRATCOR) BEGIN(NDRATCORMONOMIAL);
   else if(mnl->type == RATCOR) BEGIN(RATCORMONOMIAL);
   else if(mnl->type == NDCLOVERRATCOR) BEGIN(NDCLRATCORMONOMIAL);
+  else if(mnl->type == NDDETRATIO) BEGIN(NDDETRATMONOMIAL);
+  else if(mnl->type == NDCLOVERDETRATIO) BEGIN(NDCLDETRATMONOMIAL);
   else if(mnl->type == CLOVERRATCOR) BEGIN(CLRATCORMONOMIAL);
   else if(mnl->type == POLY || mnl->type == POLYDETRATIO)  {
           fprintf(stderr,"starting to parse poly(detratio) monomial\n");
@@ -1050,14 +1987,15 @@ inline void rmQuotes(char *str){
   }
   else if(mnl->type == CLOVERDET) BEGIN(CLDETMONOMIAL);
   else if(mnl->type == CLOVERDETRATIO) BEGIN(CLDETRATMONOMIAL);
+  else if(mnl->type == CLOVERDETRATIORW) BEGIN(CLDETRATRWMONOMIAL);
   else BEGIN(DETMONOMIAL);
 }
 
 
 
-<DETMONOMIAL,GAUGEMONOMIAL,NDPOLYMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,CLRATMONOMIAL,RATCORMONOMIAL,CLRATCORMONOMIAL>{
+<DETMONOMIAL,GAUGEMONOMIAL,NDPOLYMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,CLRATMONOMIAL,RATCORMONOMIAL,CLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*Timescale{EQL}{DIGIT}+ {
-    if(mnl->type == NDDETRATIO) {
+    if(mnl->type == NDDETRATIO || mnl->type == NDCLOVERDETRATIO) {
       mnl->timescale = -5;
       if(myverbose) printf("  timescales set to %d line %d monomial %d since NDDETRATIO is not for MD evolution\n", a, line_of_file, current_monomial);
     }
@@ -1077,7 +2015,7 @@ inline void rmQuotes(char *str){
   }
 }
 
-<CLDETMONOMIAL,CLDETRATMONOMIAL,CLPOLYMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL>{
+<CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,CLPOLYMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*CSW{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     mnl->c_sw = c;
@@ -1085,7 +2023,7 @@ inline void rmQuotes(char *str){
   }
 }
 
-<CLDETMONOMIAL,CLDETRATMONOMIAL>{
+<CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL>{
   {SPC}*rho{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     mnl->rho = c;
@@ -1093,7 +2031,7 @@ inline void rmQuotes(char *str){
   }
 }
 
-<CLDETRATMONOMIAL>{
+<CLDETRATMONOMIAL,CLDETRATRWMONOMIAL>{
   {SPC}*rho2{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z]2 = %lf", name, &c);
     mnl->rho2 = c;
@@ -1101,7 +2039,7 @@ inline void rmQuotes(char *str){
   }
 }
 
-<DETMONOMIAL,POLYMONOMIAL,NDPOLYMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,RATMONOMIAL,CLRATMONOMIAL,RATCORMONOMIAL,CLRATCORMONOMIAL>{
+<DETMONOMIAL,POLYMONOMIAL,NDPOLYMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,RATMONOMIAL,CLRATMONOMIAL,RATCORMONOMIAL,CLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*Kappa{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     mnl->kappa = c;
@@ -1109,7 +2047,7 @@ inline void rmQuotes(char *str){
   }
 }
 
-<DETMONOMIAL,POLYMONOMIAL>{
+<DETMONOMIAL,POLYMONOMIAL,CLDETRATRWMONOMIAL>{
   {SPC}*2KappaMu2{EQL}{FLT} {
     sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
     mnl->mu2 = c;
@@ -1122,6 +2060,14 @@ inline void rmQuotes(char *str){
   }
 }
 
+<NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
+  {SPC}*Kappa2{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);  /* keyword goes to name, the value to the scratch double c */
+    if(myverbose) { printf("  kappa2 set to %f line %d monomial %d\n", c, line_of_file, current_monomial); }
+    (*mnl).kappa2 = c;
+  }
+}
+
 <NDCLRATMONOMIAL,CLRATMONOMIAL>{
   {SPC}*AddTrLog{EQL}yes {
     mnl->trlog = 1;
@@ -1133,7 +2079,7 @@ inline void rmQuotes(char *str){
   }
 }
 
-<NDPOLYMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL>{
+<NDPOLYMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*2KappaMubar{EQL}{FLT} {
     sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
     mnl->mubar = c;
@@ -1146,7 +2092,70 @@ inline void rmQuotes(char *str){
   }
 }
 
-<DETMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL>{
+<NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
+  {SPC}*2KappaMubar2{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);  /* keyword goes to name, the value to the scratch double c */
+    if(myverbose) { printf("  2KappaMubar2 set to %f line %d monomial %d\n", c, line_of_file, current_monomial); }
+    (*mnl).mubar2 = c;
+  }
+  {SPC}*2KappaEpsbar2{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);  /* same parsing pattern as the rule above */
+    if(myverbose) { printf("  2KappaEpsbar2 set to %f line %d monomial %d\n", c, line_of_file, current_monomial); }
+    (*mnl).epsbar2 = c;
+  }
+}
+<DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
+  {SPC}*UseExternalInverter{EQL}quda {
+    if(myverbose) printf("  Use Quda inverter line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.external_inverter = QUDA_INVERTER;
+  }
+  {SPC}*UseExternalInverter{EQL}qphix {
+    if(myverbose) printf("  Use QPhiX inverter line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.external_inverter = QPHIX_INVERTER;
+  }
+  {SPC}*UseExternalInverter{EQL}no {
+    if(myverbose) printf("  Use no external inverter line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.external_inverter = NO_EXT_INV;  /* "no" is stored as NO_EXT_INV for the current monomial */
+  }
+  {SPC}*UseSloppyPrecision{EQL}yes {
+    if(myverbose) printf("  Use use sloppy precision (single) in the inverter (if supported by the inverter) line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.sloppy_precision = SLOPPY_SINGLE;
+  }
+  {SPC}*UseSloppyPrecision{EQL}float {
+    if(myverbose) printf("  Use use sloppy precision (single) in the inverter (if supported by the inverter) line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.sloppy_precision = SLOPPY_SINGLE;
+  }
+  {SPC}*UseSloppyPrecision{EQL}single {
+    if(myverbose) printf("  Use use sloppy precision (single) in the inverter (if supported by the inverter) line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.sloppy_precision = SLOPPY_SINGLE;
+  }
+  {SPC}*UseSloppyPrecision{EQL}no {
+    if(myverbose) printf("  Do not use sloppy precision (stay in double) in the inverter line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.sloppy_precision = SLOPPY_DOUBLE;  /* "no" is stored as SLOPPY_DOUBLE for the current monomial */
+  }
+  {SPC}*UseSloppyPrecision{EQL}double {
+    if(myverbose) printf("  Use sloppy precision (double) in the inverter line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.sloppy_precision = SLOPPY_DOUBLE;  /* same stored value as the "no" keyword */
+  }
+  {SPC}*UseSloppyPrecision{EQL}half {
+    if(myverbose) printf("  Use use sloppy precision (half) in the inverter (if supported by the inverter) line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.sloppy_precision = SLOPPY_HALF;
+  }
+  {SPC}*UseCompression{EQL}12 {
+    if(myverbose) printf("  Use 12 compression in the inverter (if supported) line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.compression_type = COMPRESSION_12;
+  }
+  {SPC}*UseCompression{EQL}8 {
+    if(myverbose) printf("  Use 8 compression in the inverter (if supported) line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.compression_type = COMPRESSION_8;
+  }
+  {SPC}*UseCompression{EQL}18 {
+    if(myverbose) printf("  Not using compression in the inverter line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver_params.compression_type = NO_COMPRESSION;
+  }
+}
+
+<DETMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*ForcePrecision{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf",name , &c);
     mnl->forceprec = c;
@@ -1162,9 +2171,28 @@ inline void rmQuotes(char *str){
     mnl->maxiter = a;
     if(myverbose) printf("  MaxSolverIterations set to %d line %d monomial %d\n", a, line_of_file, current_monomial);
   }
+  {SPC}*mcgdelta{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z1] = %lf", name, &c);
+    (mnl->solver_params).mcg_delta = c;
+    if(myverbose) printf("  mcg_delta set to %lf line %d monomial %d\n", c, line_of_file, current_monomial);
+  }
+}
+
+<NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
+  {SPC}*Solver{EQL} {
+   solver_caller=YY_START;
+   BEGIN(NDMSOLVER);
+  }
+}
+
+<RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL>{
+  {SPC}*Solver{EQL} {
+   solver_caller=YY_START;
+   BEGIN(RATMSOLVER);
+  }
 }
 
-<DETMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL>{
+<DETMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL>{
   {SPC}*2KappaMu{EQL}{FLT} {
     sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
     mnl->mu = c;
@@ -1362,14 +2390,95 @@ inline void rmQuotes(char *str){
 <MSOLVER>{
   CG {
     if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
-    mnl->solver = 1;
+    mnl->solver = CG;
     BEGIN(solver_caller);
   }
-  bicgstab {
+  mixedCG {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = MIXEDCG;
+    BEGIN(solver_caller);
+  }
+  rgmixedCG {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = RGMIXEDCG;
+    BEGIN(solver_caller);
+  }
+  BICGSTAB {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = BICGSTAB;
+    BEGIN(solver_caller);
+  }
+  mixedbicgstab {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver=MIXEDBICGSTAB;
+    BEGIN(solver_caller);  /* return to the monomial start condition saved in solver_caller, like the sibling MSOLVER rules */
+  }
+  DDalphaAMG {
+#ifdef DDalphaAMG
+    if(myverbose) printf("  Solver set to DDalphaAMG line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver = MG;  /* the DDalphaAMG keyword is stored as the MG solver constant */
+    BEGIN(solver_caller);
+#else
+    printf("ERROR line %d monomial %d: DDalphaAMG library not included\n", line_of_file, current_monomial);
+    exit(1);  /* keyword requested but the DDalphaAMG macro is not defined in this build */
+#endif
+  }
+}
+
+<RATMSOLVER>{
+  rgmixedCG {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = RGMIXEDCG;
+    BEGIN(solver_caller);
+  }
+  cgmms {
     if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
-    mnl->solver = 0;
+    mnl->solver = CGMMS;
     BEGIN(solver_caller);
   }
+  DDalphaAMG {
+#ifdef DDalphaAMG
+    if(myverbose) printf("  Solver set to DDalphaAMG line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver = MG;  /* the DDalphaAMG keyword is stored as the MG solver constant */
+    BEGIN(solver_caller);
+#else
+    printf("ERROR line %d monomial %d: DDalphaAMG library not included\n", line_of_file, current_monomial);
+    exit(1);  /* keyword requested but the DDalphaAMG macro is not defined in this build */
+#endif
+  }
+}
+
+<NDMSOLVER>{
+  cgmmsnd {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = CGMMSND;
+    BEGIN(solver_caller);
+  }
+  rgmixedcg {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = RGMIXEDCG;
+    BEGIN(solver_caller);
+  }
+  mixedCGmmsnd {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = MIXEDCGMMSND;
+    BEGIN(solver_caller);
+  }
+  MCR {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = MCR;  /* named constant matching the keyword; NOTE(review): confirm the ND monomial code accepts MCR */
+    BEGIN(solver_caller);
+  }
+  DDalphaAMG {
+#ifdef DDalphaAMG
+    if(myverbose) printf("  Solver set to DDalphaAMG line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->solver = MG;  /* the DDalphaAMG keyword is stored as the MG solver constant */
+    BEGIN(solver_caller);
+#else
+    printf("ERROR line %d monomial %d: DDalphaAMG library not included\n", line_of_file, current_monomial);
+    exit(1);  /* keyword requested but the DDalphaAMG macro is not defined in this build */
+#endif
+  }
 }
 
 <GTYPE>{
@@ -1438,6 +2547,9 @@ inline void rmQuotes(char *str){
     else if(strcmp(type, "OMF4")==0) {
       Integrator.type[a] = OMF4;
     }
+    else if(strcmp(type, "2MNFG")==0) {
+      Integrator.type[a] = MN2FG;
+    }
     else {
       fprintf(stderr, "Unknown integrator type %s in line %d\n", yytext, line_of_file);
       exit(1);
@@ -1496,17 +2608,25 @@ inline void rmQuotes(char *str){
 
 <SOURCETYPE>{
   Point {
-    SourceInfo.type = 0;
+    SourceInfo.type = SRC_TYPE_POINT;
     if(myverbose) printf("Using Point Sources\n");
   }
   Volume {
-    SourceInfo.type = 1;
+    SourceInfo.type = SRC_TYPE_VOL;
     if(myverbose) printf("Using Volume Sources\n");
   }
   TimeSlice {
-    SourceInfo.type = 2;
+    SourceInfo.type = SRC_TYPE_TS;
     if(myverbose) printf("Using TimeSlice Sources\n");
   }
+  PionTimeSlice {
+    SourceInfo.type = SRC_TYPE_PION_TS;
+    if(myverbose) printf("Using PionTimeSlice Sources\n");
+  }
+  GenPionTimeSlice {
+    SourceInfo.type = SRC_TYPE_GEN_PION_TS;
+    if(myverbose) printf("Using GenPionTimeSlice Sources\n");
+  }
 }
 <PROPSPLIT>{
   yes {
@@ -1547,13 +2667,24 @@ inline void rmQuotes(char *str){
     meas->type = ORIENTED_PLAQUETTES;
     strcpy(meas->name, "ORIENTEDPLAQUETTES");
   }
+  else if(strcmp(yytext, "GRADIENTFLOW")==0) {
+    meas->type = GRADIENT_FLOW;
+    strcpy(meas->name, "GRADIENTFLOW");
+  }
   else {
     fprintf(stderr, "Unknown measurement type %s in line %d\n", yytext, line_of_file);
     exit(1);
   }
-  /*set default frequency here, in case it is not specified
-    in the input file */  
+  // set default measurement parameters. This has to be done here, since init_measurements would
+  // override the inputs
   meas->freq = _default_measurement_freq;
+
+  meas->no_samples = 1;
+  meas->all_time_slices = 0;
+
+  meas->gf_eps = _default_gf_eps;
+  meas->gf_tmax = _default_gf_tmax;
+
   if(!reread) {
     if(add_measurement(meas->type) < 0) {
       fprintf(stderr, "Something went wrong in adding measurements\nAborting...!\n");
@@ -1567,9 +2698,10 @@ inline void rmQuotes(char *str){
   else if(meas->type == PIONNORM) BEGIN(PIONNORMMEAS);
   else if(meas->type == POLYAKOV) BEGIN(PLOOP);
   else if(meas->type == ORIENTED_PLAQUETTES) BEGIN(ORIENTEDPLAQUETTESMEAS);
+  else if(meas->type == GRADIENT_FLOW) BEGIN(GRADIENTFLOWMEAS);
 }
 
-<ONLINEMEAS,PIONNORMMEAS,PLOOP,ORIENTEDPLAQUETTESMEAS>{
+<ONLINEMEAS,PIONNORMMEAS,PLOOP,ORIENTEDPLAQUETTESMEAS,GRADIENTFLOWMEAS>{
   ^EndMeasurement{SPC}* {
     if(myverbose) printf("Measurement with id %d parsed in line %d\n\n", meas->id, line_of_file);
     BEGIN(0);
@@ -1587,8 +2719,34 @@ inline void rmQuotes(char *str){
     meas->max_iter = a;
     if(myverbose) printf("  MaxSolverIterations set to %d line %d measurement id=%d\n", a, line_of_file, meas->id);
   }
+  {SPC}*AllTimeSlices{EQL}yes {
+    meas->all_time_slices = 1;
+    if(myverbose) printf("  Inversions for correlators will be done on all time-slices, line %d, measurement id=%d\n", line_of_file, meas->id);
+  }
+  {SPC}*AllTimeSlices{EQL}no {
+    meas->all_time_slices = 0;
+    if(myverbose) printf("  Inversions for correlators will NOT be done on all time-slices, line %d, measurement id=%d\n", line_of_file, meas->id);
+  }
+  {SPC}*NoSamples{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    meas->no_samples = a;
+    if(myverbose) printf("  Number of samples for correlators measurement set to %d, line %d measurement id=%d\n", a, line_of_file, meas->id);
+  }
 }
 
+<GRADIENTFLOWMEAS>{
+  {SPC}*StepSize{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    meas->gf_eps = c;
+    if(myverbose) printf("  Gradient flow step size set to %lf line %d, measurement id=%d\n", meas->gf_eps, line_of_file, meas->id);
+  }
+  {SPC}*MaxFlowTime{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    meas->gf_tmax = c;
+    if(myverbose) printf("  Maximum gradient flow time size set to %lf line %d, measurement id=%d\n", meas->gf_tmax, line_of_file, meas->id);
+  }
+} 
+
 <PLOOP>{
   {SPC}*Direction{EQL}[03] {
     sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
@@ -1707,7 +2865,7 @@ inline void rmQuotes(char *str){
 }
 <CSW>{FLT}  {
   g_c_sw=atof(yytext);
-  if(myverbose!=0) printf("c_sw=%lf \n", g_c_sw);
+  if(myverbose!=0) printf("c_sw=%s \n", yytext);
 }
 <STARTCOND>{
   cold {
@@ -1747,10 +2905,6 @@ inline void rmQuotes(char *str){
   gmresdr_nr_ev = atoi(yytext);
   if(myverbose!=0) printf("Deflate %d eigenvectors in GMRES-DR \n", gmresdr_nr_ev);
 }
-<DFLSP>{DIGIT}+ {
-  g_N_s = atoi(yytext);
-  if(myverbose!=0) printf("Deflation subspace dimension set to %d \n", g_N_s);
-}
 <PRECON>{
   none {
     if(myverbose!=0) printf("Using no right preconditioner \n");
@@ -1770,6 +2924,22 @@ inline void rmQuotes(char *str){
   write_cp_flag=0;
   if(myverbose!=0) printf("Don't write Checkpoints\n");
 }
+<SUBPROCESS>yes     {
+  subprocess_flag=1;
+  if(myverbose!=0) printf("Running in \'subprocess\' mode\n");
+}
+<SUBPROCESS>no     {
+  subprocess_flag=0;
+  if(myverbose!=0) printf("Not running in \'subprocess\' mode\n");
+}
+<LOWMEM>yes     {
+  lowmem_flag=1;
+  if(g_proc_id==0) printf("!!! WARNING: running in \'lowmem\' mode, most functionality will NOT work without explicit memory allocations!\n");
+}
+<LOWMEM>no     {
+  lowmem_flag=0;
+  if(myverbose!=0) printf("Not running in \'lowmem\' mode\n");
+}
 <DSBLIOCHECK>yes {
   g_disable_IO_checks = 1;
   if(myverbose!=0) printf("Disable IO checks (and readback in case of Lemon IO)\n");
@@ -1778,6 +2948,14 @@ inline void rmQuotes(char *str){
   g_disable_IO_checks = 0;
   if(myverbose!=0) printf("Enable IO checks (and readback in case of Lemon IO)\n");
 }
+<DSBLSRCIOCHECK>yes {
+  g_disable_src_IO_checks = 1;
+  if(myverbose!=0) printf("Disable IO checks for sources\n");
+}
+<DSBLSRCIOCHECK>no {
+  g_disable_src_IO_checks = 0;
+  if(myverbose!=0) printf("Enable IO checks for sources\n");
+}
 <CPINT>{DIGIT}+   {
   cp_interval=atoi(yytext);
   if(myverbose!=0) printf("Write Checkpoint all %s measurements\n",yytext);
@@ -1869,13 +3047,16 @@ inline void rmQuotes(char *str){
   if(myverbose!=0) printf("Don't read inversion source from file, but save the one generated\n");
 }
 <SOURCEFILE>{FILENAME} {
-  if(SourceInfo.basename == NULL) free(SourceInfo.basename);
+  if(SourceInfo.basename != NULL) free(SourceInfo.basename);
   SourceInfo.basename = (char*)malloc((strlen(yytext)+1)*sizeof(char));
   strcpy(SourceInfo.basename, yytext);
-  if(PropInfo.basename == NULL) free(PropInfo.basename);
+  if(myverbose!=0) printf("source input filename set to %s of length %lu\n",yytext,(unsigned long)strlen(yytext));
+}
+<PROPFILE>{FILENAME} {
+  if(PropInfo.basename != NULL) free(PropInfo.basename);
   PropInfo.basename = (char*)malloc((strlen(yytext)+1)*sizeof(char));
   strcpy(PropInfo.basename, yytext);
-  if(myverbose!=0) printf("source input filename set to %s\n",yytext);
+  if(myverbose!=0) printf("propagator output filename set to %s\n",yytext);
 }
 <SOURCEFORMAT>etmc      {
   SourceInfo.format = 0;
@@ -2092,15 +3273,25 @@ inline void rmQuotes(char *str){
 }
 <REWEIGH>yes {
   reweighting_flag = 1;
-  if(myverbose!=0) fprintf(stderr, "Compute reweighting factor\n");
+  if(myverbose!=0) printf("Compute reweighting factor\n");
 }
 <REWEIGH>no {
   reweighting_flag = 0;
-  if(myverbose!=0) fprintf(stderr, "Do not compute reweighting factor\n");
+  if(myverbose!=0) printf("Do not compute reweighting factor\n");
 }
 <REWSAMPLES>{DIGIT}+ {
   reweighting_samples = atoi(yytext);
-  if(myverbose!=0) fprintf(stderr, "Number of reweighting samples set to %d\n", reweighting_samples);
+  if(myverbose!=0) printf("Number of reweighting samples set to %d\n", reweighting_samples);
+}
+
+<MIXCGIT>{DIGIT}+ {
+  mixcg_maxinnersolverit = atoi(yytext);
+  if(myverbose) printf("MixedCG: setting maximal inner solver iterations to %d\n", mixcg_maxinnersolverit);
+}
+
+<MIXCGEPS>{FLT}  {
+  mixcg_innereps=atof(yytext);
+  if(myverbose!=0) printf("MixedCG: setting inner solver eps to %s \n", yytext);
 }
 
 <*>^#   {
@@ -2115,8 +3306,7 @@ inline void rmQuotes(char *str){
   BEGIN(comment_caller);
 }
 
-
-<INITMONOMIAL,DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,NDPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLPOLYMONOMIAL,GAUGEMONOMIAL,INTEGRATOR,INITINTEGRATOR,INITMEASUREMENT,PIONNORMMEAS,ONLINEMEAS,ORIENTEDPLAQUETTESMEAS,INITOPERATOR,TMOP,DBTMOP,OVERLAPOP,WILSONOP,CLOVEROP,DBCLOVEROP,POLYMONOMIAL,PLOOP,INITGPU,GPU,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,BSMOP>{SPC}*\n   {
+<INITMONOMIAL,DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLPOLYMONOMIAL,GAUGEMONOMIAL,INTEGRATOR,INITINTEGRATOR,INITMEASUREMENT,PIONNORMMEAS,ONLINEMEAS,ORIENTEDPLAQUETTESMEAS,GRADIENTFLOWMEAS,INITOPERATOR,TMOP,DBTMOP,OVERLAPOP,WILSONOP,CLOVEROP,DBCLOVEROP,POLYMONOMIAL,PLOOP,INITGPU,GPU,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,INITDEFLATION,DEFLATION,INITMULTIGRID,MULTIGRID,INITEXTERNALINVERTER,QUDAINVERTER,QPHIXINVERTER,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL,BSMOP>{SPC}*\n   {
   line_of_file++;
 }
 <*>{SPC}*\n                       {
@@ -2128,8 +3318,8 @@ inline void rmQuotes(char *str){
   BEGIN(ERROR);
 }
 <ERROR>[^\t\n]*             {
-  fprintf(stderr, "Parsing error in line %d\nAborting...!\n", line_of_file);
-  fprintf(stderr, "Could not make sense out off: %s\n", yytext);
+  if(g_proc_id == 0) fprintf(stderr, "Parsing error in line %d\nAborting...!\n", line_of_file);
+  if(g_proc_id == 0) fprintf(stderr, "Could not make sense out off: %s\n", yytext);
   exit(1);
 }
 
@@ -2194,11 +3384,34 @@ int read_input(char * conf_file){
   g_mu1 = _default_g_mu1;
   g_mu2 = _default_g_mu2;
   g_mu3 = _default_g_mu3;
+  g_shift = _default_g_shift;
   g_dbw2rand = 0;
   g_running_phmc = 0;
   g_beta = _default_g_beta;
   g_N_s = _default_g_N_s;
   g_dflgcr_flag = _default_g_dflgcr_flag;
+
+  Msap_precon = _default_Msap_precon;
+  NiterMsap = _default_NiterMsap;
+  NcycleMsap = _default_NcycleMsap;
+  kappa_Msap = _default_kappa_Msap;
+  mu_Msap = _default_mu_Msap;
+
+  NsmoothMsap_dflgen = _default_NsmoothMsap_dflgen;
+  NiterMsap_dflgen = _default_NiterMsap_dflgen;
+  NcycleMsap_dflgen = _default_NcycleMsap_dflgen;
+  usePL = _default_usePL;
+  mu_dflgen = _default_mu_dflgen;
+  kappa_dflgen = _default_kappa_dflgen;
+  mu_dfl = _default_mu_dfl;
+  kappa_dfl = _default_kappa_dfl;
+  little_solver_low_prec = _default_little_solver_low_prec;
+  little_solver_high_prec = _default_little_solver_high_prec;
+  little_solver_max_iter = _default_little_solver_max_iter;
+  little_evenodd = _default_little_evenodd;
+  little_solver = _default_little_solver;
+  little_gmres_m_parameter = _default_little_gmres_m_parameter;
+
   random_seed = _default_random_seed;
   rlxd_level = _default_rlxd_level;
   startoption = _default_startoption;
@@ -2222,11 +3435,34 @@ int read_input(char * conf_file){
   rho_BSM = _default_rho_BSM;
   eta_BSM = _default_eta_BSM;
   m0_BSM  = _default_m0_BSM;
+  mu03_BSM  = _default_mu03_BSM;
+  mu01_BSM  = _default_mu01_BSM;
+  r_BSM = _default_r_BSM ;
+  c5phi_BSM  = _default_c5phi_BSM ;
+  csw_BSM = _default_csw_BSM ;
+  propagatorsonthefly_BSM = _default_propagatorsonthefly_BSM;
+  smearedcorrelator_BSM = _default_smearedcorrelator_BSM;
+  densitydensity_BSM = _default_densitydensity_BSM;
+  densitydensity_s0s0_BSM = _default_densitydensity_s0s0_BSM;
+  densitydensity_sxsx_BSM = _default_densitydensity_sxsx_BSM;
+  diraccurrentdensity_BSM = _default_diraccurrentdensity_BSM;
+  wilsoncurrentdensitypr1_BSM = _default_wilsoncurrentdensitypr1_BSM;
+  wilsoncurrentdensitypr2_BSM = _default_wilsoncurrentdensitypr2_BSM;
+  wilsoncurrentdensitypl1_BSM = _default_wilsoncurrentdensitypl1_BSM;
+  wilsoncurrentdensitypl2_BSM = _default_wilsoncurrentdensitypl2_BSM;
+  vectorcurrentcurrent_BSM = _default_vectorcurrentcurrent_BSM;
+  axialcurrentcurrent_BSM = _default_axialcurrentcurrent_BSM;
+  vectorcurrentdensity_BSM = _default_vectorcurrentdensity_BSM;
+  axialcurrentdensity_BSM = _default_axialcurrentdensity_BSM;
+  vectordensitydensity_BSM = _default_vectordensitydensity_BSM;
+  giancarlo_BSM = _default_giancarlo_BSM;
+  timesmearcorrelator_BSM = _default_timesmearcorrelator_BSM;
+  pdensityvectordensity_BSM = _default_pdensityvectordensity_BSM;
   g_rgi_C1 = _default_g_rgi_C1;
   read_source_flag= _default_read_source_flag;
-  if(SourceInfo.basename == NULL) SourceInfo.basename = (char*)malloc(100*sizeof(char));
+  SourceInfo.basename = (char*)malloc(strlen(_default_source_filename)+10);
   strcpy(SourceInfo.basename, _default_source_filename);
-  if(PropInfo.basename == NULL) PropInfo.basename = (char*)malloc(100*sizeof(char));
+  PropInfo.basename = (char*)malloc(strlen(_default_source_filename)+10);
   strcpy(PropInfo.basename, _default_source_filename);
   PropInfo.splitted = _default_propagator_splitted;
   SourceInfo.splitted = _default_source_splitted;
@@ -2243,6 +3479,7 @@ int read_input(char * conf_file){
   scalar_precision_read_flag = _default_scalar_precision_read_flag;
   scalar_precision_write_flag = _default_scalar_precision_write_flag;
   g_disable_IO_checks = _default_g_disable_IO_checks;
+  g_disable_src_IO_checks = _default_g_disable_IO_checks;
   reproduce_randomnumber_flag = _default_reproduce_randomnumber_flag;
   g_sloppy_precision_flag = _default_g_sloppy_precision_flag;
   use_stout_flag = _default_use_stout_flag;
@@ -2271,6 +3508,49 @@ int read_input(char * conf_file){
   mstarsq = _default_mstarsq;
   no_sources_z2 = _default_no_sources_z2;
   device_num = _default_device_num;
+  min_innersolver_it = _default_min_innersolver_it;
+  max_mms_shifts = _default_max_mms_shifts;
+  use_mixed_mms = 0;
+  innersolver_precision_rel = 1.e-4;
+  innersolver_precision_abs = 1.e-4;
+
+  mixcg_innereps = _default_mixcg_innereps;
+  mixcg_maxinnersolverit = _default_mixcg_maxinnersolverit;
+
+  lowmem_flag = _default_lowmem_flag;
+  subprocess_flag = _default_subprocess_flag;
+ 
+  /* default parameters for QUDA */
+  quda_input.fermionbc = TM_QUDA_THETABC;
+  quda_input.mg_n_level = _default_quda_mg_n_level;
+  for( int level = 0; level < QUDA_MAX_MG_LEVEL; ++level){
+    quda_input.mg_n_vec[level] = _default_quda_mg_n_vec;
+    if( level > 0 ){
+      quda_input.mg_mu_factor[level] = _default_quda_mg_mu_factor;
+    } else {
+      quda_input.mg_mu_factor[level] = 1.0;
+    }
+
+    /* note: when the user does not specify any blocking parameters,
+     * a reasonable set will be computed automatically in the MG setup
+     * of the QUDA interface */
+    for( int dim = 0; dim < 4; ++dim ){
+      quda_input.mg_blocksize[level][dim] = 0;
+    }
+  }
+  quda_input.mg_setup_inv_type = QUDA_CG_INVERTER;
+  quda_input.mg_setup_maxiter = _default_quda_mg_setup_maxiter;
+  quda_input.mg_setup_tol = _default_quda_mg_setup_tol;
+  quda_input.mg_coarse_solver_tol = _default_quda_mg_coarse_solver_tol;
+  quda_input.mg_coarse_solver_maxiter = _default_quda_mg_coarse_solver_maxiter;
+  quda_input.mg_nu_pre = _default_quda_mg_nu_pre;
+  quda_input.mg_nu_post = _default_quda_mg_nu_post;
+  quda_input.mg_smoother_tol = _default_quda_mg_smoother_tol;
+  quda_input.mg_omega = _default_quda_mg_omega;
+  quda_input.mg_run_verify = QUDA_BOOLEAN_YES;
+  quda_input.mg_enable_size_three_blocks = _default_quda_mg_enable_size_three_blocks;
+  quda_input.mg_reset_setup_threshold = _default_quda_mg_reset_setup_threshold;
+
   /* Put -1 in PropInfo.format to see if parse_config() will
   change the value. If not then set it to source_format_flag */
   PropInfo.format = -1;
diff --git a/reweighting_factor.c b/reweighting_factor.c
index bfc3685a6..39ee12751 100644
--- a/reweighting_factor.c
+++ b/reweighting_factor.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -27,14 +27,15 @@
 #include "global.h"
 #include "linalg_eo.h"
 #include "start.h"
+#include "fatal_error.h"
 #include "monomial/monomial.h"
 #include "hamiltonian_field.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
 #include "reweighting_factor.h"
 
 void reweighting_factor(const int N, const int nstore) {
-  int i, j, n = VOLUME;
-  double sq_norm, x, y;
-  double * sum, * sum_sq;
+  int n = VOLUME;
   monomial * mnl;
   FILE * ofs;
   hamiltonian_field_t hf;
@@ -44,52 +45,110 @@ void reweighting_factor(const int N, const int nstore) {
   hf.derivative = NULL;
   hf.update_gauge_copy = g_update_gauge_copy;
 
-  sum = (double*)calloc(no_monomials, sizeof(double));
-  sum_sq = (double*)calloc(no_monomials, sizeof(double));
+  double * data = (double*)calloc(no_monomials*N, sizeof(double));
+  double * trlog = (double*)calloc(no_monomials, sizeof(double));
 
-  for(i = 0; i < N; i++) {
-    sq_norm = 0.;
-    for(j = 0; j < no_monomials; j++) {
+  // we compute the trlog part first, because they are independent of 
+  // stochastic noise. This is only needed for even/odd monomials
+  for(int j = 0; j < no_monomials; j++) {
+    mnl = &monomial_list[j];
+    if(mnl->even_odd_flag) {
+      init_sw_fields();
+
+      if(mnl->type != NDCLOVERRATCOR && (mnl->kappa != mnl->kappa2
+                                       || (mnl->type == NDDETRATIO 
+                                           && (mnl->mubar != mnl->mubar2 || mnl->epsbar != mnl->epsbar2))
+                                       || (mnl->type != NDDETRATIO
+                                           && (mnl->mu != mnl->mu2)))) {
+        double c_sw = mnl->c_sw;
+        if(c_sw < 0.) c_sw = 0.;
+        
+        sw_term( (const su3**) hf.gaugefield, mnl->kappa, c_sw); 
+        if(mnl->type != NDDETRATIO) {
+          trlog[j] = -sw_trace(0, mnl->mu);
+        }
+        else {
+          trlog[j] = -sw_trace_nd(0, mnl->mubar, mnl->epsbar);
+        }
+        
+        sw_term( (const su3**) hf.gaugefield, mnl->kappa2, c_sw);
+        if(mnl->type != NDDETRATIO) {
+          trlog[j] -= -sw_trace(0, mnl->mu2);
+        }
+        else {
+          trlog[j] -= -sw_trace_nd(0, mnl->mubar2, mnl->epsbar2);
+        }
+      } else
+        trlog[j] = 0.;
+    }
+    else {
+      trlog[j] = 0.;
+    }
+    if(g_proc_id == 0 && g_debug_level > 0) {
+      printf("# monomial[%d] %s, trlog = %e\n", j, mnl->name, trlog[j]);
+    }
+  }
+
+  for(int i = 0; i < N; i++) {
+    if(g_proc_id == 0 && g_debug_level > 0) {
+      printf("# computing reweighting factors for sample %d\n", i);
+    }
+    for(int j = 0; j < no_monomials; j++) {
       mnl = &monomial_list[j];
       if(mnl->type != GAUGE) {
 	if(mnl->even_odd_flag) {
 	  random_spinor_field_eo(mnl->pf, mnl->rngrepro, RN_GAUSS);
+          mnl->energy0 = square_norm(mnl->pf, n/2, 1);
 	}
-	else random_spinor_field_lexic(mnl->pf, mnl->rngrepro, RN_GAUSS);
-	mnl->energy0 = square_norm(mnl->pf, n, 1);
-	if(mnl->type == NDDETRATIO) {
+	else {
+          random_spinor_field_lexic(mnl->pf, mnl->rngrepro, RN_GAUSS);
+          mnl->energy0 = square_norm(mnl->pf, n, 1);
+        }
+	if(mnl->type == NDDETRATIO || mnl->type == NDCLOVERRATCOR) {
 	  if(mnl->even_odd_flag) {
 	    random_spinor_field_eo(mnl->pf2, mnl->rngrepro, RN_GAUSS);
+            mnl->energy0 += square_norm(mnl->pf2, n/2, 1);
 	  }
-	  else random_spinor_field_lexic(mnl->pf, mnl->rngrepro, RN_GAUSS);
-	  mnl->energy0 += square_norm(mnl->pf2, n, 1);
+	  else {
+            random_spinor_field_lexic(mnl->pf2, mnl->rngrepro, RN_GAUSS);
+            mnl->energy0 += square_norm(mnl->pf2, n, 1);
+          }
+	}
+	if(g_proc_id == 0 && g_debug_level > 1) {
+	  printf("# monomial[%d] %s, energy0 = %e\n", j, mnl->name, mnl->energy0);
 	}
       }
     }
 
-    for(j = 0; j < no_monomials; j++) {
+    for(int j = 0; j < no_monomials; j++) {
       mnl = &monomial_list[j];
       if(mnl->type != GAUGE) {
-	y = mnl->accfunction(j, &hf);
-	sq_norm -= y;
-	x = exp(sq_norm);
-	sum[j] += x;
-	sum_sq[j] += x*x;
+	double y = mnl->accfunction(j, &hf);
+	data[i*no_monomials + j] = y;
 	if(g_proc_id == 0 && g_debug_level > 0) {
-	  printf("monomial[%d] %s, w_%d=%e W=%e\n", j, mnl->name, j, y, x);
+	  printf("# monomial[%d] %s, stochastic part: w_%d=%e exp(w_%d)=%e\n", j, mnl->name, j, y, j, exp(y));
 	}
       }
     }
   }
   
   if(g_proc_id == 0) {
-    ofs = fopen("reweighting_factor.data", "a");
-    fprintf(ofs, "%d ", nstore);
-    for(j = 0; j < no_monomials; j++) {
-      fprintf(ofs, "%e %e ", sum[j]/N, sqrt((-sum[j]*sum[j]/N/N + sum_sq[j]/N)/(N-1)/N));
+    char filename[50];
+    sprintf(filename, "reweighting_factor.data.%.5d", nstore);
+    if((ofs = fopen(filename, "w")) == NULL) {
+      fatal_error("Could not open file for data output", "reweighting_factor");
+    }
+    else {
+      for(int j = 0; j < no_monomials; j++) {
+        mnl = &monomial_list[j];
+        for(int i = 0; i < N; i++) {
+          fprintf(ofs, "%.2d %.5d %.12f %.12f %.12f %.12f %.10e\n", j, i, mnl->kappa, mnl->kappa2, mnl->mu, mnl->mu2, data[i*no_monomials + j] + trlog[j]);
+        }
+      }
+      fclose(ofs);
     }
-    fprintf(ofs, "\n");
-    fclose(ofs);
   }
+  free(data);
+  free(trlog);
 }
 
diff --git a/reweighting_factor_nd.c b/reweighting_factor_nd.c
index 87f8a9274..c48e9965a 100644
--- a/reweighting_factor_nd.c
+++ b/reweighting_factor_nd.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/rnd_gauge_trafo.c b/rnd_gauge_trafo.c
index 3f5fe7129..2e1a06b81 100644
--- a/rnd_gauge_trafo.c
+++ b/rnd_gauge_trafo.c
@@ -23,7 +23,7 @@
  *******************************************************************************/
 
 #if HAVE_CONFIG_H
-#include<config.h>
+#include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -46,7 +46,7 @@ void rnd_gauge_trafo(const int repro, su3 ** const gf){
 
   random_gauge_field(repro, gauge_trafo);
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_gauge(gauge_trafo);
 #endif
 
diff --git a/sample-input/offline_measurement.input b/sample-input/offline_measurement.input
new file mode 100644
index 000000000..60517b9c6
--- /dev/null
+++ b/sample-input/offline_measurement.input
@@ -0,0 +1,51 @@
+# example input file for offline measurements using "offline_measurement"
+# requires two 8^4 gauge configurations, conf.0000 and conf.0002
+
+L=8
+T=8
+
+DebugLevel = 5
+ompnumthreads=4
+
+InitialStoreCounter = 0
+Measurements = 2
+# measurements will be carried out in nsave steps
+# e.g. for conf.0000 and conf.0002 in this case 
+nsave=2
+2kappamu = 0.05
+kappa = 0.177
+BCAngleT = 1
+GaugeConfigInputFile = conf
+UseEvenOdd = yes
+
+# the correlators measurement requires ONE operator to be defined
+# if multiple operators are defined, only the first one is used!
+BeginMeasurement CORRELATORS
+  Frequency = 1 
+EndMeasurement
+
+BeginMeasurement POLYAKOVLOOP
+  Frequency = 1
+EndMeasurement
+
+BeginMeasurement ORIENTEDPLAQUETTES
+  Frequency = 1
+EndMeasurement
+
+# requirements are the same as for the correlators measurement
+BeginMeasurement PIONNORM
+  Frequency = 1
+EndMeasurement
+
+# note: setting the solver to CGMMS will result in the CGMMS inversion taking place
+# because the solver is not properly decoupled from the rest of the code
+BeginOperator TMWILSON
+  2kappaMu = 0.05
+  kappa = 0.177
+  UseEvenOdd = yes
+  Solver = CG
+  SolverPrecision = 1e-14
+  MaxSolverIterations = 1000
+  AddDownPropagator = no
+EndOperator
+
diff --git a/sample-input/sample-averx.input b/sample-input/sample-averx.input
new file mode 100644
index 000000000..d3270e09a
--- /dev/null
+++ b/sample-input/sample-averx.input
@@ -0,0 +1,31 @@
+# example input file for invert
+# for CG solver
+# requires a 4^4 gauge configuration conf.0000
+
+L=4
+T=4
+
+DebugLevel = 5
+InitialStoreCounter = 0
+Measurements = 1
+2kappamu = 0.05
+kappa = 0.177
+BCAngleT = 1
+GaugeConfigInputFile = conf
+UseEvenOdd = yes
+
+SourceType = GenPionTimeSlice
+ReadSource = no
+SourceTimeSlice = 0
+NoSamples = 1
+
+BeginOperator TMWILSON
+  2kappaMu = 0.05
+  kappa = 0.177
+  UseEvenOdd = yes
+  Solver = CG
+  SolverPrecision = 1e-14
+  MaxSolverIterations = 1000
+  AddDownPropagator = no
+EndOperator
+
diff --git a/sample-input/sample-cg-tmclover.input b/sample-input/sample-cg-tmclover.input
index 993c19c2b..992127b06 100644
--- a/sample-input/sample-cg-tmclover.input
+++ b/sample-input/sample-cg-tmclover.input
@@ -18,14 +18,15 @@ Indices = 0
 ReadSource = no
 UseRelativePrecision = yes
 UseSloppyPrecision = yes
+UseEvenOdd = yes
 
 BeginOperator CLOVER
   2KappaMu = 0.1
   kappa = 0.160
   csw = 1.00
-# the following are not available for this operator (yet)
+# the following is the default for this operator and the only available option and therefore does not need and must not be set
 #  Solver = CG
-#  UseEvenOdd = yes
+  UseEvenOdd = yes
   SolverPrecision = 1.e-16
   MaxSolverIterations = 100
   PropagatorPrecision = 64
diff --git a/sample-input/sample-dfl.input b/sample-input/sample-dfl.input
new file mode 100644
index 000000000..fcb212c7d
--- /dev/null
+++ b/sample-input/sample-dfl.input
@@ -0,0 +1,78 @@
+# example input file for invert
+# for the deflated FGMRES solver (Solver = dflfgmres)
+# requires a 4^4 gauge configuration conf.0000
+
+L=4
+T=4
+NrXProcs = 2
+NrYProcs = 2
+NrZProcs = 2
+
+NoBlocksT = 2
+NoBlocksX = 2
+NoBlocksY = 2
+NoBlocksZ = 2
+
+DebugLevel = 3
+InitialStoreCounter = 0
+Measurements = 1
+2kappamu = 0.001
+kappa = 0.177
+BCAngleT = 1
+GaugeConfigInputFile = conf
+UseEvenOdd = no
+
+SourceType = timeslice
+#ReadSource = no
+#NoSamples = 1
+UseSloppyPrecision = no
+OmpNumThreads = 2
+
+ReadSource = no
+SourceFilename = bla
+SourceTimeSlice = 38
+Indices = 0
+
+GMRESMParameter = 25
+
+BeginDeflationInit
+  ## number of global approximate eigenvectors
+  DeflationSubspaceDimension = 24
+
+  ## during inversion/outer solve
+  NiterMsap = 4
+  NcycleMsap = 4
+
+  KappaMsap = 0.177
+  2KappaMuMsap = 0.010
+
+  ## the following are used during subspace generation
+  NiterMsapSubspace = 4
+  NcycleMsapSubspace = 4
+  NsmoothSubspace = 11
+
+  ## kappa and mu for little D during inversion/outer solve
+  kappa = 0.177
+  2KappaMu = 0.005
+
+  ## kappa and mu for little D during subspace generation
+  kappaSubspace = 0.177
+  2KappaSubspace = 0.0
+
+  ## parameters of little solver
+  LittleSolverMaxIter = 20
+  LittleGMRESMParameter = 25
+  LittleSolverLowPrecision = 0.01
+  LittleSolverHighPrecision = 0.01
+
+  useLittleLittleD = yes
+EndDeflationInit
+
+BeginOperator TMWILSON
+  2kappaMu = 0.01
+  kappa = 0.177
+  UseEvenOdd = no
+  Solver = dflfgmres
+  SolverPrecision = 1e-14
+  MaxSolverIterations = 125
+EndOperator
diff --git a/sample-input/sample-mixedcg.input b/sample-input/sample-mixedcg.input
new file mode 100644
index 000000000..87589920d
--- /dev/null
+++ b/sample-input/sample-mixedcg.input
@@ -0,0 +1,46 @@
+#example input file for invert
+# for MIXEDCG solver
+# requires a 4^4 gauge configuration conf.0000
+
+L=4
+T=4
+
+DebugLevel = 5
+InitialStoreCounter = 0
+Measurements = 1
+2kappamu = 0.05
+kappa = 0.177
+BCAngleT = 1
+GaugeConfigInputFile = conf
+UseEvenOdd = yes
+
+SourceType = Volume
+ReadSource = no
+NoSamples = 12
+
+# residual reduction factor for inner solver in mixed cg (1.e-6 seems to work quite well in general)
+MixCGInnerEps = 1.e-6
+# maximum number of inner solver iterations for MIXEDCG per restart
+MixCGMaxIter = 10000
+
+BeginOperator TMWILSON
+  2kappaMu = 0.05
+  kappa = 0.177
+  UseEvenOdd = yes
+  Solver = MIXEDCG
+  SolverPrecision = 1e-14
+  # MIXEDCG internally calculates the number of outer iterations from MaxSolverIterations and MixCGMaxIter, but does
+  # at least 10 outer iterations
+  MaxSolverIterations = 30000
+  AddDownPropagator = yes
+EndOperator
+
+BeginOperator CLOVER
+  2kappaMu = 0.05
+  kappa = 0.177
+  csw = 1.74
+  Solver = MIXEDCG
+  SolverPrecision = 1e-14
+  MaxSolverIterations = 30000
+  AddDownPropagator = yes
+EndOperator
diff --git a/sample-input/sample-pcg.input b/sample-input/sample-pcg.input
new file mode 100644
index 000000000..359b64efe
--- /dev/null
+++ b/sample-input/sample-pcg.input
@@ -0,0 +1,57 @@
+# example input file for invert
+# for the PCG solver
+# requires a 4^4 gauge configuration conf.0000
+
+L=4
+T=4
+NrXProcs = 2
+NrYProcs = 2
+NrZProcs = 2
+
+NoBlocksT = 2
+NoBlocksX = 2
+NoBlocksY = 2
+NoBlocksZ = 2
+
+DebugLevel = 3
+InitialStoreCounter = 0
+Measurements = 1
+2kappamu = 0.005
+kappa = 0.177
+BCAngleT = 1
+GaugeConfigInputFile = conf
+UseEvenOdd = no
+
+SourceType = timeslice
+#ReadSource = no
+#NoSamples = 1
+UseSloppyPrecision = no
+OmpNumThreads = 2
+
+ReadSource = no
+SourceFilename = bla
+SourceTimeSlice = 38
+Indices = 0
+
+BeginDeflationInit
+  GlobalVectorsNum = 20
+  OuterloopMsap = yes
+  NiterMsap = 3
+  NcycleMsap = 5
+  NiterMsapDfl = 20
+  NcycleMsapDfl = 5
+  NsmoothMsapDfl = 6
+  UseiQDFL = no
+EndDeflationInit
+
+BeginOperator TMWILSON
+  2kappaMu = 0.005
+  kappa = 0.177
+  UseEvenOdd = no
+  Solver = PCG
+  SolverPrecision = 1e-14
+  MaxSolverIterations = 325
+#  AddDownPropagator = yes
+EndOperator
+
+
diff --git a/sample-input/sample-rw.input b/sample-input/sample-rw.input
new file mode 100644
index 000000000..cb147405b
--- /dev/null
+++ b/sample-input/sample-rw.input
@@ -0,0 +1,50 @@
+# example input file for invert
+# for CG solver
+# requires a 4^4 gauge configuration conf.0000
+
+L=4
+T=4
+NrXProcs = 2
+NrYProcs = 2
+NrZProcs = 2
+
+NoBlocksT = 2
+NoBlocksX = 2
+NoBlocksY = 2
+NoBlocksZ = 2
+
+
+DebugLevel = 5
+InitialStoreCounter = 0
+Measurements = 1
+2kappamu = 0.001
+kappa = 0.177
+BCAngleT = 1
+ThetaX = 1
+GaugeConfigInputFile = conf
+UseEvenOdd = yes
+
+OmpNumThreads = 2
+
+ComputeReweightingFactor = yes
+NoReweightingSamples = 2
+
+## monomial only for reweighting
+BeginMonomial CLOVERDETRATIORW
+  Timescale = 1
+  CSW = 1.00
+  # nominator parameters
+  2KappaMu = 0.01
+  kappa = 0.138
+  rho = 0.0
+  # denominator parameters
+  2KappaMu2 = 0.01
+  ## for reweighting only rho=rho2 makes sense
+  rho2 = 0.0
+  kappa2 = 0.1380001
+  AcceptancePrecision =  1.e-20
+  Name = cloverdetratiorw
+  solver = CG
+EndMonomial
+
+
diff --git a/sample-input/sample.cg-wilson-qphix.input b/sample-input/sample.cg-wilson-qphix.input
new file mode 100644
index 000000000..057f038f7
--- /dev/null
+++ b/sample-input/sample.cg-wilson-qphix.input
@@ -0,0 +1,64 @@
+L=24
+T=48
+
+# number of MPI tasks in T direction is computed automatically
+ 
+# number of MPI tasks in Z direction
+nrzprocs=2
+
+# currently only two-dimensional parallelisation works properly
+# also, qphix is slower when parallelized in x or y (especially x is problematic)
+nrxprocs=1
+nryprocs=1
+
+# OpenMP threads per MPI task
+# down below, Ncores * Sy * Sz = ompnumthreads
+ompnumthreads = 12
+
+DebugLevel = 3
+InitialStoreCounter = 1000
+Measurements = 1
+2kappamu = 0.0008238
+kappa = 0.1373
+csw = 1.57551
+BCAngleT = 0
+GaugeConfigInputFile = conf
+disableiochecks=yes
+UseEvenOdd = yes
+
+SourceType = point
+indices = 0
+ReadSource = no
+
+userelativeprecision = no
+
+BeginExternalInverter QPHIX
+  # physical cores per MPI task
+  NCores = 12
+  
+  # block sizes (see qphix papers for details)
+  By = 4
+  Bz = 4
+  MinCt = 2
+  
+  # thread geometry
+  # ompnumthreads = Ncores * Sy * Sz
+  # hyperthreads should be specified here
+  Sy = 1
+  Sz = 1
+  
+  # paddings in XY and XYZ blocks
+  PadXY = 0
+  PadXYZ = 0
+EndExternalInverter
+
+BeginOperator WILSON
+  kappa = 0.132
+  Solver = cg
+  # solverprecision must be <= 1e-13 such that the CG in double precision is launched
+  # the other floating point precisions (single, half) seem to segfault currently
+  SolverPrecision = 1e-17
+  MaxSolverIterations = 20000
+  useqphixinverter = yes
+EndOperator
+
diff --git a/sf/sf_calc_action.c b/sf/sf_calc_action.c
index 7767b7901..06f68cdd0 100644
--- a/sf/sf_calc_action.c
+++ b/sf/sf_calc_action.c
@@ -7,7 +7,7 @@
 ********************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/sf/sf_get_rectangle_staples.c b/sf/sf_get_rectangle_staples.c
index 195ffe225..c633e247d 100644
--- a/sf/sf_get_rectangle_staples.c
+++ b/sf/sf_get_rectangle_staples.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/sf/sf_get_staples.c b/sf/sf_get_staples.c
index fc27981cf..5a1a07b13 100644
--- a/sf/sf_get_staples.c
+++ b/sf/sf_get_staples.c
@@ -21,7 +21,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/sf/sf_observables.c b/sf/sf_observables.c
index 0bb3d17dd..54eabe286 100644
--- a/sf/sf_observables.c
+++ b/sf/sf_observables.c
@@ -7,7 +7,7 @@
 ********************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/sf/sf_utils.c b/sf/sf_utils.c
index ecf0c0c75..d7f16d4cd 100644
--- a/sf/sf_utils.c
+++ b/sf/sf_utils.c
@@ -2,7 +2,7 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/sighandler.c b/sighandler.c
index 2c10bb592..31ef1da0d 100644
--- a/sighandler.c
+++ b/sighandler.c
@@ -33,11 +33,11 @@
  ************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  include <mpi.h>
 #endif
 
@@ -59,7 +59,7 @@ void catch_ill_inst(int s){
   fprintf(stderr, "Please check whether your processor supports SSE1/2/3) instructions!\n");
   fprintf(stderr, "Aborting...\n");
   fflush(stdout);
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Abort(MPI_COMM_WORLD, 1);
   MPI_Finalize();
 #endif
diff --git a/smearing/Makefile.in b/smearing/Makefile.in
index 408565722..846293f7c 100644
--- a/smearing/Makefile.in
+++ b/smearing/Makefile.in
@@ -61,7 +61,7 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) -c $<
 
 
diff --git a/smearing/ape.ih b/smearing/ape.ih
index c1ec7fc10..44823a3eb 100644
--- a/smearing/ape.ih
+++ b/smearing/ape.ih
@@ -1,5 +1,5 @@
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
 
 #include <stdio.h>
diff --git a/smearing/hex.ih b/smearing/hex.ih
index 33971fbab..3eda3f7eb 100644
--- a/smearing/hex.ih
+++ b/smearing/hex.ih
@@ -1,5 +1,5 @@
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
 
 #include <stdio.h>
diff --git a/smearing/hex_stout_exclude_one.c b/smearing/hex_stout_exclude_one.c
index 4071e29a8..5327fde2e 100644
--- a/smearing/hex_stout_exclude_one.c
+++ b/smearing/hex_stout_exclude_one.c
@@ -1,9 +1,10 @@
 #include "hex.ih"
+#include "global.h"
 
 void stout_exclude_one(su3_tuple **buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in)
 {
   static su3 tmp;
-  
+
 #define _MULTIPLY_AND_EXPONENTIATE(x, principal, component) \
   { \
     _su3_times_su3d(tmp, staples[component / 4][x][component % 4], buff_in[x][principal]); \
diff --git a/smearing/hyp.ih b/smearing/hyp.ih
index b71c28c43..73b4b1c18 100644
--- a/smearing/hyp.ih
+++ b/smearing/hyp.ih
@@ -1,5 +1,5 @@
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
 
 #include <stdio.h>
diff --git a/smearing/stout.ih b/smearing/stout.ih
index 8f36ee701..3fea8041e 100644
--- a/smearing/stout.ih
+++ b/smearing/stout.ih
@@ -1,5 +1,5 @@
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
 #ifdef SSE
 # undef SSE
diff --git a/smearing/utils.ih b/smearing/utils.ih
index 28914576b..3546f5dd8 100644
--- a/smearing/utils.ih
+++ b/smearing/utils.ih
@@ -1,5 +1,5 @@
 #ifdef HAVE_CONFIG_H
-# include <config.h>
+# include <tmlqcd_config.h>
 #endif
 
 #include <stdio.h>
diff --git a/solver/M_plus_block_psi_body.c b/solver/M_plus_block_psi_body.c
new file mode 100644
index 000000000..b0aa998a5
--- /dev/null
+++ b/solver/M_plus_block_psi_body.c
@@ -0,0 +1,40 @@
+/***********************************************************************
+ * Copyright (C) 2016 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+
+void _PSWITCH(Mtm_plus_block_psi)(_PTSWITCH(spinor) * const l, _PTSWITCH(spinor) * const k, const int i) {
+  block * const cur = block_list + i;  /* l: output, k: input, i: block index; names resolve via _PSWITCH/_PTSWITCH */
+  const int half = cur->volume/2;
+  _PTSWITCH(spinor) * const w0 = _PTSWITCH(g_spinor_field)[DUM_MATRIX] + i*half, * const w1 = _PTSWITCH(g_spinor_field)[DUM_MATRIX+1] + i*half;
+  _PSWITCH(Block_H_psi)(cur, w1, k, EO);                  /* w0/w1: this block's slices of the two scratch fields */
+  _PSWITCH(mul_one_pm_imu_inv)(w1, +1., half);
+  _PSWITCH(Block_H_psi)(cur, w0, w1, OE);
+  _PSWITCH(mul_one_pm_imu_sub_mul)(l, k, w0, +1., half);
+}
+
+void _PSWITCH(Msw_plus_block_psi)(_PTSWITCH(spinor) * l, _PTSWITCH(spinor) *  k, const int i) {
+  block * const cur = block_list + i;  /* l: output, k: input, i: block index; names resolve via _PSWITCH/_PTSWITCH */
+  const int half = cur->volume/2;
+  _PTSWITCH(spinor) * const w0 = _PTSWITCH(g_spinor_field)[DUM_MATRIX] + i*half, * const w1 = _PTSWITCH(g_spinor_field)[DUM_MATRIX+1] + i*half;
+  _PSWITCH(Block_H_psi)(cur, w1, k, EO);                  /* w0/w1: this block's slices of the two scratch fields */
+  _PSWITCH(assign_mul_one_sw_pm_imu_inv_block)(EE, w0, w1, g_mu, cur);
+  _PSWITCH(Block_H_psi)(cur, w1, w0, OE);
+  _PSWITCH(assign_mul_one_sw_pm_imu_block)(OO, w0, k, g_mu, cur);
+  _PSWITCH(diff)(l, w0, w1, half);                        /* l = w0 - w1 */
+}
diff --git a/solver/Makefile.in b/solver/Makefile.in
index 028ed20e6..14bc3fe42 100644
--- a/solver/Makefile.in
+++ b/solver/Makefile.in
@@ -33,14 +33,19 @@ libsolver_TARGETS = bicgstab_complex gmres incr_eigcg eigcg restart_X ortho \
 	            cgs_real cg_her mr chrono_guess \
 	            bicgstabell bicgstab2 eigenvalues fgmres fgmres4bispinors \
 	            gcr gcr4complex diagonalise_general_matrix \
+	            cgne4complex mr4complex fgmres4complex \
 	            quicksort gmres_dr lu_solve jdher Msap \
                     jdher_bi gram-schmidt eigenvalues_bi \
                     bicgstab_complex_bi cg_her_bi pcg_her \
                     sub_low_ev cg_her_nd poly_precon \
                     generate_dfl_subspace dfl_projector \
-                    cg_mms_tm cg_mms_tm_nd solver_field sumr mixed_cg_her index_jd \
+                    cg_mms_tm cg_mms_tm_nd mixed_cg_mms_tm_nd \
+                    solver_field sumr mixed_cg_her index_jd \
+		    rg_mixed_cg_her rg_mixed_cg_her_nd \
                     dirac_operator_eigenvectors	spectral_proj \
-                    jdher_su3vect cg_her_su3vect eigenvalues_Jacobi
+                    jdher_su3vect cg_her_su3vect eigenvalues_Jacobi \
+		    mcr cr mcr4complex bicg_complex monomial_solve \
+		    solver_types init_guess
 
 libsolver_OBJECTS = $(addsuffix .o, ${libsolver_TARGETS})
 
@@ -65,7 +70,7 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) -c $<
 
 
diff --git a/solver/Msap.c b/solver/Msap.c
index 77a58427b..99956d081 100644
--- a/solver/Msap.c
+++ b/solver/Msap.c
@@ -1,4 +1,4 @@
-/***********************************************************************
+/***********************************************************************
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
  *
  * This file is part of tmLQCD.
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -26,9 +26,12 @@
 #include <string.h>
 #include "global.h"
 #include "su3.h"
+#include "gamma.h"
 #include "start.h"
 #include "linalg_eo.h"
 #include "operator/tm_operators.h"
+#include "operator/clovertm_operators.h"
+#include "read_input.h"
 #include "boundary.h"
 #include "gmres.h"
 #include "solver.h"
@@ -42,29 +45,55 @@ void dummy_Di(spinor * const P, spinor * const Q, const int i) {
   return;
 }
 
+#define _PTSWITCH(s) s /* first instantiation: type/field names are used unchanged */
+#define _PSWITCH(s) s /* first instantiation: function names are used unchanged */
 
-void Mtm_plus_block_psi(spinor * const l, spinor * const k, const int i) {
+#include"M_plus_block_psi_body.c" /* defines Mtm_plus_block_psi and Msw_plus_block_psi */
+
+#undef _PTSWITCH
+#undef _PSWITCH
+
+#define _PTSWITCH(s) s ## 32 /* second instantiation: spinor -> spinor32, g_spinor_field -> g_spinor_field32 */
+#define _PSWITCH(s) s ## _32 /* second instantiation: function names get the _32 suffix */
+// this is ugly! NOTE(review): DUM_MATRIX is set to 0 here and no restore is visible below, so later code in this file sees 0 too - confirm intended
+#define DUM_MATRIX 0
+
+#include"M_plus_block_psi_body.c" /* defines Mtm_plus_block_psi_32 and Msw_plus_block_psi_32 */
+
+#undef _PTSWITCH
+#undef _PSWITCH
+
+void Mtm_plus_sym_block_psi(spinor * const l, spinor * const k, const int i) { /* l: output, k: input, i: index into block_list */
   block * blk = &block_list[i];
   int vol = (*blk).volume/2;
-  Block_H_psi(blk, g_spinor_field[DUM_MATRIX+1], k, EO);
-  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., vol);
-  Block_H_psi(blk, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], OE);
-  mul_one_pm_imu_sub_mul(l, k, g_spinor_field[DUM_MATRIX], +1., vol);
+  Block_H_psi(blk, &g_spinor_field[DUM_MATRIX+1][i*vol], k, EO); /* offset i*vol: every block works on its own slice of the scratch field */
+  mul_one_pm_imu_inv(&g_spinor_field[DUM_MATRIX+1][i*vol], +1., vol);
+  Block_H_psi(blk, &g_spinor_field[DUM_MATRIX][i*vol], &g_spinor_field[DUM_MATRIX+1][i*vol], OE);
+  mul_one_pm_imu_inv(&g_spinor_field[DUM_MATRIX][i*vol], +1., vol);
+  diff(l, k, &g_spinor_field[DUM_MATRIX][i*vol], vol); /* l = k - (this block's slice of the DUM_MATRIX field) */
   return;
 }
 
-void Mtm_plus_sym_block_psi(spinor * const l, spinor * const k, const int i) {
+void Msw_plus_sym_block_psi(spinor *  l, spinor *  k, const int i) {
+  /* Not usable yet (see message): stop on every rank; only the stdio rank reports why. */
+  if(g_proc_id == g_stdio_proc){
+    printf("==================WARNNING WARNNING WARNNING ==================\n");
+    printf("Msw_plus_sym_block_psi doesn't work properly yet because we need the inverse of (1+T +img5) on odd sites for +mu which is not computed or stored ....\n");
+    printf("==================WARNNING WARNNING WARNNING ===================\n");
+    printf("Exiting ........\n");
+  }
+  exit(100);
   block * blk = &block_list[i];
   int vol = (*blk).volume/2;
-  Block_H_psi(blk, g_spinor_field[DUM_MATRIX+1], k, EO);
-  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., vol);
-  Block_H_psi(blk, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], OE);
-  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX], +1., vol);
-  diff(l, k, g_spinor_field[DUM_MATRIX], vol);
+  Block_H_psi(blk, &g_spinor_field[DUM_MATRIX+1][i*vol], k, EO);
+  assign_mul_one_sw_pm_imu_inv_block(EE, &g_spinor_field[DUM_MATRIX][i*vol], &g_spinor_field[DUM_MATRIX+1][i*vol] , g_mu,blk);
+  Block_H_psi(blk, &g_spinor_field[DUM_MATRIX+1][i*vol], &g_spinor_field[DUM_MATRIX][i*vol], OE);
+  //FIXME: this step needs the odd-site inverse for +mu, which the message above says is not computed or stored
+  assign_mul_one_sw_pm_imu_inv_block(OO, &g_spinor_field[DUM_MATRIX][i*vol], &g_spinor_field[DUM_MATRIX+1][i*vol] , g_mu, blk);
+  diff(l, k, &g_spinor_field[DUM_MATRIX][i*vol], vol);
   return;
 }
 
-
 void dummy_D0(spinor * const P, spinor * const Q) {
   Block_D_psi(&block_list[0], P, Q);
   return;
@@ -75,12 +104,12 @@ void dummy_D1(spinor * const P, spinor * const Q) {
   return;
 }
 
-void Msap(spinor * const P, spinor * const Q, const int Ncy) {
+void Msap(spinor * const P, spinor * const Q, const int Ncy, const int Niter) {
   int blk, ncy = 0, eo, vol;
   spinor * r, * a, * b;
   double nrm;
   spinor ** solver_field = NULL;
-  const int nr_sf = 3;
+  const int nr_sf = 6;
 
   /* 
    * here it would be probably better to get the working fields as a parameter 
@@ -99,22 +128,23 @@ void Msap(spinor * const P, spinor * const Q, const int Ncy) {
       D_psi(r, P);
       diff(r, Q, r, VOLUME);
       nrm = square_norm(r, VOLUME, 1);
-      if(g_proc_id == 0 && g_debug_level > 1 && eo == 1) {
+      if(g_proc_id == 0 && g_debug_level > 2 && eo == 1) {  /*  GG, was 1 */
 	printf("Msap: %d %1.3e\n", ncy, nrm);
+	fflush(stdout);
       }
       /* choose the even (odd) block */
-      
+
       /*blk = eolist[eo];*/
-      
+
       for (blk = 0; blk < nb_blocks; blk++) {
-      	if(block_list[blk].evenodd == eo) {
+	if(block_list[blk].evenodd == eo) {
 	  vol = block_list[blk].volume;
-	  
+
 	  /* get part of r corresponding to block blk into b */
 	  copy_global_to_block(b, r, blk);
-	  
-	  mrblk(a, b, 16, 1.e-31, 1, vol, &dummy_Di, blk);
-	  
+	  // does this work?? i.e. solver_field[3]
+	  mrblk(a, b, solver_field[3], Niter, 1.e-31, 1, vol, &dummy_Di, blk);
+
 	  /* add a up to full spinor P */
 	  add_block_to_global(P, a, blk);
 	}
@@ -125,30 +155,91 @@ void Msap(spinor * const P, spinor * const Q, const int Ncy) {
   return;
 }
 
+// This is a smoother based on the even/odd preconditioned CG
+// it applies Ncy iterations of even/odd CG to spinor Q
+// and stores the result in P
 
-void Msap_eo(spinor * const P, spinor * const Q, const int Ncy) {
-  int blk, ncy = 0, eo, vol;
-  spinor * r, * a, * b;
+void CGeoSmoother(spinor * const P, spinor * const Q, const int Ncy, const int dummy) {
+  spinor ** solver_field = NULL;
+  const int nr_sf = 5;
+  double musave = g_mu;
+  g_mu = g_mu1;  /* the smoother runs with g_mu1; g_mu is restored before returning */
+  init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
+  /* Q: lexicographic input, P: lexicographic output; Ncy is cg_her's iteration cap; dummy is unused */
+  convert_lexic_to_eo(solver_field[0], solver_field[1], Q);
+  if(g_c_sw > 0)
+    assign_mul_one_sw_pm_imu_inv(EE,solver_field[2], solver_field[0], g_mu);
+  else
+    assign_mul_one_pm_imu_inv(solver_field[2], solver_field[0], +1., VOLUME/2);
+  /* solver_field[2] now holds the even-site term that the final update below adds to */
+  Hopping_Matrix(OE, solver_field[4], solver_field[2]); 
+  /* The sign is plus, since in Hopping_Matrix */
+  /* the minus is missing                      */
+  assign_mul_add_r(solver_field[4], +1., solver_field[1], VOLUME/2);
+  /* Do the inversion with the preconditioned  */
+  /* matrix to get the odd sites               */
+  gamma5(solver_field[4], solver_field[4], VOLUME/2);
+  if(g_c_sw > 0) {
+    cg_her(solver_field[3], solver_field[4], Ncy, 1.e-8, 1, 
+	   VOLUME/2, &Qsw_pm_psi);
+    Qsw_minus_psi(solver_field[3], solver_field[3]);
+    /* Reconstruct the even sites; solver_field[0] is free scratch by now, so    */
+    /* solver_field[2] survives for the final update (as in the branch below)    */
+    Hopping_Matrix(EO, solver_field[0], solver_field[3]);
+    assign_mul_one_sw_pm_imu_inv(EE,solver_field[4],solver_field[0], g_mu);
+  }
+  else {
+    cg_her(solver_field[3], solver_field[4], Ncy, 1.e-8, 1, 
+	   VOLUME/2, &Qtm_pm_psi);
+    Qtm_minus_psi(solver_field[3], solver_field[3]);
+    
+    /* Reconstruct the even sites                */
+    Hopping_Matrix(EO, solver_field[4], solver_field[3]);
+    mul_one_pm_imu_inv(solver_field[4], +1., VOLUME/2);
+  }
+  
+  /* The sign is plus, since in Hopping_Matrix */
+  /* the minus is missing                      */
+  assign_add_mul_r(solver_field[2], solver_field[4], +1., VOLUME/2);
+  
+  convert_eo_to_lexic(P, solver_field[2], solver_field[3]); 
+  g_mu = musave;
+  finalize_solver(solver_field, nr_sf);
+  return;  
+}
+
+void Msap_eo_old(spinor * const P, spinor * const Q, const int Ncy, const int Niter) {
+  int blk, ncy = 0, eo, vol, vols;
+  spinor * r, * a, * b, * c;
   double nrm;
+  double musave = g_mu;
+  double kappasave = g_kappa;
   spinor * b_even, * b_odd, * a_even, * a_odd;
   spinor ** solver_field = NULL;
-  const int nr_sf = 3;
+  // also get space for mrblk! 6 = 3+3
+  const int nr_sf = 6;
 
+  if(kappa_dflgen > 0) {
+    g_kappa = kappa_dfl;
+  }
+  if(mu_dflgen > -10) {
+    g_mu = mu_dfl;
+    // make sure the sign is correct!
+    if(g_mu*musave < 0) g_mu *= -1.;
+  }
+  boundary(g_kappa);
   /* 
    * here it would be probably better to get the working fields as a parameter 
    * from the calling function
    */
-  init_solver_field(&solver_field, VOLUME, nr_sf);
+  vols = block_list[0].volume/2+block_list[0].spinpad;
+  vol = block_list[0].volume/2;
+
+  init_solver_field(&solver_field, nb_blocks*2*vols, nr_sf);
   r = solver_field[0];
   a = solver_field[1];
   b = solver_field[2];
 
-  vol = block_list[0].volume/2;
-  b_even = b;
-  b_odd = b + vol + 1;
-  a_even = a;
-  a_odd = a + vol + 1;
-
   for(ncy = 0; ncy < Ncy; ncy++) {
     /* compute the global residue        */
     /* this can be done more efficiently */
@@ -157,34 +248,188 @@ void Msap_eo(spinor * const P, spinor * const Q, const int Ncy) {
       D_psi(r, P);
       diff(r, Q, r, VOLUME);
       nrm = square_norm(r, VOLUME, 1);
-      if(g_proc_id == 0 && g_debug_level > 1 && eo == 1) {
-	printf("Msap: %d %1.3e\n", ncy, nrm);
+      if(g_proc_id == 0 && g_debug_level > 2 && eo == 0) {
+	printf("Msap_eo: %d %1.3e mu = %e\n", ncy, nrm, g_mu/2./g_kappa);
+	fflush(stdout);
       }
       /* choose the even (odd) block */
-      
+
+      // rely on nested parallelism
+      // 
+      #ifdef TM_USE_OMP
+      # pragma omp parallel for private (a_even, a_odd, b_even, b_odd, c)
+      #endif
       for (blk = 0; blk < nb_blocks; blk++) {
-      	if(block_list[blk].evenodd == eo) {
+ 	b_even = b + blk*2*vols;
+ 	b_odd = b +blk*2*vols + vols;
+ 	a_even = a + blk*2*vols;
+ 	a_odd = a + blk*2*vols + vols;
+	c = solver_field[3] + blk*vols;
+
+	if(block_list[blk].evenodd == eo) {
 	  /* get part of r corresponding to block blk into b_even and b_odd */
+
 	  copy_global_to_block_eo(b_even, b_odd, r, blk);
-	  
-	  assign_mul_one_pm_imu_inv(a_even, b_even, +1., vol);
-	  Block_H_psi(&block_list[blk], a_odd, a_even, OE);
-	  /* a_odd = a_odd - b_odd */
-	  assign_mul_add_r(a_odd, -1., b_odd, vol);
-	  
-	  mrblk(b_odd, a_odd, 3, 1.e-31, 1, vol, &Mtm_plus_block_psi, blk);
+	  if(g_c_sw > 0) {
+	    assign_mul_one_sw_pm_imu_inv_block(EE, a_even, b_even, g_mu, &block_list[blk]);
+	    Block_H_psi(&block_list[blk], a_odd, a_even, OE);
+	    /* a_odd = b_odd - a_odd */
+	    diff(a_odd, b_odd, a_odd, vol);
+
+	    mrblk(b_odd, a_odd, solver_field[3] + blk*2*3*vols, Niter, 1.e-31, 1, vol, &Msw_plus_block_psi, blk);
+	    
+	    Block_H_psi(&block_list[blk], b_even, b_odd, EO);
+	    assign(c, b_even, vol);
+	    assign_mul_one_sw_pm_imu_inv_block(EE, b_even, c, g_mu, &block_list[blk]);
+	  }
+	  else {
+	    assign_mul_one_pm_imu_inv(a_even, b_even, +1., vol);
+	    Block_H_psi(&block_list[blk], a_odd, a_even, OE);
+	    /* a_odd = b_odd - a_odd */
+	    diff(a_odd, b_odd, a_odd, vol);
 
-	  Block_H_psi(&block_list[blk], b_even, b_odd, EO);
-	  mul_one_pm_imu_inv(b_even, +1., vol);
+	    mrblk(b_odd, a_odd, solver_field[3] + blk*2*3*vols, Niter, 1.e-31, 1, vol, &Mtm_plus_block_psi, blk);
+
+	    Block_H_psi(&block_list[blk], b_even, b_odd, EO);
+	    mul_one_pm_imu_inv(b_even, +1., vol);
+	  }
 	  /* a_even = a_even - b_even */
-	  assign_add_mul_r(a_even, b_even, -1., vol);
+	  diff(a_even, a_even, b_even, vol);
 
 	  /* add even and odd part up to full spinor P */
 	  add_eo_block_to_global(P, a_even, b_odd, blk);
+
+	}
+      }
+    }
+  }
+  finalize_solver(solver_field, nr_sf);
+  g_mu = musave;
+  g_kappa = kappasave;
+  boundary(g_kappa);
+  return;
+}
+
+
+void Msap_eo(spinor * const P, spinor * const Q, const int Ncy, const int Niter) { /* P: in/out iterate, Q: source, Ncy: outer cycles, Niter: handed to mrblk_32 */
+  int ncy = 0, vol, vols;
+  spinor * r, * a, * b;
+  double nrm;
+  double musave = g_mu; /* saved so the override below can be undone before returning */
+  double kappasave = g_kappa; /* saved so the override below can be undone before returning */
+  spinor ** solver_field = NULL;
+  // also get space for mrblk! 6 = 3+3
+  const int nr_sf = 6;
+
+  if(kappa_Msap > 0) {
+    g_kappa = kappa_Msap;
+  }
+  if(mu_Msap > -10) {
+    g_mu = mu_Msap;
+    // make sure the sign is correct!
+    if(g_mu*musave < 0) g_mu *= -1.;
+  }
+  boundary(g_kappa);
+  /* 
+   * here it would be probably better to get the working fields as a parameter 
+   * from the calling function
+   */
+  vols = block_list[0].volume/2+block_list[0].spinpad;
+  vol = block_list[0].volume/2;
+
+  init_solver_field(&solver_field, nb_blocks*2*vols, nr_sf);
+  r = solver_field[0];
+  a = solver_field[1];
+  b = solver_field[2];
+
+  int * blk_e_list = malloc(nb_blocks/2*sizeof(int)); /* NOTE(review): malloc result is not checked */
+  int * blk_o_list = malloc(nb_blocks/2*sizeof(int)); /* NOTE(review): nb_blocks/2 entries assume as many even as odd blocks - confirm */
+  int iblke = 0, iblko = 0;
+  for(int blk = 0; blk < nb_blocks; blk++) {
+    if (block_list[blk].evenodd == 0) {
+      blk_e_list[iblke] = blk;
+      iblke++;
+    }
+    else {
+      blk_o_list[iblko] = blk;
+      iblko++;
+    }
+  }
+
+  for(ncy = 0; ncy < Ncy; ncy++) {
+    /* compute the global residue        */
+    /* this can be done more efficiently */
+    /* here only a naive implementation  */
+    for(int eo = 0; eo < 2; eo++) {
+      D_psi(r, P);
+      diff(r, Q, r, VOLUME); /* r = Q - D P, recomputed before each of the two half-sweeps */
+      nrm = square_norm(r, VOLUME, 1); /* only consumed by the debug print below */
+      if(g_proc_id == 0 && g_debug_level > 2 && eo == 0) {
+	printf("Msap_eo: %d %1.3e mu = %e\n", ncy, nrm, g_mu/2./g_kappa);
+	fflush(stdout);
+      }
+      int * blk_eo_list;
+      if(eo == 0) {
+	blk_eo_list = blk_e_list;
+      }
+      else {
+	blk_eo_list = blk_o_list;
+      }
+      /* choose the even (odd) block */
+      // rely on nested parallelism
+      // 
+      #ifdef TM_USE_OMP
+      # pragma omp parallel for 
+      #endif
+      for (int iblk = 0; iblk < nb_blocks/2; iblk++) {
+	int blk = blk_eo_list[iblk];
+ 	spinor32 * b_even = (spinor32*) (b + blk*2*vols); /* spinor-typed scratch reinterpreted as spinor32 storage */
+ 	spinor32 * b_odd = (spinor32*) (b +blk*2*vols + vols);
+ 	spinor32 * a_even = (spinor32*) (a + blk*2*vols);
+ 	spinor32 * a_odd = (spinor32*) (a + blk*2*vols + vols);
+        // mrblk needs 3 solver fields which we distribute according to the block number
+	spinor32 * c = (spinor32*) (solver_field[3] + blk*2*3*vols); /* NOTE(review): offset can exceed one field of nb_blocks*2*vols; presumably relies on fields 3..5 being contiguous - confirm */
+
+	/* get part of r corresponding to block blk into b_even and b_odd */
+	copy_global_to_block_eo_32(b_even, b_odd, r, blk);
+	if(g_c_sw > 0) {
+	  assign_mul_one_sw_pm_imu_inv_block_32(EE, a_even, b_even, g_mu, &block_list[blk]);
+	  Block_H_psi_32(&block_list[blk], a_odd, a_even, OE);
+	  /* a_odd = b_odd - a_odd */
+	  diff_32(a_odd, b_odd, a_odd, vol);
+	  
+	  mrblk_32(b_odd, a_odd, c,
+		   Niter, 1.e-31, 1, vol, &Msw_plus_block_psi_32, blk);
+	  
+	  Block_H_psi_32(&block_list[blk], b_even, b_odd, EO);
+	  assign_32(c, b_even, vol);
+	  assign_mul_one_sw_pm_imu_inv_block_32(EE, b_even, c, g_mu, &block_list[blk]);
+	}
+	else {
+	  assign_mul_one_pm_imu_inv_32(a_even, b_even, +1., vol);
+	  Block_H_psi_32(&block_list[blk], a_odd, a_even, OE);
+	  /* a_odd = b_odd - a_odd */
+	  diff_32(a_odd, b_odd, a_odd, vol);
+	  
+	  mrblk_32(b_odd, a_odd, c, 
+		   Niter, 1.e-31, 1, vol, &Mtm_plus_block_psi_32, blk);
+	  
+	  Block_H_psi_32(&block_list[blk], b_even, b_odd, EO);
+	  mul_one_pm_imu_inv_32(b_even, +1., vol);
 	}
+	/* a_even = a_even - b_even */
+	diff_32(a_even, a_even, b_even, vol);
+	
+	/* add even and odd part up to full spinor P */
+	add_eo_block_32_to_global(P, a_even, b_odd, blk);
       }
     }
   }
+  free(blk_e_list);
+  free(blk_o_list);
   finalize_solver(solver_field, nr_sf);
+  g_mu = musave; /* undo the mu_Msap override */
+  g_kappa = kappasave; /* undo the kappa_Msap override */
+  boundary(g_kappa);
   return;
 }
diff --git a/solver/Msap.h b/solver/Msap.h
index 7a1808098..3a59e16e9 100644
--- a/solver/Msap.h
+++ b/solver/Msap.h
@@ -20,8 +20,14 @@
 #ifndef _MSAP_H
 #define _MSAP_H
 
-void Msap(spinor * const P, spinor * const Q, const int Ncy);
-void Msap_eo(spinor * const P, spinor * const Q, const int Ncy);
+void Msap(spinor * const P, spinor * const Q, const int Ncy, const int Niter); /* Ncy: outer cycles, Niter: iteration count handed to the block solver */
+void Msap_eo(spinor * const P, spinor * const Q, const int Ncy, const int Niter); /* Ncy: outer cycles, Niter: iteration count handed to the block solver */
+void CGeoSmoother(spinor * const P, spinor * const Q, const int Ncy, const int dummy); /* Ncy: iteration cap given to cg_her; last argument is unused */
 void Mtm_plus_block_psi(spinor * const l, spinor * const k, const int i);
 void Mtm_plus_sym_block_psi(spinor * const l, spinor * const k, const int i);
+void Msw_plus_block_psi(spinor * l, spinor *  k, const int i);
+// NOTE: the following function does not work properly yet, because it needs the inverse of
+// 1+Too+img5, where Too is the clover term in the symmetric case
+void Msw_plus_sym_block_psi(spinor *  l, spinor *  k, const int i);
+
 #endif
diff --git a/solver/bicg_complex.c b/solver/bicg_complex.c
new file mode 100644
index 000000000..b0ff39149
--- /dev/null
+++ b/solver/bicg_complex.c
@@ -0,0 +1,128 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * The externally accessible functions are
+ *
+ *   int bicg_complex(spinor * const, spinor * const, const int, double, matrix_mult, matrix_mult_dagg)
+ *     BiCG solver
+ * 
+ **************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "solver_field.h"
+#include "bicg_complex.h"
+
+/* P inout (guess for the solving spinor)
+   Q input
+*/
+int bicg_complex(spinor * const P,spinor * const Q, const int max_iter, 
+		 double eps_sq, const int rel_prec, 
+		 const int N, matrix_mult f, matrix_mult fdagg){
+  /* BiCG for f*P = Q. P: initial guess on entry, updated in place; f/fdagg: operator and its adjoint; N: field length. */
+  /* Converged when |r|^2 <= eps_sq (rel_prec == 0) or eps_sq*|Q|^2 (rel_prec == 1). Returns iterations used, or -1. */
+  double err, squarenorm;
+  _Complex double rho0, rho1, alpha, beta, alphastar, betastar, denom;
+  int i;
+  spinor * p, * phat, * r, * rhat, *tmp, *tmp2;
+  spinor ** solver_field = NULL;
+  const int nr_sf = 6;
+
+  if(N == VOLUME) {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+  }
+  else {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
+  }
+  r = solver_field[0];
+  rhat = solver_field[1];
+  p = solver_field[2];
+  phat = solver_field[3];
+  tmp = solver_field[4];
+  tmp2 = solver_field[5];
+
+  f(tmp, P);
+  diff(r, Q, tmp, N); // r = Q - AP
+  assign(p, r, N);
+
+  // make rhat different from r, otherwise it won't work
+  // NOTE(review): the _eo generator is used for every N; confirm it fills all N sites when N == VOLUME
+  random_spinor_field_eo(tmp2, 0, RN_GAUSS);
+  assign(rhat, tmp2, N);  
+  assign(phat, tmp2, N);  
+
+  rho0 = scalar_prod(rhat, r, N, 1);
+  squarenorm = square_norm(Q, N, 1);
+
+  if(g_proc_id == g_stdio_proc && g_debug_level > 1) { // same guard as the per-iteration output below
+    printf("rho0 = %f + %fI, squarenorm = %f\n", creal(rho0), cimag(rho0), squarenorm);
+  }
+
+  for(i = 0; i < max_iter; i++){
+    err = square_norm(r, N, 1);
+    if(g_proc_id == g_stdio_proc && g_debug_level > 1) {
+      printf("%d %e\n", i, err);
+      fflush(stdout);
+    }
+
+    if((((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) && i>0) {
+      finalize_solver(solver_field, nr_sf);
+      return(i);
+    }
+
+    f(tmp, p);
+    fdagg(tmp2, phat);
+    denom = scalar_prod(phat, tmp, N, 1);
+    // breakdown: a vanishing denominator would turn alpha, and then P, into inf/NaN
+    if(creal(denom) == 0. && cimag(denom) == 0.) {
+      finalize_solver(solver_field, nr_sf);
+      return(-1);
+    }
+    alpha = rho0/denom;
+    alphastar = conj(alpha);
+
+    assign_add_mul(P, p, alpha, N);
+    assign_diff_mul(r, tmp, alpha, N);
+    assign_diff_mul(rhat, tmp2, alphastar, N);
+
+    rho1 = scalar_prod(rhat, r, N, 1);
+    if(fabs(creal(rho1)) < 1.e-25 && fabs(cimag(rho1)) < 1.e-25) {
+      finalize_solver(solver_field, nr_sf);
+      return(-1);
+    }
+    beta = rho1/rho0;
+    betastar = conj(beta);
+    mul(tmp, beta, p, N);
+    add(p, r, tmp, N);
+    mul(tmp2, betastar, phat, N);
+    add(phat, rhat, tmp2, N);
+
+    rho0 = rho1;
+  }
+  finalize_solver(solver_field, nr_sf);
+  return -1;
+}
+
+
diff --git a/solver/bicg_complex.h b/solver/bicg_complex.h
new file mode 100644
index 000000000..d52f30576
--- /dev/null
+++ b/solver/bicg_complex.h
@@ -0,0 +1,29 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _BICG_COMPLEX_H
+#define _BICG_COMPLEX_H
+
+#include"solver/matrix_mult_typedef.h"
+#include"su3.h"
+
+int bicg_complex(spinor * const P, spinor * const Q, const int max_iter, double eps_sq,
+		 const int rel_prec, const int N, matrix_mult f, matrix_mult fdagg); /* P: in/out guess, Q: source */
+
+#endif
diff --git a/solver/bicgstab2.c b/solver/bicgstab2.c
index 4c7d58d65..0163bfd61 100644
--- a/solver/bicgstab2.c
+++ b/solver/bicgstab2.c
@@ -29,7 +29,7 @@
  *************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/bicgstab_complex.c b/solver/bicgstab_complex.c
index accabcf10..daa8f5ec5 100644
--- a/solver/bicgstab_complex.c
+++ b/solver/bicgstab_complex.c
@@ -31,7 +31,7 @@
  **************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/bicgstab_complex_bi.c b/solver/bicgstab_complex_bi.c
index bdacd25a5..8a3ff8b66 100644
--- a/solver/bicgstab_complex_bi.c
+++ b/solver/bicgstab_complex_bi.c
@@ -31,7 +31,7 @@
  **************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/bicgstabell.c b/solver/bicgstabell.c
index 1550a2df0..867232cb0 100644
--- a/solver/bicgstabell.c
+++ b/solver/bicgstabell.c
@@ -29,7 +29,7 @@
  *************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/cg_her.c b/solver/cg_her.c
index 6bf0d3b1a..ed3271d5c 100644
--- a/solver/cg_her.c
+++ b/solver/cg_her.c
@@ -39,13 +39,13 @@
  **************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -66,7 +66,8 @@ int cg_her(spinor * const P, spinor * const Q, const int max_iter,
   int iteration;
   int save_sloppy = g_sloppy_precision;
   double atime, etime, flops;
-  spinor ** solver_field = NULL;
+  static spinor ** solver_field = NULL;
+  static int cg_init = 0;
   spinor * stmp;
   const int nr_sf = 3;
 
@@ -75,7 +76,7 @@ int cg_her(spinor * const P, spinor * const Q, const int max_iter,
   } 
   else {
     init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); 
-  } 
+  }
   /* initialize residue r and search vector p */
   atime = gettime();
   squarenorm = square_norm(Q, N, 1);
@@ -129,10 +130,12 @@ int cg_her(spinor * const P, spinor * const Q, const int max_iter,
   /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */
   /* 2*1608.0 because the linalg is over VOLUME/2 */
   flops = (2*(2*1608.0+2*3*4) + 2*3*4 + iteration*(2.*(2*1608.0+2*3*4) + 10*3*4))*N/1.0e6f;
-  if(g_debug_level > 0 && g_proc_id == 0 && N != VOLUME) {
+  if(g_debug_level > 0 && g_proc_id == 0) {
     printf("# CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iteration, eps_sq, etime-atime); 
-    printf("# CG: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", 
-           etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime));
+    if( N != VOLUME) {
+      printf("# CG: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", 
+	     etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime));
+    }
   }
   finalize_solver(solver_field, nr_sf);
   if(iteration > max_iter) return(-1);
diff --git a/solver/cg_her_bi.c b/solver/cg_her_bi.c
index d1f50dd62..94c3109c7 100644
--- a/solver/cg_her_bi.c
+++ b/solver/cg_her_bi.c
@@ -46,7 +46,7 @@
  **************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/cg_her_nd.c b/solver/cg_her_nd.c
index e0fe53411..b4177e803 100644
--- a/solver/cg_her_nd.c
+++ b/solver/cg_her_nd.c
@@ -39,7 +39,7 @@
  **************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -48,6 +48,7 @@
 #include "su3.h"
 #include "linalg_eo.h"
 #include "start.h"
+#include "gettime.h"
 #include "solver/matrix_mult_typedef_nd.h"
 #include "sub_low_ev.h"
 #include "solver_field.h"
@@ -60,6 +61,7 @@ int cg_her_nd(spinor * const P_up,spinor * P_dn, spinor * const Q_up, spinor * c
   double normsp, normsq, pro, err, alpha_cg, beta_cg, squarenorm;
   int iteration;
   double err1, err2;
+  double atime, etime, flops;
   spinor ** up_field = NULL;
   spinor ** dn_field = NULL;  
   const int nr_sf = 5;
@@ -67,6 +69,7 @@ int cg_her_nd(spinor * const P_up,spinor * P_dn, spinor * const Q_up, spinor * c
   init_solver_field(&up_field, VOLUMEPLUSRAND, nr_sf);
   init_solver_field(&dn_field, VOLUMEPLUSRAND, nr_sf);
 
+  atime = gettime();
   squarenorm = square_norm(Q_up, N, 1);
   squarenorm+= square_norm(Q_dn, N, 1);
   /*        !!!!   INITIALIZATION    !!!! */
@@ -130,12 +133,7 @@ int cg_her_nd(spinor * const P_up,spinor * P_dn, spinor * const Q_up, spinor * c
     }
 
     if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) {
-      assign(P_up, up_field[0], N);
-      assign(P_dn, dn_field[0], N);
-      g_sloppy_precision = 0;
-      finalize_solver(up_field, nr_sf);
-      finalize_solver(dn_field, nr_sf);
-      return(iteration+1);
+      break;
     }
 #ifdef _USE_HALFSPINOR
     if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*squarenorm) && (rel_prec == 1))) {
@@ -156,10 +154,16 @@ int cg_her_nd(spinor * const P_up,spinor * P_dn, spinor * const Q_up, spinor * c
   assign(P_up, up_field[0], N);
   assign(P_dn, dn_field[0], N);
   g_sloppy_precision = 0;  
+
+  etime = gettime();
+  if(g_debug_level > 0 && g_proc_id == 0) {
+    printf("# CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iteration, eps_sq, etime-atime); 
+  }
   
   finalize_solver(up_field, nr_sf);
   finalize_solver(dn_field, nr_sf);
-  return(-1);
+  if(iteration > max_iter) return(-1);
+  return(iteration);
 }
 
 
diff --git a/solver/cg_her_su3vect.c b/solver/cg_her_su3vect.c
index 514aea4bf..a2d2d97a2 100755
--- a/solver/cg_her_su3vect.c
+++ b/solver/cg_her_su3vect.c
@@ -25,13 +25,13 @@
  **************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
diff --git a/solver/cg_mms_tm.c b/solver/cg_mms_tm.c
index 9e88ef743..bdb38047e 100644
--- a/solver/cg_mms_tm.c
+++ b/solver/cg_mms_tm.c
@@ -28,13 +28,13 @@
  * in modulus. The code will use shift[i]^2, which are all >0
  *
  * parameters:
- * shifts are given to the solver in solver_pm->shifts
- * number of shifts is in solver_pm->no_shifts
- * the operator to invert in solver_pm->M_ndpsi
+ * shifts are given to the solver in solver_params->shifts
+ * number of shifts is in solver_params->no_shifts
+ * the operator to invert in solver_params->M_ndpsi
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -63,17 +63,17 @@ static void free_mms_tm();
 
 /* P output = solution , Q input = source */
 int cg_mms_tm(spinor ** const P, spinor * const Q,
-		 solver_pm_t * solver_pm, double * cgmms_reached_prec) {
+		 solver_params_t * solver_params) {
 
   static double normsq, pro, err, squarenorm;
-  int iteration, N = solver_pm->sdim, no_shifts = solver_pm->no_shifts;
+  int iteration, N = solver_params->sdim, no_shifts = solver_params->no_shifts;
   static double gamma, alpham1;
   spinor ** solver_field = NULL;
   double atime, etime;
   const int nr_sf = 3;
 
   atime = gettime();
-  if(solver_pm->sdim == VOLUME) {
+  if(solver_params->sdim == VOLUME) {
     init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
     init_mms_tm(no_shifts, VOLUMEPLUSRAND);
   } 
@@ -85,12 +85,12 @@ int cg_mms_tm(spinor ** const P, spinor * const Q,
   zero_spinor_field(P[0], N);
   alphas[0] = 1.0;
   betas[0] = 0.0;
-  sigma[0] = solver_pm->shifts[0]*solver_pm->shifts[0];
-  if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMS: shift %d is %e\n", 0, sigma[0]);
+  sigma[0] = solver_params->shifts[0]*solver_params->shifts[0];
+  if(g_proc_id == 0 && g_debug_level > 1) printf("# CGMMS: shift %d is %e\n", 0, sigma[0]);
 
   for(int im = 1; im < no_shifts; im++) {
-    sigma[im] = solver_pm->shifts[im]*solver_pm->shifts[im] - sigma[0];
-    if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMS: shift %d is %e\n", im, sigma[im]);
+    sigma[im] = solver_params->shifts[im]*solver_params->shifts[im] - sigma[0];
+    if(g_proc_id == 0 && g_debug_level > 1) printf("# CGMMS: shift %d is %e\n", im, sigma[im]);
     // these will be the result spinor fields
     zero_spinor_field(P[im], N);
     // these are intermediate fields
@@ -109,10 +109,10 @@ int cg_mms_tm(spinor ** const P, spinor * const Q,
   normsq = squarenorm;
 
   /* main loop */
-  for(iteration = 0; iteration < solver_pm->max_iter; iteration++) {
+  for(iteration = 0; iteration < solver_params->max_iter; iteration++) {
 
     /*   Q^2*p and then (p,Q^2*p)  */
-    solver_pm->M_psi(solver_field[2], solver_field[1]);
+    solver_params->M_psi(solver_field[2], solver_field[1]);
     // add the zero's shift
     assign_add_mul_r(solver_field[2], solver_field[1], sigma[0], N);
     pro = scalar_prod_r(solver_field[1], solver_field[2], N, 1);
@@ -143,13 +143,22 @@ int cg_mms_tm(spinor ** const P, spinor * const Q,
       // falls below a threshold
       // this is useful for computing time and needed, because otherwise
       // zita might get smaller than DOUBLE_EPS and, hence, zero
-      if(iteration > 0 && (iteration % 20 == 0) && (im == no_shifts-1)) {
-	double sn = square_norm(ps_mms_solver[im-1], N, 1);
-	if(alphas[no_shifts-1]*alphas[no_shifts-1]*sn <= solver_pm->squared_solver_prec) {
+      if(iteration > 0 && (iteration % 10 == 0) && (im == no_shifts-1)) {
+	double sn = square_norm(ps_mms_solver[no_shifts-2], N, 1);
+        err = alphas[no_shifts-1]*alphas[no_shifts-1]*sn;
+        // a while loop rather than an if, because more than one shift may have converged
+	while(((err <= solver_params->squared_solver_prec) && (solver_params->rel_prec == 0)) ||
+              ((err <= solver_params->squared_solver_prec*squarenorm) && (solver_params->rel_prec > 0))) {
 	  no_shifts--;
 	  if(g_debug_level > 2 && g_proc_id == 0) {
 	    printf("# CGMMS: at iteration %d removed one shift, %d remaining\n", iteration, no_shifts);
       	  }
+          if(no_shifts>1) {
+            sn = square_norm(ps_mms_solver[no_shifts-2], N, 1);
+            err = alphas[no_shifts-1]*alphas[no_shifts-1]*sn;
+          } else {
+            break;
+          }
 	}
       }
     }
@@ -167,12 +176,10 @@ int cg_mms_tm(spinor ** const P, spinor * const Q,
       printf("# CGMMS iteration: %d residue: %g\n", iteration, err); fflush( stdout );
     }
 
-    if( ((err <= solver_pm->squared_solver_prec) && (solver_pm->rel_prec == 0)) ||
-        ((err <= solver_pm->squared_solver_prec*squarenorm) && (solver_pm->rel_prec > 0)) ||
-        (iteration == solver_pm->max_iter -1) ) {
-      /* FIXME temporary output of precision until a better solution can be found */
-      *cgmms_reached_prec = err;
-      break;
+    if( ((err <= solver_params->squared_solver_prec) && (solver_params->rel_prec == 0)) ||
+        ((err <= solver_params->squared_solver_prec*squarenorm) && (solver_params->rel_prec > 0)) ||
+        (iteration == solver_params->max_iter -1) ) {
+        break;
     }
 
     /* Compute betas[0](i+1) = (r(i+1),r(i+1))/(r(i),r(i))
@@ -190,12 +197,12 @@ int cg_mms_tm(spinor ** const P, spinor * const Q,
   }
   etime = gettime();
   g_sloppy_precision = 0;
-  if(iteration == solver_pm->max_iter -1) iteration = -1;
+  if(iteration == solver_params->max_iter -1) iteration = -1;
   else iteration++;
   if(g_debug_level > 0 && g_proc_id == 0) {
-    printf("# CGMMS (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_pm->no_shifts, iteration, solver_pm->squared_solver_prec, etime - atime); 
+    printf("# CGMMS (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_params->no_shifts, iteration, solver_params->squared_solver_prec, etime - atime); 
   }
-  
+
   finalize_solver(solver_field, nr_sf);
   return(iteration);
 }
diff --git a/solver/cg_mms_tm.h b/solver/cg_mms_tm.h
index 1b70facbe..914e928c6 100644
--- a/solver/cg_mms_tm.h
+++ b/solver/cg_mms_tm.h
@@ -28,6 +28,6 @@
 #include "matrix_mult_typedef.h"
 #include "su3.h"
 
-int cg_mms_tm(spinor ** const P,spinor * const Q, solver_pm_t * const params, double * reached_prec);
+int cg_mms_tm(spinor ** const P,spinor * const Q, solver_params_t * const params);
 
 #endif
diff --git a/solver/cg_mms_tm_nd.c b/solver/cg_mms_tm_nd.c
index 9da378692..8d0c695cd 100644
--- a/solver/cg_mms_tm_nd.c
+++ b/solver/cg_mms_tm_nd.c
@@ -28,13 +28,13 @@
  * in modulus. The code will use shift[i]^2, which are all >0
  *
  * parameters:
- * shifts are given to the solver in solver_pm->shifts
- * number of shifts is in solver_pm->no_shifts
- * the operator to invert in solver_pm->M_ndpsi
+ * shifts are given to the solver in solver_params->shifts
+ * number of shifts is in solver_params->no_shifts
+ * the operator to invert in solver_params->M_ndpsi
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -63,17 +63,19 @@ static void free_mms_tm_nd();
 /* P output = solution , Q input = source */
 int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, 
 		 spinor * const Qup, spinor * const Qdn, 
-		 solver_pm_t * solver_pm) {
+		 solver_params_t * solver_params) {
 
   static double normsq, pro, err, squarenorm;
-  int iteration, N = solver_pm->sdim, shifts = solver_pm->no_shifts;
+  int iteration, N = solver_params->sdim, shifts = solver_params->no_shifts;
   static double gamma, alpham1;
   spinor ** solver_field = NULL;
   double atime, etime;
   const int nr_sf = 4;
 
+  if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: solving %d shifts\n", shifts);
+
   atime = gettime();
-  if(solver_pm->sdim == VOLUME) {
+  if(solver_params->sdim == VOLUME) {
     init_solver_field(&solver_field, VOLUMEPLUSRAND, 2*nr_sf);
   } 
   else {
@@ -82,7 +84,7 @@ int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn,
 
   // don't need boundaries, because we never apply f to them
   // so N is enough
-  //init_mms_tm_nd(shifts, solver_pm->N);
+  //init_mms_tm_nd(shifts, solver_params->N);
   init_mms_tm_nd(shifts, VOLUMEPLUSRAND/2);
   zero_spinor_field(Pup[0], N);
   zero_spinor_field(Pdn[0], N);
@@ -90,13 +92,13 @@ int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn,
   assign(ps_mms_solver[1], Qdn, N);
   alphas[0] = 1.0;
   betas[0] = 0.0;
-  sigma[0] = solver_pm->shifts[0]*solver_pm->shifts[0];
-  if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: shift %d is %e\n", 0, sigma[0]);
+  sigma[0] = solver_params->shifts[0]*solver_params->shifts[0];
+  if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: shift %d is %e\n", 0, solver_params->shifts[0]);
 
   /* currently only implemented for P=0 */
   for(int im = 1; im < shifts; im++) {
-    sigma[im] = solver_pm->shifts[im]*solver_pm->shifts[im] - sigma[0];
-    if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: shift %d is %e\n", im, sigma[im]);
+    sigma[im] = solver_params->shifts[im]*solver_params->shifts[im] - sigma[0];
+    if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: shift %d is %e\n", im, solver_params->shifts[im]);
     // these will be the result spinor fields
     zero_spinor_field(Pup[im], N);
     zero_spinor_field(Pdn[im], N);
@@ -118,10 +120,10 @@ int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn,
   normsq = squarenorm;
 
   /* main loop */
-  for(iteration = 0; iteration < solver_pm->max_iter; iteration++) {
+  for(iteration = 0; iteration < solver_params->max_iter; iteration++) {
 
     /*   Q^2*p and then (p,Q^2*p)  */
-    solver_pm->M_ndpsi(solver_field[6], solver_field[7], solver_field[2], solver_field[3]);
+    solver_params->M_ndpsi(solver_field[6], solver_field[7], solver_field[2], solver_field[3]);
     // add the zero's shift
     assign_add_mul_r(solver_field[6], solver_field[2], sigma[0], N);
     assign_add_mul_r(solver_field[7], solver_field[3], sigma[0], N);
@@ -155,14 +157,36 @@ int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn,
       // falls below a threshold
       // this is useful for computing time and needed, because otherwise
       // zita might get smaller than DOUBLE_EPS and, hence, zero
-      if(iteration > 0 && (iteration % 20 == 0) && (im == shifts-1)) {
-	double sn = square_norm(ps_mms_solver[2*im], N, 1);
-	sn += square_norm(ps_mms_solver[2*im+1], N, 1);
-	if(alphas[shifts-1]*alphas[shifts-1]*sn <= solver_pm->squared_solver_prec) {
+      if(iteration > 0 && (iteration % 10 == 0) && (im == shifts-1)) {
+        double sn = square_norm(ps_mms_solver[2*(shifts-1)], N, 1);
+        sn += square_norm(ps_mms_solver[2*(shifts-1)+1], N, 1);
+        err = alphas[shifts-1]*alphas[shifts-1]*sn;
+	while(((err <= solver_params->squared_solver_prec) && (solver_params->rel_prec == 0)) ||
+              ((err <= solver_params->squared_solver_prec*squarenorm) && (solver_params->rel_prec > 0))) {
 	  shifts--;
+          // for testing purpose
+	  if(g_debug_level > 3) {
+	    if (g_proc_id == 0) printf("# CGMMSND: residual of remaining shifts\n");
+	    if (g_proc_id == 0) printf("#\t id\t\t shift\t residual\n");
+            for(int is = shifts; is>0; is--) {
+              sn = square_norm(ps_mms_solver[2*is], N, 1);
+              sn += square_norm(ps_mms_solver[2*is+1], N, 1);
+              err = alphas[is]*alphas[is]*sn;
+              if (g_proc_id == 0) printf("#\t %d\t\t %e\t %e\n", is, solver_params->shifts[is], solver_params->rel_prec ? err/squarenorm : err);
+            }
+            if (g_proc_id == 0) printf("#\t %d\t\t %e\t %e\n", 0, solver_params->shifts[0], solver_params->rel_prec ? normsq/squarenorm : normsq);
+	  }
 	  if(g_debug_level > 2 && g_proc_id == 0) {
-	    printf("# CGMMSND: at iteration %d removed one shift, %d remaining\n", iteration, shifts);
+	    printf("# CGMMSND: at iteration %d removed one shift with residual %e. %d shifts remaining\n", iteration, solver_params->rel_prec ? err/squarenorm : err, shifts);
 	  }
+          // compute the residual of the next shift and keep looping while shifts have converged
+          if(shifts>1) {
+            sn = square_norm(ps_mms_solver[2*(shifts-1)], N, 1);
+            sn += square_norm(ps_mms_solver[2*(shifts-1)+1], N, 1);
+            err = alphas[shifts-1]*alphas[shifts-1]*sn;
+          } else {
+            break;
+          }
 	}
       }
     }
@@ -182,9 +206,9 @@ int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn,
       printf("# CGMMSND iteration: %d residue: %g\n", iteration, err); fflush( stdout );
     }
 
-    if( ((err <= solver_pm->squared_solver_prec) && (solver_pm->rel_prec == 0)) ||
-	((err <= solver_pm->squared_solver_prec*squarenorm) && (solver_pm->rel_prec > 0)) ||
-        (iteration == solver_pm->max_iter -1) ) {
+    if( ((err <= solver_params->squared_solver_prec) && (solver_params->rel_prec == 0)) ||
+	((err <= solver_params->squared_solver_prec*squarenorm) && (solver_params->rel_prec > 0)) ||
+        (iteration == solver_params->max_iter -1) ) {
       break;
     }
 
@@ -205,12 +229,12 @@ int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn,
   }
   etime = gettime();
   g_sloppy_precision = 0;
-  if(iteration == solver_pm->max_iter -1) iteration = -1;
+  if(iteration == solver_params->max_iter -1) iteration = -1;
   else iteration++;
   if(g_debug_level > 0 && g_proc_id == 0) {
-    printf("# CGMMS (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_pm->no_shifts, iteration, solver_pm->squared_solver_prec, etime - atime); 
+    printf("# CGMMSND (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_params->no_shifts, iteration, solver_params->squared_solver_prec, etime - atime); 
   }
-  
+
   finalize_solver(solver_field, 2*nr_sf);
   return(iteration);
 }
diff --git a/solver/cg_mms_tm_nd.h b/solver/cg_mms_tm_nd.h
index 1c767d903..bf1c57f25 100644
--- a/solver/cg_mms_tm_nd.h
+++ b/solver/cg_mms_tm_nd.h
@@ -29,6 +29,6 @@
 
 int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, 
 		 spinor * const Qup, spinor * const Qdn, 
-		 solver_pm_t * solver_pm);
+		 solver_params_t * solver_params);
 
 #endif
diff --git a/solver/cgne4complex.c b/solver/cgne4complex.c
new file mode 100644
index 000000000..d49630e9c
--- /dev/null
+++ b/solver/cgne4complex.c
@@ -0,0 +1,89 @@
+/***********************************************************************
+ * Copyright (C) 2013 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include<stdlib.h>
+#include<stdio.h>
+#include<math.h>
+#include<string.h>
+#include"global.h"
+#include"su3.h"
+#include"linalg_eo.h"
+#include"gettime.h"
+#include"gcr4complex.h"
+#include"cgne4complex.h"
+
+int cgne4complex(_Complex double * const P, _Complex double * const Q, 
+		 const int max_iter, const double eps_sq, const int rel_prec,
+		 const int N, const int lda, c_matrix_mult f) {
+  /* CG iteration for f*P = Q on complex vectors of length N; P is the start guess and is updated in place */
+  double normsq, pro, err, alpha_cg, beta_cg, squarenorm;
+  _Complex double *w_f[3], * _w_f, *stmp;
+  double atime, etime;
+  int iter;
+  /* one buffer of 3*lda elements, cut into three work vectors; NOTE(review): malloc result is not checked */
+  _w_f = (_Complex double *)malloc(3*lda*sizeof(_Complex double));
+  w_f[0] = _w_f; w_f[1] = _w_f+lda; w_f[2] = _w_f+2*lda;
+  
+    /* initialize residue r and search vector p */
+  atime = gettime();
+  squarenorm = lsquare_norm(Q, N, 1);
+
+  f(w_f[0], P);  
+  /* presumably w_f[1] = Q - f*P (start residue, confirm ldiff argument order); w_f[2] is a copy used as first search vector */
+  ldiff(w_f[1], Q, w_f[0], N);
+  memcpy(w_f[2], w_f[1], N*sizeof(_Complex double));
+  normsq=lsquare_norm(w_f[1], N, 1);
+
+  /* main loop */
+  for(iter = 1; iter <= max_iter; iter++) {
+    f(w_f[0], w_f[2]);
+    pro = lscalar_prod_r(w_f[2], w_f[0], N, 1);
+    alpha_cg = normsq / pro;
+    lassign_add_mul_r(P, w_f[2], alpha_cg, N);
+    /* w_f[0] is turned into the updated residue (presumably -alpha_cg*w_f[0] + w_f[1]); err is its squared norm */
+    lassign_mul_add_r(w_f[0], -alpha_cg, w_f[1], N);
+    err = lsquare_norm(w_f[0], N, 1);
+    if(g_proc_id == g_stdio_proc && g_debug_level > 2) {
+      printf("lCG: iterations: %d res^2 %e\n", iter, err);
+      fflush(stdout);
+    }
+    /* stop on err <= eps_sq (rel_prec == 0) or err <= eps_sq*squarenorm of Q (rel_prec == 1) */
+    if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) {
+      break;
+    }
+
+    beta_cg = err / normsq;
+    lassign_mul_add_r(w_f[2], beta_cg, w_f[0], N);
+    stmp = w_f[0]; /* swap the two pointers so that w_f[1] refers to the updated residue */
+    w_f[0] = w_f[1];
+    w_f[1] = stmp;
+    normsq = err;
+  }
+  etime = gettime();
+  if(g_debug_level > 0 && g_proc_id == 0) {
+    printf("# lCG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iter, eps_sq, etime-atime); 
+  }
+  free(_w_f);
+  if(iter > max_iter) return(-1); /* loop ended without reaching the requested precision */
+  return(iter); /* iteration count at which the precision test succeeded */
+
+}
diff --git a/solver/cgne4complex.h b/solver/cgne4complex.h
new file mode 100644
index 000000000..b4733d300
--- /dev/null
+++ b/solver/cgne4complex.h
@@ -0,0 +1,28 @@
+#ifndef _CGNE4COMPLEX_H
+#define _CGNE4COMPLEX_H
+
+/***********************************************************************
+ * Copyright (C) 2013 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+
+int cgne4complex(_Complex double * const P, _Complex double * const Q, 
+		 const int max_iter, const double eps_sq, const int rel_prec,
+		 const int N, const int lda, c_matrix_mult f);
+
+#endif
diff --git a/solver/cgs_real.c b/solver/cgs_real.c
index d55c385a0..9dca7143b 100644
--- a/solver/cgs_real.c
+++ b/solver/cgs_real.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/chrono_guess.c b/solver/chrono_guess.c
index a2ecfd7f9..695c12b94 100644
--- a/solver/chrono_guess.c
+++ b/solver/chrono_guess.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/cr.c b/solver/cr.c
new file mode 100644
index 000000000..5ad2594b4
--- /dev/null
+++ b/solver/cr.c
@@ -0,0 +1,162 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include<stdlib.h>
+#include<stdio.h>
+#include<math.h>
+#include"global.h"
+#include"su3.h"
+#include"linalg_eo.h"
+#include"solver/gmres_precon.h"
+#include"start.h"
+#include"operator/tm_operators.h"
+#include"solver/poly_precon.h"
+#include"solver/cg_her.h"
+#include"operator/D_psi.h"
+#include"Msap.h"
+#include"dfl_projector.h"
+#include "solver_field.h"
+#include"cr.h"
+#include"time.h"
+#include "gettime.h"
+
+/*
+ * Conjugate residual (CR) iteration for f*P = Q. P holds the initial
+ * guess on entry and is updated in place.
+ *
+ * m         maximal number of iterations
+ * eps_sq    bound on the squared residual norm err: the iteration stops
+ *           once err <= eps_sq (rel_prec == 0) or err <= eps_sq*|Q|^2
+ *           (rel_prec == 1)
+ * N         field length passed to the linear algebra routines; work
+ *           fields get VOLUMEPLUSRAND sites if N == VOLUME and
+ *           VOLUMEPLUSRAND/2 otherwise
+ * f         operator to be inverted
+ * max_restarts, precon are not used by this implementation
+ *
+ * Returns the number of iterations performed if the precision was
+ * reached (0 if the initial guess already fulfils it), -1 otherwise.
+ */
+int cr(spinor * const P, spinor * const Q,
+        const int m, const int max_restarts,
+        const double eps_sq, const int rel_prec,
+        const int N, const int precon, matrix_mult f) {
+
+    int k, iter = 0;
+    int converged = 0;
+    double norm_sq, err;
+    spinor * xi, * Axi, * chi, * Achi, *tmp;
+    _Complex double alpha, beta;
+    static _Complex double one = 1.0;
+    double norm, rAr, newrAr;
+    double atime, etime;
+    spinor ** solver_field = NULL;
+    const int nr_sf = 5;
+    int save_sloppy = g_sloppy_precision;
+
+    if(N == VOLUME) {
+        init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+    }
+    else {
+        init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
+    }
+
+    atime = gettime();
+
+    xi = solver_field[0];
+    Axi = solver_field[1];
+    chi = solver_field[2];
+    Achi = solver_field[3];
+    tmp = solver_field[4];
+
+    norm_sq = square_norm(Q, N, 1);
+    /* avoid a vanishing reference norm in the relative precision test */
+    if(norm_sq < 1.e-32) {
+        norm_sq = 1.;
+    }
+
+    /* initial residual chi = Q - f*P, first search direction xi = chi */
+    dfl_sloppy_prec = 0;
+    f(tmp, P);
+    diff(chi, Q, tmp, N);
+    assign(xi, chi, N);
+    f(Axi, xi);
+    f(Achi, chi);
+    rAr = scalar_prod(chi, Achi, N, 1);
+    err = square_norm(chi, N, 1);
+    if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) {
+        finalize_solver(solver_field, nr_sf);
+        return(iter);
+    }
+
+    for(k = 0; k < m; k++) {
+
+        dfl_sloppy_prec = 1;
+
+        norm = square_norm(Axi, N, 1);
+        alpha = rAr/norm;
+        assign_add_mul(P, xi, alpha, N);
+        /* get the new residual */
+        assign_diff_mul(chi, Axi, alpha, N);
+
+        err = square_norm(chi, N, 1);
+        iter ++;
+        etime = gettime();
+        if(g_proc_id == g_stdio_proc && g_debug_level > 3){
+            printf("# CR: %d\t%g iterated residue, time spent %f s\n", iter, err, (etime - atime)); 
+            fflush(stdout);
+        }
+        /* Precision reached? Remember the outcome for the return value. */
+        converged = (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1)));
+        if(converged || (k == m-1)) {
+            break;
+        }
+
+#ifdef _USE_HALFSPINOR
+        if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*norm_sq) && (rel_prec == 1))) {
+            if (g_sloppy_precision_flag == 1) {
+                g_sloppy_precision = 1;
+                if(g_debug_level > 2 && g_proc_id == g_stdio_proc) {
+                    printf("sloppy precision on\n"); fflush( stdout);
+                }
+            }
+        }
+#endif
+
+        f(Achi, chi);
+
+        newrAr = scalar_prod(chi, Achi, N, 1);
+        beta = newrAr/rAr;
+        assign_mul_add_mul(xi, beta, chi, one, N);
+        assign_mul_add_mul(Axi,beta, Achi, one, N);
+        rAr = newrAr;
+
+    }
+
+    g_sloppy_precision = save_sloppy;
+    finalize_solver(solver_field, nr_sf);
+    /* only report failure if the precision was really not reached */
+    if(converged) return(iter);
+    return(-1);
+}
+
+
diff --git a/solver/cr.h b/solver/cr.h
new file mode 100644
index 000000000..c868c3eb1
--- /dev/null
+++ b/solver/cr.h
@@ -0,0 +1,31 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _CR_H
+#define _CR_H
+
+#include"solver/matrix_mult_typedef.h"
+#include"su3.h"
+
+int cr(spinor * const P, spinor * const Q, 
+	const int m, const int max_restarts,
+	const double eps_sq, const int rel_prec,
+	const int N, const int precon, matrix_mult f);
+
+#endif
diff --git a/solver/dfl_projector.c b/solver/dfl_projector.c
index eb79f2467..d88e61bbf 100644
--- a/solver/dfl_projector.c
+++ b/solver/dfl_projector.c
@@ -1,6 +1,6 @@
 /***********************************************************************
  *
- * Copyright (C) 2008 Alber Deuzeman, Siebren Reker, Carsten Urbach
+ * Copyright (C) 2008 Albert Deuzeman, Siebren Reker, Carsten Urbach
  *
  * This file is part of tmLQCD.
  *
@@ -19,13 +19,13 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -39,28 +39,39 @@
 #include "block.h"
 #include "linalg_eo.h"
 #include "gcr4complex.h"
+#include "fgmres4complex.h"
+#include "mcr4complex.h"
+#include "mr4complex.h"
+#include "cgne4complex.h"
 #include "generate_dfl_subspace.h"
 #include "operator/tm_operators.h"
+#include "operator/clovertm_operators.h"
 #include "boundary.h"
 #include "Msap.h"
 #include "mr.h"
 #include "solver_field.h"
+#include "solver.h"
 #include "dfl_projector.h"
 
-double dfl_little_D_prec = 1.e-24;
-int dfl_sloppy_prec = 0;
+int dfl_sloppy_prec = 1;
 int init_dfl_projector = 0;
 spinor **psi;
 _Complex double *inprod;
+_Complex float  *inprod32;
 _Complex double *inprod_eo;
 _Complex double *inprod_o;
+_Complex float *inprod_o32;
 _Complex double *inprod_e;
 _Complex double *invvec;
+_Complex float  *invvec32;
 _Complex double *invvec_eo;
+_Complex float *invvec_eo32;
 _Complex double *ctmp;
 _Complex double *work_block;
 const int dfl_work_size = 16;
 _Complex double *work[16];
+int cumiter_lgcr = 0;
+
 
 static void alloc_dfl_projector();
 
@@ -74,13 +85,15 @@ static void alloc_dfl_projector();
 
 /* this is phi_k A^{-1}_{kl} (phi_k, in) */
 void project(spinor * const out, spinor * const in) {
-  int i,j, i_e, i_o, iter;
-  int evenodd = 0;
-  int usePL = 0;
+  int i_e, i_o, iter;
+  int evenodd = 1;
+  int gcr32 = 1;
+  int little_m = little_gmres_m_parameter;
+  int little_max_iter = little_solver_max_iter;
   int vol = block_list[0].volume;
   _Complex double * v, * w;
   double prec;
-  
+  evenodd = little_evenodd;
   if(init_dfl_projector == 0) {
     alloc_dfl_projector();
   }
@@ -88,135 +101,174 @@ void project(spinor * const out, spinor * const in) {
   w = work[1]; 
   /*initialize the local (block) parts of the spinor*/
   split_global_field_GEN(psi, in, nb_blocks);
-  
-  for (j = 0; j < g_N_s*nb_blocks*9; j++) {
+
+  for (int j = 0; j < g_N_s*nb_blocks*9; j++) {
     (inprod[j]) = 0.0;
+    (inprod32[j]) = 0.0;
     (inprod_o[j]) = 0.0;
+    (inprod_o32[j]) = 0.0;
     (inprod_eo[j]) = 0.0;
     (inprod_e[j]) = 0.0;
     (invvec[j]) = 0.0;
+    (invvec32[j]) = 0.0;
     (invvec_eo[j]) = 0.0;
+    (invvec_eo32[j]) = 0.0;
     (ctmp[j]) = 0.0;
     (w[j]) = 0.0;
     (v[j]) = 0.0;
   }
-  
-  for (j = 0; j < g_N_s; j++) {/*loop over block.basis */
-    i_o=0;
-    i_e=0;
-    for(i = 0; i < nb_blocks; i++) {
+
+  for (int j = 0; j < g_N_s; j++) {/*loop over block.basis */
+    i_o = 0;
+    i_e = 0;
+    for(int i = 0; i < nb_blocks; i++) {
       inprod[j + i*g_N_s]  = scalar_prod(block_list[i].basis[j], psi[i], vol, 0);
+      inprod32[j + i*g_N_s]  = (_Complex float)inprod[j + i*g_N_s];
       if(evenodd) {
-	if (block_list[i].evenodd==0) {
-	  inprod_eo[j + i_e*g_N_s] = inprod[j + i*g_N_s];
-	  i_e++;
-	}
-	if (block_list[i].evenodd==1) {
-	  inprod_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s] = inprod[j + i*g_N_s];
-	  i_o++;
-	}
+        if (block_list[i].evenodd == 0) {
+          inprod_eo[j + i_e*g_N_s] = inprod[j + i*g_N_s];
+          i_e++;
+        }
+        if (block_list[i].evenodd == 1) {
+          inprod_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s] = inprod[j + i*g_N_s];
+          i_o++;
+        }
       }
     }
   }
-  
+
   if(evenodd) {
-    little_D_ee_inv(inprod_e,inprod_eo);
-    little_D_hop(1,inprod_o, inprod_e);
-    little_Dhat_rhs(1,inprod_o,-1,inprod_eo);
+    little_D_ee_inv(inprod_e, inprod_eo);
+    little_D_hop(1, inprod_o, inprod_e);
+    little_Dhat_rhs(1, inprod_o, -1, inprod_eo);
   }
-  
-  
-  /* if(dfl_sloppy_prec) prec = dfl_little_D_prec;*/
-  if(dfl_sloppy_prec) prec = 1.e-12;
-  else prec = 1.e-24;
-  
-  
-  
+
+
+  if(!dfl_sloppy_prec) prec = little_solver_high_prec;
+  else prec = little_solver_low_prec;
+
   if(!usePL) {
     if(evenodd) {
-      iter = gcr4complex(invvec_eo,inprod_o,10,1000,prec,1,nb_blocks*g_N_s,1,nb_blocks*9*g_N_s,&little_D_sym);
-      
-      little_D_hop(0,ctmp, invvec_eo);
-      little_D_ee_inv(invvec_eo,ctmp);
+      if(gcr32) {
+        for (int j = 0; j < g_N_s*nb_blocks*9; j++) {
+          inprod_o32[j] = (_Complex float) inprod_o[j];
+        }
+        iter = fgmres4complex_32(invvec_eo32, inprod_o32, little_m, little_max_iter, prec, 1, 
+                                 nb_blocks*g_N_s, 1, nb_blocks*9*g_N_s, 0, &little_D_sym_32);
+        // we could do more in 32bit precision!?
+        for (int j = 0; j < g_N_s*nb_blocks*9; j++) {
+          invvec_eo[j] = (_Complex double) invvec_eo32[j];
+        }
+      }
+      else {
+        iter = gcr4complex(invvec_eo, inprod_o, little_m, little_max_iter, prec, 1, 
+                           nb_blocks*g_N_s, 1, nb_blocks*9*g_N_s, 0, &little_D_sym);
+      }
+
+      little_D_hop(0, ctmp, invvec_eo);
+      little_D_ee_inv(invvec_eo, ctmp);
       little_Dhat_rhs(0,invvec_eo, -1., inprod_e);
-    
-      for (j = 0; j < g_N_s; j++) {
-	i_o=0;
-	i_e=0;
-	for(i = 0; i < nb_blocks; i++) {
-	  if (block_list[i].evenodd==0) {
-	    invvec[j + i*g_N_s] = invvec_eo[j + i_e*g_N_s];
-	    i_e++;
-	  }
-	  if (block_list[i].evenodd==1) {
-	    invvec[j + i*g_N_s] = invvec_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s];
-	    i_o++;
-	  }
-	}
+
+      for (int j = 0; j < g_N_s; j++) {
+        i_o=0;
+        i_e=0;
+        for(int i = 0; i < nb_blocks; i++) {
+          if (block_list[i].evenodd==0) {
+            invvec[j + i*g_N_s] = invvec_eo[j + i_e*g_N_s];
+            i_e++;
+          }
+          if (block_list[i].evenodd==1) {
+            invvec[j + i*g_N_s] = invvec_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s];
+            i_o++;
+          }
+        }
       }
-      if(g_proc_id == 0 && g_debug_level > 0) {/*CT: was "g_debug_level > -1" */
-	printf("lgcr evenodd number of iterations %d (no P_L)\n", iter);
+      if(g_proc_id == 0 && g_debug_level > 0) {
+        printf("lgcr/lfgmres (even/odd) number of iterations %d (no LittleLittleD)\n", iter);
       }
     }
     else {
-      iter = gcr4complex(invvec, inprod, 10, 1000, prec, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_D);
-      if(g_proc_id == 0 && g_debug_level > 0) {/*CT: was "g_debug_level > -1" */
-	printf("lgcr number of iterations %d (no P_L)\n", iter);
+      if(gcr32) {
+        iter = fgmres4complex_32(invvec32, inprod32, little_m, little_max_iter, prec, 1, 
+                             nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, 0, &little_D_32);
+        
+        for (int j = 0; j < g_N_s*nb_blocks*9; j++) {
+          invvec[j] = (_Complex double) invvec32[j];
+        }
+      }
+      else {
+        iter = gcr4complex(invvec, inprod, little_m, little_max_iter, prec, 1, 
+                           nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, 0, &little_D);
       }
+      if(g_proc_id == 0 && g_debug_level > 0) {
+        printf("lgcr/lfgmres number of iterations %d (no LittleLittleD)\n", iter);
+      }       
     }
   }
-  else {
+  else { // usePL = true
     if(evenodd) {
-      little_P_L_sym(v, inprod_o);
-      iter = gcr4complex(w, v, 10, 1000, prec, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_P_L_D_sym);
-      little_P_R_sym(v, w);
-/*      little_project(w, inprod_o, g_N_s);*/
-      little_project_eo(w,inprod_o,g_N_s);
-      for(i = 0; i < nb_blocks*g_N_s; ++i)
-	invvec_eo[i] = w[i] + v[i];
+      // this is in adaptive MG style
+      if(gcr32) {
+        for (int j = 0; j < g_N_s*nb_blocks*9; j++) {
+          inprod_o32[j] = (_Complex float) inprod_o[j];
+        }
+        iter = fgmres4complex_32(invvec_eo32, inprod_o32, little_m, little_max_iter, prec, 1, 
+                                 nb_blocks*g_N_s, 1, nb_blocks*9*g_N_s, 1, &little_D_sym_32);
+        // we could do more in 32bit precision!?
+        for (int j = 0; j < g_N_s*nb_blocks*9; j++) {
+          invvec_eo[j] = (_Complex double) invvec_eo32[j];
+        }
+      }
+      else {
+        iter = gcr4complex(invvec_eo, inprod_o, little_m, little_max_iter, prec, 1, 
+                           nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, 1, &little_D_sym);
+      }
       little_D_hop(0,ctmp, invvec_eo);
       little_D_ee_inv(invvec_eo,ctmp);
       little_Dhat_rhs(0,invvec_eo, -1., inprod_e);
-      for (j = 0; j < g_N_s; j++) {
-	i_o=0;
-	i_e=0;
-	for(i = 0; i < nb_blocks; i++){
-	  if (block_list[i].evenodd==0) {
-	    invvec[j + i*g_N_s] = invvec_eo[j + i_e*g_N_s];
-	    i_e++;
-	  }
-	  if (block_list[i].evenodd==1) {
-	    invvec[j + i*g_N_s] = invvec_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s];
-	    i_o++;
-	  }
-	}
+      for (int j = 0; j < g_N_s; j++) {
+        i_o=0;
+        i_e=0;
+        for(int i = 0; i < nb_blocks; i++){
+          if (block_list[i].evenodd==0) {
+            invvec[j + i*g_N_s] = invvec_eo[j + i_e*g_N_s];
+            i_e++;
+          }
+          if (block_list[i].evenodd==1) {
+            invvec[j + i*g_N_s] = invvec_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s];
+            i_o++;
+          }
+        }
       } 
-      if(g_proc_id == 0 && g_debug_level > 0) {/*CT: was "g_debug_level > -1" */
-	printf("lgcr even/odd number of iterations %d (using P_L)\n", iter);
+      if(g_proc_id == 0 && g_debug_level > 0) {
+        printf("lgcr/lfgmres (even/odd) number of iterations %d (using LittleLittleD)\n", iter);
       }
     }
     else {
       little_P_L(v, inprod);
-      iter = gcr4complex(w, v, 10, 1000, prec, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_P_L_D);
+      iter = gcr4complex(w, v, little_m, little_max_iter, prec, 1, 
+                         nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, 0, &little_P_L_D);
       little_P_R(v, w);
       little_project(w, inprod, g_N_s);
-      for(i = 0; i < nb_blocks*g_N_s; ++i)
-	invvec[i] = w[i] + v[i];
-      if(g_proc_id == 0 && g_debug_level > 0) {/*CT: was "g_debug_level > -1" */
-	printf("lgcr number of iterations %d (using P_L)\n", iter);
+      for(int i = 0; i < nb_blocks*g_N_s; ++i)
+        invvec[i] = w[i] + v[i];
+      if(g_proc_id == 0 && g_debug_level > 0) {
+        printf("lgcr number of iterations %d (using LittleLittleD)\n", iter);
       }
     }    
   }
+  cumiter_lgcr += iter;
+
   /* sum up */
-  for(i = 0 ; i < nb_blocks ; i++) {
+  for(int i = 0 ; i < nb_blocks ; i++) {
     mul(psi[i], invvec[i*g_N_s], block_list[i].basis[0], vol);
   }
-  for(j = 1; j < g_N_s; j++) {
-    for(i = 0 ; i < nb_blocks ; i++) {
+  for(int j = 1; j < g_N_s; j++) {
+    for(int i = 0 ; i < nb_blocks ; i++) {
       assign_add_mul(psi[i], block_list[i].basis[j], invvec[i*g_N_s + j], vol);
     }
   }
-  
+
   /* reconstruct global field */
   reconstruct_global_field_GEN(out, psi, nb_blocks);
   free_dfl_projector();
@@ -224,43 +276,54 @@ void project(spinor * const out, spinor * const in) {
 }
 
 static void alloc_dfl_projector() {
-  int i;
-  
-  psi = calloc(2*nb_blocks, sizeof(spinor*)); /*block local version of global spinor */
-  inprod = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
-  inprod_eo = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
-  inprod_o = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
-  inprod_e = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
-  ctmp = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
-  invvec = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
-  invvec_eo = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
-  work_block = calloc(dfl_work_size * nb_blocks * 9 * g_N_s, sizeof(_Complex double));
-  for(i = 0; i < dfl_work_size; ++i) {
-    work[i] = work_block + i * nb_blocks * 9 * g_N_s;
-  }
-  
-  /* no loop below because further down we also don't take this cleanly into account */
-  psi[0] = calloc(nb_blocks*(block_list[0].volume + block_list[0].spinpad), sizeof(spinor));
-  for(i = 1 ;i < nb_blocks ;i++) {
-    psi[i] = psi[i-1] + (block_list[0].volume + block_list[0].spinpad);
+  if(init_dfl_projector == 0) {
+    
+    psi = calloc(2*nb_blocks, sizeof(spinor*)); /*block local version of global spinor */
+    inprod = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
+    inprod32 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex float)); /*inner product of spinors with bases */
+    inprod_eo = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
+    inprod_o = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
+    inprod_o32 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex float)); /*inner product of spinors with bases */
+    inprod_e = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
+    ctmp = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
+    invvec = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
+    invvec32 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex float)); /*inner product of spinors with bases */
+    invvec_eo = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
+    invvec_eo32 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex float)); /*inner product of spinors with bases */
+    work_block = calloc(dfl_work_size * nb_blocks * 9 * g_N_s, sizeof(_Complex double));
+    for(int i = 0; i < dfl_work_size; ++i) {
+      work[i] = work_block + i * nb_blocks * 9 * g_N_s;
+    }
+    
+    /* no loop below because further down we also don't take this cleanly into account */
+    psi[0] = calloc(2*nb_blocks*(block_list[0].volume + block_list[0].spinpad), sizeof(spinor)); /* one slab per psi[] entry set below */
+    for(int i = 1 ;i < 2*nb_blocks ;i++) {
+      psi[i] = psi[i-1] + (block_list[0].volume + block_list[0].spinpad); /* stays inside the psi[0] buffer for all 2*nb_blocks entries */
+    }
+    init_dfl_projector = 1;
   }
-  init_dfl_projector = 1;
   return;
 }
 
 
 void free_dfl_projector() {
-  free(*psi);
-  free(psi);
-  free(invvec);
-  free(invvec_eo);
-  free(inprod);
-  free(inprod_eo);
-  free(inprod_e);
-  free(inprod_o);
-  free(ctmp);
-  free(work_block);
-  init_dfl_projector = 0;
+  if(init_dfl_projector) {
+    free(*psi);
+    free(psi);
+    free(invvec);
+    free(invvec32);
+    free(invvec_eo);
+    free(invvec_eo32);
+    free(inprod);
+    free(inprod32);
+    free(inprod_eo);
+    free(inprod_e);
+    free(inprod_o);
+    free(inprod_o32);
+    free(ctmp);
+    free(work_block);
+    init_dfl_projector = 0;
+  }
   return;
 }
 
@@ -280,7 +343,7 @@ void project2(spinor * const out, spinor * const in) {
     /*loop over block.basis */
     for(i = 0 ; i < nb_blocks ; i++)  inprod[j + i*g_N_s]  = scalar_prod(block_list[i].basis[j], psi[i], vol, 0);
   }
-  
+
   /* sum up */
   for(i = 0 ; i < nb_blocks ; i++) mul(psi[i], inprod[i*g_N_s], block_list[i].basis[0], vol);
   for(j = 1; j < g_N_s; j++) {
@@ -292,6 +355,27 @@ void project2(spinor * const out, spinor * const in) {
   return;
 }
 
+// This is a preconditioner for D in Multi-Grid spirit
+// following equation (4.2) in arXiv:1303.1377:
+//   out = C^(nu) in = M_sap (in - D phi) + phi
+// with approximately P A P phi = in   (A = little D), phi being computed by project(),
+// and nu the number of M_sap cycles, here NcycleMsap (with NiterMsap passed along to Msap_eo).
+// Overwrites g_spinor_field[DUM_MATRIX+2] and g_spinor_field[DUM_MATRIX+3] as scratch space.
+void mg_precon(spinor * const out, spinor * const in) {
+  // phi = PD_c^{-1} P^dagger in, stored in out
+  project(out, in);
+  // residual in - D*phi, built in DUM_MATRIX+2
+  // need to use DUM_MATRIX+2,3 because in Msap_eo DUM_MATRIX+0,1 is used
+  D_psi(g_spinor_field[DUM_MATRIX+2], out);
+  diff(g_spinor_field[DUM_MATRIX+2], in, g_spinor_field[DUM_MATRIX+2], VOLUME);
+  // apply M_SAP to the residual; the result field DUM_MATRIX+3 is zeroed first
+  zero_spinor_field(g_spinor_field[DUM_MATRIX+3], VOLUME);
+  Msap_eo(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+2], NcycleMsap, NiterMsap);
+  // sum with phi: out = M_sap (in - D phi) + phi
+  add(out, g_spinor_field[DUM_MATRIX+3], out, VOLUME);
+  return;
+}
+
 void project_left(spinor * const out, spinor * const in) {
   /* out = P_L in = in - D proj in */ 
 
@@ -303,7 +387,6 @@ void project_left(spinor * const out, spinor * const in) {
 
 void project_right(spinor * const out, spinor * const in) {
   /* out = P_R in = in - proj D in */
-
   D_psi(out, in);
   project(g_spinor_field[DUM_MATRIX], out);
   diff(out, in, g_spinor_field[DUM_MATRIX], VOLUME);
@@ -336,18 +419,18 @@ void little_project(_Complex double * const out, _Complex double * const in, con
 
   phi = work[2];
   psi = work[3];
-  
+
   /* NOTE IS THIS REALLY NECESSARY/CORRECT? */
   for(i = 0; i < N; i++) {
     phi[i] = lscalar_prod(little_dfl_fields[i], in, nb_blocks*N, 0);
   }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(phi, psi, N, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
 #else
   memcpy(psi, phi, N*sizeof(_Complex double));
 #endif
-  
+
   /* apply inverse of little_A */
   for(i = 0; i < N; i++) {
     (phi[i]) = 0.0;
@@ -363,50 +446,35 @@ void little_project(_Complex double * const out, _Complex double * const in, con
   return;
 }
 
-void little_project_eo(_Complex double * const out, _Complex double * const in, const int  N) {
-  int i, j;
-  static _Complex double *phi;
-  static _Complex double *psi;
-  
-  if(init_dfl_projector == 0) {
-    alloc_dfl_projector();
-  }
-      
-  phi = work[2];
-  psi = work[3];
+#define _PSWITCH(s) s
+#define _PTSWITCH(s) s
+#define _MPI_C_TYPE MPI_DOUBLE_COMPLEX
+#define _F_TYPE double
 
-  /* NOTE IS THIS REALLY NECESSARY/CORRECT? */
-  for(i = 0; i < N; i++) {
-    phi[i] = lscalar_prod(little_dfl_fields_eo[i], in, nb_blocks*N, 0);
-  }
-  
-#ifdef MPI
-  MPI_Allreduce(phi, psi, N, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
-#else
-  memcpy(psi, phi, N*sizeof(_Complex double));
-#endif
+#include "little_project_eo_body.c"
 
-  /* apply inverse of little_A_eo */
-  for(i = 0; i < N; i++) {
-    (phi[i]) = 0.0;
-    for(j = 0; j < N; j++) {
-      (phi[i]) += (little_A_eo[j*N + i]) * (psi[j]);
-    }
-  }
-  
-  lmul(out, phi[0], little_dfl_fields_eo[0], nb_blocks*N);
-  for(i = 1; i < N; i++) {
-    lassign_add_mul(out, little_dfl_fields_eo[i], phi[i], nb_blocks*N);
-  }
-  return;
-}
+#undef _PSWITCH
+#undef _F_TYPE
+#undef _MPI_C_TYPE
+#undef _PTSWITCH
+
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
+#define _MPI_C_TYPE MPI_COMPLEX
+#define _F_TYPE float
+
+#include "little_project_eo_body.c"
 
+#undef _PSWITCH
+#undef _F_TYPE
+#undef _MPI_C_TYPE
+#undef _PTSWITCH
 
 void little_project2(_Complex double * const out, _Complex double * const in, const int  N) {
   int i;
   static _Complex double *phi;
   static _Complex double *psi;
-  
+
   if(init_dfl_projector == 0) {alloc_dfl_projector();}
   phi = work[4];
   psi = work[5];
@@ -414,12 +482,12 @@ void little_project2(_Complex double * const out, _Complex double * const in, co
   for(i = 0; i < N; i++) {
     phi[i] = lscalar_prod(little_dfl_fields[i], in, nb_blocks*N, 0);
   }
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(phi, psi, g_N_s, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
 #else
   memcpy(psi, phi, g_N_s*sizeof(_Complex double));
 #endif
-  
+
   lmul(out, psi[0], little_dfl_fields[0], nb_blocks*g_N_s);
   for(i = 1; i < N; i++) {
     lassign_add_mul(out, little_dfl_fields[i], psi[i], nb_blocks*g_N_s);
@@ -447,7 +515,6 @@ void little_P_R(_Complex double * const out, _Complex double * const in) {
 
 void little_P_L_sym(_Complex double * const out, _Complex double * const in) {
   if(init_dfl_projector == 0) {alloc_dfl_projector();}
-/*  little_project(out, in, g_N_s);*/
   little_project_eo(out,in,g_N_s);
   little_D_sym(work[13], out);
   ldiff(out, in, work[13], nb_blocks*g_N_s);
@@ -457,7 +524,6 @@ void little_P_L_sym(_Complex double * const out, _Complex double * const in) {
 void little_P_R_sym(_Complex double * const out, _Complex double * const in) {
   if(init_dfl_projector == 0) {alloc_dfl_projector();}
   little_D_sym(out, in);
-/*  little_project(work[14], out, g_N_s);*/
   little_project_eo(work[14],out,g_N_s);
   ldiff(out, in, work[14], nb_blocks*g_N_s);
   return;
@@ -470,6 +536,24 @@ void little_P_L_D(_Complex double * const out, _Complex double * const in) {
   return;
 }
 
+#define _PSWITCH(s) s
+#define _F_TYPE double
+
+#include "little_mg_precon_body.c"
+
+#undef _PSWITCH
+#undef _F_TYPE
+
+#define _PSWITCH(s) s ## _32
+#define _F_TYPE float
+
+#include "little_mg_precon_body.c"
+
+#undef _PSWITCH
+#undef _F_TYPE
+
+
+// little_P_L_D_sym * psi = (1 - PA^-1P little_D_sym) * little_D_sym * psi
 void little_P_L_D_sym(_Complex double * const out, _Complex double * const in) {
   if(init_dfl_projector == 0) {alloc_dfl_projector();}
   little_D_sym(work[15], in);
@@ -490,23 +574,24 @@ int check_projectors(const int repro) {
   int i,j;
   spinor **phi;
   spinor **wphi;
-  _Complex double *v;
   spinor ** work_fields = NULL;
-  const int nr_wf = 4;
+  const int nr_wf = 5;
+  const double eps = 1.e-8;
+  double savelittle_solver_high_prec = little_solver_high_prec;
+  little_solver_high_prec = eps*eps/10.;
 
   init_solver_field(&work_fields, VOLUMEPLUSRAND, nr_wf);
-  phi = malloc(nb_blocks*sizeof(spinor *));
-  wphi = malloc(nb_blocks*sizeof(spinor *));
+  phi = malloc(nb_blocks * sizeof(spinor *));
+  wphi = malloc(nb_blocks * sizeof(spinor *));
 
   random_spinor_field_lexic(work_fields[0], repro, RN_GAUSS);
   nrm = square_norm(work_fields[0], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("\nNow we check the DFL projection routines!\n\n");
-    printf("||psi|| = %1.5e\n", sqrt(nrm));
+    printf("\n######################\n");
+    printf("# Now we check the DFL projection routines!\n\n");
+    printf("# ||psi|| = %1.5e\n", sqrt(nrm));
   }
 
-
-
   /* Check generalized split/reconstruct */
   phi[0] = calloc(VOLUME + nb_blocks, sizeof(spinor));
   for(j = 1; j < nb_blocks; j++) {
@@ -517,7 +602,9 @@ int check_projectors(const int repro) {
   diff(work_fields[2], work_fields[0], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[2], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||psi_orig - psi_recon|| = %1.5e\n", sqrt(nrm));
+    printf("# ||psi_orig - psi_recon|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
   /* Check even/odd split reconstruct   */
@@ -527,16 +614,41 @@ int check_projectors(const int repro) {
   diff(work_fields[2], work_fields[0], work_fields[3], VOLUME);
   nrm = square_norm(work_fields[2], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("even/odd split: ||psi_orig - psi_recon|| = %1.5e\n", sqrt(nrm));
+    printf("# even/odd split: ||psi_orig - psi_recon|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
+  // clover case only: check assign_mul_one_sw_pm_imu_inv_block (block by block) against the global routine
+  if(g_c_sw > 0) {
+    for (int blk = 0; blk < nb_blocks; blk++) {
+      copy_global_to_block_eo(phi[0], phi[1], work_fields[0], blk);
+      assign_mul_one_sw_pm_imu_inv_block(EE, phi[2], phi[0], g_mu, &block_list[blk]);
+      // write the block result (phi[2]) together with the untouched phi[1] back into work_fields[1]
+      copy_block_eo_to_global(work_fields[1], phi[2], phi[1], blk);
+    }
+    convert_lexic_to_eo(work_fields[2], work_fields[3], work_fields[0]);
+    assign_mul_one_sw_pm_imu_inv(EE, work_fields[4], work_fields[2], g_mu); // [4] is the last of the nr_wf = 5 fields
+    convert_eo_to_lexic(work_fields[2], work_fields[4], work_fields[3]);
+    diff(work_fields[3], work_fields[1], work_fields[2], VOLUME); // do not overwrite work_fields[0]: it is psi for all later checks
+    nrm = square_norm(work_fields[3], VOLUME, 1);
+    if(g_cart_id == 0) {
+      printf("# assign_mul_one_sw_pm_imu_inv: ||psi_orig - psi_block|| = %1.5e ", sqrt(nrm));
+      if(sqrt(nrm) < eps) printf("#  -> passed\n\n");
+      else printf("#  -> FAILED!\n\n");
+      fflush(stdout);
+    }
+  }
+  
   project2(work_fields[1], work_fields[0]);
   project2(work_fields[2], work_fields[1]);
   diff(work_fields[3], work_fields[1], work_fields[2], VOLUME);
   nrm = square_norm(work_fields[3], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||P2 psi - P2 P2 psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||P2 psi - P2 P2 psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -545,16 +657,26 @@ int check_projectors(const int repro) {
   diff(work_fields[3], work_fields[2], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[3], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||P_L D psi - D P_R psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||P_L D psi - D P_R psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
+    printf("\n######################\n");
+    printf("# The following tests are only meaningful up to the precision little_D can be inverted for.\n");
+    printf("# They might, therefore, be only useful in a small volume and/or a small condition number of little_D\n");
+    printf("# The inversion precision (squared) is set to %e\n", little_solver_high_prec);
+    printf("# So don't expect a precision much better than %1.2e in the following tests\n\n", eps*10);
   }
 
+  dfl_sloppy_prec = 0;
   project_left(work_fields[1], work_fields[0]);
   project_left(work_fields[2], work_fields[1]);
   diff(work_fields[3], work_fields[2], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[3], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||P_L^2 psi - P_L psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||P_L^2 psi - P_L psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -563,7 +685,9 @@ int check_projectors(const int repro) {
   diff(work_fields[3], work_fields[2], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[3], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||P_R^2 psi - P_R psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||P_R^2 psi - P_R psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -571,7 +695,9 @@ int check_projectors(const int repro) {
   project2(work_fields[2], work_fields[1]);
   nrm = square_norm(work_fields[2], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||P P_L psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||P P_L psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -579,7 +705,9 @@ int check_projectors(const int repro) {
   project_right(work_fields[2], work_fields[1]);
   nrm = square_norm(work_fields[2], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||P_R P psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||P_R P psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -590,7 +718,9 @@ int check_projectors(const int repro) {
   diff(work_fields[3], work_fields[2], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[3], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||P D A^-1 P psi - P psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||P D A^-1 P psi - P psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -601,7 +731,9 @@ int check_projectors(const int repro) {
   diff(work_fields[3], work_fields[2], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[3], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||P A^-1 D P psi - P psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||P A^-1 D P psi - P psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -613,17 +745,21 @@ int check_projectors(const int repro) {
   diff(work_fields[3], work_fields[2], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[3], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||P D P (P D P)^-1 psi - P psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||P D P (P D P)^-1 psi - P psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
-  
+
   invert_little_D_spinor(work_fields[1], work_fields[0]);
   invert_little_D_eo_spinor(work_fields[2], work_fields[0]);
   diff(work_fields[3], work_fields[1], work_fields[2], VOLUME);
   nrm = square_norm(work_fields[3], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||A^-1 psi - A^-1_eo psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||A^-1 psi - A^-1_eo psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*100) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -634,7 +770,9 @@ int check_projectors(const int repro) {
   diff(work_fields[1], work_fields[3], work_fields[2], VOLUME);
   nrm = square_norm(work_fields[1], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||A A^-1 psi - P psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||A A^-1 psi - P psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -645,7 +783,11 @@ int check_projectors(const int repro) {
   diff(work_fields[2], work_fields[3], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[2], VOLUME, 1);
   if(g_cart_id == 0) {
-    printf("||P A A^-1 psi - P psi|| = %1.5e\n", sqrt(nrm));
+    printf("# ||P A A^-1 psi - P psi|| = %1.5e ", sqrt(nrm)); // trailing blank separates the value from the verdict, as in the other checks
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
+    printf("\n######################\n");
+    printf("# The following tests should be again fulfilled up to machine precision\n\n");
     fflush(stdout);
   }
 
@@ -665,101 +807,63 @@ int check_projectors(const int repro) {
   }
   apply_little_D_spinor(work_fields[3], work_fields[1]);
   D_psi(work_fields[2], work_fields[1]);
-  
-  if (g_cart_id == 0 && g_debug_level > 4){
-    v = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double));
-    split_global_field_GEN(phi, work_fields[2], nb_blocks);
-
-    for (j = 0; j < g_N_s; ++j) {
-      for(i = 0; i < nb_blocks; i++) {
-	v[j + i*g_N_s] = scalar_prod(block_list[i].basis[j], phi[i], VOLUME/nb_blocks, 0);
-      }
-    }
-
-    for (j = 0; j < nb_blocks* g_N_s; ++j) {
-      printf("AFTER D: w[%u] = %1.5e + %1.5e i\n", j, creal(v[j]), cimag(v[j]));
-    }
-    free(v);
-  }
 
   project2(work_fields[1], work_fields[2]);
-  
-
   diff(work_fields[2], work_fields[3], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[2], VOLUME, 1);
   if(g_proc_id == 0) {
-    printf("||(P D - A) phi_i || = %1.5e\n", sqrt(nrm));
+    printf("# ||(P D - A) phi_i || = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
   reconstruct_global_field_GEN_ID(work_fields[1], block_list, 0, nb_blocks);
   apply_little_D_spinor(work_fields[3], work_fields[1]);
   D_psi(work_fields[2], work_fields[1]);
-  if (!g_proc_id && g_debug_level > 4){
-    v = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double));
-    split_global_field_GEN(phi, work_fields[2],nb_blocks);
-    for (j = 0; j < g_N_s; ++j) 
-      for(i = 0; i < nb_blocks; i++)
-	v[j + i*g_N_s] = scalar_prod(block_list[i].basis[j], phi[i], VOLUME/nb_blocks, 0);
-    for (j = 0; j < nb_blocks* g_N_s; ++j) {
-      printf("AFTER D: w[%u] = %1.5e + %1.5e i\n", j, creal(v[j]), cimag(v[j]));
-    }
-    free(v);
-  }
+
   project2(work_fields[1], work_fields[2]);
   diff(work_fields[2], work_fields[3], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[2], VOLUME, 1);
   if(g_proc_id == 0) {
-    printf("||(P D - A) phi || = %1.5e\n", sqrt(nrm));
+    printf("# ||(P D - A) phi || = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
   apply_little_D_spinor(work_fields[3], work_fields[0]);
   project2(work_fields[1], work_fields[0]);
   D_psi(work_fields[2], work_fields[1]);
+
   project2(work_fields[1], work_fields[2]);
   diff(work_fields[2], work_fields[3], work_fields[1], VOLUME);
   nrm = square_norm(work_fields[2], VOLUME, 1);
-  if(g_proc_id == 0 && g_debug_level > 4) {
-    printf("||P D P psi - A psi|| = %1.5e\n", sqrt(nrm));
-    printf("\n*** Comparison of the leading spinor components ***\n");
-    printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s0.c0), creal(work_fields[3]->s0.c0));
-    printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s0.c0), cimag(work_fields[3]->s0.c0));
-    printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s0.c1), creal(work_fields[3]->s0.c1));
-    printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s0.c1), cimag(work_fields[3]->s0.c1));
-    printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s0.c2), creal(work_fields[3]->s0.c2));
-    printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s0.c2), cimag(work_fields[3]->s0.c2));
-    printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s1.c0), creal(work_fields[3]->s1.c0));
-    printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s1.c0), cimag(work_fields[3]->s1.c0));
-    printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s1.c1), creal(work_fields[3]->s1.c1));
-    printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s1.c1), cimag(work_fields[3]->s1.c1));
-    printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s1.c2), creal(work_fields[3]->s1.c2));
-    printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s1.c2), cimag(work_fields[3]->s1.c2));
-    printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s2.c0), creal(work_fields[3]->s2.c0));
-    printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s2.c0), cimag(work_fields[3]->s2.c0));
-    printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s2.c1), creal(work_fields[3]->s2.c1));
-    printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s2.c1), cimag(work_fields[3]->s2.c1));
-    printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s2.c2), creal(work_fields[3]->s2.c2));
-    printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s2.c2), cimag(work_fields[3]->s2.c2));
-    printf("*** End of dump ***\n\n");
+  if(g_proc_id == 0) {
+    printf("# ||P D P psi - A psi|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
   /* check little projectors now */
   if(g_cart_id == 0) {
-    printf("\nNow the little little projection routines\n\n");
+    printf("\n######################\n");
+    printf("# Now we check the little little projection routines\n\n");
   }
   if(init_dfl_projector == 0) {
     alloc_dfl_projector();
   }
-  
+
   memcpy(work[10], work_fields[0], nb_blocks*g_N_s*sizeof(_Complex double));
   little_project2(work[11], work[10], g_N_s);
   little_project2(work[12], work[11], g_N_s);
   ldiff(work[12], work[12], work[11], nb_blocks*g_N_s);
   nrm = lsquare_norm(work[12], nb_blocks*g_N_s, 1);
   if(g_cart_id == 0) {
-    printf("||lP2 v - lP2 lP2 v|| = %1.5e\n", sqrt(nrm));
+    printf("# ||lP2 v - lP2 lP2 v|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -768,16 +872,20 @@ int check_projectors(const int repro) {
   ldiff(work[12], work[12], work[11], nb_blocks*g_N_s);
   nrm = lsquare_norm(work[12], nb_blocks*g_N_s, 1);
   if(g_cart_id == 0) {
-    printf("||lP_L lD v - lP_L lD v|| = %1.5e\n", sqrt(nrm));
+    printf("# ||lP_L lD v - lP_L lD v|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }  
-  
+
   little_P_L_D(work[11], work[10]);
   little_D_P_R(work[12], work[10]);
   ldiff(work[12], work[12], work[11], nb_blocks*g_N_s);
   nrm = lsquare_norm(work[12], nb_blocks*g_N_s, 1);
   if(g_cart_id == 0) {
-    printf("||lP_L lD v - lD lP_R v|| = %1.5e\n", sqrt(nrm));
+    printf("# ||lP_L lD v - lD lP_R v|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -786,7 +894,9 @@ int check_projectors(const int repro) {
   ldiff(work[12], work[12], work[11], nb_blocks*g_N_s);
   nrm = lsquare_norm(work[12], nb_blocks*g_N_s, 1);
   if(g_cart_id == 0) {
-    printf("||lP_R^2 v - lP_R v|| = %1.5e\n", sqrt(nrm));
+    printf("# ||lP_R^2 v - lP_R v|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
@@ -795,10 +905,13 @@ int check_projectors(const int repro) {
   ldiff(work[12], work[12], work[11], nb_blocks*g_N_s);
   nrm = lsquare_norm(work[12], nb_blocks*g_N_s, 1);
   if(g_cart_id == 0) {
-    printf("||lP_L^2 v - lP_L v|| = %1.5e\n", sqrt(nrm));
+    printf("# ||lP_L^2 v - lP_L v|| = %1.5e ", sqrt(nrm));
+    if(sqrt(nrm) < eps*10) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED!\n\n");
     fflush(stdout);
   }
 
+  little_solver_high_prec = savelittle_solver_high_prec;
   free(phi[0]);
   free(phi);
   free(wphi);
@@ -807,10 +920,9 @@ int check_projectors(const int repro) {
 }
 
 void check_little_D_inversion(const int repro) {
-  int i,j,ctr_t;
-  int contig_block = LZ / nb_blocks;
+  int i, j;
   int vol = block_list[0].volume;
-  _Complex double *result, *v, *w;
+  _Complex double *result;
   double dif;
   spinor ** work_fields = NULL;
   const int nr_wf = 1;
@@ -820,74 +932,48 @@ void check_little_D_inversion(const int repro) {
   if(init_dfl_projector == 0) {
     alloc_dfl_projector();
   }
-  v = work[11];
-  w = work[12];
 
-  result = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
+  if(g_proc_id == 0) {
+    printf("# Perform a test inversion of little_D using lGCR\n");
+    printf("# This test might be only meaningful for not too large condition number of little_D\n\n");
+  }
 
-  /* no loop below because further down we also don't take this cleanly into account */
+  result = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */
 
   /*initialize the local (block) parts of the spinor*/
-  for (ctr_t = 0; ctr_t < (VOLUME / LZ); ++ctr_t) {
-    for(i=0; i< nb_blocks; i++) {
-      memcpy(psi[i] + ctr_t * contig_block, work_fields[0] + (nb_blocks * ctr_t + i) * contig_block, contig_block * sizeof(spinor));
-    }
-  }
+  split_global_field_GEN(psi, work_fields[0], nb_blocks);
+
   for (i = 0; i < nb_blocks; ++i) {/* loop over blocks */
     /* compute inner product */
     for (j = 0; j < g_N_s; ++j) {/*loop over block.basis */
       /*       inprod[j + i*g_N_s] = block_scalar_prod(block_list[i].basis[j], psi[i], vol); */
       inprod[j + i*g_N_s] = scalar_prod(psi[i], block_list[i].basis[j], vol, 0);
+      invvec[j + i*g_N_s] = 0.;
     }
   }
 
-  if(1) {
-    gcr4complex(invvec, inprod, 10, 1000, 1.e-24, 0, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_D);
-  }
-  else {
-    little_P_L(v, inprod);
-    gcr4complex(w, v, 10, 1000, 1.e-24, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_P_L_D);
-    little_P_R(v, w);
-    little_project(w, inprod, g_N_s);
-    for(i = 0; i < nb_blocks*g_N_s; ++i)
-      invvec[i] = w[i] + v[i];
-  }
-  little_D(result, invvec); /* This should be a proper inverse now */
 
-  dif = 0.0;
-  for(ctr_t = 0; ctr_t < nb_blocks * g_N_s; ++ctr_t){
-    dif += (creal(inprod[ctr_t]) - creal(result[ctr_t])) * (creal(inprod[ctr_t]) - creal(result[ctr_t]));
-    dif += (cimag(inprod[ctr_t]) - cimag(result[ctr_t])) * (cimag(inprod[ctr_t]) - cimag(result[ctr_t]));
-  }
-  dif = sqrt(dif);
+  gcr4complex(invvec, inprod, little_gmres_m_parameter, 1000, 1.e-24, 0, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, 0, &little_D);
 
-  if (dif > 1e-8 * VOLUME){
-    printf("[WARNING] check_little_D_inversion: deviation found of size %1.5e!\n", dif);
-  }
-#ifdef MPI
+  little_D(result, invvec); /* This should be a proper inverse now */
+
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
 #endif
 
-  if ((g_debug_level > 2) && !g_proc_id){
-    printf("Inversion check on little_D\nStart:\n");
-    for(ctr_t = 0; ctr_t < nb_blocks * g_N_s; ++ctr_t){
-      printf("%1.9e + %1.9e I   ", creal(inprod[ctr_t]), cimag(inprod[ctr_t]));
-      if (ctr_t == g_N_s - 1)
-        printf("\n");
-    }
-    printf("\nInverted:\n");
-    for(ctr_t = 0; ctr_t < nb_blocks * g_N_s; ++ctr_t){
-      printf("%1.9e + %19e I   ", creal(invvec[ctr_t]), cimag(invvec[ctr_t]));
-      if (ctr_t == g_N_s - 1 )
-        printf("\n");
-    }
-    printf("\nResult:\n");
-    for(ctr_t = 0; ctr_t < nb_blocks * g_N_s; ++ctr_t){
-      printf("%1.9e + %1.9e I   ", creal(result[ctr_t]), cimag(result[ctr_t]));
-      if (ctr_t == g_N_s - 1)
-        printf("\n");
+  ldiff(invvec, result, inprod, nb_blocks*g_N_s);
+  dif = lsquare_norm(invvec, nb_blocks*g_N_s, 1);
+  for (i = 0; i < nb_blocks; ++i) {/* loop over blocks */
+    for (j = 0; j < 9*g_N_s; ++j) {/*loop over block.basis */
+      invvec[j + i*9*g_N_s] = 0.; /* stride by 9*g_N_s so the whole buffer is cleared */
+      inprod[j + i*9*g_N_s] = 0.; /* (assumes nb_blocks*9*g_N_s entries, as passed to gcr4complex) */
     }
-    printf("\n");
+  }
+
+  if(g_proc_id == g_stdio_proc) {
+    printf("# # check_little_D_inversion: squared residue found of size %1.5e! ", dif);
+    if(dif < 1.e-24) printf("#  -> passed\n\n");
+    else printf("#  -> FAILED\n\n");
   }
 
   finalize_solver(work_fields, nr_wf);
@@ -898,7 +984,7 @@ void check_little_D_inversion(const int repro) {
 void check_local_D(const int repro)
 {
   spinor * r[8];
-  int j, vol = block_list[0].volume/2, i;
+  int j, vol = block_list[0].volume/2;
   double nrm;
   spinor ** work_fields = NULL;
   const int nr_wf = 7;
@@ -909,7 +995,8 @@ void check_local_D(const int repro)
   diff(work_fields[0], work_fields[2], block_list[0].basis[0], block_list[0].volume);
   nrm = square_norm(work_fields[0], block_list[0].volume, 0);
   if(g_proc_id == 0) {
-    printf("\nblock even/odd: ||psi - psi_recon|| = %1.5e\n", sqrt(nrm));
+    printf("\n# block even/odd: ||psi - psi_recon|| = %1.5e\n", sqrt(nrm)); /* newline first so the message line itself carries the "# " prefix */
+    printf("# next we compare local D against the Hopping matrix\n");
     fflush(stdout);
   }
 
@@ -920,112 +1007,160 @@ void check_local_D(const int repro)
     /* Now test the block hopping matrix */
     /* split into even/odd sites         */
     block_convert_lexic_to_eo(work_fields[0], work_fields[1], block_list[j].basis[0]);
-  
+
     /* Even sites */
     Block_H_psi(&block_list[j], g_spinor_field[DUM_DERI], work_fields[1], EO);
-    assign_mul_one_pm_imu(work_fields[2], work_fields[0], 1., vol); 
+    if(g_c_sw > 0)
+      assign_mul_one_sw_pm_imu_block(EE, work_fields[2], work_fields[0], g_mu, &block_list[j]);
+    else 
+      assign_mul_one_pm_imu(work_fields[2], work_fields[0], 1., vol);
+    
     assign_add_mul_r(work_fields[2], g_spinor_field[DUM_DERI], 1., vol);
 
     /* Odd sites */
     Block_H_psi(&block_list[j], g_spinor_field[DUM_DERI], work_fields[0], OE);
-    assign_mul_one_pm_imu(work_fields[3], work_fields[1], 1., vol); 
+    if(g_c_sw > 0)
+      assign_mul_one_sw_pm_imu_block(OO,work_fields[3], work_fields[1], g_mu, &block_list[j]);
+    else 
+      assign_mul_one_pm_imu(work_fields[3], work_fields[1], 1., vol);
+ 
     assign_add_mul_r(work_fields[3], g_spinor_field[DUM_DERI], 1., vol);
 
     /* convert back to block spinor */
     block_convert_eo_to_lexic(work_fields[5], work_fields[2], work_fields[3]);
 
-    if(g_proc_id == 0 && g_debug_level > 5) {
-      for(i = 0; i < block_list[0].volume; i++) {
-	if(fabs(creal(work_fields[6][i].s0.c0)) > 1.e-15 || fabs(creal(work_fields[5][i].s0.c0)) > 1.e-15) {
-	  printf("%d %e %d\n", i, creal(work_fields[6][i].s0.c0), block_list[0].volume);
-	  printf("%d %e\n", i, creal(work_fields[5][i].s0.c0));
-	}
-      }
-    }
-
     diff(work_fields[4], work_fields[5], work_fields[6], block_list[0].volume);
     nrm = square_norm(work_fields[4], block_list[0].volume, 0);
     if(sqrt(nrm) > 1.e-12) {
-      printf("Check failed for local D against Hopping Matrix: ||delta|| = %1.5e block %d process %d\n", sqrt(nrm), j, g_proc_id);
+      printf("# Check failed for local D against Hopping Matrix: ||delta|| = %1.5e block %d process %d\n", sqrt(nrm), j, g_proc_id);
     }
   }
+  
+  if(g_proc_id == 0) {
+    printf("# ...done\n");
+    printf("# Test Msap and Msap_eo to reduce residue\n");
+    printf("# Expect something around 5.e-2 for the relative reduction with Ncycle=4, Niter=4\n\n");
+  }
   /* check Msap and Msap_eo on a radom vector */
   random_spinor_field_lexic(work_fields[0], repro, RN_GAUSS);
+  double nrm2 =  square_norm(work_fields[0], VOLUME, 1);
+  if(g_proc_id == 0) {
+    printf("# Initial residue ||r||^2 = %1.5e\n", nrm2);
+  }
   zero_spinor_field(work_fields[1], VOLUME);
-  Msap(work_fields[1], work_fields[0], 2);
+  Msap(work_fields[1], work_fields[0], 5,3);
   D_psi(work_fields[2], work_fields[1]);
   diff(work_fields[3], work_fields[2], work_fields[0], VOLUME);
   nrm = square_norm(work_fields[3], VOLUME, 1);
   if(g_proc_id == 0) {
-    printf("Msap relaxed the residue to ||r||^2 = %1.5e\n", nrm);
+    printf("# Msap relaxed the residue to ||r||^2 = %1.5e relative reduction %1.5e\n\n", nrm, nrm/nrm2);
   }
 
   zero_spinor_field(work_fields[1], VOLUME);
-  Msap_eo(work_fields[1], work_fields[0], 2);
+  Msap_eo(work_fields[1], work_fields[0], 5,3);
   D_psi(work_fields[2], work_fields[1]);
   diff(work_fields[3], work_fields[2], work_fields[0], VOLUME);
   nrm = square_norm(work_fields[3], VOLUME, 1);
   if(g_proc_id == 0) {
-    printf("Msap_eo relaxed the residue to ||r||^2 = %1.5e\n", nrm);
+    printf("# Msap_eo relaxed the residue to ||r||^2 = %1.5e relative reduction %1.5e\n\n", nrm, nrm/nrm2);
+    printf("# Now we test the block MR with even/odd\n");
+    printf("# Expect 1.e-3 or so on each block\n\n");
   }
 
   for(j = 0; j < 6; j++) {
     r[j] = work_fields[j];
   }
   for(j = 0; j < nb_blocks; j++) {
-    
+
     block_convert_lexic_to_eo(r[0], r[1], block_list[j].basis[0]);
     /* check even/odd inversion for Block_D_psi*/
     /* varphi_e in r[2] */
-    assign_mul_one_pm_imu_inv(r[2], r[0], +1., vol);
+    if(g_c_sw > 0)
+      assign_mul_one_sw_pm_imu_inv_block(EE,r[2], r[0], g_mu, &block_list[j]);
+    else
+      assign_mul_one_pm_imu_inv(r[2], r[0], +1., vol);
+
     Block_H_psi(&block_list[j], r[3], r[2], OE);
     /* a_odd = a_odd + b_odd */
     /* varphi_o in r[3] */
     assign_mul_add_r(r[3], -1., r[1], vol);
     /* psi_o in r[1] */
-    mrblk(r[1], r[3], 3, 1.e-31, 1, vol, &Mtm_plus_block_psi, j);
-    
+    if(g_c_sw > 0) {
+      mrblk(r[1], r[3], r[4], 3, 1.e-31, 1, vol, &Msw_plus_block_psi, j);
+    }
+    else {
+      mrblk(r[1], r[3], r[4], 3, 1.e-31, 1, vol, &Mtm_plus_block_psi, j);
+    }
+
     Block_H_psi(&block_list[j], r[0], r[1], EO);
-    mul_one_pm_imu_inv(r[0], +1., vol);
+    assign(r[5],r[0],VOLUMEPLUSRAND);
+    if(g_c_sw > 0)
+      assign_mul_one_sw_pm_imu_inv_block(EE, r[0], r[5], g_mu, &block_list[j]);
+    else
+      mul_one_pm_imu_inv(r[0], +1., vol);
     /* a_even = a_even + b_even */
     /* check this sign +1 seems to be right in Msap_eo */
     assign_add_mul_r(r[2], r[0], -1., vol);
-    
+
     block_convert_eo_to_lexic(r[4], r[2], r[1]);
-    
+
     Block_D_psi(&block_list[j], r[5], r[4]);
     diff(r[0], block_list[j].basis[0], r[5], block_list[j].volume);
     nrm = square_norm(r[0], block_list[j].volume, 0);
     if(g_proc_id == 0) {
-      printf("mr_eo, block=%d: ||r||^2 = %1.5e\n", j, nrm);
+      printf("# mr_eo, block=%d: ||r||^2 = %1.5e\n", j, nrm);
     }
   }
-  for(j = 0; j < nb_blocks; j++) {
-    block_convert_lexic_to_eo(r[0], r[1], block_list[j].basis[0]);
-    /* check even/odd inversion for Block_D_psi*/
-    /* varphi_e in r[2] */
-    assign_mul_one_pm_imu_inv(r[2], r[0], +1., vol);
-    Block_H_psi(&block_list[j], r[3], r[2], OE);
-    /* a_odd = a_odd + b_odd */
-    /* varphi_o in r[3] */
-    assign_mul_add_r(r[3], -1., r[1], vol);
-    /* psi_o in r[1] */
-    mul_one_pm_imu_inv(r[3], +1., vol); 
-    mrblk(r[1], r[3], 3, 1.e-31, 1, vol, &Mtm_plus_sym_block_psi, j);
-    
-    Block_H_psi(&block_list[j], r[0], r[1], EO);
-    mul_one_pm_imu_inv(r[0], +1., vol);
-    /* a_even = a_even + b_even */
-    /* check this sign +1 seems to be right in Msap_eo */
-    assign_add_mul_r(r[2], r[0], -1., vol);
-    
-    block_convert_eo_to_lexic(r[4], r[2], r[1]);
-    
-    Block_D_psi(&block_list[j], r[5], r[4]);
-    diff(r[0], block_list[j].basis[0], r[5], block_list[j].volume);
-    nrm = square_norm(r[0], block_list[j].volume, 0);
-    if(g_proc_id == 0) {
-      printf("mr_eo (symmetric eo), block=%d: ||r||^2 = %1.5e\n", j, nrm);
+  if( g_c_sw <= 0 ) {
+    for(j = 0; j < nb_blocks; j++) {
+      block_convert_lexic_to_eo(r[0], r[1], block_list[j].basis[0]);
+      /* check even/odd inversion for Block_D_psi*/
+      /* varphi_e in r[2] */
+      if(g_c_sw > 0)
+        assign_mul_one_sw_pm_imu_inv_block(EE,r[2],r[0], g_mu, &block_list[j]);
+      else
+        assign_mul_one_pm_imu_inv(r[2], r[0], +1., vol);
+      
+      Block_H_psi(&block_list[j], r[3], r[2], OE);
+      /* a_odd = a_odd + b_odd */
+      /* varphi_o in r[3] */
+      assign_mul_add_r(r[3], -1., r[1], vol);
+      /* psi_o in r[1] */
+      if(g_c_sw > 0) {
+        // FIXME: this cannot be correct!
+        assign(r[5],r[3],VOLUMEPLUSRAND);
+        assign_mul_one_sw_pm_imu_inv_block(OO, r[3], r[5], g_mu, &block_list[j]);
+      }
+      else {
+        mul_one_pm_imu_inv(r[3], +1., vol);
+      }
+      
+      if(g_c_sw > 0)
+        mrblk(r[1], r[3], r[4], 3, 1.e-31, 1, vol, &Msw_plus_sym_block_psi, j);
+      else
+        mrblk(r[1], r[3], r[4], 3, 1.e-31, 1, vol, &Mtm_plus_sym_block_psi, j);
+      
+      Block_H_psi(&block_list[j], r[0], r[1], EO);
+      
+      if(g_c_sw > 0){
+        assign(r[5],r[0],VOLUMEPLUSRAND);
+        assign_mul_one_sw_pm_imu_inv_block(EE, r[0], r[5], g_mu, &block_list[j]);
+      }
+      else{
+        mul_one_pm_imu_inv(r[0], +1., vol);}
+      
+      /* a_even = a_even + b_even */
+      /* check this sign +1 seems to be right in Msap_eo */
+      assign_add_mul_r(r[2], r[0], -1., vol);
+      
+      block_convert_eo_to_lexic(r[4], r[2], r[1]);
+      
+      Block_D_psi(&block_list[j], r[5], r[4]);
+      diff(r[0], block_list[j].basis[0], r[5], block_list[j].volume);
+      nrm = square_norm(r[0], block_list[j].volume, 0);
+      if(g_proc_id == 0) {
+        printf("# mr_eo (symmetric eo), block=%d: ||r||^2 = %1.5e\n", j, nrm);
+      }
     }
   }
   finalize_solver(work_fields, nr_wf);
diff --git a/solver/dfl_projector.h b/solver/dfl_projector.h
index ae4839a6f..7f4b22029 100644
--- a/solver/dfl_projector.h
+++ b/solver/dfl_projector.h
@@ -21,6 +21,8 @@
 
 #include "su3spinor.h"
 
+extern int cumiter_lgcr;
+
 void project(spinor * const out, spinor * const in);
 void project_left(spinor * const out, spinor * const in);
 void project_right(spinor * const out, spinor * const in);
@@ -30,9 +32,11 @@ int check_projectors(const int repro);
 void check_little_D_inversion(const int repro);
 void check_local_D(const int repro);
 void free_dfl_projector();
+void mg_precon(spinor * const out, spinor * const in);
 
 void little_project(_Complex double * const out, _Complex double * const in, const int  N);
 void little_project_eo(_Complex double * const out, _Complex double * const in, const int N);
+void little_project_eo_32(_Complex float * const out, _Complex float * const in, const int N);
 void little_P_L_D(_Complex double * const out, _Complex double * const in);
 void little_P_L_D_sym(_Complex double * const out, _Complex double * const in);
 void little_D_P_R(_Complex double * const out, _Complex double * const in);
@@ -40,6 +44,8 @@ void little_P_R(_Complex double * const out, _Complex double * const in);
 void little_P_L(_Complex double * const out, _Complex double * const in);
 void little_P_R_sym(_Complex double * const out, _Complex double * const in);
 void little_P_L_sym(_Complex double * const out, _Complex double * const in);
+void little_mg_precon(_Complex double * const out, _Complex double * const in);
+void little_mg_precon_32(_Complex float * const out, _Complex float * const in);
 
 extern double dfl_little_D_prec;
 extern int dfl_sloppy_prec;
diff --git a/solver/diagonalise_general_matrix.c b/solver/diagonalise_general_matrix.c
index 782ef971a..e7c2598aa 100644
--- a/solver/diagonalise_general_matrix.c
+++ b/solver/diagonalise_general_matrix.c
@@ -40,7 +40,7 @@
  ******************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/dirac_operator_eigenvectors.c b/solver/dirac_operator_eigenvectors.c
index bdef1ec24..4953653d4 100644
--- a/solver/dirac_operator_eigenvectors.c
+++ b/solver/dirac_operator_eigenvectors.c
@@ -19,7 +19,7 @@
  ************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-#include<config.h>
+#include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -35,7 +35,7 @@
 #include <stdlib.h>
 
 #include "global.h"
-#include "config.h"
+#include "tmlqcd_config.h"
 #include "su3.h"
 #include "sse.h"
 #include "monomial/monomial.h"
@@ -1137,7 +1137,7 @@ void spinorStructEigenvecQtm(spinor *fv,double kappa,double mu,int epsilon,int k
   double q[8];
   double p_mu[4];
   double p_mu_t[4];
-  double psq,psq_tilde,M_wilson,prefactor,beta,norm_factor,swap_dummy;
+  double psq,psq_tilde,M_wilson,prefactor,beta,norm_factor;
   double *fv_=(double*)fv;
   int index;
 
@@ -1165,10 +1165,10 @@ void spinorStructEigenvecQtm(spinor *fv,double kappa,double mu,int epsilon,int k
 
   /* multiply with i ... */
   /* .. so first swap re <-> im .. */
-  SWAP(q[0],q[1],swap_dummy);
-  SWAP(q[2],q[3],swap_dummy);
-  SWAP(q[4],q[5],swap_dummy);
-  SWAP(q[6],q[7],swap_dummy);
+  SWAP(q[0],q[1]);
+  SWAP(q[2],q[3]);
+  SWAP(q[4],q[5]);
+  SWAP(q[6],q[7]);
 
   /* and multiply new real part (former imag part) with -1 */
   q[0]*=-prefactor; q[1]*=prefactor; q[2]*=-prefactor; q[3]*=prefactor;
@@ -1216,7 +1216,7 @@ void spinorStructEigenvecQtmSu3Vector(spinor *fv,double kappa,double mu,int epsi
   double q[8];
   double p_mu[4];
   double p_mu_t[4];
-  double psq,psq_tilde,M_wilson,prefactor,beta,norm_factor,swap_dummy;
+  double psq,psq_tilde,M_wilson,prefactor,beta,norm_factor;
 
   calcPmuLattice(rawp,p_mu,tt,ll);
   psq=p_mu[0]*p_mu[0]+
@@ -1242,10 +1242,10 @@ void spinorStructEigenvecQtmSu3Vector(spinor *fv,double kappa,double mu,int epsi
 
   /* multiply with i ... */
   /* .. so first swap re <-> im .. */
-  SWAP(q[0],q[1],swap_dummy);
-  SWAP(q[2],q[3],swap_dummy);
-  SWAP(q[4],q[5],swap_dummy);
-  SWAP(q[6],q[7],swap_dummy);
+  SWAP(q[0],q[1]);
+  SWAP(q[2],q[3]);
+  SWAP(q[4],q[5]);
+  SWAP(q[6],q[7]);
 
   /* and multiply new real part (former imag part) with -1 */
   q[0]*=-prefactor; q[1]*=prefactor; q[2]*=-prefactor; q[3]*=prefactor;
@@ -2092,7 +2092,7 @@ int * makeDiagFalloffPmuMap(int n,int maxdmanhat){
 
     for(int i = 0;i<10;i++){
       ranlxd(r,2);
-      SWAP(drawp[(int)(r[0]*4.)],drawp[(int)(r[1]*4.)],r[2]);
+      SWAP(drawp[(int)(r[0]*4.)],drawp[(int)(r[1]*4.)]);
 
   }
     fprintf(drawpStatFile," %d %d %d %d\n",drawp[0],drawp[1],drawp[2],drawp[3]);
diff --git a/solver/dirac_operator_eigenvectors.h b/solver/dirac_operator_eigenvectors.h
index da8f10187..b4ef212e1 100644
--- a/solver/dirac_operator_eigenvectors.h
+++ b/solver/dirac_operator_eigenvectors.h
@@ -22,7 +22,7 @@
 #define _DIRAC_EIGENVALUES_H
 
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "tmlqcd_config.h"
 #endif
 #ifdef HAVE_FFTW
   #include <fftw3.h>
@@ -40,11 +40,6 @@
 #define M_PI  3.14159265358979323846
 #endif
 
-#define SWAP(x,y,d)\
-  d=x;\
-  x=y;\
-  y=d;
-
 #define min(x,y)\
   ((x<y)?x:y)
 #define max(x,y)\
diff --git a/solver/eigcg.c b/solver/eigcg.c
index 096dea6a4..eb6646ba0 100644
--- a/solver/eigcg.c
+++ b/solver/eigcg.c
@@ -91,13 +91,13 @@
 /***********************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -199,7 +199,7 @@ void eigcg(int n, int lde, spinor * const x, spinor * const b, double *normb,
 
   int info, allelems = v_max*v_max;
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   parallel=1;
 #else
   parallel=0;
@@ -412,7 +412,7 @@ void eigcg(int n, int lde, spinor * const x, spinor * const b, double *normb,
     /* Restart test */
     if(nev==0)
     {
-       if (  ( (*reshist < restart_eps_sq) && (rel_prec ==0) ) || ((*reshist < restart_eps_sq*reshist_init ) && (rel_prec==1)) ) 
+       if (*reshist < (restart_eps_sq*reshist_init) ) 
        {  
            *flag = 3;
             break;  /* break do not return */
diff --git a/solver/eigenvalues.c b/solver/eigenvalues.c
index 1f81444b2..73c9b4060 100644
--- a/solver/eigenvalues.c
+++ b/solver/eigenvalues.c
@@ -32,7 +32,7 @@
  **************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/eigenvalues_Jacobi.c b/solver/eigenvalues_Jacobi.c
index a61f3deec..c6e2a7517 100644
--- a/solver/eigenvalues_Jacobi.c
+++ b/solver/eigenvalues_Jacobi.c
@@ -23,7 +23,7 @@
  *
  **************************************************************************/
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -81,7 +81,7 @@ double eigenvalues_Jacobi(int * nr_of_eigenvalues, const int max_iterations,
   int dims[]={1, LX*g_nproc_x, LY*g_nproc_y, LZ*g_nproc_z};
   FILE *efp;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   double atime, etime;
   MPI_File fp;
   MPI_Offset siteSize=3*2*sizeof(double);
@@ -185,7 +185,7 @@ double eigenvalues_Jacobi(int * nr_of_eigenvalues, const int max_iterations,
   for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) {
     sprintf(filename, "eigenvector.%.3d.%.3d.%.4d", v0dim, tslice, nstore);
     s=(su3_vector*)&eigenvectors_su3v[v0dim*N2];
-#ifdef MPI 
+#ifdef TM_USE_MPI 
 # ifdef HAVE_LIBLEMON
     // SEGNO: dovrebbe stampare 8*2*3*SPACEVOLUME data per file, ma ne stampa 8*2*4n*SPACEVOLUME (n=4-1 per ev 0-3)
 
diff --git a/solver/eigenvalues_bi.c b/solver/eigenvalues_bi.c
index 68f0c15af..63d78e483 100644
--- a/solver/eigenvalues_bi.c
+++ b/solver/eigenvalues_bi.c
@@ -35,12 +35,12 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
diff --git a/solver/fgmres.c b/solver/fgmres.c
index 283ff0f80..0952cf805 100644
--- a/solver/fgmres.c
+++ b/solver/fgmres.c
@@ -35,23 +35,25 @@
  ********************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include<stdio.h>
 #include<math.h>
-#include"global.h"
-#include"su3.h"
-#include"linalg_eo.h"
-#include"gmres_precon.h"
-#include"operator/tm_operators.h"
-#include"sub_low_ev.h"
-#include"poly_precon.h"
+#include "global.h"
+#include "su3.h"
+#include "linalg_eo.h"
+#include "gmres_precon.h"
+#include "operator/tm_operators.h"
+#include "sub_low_ev.h"
+#include "poly_precon.h"
 #include "Msap.h"
-#include"gamma.h"
+#include "gamma.h"
 #include "start.h"
 #include "solver_field.h"
-#include"fgmres.h"
+#include "dfl_projector.h"
+#include "gettime.h"
+#include "fgmres.h"
 
 static void init_gmres(const int _M, const int _V);
 
@@ -76,28 +78,16 @@ int fgmres(spinor * const P,spinor * const Q,
 
   int restart, i, j, k;
   double beta, eps, norm;
+  double atime, etime, Ptime = 0., patime;
   _Complex double tmp1, tmp2;
   spinor * r0;
   spinor ** solver_field = NULL;
   const int nr_sf = 3;
 
+  atime = gettime();
+  cumiter_lgcr = 0;
   if(N == VOLUME) {
     init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);/* #ifdef HAVE_LAPACK */
-/*     _FT(zhetrf)("U", &n, G, &N, ipiv, work, &lwork, &info, 1); */
-/* #endif */
-/*     if(info != 0) { */
-/*       printf("Error in zhetrf info = %d\n", info); */
-/*     } */
-/*     else { */
-/* #ifdef HAVE_LAPACK */
-/*       _FT(zhetrs)("U", &n, &ONE, G, &N, ipiv, bn, &N, &info, 1); */
-/* #endif */
-/*       if(info != 0) { */
-/* 	printf("Error in zhetrs info = %d\n", info); */
-/*       } */
-/*     } */
-    /* solution again stored in bn */
-
   }
   else {
     init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
@@ -132,15 +122,19 @@ int fgmres(spinor * const P,spinor * const Q,
 
     for(j = 0; j < m; j++){
       /* solver_field[0]=A*M^-1*v_j */
-
+      patime = gettime();
       if(precon == 0) {
 	assign(Z[j], V[j], N);
       }
-      else {
+      else if(precon == 1) {
 	zero_spinor_field(Z[j], N);
-	/* poly_nonherm_precon(Z[j], V[j], 0.3, 1.1, 80, N); */
-	Msap(Z[j], V[j], 8);
+	Msap_eo(Z[j], V[j], 5, 3);
+      }
+      else {
+	mg_precon(Z[j], V[j]);
       }
+      Ptime += gettime() - patime;
+
       f(r0, Z[j]); 
       /* Set h_ij and omega_j */
       /* solver_field[1] <- omega_j */
@@ -190,6 +184,13 @@ int fgmres(spinor * const P,spinor * const Q,
 	}
 	assign(P, solver_field[2], N);
 	finalize_solver(solver_field, nr_sf);
+        etime = gettime();
+	if(g_proc_id == g_stdio_proc && g_debug_level > 0){
+	  printf("FGMRES %d\t%g final iterated residue\n", restart*m+j, creal(alpha[j+1])*creal(alpha[j+1])); 
+	  printf("FGMRES cumulative little solver iterations %d, average %e\n", cumiter_lgcr, (double)cumiter_lgcr/(double)(restart*m+j));
+          printf("FGMRES total solve time in s %e, time in preconditioner %e\n", etime - atime, Ptime);
+	  fflush(stdout);
+	}
 	return(restart*m+j);
       }
       /* if not */
@@ -220,6 +221,14 @@ int fgmres(spinor * const P,spinor * const Q,
   /* If maximal number of restarts is reached */
   assign(P, solver_field[2], N);
   finalize_solver(solver_field, nr_sf);
+  etime = gettime();
+  if(g_proc_id == g_stdio_proc && g_debug_level > 0){
+    printf("FGMRES max number of restarts reached!\n");
+    printf("FGMRES %d\t%g final iterated residue\n", restart*m+j, creal(alpha[j])*creal(alpha[j])); /* j == m after the loop: last residual written is alpha[m] */
+    printf("FGMRES cumulative little solver iterations %d, average %e\n", cumiter_lgcr, (double)cumiter_lgcr/(double)(restart*m+j));
+    printf("FGMRES total solve time in s %e, time in preconditioner %e\n", etime - atime, Ptime);
+    fflush(stdout);
+  }
   return(-1);
 }
 
@@ -232,8 +241,10 @@ static void init_gmres(const int _M, const int _V){
     if(init == 1){
       free(H);
       free(V);
+      free(Z);
       free(_h);
       free(_v);
+      free(_z);
       free(alpha);
       free(c);
       free(s);
diff --git a/solver/fgmres4bispinors.c b/solver/fgmres4bispinors.c
index 81f4e1891..60511c4bf 100644
--- a/solver/fgmres4bispinors.c
+++ b/solver/fgmres4bispinors.c
@@ -36,7 +36,7 @@
  ********************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include<stdio.h>
diff --git a/solver/fgmres4complex.c b/solver/fgmres4complex.c
new file mode 100644
index 000000000..c1cd7a9e2
--- /dev/null
+++ b/solver/fgmres4complex.c
@@ -0,0 +1,70 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *               2015 Mario Schroeck
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Generalized minimal residual (FGMRES) with a maximal number of restarts.    
+ * Solves Q=AP for _Complex double regular matrices A. Flexible version of GMRES 
+ * with the ability for variable right preconditioning. 
+ *
+ * Inout:                                                                      
+ *  _Complex double * P       : initial guess for the solution vector
+ * Input:                                                                      
+ *  _Complex double * Q       : source (right-hand side) vector
+ *  int m            : Maximal dimension of Krylov subspace                                     
+ *  int max_restarts : maximal number of restarts                                   
+ *  double eps       : stopping criterion                                                     
+ *  matrix_mult f    : pointer to a function containing the matrix mult
+ *                     for type matrix_mult see matrix_mult_typedef.h
+ *
+ * Author: Carsten Urbach <urbach@ifh.de>
+ ********************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include<stdlib.h>
+#include<stdio.h>
+#include<math.h>
+#include<string.h>
+#include"global.h"
+#include"su3.h"
+#include"linalg_eo.h"
+#include"solver_field.h"
+#include"dfl_projector.h"
+#include"gcr4complex.h"
+#include"fgmres4complex.h"
+
+
+
+#define _PSWITCH(s) s
+#define _F_TYPE double
+
+#include "fgmres4complex_body.c"
+
+#undef _PSWITCH
+#undef _F_TYPE
+
+
+#define _PSWITCH(s) s ## _32
+#define _F_TYPE float
+
+#include "fgmres4complex_body.c"
+
+#undef _PSWITCH
+#undef _F_TYPE
diff --git a/solver/fgmres4complex.h b/solver/fgmres4complex.h
new file mode 100644
index 000000000..1993ee507
--- /dev/null
+++ b/solver/fgmres4complex.h
@@ -0,0 +1,70 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *               2015 Mario Schroeck
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+/*******************************************************************************
+ * Generalized minimal residual (GMRES) with a maximal number of restarts.    
+ * Solves Q=AP for _Complex double regular matrices A.
+ * For details see: Andreas Meister, Numerik linearer Gleichungssysteme        
+ *   or the original citation:                                                 
+ * Y. Saad, M.H.Schultz in GMRES: A generalized minimal residual algorithm    
+ *                         for solving nonsymmetric linear systems.            
+ * 			SIAM J. Sci. Stat. Comput., 7: 856-869, 1986           
+ *           
+ * int fgmres4complex(_Complex double * const P, _Complex double * const Q,
+ *	   const int m, const int max_restarts, const double eps_sq, const int rel_prec,
+ *	   const int N, const int parallel, const int lda, const int precon, c_matrix_mult f)
+ *
+ * Returns the number of iterations performed; this is max_restarts*m if the maximal
+ * number of restarts has been reached.
+ *
+ * Inout:
+ *  _Complex double * P : guess for the solution vector, overwritten with the result
+ * Input:
+ *  _Complex double * Q : source vector
+ *  int m               : maximal dimension of Krylov subspace
+ *  int max_restarts    : maximal number of restarts
+ *  double eps_sq       : squared stopping criterion (relative to |Q| if rel_prec == 1)
+ *  c_matrix_mult f     : pointer to a function containing the matrix mult
+ *                        for the function type see matrix_mult_typedef.h
+ *
+ * Author: Carsten Urbach <urbach@ifh.de>
+ ********************************************************************************/
+
+#ifndef _FGMRES4COMPLEX_H
+#define _FGMRES4COMPLEX_H
+
+//#include"solver/matrix_mult_typedef.h"
+//#include"su3.h"
+/* NOTE(review): with the includes above disabled, c_matrix_mult and c_matrix_mult_32 must already be declared by the includer -- confirm. */
+int fgmres4complex(_Complex double * const P, _Complex double * const Q,
+		   const int m, const int max_restarts,
+		   const double eps_sq, const int rel_prec,
+		   const int N, const int parallel,
+		   const int lda, const int precon, c_matrix_mult f);
+/* Single-precision counterpart: same parameter list with _Complex float vectors and c_matrix_mult_32. */
+int fgmres4complex_32(_Complex float * const P, _Complex float * const Q,
+		      const int m, const int max_restarts,
+		      const double eps_sq, const int rel_prec,
+		      const int N, const int parallel,
+		      const int lda, const int precon, c_matrix_mult_32 f);
+/* Both functions are generated in fgmres4complex.c by including fgmres4complex_body.c twice. */
+
+#endif
diff --git a/solver/fgmres4complex_body.c b/solver/fgmres4complex_body.c
new file mode 100644
index 000000000..72f6cdaae
--- /dev/null
+++ b/solver/fgmres4complex_body.c
@@ -0,0 +1,243 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2016 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Flexible generalized minimal residual (FGMRES) with a maximal number of restarts.
+ * Solves Q=AP for _Complex double regular matrices A. Flexible version of GMRES
+ * with the ability for variable right preconditioning.
+ *
+ * Inout:
+ *  _Complex _F_TYPE * P : guess for the solution vector
+ * Input:
+ *  _Complex _F_TYPE * Q : source vector
+ *  int m            : maximal dimension of Krylov subspace
+ *  int max_restarts : maximal number of restarts
+ *  double eps_sq    : squared stopping criterion
+ *  c_matrix_mult f  : pointer to a function containing the matrix mult
+ *                     for the function type see matrix_mult_typedef.h
+ *
+ * Author: Carsten Urbach <urbach@ifh.de>
+ ********************************************************************************/
+
+static void _PSWITCH(init_lgmres)(const int _M, const int _V);
+/* File-scope work arrays, (re)allocated by init_lgmres for Krylov dimension M and vector length Vo. */
+static _Complex _F_TYPE ** _PSWITCH(H);      /* M+1 row pointers, M entries per row: the h_ij of the Arnoldi process */
+static _Complex _F_TYPE ** _PSWITCH(V);      /* M vectors of Vo elements: the basis vectors v_j */
+static _Complex _F_TYPE ** _PSWITCH(Z);      /* M vectors of Vo elements: copy of v_j or its preconditioned image */
+static _Complex _F_TYPE * _PSWITCH(alpha);   /* M+1 entries: right-hand side of the small triangular system */
+static _Complex _F_TYPE * _PSWITCH(c);       /* M entries: c_j = H[j][j]/beta */
+static _F_TYPE * _PSWITCH(s);                /* M entries: s_j = creal(H[j+1][j])/beta */
+extern void little_mg_precon_32(_Complex float *, _Complex float *);  /* only declared here; emitted in both instantiations */
+/* Restarted flexible GMRES for f(P) = Q. Returns restart*m if the true residual is exactly zero, restart*m+j when the stopping or stagnation test fires, max_restarts*m otherwise. */
+int _PSWITCH(fgmres4complex)(_Complex _F_TYPE * const P, _Complex _F_TYPE * const Q,
+                             const int m, const int max_restarts,
+                             const double eps_sq, const int rel_prec,
+                             const int N, const int parallel,
+                             const int lda, const int precon, _PSWITCH(c_matrix_mult) f) {
+  /* P: initial guess on entry, result on exit; Q: source; N: elements used per vector; lda: length handed to the work-space allocators. */
+  int restart, i, j, k;
+  double beta, eps, norm;
+  _Complex _F_TYPE tmp1, tmp2;
+  _Complex _F_TYPE * r0;
+  _Complex _F_TYPE ** solver_field = NULL;
+  const int nr_sf = 3;
+  int fltcntr = 0;
+  double alphasave = 0;
+  /* rel_prec selects the absolute (0) or relative-to-|Q| (1) stopping test; precon == 0 disables the preconditioner. */
+  _PSWITCH(init_lsolver_field)(&solver_field, /*why not N?*/ lda, nr_sf);/* #ifdef HAVE_LAPACK */
+  /* the tests below compare residual norms, not squared norms, hence the square root */
+  eps=sqrt(eps_sq);
+  _PSWITCH(init_lgmres)(m, lda);
+  r0 = solver_field[0];
+  /* |Q|, only used by the relative stopping test */
+  norm = sqrt(_PSWITCH(lsquare_norm)(Q, N, parallel));
+  /* the running solution lives in solver_field[2]; P is written back right before each return */
+  _PSWITCH(lassign)(solver_field[2], P, N);
+  for(restart = 0; restart < max_restarts; restart++){
+    /* r_0=Q-AP  (b=Q, x_0=P) */
+    f(r0, solver_field[2]);
+    _PSWITCH(ldiff)(r0, Q, r0, N);
+
+    /* v_0=r_0/||r_0||; alpha[0] holds ||r_0|| */
+    _PSWITCH(alpha)[0] = sqrt(_PSWITCH(lsquare_norm)(r0, N, parallel));
+
+    if(g_proc_id == g_stdio_proc && g_debug_level > 2){
+      printf("lFGMRES %d\t%g true residue\n", restart*m, creal(_PSWITCH(alpha)[0])*creal(_PSWITCH(alpha)[0]));
+      fflush(stdout);
+    }
+    /* the residual vanishes exactly: return the iterations of the completed restarts */
+    if(creal(_PSWITCH(alpha)[0])==0.){ 
+      _PSWITCH(lassign)(P, solver_field[2], N);
+      _PSWITCH(finalize_lsolver)(solver_field, nr_sf);
+      return(restart*m);
+    }
+
+    _PSWITCH(lmul_r)(_PSWITCH(V)[0], 1./creal(_PSWITCH(alpha)[0]), r0, N);
+
+    for(j = 0; j < m; j++){
+      /* solver_field[0]=A*M^-1*v_j */
+      /* Z[j] = M^-1*v_j: a plain copy of v_j without preconditioner, little_mg_precon otherwise */
+      if(precon == 0) {
+        _PSWITCH(lassign)(_PSWITCH(Z)[j], _PSWITCH(V)[j], N);
+      }
+      else {
+        _PSWITCH(little_mg_precon)(_PSWITCH(Z)[j], _PSWITCH(V)[j]);
+      }
+
+      f(r0, _PSWITCH(Z)[j]); 
+      /* Set h_ij and omega_j */
+      /* solver_field[1] <- omega_j (r0 is solver_field[0]) */
+      _PSWITCH(lassign)(solver_field[1], solver_field[0], N);
+      for(i = 0; i <= j; i++){
+        _PSWITCH(H)[i][j] = _PSWITCH(lscalar_prod)(_PSWITCH(V)[i], solver_field[1], N, parallel);
+        _PSWITCH(lassign_diff_mul)(solver_field[1], _PSWITCH(V)[i], _PSWITCH(H)[i][j], N);
+      }
+      /* h_{j+1,j} = ||omega_j||; afterwards the rotations stored for i < j are applied to column j */
+      _PSWITCH(H)[j+1][j] = sqrt(_PSWITCH(lsquare_norm)(solver_field[1], N, parallel));
+      for(i = 0; i < j; i++){
+        tmp1 = _PSWITCH(H)[i][j];
+        tmp2 = _PSWITCH(H)[i+1][j];
+        (_PSWITCH(H)[i][j]) = (tmp2) * (_PSWITCH(s)[i]);
+        (_PSWITCH(H)[i][j]) += conj(_PSWITCH(c)[i]) * (tmp1);
+        (_PSWITCH(H)[i+1][j]) = (tmp1) * (_PSWITCH(s)[i]);
+        (_PSWITCH(H)[i+1][j]) -= (_PSWITCH(c)[i]) * (tmp2);
+      }
+
+      /* Set beta, s, _PSWITCH(c), _PSWITCH(alpha)[j],[j+1] */
+      beta = sqrt(creal(_PSWITCH(H)[j][j] * conj(_PSWITCH(H)[j][j])) + creal(_PSWITCH(H)[j+1][j] * conj(_PSWITCH(H)[j+1][j])));
+      _PSWITCH(s)[j] = creal(_PSWITCH(H)[j+1][j]) / beta;
+      (_PSWITCH(c)[j]) = (_PSWITCH(H)[j][j]) / beta;
+      (_PSWITCH(H)[j][j]) = beta;
+      (_PSWITCH(alpha)[j+1]) = (_PSWITCH(alpha)[j]) * (_PSWITCH(s)[j]);
+      tmp1 = _PSWITCH(alpha)[j];
+      (_PSWITCH(alpha)[j]) = conj(_PSWITCH(c)[j]) * (tmp1);
+
+      /* precision reached? (creal(alpha[j+1]) serves as the iterated residual norm) */
+      if(g_proc_id == g_stdio_proc && g_debug_level > 2){
+        printf("lFGMRES\t%d\t%g iterated residue\n", restart*m+j, creal(_PSWITCH(alpha)[j+1])*creal(_PSWITCH(alpha)[j+1]));
+        fflush(stdout);
+      }
+      if(creal(_PSWITCH(alpha)[j+1]) > 0.999*alphasave) {  /* less than 0.1% reduction since the previous step: count as a flat step */
+        fltcntr++;
+      }
+      else fltcntr = 0;
+      alphasave = creal(_PSWITCH(alpha)[j+1]);  /* next: stop on more than 20 consecutive flat steps or when the tolerance is met */
+      if((fltcntr > 20) || ((creal(_PSWITCH(alpha)[j+1]) <= eps) && (rel_prec == 0)) || ((creal(_PSWITCH(alpha)[j+1]) <= eps*norm) && (rel_prec == 1))){
+        (_PSWITCH(alpha)[j]) = (_PSWITCH(alpha)[j]) * (1./creal(_PSWITCH(H)[j][j]));  /* back-substitution with the triangular H; the update is accumulated along Z */
+        _PSWITCH(lassign_add_mul)(solver_field[2], _PSWITCH(Z)[j], _PSWITCH(alpha)[j], N);
+        for(i = j-1; i >= 0; i--){
+          for(k = i+1; k <= j; k++){
+            (tmp1) = (_PSWITCH(H)[i][k]) * (_PSWITCH(alpha)[k]); 
+            (_PSWITCH(alpha)[i]) -= tmp1;
+          }
+          (_PSWITCH(alpha)[i]) = (_PSWITCH(alpha)[i]) * (1./creal(_PSWITCH(H)[i][i]));
+          _PSWITCH(lassign_add_mul)(solver_field[2], _PSWITCH(Z)[i], _PSWITCH(alpha)[i], N);
+        }
+        for(i = 0; i < m; i++){
+          _PSWITCH(alpha)[i] = creal(_PSWITCH(alpha)[i]);  /* keeps only the real parts */
+        }
+        _PSWITCH(lassign)(P, solver_field[2], N);
+        _PSWITCH(finalize_lsolver)(solver_field, nr_sf);
+        return(restart*m+j);
+      }
+      /* if not: v_{j+1} = omega_j/h_{j+1,j}, skipped after the last inner step */
+      else{
+        if(j != m-1){
+          _PSWITCH(lmul_r)(_PSWITCH(V)[(j+1)], 1./creal(_PSWITCH(H)[j+1][j]), solver_field[1], N);
+        }
+      }
+
+    }
+    j=m-1;
+    /* prepare for restart: same back-substitution as above over all m columns */
+    (_PSWITCH(alpha)[j]) = (_PSWITCH(alpha)[j]) * (1./creal(_PSWITCH(H)[j][j]));
+    _PSWITCH(lassign_add_mul)(solver_field[2], _PSWITCH(Z)[j], _PSWITCH(alpha)[j], N);
+    for(i = j-1; i >= 0; i--){
+      for(k = i+1; k <= j; k++){
+        (tmp1) = (_PSWITCH(H)[i][k]) * (_PSWITCH(alpha)[k]);
+        (_PSWITCH(alpha)[i]) -= tmp1;
+      }
+      (_PSWITCH(alpha)[i]) = (_PSWITCH(alpha)[i]) * (1./creal(_PSWITCH(H)[i][i]));
+      _PSWITCH(lassign_add_mul)(solver_field[2], _PSWITCH(Z)[i], _PSWITCH(alpha)[i], N);
+    }
+    for(i = 0; i < m; i++){
+      _PSWITCH(alpha)[i] = creal(_PSWITCH(alpha)[i]);
+    }
+  }
+
+  /* If maximal number of restarts is reached */
+  _PSWITCH(lassign)(P, solver_field[2], N);
+  _PSWITCH(finalize_lsolver)(solver_field, nr_sf);
+  return(max_restarts*m);
+}
+/* (Re)allocate the file-scope FGMRES work arrays for Krylov dimension _M and vector length _V. */
+static void _PSWITCH(init_lgmres)(const int _M, const int _V){
+  /* sizes and backing storage of the current allocation, remembered between calls */
+  static int cur_len = -1;
+  static int cur_m = -1;
+  static int ready = 0;
+  static _Complex _F_TYPE * v_store;
+  static _Complex _F_TYPE * z_store;
+  static _Complex _F_TYPE * h_store;
+  /* the existing buffers are reused as long as both sizes are unchanged */
+  if((ready != 0) && (cur_m == _M) && (cur_len == _V)){
+    return;
+  }
+  if(ready == 1){
+    free(h_store);
+    free(v_store);
+    free(z_store);
+    free(_PSWITCH(H));
+    free(_PSWITCH(V));
+    free(_PSWITCH(Z));
+    free(_PSWITCH(alpha));
+    free(_PSWITCH(c));
+    free(_PSWITCH(s));
+  }
+  cur_len = _V;
+  cur_m = _M;
+  _PSWITCH(H) = calloc(cur_m+1, sizeof(_Complex _F_TYPE *));
+  _PSWITCH(V) = calloc(cur_m, sizeof(_Complex _F_TYPE *));
+  _PSWITCH(Z) = calloc(cur_m, sizeof(_Complex _F_TYPE *));
+#if (defined SSE || defined SSE2)
+  h_store = calloc((cur_m+2)*cur_m, sizeof(_Complex _F_TYPE));
+  _PSWITCH(H)[0] = (_Complex _F_TYPE *)(((unsigned long int)(h_store)+ALIGN_BASE)&~ALIGN_BASE);
+  v_store = calloc(cur_m*cur_len+1, sizeof(_Complex _F_TYPE));
+  _PSWITCH(V)[0] = (_Complex _F_TYPE *)(((unsigned long int)(v_store)+ALIGN_BASE)&~ALIGN_BASE);
+  z_store = calloc(cur_m*cur_len+1, sizeof(_Complex _F_TYPE));
+  _PSWITCH(Z)[0] = (_Complex _F_TYPE *)(((unsigned long int)(z_store)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+  h_store = calloc((cur_m+1)*cur_m, sizeof(_Complex _F_TYPE));
+  _PSWITCH(H)[0] = h_store;
+  v_store = calloc(cur_m*cur_len, sizeof(_Complex _F_TYPE));
+  _PSWITCH(V)[0] = v_store;
+  z_store = calloc(cur_m*cur_len, sizeof(_Complex _F_TYPE));
+  _PSWITCH(Z)[0] = z_store;
+#endif
+  _PSWITCH(s) = calloc(cur_m, sizeof(_F_TYPE));
+  _PSWITCH(c) = calloc(cur_m, sizeof(_Complex _F_TYPE));
+  _PSWITCH(alpha) = calloc(cur_m+1, sizeof(_Complex _F_TYPE));
+  /* consecutive rows: V and Z advance by cur_len elements, H by cur_m elements */
+  for(int row = 0; row + 1 < cur_m; row++){
+    _PSWITCH(V)[row+1] = _PSWITCH(V)[row] + cur_len;
+    _PSWITCH(H)[row+1] = _PSWITCH(H)[row] + cur_m;
+    _PSWITCH(Z)[row+1] = _PSWITCH(Z)[row] + cur_len;
+  }
+  _PSWITCH(H)[cur_m] = _PSWITCH(H)[cur_m-1] + cur_m;
+  ready = 1;
+}
diff --git a/solver/gcr.c b/solver/gcr.c
index 21156bfed..718b9bb84 100644
--- a/solver/gcr.c
+++ b/solver/gcr.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include<stdio.h>
@@ -85,7 +85,7 @@ int gcr(spinor * const P, spinor * const Q,
     diff(rho, Q, tmp, N);
     err = square_norm(rho, N, 1);
     if(g_proc_id == g_stdio_proc && g_debug_level > 1){
-      printf("GCR: iteration number: %d, true residue: %g\n", iter, err); 
+      printf("# GCR: iteration number: %d, true residue: %g\n", iter, err); 
       fflush(stdout);
     }
     if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) {
@@ -97,14 +97,17 @@ int gcr(spinor * const P, spinor * const Q,
       if(precon == 0) {
 	assign(xi[k], rho, N);
       }
-      else {
+      else if(precon == 1){
         zero_spinor_field(xi[k], N);  
-        Msap_eo(xi[k], rho, 6);   
- 	/* Msap(xi[k], rho, 8); */
+        Msap_eo(xi[k], rho, NcycleMsap, NiterMsap);   
+      }
+      else {
+	mg_precon(xi[k], rho);
       }
+
 	  
       dfl_sloppy_prec = 1;
-      dfl_little_D_prec = 1.e-12;
+      //dfl_little_D_prec = 1.e-12;
       f(tmp, xi[k]); 
 	  
       /* tmp will become chi[k] */
@@ -119,8 +122,8 @@ int gcr(spinor * const P, spinor * const Q,
       err = square_norm(rho, N, 1);
       iter ++;
       if(g_proc_id == g_stdio_proc && g_debug_level > 2){
-        if(rel_prec == 1) printf("# GCR: %d\t%g >= %g iterated residue\n", iter, err, eps_sq*norm_sq); 
-        else printf("# GCR: %d\t%g >= %giterated residue\n", iter, err, eps_sq);
+        if(rel_prec == 1) printf("# GCR: %d\t%g iterated residue\n", iter, err); 
+        else printf("# GCR: %d\t%g iterated residue\n", iter, err);
         fflush(stdout);
       }
       /* Precision reached? */
@@ -133,15 +136,15 @@ int gcr(spinor * const P, spinor * const Q,
     c[k] /= b[k];
     assign_add_mul(P, xi[k], c[k], N);
     for(l = k - 1; l >= 0; --l)
-    {
-      for(i = l+1; i <= k; ++i)
       {
-        ctmp = a[l][i] * c[i];
-        c[l] -= ctmp;
+	for(i = l+1; i <= k; ++i)
+	  {
+	    ctmp = a[l][i] * c[i];
+	    c[l] -= ctmp;
+	  }
+	c[l] /= b[l];
+	assign_add_mul(P, xi[l], c[l], N);
       }
-      c[l] /= b[l];
-      assign_add_mul(P, xi[l], c[l], N);
-    }
   }
   finalize_solver(solver_field, nr_sf);
   return(-1);
diff --git a/solver/gcr4complex.c b/solver/gcr4complex.c
index f88709044..1ea5b5e50 100644
--- a/solver/gcr4complex.c
+++ b/solver/gcr4complex.c
@@ -19,237 +19,44 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include<stdio.h>
 #include<math.h>
 #include<string.h>
+#ifdef TM_USE_OMP
+#include <omp.h>
+#endif
 #include"global.h"
 #include"su3.h"
 #include"linalg_eo.h"
+// needed for the dummy function little_mg_precon ...
+#include"block.h"
+#include"dfl_projector.h"
 #include"gcr4complex.h"
 
-static void init_lgcr(const int _M, const int _V);
-static void free_lgcr();
-static _Complex double ** a = NULL; 
-static _Complex double * _a = NULL;
-static double * b = NULL;
-static _Complex double * c = NULL;
-static _Complex double ** chi = NULL;
-static _Complex double * _chi = NULL;
-static _Complex double ** xi = NULL;
-static _Complex double * _xi = NULL;
-static _Complex double * alpha = NULL;
-static _Complex double * tmp = NULL;
-static _Complex double * rho = NULL;
-static int lgcr_init = 0;
-
-int gcr4complex(_Complex double * const P, _Complex double * const Q, 
-		const int m, const int max_restarts,
-		const double eps_sq, const int rel_prec,
-		const int N, const int parallel, 
-		const int lda, c_matrix_mult f) {
-  
-  int k, l, restart, i, p=0;
-  double norm_sq, err;
-  _Complex double ctmp;
-
-  init_lgcr(m, lda);
-
-  norm_sq = lsquare_norm(Q, N, parallel);
-  if(norm_sq < 1.e-20) {
-    norm_sq = 1.;
-  }
-  for(restart = 0; restart < max_restarts; restart++) {
-    f(tmp, P);
-    ldiff(rho, Q, tmp, N);
-    err = lsquare_norm(rho, N, parallel);
-    if(g_proc_id == g_stdio_proc && g_debug_level > 1){/*CT: was "g_debug_level > 0" */
-      printf("lGCR: %d\t%g true residue %1.3e\n", restart * m, err, norm_sq); 
-      fflush(stdout);
-    }
-    if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq * norm_sq) && (rel_prec == 1))) {
-      if(g_proc_id == 0 && g_debug_level > 1) printf("lgcr: %d %e %e %e %e\n", p, err, norm_sq, err/norm_sq, eps_sq);
-      return (p);
-    }
-    for(k = 0; ; k++) {
-      memcpy(xi[k], rho, N*sizeof(_Complex double));
-      /* here we could put in a preconditioner */
-      f(tmp, xi[k]); 
-      /* tmp will become chi[k] */
-      for(l = 0; l < k; l++) {
-        a[l][k] = lscalar_prod(chi[l], tmp, N, parallel);
-        lassign_diff_mul(tmp, chi[l], a[l][k], N);
-      }
-      b[k] = sqrt(lsquare_norm(tmp, N, parallel));
-      lmul_r(chi[k], 1./b[k], tmp, N);
-      c[k] = lscalar_prod(chi[k], rho, N, parallel);
-      lassign_diff_mul(rho, chi[k], c[k], N);
-      err = lsquare_norm(rho, N, parallel);
-      if(g_proc_id == g_stdio_proc && g_debug_level > 1){
-        printf("lGCR: %d\t%g iterated residue\n", restart*m+k, err); 
-        fflush(stdout);
-      }
-      p++;
-      /* Precision reached? */
-      if((k == m-1) || ((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) {
-	break;
-      }
-    }
-    /* prepare for restart */
-    c[k] /= b[k];
-    lassign_add_mul(P, xi[k], c[k], N);
-    for(l = k-1; l >= 0; --l)
-    {
-      for(i = l+1; i <= k; ++i)
-      {
-        ctmp  = a[l][i] * c[i];
-        c[l] -= ctmp;
-      }
-      c[l] /= b[l];
-      lassign_add_mul(P, xi[l], c[l], N);
-    }
-  }
-  if(g_proc_id == 0 && g_debug_level > 1) printf("lgcr: for -1 %d %e %e %e %e\n", p, err, norm_sq, err/norm_sq, eps_sq);
-  return(-1);
-}
-
-static void init_lgcr(const int _M, const int _V){
-  static int Vo = -1;
-  static int M = -1;
-
-  int i;
-  if((M != _M)||(lgcr_init == 0)||(Vo != _V)){
-    if(lgcr_init == 1) free_lgcr();
-    Vo = _V;
-    M = _M;
-    a = calloc(M+1, sizeof(_Complex double *));
-    chi = calloc(M, sizeof(_Complex double *));
-    xi = calloc(M, sizeof(_Complex double *));
-    tmp = calloc(Vo, sizeof(_Complex double));
-    rho = calloc(Vo, sizeof(_Complex double));
-    _a = calloc((M+1)*M, sizeof(_Complex double));
-    a[0] = _a;
-    _chi = calloc(M*Vo, sizeof(_Complex double));
-    chi[0] = _chi;
-    _xi = calloc(M*Vo, sizeof(_Complex double));
-    xi[0] = _xi;
-
-    b = calloc(M, sizeof(double));
-    c = calloc(M, sizeof(_Complex double));
-    alpha = calloc(M+1, sizeof(_Complex double));
-    for(i = 1; i < M; i++) { 
-      chi[i] = chi[i-1] + Vo;
-      xi[i] = xi[i-1] + Vo;
-      a[i] = a[i-1] + M;
-    }
-    a[M] = a[M-1] + M;
-    lgcr_init = 1;
-  }
-}
-
-static void free_lgcr() 
-{
-  lgcr_init = 0;
-  free(a);
-  free(chi);
-  free(_a);
-  free(_chi);
-  free(alpha);
-  free(c);
-  free(_xi);
-  free(xi);
-  free(rho);
-  free(tmp);
-  return;
-}
-
-
-void ldiff(_Complex double * const Q, _Complex double * const R, _Complex double * const S, const int N) 
-{
-  for(int i = 0; i < N; ++i)
-    Q[i] = R[i] - S[i];
-  return;
-}
 
-void ldiff_assign(_Complex double * const Q, _Complex double * const S, const int N) 
-{
-  for(int i = 0; i < N; ++i)
-    Q[i] -= S[i];
-  return;
-}
-
-void ladd(_Complex double * const Q, _Complex double * const R, _Complex double * const S, const int N) 
-{
-  for(int i = 0; i < N; ++i)
-    Q[i] = R[i] + S[i];
-  return;
-}
-
-void ladd_assign(_Complex double * const Q, _Complex double * const S, const int N) 
-{
-  for(int i = 0; i < N; ++i)
-    Q[i] += S[i];
-  return;
-}
-
-double lsquare_norm(_Complex double * const Q, const int N, const int parallel) 
-{
-  double nrm = 0.0;
-
-  for(int i = 0; i < N; ++i)
-    
-    nrm += conj(Q[i]) * Q[i];
-#ifdef MPI
-  if(parallel)
-  {
-    double nrm2 = nrm;
-    MPI_Allreduce(&nrm2, &nrm, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  }
-#endif
-
-  return(nrm);
-}
-
-_Complex double lscalar_prod(_Complex double * const R, _Complex double * const S, const int N, const int parallel) 
-{
-  _Complex double res = 0.0;
-
-  for(int i = 0; i < N; ++i)
-    res += conj(R[i]) * S[i];
-  
-#ifdef MPI
-  if(parallel)
-  {
-    _Complex double res2 = res;
-    MPI_Allreduce(&res2, &res, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
-  }
-#endif
+#define _PSWITCH(s) s   /* double-precision pass: identifiers are used as written */
+#define _PTSWITCH(s) s 
+#define _C_TYPE _Complex double
+#define _F_TYPE double
 
-  return(res);
-}
+#include"gcr4complex_body.c"
 
-void lmul_r(_Complex double * const R, const double c, _Complex double * const S, const int N) 
-{
-  for(int i = 0; i < N; ++i)
-    R[i] = c * S[i];
-}
+#undef _PSWITCH
+#undef _PTSWITCH
+#undef _C_TYPE
+#undef _F_TYPE
 
-void lmul(_Complex double * const R, const _Complex double c, _Complex double * const S, const int N) 
-{
-  for(int i = 0; i < N; ++i)
-    R[i] = c * S[i];
-}
+#define _PSWITCH(s) s ## _32   /* single-precision pass: identifiers get the suffix _32 */
+#define _PTSWITCH(s) s ## 32
+#define _C_TYPE _Complex float
+#define _F_TYPE float
 
-void lassign_add_mul(_Complex double * const R, _Complex double * const S, const _Complex double c, const int N)
-{
-  for(int i = 0; i < N; ++i)
-    R[i] += c * S[i];
-}
+#include"gcr4complex_body.c"
 
-void lassign_diff_mul(_Complex double * const R, _Complex double * const S, const _Complex double c, const int N) 
-{
-  for(int i = 0; i < N; i++)
-    R[i] -= c * S[i];
-}
+#undef _PSWITCH
+#undef _PTSWITCH
+#undef _C_TYPE
+#undef _F_TYPE
diff --git a/solver/gcr4complex.h b/solver/gcr4complex.h
index 4fa1823b7..28623e545 100644
--- a/solver/gcr4complex.h
+++ b/solver/gcr4complex.h
@@ -23,25 +23,29 @@
 #include"solver/matrix_mult_typedef.h"
 #include"su3.h"
 
-void ldiff(_Complex double * Q, _Complex double * const R, _Complex double * const S, const int N);
-void ladd(_Complex double * Q, _Complex double * const R, _Complex double * const S, const int N);
-double lsquare_norm(_Complex double * const Q, const int N, const int parallel);
-_Complex double lscalar_prod(_Complex double * const R, _Complex double * const S, const int N, const int parallel);
-void lmul_r(_Complex double * const R, const double c, _Complex double * const S, const int N);
-void lmul(_Complex double * const R, const _Complex double c, _Complex double * const S, const int N);
-void lassign_diff_mul(_Complex double * const R, _Complex double * const S, const _Complex double c, const int N);
-void lassign_add_mul(_Complex double * const R, _Complex double * const S, const _Complex double c, const int N);
-void ldiff_assign(_Complex double * const Q, _Complex double * const S, 
-		  const int N);
-void ladd_assign(_Complex double * const Q, _Complex double * const S, 
-		  const int N);
-
-
-int gcr4complex(_Complex double * const P, _Complex double * const Q, 
-		const int m, const int max_restarts,
-		const double eps_sq, const int rel_prec,
-		const int N, const int parallel,
-		const int lda, c_matrix_mult f);
 
+#define _PSWITCH(s) s   /* double precision: identifiers are used as written */
+#define _PTSWITCH(s) s 
+#define _C_TYPE _Complex double
+#define _F_TYPE double
+/* first inclusion of the declaration template */
+#include"gcr4complex_body.h"
+
+#undef _PSWITCH
+#undef _PTSWITCH
+#undef _C_TYPE
+#undef _F_TYPE
+/* single precision: _PSWITCH appends _32, _PTSWITCH appends 32 */
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
+#define _C_TYPE _Complex float
+#define _F_TYPE float
+/* second inclusion of the same template with the macros redefined */
+#include"gcr4complex_body.h"
+
+#undef _PSWITCH
+#undef _PTSWITCH
+#undef _C_TYPE
+#undef _F_TYPE
 
 #endif
diff --git a/solver/gcr4complex_body.c b/solver/gcr4complex_body.c
new file mode 100644
index 000000000..44b68148b
--- /dev/null
+++ b/solver/gcr4complex_body.c
@@ -0,0 +1,374 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2016 Carsten Urbach
+ *               2010 claude Tadonki
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+/* The OpenMP code paths in this file are enabled through gcr4complexOMP, which follows TM_USE_OMP. */
+#ifdef TM_USE_OMP
+# define gcr4complexOMP
+#endif
+/* Work space of the GCR solver: allocated by init_lgcr for M directions of Vo elements, released by free_lgcr. */
+static void _PSWITCH(init_lgcr)(const int _M, const int _V);
+static void _PSWITCH(free_lgcr)();
+static _C_TYPE ** _PSWITCH(a)     = NULL;  /* M+1 row pointers into _a, M entries per row */
+static _C_TYPE *  _PSWITCH(_a)    = NULL;  /* storage behind a: (M+1)*M entries */
+static _F_TYPE *  _PSWITCH(b)     = NULL;  /* M entries: norms used to normalise chi[k] */
+static _C_TYPE *  _PSWITCH(c)     = NULL;  /* M entries: coefficients of the solution update */
+static _C_TYPE ** _PSWITCH(chi)   = NULL;  /* M row pointers into _chi */
+static _C_TYPE *  _PSWITCH(_chi)  = NULL;  /* storage behind chi: M*Vo entries */
+static _C_TYPE ** _PSWITCH(xi)    = NULL;  /* M row pointers into _xi */
+static _C_TYPE *  _PSWITCH(_xi)   = NULL;  /* storage behind xi: M*Vo entries */
+static _C_TYPE *  _PSWITCH(tmp)   = NULL;  /* scratch vector of Vo entries */
+static _C_TYPE *  _PSWITCH(rho)   = NULL;  /* residual vector of Vo entries */
+static int _PSWITCH(lgcr_init)   = 0;      /* 1 while the buffers above are allocated */
+/* Restarted GCR(m) for f(P) = Q. P: guess in / solution out, Q: source, N: elements per vector, lda: work-vector
+ * length. Returns the inner-iteration count once the true residual meets the tolerance, else max_restarts*m. */
+int _PSWITCH(gcr4complex)(_C_TYPE * const P, _C_TYPE * const Q, 
+                          const int m, const int max_restarts,
+                          const double eps_sq, const int rel_prec,
+                          const int N, const int parallel, 
+                          const int lda, const int precon, _PSWITCH(c_matrix_mult) f) {
+  /* p counts the inner iterations over all restarts */
+  int k, l, restart, i, p=0;
+  double norm_sq, err;
+  _C_TYPE ctmp;
+  /* work space for m directions of length lda (file-scope statics) */
+  _PSWITCH(init_lgcr)(m, lda);
+  /* reference norm for rel_prec == 1; a source with norm below 1.e-20 falls back to 1 */
+  norm_sq = _PSWITCH(lsquare_norm)(Q, N, parallel);
+  if(norm_sq < 1.e-20) {
+    norm_sq = 1.;
+  }
+
+  for(restart = 0; restart < max_restarts; restart++) {
+    f(_PSWITCH(tmp), P);  /* true residual: rho = Q - f(P) */
+    _PSWITCH(ldiff)(_PSWITCH(rho), Q, _PSWITCH(tmp), N);
+    err = _PSWITCH(lsquare_norm)(_PSWITCH(rho), N, parallel);
+    if(g_proc_id == g_stdio_proc && g_debug_level > 2){
+      printf("lGCR: %d\t%g true residue\n", p, err); 
+      fflush(stdout);
+    }
+    if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq * norm_sq) && (rel_prec == 1))) {
+      if(g_proc_id == g_stdio_proc && g_debug_level > 2) printf("lGCR: %d %e %e %e %e\n", p, err, norm_sq, err/norm_sq, eps_sq);
+      return (p);
+    }
+    for(k = 0; k < m ; k++) {
+      if(precon == 0) {  /* search direction xi[k]: the residual itself, or its preconditioned image */
+        memcpy(_PSWITCH(xi)[k], _PSWITCH(rho), N*sizeof(_C_TYPE));
+      }
+      else {
+        _PSWITCH(little_mg_precon)(_PSWITCH(xi)[k], _PSWITCH(rho));
+      }
+      f(_PSWITCH(tmp), _PSWITCH(xi)[k]); 
+      /* tmp will become chi[k]: orthogonalise against chi[0..k-1], then normalise by b[k] */
+      for(l = 0; l < k; l++) {
+        _PSWITCH(a)[l][k] = _PSWITCH(lscalar_prod)(_PSWITCH(chi)[l], _PSWITCH(tmp), N, parallel);
+        _PSWITCH(lassign_diff_mul)(_PSWITCH(tmp), _PSWITCH(chi)[l], _PSWITCH(a)[l][k], N);
+      }
+      _PSWITCH(b)[k] = sqrt(_PSWITCH(lsquare_norm)(_PSWITCH(tmp), N, parallel));
+      _PSWITCH(lmul_r)(_PSWITCH(chi)[k], 1./_PSWITCH(b)[k], _PSWITCH(tmp), N);
+      _PSWITCH(c)[k] = _PSWITCH(lscalar_prod)(_PSWITCH(chi)[k], _PSWITCH(rho), N, parallel);
+      _PSWITCH(lassign_diff_mul)(_PSWITCH(rho), _PSWITCH(chi)[k], _PSWITCH(c)[k], N);
+      err = _PSWITCH(lsquare_norm)(_PSWITCH(rho), N, parallel);
+      if(g_proc_id == g_stdio_proc && g_debug_level > 2){
+        printf("lGCR: %d\t%g iterated residue\n", p, err); 
+        fflush(stdout);
+      }
+      p++;
+      /* Precision reached? The explicit break keeps k at the last used index (never m). */
+      if((k == m-1) || ((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) {
+        break;
+      }
+    }
+    /* prepare for restart: back-substitution for the coefficients c, P is updated along xi */
+    _PSWITCH(c)[k] /= _PSWITCH(b)[k];
+    _PSWITCH(lassign_add_mul)(P, _PSWITCH(xi)[k], _PSWITCH(c)[k], N);
+    for(l = k-1; l >= 0; --l) {
+      for(i = l+1; i <= k; ++i) {
+        ctmp  = _PSWITCH(a)[l][i] * _PSWITCH(c)[i];
+        _PSWITCH(c)[l] -= ctmp;
+      }
+      _PSWITCH(c)[l] /= _PSWITCH(b)[l];
+      _PSWITCH(lassign_add_mul)(P, _PSWITCH(xi)[l], _PSWITCH(c)[l], N);
+    }
+  }
+  return(max_restarts*m);
+}
+/* Make sure the GCR work space fits _M search directions of _V elements each. */
+static void _PSWITCH(init_lgcr)(const int _M, const int _V){
+  static int cur_len = -1;
+  static int cur_m = -1;
+
+  /* keep the current buffers when neither size has changed since the last call */
+  if((_PSWITCH(lgcr_init) != 0) && (cur_m == _M) && (cur_len == _V)){
+    return;
+  }
+  if(_PSWITCH(lgcr_init) == 1) _PSWITCH(free_lgcr)();
+  cur_len = _V;
+  cur_m = _M;
+  _PSWITCH(a) = calloc(cur_m+1, sizeof(_C_TYPE *));
+  _PSWITCH(chi) = calloc(cur_m, sizeof(_C_TYPE *));
+  _PSWITCH(xi) = calloc(cur_m, sizeof(_C_TYPE *));
+  _PSWITCH(tmp) = calloc(cur_len, sizeof(_C_TYPE));
+  _PSWITCH(rho) = calloc(cur_len, sizeof(_C_TYPE));
+  _PSWITCH(_a) = calloc((cur_m+1)*cur_m, sizeof(_C_TYPE));
+  _PSWITCH(a)[0] = _PSWITCH(_a);
+  _PSWITCH(_chi) = calloc(cur_m*cur_len, sizeof(_C_TYPE));
+  _PSWITCH(chi)[0] = _PSWITCH(_chi);
+  _PSWITCH(_xi) = calloc(cur_m*cur_len, sizeof(_C_TYPE));
+  _PSWITCH(xi)[0] = _PSWITCH(_xi);
+  _PSWITCH(b) = calloc(cur_m, sizeof(_F_TYPE));
+  _PSWITCH(c) = calloc(cur_m, sizeof(_C_TYPE));
+  for(int row = 0; row + 1 < cur_m; row++){
+    _PSWITCH(chi)[row+1] = _PSWITCH(chi)[row] + cur_len;
+    _PSWITCH(xi)[row+1] = _PSWITCH(xi)[row] + cur_len;
+    _PSWITCH(a)[row+1] = _PSWITCH(a)[row] + cur_m;
+  }
+  _PSWITCH(a)[cur_m] = _PSWITCH(a)[cur_m-1] + cur_m;
+  _PSWITCH(lgcr_init) = 1;
+}
+/* Release every buffer obtained in init_lgcr and mark the work space as unallocated. */
+static void _PSWITCH(free_lgcr)()
+{
+  /* contiguous storage blocks first, then the row-pointer tables and the small arrays */
+  free(_PSWITCH(_a));
+  free(_PSWITCH(_chi));
+  free(_PSWITCH(_xi));
+  free(_PSWITCH(a));
+  free(_PSWITCH(chi));
+  free(_PSWITCH(xi));
+  free(_PSWITCH(b));
+  free(_PSWITCH(c));
+  free(_PSWITCH(tmp));
+  free(_PSWITCH(rho));
+  _PSWITCH(lgcr_init) = 0;
+}
+/* Element-wise difference of two vectors: Q[i] = R[i] - S[i] for 0 <= i < N.
+ * The loop is shared among OpenMP threads when gcr4complexOMP is defined. */
+void _PSWITCH(ldiff)(_C_TYPE * const Q, _C_TYPE * const R, _C_TYPE * const S, const int N)
+{
+#ifdef gcr4complexOMP
+#pragma omp parallel for
+#endif
+  for(int idx = 0; idx < N; idx++) {
+    Q[idx] = R[idx] - S[idx];
+  }
+}
+/* Copy the first N elements of S into R with memcpy. */
+void _PSWITCH(lassign)(_C_TYPE * const R, _C_TYPE * const S, const int N)
+{
+  const size_t nbytes = N*sizeof(_C_TYPE);
+  memcpy(R, S, nbytes);
+}
+/* In-place subtraction: Q[i] = Q[i] - S[i] for 0 <= i < N. */
+void _PSWITCH(ldiff_assign)(_C_TYPE * const Q, _C_TYPE * const S, const int N)
+{
+#ifdef gcr4complexOMP
+#pragma omp parallel for
+#endif
+  for(int idx = 0; idx < N; idx++) {
+    Q[idx] = Q[idx] - S[idx];
+  }
+}
+/* Element-wise sum of two vectors: Q[i] = R[i] + S[i] for 0 <= i < N. */
+void _PSWITCH(ladd)(_C_TYPE * const Q, _C_TYPE * const R, _C_TYPE * const S, const int N)
+{
+#ifdef gcr4complexOMP
+#pragma omp parallel for
+#endif
+  for(int idx = 0; idx < N; idx++) {
+    Q[idx] = R[idx] + S[idx];
+  }
+}
+/* In-place addition: Q[i] = Q[i] + S[i] for 0 <= i < N. */
+void _PSWITCH(ladd_assign)(_C_TYPE * const Q, _C_TYPE * const S, const int N)
+{
+#ifdef gcr4complexOMP
+#pragma omp parallel for
+#endif
+  for(int idx = 0; idx < N; idx++) {
+    Q[idx] = Q[idx] + S[idx];
+  }
+}
+
+/* Squared 2-norm sum_i |Q[i]|^2 of a length-N vector, accumulated in double.
+ * With TM_USE_MPI and parallel != 0 the result is summed over MPI_COMM_WORLD. */
+_F_TYPE _PSWITCH(lsquare_norm)(_C_TYPE * const Q, const int N, const int parallel) {
+  double total = 0.;
+#ifdef gcr4complexOMP
+#  pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+#endif
+    double partial = 0.0;
+#ifdef gcr4complexOMP
+#  pragma omp for
+#endif
+    for(int k = 0; k < N; ++k) {
+      partial += creal(conj(Q[k]) * Q[k]);
+    }
+#ifdef gcr4complexOMP
+    /* each thread deposits its partial sum in its own slot */
+    g_omp_acc_re[tid] = partial;
+  } /* end of OpenMP parallel region */
+
+  /* serial reduction over the per-thread slots */
+  for(int t = 0; t < omp_num_threads; ++t) {
+    total += g_omp_acc_re[t];
+  }
+#else
+  total = partial;
+#endif
+
+#ifdef TM_USE_MPI
+  if(parallel) {
+    double local = total;
+    MPI_Allreduce(&local, &total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  }
+#endif
+  return((_F_TYPE)total);
+}
+
+/* Inner product sum_i conj(R[i]) * S[i] over N elements, accumulated in double complex.
+ * With TM_USE_MPI and parallel != 0 the result is summed over MPI_COMM_WORLD. */
+_C_TYPE _PSWITCH(lscalar_prod)(_C_TYPE * const R, _C_TYPE * const S, const int N, const int parallel) {
+  _Complex double total = 0.;
+#ifdef gcr4complexOMP
+#  pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+#endif
+    _Complex double partial = 0.0;
+#ifdef gcr4complexOMP
+#  pragma omp for
+#endif
+    for(int k = 0; k < N; ++k) {
+      partial += conj(R[k]) * S[k];
+    }
+#ifdef gcr4complexOMP
+    /* each thread deposits its partial sum in its own slot */
+    g_omp_acc_cp[tid] = partial;
+  } /* end of OpenMP parallel region */
+
+  /* serial reduction over the per-thread slots */
+  for(int t = 0; t < omp_num_threads; ++t) {
+    total += g_omp_acc_cp[t];
+  }
+#else
+  total = partial;
+#endif
+
+#ifdef TM_USE_MPI
+  if(parallel) {
+    _Complex double local = total;
+    MPI_Allreduce(&local, &total, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+  }
+#endif
+  return((_C_TYPE)total);
+}
+
+/* Real part of the inner product: sum_i Re(conj(R[i]) * S[i]), accumulated in double.
+ * With TM_USE_MPI and parallel != 0 the result is summed over MPI_COMM_WORLD. */
+_F_TYPE _PSWITCH(lscalar_prod_r)(_C_TYPE * const R, _C_TYPE * const S, const int N, const int parallel) {
+  double total = 0.;
+#ifdef gcr4complexOMP
+#  pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+#endif
+    double partial = 0.0;
+#ifdef gcr4complexOMP
+#  pragma omp for
+#endif
+    for(int k = 0; k < N; ++k) {
+      partial += creal(conj(R[k]) * S[k]);
+    }
+#ifdef gcr4complexOMP
+    g_omp_acc_re[tid] = partial;
+  } /* end of OpenMP parallel region */
+
+  /* serial reduction over the per-thread slots */
+  for(int t = 0; t < omp_num_threads; ++t) {
+    total += g_omp_acc_re[t];
+  }
+#else
+  total = partial;
+#endif
+
+#ifdef TM_USE_MPI
+  if(parallel) {
+    double local = total;
+    MPI_Allreduce(&local, &total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  }
+#endif
+  return((_F_TYPE)total);
+}
+
+
+/* Scale a vector by a scalar of type _F_TYPE: R[i] = c * S[i] for i in [0, N). */
+void _PSWITCH(lmul_r)(_C_TYPE * const R, const _F_TYPE c, _C_TYPE * const S, const int N) {
+#ifdef gcr4complexOMP
+#pragma omp parallel for
+#endif
+  for(int k = 0; k < N; ++k)
+    R[k] = c * S[k];
+}
+
+/* Scale a vector by a scalar of type _C_TYPE: R[i] = c * S[i] for i in [0, N). */
+void _PSWITCH(lmul)(_C_TYPE * const R, const _C_TYPE c, _C_TYPE * const S, const int N) {
+#ifdef gcr4complexOMP
+#pragma omp parallel for
+#endif
+  for(int k = 0; k < N; ++k)
+    R[k] = c * S[k];
+}
+
+/* In-place scaled accumulation with a _C_TYPE factor: R[i] += c * S[i] for i in [0, N). */
+void _PSWITCH(lassign_add_mul)(_C_TYPE * const R, _C_TYPE * const S, const _C_TYPE c, const int N) {
+#ifdef gcr4complexOMP
+#pragma omp parallel for
+#endif
+  for(int k = 0; k < N; ++k)
+    R[k] += c * S[k];
+}
+
+/* In-place scaled accumulation with an _F_TYPE factor: R[i] += c * S[i] for i in [0, N). */
+void _PSWITCH(lassign_add_mul_r)(_C_TYPE * const R, _C_TYPE * const S, const _F_TYPE c, const int N) {
+#ifdef gcr4complexOMP
+#pragma omp parallel for
+#endif
+  for(int k = 0; k < N; ++k)
+    R[k] += c * S[k];
+}
+
+/* In-place scale-then-add with an _F_TYPE factor: R[i] = c * R[i] + S[i] for i in [0, N). */
+void _PSWITCH(lassign_mul_add_r)(_C_TYPE * const R, const _F_TYPE c, _C_TYPE * const S, const int N) {
+#ifdef gcr4complexOMP
+#pragma omp parallel for
+#endif
+  for(int k = 0; k < N; ++k)
+    R[k] = c * R[k] + S[k];
+}
+
+/* In-place scaled subtraction with a _C_TYPE factor: R[i] -= c * S[i] for i in [0, N). */
+void _PSWITCH(lassign_diff_mul)(_C_TYPE * const R, _C_TYPE * const S, const _C_TYPE c, const int N) {
+#ifdef gcr4complexOMP
+#pragma omp parallel for
+#endif
+  for(int k = 0; k < N; ++k)
+    R[k] -= c * S[k];
+}
diff --git a/solver/gcr4complex_body.h b/solver/gcr4complex_body.h
new file mode 100644
index 000000000..dd73976fd
--- /dev/null
+++ b/solver/gcr4complex_body.h
@@ -0,0 +1,44 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+/* Vector-kernel prototypes, expanded per precision via _PSWITCH; each void kernel writes through its first pointer argument. */
+void _PSWITCH(ldiff)(_C_TYPE * Q, _C_TYPE * const R, _C_TYPE * const S, const int N);
+void _PSWITCH(lassign)(_C_TYPE * Q, _C_TYPE * const R, const int N);
+void _PSWITCH(ladd)(_C_TYPE * Q, _C_TYPE * const R, _C_TYPE * const S, const int N);
+_F_TYPE _PSWITCH(lsquare_norm)(_C_TYPE * const Q, const int N, const int parallel);
+_C_TYPE _PSWITCH(lscalar_prod)(_C_TYPE * const R, _C_TYPE * const S, const int N, const int parallel);
+_F_TYPE _PSWITCH(lscalar_prod_r)(_C_TYPE * const R, _C_TYPE * const S, const int N, const int parallel);
+void _PSWITCH(lmul_r)(_C_TYPE * const R, const _F_TYPE c, _C_TYPE * const S, const int N);
+void _PSWITCH(lmul)(_C_TYPE * const R, const _C_TYPE c, _C_TYPE * const S, const int N);
+void _PSWITCH(lassign_diff_mul)(_C_TYPE * const R, _C_TYPE * const S, const _C_TYPE c, const int N);
+void _PSWITCH(lassign_add_mul)(_C_TYPE * const R, _C_TYPE * const S, const _C_TYPE c, const int N);
+void _PSWITCH(lassign_add_mul_r)(_C_TYPE * const R, _C_TYPE * const S, const _F_TYPE c, const int N);
+void _PSWITCH(lassign_mul_add_r)(_C_TYPE * const R, const _F_TYPE c, _C_TYPE * const S, const int N);
+void _PSWITCH(ldiff_assign)(_C_TYPE * const Q, _C_TYPE * const S, 
+		  const int N);
+void _PSWITCH(ladd_assign)(_C_TYPE * const Q, _C_TYPE * const S, 
+		  const int N);
+
+/* NOTE(review): definition not visible from this header; f is a callback of the project's c_matrix_mult type. */
+int _PSWITCH(gcr4complex)(_C_TYPE * const P, _C_TYPE * const Q, 
+			  const int m, const int max_restarts,
+			  const double eps_sq, const int rel_prec,
+			  const int N, const int parallel,
+			  const int lda, const int precon, _PSWITCH(c_matrix_mult) f);
+
+
diff --git a/solver/generate_dfl_subspace.c b/solver/generate_dfl_subspace.c
index 305d8f4a3..643dabdb9 100644
--- a/solver/generate_dfl_subspace.c
+++ b/solver/generate_dfl_subspace.c
@@ -17,24 +17,28 @@
  * 
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
-
- This file was modified according to a flexible number of blocks
- by Claude Tadonki - PetaQCD - April 2010 ( claude.tadonki@lal.in2p3.fr )
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
 #include "global.h"
+#include "gettime.h"
 #include "su3.h"
 #include <complex.h>
+#include "read_input.h"
 #include "start.h"
+#include "gamma.h"
 #include "ranlxs.h"
+#include "init/init.h"
 #include "operator/D_psi.h"
+#include "operator/tm_operators.h"
+#include "operator/clover_leaf.h"
+#include "operator/clovertm_operators.h"
 #include "poly_precon.h"
 #include "Msap.h"
 #include "gmres_precon.h"
@@ -44,15 +48,19 @@
 #include "block.h"
 #include "little_D.h"
 #include "gcr4complex.h"
+#include "cgne4complex.h"
 #include "boundary.h"
 #include <io/params.h>
 #include <io/gauge.h>
 #include <io/spinor.h>
 #include <io/utils.h>
+#include "solver/solver.h"
 #include "solver_field.h"
+#include "dfl_projector.h"
 #include "generate_dfl_subspace.h"
 
 int init_little_dfl_subspace(const int N_s);
+void compute_little_little_D(const int N_s);
 
 spinor ** dfl_fields = NULL;
 static spinor * _dfl_fields = NULL;
@@ -60,342 +68,215 @@ _Complex double ** little_dfl_fields = NULL;
 static _Complex double *_little_dfl_fields = NULL;
 _Complex double ** little_dfl_fields_eo = NULL;
 static _Complex double *_little_dfl_fields_eo = NULL;
+_Complex float ** little_dfl_fields_eo_32 = NULL;
+static _Complex float *_little_dfl_fields_eo_32 = NULL;
+
 static int init_subspace = 0;
 static int init_little_subspace = 0;
 
 static void random_fields(const int Ns) {
+
+  for (int i = 0; i < Ns; i++) {
+    random_spinor_field_lexic(dfl_fields[i], 0, RN_PM1UNIF);
+  }
+  return;
+}
+
+
+// this routine updates the deflation subspace
+
+int update_dfl_subspace(const int Ns, const int N, const int Nsmooth) {
+  int vpr = VOLUMEPLUSRAND*sizeof(spinor)/sizeof(_Complex double),
+    vol = VOLUME*sizeof(spinor)/sizeof(_Complex double);
+  double musave = mu_dfl;
+  double kappasave = kappa_dfl;
+  double muMsapsave = mu_Msap;
+  mu_dfl = mu_dflgen;
+  kappa_dfl = kappa_dflgen;
+  mu_Msap = mu_dflgen;
   
-  int i, j, ix;
-  float r,s[24];
-  double *t;
-  
-  r=(float)(1.0/sqrt(24.0*(double)(VOLUME)));
-  
-  for (i = 0; i < Ns; i++) {
-    t=(double*)(dfl_fields[i]);
-    for (ix = 0; ix < VOLUME; ix++){
-      ranlxs(s,24);
-      for (j = 0; j < 24; j++) {
- 	(*t) = (double)(r*(s[j]-0.5f)); 
-	(*t) = 1.; 
- 	t += 1; 
-      }
+  double nrm; 
+  for(int j = 0; j < Nsmooth; j++) {
+    //if(j - loop_SAP > 4) {
+    //  usePL = usePLsave;
+    //}
+    // build little D automatically when little_D is called and dfl_subspace_updated == 1
+    for (int i = 0; i < Ns; i++) {
+      /* add it to the basis */
+      split_global_field_GEN_ID(block_list, i, dfl_fields[i], nb_blocks);
+    }
+    /* perform local orthonormalization */
+    for(int i = 0; i < nb_blocks; i++) {
+      block_orthonormalize(block_list+i);
+    }
+    dfl_subspace_updated = 1;
+    compute_little_little_D(Ns);
+    
+    // why can't I use g_spinor_field[1] in mg_precon??
+    for(int i = 0; i < Ns; i++) {
+      g_sloppy_precision = 1;
+
+      // v <- C v
+      mg_precon(g_spinor_field[0], dfl_fields[i]);
+      assign(dfl_fields[i], g_spinor_field[0], VOLUME);
+      
+      g_sloppy_precision = 0;
+      ModifiedGS((_Complex double*)g_spinor_field[0], vol, i, (_Complex double*)dfl_fields[0], vpr);
+      nrm = sqrt(square_norm(g_spinor_field[0], N, 1));
+      mul_r(dfl_fields[i], 1./nrm, g_spinor_field[0], N);
+      
     }
   }
-  return;
+  for(int i = 0; i < Ns; i++) {
+    /* add it to the basis */
+    split_global_field_GEN_ID(block_list, i, dfl_fields[i], nb_blocks);
+  }
+  /* perform local orthonormalization */
+  for(int i = 0; i < nb_blocks; i++) {
+    block_orthonormalize(block_list+i);
+  }
+  dfl_subspace_updated = 1;
+  mu_dfl = musave;
+  kappa_dfl = kappasave;
+  mu_Msap = muMsapsave;
+  return(0);
 }
 
+// this routine generates the deflation subspace
+
 int generate_dfl_subspace(const int Ns, const int N, const int repro) {
-  int ix, i_o,i, j, k, p, blk, vpr = VOLUMEPLUSRAND*sizeof(spinor)/sizeof(_Complex double),
+  int p=0, vpr = VOLUMEPLUSRAND*sizeof(spinor)/sizeof(_Complex double),
     vol = VOLUME*sizeof(spinor)/sizeof(_Complex double);
+  const int loop_SAP = 3;
   spinor **psi;
-  double nrm, e = 0.3, d = 1.1, atime, etime;
+  double nrm, atime, etime;
   _Complex double s;
   _Complex double * work;
-  WRITER *writer = NULL;  
-  FILE *fp_dfl_fields; 
-  char file_name[500]; // CT
-  double musave = g_mu;
+  double musave = mu_dfl;
+  double kappasave = kappa_dfl;
+  int usePLsave = usePL;
+  double muMsapsave = mu_Msap;
   spinor ** work_fields = NULL;
   const int nr_wf = 2;
+  g_mu2 = 0.;
+  kappa_dfl = kappa_dflgen;
+  mu_dfl = mu_dflgen;
+  mu_Msap = mu_dflgen;
+
+  if(g_c_sw > 0) {
+    if (g_cart_id == 0 && g_debug_level > 1) {
+      printf("#\n# csw = %e, computing clover leafs\n", g_c_sw);
+    }
+    init_sw_fields(VOLUME);
+    sw_term( (const su3**) g_gauge_field, g_kappa, g_c_sw);
+    // this must be EE = 0, this is needed for Msap_eo!?
+    sw_invert(0, g_mu);
+    /* now copy double sw and sw_inv fields to 32bit versions */
+    copy_32_sw_fields();
+  }
+
+  // currently set to 0 during subspace generation
+  usePL = 0;
+  atime = gettime();
 
-#ifdef MPI
-  atime = MPI_Wtime();
-#else
-  atime = (double)clock()/(double)(CLOCKS_PER_SEC);
-#endif
   init_solver_field(&work_fields, VOLUMEPLUSRAND, nr_wf);
   work = (_Complex double*)malloc(nb_blocks*9*Ns*sizeof(_Complex double));
   psi = (spinor **)calloc(nb_blocks, sizeof(spinor *));
   psi[0] = calloc(VOLUME + nb_blocks, sizeof(spinor));
-  for(i = 1; i < nb_blocks; i++) psi[i] = psi[i-1] + (VOLUME / nb_blocks) + 1;
-  
-  if(init_subspace == 0) i = init_dfl_subspace(Ns);
-  
-  if(init_little_subspace == 0) i = init_little_dfl_subspace(Ns);
-  
-  random_fields(Ns);
-  if(g_debug_level > 4) {
-    for(e = 0.; e < 1.; e=e+0.05) {
-      random_spinor_field_lexic(dfl_fields[0], repro, RN_GAUSS);
-      nrm = sqrt(square_norm(dfl_fields[0], N, 1));
-      mul_r(dfl_fields[0], 1./nrm, dfl_fields[0], N);
-      d = 1.1;
-      /*       gmres_precon(work_fields[0], dfl_fields[0], 20, 1, 1.e-20, 0, N, &D_psi); */
-      poly_nonherm_precon(work_fields[0], dfl_fields[0], e, d, 30, N);
-      D_psi(work_fields[1], work_fields[0]);
-      diff(work_fields[0], work_fields[1], dfl_fields[0], N);
-      nrm = square_norm(work_fields[0], N, 1);
-      if(g_proc_id == 0) {
-	printf(" e= %f d= %f nrm = %1.5e\n", e, d, nrm);
-      }
-    }
-    d = 1.1;
-    e=0.3;
-  }
-  
-  boundary(g_kappa);
-  g_mu = 0.;
-  /*
-    CT: We try to read dfl_fields[i] from file if it exists, 
-    otherwise we recalculate it                               
-  */
-  /* CU: reading and writing should be done with lemon! */
-  for(p = 0; p < Ns; p++) {
-    sprintf(file_name,"dfl_fields.%.2d", p);
-    if((fp_dfl_fields = fopen(file_name,  "r")) == NULL) {
-      break;
-    }
-    else {
-      fclose(fp_dfl_fields);
-      if((i = read_spinor(dfl_fields[p], NULL, file_name, 0)) != 0) {
-	if(g_proc_id == 0) {
-	  fprintf(stderr, "Could not read from file %s err = %d\n", file_name, i);
-	}
-	break;
-      }
-    }
-  }
-  
-  if((g_proc_id == 0) && (p < Ns) && (g_debug_level > 0))  printf("Compute remaining fields from scratch\n");
-  /*CT: We do Ns x 80 x 20 evaluation of Dpsi */
-  /*      ModifiedGS((_Complex double*)dfl_fields[i], vol, i, (_Complex double*)dfl_fields[0], vpr); */
-  /*      nrm = sqrt(square_norm(dfl_fields[i], N, 1)); */
-  /*      mul_r(dfl_fields[i], 1./nrm, dfl_fields[i], N); */
-  if(p < Ns) {
-    if(1) {
-      for(i = 0; i < Ns; i++) {
-	/*    ModifiedGS((_Complex double*)dfl_fields[i], vol, i, (_Complex double*)dfl_fields[0], vpr);
-	      nrm = sqrt(square_norm(dfl_fields[i], N, 1));
-	      mul_r(dfl_fields[i], 1./nrm, dfl_fields[i], N);
-	*/
-	for(j = 0; j < 20; j++) {
-	  zero_spinor_field(g_spinor_field[0],VOLUME);  
-	  g_sloppy_precision = 1;
-	  Msap_eo(g_spinor_field[0], dfl_fields[i], j+1); 
-	  /*      poly_nonherm_precon(g_spinor_field[0], dfl_fields[i], e, d, 2, N);*/
-	  /*       gmres_precon(work_fields[0], dfl_fields[i], 20, 1, 1.e-20, 0, N, &D_psi); */
-	  
-	  for (ix=0;ix<VOLUME;ix++) {
-	    _spinor_assign((*(dfl_fields[i] + ix)),(*(g_spinor_field[0]+ix)));
-	  }
-	  
-	  g_sloppy_precision = 0;
-	  /*       for (i=0;i<Ns; i++) { */
-	  ModifiedGS((_Complex double*)g_spinor_field[0], vol, i, (_Complex double*)dfl_fields[0], vpr);
-	  nrm = sqrt(square_norm(g_spinor_field[0], N, 1));
-	  mul_r(dfl_fields[i], 1./nrm, g_spinor_field[0], N);
-	}
-      }
-  
-      for (i=0; i<Ns; i++) {
-	/* test quality */
-	if(g_debug_level > -1) {
-	  D_psi(work_fields[0], dfl_fields[i]);
-	  nrm = sqrt(square_norm(work_fields[0], N, 1));
-	  if(g_proc_id == 0) {
-	    printf(" ||D psi_%d||/||psi_%d|| = %1.5e\n", i, i, nrm*nrm);
-	  }
-	}
-      }
-    }
+  for(int i = 1; i < nb_blocks; i++) psi[i] = psi[i-1] + (VOLUME / nb_blocks) + 1;
 
-    if(0) {
-      for(j = 0; j < 4; j++) {/*dfl_field_iter = 80  by default */
-	for(i = p; i < Ns; i++) {
-	  ModifiedGS((_Complex double*)dfl_fields[i], vol, i, (_Complex double*)dfl_fields[0], vpr);
-	  nrm = sqrt(square_norm(dfl_fields[i], N, 1));
-	  mul_r(dfl_fields[i], 1./nrm, dfl_fields[i], N);
-	  for(k = 0; k < 3; k++) {
-	    g_sloppy_precision = 1;
-	    /* dfl_poly_iter = 20 by default */
-	    zero_spinor_field(g_spinor_field[0],VOLUME);
-	    Msap_eo(g_spinor_field[0], dfl_fields[i], 4);
-	    /* poly_nonherm_precon(g_spinor_field[0], dfl_fields[i], e, d, 4, N);  */
-	    g_sloppy_precision = 0;
-	    ModifiedGS((_Complex double*)g_spinor_field[0], vol, i, (_Complex double*)dfl_fields[0], vpr);
-	    nrm = sqrt(square_norm(g_spinor_field[0], N, 1));
-	    mul_r(dfl_fields[i], 1./nrm, g_spinor_field[0], N);
-	  }
-	  
-	  /* test quality */
-	  if(g_debug_level > -1) {
-	    D_psi(work_fields[0], dfl_fields[i]);
-	    nrm = sqrt(square_norm(work_fields[0], N, 1));
-	    if(g_proc_id == 0) {
-	      printf(" ||D psi_%d||/||psi_%d|| = %1.5e\n", i, i, nrm);
-	    }
-	  }
-	}
-      }
-    }
-    for(i = 0; i < Ns; i++) {
-      /*
-	CT: We save dfl_fields[i] in a binary file, 
-	using a generic nomenclature proc_i__dfl_fields for later reads                               
-      */
-      sprintf(file_name,"dfl_fields.%.2d", i);
-      construct_writer(&writer, file_name, 0);
-      write_propagator_type(writer, 4);
-      write_spinor(writer, &dfl_fields[i], NULL, 1, 64);
-      destruct_writer(writer);
-    }
+  if((g_proc_id == 0) && (g_debug_level > 0)) {
+    printf("# Initialising subspaces\n");
+    fflush(stdout);
   }
-  g_mu = musave;
-  g_sloppy_precision = 0;
-  boundary(g_kappa);
-  if(g_debug_level > 2) {
-    for(i = 0; i < Ns; i++) {
-      for(j = 0; j < Ns; j++) {
-	s = scalar_prod(dfl_fields[i], dfl_fields[j], N, 1);
-	if(g_proc_id == 0) {
-	  printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s));
-	}
-      }
-    }
+
+  if(init_subspace == 0) p = init_dfl_subspace(Ns);
+
+  if(init_little_subspace == 0) p = init_little_dfl_subspace(Ns);
+
+  if((g_proc_id == 0) && (g_debug_level > 0)) {
+    printf("# Generating random fields...");
   }
-  for (i = 0; i < Ns; i++) {
-    /* add it to the basis */
-    /* split_global_field(block_list[0].basis[i], block_list[1].basis[i], dfl_fields[i]); */
-    split_global_field_GEN_ID(block_list, i, dfl_fields[i], nb_blocks);
+  double ta = gettime();
+  random_fields(Ns);
+  double tb = gettime();
+  if((g_proc_id == 0) && (g_debug_level > 0)) {
+    printf(" done in %e seconds\n", tb-ta);
+    fflush(stdout);
   }
-  
-  /* perform local orthonormalization */
-  for(i = 0; i < nb_blocks; i++) block_orthonormalize(block_list+i);
-  /* block_orthonormalize(block_list+1); */
-  
-  dfl_subspace_updated = 1;
-  
-  for(j = 0; j < Ns; j++) {
-    for(i = 0; i < nb_blocks*9*Ns; i++) {
-      (little_dfl_fields[j][i]) = 0.0;
-      (work[i]) = 0.0;
-    }
+
+  if((g_proc_id == 0) && (p < Ns) && (g_debug_level > 0)) {
+    printf("# Compute approximate eigenvectors from scratch\n");
+    printf("# Using kappa= %e and mu = %e for the subspace generation\n", g_kappa, g_mu/g_kappa/2.);
   }
-  
-  /* compute the little little basis */
-  /* r = work_fields[0]; */
-  /* q = g_spinor_field[DUM__SOLVER+1]; */
-  
-  for(i = 0; i < Ns; i++) {
-    /* split_global_field(r, q,  dfl_fields[i]); */
-    split_global_field_GEN(psi, dfl_fields[i], nb_blocks);
-    /* now take the local scalar products */
-    for(j = 0; j < Ns; j++) {
-      //p = r;
-      for(blk = 0; blk < nb_blocks; blk++) {
-	//if(blk == 0) p = r; else p = q;
-	little_dfl_fields[i][j + blk*Ns] = scalar_prod(block_list[blk].basis[j], psi[blk], block_list[0].volume, 0);
-      }
+
+  for(int j = 0; j < loop_SAP; j++) {
+    for(int i = 0; i < Ns; i++) {
+      zero_spinor_field(g_spinor_field[0], VOLUME);  
+      g_sloppy_precision = 1;
+      Msap_eo(g_spinor_field[0], dfl_fields[i], NcycleMsap_dflgen, NiterMsap_dflgen); 
+      
+      assign(dfl_fields[i], g_spinor_field[0], VOLUME);
+      
+      g_sloppy_precision = 0;
+      ModifiedGS((_Complex double*)g_spinor_field[0], vol, i, (_Complex double*)dfl_fields[0], vpr);
+      nrm = sqrt(square_norm(g_spinor_field[0], N, 1));
+      mul_r(dfl_fields[i], 1./nrm, g_spinor_field[0], N);
     }
   }
+
+  update_dfl_subspace(Ns, N, NsmoothMsap_dflgen-loop_SAP);
   
-  /* orthonormalise */
-  for(i = 0; i < Ns; i++) {
-    for (j = 0; j < i; j++) {
-      s = lscalar_prod(little_dfl_fields[j], little_dfl_fields[i], nb_blocks*Ns, 1);
-      lassign_diff_mul(little_dfl_fields[i], little_dfl_fields[j], s, nb_blocks*Ns);
-    }
-    s = lsquare_norm(little_dfl_fields[i], nb_blocks*Ns, 1);
-    lmul_r(little_dfl_fields[i], 1./sqrt(creal(s)), little_dfl_fields[i], nb_blocks*Ns);
+  compute_little_D(0);
+  compute_little_little_D(Ns);
+  dfl_subspace_updated = 0;
+  mu_dfl = musave;
+  kappa_dfl = kappasave;
+  mu_Msap = muMsapsave;
+  usePL = usePLsave;
+  if(g_debug_level > 0 && g_proc_id == 0) {
+    printf("# Switched to target parameters kappa= %e, mu=%e\n", g_kappa, g_mu/2/g_kappa);
   }
-  if(g_debug_level > 0) {
-    for(i = 0; i < Ns; i++) {
-      for(j = 0; j < Ns; j++) {
-	s = lscalar_prod(little_dfl_fields[i], little_dfl_fields[j], nb_blocks*Ns, 1);
-	if(g_proc_id == 0) {
-	  printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s));
-	}
+  if(g_debug_level > 1) {
+    for(int i = 0; i < Ns; i++) {
+      /* test quality */
+      D_psi(work_fields[0], dfl_fields[i]);
+      nrm = sqrt(square_norm(work_fields[0], N, 1));
+      if(g_proc_id == 0) {
+	printf(" ||D psi_%d||/||psi_%d|| = %1.15e\n", i, i, nrm);
       }
     }
   }
   
-  for(i = 0; i < Ns; i++) {
-    little_D(work, little_dfl_fields[i]);
-    for(j = 0; j < Ns; j++) {
-      little_A[i * Ns + j]  = lscalar_prod(little_dfl_fields[j], work, nb_blocks*Ns, 1);
-      if(g_proc_id == 0 && g_debug_level > 4) {
-	printf("%1.3e %1.3ei, ", creal(little_A[i * Ns + j]), cimag(little_A[i * Ns + j]));
-      }
-    }
-    if(g_proc_id == 0 && g_debug_level > 4) printf("\n");
+  if((g_proc_id == 0) && (g_debug_level > 0)) {
+    printf("# Approximate eigenvectors generated\n");
   }
-  if(g_proc_id == 0 && g_debug_level > 4) printf("\n");
-  /* the precision in the inversion is not yet satisfactory! */
-  LUInvert(Ns, little_A, Ns);
-  /* inverse of little little D now in little_A */
-
+  
+  g_mu1 = 0.;
+  g_mu2 = 0.;
 
-  for(j = 0; j < Ns; j++) {
-    for(i = 0; i < nb_blocks*9*Ns; i++) {
-      (little_dfl_fields_eo[j][i]) = 0.0;
-      (work[i]) = 0.0;
-    }
-  }
+  g_sloppy_precision = 0;
 
-  /* compute the eo little little basis */
-  /* r = work_fields[0]; */
-  /* q = g_spinor_field[DUM__SOLVER+1]; */
-      
-  for(i = 0; i < Ns; i++) {
-    /* split_global_field(r, q,  dfl_fields[i]); */
-    split_global_field_GEN(psi, dfl_fields[i], nb_blocks);
-    /* now take the local scalar products */
-    for(j = 0; j < Ns; j++) {
-      i_o=0;
-      for(blk = 0; blk < nb_blocks; blk++) {
-         if (block_list[blk].evenodd==1) {
-	 little_dfl_fields_eo[i][j + (nb_blocks/2+i_o)*Ns] = scalar_prod(block_list[blk].basis[j], psi[blk], block_list[0].volume, 0);
-         i_o++;
-	 }	
-      }
-    }
-  }  
-     
-  /* orthonormalise */
-  for(i = 0; i < Ns; i++) {
-    for (j = 0; j < i; j++) {
-      s = lscalar_prod(little_dfl_fields_eo[j], little_dfl_fields_eo[i], nb_blocks*Ns, 1);
-      lassign_diff_mul(little_dfl_fields_eo[i], little_dfl_fields_eo[j], s, nb_blocks*Ns);
-    }
-    s = lsquare_norm(little_dfl_fields_eo[i], nb_blocks*Ns, 1);
-    lmul_r(little_dfl_fields_eo[i], 1./sqrt(creal(s)), little_dfl_fields_eo[i], nb_blocks*Ns);
-  }
-  if(g_debug_level > 0) {
-    for(i = 0; i < Ns; i++) {
-      for(j = 0; j < Ns; j++) {
-        s = lscalar_prod(little_dfl_fields_eo[i], little_dfl_fields_eo[j], nb_blocks*Ns, 1);
+  if(g_debug_level > 4) {
+    printf("Checking orthonormality of dfl_fields\n");
+    for(int i = 0; i < Ns; i++) {
+      for(int j = 0; j < Ns; j++) {
+        s = scalar_prod(dfl_fields[i], dfl_fields[j], N, 1);
         if(g_proc_id == 0) {
           printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s));
         }
       }
     }
   }
-  
-  for(i = 0; i < Ns; i++) {  
-    little_D_sym(work, little_dfl_fields_eo[i]);
-    for(j = 0; j < Ns; j++) {
-      little_A_eo[i * Ns + j]  = lscalar_prod(little_dfl_fields_eo[j], work, nb_blocks*Ns, 1);
-      if(g_proc_id == 0 && g_debug_level > 4) {
-        printf("%1.3e %1.3ei, ", creal(little_A_eo[i * Ns + j]), cimag(little_A_eo[i * Ns + j])); 
-      }
-    }
-    if(g_proc_id == 0 && g_debug_level > 4) printf("\n");
-  }
-  if(g_proc_id == 0 && g_debug_level > 4) printf("\n");
-  /* the precision in the inversion is not yet satisfactory! */
-  LUInvert(Ns, little_A_eo, Ns);
-  /* inverse of eo little little D now in little_A_eo */
 
+  dfl_subspace_updated = 1;
 
+  compute_little_little_D(Ns);
   
-#ifdef MPI
-  etime = MPI_Wtime();
-#else
-  etime = (double)clock()/(double)(CLOCKS_PER_SEC);
-#endif
+  etime = gettime();
+
   if(g_proc_id == 0) {
-    printf("time for subspace generation %1.3e s\n", etime-atime);
+    printf("# time for subspace generation %1.3e s\n", etime-atime);
     fflush(stdout);
   }
 
@@ -404,6 +285,14 @@ int generate_dfl_subspace(const int Ns, const int N, const int repro) {
   free(work);
   free(psi[0]);
   free(psi);
+
+  /* Cross-checks */
+  if (g_debug_level > 3) {
+    check_projectors(reproduce_randomnumber_flag);
+    check_local_D(reproduce_randomnumber_flag);
+    check_little_D_inversion(reproduce_randomnumber_flag);
+  }
+
   return(0);
 }
 
@@ -429,7 +318,7 @@ int generate_dfl_subspace_free(const int Ns, const int N) {
       D_psi(work_fields[0], dfl_fields[i]);
       nrm = sqrt(square_norm(work_fields[0], N, 1));
       if(g_proc_id == 0) {
-	printf(" ||D psi_%d||/||psi_%d|| = %1.5e\n", i, i, nrm); 
+        printf(" ||D psi_%d||/||psi_%d|| = %1.5e\n", i, i, nrm); 
       }
     }
   }
@@ -437,10 +326,10 @@ int generate_dfl_subspace_free(const int Ns, const int N) {
   if(g_debug_level > 4) {
     for(i = 0; i < 12; i++) {
       for(j = 0; j < 12; j++) {
-	s = scalar_prod(dfl_fields[i], dfl_fields[j], N, 1);
-	if(g_proc_id == 0) {
-	  printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s));
-	}
+        s = scalar_prod(dfl_fields[i], dfl_fields[j], N, 1);
+        if(g_proc_id == 0) {
+          printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s));
+        }
       }
     }
   }
@@ -448,31 +337,187 @@ int generate_dfl_subspace_free(const int Ns, const int N) {
   return(0);
 }
 
+/* Build the projected ("little little") basis from dfl_fields and store the LU-inverted Ns x Ns
+ * projection of little_D (little_D_sym when little_evenodd is set) in little_A (little_A_eo,
+ * with a 32-bit copy in little_A_eo_32).  The routine does nothing unless usePL is non-zero. */
+void compute_little_little_D(const int Ns) {
+  int i_o;
+  _Complex double s;
+  if(usePL) {
+    double atime = gettime();
+    _Complex double * work = (_Complex double*)malloc(nb_blocks*9*Ns*sizeof(_Complex double));
+    spinor ** psi = (spinor **)calloc(nb_blocks, sizeof(spinor *));
+    psi[0] = calloc(VOLUME + nb_blocks, sizeof(spinor));
+    for(int i = 1; i < nb_blocks; i++) psi[i] = psi[i-1] + (VOLUME / nb_blocks) + 1;
+    // NOTE(review): work, psi and psi[0] are used without a NULL check
+    if(!little_evenodd) {
+      // compute the little little basis
+      for(int j = 0; j < Ns; j++) {
+        for(int i = 0; i < nb_blocks*9*Ns; i++) {
+          (little_dfl_fields[j][i]) = 0.0;
+          (work[i]) = 0.0;
+        }
+      }
+      // coefficients: scalar products of each block basis vector with the per-block pieces in psi
+      for(int i = 0; i < Ns; i++) {
+        split_global_field_GEN(psi, dfl_fields[i], nb_blocks);
+        // now take the local scalar products
+        for(int j = 0; j < Ns; j++) {
+          for(int blk = 0; blk < nb_blocks; blk++) {
+            little_dfl_fields[i][j + blk*Ns] = scalar_prod(block_list[blk].basis[j], psi[blk], block_list[0].volume, 0);
+          }
+        }
+      }
+
+      // orthonormalise (Gram-Schmidt against the previously processed vectors)
+      for(int i = 0; i < Ns; i++) {
+        for(int j = 0; j < i; j++) {
+          s = lscalar_prod(little_dfl_fields[j], little_dfl_fields[i], nb_blocks*Ns, 1);
+          lassign_diff_mul(little_dfl_fields[i], little_dfl_fields[j], s, nb_blocks*Ns);
+        }
+        s = lsquare_norm(little_dfl_fields[i], nb_blocks*Ns, 1);
+        lmul_r(little_dfl_fields[i], 1./sqrt(creal(s)), little_dfl_fields[i], nb_blocks*Ns);
+      }
+      if(g_debug_level > 4) {
+        if(g_proc_id == 0) printf("Checking orthonormality of little dfl fields\n");
+        for(int i = 0; i < Ns; i++) {
+          for(int j = 0; j < Ns; j++) {
+            s = lscalar_prod(little_dfl_fields[i], little_dfl_fields[j], nb_blocks*Ns, 1);
+            if(g_proc_id == 0) {
+              printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s));
+            }
+          }
+        }
+      }
+      // matrix elements of little_D between the little basis vectors (work is consumed as its result)
+      for(int i = 0; i < Ns; i++) {
+        little_D(work, little_dfl_fields[i]);
+        for(int j = 0; j < Ns; j++) {
+          little_A[i * Ns + j]  = lscalar_prod(little_dfl_fields[j], work, nb_blocks*Ns, 1);
+          if(g_proc_id == 0 && g_debug_level > 4) {
+            printf("%1.3e %1.3ei, ", creal(little_A[i * Ns + j]), cimag(little_A[i * Ns + j]));
+          }
+        }
+        if(g_proc_id == 0 && g_debug_level > 4) printf("\n");
+      }
+      if(g_proc_id == 0 && g_debug_level > 4) printf("\n");
+      LUInvert(Ns, little_A, Ns);
+      // inverse of little little D now in little_A
+    }
+    else { // if little_evenodd = true
+      // compute the little little basis for the even/odd case
+      for(int j = 0; j < Ns; j++) {
+        for(int i = 0; i < nb_blocks*Ns; i++) {
+          (little_dfl_fields_eo[j][i]) = 0.0;
+          (work[i]) = 0.0;
+        }
+      }
+      // only blocks with evenodd == 1 contribute; their coefficients start at block slot nb_blocks/2
+      for(int i = 0; i < Ns; i++) {
+        split_global_field_GEN(psi, dfl_fields[i], nb_blocks);
+        // now take the local scalar products
+        i_o=0;
+        for(int blk = 0; blk < nb_blocks; blk++) {
+          if (block_list[blk].evenodd==1) {
+            for(int j = 0; j < Ns; j++) {
+              little_dfl_fields_eo[i][(nb_blocks/2+i_o)*Ns + j] = scalar_prod(block_list[blk].basis[j], psi[blk], block_list[0].volume, 0);
+            }
+            i_o++;
+          }
+        }
+      }
+
+      // orthonormalise
+      for(int i = 0; i < Ns; i++) {
+        for (int j = 0; j < i; j++) {
+          s = lscalar_prod(little_dfl_fields_eo[j], little_dfl_fields_eo[i], nb_blocks*Ns, 1);
+          lassign_diff_mul(little_dfl_fields_eo[i], little_dfl_fields_eo[j], s, nb_blocks*Ns);
+        }
+        s = lsquare_norm(little_dfl_fields_eo[i], nb_blocks*Ns, 1);
+        lmul_r(little_dfl_fields_eo[i], 1./sqrt(creal(s)), little_dfl_fields_eo[i], nb_blocks*Ns);
+      }
+      // copy to 32Bit fields
+      for(int i = 0; i < Ns; i++) {
+        for(int j = 0; j < nb_blocks*Ns; j++) {
+          little_dfl_fields_eo_32[i][j] = (_Complex float)little_dfl_fields_eo[i][j];
+        }
+      }
+
+      if(g_debug_level > 4) {
+        if(g_proc_id == 0) printf("Checking orthonormality of littel_dfl_fields_eo\n");
+        for(int i = 0; i < Ns; i++) {
+          for(int j = 0; j < Ns; j++) {
+            s = lscalar_prod(little_dfl_fields_eo[i], little_dfl_fields_eo[j], nb_blocks*Ns, 1);
+            if(g_proc_id == 0) {
+              printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s));
+            }
+          }
+        }
+      }
+      // matrix elements of little_D_sym between the even/odd little basis vectors
+      for(int i = 0; i < Ns; i++) {
+        little_D_sym(work, little_dfl_fields_eo[i]);
+        for(int j = 0; j < Ns; j++) {
+          little_A_eo[i * Ns + j]  = lscalar_prod(little_dfl_fields_eo[j], work, nb_blocks*Ns, 1);
+          if(g_proc_id == 0 && g_debug_level > 4) {
+            printf("%1.3e %1.3ei, ", creal(little_A_eo[i * Ns + j]), cimag(little_A_eo[i * Ns + j])); 
+          }
+        }
+        if(g_proc_id == 0 && g_debug_level > 4) printf("\n");
+      }
+      if(g_proc_id == 0 && g_debug_level > 4) printf("\n");
+      LUInvert(Ns, little_A_eo, Ns);
+      // inverse of eo little little D now in little_A_eo
+      // copy to the 32Bit version
+      for(int ij = 0; ij < Ns*Ns; ij++) {
+        little_A_eo_32[ij]  =  (_Complex float)little_A_eo[ij];
+      }
+    }
+
+    double etime = gettime();
+    if(g_proc_id == 0 && g_debug_level > 0) {
+      printf("# time for little little D computation %1.3e s\n", etime-atime);
+      fflush(stdout);
+    }
+    free(work);
+    free(psi[0]);
+    free(psi);
+  }
+}
+
+// TODO: a matching routine that frees what init_little_dfl_subspace() allocates is still missing
+
 int init_little_dfl_subspace(const int N_s) {
-  int i;
   if(init_little_subspace == 0) {
+    // why nb_blocks*9*N_s?
     if((void*)(_little_dfl_fields = (_Complex double*)calloc((N_s)*nb_blocks*9*N_s+4, sizeof(_Complex double))) == NULL) {
       return(1);
     }
     if((void*)(little_dfl_fields = (_Complex double**)calloc(N_s, sizeof(_Complex double*))) == NULL) {
       return(1);
     }
+    // why do we need this factor of 9??
     if((void*)(_little_dfl_fields_eo = (_Complex double*)calloc((N_s)*nb_blocks*9*N_s+4, sizeof(_Complex double))) == NULL) {
       return(1);
     }
     if((void*)(little_dfl_fields_eo = (_Complex double**)calloc(N_s, sizeof(_Complex double*))) == NULL) {
       return(1);
     }
-#if ( defined SSE || defined SSE2 || defined SSE3)
+    // why do we need this factor of 9??
+    if((void*)(_little_dfl_fields_eo_32 = (_Complex float*)calloc((N_s)*nb_blocks*9*N_s+4, sizeof(_Complex float))) == NULL) {
+      return(1);
+    }
+    if((void*)(little_dfl_fields_eo_32 = (_Complex float**)calloc(N_s, sizeof(_Complex float*))) == NULL) {
+      return(1);
+    }
+
     little_dfl_fields[0] = (_Complex double*)(((unsigned long int)(_little_dfl_fields)+ALIGN_BASE)&~ALIGN_BASE);
     little_dfl_fields_eo[0] = (_Complex double*)(((unsigned long int)(_little_dfl_fields_eo)+ALIGN_BASE)&~ALIGN_BASE);
-#else
-    little_dfl_fields[0] = _little_dfl_fields;
-    little_dfl_fields_eo[0] = _little_dfl_fields_eo;
-#endif
-    for (i = 1; i < N_s; i++) {
+    little_dfl_fields_eo_32[0] = (_Complex float*)(((unsigned long int)(_little_dfl_fields_eo_32)+ALIGN_BASE)&~ALIGN_BASE);
+    for (int i = 1; i < N_s; i++) {
       little_dfl_fields[i] = little_dfl_fields[i-1] + nb_blocks*9*N_s;
       little_dfl_fields_eo[i] = little_dfl_fields_eo[i-1] + nb_blocks*9*N_s;
+      little_dfl_fields_eo_32[i] = little_dfl_fields_eo_32[i-1] + nb_blocks*9*N_s;
     }
     if((void*)(little_A = (_Complex double*)calloc(N_s*N_s, sizeof(_Complex double))) == NULL) {
       return(1);
@@ -480,6 +525,9 @@ int init_little_dfl_subspace(const int N_s) {
     if((void*)(little_A_eo = (_Complex double*)calloc(N_s*N_s, sizeof(_Complex double))) == NULL) {
       return(1);
     }   
+    if((void*)(little_A_eo_32 = (_Complex float*)calloc(N_s*N_s, sizeof(_Complex float))) == NULL) {
+      return(1);
+    }   
     init_little_subspace = 1;
   }
   return(0);
@@ -494,11 +542,7 @@ int init_dfl_subspace(const int N_s) {
   if ((void*)(dfl_fields = calloc((N_s), sizeof(spinor *))) == NULL) {
     return(1);
   }
-#if ( defined SSE || defined SSE2 || defined SSE3)
   dfl_fields[0] = (spinor*)(((unsigned long int)(_dfl_fields)+ALIGN_BASE)&~ALIGN_BASE);
-#else
-  dfl_fields[0] = _dfl_fields;
-#endif
   for (i = 1; i < N_s; ++i) {
     dfl_fields[i] = dfl_fields[i-1] + VOLUMEPLUSRAND;
   }
diff --git a/solver/generate_dfl_subspace.h b/solver/generate_dfl_subspace.h
index dc5848539..9b5cda97a 100644
--- a/solver/generate_dfl_subspace.h
+++ b/solver/generate_dfl_subspace.h
@@ -24,11 +24,13 @@
 
 int init_dfl_subspace(const int);
 int free_dfl_subspace();
+int update_dfl_subspace(const int Ns, const int N, const int Nsmooth);
 int generate_dfl_subspace(const int Ns, const int N, const int repro);
 int generate_dfl_subspace_free(const int Ns, const int N);
 
 extern spinor ** dfl_fields;
 extern _Complex double ** little_dfl_fields;
 extern _Complex double ** little_dfl_fields_eo;
+extern _Complex float ** little_dfl_fields_eo_32;
 
 #endif
diff --git a/solver/gmres.c b/solver/gmres.c
index 036545b25..f53b61fe6 100644
--- a/solver/gmres.c
+++ b/solver/gmres.c
@@ -46,7 +46,7 @@
  ********************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include<stdio.h>
diff --git a/solver/gmres_dr.c b/solver/gmres_dr.c
index 3a55a137c..787c34858 100644
--- a/solver/gmres_dr.c
+++ b/solver/gmres_dr.c
@@ -36,7 +36,7 @@
  ********************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include<stdio.h>
diff --git a/solver/gmres_precon.c b/solver/gmres_precon.c
index bc3d6184a..479bf85c9 100644
--- a/solver/gmres_precon.c
+++ b/solver/gmres_precon.c
@@ -42,7 +42,7 @@
  ********************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include<stdio.h>
diff --git a/solver/gram-schmidt.c b/solver/gram-schmidt.c
index e16eee224..47f046968 100644
--- a/solver/gram-schmidt.c
+++ b/solver/gram-schmidt.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <math.h>
 #include <stdio.h>
diff --git a/solver/incr_eigcg.c b/solver/incr_eigcg.c
index 787348a78..3e1047a57 100644
--- a/solver/incr_eigcg.c
+++ b/solver/incr_eigcg.c
@@ -104,7 +104,7 @@
  ****************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include <stdlib.h>
@@ -112,7 +112,7 @@
 #include <math.h>
 #include <time.h>
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 
@@ -201,7 +201,7 @@ int incr_eigcg(const int N, const int nrhs,  const int nrhs1, spinor * const x,
      LDN = VOLUMEPLUSRAND/2;
   
  
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     parallel=1;
   #else
     parallel=0;
@@ -382,21 +382,8 @@ int incr_eigcg(const int N, const int nrhs,  const int nrhs1, spinor * const x,
     /* ------------------------------------------------------------ */
     /* Adjust nev for eigcg according to available ldh/restart      */
     /* ------------------------------------------------------------ */
-	  
     if (flag == 3) { /* restart with the same rhs, set nev_used = 0 */
       nev_used = 0;
-      /* if convergence seems before next restart do not restart again */
-      if(rel_prec)
-      {
-	       if (cur_res*(restart_eps_sq) < eps_sq*normb*normb) 
-	           restart_eps_sq=0.0;
-      }
-      else
-      {
-	       if (cur_res*(restart_eps_sq) < eps_sq) 
-	          restart_eps_sq=0.0;
-      } /* if(rel_prec) */
-	  
     }
     else
     {    
@@ -405,7 +392,6 @@ int incr_eigcg(const int N, const int nrhs,  const int nrhs1, spinor * const x,
       if (ldh-ncurEvals < nev)
 	       nev = ldh - ncurEvals;
       nev_used = nev;
-      
     }
 
     /* ------------------------------------------------------------ */
diff --git a/solver/index_jd.c b/solver/index_jd.c
index a0a53691b..69fd02772 100644
--- a/solver/index_jd.c
+++ b/solver/index_jd.c
@@ -11,7 +11,7 @@
 #include <math.h>
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 
 #include "global.h"
@@ -52,7 +52,7 @@ void index_jd(int * nr_of_eigenvalues_ov,
   double absdifference;
   const int N2 = VOLUMEPLUSRAND;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   double atime, etime;
 #endif
   double lowestmodes[20];
@@ -117,7 +117,7 @@ void index_jd(int * nr_of_eigenvalues_ov,
   j_min = 8; j_max = 16;
   max_iter = 70;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   atime = MPI_Wtime();
 #endif
 
@@ -187,7 +187,7 @@ void index_jd(int * nr_of_eigenvalues_ov,
     }
   }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   etime = MPI_Wtime();
   if(g_proc_id == g_stdio_proc){
     printf("It took %f sec to determine the sector with zero modes, if any!\n", etime-atime);
@@ -262,7 +262,7 @@ void index_jd(int * nr_of_eigenvalues_ov,
       eigenvalues_ov[i] = lowestmodes[first_blocksize*intsign+i];
     }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
     atime = MPI_Wtime();
 #endif
 
@@ -285,7 +285,7 @@ void index_jd(int * nr_of_eigenvalues_ov,
       returncode = 0;
 
       /* compute minimal eigenvalues */
-#ifdef MPI
+#ifdef TM_USE_MPI
       /*      pjdher(VOLUME*sizeof(spinor)/sizeof(_Complex double), VOLUMEPLUSRAND*sizeof(spinor)/sizeof(_Complex double),
 	     shift, prec, omega, n_omega, ev_tr,
 	     i+blocksize, j_max, j_min, 
@@ -364,7 +364,7 @@ void index_jd(int * nr_of_eigenvalues_ov,
       }
     }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
     etime = MPI_Wtime();
 #endif
 
@@ -383,7 +383,7 @@ void index_jd(int * nr_of_eigenvalues_ov,
     /* Some Output */
     if(g_proc_id == g_stdio_proc) {
       printf("Index is %s%d!\n", intsign ? "-" : "+", index);
-#ifdef MPI
+#ifdef TM_USE_MPI
       printf("Zero modes determined in %f sec!\n", etime-atime);
 #endif
     }
diff --git a/solver/init_guess.c b/solver/init_guess.c
new file mode 100644
index 000000000..30c4e1c9f
--- /dev/null
+++ b/solver/init_guess.c
@@ -0,0 +1,190 @@
+/***********************************************************************
+ *
+ *
+ * Copyright (C) 2016 Simone Bacchio
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "gamma.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "gettime.h"
+#include "solver/solver.h"
+#include "solver_field.h"
+#include "operator/tm_operators.h"
+#include "operator/tm_operators_nd.h"
+#include "init_guess.h"
+#include <io/params.h>
+
+/* Initial guess for shift index `shift` of a multi-shift solve.
+ * P[shift] is set to the Lagrange extrapolation, in the variable shifts[]^2, of the
+ * fields P[j] with j > shift; for the last shift (no such P[j]) it is set to zero.
+ * With g_debug_level > 2 the relative residual against Q is computed and printed.
+ * Always returns 0; the function has no failure path. */
+int init_guess_mms(spinor ** const P, spinor * const Q,
+                   int shift, solver_params_t * const solver_params) {
+  double * const shifts = solver_params->shifts;
+  const int no_shifts = solver_params->no_shifts;
+  const int sdim = solver_params->sdim;
+  if(shift == no_shifts-1) {
+    zero_spinor_field(P[shift], sdim);
+  }
+  /* runs zero times when shift is the last index */
+  for(int j = no_shifts-1; j > shift; j--) {
+    double coeff = 1;
+    for(int k = no_shifts-1; k > shift; k--) {
+      if(j != k) {
+        coeff *= (shifts[k]*shifts[k]-shifts[shift]*shifts[shift])/
+                 (shifts[k]*shifts[k]-shifts[j]*shifts[j]);
+      }
+    }
+    if(j == no_shifts-1) {
+      mul_r(P[shift], coeff, P[j], sdim);
+    } else {
+      assign_add_mul_r(P[shift], P[j], coeff, sdim);
+    }
+  }
+  if(g_debug_level > 2){
+    const double old_g_mu3 = g_mu3;
+    spinor** temp;
+    init_solver_field(&temp, (sdim == VOLUME/2) ? VOLUMEPLUSRAND/2 : VOLUMEPLUSRAND, 1);
+    /* apply the operator with g_mu3 set to this shift, then put g_mu3 back */
+    g_mu3 = shifts[shift];
+    solver_params->M_psi( temp[0], P[shift]);
+    g_mu3 = old_g_mu3;
+    diff( temp[0], temp[0], Q, sdim);
+    double res = sqrt(square_norm(temp[0], sdim, 1)/square_norm(Q, sdim, 1));
+    finalize_solver(temp, 1);
+    if(g_proc_id == 0)
+      printf("INITIAL GUESS: shift id=%d value=%e  relative residual: %e\n",shift,shifts[shift],res); 
+  }
+  return 0;
+}
+
+/* Initial guess for shift index `shift` of a two-component (up/dn) multi-shift solve.
+ * Pup[shift] and Pdn[shift] are set to the Lagrange extrapolation, in the variable
+ * shifts[]^2, of the fields with index j > shift; for the last shift they are zeroed.
+ * With g_debug_level > 2 the relative residual against (Qup, Qdn) is printed.
+ * Always returns 0; the function has no failure path. */
+int init_guess_mms_nd(spinor ** const Pup, spinor ** const Pdn, 
+                      spinor * const Qup, spinor * const Qdn, 
+                      int shift, solver_params_t * solver_params) {
+  double * const shifts = solver_params->shifts;
+  const int no_shifts = solver_params->no_shifts;
+  const int sdim = solver_params->sdim;
+  if(shift == no_shifts-1) {
+    zero_spinor_field(Pup[shift], sdim);
+    zero_spinor_field(Pdn[shift], sdim);
+  }
+  for(int j = no_shifts-1; j > shift; j--) {
+    double coeff = 1;
+    for(int k = no_shifts-1; k > shift; k--) {
+      if(j != k) {
+        coeff *= (shifts[k]*shifts[k]-shifts[shift]*shifts[shift])/
+                 (shifts[k]*shifts[k]-shifts[j]*shifts[j]);
+      }
+    }
+    if(j == no_shifts-1) {
+      mul_r(Pup[shift], coeff, Pup[j], sdim);
+      mul_r(Pdn[shift], coeff, Pdn[j], sdim);
+    } else {
+      assign_add_mul_r(Pup[shift], Pup[j], coeff, sdim);
+      assign_add_mul_r(Pdn[shift], Pdn[j], coeff, sdim);
+    }
+  }
+  if(g_debug_level > 2){
+    const double old_g_shift = g_shift;
+    matrix_mult_nd f = Qtm_pm_ndpsi_shift;
+    if( solver_params->M_ndpsi == Qsw_pm_ndpsi ) 
+      f = Qsw_pm_ndpsi_shift;
+    spinor** temp;
+    init_solver_field(&temp, (sdim == VOLUME/2) ? VOLUMEPLUSRAND/2 : VOLUMEPLUSRAND, 2);
+    /* apply the shifted operator with g_shift = shift^2, then put g_shift back */
+    g_shift = shifts[shift]*shifts[shift]; 
+    f( temp[0], temp[1], Pup[shift], Pdn[shift]);
+    g_shift = old_g_shift;
+    diff( temp[0], temp[0], Qup, sdim);
+    diff( temp[1], temp[1], Qdn, sdim);
+    double res = sqrt(square_norm(temp[0], sdim, 1)+square_norm(temp[1], sdim, 1))/
+      sqrt(square_norm(Qup, sdim, 1)+square_norm(Qdn, sdim, 1));
+    finalize_solver(temp, 2);
+    if(g_proc_id == 0)
+      printf("INITIAL GUESS ND: shift id=%d value=%e  relative residual: %e\n",shift,shifts[shift],res); 
+  }
+  return 0;
+}
+
+/* Initial guess for shift index `shift` of a two-component (up/dn) multi-shift solve.
+ * Pup[shift] and Pdn[shift] are set to the Lagrange extrapolation, in the variable
+ * shifts[] itself (not squared), of the fields with index j > shift; for the last
+ * shift they are zeroed. With g_debug_level > 2 the relative residual against
+ * (Qup, Qdn) is printed. Always returns 0; the function has no failure path. */
+int init_guess_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn, 
+                           spinor * const Qup, spinor * const Qdn, 
+                           int shift, solver_params_t * solver_params) {
+  double * const shifts = solver_params->shifts;
+  const int no_shifts = solver_params->no_shifts;
+  const int sdim = solver_params->sdim;
+  if(shift == no_shifts-1) {
+    zero_spinor_field(Pup[shift], sdim);
+    zero_spinor_field(Pdn[shift], sdim);
+  }
+  for(int j = no_shifts-1; j > shift; j--) {
+    double coeff = 1;
+    for(int k = no_shifts-1; k > shift; k--) {
+      if(j != k) {
+        coeff *= (shifts[k]-shifts[shift])/(shifts[k]-shifts[j]);
+      }
+    }
+    if(j == no_shifts-1) {
+      mul_r(Pup[shift], coeff, Pup[j], sdim);
+      mul_r(Pdn[shift], coeff, Pdn[j], sdim);
+    } else {
+      assign_add_mul_r(Pup[shift], Pup[j], coeff, sdim);
+      assign_add_mul_r(Pdn[shift], Pdn[j], coeff, sdim);
+    }
+  }
+  if(g_debug_level > 2){
+    const double old_g_shift = g_shift;
+    matrix_mult_nd f = Qtm_tau1_ndpsi_add_Ishift;
+    if( solver_params->M_ndpsi == Qsw_pm_ndpsi )
+      f = Qsw_tau1_ndpsi_add_Ishift;
+    spinor** temp;
+    init_solver_field(&temp, (sdim == VOLUME/2) ? VOLUMEPLUSRAND/2 : VOLUMEPLUSRAND, 2);
+    /* apply the shifted operator with g_shift = shift^2, then put g_shift back */
+    g_shift = shifts[shift]*shifts[shift]; 
+    f( temp[0], temp[1], Pup[shift], Pdn[shift]);
+    g_shift = old_g_shift;
+    diff( temp[0], temp[0], Qup, sdim);
+    diff( temp[1], temp[1], Qdn, sdim);
+    double res = sqrt(square_norm(temp[0], sdim, 1)+square_norm(temp[1], sdim, 1))/
+      sqrt(square_norm(Qup, sdim, 1)+square_norm(Qdn, sdim, 1));
+    finalize_solver(temp, 2);
+    if(g_proc_id == 0)
+      printf("INITIAL GUESS ND PLUS: shift id=%d value=%e  relative residual: %e\n",shift,shifts[shift],res); 
+  }
+  return 0;
+}
diff --git a/solver/init_guess.h b/solver/init_guess.h
new file mode 100644
index 000000000..8ae29e80c
--- /dev/null
+++ b/solver/init_guess.h
@@ -0,0 +1,39 @@
+/***********************************************************************
+ *
+ *
+ * Copyright (C) 2016 Simone Bacchio
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifndef _INIT_GUESS_H
+#define _INIT_GUESS_H
+
+#include"su3.h"
+#include"solver.h"
+
+int init_guess_mms(spinor ** const P, spinor * const Q,
+                   int shift, solver_params_t * const params);
+
+int init_guess_mms_nd(spinor ** const Pup, spinor ** const Pdn, 
+                      spinor * const Qup, spinor * const Qdn, 
+                      int shift, solver_params_t * solver_params);
+
+int init_guess_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn, 
+                           spinor * const Qup, spinor * const Qdn, 
+                           int shift, solver_params_t * solver_params);
+#endif
diff --git a/solver/jdher.c b/solver/jdher.c
index a7f5d0563..c6e61c62c 100644
--- a/solver/jdher.c
+++ b/solver/jdher.c
@@ -42,7 +42,7 @@
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <limits.h>
 #include <float.h>
@@ -84,7 +84,7 @@ void Proj_A_psi(spinor * const y, spinor * const x);
 
 void jderrorhandler(const int i, char * message) {
   fprintf(stderr, "jdher %s \n", message);
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Finalize();
 #endif
   exit(i);
diff --git a/solver/jdher_bi.c b/solver/jdher_bi.c
index 2334c73f7..7f6dda27e 100644
--- a/solver/jdher_bi.c
+++ b/solver/jdher_bi.c
@@ -41,7 +41,7 @@
  **************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <limits.h>
 #include <float.h>
@@ -50,7 +50,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
diff --git a/solver/jdher_su3vect.c b/solver/jdher_su3vect.c
index 551576f07..83ff8bc04 100644
--- a/solver/jdher_su3vect.c
+++ b/solver/jdher_su3vect.c
@@ -23,7 +23,7 @@
  *
  *******************************************************************************/
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <limits.h>
 #include <stdio.h>
@@ -71,7 +71,7 @@ void Proj_A_psi_su3vect(su3_vector * const y, su3_vector * const x, int tslice);
 void jderrorhandler_su3vect(const int i, char * message) 
 {
   fprintf(stderr, "jdher %s \n", message);
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Finalize();
 #endif
   exit(i);
diff --git a/solver/little_mg_precon_body.c b/solver/little_mg_precon_body.c
new file mode 100644
index 000000000..14d6ddd18
--- /dev/null
+++ b/solver/little_mg_precon_body.c
@@ -0,0 +1,29 @@
+/***********************************************************************
+ * Copyright (C) 2016 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+/* Little-operator preconditioner step: out = phi + (in - D*phi), phi = projection of in. */
+void _PSWITCH(little_mg_precon)(_Complex _F_TYPE * const out, _Complex _F_TYPE * const in) {
+  // out <- phi = PD_c^{-1} P^dagger in (work[] is only read after this call)
+  _PSWITCH(little_project_eo)(out, in, g_N_s);
+  // work[2] <- D*phi, then work[3] <- in - D*phi
+  _PSWITCH(little_D_sym)((_Complex _F_TYPE *) work[2], out);
+  _PSWITCH(ldiff)((_Complex _F_TYPE *) work[3], in, (_Complex _F_TYPE *) work[2], nb_blocks*g_N_s);
+  // out <- (in - D*phi) + phi
+  _PSWITCH(ladd)(out, (_Complex _F_TYPE *) work[3], out, nb_blocks*g_N_s);
+}
diff --git a/solver/little_project_eo_body.c b/solver/little_project_eo_body.c
new file mode 100644
index 000000000..8e65e2676
--- /dev/null
+++ b/solver/little_project_eo_body.c
@@ -0,0 +1,57 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2008,2016 Albert Deuzeman, Siebren Reker, Carsten Urbach
+ *                    Claude Tadonki
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+/* Combine the N little_dfl_fields_eo vectors into `out`, with coefficients obtained from
+ * their overlaps with `in` multiplied by the N x N matrix little_A_eo. */
+void _PSWITCH(little_project_eo)(_Complex _F_TYPE * const out, _Complex _F_TYPE * const in, const int  N) {
+  if(init_dfl_projector == 0) {
+    alloc_dfl_projector();
+  }
+  /* scratch buffers, read from work[] only after the allocation check above */
+  _Complex _F_TYPE * const coef = (_Complex _F_TYPE *)work[2];
+  _Complex _F_TYPE * const overlap = (_Complex _F_TYPE *)work[3];
+  const int len = nb_blocks*N;
+  /* NOTE IS THIS REALLY NECESSARY/CORRECT? */
+  /* overlaps with `in`; presumably rank-local (last argument 0), reduced below */
+  for(int i = 0; i < N; i++) {
+    coef[i] = _PSWITCH(lscalar_prod)(_PSWITCH(little_dfl_fields_eo)[i], in, len, 0);
+  }
+
+#ifdef TM_USE_MPI
+  MPI_Allreduce(coef, overlap, N, _MPI_C_TYPE, MPI_SUM, MPI_COMM_WORLD);
+#else
+  memcpy(overlap, coef, N*sizeof(_Complex _F_TYPE));
+#endif
+
+  /* apply inverse of little_A_eo: coef[i] = sum_j little_A_eo[j*N + i] * overlap[j] */
+  for(int i = 0; i < N; i++) {
+    _Complex _F_TYPE acc = 0.0;
+    for(int j = 0; j < N; j++) {
+      acc += (_PSWITCH(little_A_eo)[j*N + i]) * (overlap[j]);
+    }
+    coef[i] = acc;
+  }
+  /* first vector via lmul, the remaining ones accumulated with lassign_add_mul */
+  _PSWITCH(lmul)(out, coef[0], _PSWITCH(little_dfl_fields_eo)[0], len);
+  for(int i = 1; i < N; i++) {
+    _PSWITCH(lassign_add_mul)(out, _PSWITCH(little_dfl_fields_eo)[i], coef[i], len);
+  }
+}
diff --git a/solver/lu_solve.c b/solver/lu_solve.c
index 8207ffd95..d0d2d0aab 100644
--- a/solver/lu_solve.c
+++ b/solver/lu_solve.c
@@ -17,7 +17,7 @@
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include<math.h>
@@ -327,7 +327,7 @@ void LUInvert( const int Nvec, _Complex double * const M, const int ldM) {
     }
   }
 
-  if(g_debug_level > 4 && g_proc_id == 0) {
+  if(g_debug_level > 6 && g_proc_id == 0) {
     printf("check little_A inversion \n");
     for(i = 0; i < Nvec; i++) {
       for(j = 0; j < Nvec; j++) {
diff --git a/solver/matrix_mult_typedef.h b/solver/matrix_mult_typedef.h
index 70c6e94b5..9944566e8 100644
--- a/solver/matrix_mult_typedef.h
+++ b/solver/matrix_mult_typedef.h
@@ -29,9 +29,13 @@
 
 typedef void (*matrix_mult)(spinor * const, spinor * const);
 typedef void (*matrix_mult4bispinors)(bispinor * const, bispinor * const);
+typedef void (*matrix_mult_full)(spinor * const, spinor * const, spinor * const, spinor * const);
+typedef void (*matrix_mult32)(spinor32 * const, spinor32 * const);
 typedef void (*matrix_mult_blk)(spinor * const, spinor * const, const int);
+typedef void (*matrix_mult_blk32)(spinor32 * const, spinor32 * const, const int);
 typedef void (*matrix_mult_clover)(spinor * const, spinor * const, const double);
 typedef void (*c_matrix_mult)(_Complex double * const, _Complex double * const);
+typedef void (*c_matrix_mult_32)(_Complex float * const, _Complex float * const);
 typedef void (*matrix_mult_su3vect)(su3_vector * const, su3_vector * const, const int);
 
 #endif
diff --git a/solver/matrix_mult_typedef_nd.h b/solver/matrix_mult_typedef_nd.h
index d5ab8cab4..ce298c946 100644
--- a/solver/matrix_mult_typedef_nd.h
+++ b/solver/matrix_mult_typedef_nd.h
@@ -29,6 +29,7 @@
 #define _MATRIX_MULT_TYPEDEF_ND_H
 
 typedef void (*matrix_mult_nd)(spinor * const, spinor * const,spinor * const, spinor * const);
-
+typedef void (*matrix_mult_full_nd)(spinor * const, spinor * const,spinor * const, spinor * const,spinor * const, spinor * const,spinor * const, spinor * const);
+typedef void (*matrix_mult_nd32)(spinor32 * const, spinor32 * const, spinor32 * const, spinor32 * const);
 
 #endif
diff --git a/solver/mcr.c b/solver/mcr.c
new file mode 100644
index 000000000..1d439f5b6
--- /dev/null
+++ b/solver/mcr.c
@@ -0,0 +1,179 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include<stdlib.h>
+#include<stdio.h>
+#include<math.h>
+#include"global.h"
+#include"su3.h"
+#include"linalg_eo.h"
+#include"solver/gmres_precon.h"
+#include"start.h"
+#include"operator/tm_operators.h"
+#include"solver/poly_precon.h"
+#include"solver/cg_her.h"
+#include"operator/D_psi.h"
+#include"Msap.h"
+#include"dfl_projector.h"
+#include "solver_field.h"
+#include"mcr.h"
+#include"time.h"
+#include "gettime.h"
+
+/* Convergence test on the squared residual err: absolute (rel_prec == 0) or
+ * relative to norm_sq (rel_prec == 1). Any other rel_prec never converges. */
+static int mcr_converged(const double err, const double eps_sq,
+                         const double norm_sq, const int rel_prec) {
+  return(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1)));
+}
+
+/*
+ * Restarted mCR iteration for f*P = Q, optionally preconditioned with
+ * Msap_eo.
+ *
+ *   P             in: initial guess, out: approximate solution
+ *   Q             right-hand side
+ *   m             maximal number of inner iterations per restart cycle
+ *   max_restarts  maximal number of restart cycles
+ *   eps_sq        bound on the squared residual norm; with rel_prec == 1 it is
+ *                 taken relative to |Q|^2 (or to 1 if |Q|^2 < 1e-32)
+ *   N             field length; N == VOLUME selects full-size work fields,
+ *                 anything else half-size ones
+ *   precon        0: residual is used as is, otherwise Msap_eo is applied to it
+ *   f             operator to be inverted
+ *
+ * Returns the number of inner iterations done when the true residual meets the
+ * bound, and -1 otherwise. g_sloppy_precision is reset to its value at entry on
+ * every return path, including the successful ones.
+ */
+int mcr(spinor * const P, spinor * const Q, 
+        const int m, const int max_restarts,
+        const double eps_sq, const int rel_prec,
+        const int N, const int precon, matrix_mult f) {
+  int iter = 0, converged = 0;
+  double norm_sq, err, norm;
+  _Complex double alpha, beta;
+  const _Complex double one = 1.0;
+  double atime, etime;
+  spinor ** solver_field = NULL;
+  const int nr_sf = 5;
+  const int save_sloppy = g_sloppy_precision;
+
+  if(N == VOLUME) {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+  }
+  else {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
+  }
+  atime = gettime();
+  spinor * const xi = solver_field[0];
+  spinor * const Axi = solver_field[1];
+  spinor * const chi = solver_field[2];
+  spinor * const Achi = solver_field[3];
+  spinor * const tmp = solver_field[4];
+
+  norm_sq = square_norm(Q, N, 1);
+  if(norm_sq < 1.e-32) {
+    norm_sq = 1.;
+  }
+
+  for(int restart = 0; restart < max_restarts; restart++) {
+    /* (re)start from the true residual chi = Q - f*P */
+    dfl_sloppy_prec = 0;
+    f(tmp, P);
+    diff(chi, Q, tmp, N);
+    assign(xi, chi, N);
+    f(Axi, xi);
+    err = square_norm(chi, N, 1);
+    if(g_proc_id == g_stdio_proc && g_debug_level > 2){
+      printf("mCR: iteration number: %d true residue: %g\n", iter, err); 
+      fflush(stdout);
+    }
+    if(mcr_converged(err, eps_sq, norm_sq, rel_prec)) {
+      converged = 1;
+      break;
+    }
+
+    for(int k = 0; k < m; k++) {
+      if(precon == 0) {
+        assign(tmp, chi, N);
+      }
+      else {
+        zero_spinor_field(tmp, N);
+        Msap_eo(tmp, chi, NcycleMsap, NiterMsap);
+      }
+      dfl_sloppy_prec = 1;
+      alpha = scalar_prod(Axi, tmp, N, 1);
+      norm = square_norm(Axi, N, 1);
+      alpha /= norm;
+      assign_add_mul(P, xi, alpha, N);
+      /* get the new residual */
+      assign_diff_mul(chi, Axi, alpha, N);
+      err = square_norm(chi, N, 1);
+      iter ++;
+      etime = gettime();
+      if(g_proc_id == g_stdio_proc && g_debug_level > 1){
+        printf("# mCR: %d\t%g iterated residue, time spent %f s\n", iter, err, (etime - atime)); 
+        fflush(stdout);
+      }
+      /* Precision reached or cycle exhausted? The true residual is re-checked outside. */
+      if((k == m-1) || mcr_converged(err, eps_sq, norm_sq, rel_prec)) {
+        break;
+      }
+
+#ifdef _USE_HALFSPINOR
+      if(mcr_converged(err*err, eps_sq, norm_sq, rel_prec)) {
+        if (g_sloppy_precision_flag == 1) {
+          g_sloppy_precision = 1;
+          if(g_debug_level > 2 && g_proc_id == g_stdio_proc) {
+            printf("sloppy precision on\n"); fflush( stdout);
+          }
+        }
+      }
+#endif
+      f(Achi, chi); 
+      beta = scalar_prod(Axi, Achi, N, 1);
+      beta /= norm;
+      beta = -beta;
+      assign_mul_add_mul(xi, beta, chi, one, N);
+      assign_mul_add_mul(Axi,beta, Achi, one, N);
+    }
+  }
+
+  /* loop ended without a successful restart check: test the true residual once more */
+  if(!converged) {
+    f(tmp, P);
+    diff(chi, Q, tmp, N);
+    err = square_norm(chi, N, 1);
+    if(g_proc_id == g_stdio_proc && g_debug_level > 0){
+      printf("mCR: %d\t%g true residue\n", iter, err); 
+      fflush(stdout);
+    }
+    converged = mcr_converged(err, eps_sq, norm_sq, rel_prec);
+  }
+
+  g_sloppy_precision = save_sloppy;
+  finalize_solver(solver_field, nr_sf);
+  return(converged ? iter : -1);
+}
+
+
diff --git a/solver/mcr.h b/solver/mcr.h
new file mode 100644
index 000000000..db19cbb6f
--- /dev/null
+++ b/solver/mcr.h
@@ -0,0 +1,31 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _MCR_H
+#define _MCR_H
+
+#include"solver/matrix_mult_typedef.h"
+#include"su3.h"
+
+int mcr(spinor * const P, spinor * const Q, 
+	const int m, const int max_restarts,
+	const double eps_sq, const int rel_prec,
+	const int N, const int precon, matrix_mult f);
+
+#endif
diff --git a/solver/mcr4complex.c b/solver/mcr4complex.c
new file mode 100644
index 000000000..388c6a501
--- /dev/null
+++ b/solver/mcr4complex.c
@@ -0,0 +1,204 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *               2010 claude Tadonki
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include<stdlib.h>
+#include<stdio.h>
+#include<math.h>
+#include<string.h>
+#include"global.h"
+#include"su3.h"
+#include"linalg_eo.h"
+#include"mcr4complex.h"
+#include "time.h"
+
+static void init_lmcr(const int _M, const int _V);
+static void free_lmcr();
+static _Complex double * chi = NULL;   /* residual Q - f(P), updated in place by the inner steps */
+static _Complex double * xi = NULL;    /* search direction that is added to P */
+static _Complex double * Achi = NULL;  /* f applied to chi */
+static _Complex double * Axi = NULL;   /* f applied to xi, then advanced by the same recurrence as xi */
+static _Complex double * tmp = NULL;   /* scratch vector */
+static int lmcr_init = 0;              /* 1 while the five vectors above are allocated */
+
+/* Restarted CR-type iterative solver for f(P) = Q on plain complex vectors.
+ *
+ * P            start vector on entry, last iterate on exit (updated in place)
+ * Q            right-hand side
+ * m            inner steps per restart cycle; max_restarts: number of cycles
+ * eps_sq       bound on err, the squared norm of the true residual Q - f(P):
+ *              err <= eps_sq if rel_prec == 0, err <= eps_sq*|Q|^2 if rel_prec == 1
+ * N, parallel  vector length and reduction switch handed to lsquare_norm/lscalar_prod
+ * lda          length of the static work vectors (init_lmcr); f is called as f(out, in)
+ * Returns the number of inner steps taken on convergence, -1 otherwise. */
+int mcr4complex(_Complex double * const P, _Complex double * const Q, 
+                const int m, const int max_restarts,
+                const double eps_sq, const int rel_prec,
+                const int N, const int parallel, 
+                const int lda, c_matrix_mult f) {
+  int k, restart, p = 0;
+  double norm_sq, norm, err = 0.;
+  _Complex double alpha, beta;
+  const _Complex double one = 1.0;
+  double atime, etime;
+
+  init_lmcr(m, lda);
+
+  /* reference norm for the relative test; a (near) zero source is measured against 1 */
+  norm_sq = lsquare_norm(Q, N, parallel);
+  if(norm_sq < 1.e-20) {
+    norm_sq = 1.;
+  }
+
+#ifdef TM_USE_MPI
+  atime = MPI_Wtime();
+#else
+  atime = ((double)clock())/((double)(CLOCKS_PER_SEC));
+#endif
+
+  for(restart = 0; restart < max_restarts; restart++) {
+    /* every cycle starts from the true residual chi = Q - f(P), with xi = chi */
+    f(tmp, P);
+    ldiff(chi, Q, tmp, N);
+    memcpy(xi, chi, N*sizeof(_Complex double));
+    f(Axi,xi);
+
+    err = lsquare_norm(chi, N, parallel);
+    if(g_proc_id == g_stdio_proc && g_debug_level > 1){
+      printf("lPHCR: %d\t%g true residue\n", p, err); 
+      fflush(stdout);
+    }
+    if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq * norm_sq) && (rel_prec == 1))) {
+      return (p);
+    }
+
+    for(k = 0;k<m ; k++) {
+      /* step length alpha = lscalar_prod(Axi, chi) / |Axi|^2 */
+      alpha = lscalar_prod(Axi, chi, N, parallel);
+      norm = lsquare_norm(Axi, N, parallel);
+      alpha /= norm;
+
+      lassign_add_mul(P,xi, alpha, N);
+      lassign_diff_mul(chi, Axi, alpha, N);
+
+      err = lsquare_norm(chi, N, parallel);
+      p++;
+      if(g_proc_id == g_stdio_proc && g_debug_level > 1){
+        printf("mCR: %d\t%g iterated residue\n", p, err); 
+        fflush(stdout);
+      }
+      /* Precision reached, or last inner step: leave and let the restart re-check */
+      if((k == m-1) || ((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) {
+        break;
+      }
+
+      f(Achi, chi);
+      /* beta must use the caller's reduction mode like alpha and norm do; a fixed
+       * mode here would make numerator and denominator inconsistent */
+      beta = lscalar_prod(Axi, Achi, N, parallel);
+      beta /= -norm;
+
+      /* xi <- chi + beta*xi and Axi <- Achi + beta*Axi, staged through tmp */
+      lmul_add_mul(tmp,chi,xi,one,beta,N);
+      memcpy(xi, tmp, N*sizeof(_Complex double));
+      lmul_add_mul(tmp,Achi,Axi,one,beta,N);
+      memcpy(Axi, tmp, N*sizeof(_Complex double));
+    }
+  }
+
+  /* check if it converged in the last restart cycle; only the residual is needed,
+   * so no new search direction is built here */
+  if (restart == max_restarts) {
+    f(tmp, P);
+    ldiff(chi, Q, tmp, N);
+    err = lsquare_norm(chi, N, parallel);
+
+#ifdef TM_USE_MPI
+    etime = MPI_Wtime();
+#else
+    etime = ((double)clock())/((double)(CLOCKS_PER_SEC));
+#endif
+    if(g_proc_id == g_stdio_proc && g_debug_level > 1){
+      printf("lPHCR: %d\t%g true residue, time spent %f s\n", p, err, (etime - atime)); 
+      fflush(stdout);
+    }
+    if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq * norm_sq) && (rel_prec == 1))) {
+      return (p);
+    }
+  }
+
+  /* failure is reported by the same rank that prints the other messages */
+  if(g_proc_id == g_stdio_proc && g_debug_level > 1) printf("lPHcr: for -1 %d %e\n", p, err);
+  return(-1);
+}
+
+/* (Re)allocate the five static work vectors with _V elements each whenever the requested
+ * (_M, _V) pair differs from the one in use; terminates the program if memory runs out. */
+static void init_lmcr(const int _M, const int _V){
+  static int Vo = -1;
+  static int M = -1;
+  /* nothing to do while the live buffers already match the request */
+  if((lmcr_init != 0) && (M == _M) && (Vo == _V)) return;
+  if(lmcr_init == 1) free_lmcr();
+  Vo = _V; M = _M;
+  chi = calloc(Vo, sizeof(_Complex double));
+  xi = calloc(Vo, sizeof(_Complex double));
+  Achi = calloc(Vo, sizeof(_Complex double));
+  Axi = calloc(Vo, sizeof(_Complex double));
+  tmp = calloc(Vo, sizeof(_Complex double));
+  if((Vo > 0) && (chi == NULL || xi == NULL || Achi == NULL || Axi == NULL || tmp == NULL)) {
+    fprintf(stderr, "init_lmcr: unable to allocate work vectors of %d elements\n", Vo); exit(1);
+  }
+  lmcr_init = 1;
+}
+
+/* Release the static work vectors and mark the solver workspace as uninitialised. */
+static void free_lmcr() 
+{
+  /* pointers are cleared after free so no stale address survives in file scope */
+  free(chi);  chi = NULL;
+  free(xi);   xi = NULL;
+  free(Achi); Achi = NULL;
+  free(Axi);  Axi = NULL;
+  free(tmp);  tmp = NULL;
+  lmcr_init = 0; }
+
+
+
+/* Linear combination of two complex vectors, element by element:
+ * R[k] = c*S[k] + d*T[k] for k = 0 .. N-1; nothing is written when N <= 0. */
+void lmul_add_mul(_Complex double * const R, _Complex double * const S, _Complex double * const T,const _Complex double c, const _Complex double d, const int N) 
+{
+  for(int k = 0; k < N; ++k) {
+    R[k] = c * S[k] + d * T[k];
+  }
+}
+
+
+/* Scaled difference of two complex vectors, element by element:
+ * R[k] = c*S[k] - d*T[k] for k = 0 .. N-1; nothing is written when N <= 0. */
+void lmul_diff_mul(_Complex double * const R, _Complex double * const S, _Complex double * const T,const _Complex double c, const _Complex double d, const int N) 
+{
+  for(int k = 0; k < N; ++k) {
+    R[k] = c * S[k] - d * T[k];
+  }
+}
+
diff --git a/solver/mcr4complex.h b/solver/mcr4complex.h
new file mode 100644
index 000000000..3db9ebf27
--- /dev/null
+++ b/solver/mcr4complex.h
@@ -0,0 +1,50 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _MCR4COMPLEX_H
+#define _MCR4COMPLEX_H
+
+#include"solver/matrix_mult_typedef.h"
+#include"su3.h"
+
+void ldiff(_Complex double * Q, _Complex double * const R, _Complex double * const S, const int N); /* NOTE(review): mcr4complex forms its residual with ldiff(chi, Q, tmp, N), so presumably Q = R - S - confirm */
+void ladd(_Complex double * Q, _Complex double * const R, _Complex double * const S, const int N);
+double lsquare_norm(_Complex double * const Q, const int N, const int parallel); /* parallel: reduction switch, passed through unchanged by mcr4complex */
+_Complex double lscalar_prod(_Complex double * const R, _Complex double * const S, const int N, const int parallel);
+void lmul_r(_Complex double * const R, const double c, _Complex double * const S, const int N);
+void lmul(_Complex double * const R, const _Complex double c, _Complex double * const S, const int N);
+void lassign_diff_mul(_Complex double * const R, _Complex double * const S, const _Complex double c, const int N); /* presumably R -= c*S (used to update the residual) - confirm */
+void lassign_add_mul(_Complex double * const R, _Complex double * const S, const _Complex double c, const int N); /* presumably R += c*S (used to update the solution) - confirm */
+void ldiff_assign(_Complex double * const Q, _Complex double * const S, 
+		    const int N);
+void ladd_assign(_Complex double * const Q, _Complex double * const S, 
+		    const int N);
+void lmul_add_mul(_Complex double * const R, _Complex double * const S, _Complex double * const T, 
+                    const _Complex double c, const _Complex double d, const int N); /* R[i] = c*S[i] + d*T[i] for i < N */
+void lmul_diff_mul(_Complex double * const R, _Complex double * const S, _Complex double * const T, 
+                    const _Complex double c, const _Complex double d, const int N); /* R[i] = c*S[i] - d*T[i] for i < N */
+/* Restarted CR-type solver for f(P) = Q: returns the number of inner steps on convergence, -1 otherwise. */
+int mcr4complex(_Complex double * const P, _Complex double * const Q, 
+		const int m, const int max_restarts,
+		const double eps_sq, const int rel_prec,
+		const int N, const int parallel,
+		const int lda, c_matrix_mult f);
+
+
+#endif
diff --git a/solver/mixed_cg_her.c b/solver/mixed_cg_her.c
index 556b08a58..7377fc342 100644
--- a/solver/mixed_cg_her.c
+++ b/solver/mixed_cg_her.c
@@ -1,8 +1,5 @@
 /***********************************************************************
- *
- * Copyright (C) 2001 Martin Hasenbusch
- *               2003 Thomas Chiarappa
- *               2002,2003,2004,2005 Carsten Urbach
+ * Copyright (C) 2013 Florian Burger
  *
  * This file is part of tmLQCD.
  *
@@ -43,7 +40,7 @@
  **************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -52,103 +49,154 @@
 #include "su3.h"
 #include "linalg_eo.h"
 #include "start.h"
+#include "operator/tm_operators_32.h"
 #include "solver/matrix_mult_typedef.h"
+#include "solver/solver_params.h"
+#include "read_input.h"
+
 #include "solver_field.h"
 #include "solver/mixed_cg_her.h"
+#include "gettime.h"
+
+
+
 
 /* P output = solution , Q input = source */
-int mixed_cg_her(spinor * const P, spinor * const Q, const int max_iter, 
-		 double eps_sq, const int rel_prec, const int N, matrix_mult f) {
+int mixed_cg_her(spinor * const P, spinor * const Q, solver_params_t solver_params, /* solver_params is not referenced in this body */
+                 const int max_iter, double eps_sq, const int rel_prec, const int N, /* returns iter+i on convergence, -1 otherwise */
+                 matrix_mult f, matrix_mult32 f32) { /* f is applied to spinor fields, f32 to spinor32 fields */
 
   int i = 0, iter = 0, j = 0;
-  double sqnrm = 0., sqnrm2, squarenorm;
-  double pro, err, alpha_cg, beta_cg;
-  spinor *x, *delta, *y;
+  float sqnrm = 0., sqnrm2, squarenorm;
+  float pro, err, alpha_cg, beta_cg;
+  double sourcesquarenorm, sqnrm_d, squarenorm_d;
+  spinor *delta, *y, *xhigh;
+  spinor32 *x, *stmp;
   spinor ** solver_field = NULL;
-  const int nr_sf = 6;
-
+  spinor32 ** solver_field32 = NULL;  
+  const int nr_sf = 3;
+  const int nr_sf32 = 4;
+  /* the iteration budget is split into inner solves of at most max_inner_it steps each */
+  int max_inner_it = mixcg_maxinnersolverit;
+  int N_outer = max_iter/max_inner_it;
+  //to be on the safe side we allow at least 10 outer iterations
+  if(N_outer < 10) N_outer = 10;
+  
+  int save_sloppy = g_sloppy_precision_flag;
+  double atime, etime, flops;
+  
   if(N == VOLUME) {
-    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);    
+    init_solver_field_32(&solver_field32, VOLUMEPLUSRAND, nr_sf32);
   }
   else {
     init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
+    init_solver_field_32(&solver_field32, VOLUMEPLUSRAND/2, nr_sf32);    
   }
-  squarenorm = square_norm(Q, N, 1);
-  sqnrm = squarenorm;
 
-  delta = solver_field[3];
-  x = solver_field[4];
-  y = solver_field[5];
+  squarenorm_d = square_norm(Q, N, 1);
+  sourcesquarenorm = squarenorm_d;
+  sqnrm_d = squarenorm_d;
+  /* delta: defect, y: f(P), xhigh: double copy of the inner result x (single precision) */
+  delta = solver_field[0];
+  y = solver_field[1];
+  xhigh = solver_field[2];
+  x = solver_field32[3];   
   assign(delta, Q, N);
-    
-  if(squarenorm > 1.e-7) { 
-    /* if a starting solution vector different from zero is chosen */
-    f(y, P);
-    diff(delta, Q, y, N);
-    sqnrm = square_norm(delta, N, 1);
-    if(((sqnrm <= eps_sq) && (rel_prec == 0)) || ((sqnrm <= eps_sq*squarenorm) && (rel_prec == 1))) {
-      finalize_solver(solver_field, nr_sf);
-      return(0);
-    }
-  }
+  
+  //set solution to zero (whatever P held on entry is discarded)
+  zero_spinor_field(P, N);
+  
+  atime = gettime();
+  for(i = 0; i < N_outer; i++) {
 
-  for(i = 0; i < 20; i++) {
-
-    g_sloppy_precision = 1;
     /* main CG loop in lower precision */
-    zero_spinor_field(x, N);
-    assign(solver_field[1], delta, N);
-    assign(solver_field[2], delta, N);
+    zero_spinor_field_32(x, N);
+    zero_spinor_field_32(solver_field32[0], N);   
+    assign_to_32(solver_field32[1], delta, N);
+    assign_to_32(solver_field32[2], delta, N);
+    /* the inner solve starts from the double-precision defect, rounded to float */
+    sqnrm = (float) sqnrm_d;
     sqnrm2 = sqnrm;
-    for(j = 0; j <= max_iter; j++) {
-      f(solver_field[0], solver_field[2]);
-      pro = scalar_prod_r(solver_field[2], solver_field[0], N, 1);
-      alpha_cg = sqnrm2 / pro;
-      assign_add_mul_r(x, solver_field[2], alpha_cg, N);
     
-      assign_mul_add_r(solver_field[0], -alpha_cg, solver_field[1], N);
-      err = square_norm(solver_field[0], N, 1);
-
-      if(g_proc_id == g_stdio_proc && g_debug_level > 1) {
-	printf("inner CG: %d res^2 %g\n", iter+j, err);
-	fflush(stdout);
+    /*inner CG loop */
+    for(j = 0; j <= max_inner_it; j++) {
+      
+      f32(solver_field32[0], solver_field32[2]); 
+      pro = scalar_prod_r_32(solver_field32[2], solver_field32[0], N, 1);
+      alpha_cg = sqnrm2 / pro;
+      
+      assign_add_mul_r_32(x, solver_field32[2], alpha_cg, N);
+      
+      assign_mul_add_r_32(solver_field32[0], -alpha_cg, solver_field32[1], N);      
+      
+      err = square_norm_32(solver_field32[0], N, 1);
+
+      if(g_proc_id == g_stdio_proc && g_debug_level > 2) {
+	      printf("inner CG: %d res^2 %g\n", iter+j, err);
+	      fflush(stdout);
       }
     
-      if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))){
-	break;
+      //leave when: inner tolerance reached, step budget used, or 1.3*err is already below the outer target
+      if((err <= mixcg_innereps*sqnrm)|| (j==max_inner_it) ||  ((1.3*err <= eps_sq) && (rel_prec == 0)) || ((1.3*err <= eps_sq*sourcesquarenorm) && (rel_prec == 1))) {
+	      break;
       }
       beta_cg = err / sqnrm2;
-      assign_mul_add_r(solver_field[2], beta_cg, solver_field[0], N);
-      assign(solver_field[1], solver_field[0], N);
+      assign_mul_add_r_32(solver_field32[2], beta_cg, solver_field32[0], N);
+      stmp = solver_field32[0]; /* swap the two field pointers instead of copying the field */
+      solver_field32[0] = solver_field32[1];
+      solver_field32[1] = stmp;
       sqnrm2 = err;
     }
-    /* end main CG loop */
+    /* end inner CG loop */
     iter += j;
-    g_sloppy_precision = 0;
-    add(P, P, x, N);
 
-    f(y, x);
-    diff(delta, delta, y, N);
-    sqnrm = square_norm(delta, N, 1);
-    if(g_debug_level > 0 && g_proc_id == g_stdio_proc) {
-      printf("mixed CG: true residue %d\t%g\t\n",iter, sqnrm); fflush( stdout);
+    /* we want to apply a true double matrix with f(y,P) -> set sloppy off here*/
+    g_sloppy_precision_flag = 0;
+    
+    /* calculate defect in double precision */
+    assign_to_64(xhigh, x, N);    
+    add(P, P, xhigh, N);
+    f(y, P);
+    diff(delta, Q, y, N);
+    sqnrm_d = square_norm(delta, N, 1);
+    if(g_debug_level > 2 && g_proc_id == 0) {
+      printf("mixed CG: last inner residue: %g\t\n", err);
+      printf("mixed CG: true residue %d %g\t\n",iter, sqnrm_d); fflush(stdout);
     }
-
-    if(((sqnrm <= eps_sq) && (rel_prec == 0)) || ((sqnrm <= eps_sq*squarenorm) && (rel_prec == 1))) {
+    
+    /* here we can reset it to its initial value*/
+    g_sloppy_precision_flag = save_sloppy;
+    
+    if(((sqnrm_d <= eps_sq) && (rel_prec == 0)) || ((sqnrm_d <= eps_sq*sourcesquarenorm) && (rel_prec == 1))) {
+      etime = gettime();     
+
+      if(g_debug_level > 0 && g_proc_id == 0) {
+      	if(N != VOLUME){
+      	  /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */
+      	  /* 2*1608.0 because the linalg is over VOLUME/2 */
+      	  flops = (2*(2*1608.0+2*3*4) + 2*3*4 + iter*(2.*(2*1608.0+2*3*4) + 10*3*4))*N/1.0e6f;
+      	  printf("# mixed CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iter, eps_sq, etime-atime); 
+      	  printf("# mixed CG: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", 
+      	      etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime));
+      	}
+      	else{
+      	  /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */
+      	  flops = (2*(1608.0+2*3*4) + 2*3*4 + iter*(2.*(1608.0+2*3*4) + 10*3*4))*N/1.0e6f;      
+      	  printf("# mixed CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iter, eps_sq, etime-atime); 
+      	  printf("# mixed CG: flopcount (for non-e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", 
+      	      etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime));      
+      	}
+      }      
+      
       finalize_solver(solver_field, nr_sf);
+      finalize_solver_32(solver_field32, nr_sf32); 
       return(iter+i);
     }
     iter++;
   }
   finalize_solver(solver_field, nr_sf);
+  finalize_solver_32(solver_field32, nr_sf32); 
   return(-1);
 }
 
-
-
-
-
-
-
-
-
diff --git a/solver/mixed_cg_her.h b/solver/mixed_cg_her.h
index fb1261eba..58e069031 100644
--- a/solver/mixed_cg_her.h
+++ b/solver/mixed_cg_her.h
@@ -19,10 +19,13 @@
 #ifndef _MIXED_CG_HER_H
 #define _MIXED_CG_HER_H
 
+#include"operator/tm_operators_32.h"
 #include"solver/matrix_mult_typedef.h"
+#include"solver/solver_params.h"
 #include"su3.h"
 
-int mixed_cg_her(spinor * const P, spinor * const Q, const int max_iter, 
-		 double eps_sq, const int rel_prec, const int N, matrix_mult f);
+int mixed_cg_her(spinor * const P, spinor * const Q, solver_params_t solver_params, /* P: solution (zeroed on entry), Q: source */
+                 const int max_iter, double eps_sq, const int rel_prec, const int N, /* rel_prec 0: absolute test, 1: relative to |Q|^2 */
+                 matrix_mult f, matrix_mult32 f32); /* returns iter+i on convergence, -1 otherwise (see mixed_cg_her.c) */
 
 #endif
diff --git a/solver/mixed_cg_mms_tm_nd.c b/solver/mixed_cg_mms_tm_nd.c
new file mode 100644
index 000000000..6ce1a13e9
--- /dev/null
+++ b/solver/mixed_cg_mms_tm_nd.c
@@ -0,0 +1,564 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Florian Burger
+ * partially based on cg_mms_tm_nd.c by Andrea Shindler and Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ * 
+ * Author: 2015 Florian Burger
+ * 
+ * This is a Multi-Shift reliable update single/double mixed CG solver
+ * it expects that the shifts fulfil
+ *
+ * shift[0] < shift[1] < shift[2] < ... < shift[no_shifts-1]
+ *
+ * in modulus. The code will use shift[i]^2, which are all >0
+ *
+ * parameters:
+ * shifts are given to the solver in solver_params->shifts
+ * number of shifts is in solver_params->no_shifts
+ * the operator to invert in solver_params->M_ndpsi
+ * the 32 bit operator to invert in solver_params->M_ndpsi32
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "gamma.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "gettime.h"
+#include "solver/solver.h"
+#include "solver_field.h"
+#include "cg_mms_tm_nd.h"
+#include "mixed_cg_mms_tm_nd.h"
+
+
+static spinor32 * x_up_qmms;   /* raw allocation behind mms_x_up[] */
+static spinor32 ** mms_x_up;   /* aligned single-precision solution accumulators (up), index = shift - 1 */
+static spinor32 * x_dn_qmms;   /* raw allocation behind mms_x_dn[] */
+static spinor32 ** mms_x_dn;   /* same as mms_x_up for the down component */
+
+static spinor32 * d_up_qmms;   /* raw allocation behind mms_d_up[] */
+static spinor32 ** mms_d_up;   /* aligned single-precision search directions (up), index = shift - 1 */
+static spinor32 * d_dn_qmms;   /* raw allocation behind mms_d_dn[] */
+static spinor32 ** mms_d_dn;   /* same as mms_d_up for the down component */
+
+/* allocation and release of the pools declared above (defined after the solver) */
+static void init_mms_tm_nd_32(const unsigned int nr, const unsigned int N);
+static void free_mms_tm_nd_32();
+
+int mixed_cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, /* out: zeroed, then filled with one solution per shift */
+		 spinor * const Qup, spinor * const Qdn, /* in: source fields */
+		 solver_params_t * solver_params) { /* in: shifts, no_shifts, precision, max_iter, sdim, M_ndpsi, M_ndpsi32 */
+  // Mixed-precision multi-shift CG. Returns the loop index at exit, -1 once max_iter is used up, or the result of the double-precision fallback.
+  double eps_sq = solver_params->squared_solver_prec;
+  int noshifts = solver_params->no_shifts;
+  int rel_prec = solver_params->rel_prec;
+  int max_iter = solver_params->max_iter;
+  int check_abs, check_rel;
+  double * shifts = solver_params->shifts;
+  int Nshift = noshifts;
+  // Nshift keeps the initial count; noshifts is decremented when a shift is retired in the loop
+  // algorithm
+  double rr_up, rr_dn, rr, rr_old, r0r0, dAd_up, dAd_dn, dAd;  
+  
+  if(rel_prec){
+    check_rel = 1;
+    check_abs = 0;
+   }
+   else{
+    check_rel = 0;
+    check_abs = 1;     
+  }
+  
+  int use_eo=1, eofactor=2; /* use_eo is assigned but not read in this function */
+  //not even-odd?
+  if(solver_params->sdim == VOLUME) {
+    eofactor = 1;
+    use_eo = 0;
+  }
+  
+  int N = VOLUME/eofactor;
+  int Vol = VOLUMEPLUSRAND/eofactor;
+  // N sites enter the linear algebra; fields are allocated with Vol sites
+  
+  // norm of source
+  rr_up = square_norm(Qup, N, 1);
+  rr_dn = square_norm(Qdn, N, 1);
+  rr    = rr_up + rr_dn;  
+  // a source with squared norm below 1.0e-4 is handed to the double-precision solver
+  if( (g_cart_id == 0 && g_debug_level > 2)) printf("# CGMMSND_mixed: Initial mms residue: %.6e\n", rr);
+  if(rr < 1.0e-4){
+    if( (g_cart_id == 0 && g_debug_level > 2)) printf("# CGMMSND_mixed: norm of source too low: falling back to double mms solver %.6e\n", rr);
+    return(cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params));
+  }
+  
+  r0r0   = rr;	// for relative precision 
+  rr_old = rr;	// for the first iteration
+  
+  
+  
+  //allocate auxiliary solver fields 
+  spinor ** sf = NULL;
+  const int nr_sf = 6;
+  init_solver_field(&sf, Vol, nr_sf);  
+   
+  spinor32 ** sf32 = NULL;
+  const int nr_sf32 = 8;
+  init_solver_field_32(&sf32, Vol, nr_sf32);  
+  
+  
+  //spinor fields  
+  //we need one less than shifts, since one field is taken care of by the usual cg fields
+  init_mms_tm_nd_32(noshifts-1, Vol);
+   
+  // Pup/dn  can be used as auxiliary field to work on, as it is not later used (could be used as initial guess at the very start)
+  // Q_up/dn  can be used as feedback, or if not, also as auxiliary field
+  
+
+  
+  //allocate cg constants
+  double * sigma;
+  double * zitam1, * zita;
+  double * alphas, * betas;
+  double gamma;
+  double alpham1;
+    sigma = (double*)calloc((noshifts), sizeof(double));
+    zitam1 = (double*)calloc((noshifts), sizeof(double));
+    zita = (double*)calloc((noshifts), sizeof(double));
+    alphas = (double*)calloc((noshifts), sizeof(double));
+    betas = (double*)calloc((noshifts), sizeof(double));
+
+
+
+  spinor32 *  r_up, *  r_dn, * Ad_up, * Ad_dn, *  x_up, *  x_dn, *  d_up, *  d_dn;		
+  spinor * r_up_d, * r_dn_d, * x_up_d, * x_dn_d, * Ax_up_d, * Ax_dn_d;
+  
+ // iteration counter
+ int j; 
+ 
+ //reliable update flag
+ int rel_update = 0;
+ //no of reliable updates done
+ int no_rel_update = 0;
+ //use reliable update flag
+ int use_reliable = 1;
+ 
+ double rel_delta = 1.0e-10; /* a shift requests a reliable update once res < rel_delta*res0 */
+ int trigger_shift = -1;
+ double * res;
+ double * res0;
+ double * maxres;
+ res = (double*)calloc((noshifts), sizeof(double));
+ res0 = (double*)calloc((noshifts), sizeof(double));
+ maxres = (double*)calloc((noshifts), sizeof(double)); 
+    
+  /////////////////
+  // ASSIGNMENTS //
+  /////////////////
+  
+  x_up  = sf32[0];	
+  x_dn  = sf32[1];	
+  r_up  = sf32[2];	
+  r_dn  = sf32[3];
+  d_up  = sf32[4];
+  d_dn  = sf32[5];
+  Ad_up = sf32[6];
+  Ad_dn = sf32[7];
+
+
+  x_up_d = sf[0];
+  x_dn_d = sf[1];
+  r_up_d = sf[2];
+  r_dn_d = sf[3];
+  Ax_up_d = sf[4];
+  Ax_dn_d = sf[5];  
+  
+  /*
+  //matrix test
+   spinor32 * help_low_up = sf32[0];
+   spinor32 * help_low_dn = sf32[1];   
+   spinor * help_high_up = sf[0];
+   spinor * help_high_dn = sf[1];   
+   assign_to_32(help_low_up, Qup, N);
+   assign_to_32(help_low_dn, Qdn, N);   
+   assign(help_high_up, Qup, N);
+   assign(help_high_dn, Qdn, N);   
+   double sqn_high = square_norm(help_high_up,N,1) +
+                     square_norm(help_high_dn,N,1);
+   printf("square_norm(Q_high) = %e\n", sqn_high);
+   float sqn_low  = square_norm_32(help_low_up,N,1) +
+                    square_norm_32(help_low_dn,N,1);   
+   printf("square_norm(Q_low) = %e\n", sqn_low);  
+   
+   solver_params->M_ndpsi32(sf32[2], sf32[3], help_low_up, help_low_dn);
+   solver_params->M_ndpsi(sf[2], sf[3], help_high_up, help_high_dn);
+   
+   assign_to_64(sf[4], sf32[2], N);
+   assign_to_64(sf[5], sf32[3], N);   
+   diff(sf[0], sf[4], sf[2], N);
+   diff(sf[1], sf[5], sf[3], N);   
+   double sqnrm = square_norm(sf[0], N, 1) +
+                  square_norm(sf[1], N, 1);
+   printf("Operator 32 test: (square_norm) / (spinor component) = %.8e\n", sqnrm/24.0/N);
+   exit(1);  
+  */
+  
+  // r(0) = b
+  assign_to_32(r_up, Qup, N);
+  assign_to_32(r_dn, Qdn, N); 
+  
+  // d(0) = b
+  assign_to_32(d_up, Qup, N);
+  assign_to_32(d_dn, Qdn, N); 
+  
+  // shift 0 is carried by the plain CG fields; sigma[0] is added to the operator explicitly in the loop
+  
+  maxres[0] = rr;
+  res[0] = rr;
+  res0[0] = rr;
+  alphas[0] = 1.0;
+  betas[0] = 0.0;
+  sigma[0] = shifts[0]*shifts[0];
+  if(g_cart_id == 0 && g_debug_level > 2) printf("# CGMMSND_mixed: shift %d is %e\n", 0, sigma[0]);
+  // higher shifts are stored relative to shift 0: sigma[im] = shifts[im]^2 - shifts[0]^2
+  // currently only implemented for P=0 
+  for(int im = 1; im < noshifts; im++) {
+    maxres[im] = rr;
+    res[im] = rr;
+    res0[im] = rr;    
+    sigma[im] = shifts[im]*shifts[im] - sigma[0];
+    if(g_cart_id == 0 && g_debug_level > 2) printf("# CGMMSND_mixed: shift %d is %e\n", im, sigma[im]);
+    // these will be the result spinor fields
+    zero_spinor_field_32(mms_x_up[im-1], N);
+    zero_spinor_field_32(mms_x_dn[im-1], N);    
+
+    assign_to_32(mms_d_up[im-1], Qup, N);
+    assign_to_32(mms_d_dn[im-1], Qdn, N);
+    zitam1[im] = 1.0;
+    zita[im] = 1.0;
+    alphas[im] = 1.0;
+    betas[im] = 0.0;
+  }
+
+  //zero fields for solution Pup, Pdn
+  for(int im = 0; im < noshifts; im++){
+    zero_spinor_field(Pup[im], N);
+    zero_spinor_field(Pdn[im], N);    
+  }
+  
+  
+  //////////
+  // LOOP //
+  //////////
+    
+  for (j = 0; j < max_iter; j++) {   
+      // A*d(k)
+    solver_params->M_ndpsi32(Ad_up, Ad_dn, d_up,  d_dn);     
+    //add zero'th shift
+    assign_add_mul_r_32(Ad_up, d_up, (float) sigma[0], N);
+    assign_add_mul_r_32(Ad_dn, d_dn, (float) sigma[0], N);
+	     
+    
+    // alpha = r(k)*r(k) / d(k)*A*d(k)
+    dAd_up = scalar_prod_r_32(d_up, Ad_up, N, 1);
+    dAd_dn = scalar_prod_r_32(d_dn, Ad_dn, N, 1);
+
+    dAd    = dAd_up + dAd_dn; 
+    alpham1 = alphas[0];
+    alphas[0]  = rr_old / dAd;	// rr_old is taken from the last iteration respectively
+    
+   
+    // r(k+1)
+    assign_add_mul_r_32(r_up, Ad_up, (float) -alphas[0],N);
+    assign_add_mul_r_32(r_dn, Ad_dn, (float) -alphas[0],N);
+
+    // r(k+1)*r(k+1)
+    rr_up  = square_norm_32(r_up, N, 1);
+    rr_dn  = square_norm_32(r_dn, N, 1);
+    rr     = rr_up + rr_dn;
+    
+      
+
+    if((g_cart_id == 0) && (g_debug_level > 2)) printf("# CGMMSND_mixed: mms iteration j = %i: rr = %.6e\n", j, rr);
+
+		 
+    // NOTE(review): this test runs before x is advanced with the current alpha (done further down), so on break the last step is not added to x - confirm intended
+    // aborting ?? // check whether precision is reached ...
+    if ( ((check_abs)&&(rr <= eps_sq)) || ((check_rel)&&(rr <= eps_sq*r0r0)) ) 
+    {
+	if ((check_rel)&&(rr <= eps_sq*r0r0)) {
+	  if((g_cart_id == 0) && (g_debug_level > 3)) printf("# CGMMSND_mixed: Reached relative solver precision of eps_rel = %.2e\n", eps_sq);
+	}
+      break;
+   }
+    
+    // update alphas and zitas  
+    // used later
+    for(int im = 1; im < noshifts; im++) {
+      gamma = zita[im]*alpham1/(alphas[0]*betas[0]*(1.-zita[im]/zitam1[im]) 
+				+ alpham1*(1.+sigma[im]*alphas[0]));
+      zitam1[im] = zita[im];
+      zita[im] = gamma;
+      alphas[im] = alphas[0]*zita[im]/zitam1[im];
+    }  
+    // NOTE(review): res[im] uses rr*zita[im] here but rr*zita[im]^2 after a reliable update below - confirm which is meant
+    //check for reliable update
+    res[0] = rr;
+    for(int im=1; im<noshifts; im++) res[im] = rr * zita[im]; 
+    // scan from the largest shift index down; trigger_shift remembers the first shift that requested the update
+    rel_update = 0;
+    for(int im = (noshifts-1); im >= 0; im--) {
+      if( res[im] > maxres[im] ) maxres[im] = res[im];
+      if( (res[im] < rel_delta*res0[im]) && (res0[im]<=maxres[im]) && (use_reliable) ) rel_update=1; 
+      if( rel_update && ( trigger_shift == -1) ) trigger_shift = im;
+    }     
+    
+    if(!rel_update)
+    {
+      // x_j(k+1) = x_j(k) + alpha_j*d_j(k) 
+      // alphas are set above
+      assign_add_mul_r_32(x_up, d_up, (float) alphas[0], N);   
+      assign_add_mul_r_32(x_dn, d_dn, (float) alphas[0], N);
+      
+      
+      for(int im = 1; im < noshifts; im++) {
+	assign_add_mul_r_32(mms_x_up[im-1], mms_d_up[im-1], (float) alphas[im],  N);   
+	assign_add_mul_r_32(mms_x_dn[im-1], mms_d_dn[im-1], (float) alphas[im],  N);  
+      }  
+   
+      // beta = r(k+1)*r(k+1) / r(k)*r(k)
+      betas[0] = rr / rr_old;
+      rr_old = rr;  // for next iteration
+      
+      // d_0(k+1) = r(k+1) + beta*d_0(k) 
+      assign_mul_add_r_32(d_up, (float) betas[0], r_up, N);  
+      assign_mul_add_r_32(d_dn, (float) betas[0], r_dn, N); 
+       
+      // d_j(k+1) = zita*r(k+1) + beta*d_j(k)
+      for(int im = 1; im < noshifts; im++) {
+	betas[im] = betas[0]*zita[im]*alphas[im]/(zitam1[im]*alphas[0]);
+	assign_mul_add_mul_r_32(mms_d_up[im-1], r_up, (float) betas[im], (float) zita[im], N);
+	assign_mul_add_mul_r_32(mms_d_dn[im-1], r_dn, (float) betas[im], (float) zita[im], N);
+      }   
+    }
+    else{
+      //reliable update
+      if( (g_cart_id == 0) && (g_debug_level > 3) ){
+	printf("# CGMMSND_mixed: Shift %d with offset squared %e triggered a reliable update\n", trigger_shift, sigma[trigger_shift]);
+      }
+      //add low prec solutions  
+      assign_add_mul_r_32(x_up, d_up, (float) alphas[0], N); 
+      assign_add_mul_r_32(x_dn, d_dn, (float) alphas[0], N); 
+      
+      addto_32(Pup[0], x_up, N);
+      addto_32(Pdn[0], x_dn, N);	    
+      for(int im = 1; im < noshifts; im++) {  
+	assign_add_mul_r_32(mms_x_up[im-1], mms_d_up[im-1], alphas[im], N);
+	assign_add_mul_r_32(mms_x_dn[im-1], mms_d_dn[im-1], alphas[im], N);	
+	addto_32(Pup[im], mms_x_up[im-1], N);
+        addto_32(Pdn[im], mms_x_dn[im-1], N);	
+      }
+      
+      //add low precision for shift 0 only
+      addto_32(x_up_d, x_up, N); 
+      addto_32(x_dn_d, x_dn, N);      
+ 
+      // true residual of shift 0 in double precision: r = Q - (M_ndpsi + sigma[0]) x
+      solver_params->M_ndpsi(Ax_up_d, Ax_dn_d, x_up_d,  x_dn_d);
+      //add zero'th shift
+      assign_add_mul_r(Ax_up_d, x_up_d, sigma[0], N);
+      assign_add_mul_r(Ax_dn_d, x_dn_d, sigma[0], N);
+      
+      diff(r_up_d, Qup, Ax_up_d, N);         
+      diff(r_dn_d, Qdn, Ax_dn_d, N); 
+ 
+      rr_up = square_norm(r_up_d, N, 1);
+      rr_dn = square_norm(r_dn_d, N, 1);
+      rr    = rr_up + rr_dn;
+      if ((g_cart_id == 0) && (g_debug_level > 3) ) printf("# CGMMSND_mixed: New residue after reliable update: %.6e\n", rr);
+       
+      //update res[im]
+      res[0] = rr;
+
+      // a grown residual is only reported, except for shift 0 where the iteration is abandoned
+      if(res[trigger_shift] > res0[trigger_shift]){
+	if(g_cart_id == 0) printf("# CGMMSND_mixed: Warning: residue of shift no %d got larger after rel. update\n", trigger_shift);
+	//if this is the zero'th shift not getting better -> no further convergence, break
+	if(trigger_shift == 0) break;
+      }    
+      
+      //zero float fields
+      zero_spinor_field_32(x_up, N);
+      zero_spinor_field_32(x_dn, N);        
+      for(int im = 1; im < noshifts; im++) {
+	zero_spinor_field_32(mms_x_up[im-1], N);
+	zero_spinor_field_32(mms_x_dn[im-1], N);  
+      }
+      
+      //update the source
+      assign_to_32(r_up, r_up_d, N);
+      assign_to_32(r_dn, r_dn_d, N); 
+      
+      // CG continues from the recomputed residual
+      
+      betas[0] = res[0]/rr_old;
+      rr_old = rr;
+      // d_0(k+1) = r(k+1) + beta*d_0(k)
+      assign_mul_add_r_32(d_up, betas[0], r_up, N);
+      assign_mul_add_r_32(d_dn, betas[0], r_dn, N);      
+      // d_j(k+1) = r(k+1) + beta*d_j(k)
+      for(int im = 1; im < noshifts; im++) {
+	betas[im] = betas[0]*zita[im]*alphas[im]/(zitam1[im]*alphas[0]);
+        assign_mul_add_mul_r_32(mms_d_up[im-1], r_up, (float) betas[im], (float) zita[im], N);
+	assign_mul_add_mul_r_32(mms_d_dn[im-1], r_dn, (float) betas[im], (float) zita[im], N);
+      } 
+      // NOTE(review): zita[0] is never assigned after calloc, so a trigger by shift 0 stores 0 in res/res0/maxres[0] below - confirm
+      //new maxres for the shift that initiated the reliable update
+      res[trigger_shift] = res[0]*zita[trigger_shift]*zita[trigger_shift];
+      res0[trigger_shift] = res[trigger_shift];  
+      maxres[trigger_shift] = res[trigger_shift];
+      trigger_shift = -1;
+      no_rel_update ++;
+    }	//reliable update	
+    
+    //check if some shift is converged (only the last active shift is tested, every 10th iteration)
+    for(int im = 1; im < noshifts; im++) {    
+      if(j > 0 && (j % 10 == 0) && (im == noshifts-1)) {
+	double sn = square_norm_32(mms_d_up[im-1], N, 1);
+	sn +=       square_norm_32(mms_d_dn[im-1], N, 1);
+	if(alphas[noshifts-1]*alphas[noshifts-1]*sn <= eps_sq) {
+	  noshifts--;
+	  if( (g_debug_level > 1) && (g_cart_id == 0) ) {
+	    printf("# CGMMSND_mixed: at iteration %d removed one shift, %d remaining\n", j, noshifts);
+	  }
+	  //if removed we add the latest solution vector for this shift 	  
+	  addto_32(Pup[im], mms_x_up[im-1], N);
+          addto_32(Pdn[im], mms_x_dn[im-1], N);
+	}
+      }
+    }
+       
+  }//LOOP
+  
+  if( (g_cart_id == 0) && (g_debug_level > 1) ) printf("Final mms residue: %.6e\n", rr);
+
+  //add the latest solutions 
+  for(int im = 0; im < noshifts; im++) {  
+    if(im == 0){   
+      addto_32(Pup[0], x_up, N);
+      addto_32(Pdn[0], x_dn, N);        
+    }
+    else{     
+      addto_32(Pup[im], mms_x_up[im-1], N);
+      addto_32(Pdn[im], mms_x_dn[im-1], N);      
+    }
+  } 
+  // optional self-check: recompute the residual of every original shift in double precision
+  if(g_debug_level > 4){
+    if(g_cart_id == 0) printf("# CGMMSND_mixed: Checking mms result:\n");
+    //loop over all shifts (-> Nshift) 
+    for(int im = 0; im < Nshift; im++){
+      solver_params->M_ndpsi(sf[0], sf[1], Pup[im], Pdn[im]);
+      assign_add_mul_r(sf[0], Pup[im] , shifts[im]*shifts[im], N);
+      assign_add_mul_r(sf[1], Pdn[im] , shifts[im]*shifts[im], N);
+      diff(sf[2], sf[0], Qup, N);
+      diff(sf[3], sf[1], Qdn, N);
+      rr_up = square_norm(sf[2], N, 1);
+      rr_dn = square_norm(sf[3], N, 1);      
+      rr = rr_up + rr_dn;
+      if(g_cart_id == 0) printf("# CGMMSND_mixed: Shift[%d] squared residue: %e\n", im, rr);
+    }
+  }
+  
+ 
+  finalize_solver(sf, nr_sf);  
+  finalize_solver_32(sf32, nr_sf32); 
+ 
+  //free cg constants
+  free(sigma); free(zitam1); free(zita); free(alphas); free(betas);    
+  
+  //free reliable update stuff
+  free(res); free(res0); free(maxres);
+
+
+  //if max_iter was used up -> return(-1); every break (also the shift-0 abort above) returns j
+  if(j<max_iter){
+    return(j);
+  }
+  else{
+    return(-1);
+  }
+}//
+
+
+
+
+
+
+
+
+
+
+static unsigned int ini_mms_nd = 0; /* 1 once init_mms_tm_nd_32 has allocated the pools, reset by free_mms_tm_nd_32 */
+static unsigned int nr_nd = 0;      /* number of fields per pool requested at the last allocation */
+
+/* Provide at least _nr aligned single-precision fields of N spinors in each of the four static
+ * pools. The pools are rebuilt when more fields or longer fields than currently held are
+ * requested; reusing pools sized for a smaller N would let the solver write past their end. */
+static void init_mms_tm_nd_32(const unsigned int _nr, const unsigned int N) {
+  static unsigned int N_alloc = 0;
+  if(ini_mms_nd != 0 && _nr <= nr_nd && N <= N_alloc) return;
+  /* keyed on the init flag, not on nr_nd: pools created for _nr == 0 must be released too */
+  if(ini_mms_nd != 0) free_mms_tm_nd_32();
+  nr_nd = _nr;
+  N_alloc = N;
+  x_up_qmms = (spinor32*)calloc(N*(nr_nd)+1,sizeof(spinor32));
+  x_dn_qmms = (spinor32*)calloc(N*(nr_nd)+1,sizeof(spinor32));
+  d_up_qmms = (spinor32*)calloc(N*(nr_nd)+1,sizeof(spinor32));
+  d_dn_qmms = (spinor32*)calloc(N*(nr_nd)+1,sizeof(spinor32));
+  mms_x_up = (spinor32**)calloc((nr_nd)+1,sizeof(spinor32*));
+  mms_x_dn = (spinor32**)calloc((nr_nd)+1,sizeof(spinor32*));
+  mms_d_up = (spinor32**)calloc((nr_nd)+1,sizeof(spinor32*));
+  mms_d_dn = (spinor32**)calloc((nr_nd)+1,sizeof(spinor32*));
+  for(unsigned int i = 0; i < nr_nd; i++) { /* field i starts i*N spinors after the aligned pool base */
+    mms_x_up[i]=(spinor32*)(((unsigned long int)(x_up_qmms)+ALIGN_BASE32)&~ALIGN_BASE32) + i*N;
+    mms_x_dn[i]=(spinor32*)(((unsigned long int)(x_dn_qmms)+ALIGN_BASE32)&~ALIGN_BASE32) + i*N;
+    mms_d_up[i]=(spinor32*)(((unsigned long int)(d_up_qmms)+ALIGN_BASE32)&~ALIGN_BASE32) + i*N;
+    mms_d_dn[i]=(spinor32*)(((unsigned long int)(d_dn_qmms)+ALIGN_BASE32)&~ALIGN_BASE32) + i*N;
+  }
+  ini_mms_nd = 1; }
+
+/* Free the four field pools and their pointer tables, clear the stale pointers and
+ * reset the bookkeeping so that the next init_mms_tm_nd_32 call allocates from scratch. */
+static void free_mms_tm_nd_32() {
+  free(x_up_qmms); x_up_qmms = NULL; free(mms_x_up); mms_x_up = NULL;
+  free(x_dn_qmms); x_dn_qmms = NULL; free(mms_x_dn); mms_x_dn = NULL;
+  free(d_up_qmms); d_up_qmms = NULL; free(mms_d_up); mms_d_up = NULL;
+  free(d_dn_qmms); d_dn_qmms = NULL; free(mms_d_dn); mms_d_dn = NULL;
+  nr_nd = 0;
+  ini_mms_nd = 0;
+}
+
+
+
+
diff --git a/solver/mixed_cg_mms_tm_nd.h b/solver/mixed_cg_mms_tm_nd.h
new file mode 100644
index 000000000..0240bd8f2
--- /dev/null
+++ b/solver/mixed_cg_mms_tm_nd.h
@@ -0,0 +1,33 @@
+/***********************************************************************
+ *
+ *
+ * Copyright (C) 2015 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifndef _MIXED_CG_MMS_TM_ND_H
+#define _MIXED_CG_MMS_TM_ND_H
+
+#include"su3.h"
+#include"solver.h"
+
+int mixed_cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, 
+		 spinor * const Qup, spinor * const Qdn, 
+		 solver_params_t * solver_params); /* presumably one Pup/Pdn output field per shift in solver_params -- confirm in the .c file */
+
+#endif
diff --git a/solver/monomial_solve.c b/solver/monomial_solve.c
new file mode 100644
index 000000000..8f29075b4
--- /dev/null
+++ b/solver/monomial_solve.c
@@ -0,0 +1,600 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2014 Florian Burger
+ *               2017 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  
+ * File: monomial_solve.c
+ *
+ * solver wrapper for monomials
+ *
+ * The externally accessible functions are
+ *
+ *
+ *   int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_params,
+ *                       const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult f, int solver_type)
+ *
+ *   int solve_mms_tm(spinor ** const P, spinor * const Q,
+ *                    solver_params_t * solver_params)  
+ *
+ *   int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn, 
+ *                    spinor * const Qup, spinor * const Qdn, 
+ *                    solver_params_t * solver_params)  
+ *
+ **************************************************************************/
+
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include "global.h"
+#include "start.h"
+#include "read_input.h"
+#include "default_input_values.h"
+#include "linalg/mul_gamma5.h"
+#include "linalg/diff.h"
+#include "linalg/square_norm.h"
+#include "linalg/mul_r_gamma5.h"
+#include "gamma.h"
+// for the non-degenerate operator normalisation
+#include "phmc.h"
+#include "solver/solver.h"
+#include "solver/solver_field.h"
+#include "solver/init_guess.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_types.h"
+#include "solver/solver_params.h"
+#include "operator/tm_operators.h"
+#include "operator/tm_operators_32.h"
+#include "operator/tm_operators_nd.h"
+#include "operator/tm_operators_nd_32.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clovertm_operators_32.h"
+#include "misc_types.h"
+#include "monomial_solve.h"
+#include "linalg_eo.h"
+#ifdef DDalphaAMG
+#include "DDalphaAMG_interface.h"
+#endif
+#ifdef TM_USE_QPHIX
+#include "qphix_interface.h"
+#endif
+#include "fatal_error.h"
+
+#include <io/params.h>
+#include <io/spinor.h>
+
+#ifdef HAVE_GPU
+#include"../GPU/cudadefs.h"
+extern  int linsolve_eo_gpu (spinor * const P, spinor * const Q, const int max_iter, 
+                            double eps, const int rel_prec, const int N, matrix_mult f);
+extern int dev_cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, 
+		 spinor * const Qup, spinor * const Qdn, 
+		 solver_params_t * solver_params);
+   #ifdef TEMPORALGAUGE
+     #include "../temporalgauge.h" 
+   #endif
+#include "read_input.h" 
+#endif
+
+int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_params,
+                     const int max_iter, double eps_sq, const int rel_prec, 
+                     const int N, matrix_mult f, int solver_type){
+  // Solves f*P = Q with the solver selected by solver_type and returns that solver's iteration count.
+  int iteration_count = 0;
+  // temporary field required by the QPhiX solve or by residual check (stays NULL when neither applies)
+  spinor** temp = NULL;
+  if(g_debug_level > 2 || solver_params.external_inverter == QPHIX_INVERTER){
+    init_solver_field(&temp, VOLUMEPLUSRAND/2, 1);
+  }
+  // solver_params is passed by value, so this only affects the solvers called from here
+  solver_params.use_initial_guess = 0;
+
+#ifdef TM_USE_QPHIX
+  if(solver_params.external_inverter == QPHIX_INVERTER){
+    // using CG for the HMC, we always want to have the solution of (Q Q^dagger) x = b, which is equivalent to
+    // gamma_5 (M M^dagger)^{-1} gamma_5 b
+    // FIXME: this needs to be adjusted to also support BICGSTAB
+    gamma5(temp[0], Q, VOLUME/2);
+    iteration_count = invert_eo_qphix_oneflavour(P, temp[0], max_iter, eps_sq, solver_type, 
+                                                 rel_prec, solver_params, solver_params.sloppy_precision, solver_params.compression_type);
+    mul_gamma5(P, VOLUME/2);
+  } else
+#endif
+  if(solver_type == MIXEDCG || solver_type == RGMIXEDCG){
+    // the default mixed solver is rg_mixed_cg_her
+    int (*msolver_fp)(spinor * const, spinor * const, solver_params_t, 
+                      const int, double, const int, const int, matrix_mult, matrix_mult32) = rg_mixed_cg_her;
+
+    // but it might be necessary at some point to use the old version
+    if(solver_type == MIXEDCG){
+      msolver_fp = mixed_cg_her;
+    }
+
+    // FIXME: this GPU stuff needs to go...
+    if(usegpu_flag){   
+      #ifdef HAVE_GPU     
+        #ifdef TEMPORALGAUGE
+          to_temporalgauge(g_gauge_field, Q , P);
+        #endif          
+        iteration_count = linsolve_eo_gpu(P, Q, max_iter, eps_sq, rel_prec, N, f);
+        #ifdef TEMPORALGAUGE
+          from_temporalgauge(Q, P);
+        #endif
+      #endif
+      // no early return here: fall through so that temp is released (and the residual checked) below
+    }
+    else{
+      if(f==Qtm_pm_psi){   
+        iteration_count = msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Qtm_pm_psi_32);
+      } else if(f==Q_pm_psi){     
+        iteration_count = msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Q_pm_psi_32);
+      } else if(f==Qsw_pm_psi){
+        copy_32_sw_fields();
+        iteration_count = msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Qsw_pm_psi_32);
+      } else {
+        if(g_proc_id==0) printf("Warning: 32 bit matrix not available. Falling back to CG in 64 bit\n"); 
+        iteration_count = cg_her(P, Q, max_iter, eps_sq, rel_prec, N, f); // the CG branch of this chain is unreachable from here
+      }
+    }
+  } 
+  else if(solver_type == CG){
+    iteration_count =  cg_her(P, Q, max_iter, eps_sq, rel_prec, N, f);
+  }
+  else if(solver_type == BICGSTAB){
+     iteration_count =  bicgstab_complex(P, Q, max_iter, eps_sq, rel_prec, N, f);     
+  }
+#ifdef DDalphaAMG 
+  else if (solver_type == MG)
+    iteration_count =  MG_solver(P, Q, eps_sq, max_iter,rel_prec, N , g_gauge_field, f);
+#endif     
+  else{
+    fatal_error("Error: solver not allowed for degenerate solve. Aborting...\n", "solve_degenerate");
+  }
+
+  if(g_debug_level > 2){
+    f(temp[0], P);
+    diff(temp[0], temp[0], Q, VOLUME/2);
+    double diffnorm = square_norm(temp[0], VOLUME/2, 1); 
+    if( g_proc_id == 0 ){
+      printf("# solve_degenerate residual check: %e\n", diffnorm);
+    }
+  }
+  if(g_debug_level > 2 || solver_params.external_inverter == QPHIX_INVERTER){
+    finalize_solver(temp, 1);
+  }
+
+  return(iteration_count);
+}
+
+int solve_mms_tm(spinor ** const P, spinor * const Q,
+                 solver_params_t * solver_params){ 
+  // Multi-shift solve: fills P[i] for every solver_params->shifts[i]; returns the iteration count, or -1 if an RGMIXEDCG solve fails.
+  int iteration_count = 0; 
+  solver_params->use_initial_guess = 0;
+
+  // temporary field required by the QPhiX solves (including the one inside the MG branch) or by residual check
+  spinor ** temp = NULL;
+  if(g_debug_level > 2 || solver_params->external_inverter == QPHIX_INVERTER){
+    init_solver_field(&temp, VOLUMEPLUSRAND/2, 1);
+  }
+
+#ifdef TM_USE_QPHIX
+  if( solver_params->external_inverter == QPHIX_INVERTER && solver_params->type != MG ){
+    gamma5(temp[0], Q, VOLUME/2);
+    iteration_count = invert_eo_qphix_oneflavour_mshift(P, temp[0],
+                                                        solver_params->max_iter, solver_params->squared_solver_prec,
+                                                        solver_params->type, solver_params->rel_prec,
+                                                        *solver_params,
+                                                        solver_params->sloppy_precision,
+                                                        solver_params->compression_type);
+    for( int shift = 0; shift < solver_params->no_shifts; shift++){
+      mul_gamma5(P[shift], VOLUME/2);
+    }
+  } else
+#endif // TM_USE_QPHIX
+  if (solver_params->type == CGMMS){
+    iteration_count = cg_mms_tm(P, Q, solver_params);
+  }
+#ifdef DDalphaAMG
+  else if (solver_params->type == MG) {
+    // if the mg_mms_mass is larger than the smallest shift we use MG
+    if (mg_no_shifts > 0 || mg_mms_mass >= solver_params->shifts[0]) { 
+      int nshifts = solver_params->no_shifts;
+      int mg_nshifts = mg_no_shifts > nshifts ? nshifts:mg_no_shifts;
+      // if the mg_mms_mass is smaller than the larger shifts, we use CGMMS for those
+      // in case mg_no_shifts is used, then mg_mms_mass = 0
+      if(mg_mms_mass >= solver_params->shifts[0]) {
+        mg_nshifts = solver_params->no_shifts;
+        while (mg_mms_mass < solver_params->shifts[mg_nshifts-1]) { mg_nshifts--; }
+      }
+      // Number of initial guesses provided by cgmms
+      // NOTE: tunable value; 1 is fine for now.
+      int  no_cgmms_init_guess = 1;
+      if(no_cgmms_init_guess > mg_nshifts) {
+        no_cgmms_init_guess = mg_nshifts;
+      }
+#ifdef TM_USE_QPHIX
+      if( solver_params->external_inverter == QPHIX_INVERTER && mg_nshifts < nshifts ) {
+        // TODO: no initial guess option with QphiX
+        no_cgmms_init_guess = 0;
+        spinor ** P_cg = P+(mg_nshifts - no_cgmms_init_guess); // first output field of the shifts left to the multi-shift CG
+        double * shifts_start = solver_params->shifts;
+        solver_params->no_shifts = nshifts - (mg_nshifts - no_cgmms_init_guess);
+        solver_params->shifts += (mg_nshifts - no_cgmms_init_guess);
+        solver_params->type = CGMMS;
+        gamma5(temp[0], Q, VOLUME/2);
+        iteration_count = invert_eo_qphix_oneflavour_mshift(P_cg, temp[0],
+                                                            solver_params->max_iter, solver_params->squared_solver_prec,
+                                                            solver_params->type, solver_params->rel_prec,
+                                                            *solver_params,
+                                                            solver_params->sloppy_precision,
+                                                            solver_params->compression_type);
+        for( int shift = 0; shift < solver_params->no_shifts; shift++) {
+          mul_gamma5(P_cg[shift], VOLUME/2); // the shifted window starts at P_cg, not at P
+        }
+        // Restoring solver_params
+        solver_params->no_shifts = nshifts;
+        solver_params->shifts = shifts_start;
+        solver_params->type = MG;
+      } else
+#endif // TM_USE_QPHIX  
+      if (mg_nshifts < nshifts) {
+        spinor ** P_cg = P+(mg_nshifts - no_cgmms_init_guess);
+        double * shifts_start = solver_params->shifts;
+        solver_params->no_shifts = nshifts - (mg_nshifts - no_cgmms_init_guess);
+        solver_params->shifts += (mg_nshifts - no_cgmms_init_guess);
+        solver_params->type = CGMMS;
+        // switching last shift. We run CGMMS for the shift we want to solve.
+        if (no_cgmms_init_guess > 0) {
+          SWAP(solver_params->shifts[0], solver_params->shifts[no_cgmms_init_guess]);
+          SWAP(P_cg[0], P_cg[no_cgmms_init_guess]);
+        }
+        iteration_count = solve_mms_tm( P_cg, Q, solver_params );
+        // Switching back last shift
+        if (no_cgmms_init_guess > 0) {
+          SWAP(solver_params->shifts[0], solver_params->shifts[no_cgmms_init_guess]);
+          SWAP(P_cg[0], P_cg[no_cgmms_init_guess]);
+        }
+        // Restoring solver_params
+        solver_params->no_shifts = nshifts;
+        solver_params->shifts = shifts_start;
+        solver_params->type = MG;
+      } else {
+        no_cgmms_init_guess = 0;
+      }
+
+      for(int i = mg_nshifts-1; i>=0; i--){
+        // preparing initial guess
+        if(i<mg_nshifts-no_cgmms_init_guess)
+          init_guess_mms(P, Q, i, solver_params);
+        g_mu3 = solver_params->shifts[i]; 
+        iteration_count += MG_solver( P[i], Q, solver_params->squared_solver_prec, solver_params->max_iter,
+                                         solver_params->rel_prec, solver_params->sdim, g_gauge_field, solver_params->M_psi );
+        g_mu3 = _default_g_mu3;
+      }
+    } else {
+      iteration_count = cg_mms_tm( P, Q, solver_params );
+    }
+  }
+#endif
+  else if (solver_params->type == RGMIXEDCG){
+    matrix_mult32 f32  = Qtm_pm_psi_32;
+    if( solver_params->M_psi == Qsw_pm_psi ){ 
+      f32  = Qsw_pm_psi_32;
+    }
+    iteration_count = 0;
+    // rg_mixed_cg_her takes its parameters by value: start from a copy of the caller's set so that no field
+    // is left indeterminate, then override mcg_delta and (in the loop) flag the prepared initial guess
+    solver_params_t temp_params = *solver_params;
+    temp_params.mcg_delta = _default_mixcg_innereps;
+    int iter_local = 0;
+    for(int i = solver_params->no_shifts-1; i>=0; i--){
+      // preparing initial guess
+      init_guess_mms(P, Q, i, solver_params); 
+      solver_params->use_initial_guess = 1;
+      temp_params.use_initial_guess = 1;
+      // inverting
+      g_mu3 = solver_params->shifts[i]; 
+      iter_local = rg_mixed_cg_her( P[i], Q, temp_params, solver_params->max_iter,
+                                    solver_params->squared_solver_prec, solver_params->rel_prec, solver_params->sdim,
+                                    solver_params->M_psi, f32);
+      g_mu3 = _default_g_mu3;
+      solver_params->use_initial_guess = 0;
+      if(iter_local == -1){
+        iteration_count = -1; // not converged: report failure, but leave through the common cleanup below
+        break;
+      }
+      iteration_count += iter_local;
+    }
+  } else {
+    fatal_error("Error: solver not allowed for TM mms solve. Aborting...\n", "solve_mms_tm");
+  }
+
+  if(g_debug_level > 2){
+    for( int shift = 0; shift < solver_params->no_shifts; shift++){
+      g_mu3 = solver_params->shifts[shift]; 
+      solver_params->M_psi(temp[0], P[shift]);
+      g_mu3 = _default_g_mu3;
+      diff(temp[0], temp[0], Q, VOLUME/2);
+      double diffnorm = square_norm(temp[0], VOLUME/2, 1); 
+      if( g_proc_id == 0 ){
+        printf("# solve_mms_tm residual check: shift %d, res. %e\n", shift, diffnorm);
+      }
+    }
+  }
+  if(g_debug_level > 2 || solver_params->external_inverter == QPHIX_INVERTER){
+    finalize_solver(temp, 1);
+  }
+
+  return(iteration_count);
+}
+
+int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn, 
+                 spinor * const Qup, spinor * const Qdn, 
+                 solver_params_t * solver_params){ 
+  // ND multi-shift solve: fills Pup[i]/Pdn[i] for every solver_params->shifts[i]; returns the iteration count, or -1 if an RGMIXEDCG solve fails.
+  int iteration_count = 0; 
+  solver_params->use_initial_guess = 0;
+  // temporary fields required by the QPhiX solves (including the one inside the MG branch) or by residual check
+  spinor ** temp = NULL;
+  if(g_debug_level > 2 || solver_params->external_inverter == QPHIX_INVERTER){
+    init_solver_field(&temp, VOLUMEPLUSRAND/2, 2);
+  }
+
+#ifdef TM_USE_QPHIX
+  if(solver_params->external_inverter == QPHIX_INVERTER && solver_params->type != MG){
+    //  gamma5 (M.M^dagger)^{-1} gamma5 = [ Q(+mu,eps) Q(-mu,eps) ]^{-1}
+    gamma5(temp[0], Qup, VOLUME/2);
+    gamma5(temp[1], Qdn, VOLUME/2);
+    iteration_count = invert_eo_qphix_twoflavour_mshift(Pup, Pdn, temp[0], temp[1],
+                                                        solver_params->max_iter, solver_params->squared_solver_prec,
+                                                        solver_params->type, solver_params->rel_prec,
+                                                        *solver_params,
+                                                        solver_params->sloppy_precision,
+                                                        solver_params->compression_type);
+    
+    // the tmLQCD ND operator used for HMC is normalised by the inverse of the maximum eigenvalue
+    // so the inverse of Q^2 is normalised by the square of the maximum eigenvalue
+    // or, equivalently, by the square of the inverse of phmc_invmaxev
+    // note that in the QPhiX interface, we also correctly normalise the shifts
+    const double maxev_sq = (1.0/phmc_invmaxev)*(1.0/phmc_invmaxev);
+    for( int shift = 0; shift < solver_params->no_shifts; shift++){
+      mul_r_gamma5(Pup[shift], maxev_sq, VOLUME/2);
+      mul_r_gamma5(Pdn[shift], maxev_sq, VOLUME/2);
+    }
+  } else
+#endif //TM_USE_QPHIX
+  if(solver_params->type==MIXEDCGMMSND){
+    if(usegpu_flag){
+    #ifdef HAVE_GPU      
+      #ifdef TEMPORALGAUGE
+      to_temporalgauge_mms(g_gauge_field , Qup, Qdn, Pup, Pdn, solver_params->no_shifts);
+      #endif        
+      iteration_count = dev_cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);  
+      #ifdef TEMPORALGAUGE
+      from_temporalgauge_mms(Qup, Qdn, Pup, Pdn, solver_params->no_shifts);
+      #endif 
+    #endif
+    } else {
+      iteration_count = mixed_cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);
+    }
+  } else if (solver_params->type == CGMMSND){
+    iteration_count = cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);
+  }
+#ifdef DDalphaAMG
+  else if (solver_params->type == MG) {
+    // if the mg_mms_mass is larger than the smallest shift we use MG
+    if (mg_no_shifts > 0 || mg_mms_mass >= solver_params->shifts[0]) { 
+
+      int nshifts = solver_params->no_shifts;
+      int mg_nshifts = mg_no_shifts > nshifts ? nshifts:mg_no_shifts;
+      // if the mg_mms_mass is smaller than the larger shifts, we use CGMMS for those
+      // in case mg_no_shifts is used, then mg_mms_mass = 0
+      if(mg_mms_mass >= solver_params->shifts[0]) {
+        mg_nshifts = nshifts;
+        while (mg_mms_mass < solver_params->shifts[mg_nshifts-1]) { mg_nshifts--; }
+      }
+      // Number of initial guesses provided by cgmms
+      // NOTE: tunable value; 2 is fine for now.
+      int no_cgmms_init_guess = 2;
+      if(no_cgmms_init_guess > mg_nshifts) {
+        no_cgmms_init_guess = mg_nshifts;
+      }
+#ifdef TM_USE_QPHIX
+      if(solver_params->external_inverter == QPHIX_INVERTER && mg_nshifts < nshifts){
+        // TODO: no initial guess option with QphiX
+        no_cgmms_init_guess = 0;
+        spinor ** Pup_cg = Pup+(mg_nshifts - no_cgmms_init_guess);
+        spinor ** Pdn_cg = Pdn+(mg_nshifts - no_cgmms_init_guess);
+        double * shifts_start = solver_params->shifts;
+        solver_params->no_shifts = nshifts - (mg_nshifts - no_cgmms_init_guess);
+        solver_params->shifts += (mg_nshifts - no_cgmms_init_guess);
+        solver_params-> type = CGMMSND;
+        //  gamma5 (M.M^dagger)^{-1} gamma5 = [ Q(+mu,eps) Q(-mu,eps) ]^{-1}
+        gamma5(temp[0], Qup, VOLUME/2);
+        gamma5(temp[1], Qdn, VOLUME/2);
+        iteration_count = invert_eo_qphix_twoflavour_mshift(Pup_cg, Pdn_cg, temp[0], temp[1],
+                                                            solver_params->max_iter, solver_params->squared_solver_prec,
+                                                            solver_params->type, solver_params->rel_prec,
+                                                            *solver_params,
+                                                            solver_params->sloppy_precision,
+                                                            solver_params->compression_type);
+    
+        // the tmLQCD ND operator used for HMC is normalised by the inverse of the maximum eigenvalue
+        // so the inverse of Q^2 is normalised by the square of the maximum eigenvalue
+        // or, equivalently, by the square of the inverse of phmc_invmaxev
+        // note that in the QPhiX interface, we also correctly normalise the shifts
+        const double maxev_sq = (1.0/phmc_invmaxev)*(1.0/phmc_invmaxev);
+        for( int shift = 0; shift < solver_params->no_shifts; shift++){
+          mul_r_gamma5(Pup_cg[shift], maxev_sq, VOLUME/2); // rescale the fields just solved, which start at Pup_cg/Pdn_cg
+          mul_r_gamma5(Pdn_cg[shift], maxev_sq, VOLUME/2);
+        }
+        // Restoring solver_params
+        solver_params->no_shifts = nshifts;
+        solver_params->shifts = shifts_start;
+        solver_params-> type = MG;
+      } else
+#endif //TM_USE_QPHIX
+      if (mg_nshifts < nshifts) {
+        spinor ** Pup_cg = Pup+(mg_nshifts - no_cgmms_init_guess);
+        spinor ** Pdn_cg = Pdn+(mg_nshifts - no_cgmms_init_guess);
+        double * shifts_start = solver_params->shifts;
+        solver_params->no_shifts = nshifts - (mg_nshifts - no_cgmms_init_guess);
+        solver_params->shifts += (mg_nshifts - no_cgmms_init_guess);
+        solver_params-> type = CGMMSND;
+        if (no_cgmms_init_guess > 0) {
+          SWAP(solver_params->shifts[0], solver_params->shifts[no_cgmms_init_guess]);
+          SWAP(Pup_cg[0], Pup_cg[no_cgmms_init_guess]);
+          SWAP(Pdn_cg[0], Pdn_cg[no_cgmms_init_guess]);
+        }
+        iteration_count = solve_mms_nd( Pup_cg, Pdn_cg, Qup, Qdn, solver_params );
+        // Switching back last shift
+        if (no_cgmms_init_guess > 0) {
+          SWAP(solver_params->shifts[0], solver_params->shifts[no_cgmms_init_guess]);
+          SWAP(Pup_cg[0], Pup_cg[no_cgmms_init_guess]);
+          SWAP(Pdn_cg[0], Pdn_cg[no_cgmms_init_guess]);
+        }
+        // Restoring solver_params
+        solver_params->no_shifts = nshifts;
+        solver_params->shifts = shifts_start;
+        solver_params-> type = MG;
+      } else {
+        no_cgmms_init_guess = 0;
+      }
+
+      matrix_mult_nd f = Qtm_pm_ndpsi_shift;
+      if( solver_params->M_ndpsi == Qsw_pm_ndpsi ) 
+        f = Qsw_pm_ndpsi_shift;
+
+      for(int i = mg_nshifts-1; i>=0; i--){
+        // preparing initial guess
+        if(i<mg_nshifts-no_cgmms_init_guess)
+          init_guess_mms_nd(Pup, Pdn, Qup, Qdn, i, solver_params);
+        g_shift = solver_params->shifts[i]*solver_params->shifts[i]; 
+        iteration_count += MG_solver_nd( Pup[i], Pdn[i], Qup, Qdn, solver_params->squared_solver_prec, solver_params->max_iter,
+                                         solver_params->rel_prec, solver_params->sdim, g_gauge_field, f );
+        g_shift = _default_g_shift;
+      }
+    } else {
+      iteration_count = cg_mms_tm_nd( Pup, Pdn, Qup, Qdn, solver_params );
+    }
+  }
+#endif
+  else if (solver_params->type == RGMIXEDCG){
+    matrix_mult_nd   f    = Qtm_pm_ndpsi_shift;
+    matrix_mult_nd32 f32  = Qtm_pm_ndpsi_shift_32;
+    if( solver_params->M_ndpsi == Qsw_pm_ndpsi ){ 
+      f    = Qsw_pm_ndpsi_shift;
+      f32  = Qsw_pm_ndpsi_shift_32;
+    }
+    iteration_count = 0;
+    // rg_mixed_cg_her_nd takes its parameters by value: start from a copy of the caller's set so that no field
+    // is left indeterminate, then override mcg_delta and (in the loop) flag the prepared initial guess
+    solver_params_t temp_params = *solver_params;
+    temp_params.mcg_delta = _default_mixcg_innereps;
+    int iter_local = 0;
+    for(int i = solver_params->no_shifts-1; i>=0; i--){
+      // preparing initial guess
+      init_guess_mms_nd(Pup, Pdn, Qup, Qdn, i, solver_params);
+      solver_params->use_initial_guess = 1;
+      temp_params.use_initial_guess = 1;
+      // inverting
+      g_shift = solver_params->shifts[i]*solver_params->shifts[i]; 
+      iter_local = rg_mixed_cg_her_nd( Pup[i], Pdn[i], Qup, Qdn, temp_params, solver_params->max_iter,
+                                       solver_params->squared_solver_prec, solver_params->rel_prec, solver_params->sdim, f, f32);
+      g_shift = _default_g_shift;
+      solver_params->use_initial_guess = 0;
+      if(iter_local == -1){
+        iteration_count = -1; // not converged: report failure, but leave through the common cleanup below
+        break;
+      }
+      iteration_count += iter_local;
+    }
+  } else {
+    fatal_error("Error: solver not allowed for ND mms solve. Aborting...\n", "solve_mms_nd");
+  }
+
+  if( g_debug_level > 2 ){
+    for( int shift = 0; shift < solver_params->no_shifts; shift++){
+      matrix_mult_nd f = Qtm_pm_ndpsi_shift;
+      if( solver_params->M_ndpsi == Qsw_pm_ndpsi ) 
+        f = Qsw_pm_ndpsi_shift;
+      g_shift = solver_params->shifts[shift]*solver_params->shifts[shift]; 
+      f(temp[0], temp[1], Pup[shift], Pdn[shift]);
+      g_shift = _default_g_shift;
+      diff(temp[0], temp[0], Qup, VOLUME/2);
+      diff(temp[1], temp[1], Qdn, VOLUME/2);
+      double diffnorm = square_norm(temp[0], VOLUME/2, 1) + square_norm(temp[1], VOLUME/2, 1); 
+      if( g_proc_id == 0 ){
+        printf("# solve_mms_nd residual check: %e\n", diffnorm);
+      }
+    }
+  }
+  if(g_debug_level > 2 || solver_params->external_inverter == QPHIX_INVERTER){
+    finalize_solver(temp, 2);
+  }
+
+  return(iteration_count);
+}
+
+int solve_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn, 
+                      spinor * const Qup, spinor * const Qdn, 
+                      solver_params_t * solver_params){ 
+  // ND multi-shift solve for the unsquared operator: MG inverts it directly, every other solver type
+  // goes through solve_mms_nd and then applies an operator once per shift. Returns the iteration count.
+  int n_iter = 0;
+
+#ifdef DDalphaAMG
+  // With MG we can solve directly the unsquared operator
+  if( solver_params->type == MG ){
+    // pick the clover or plain twisted-mass variant of the operator handed to MG
+    matrix_mult_nd op_add = ( solver_params->M_ndpsi == Qsw_pm_ndpsi ) ? Qsw_tau1_ndpsi_add_Ishift : Qtm_tau1_ndpsi_add_Ishift;
+    for(int shift = solver_params->no_shifts-1; shift >= 0; shift--){
+      // start from an initial guess prepared for this shift
+      init_guess_mms_nd_plus(Pup, Pdn, Qup, Qdn, shift, solver_params);
+      // g_shift holds shift^2; Qsw_tau1_ndpsi_add_Ishift takes the square root again
+      const double sigma = solver_params->shifts[shift];
+      g_shift = sigma*sigma;
+      n_iter += MG_solver_nd( Pup[shift], Pdn[shift], Qup, Qdn, solver_params->squared_solver_prec,
+                              solver_params->max_iter, solver_params->rel_prec, solver_params->sdim,
+                              g_gauge_field, op_add );
+      g_shift = _default_g_shift;
+    }
+  } else 
+#endif
+  {
+    n_iter = solve_mms_nd(Pup, Pdn, Qup, Qdn, solver_params);
+
+    // the solve above is for the squared operator: one operator application per shift
+    // retrieves the unsquared solution
+    matrix_mult_nd op_sub = ( solver_params->M_ndpsi == Qsw_pm_ndpsi ) ? Qsw_tau1_ndpsi_sub_Ishift : Qtm_tau1_ndpsi_sub_Ishift;
+    // scratch fields receiving the operator output for the up and dn components
+    spinor** work;
+    init_solver_field(&work, VOLUMEPLUSRAND/2, 2);
+    for(int shift = solver_params->no_shifts-1; shift >= 0; shift--){
+      g_shift = solver_params->shifts[shift]*solver_params->shifts[shift];
+      op_sub(work[0], work[1], Pup[shift], Pdn[shift]);
+      assign(Pup[shift], work[0], VOLUME/2);
+      assign(Pdn[shift], work[1], VOLUME/2);
+      g_shift = _default_g_shift;
+    }
+    finalize_solver(work, 2);
+  }
+  return n_iter;
+}
diff --git a/solver/monomial_solve.h b/solver/monomial_solve.h
new file mode 100644
index 000000000..6a42c4558
--- /dev/null
+++ b/solver/monomial_solve.h
@@ -0,0 +1,40 @@
+/***********************************************************************
+ * Copyright (C) 2014 Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+#ifndef _MONOMIAL_SOLVE_H
+#define _MONOMIAL_SOLVE_H
+
+#include "misc_types.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_params.h"
+#include "su3.h"
+#include"solver/matrix_mult_typedef.h"
+#include"solver/solver_params.h"
+#include"su3.h"
+int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_params, const int max_iter, 
+                     double eps_sq, const int rel_prec, const int N, matrix_mult f, int solver_type); /* single solve of f*P = Q; the solver is chosen by solver_type */
+int solve_mms_tm(spinor ** const P, spinor * const Q,
+                 solver_params_t * solver_params); /* multi-shift solve: one output field P[i] per shift in solver_params */
+int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn, 
+                 spinor * const Qup, spinor * const Qdn, 
+                 solver_params_t * solver_params); /* multi-shift solve with up/dn field pairs, one pair per shift */
+int solve_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn, 
+                      spinor * const Qup, spinor * const Qdn, 
+                      solver_params_t * solver_params); /* as solve_mms_nd, but yields the solution for the unsquared operator */
+
+#endif
diff --git a/solver/mr.c b/solver/mr.c
index cad212216..b28233a26 100644
--- a/solver/mr.c
+++ b/solver/mr.c
@@ -42,7 +42,7 @@
  ****************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -111,72 +111,26 @@ int mr(spinor * const P, spinor * const Q,
 }
 
 
-int mrblk(spinor * const P, spinor * const Q,
-	  const int max_iter, const double eps_sq,
-	  const int rel_prec, const int N, 
-	  matrix_mult_blk f, const int blk) {
-  static int mr_init=0;
-  int i = 0;
-  double norm_r,beta;
-  _Complex double alpha;
-  spinor * r;
-  const int parallel = 0;
-  spinor * s[3];
-  static spinor *s_=NULL;
-  static int N_;
+#define _F_TYPE double
+#define _C_TYPE _Complex double
+#define _PSWITCH(s) s
+#define _PTSWITCH(s) s 
 
-  if(mr_init == 0 || N != N_) {
-    if(N!= N_ && mr_init != 0) {
-      free(s_);
-    }
-    N_ = N;
-    s_ = calloc(3*(N+1)+1, sizeof(spinor));
-    mr_init = 1;
-  }
-#if (defined SSE || defined SSE2 || defined SSE3)
-  s[0] = (spinor *)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE); 
-#else
-  s[0] = s_;
-#endif
-  s[1] = s[0] + N + 1;
-  s[2] = s[1] + N + 1;
+#include "mrblk_body.c"
 
-  r = s[0];
-  norm_r = square_norm(Q, N, parallel);
-  
-  zero_spinor_field(P, N);
-  f(s[2], P, blk);
-  diff(r, Q, s[2], N);
-  norm_r = square_norm(r, N, parallel);
-  if(g_proc_id == g_stdio_proc && g_debug_level > 2 && blk == 0) {
-    printf("MRblk iteration= %d  |res|^2= %e\n", i, norm_r);
-    fflush( stdout );
-  }
-  
-  while((norm_r > eps_sq) && (i < max_iter)){
-    i++;
-    f(s[1], r, blk);
-    alpha = scalar_prod(s[1], r, N, parallel);
-    beta = square_norm(s[1], N, parallel);
-    alpha /= beta;
-    assign_add_mul(P, r, alpha, N);
-    if(i%50 == 0) {
-      f(s[2], P,blk);
-    }
-    else{
-      assign_add_mul(s[2], s[1], alpha, N);
-    }
-    
-    diff(r, Q, s[2], N);
-    norm_r = square_norm(r, N, parallel);
-    if(g_proc_id == g_stdio_proc && g_debug_level > 2 && blk == 0) {
-      printf("MRblk iteration= %d  |res|^2= %g\n", i, norm_r);
-      fflush(stdout);
-    }
-  }
-  /* free(s_); */
-  if(norm_r > eps_sq){
-    return(-1);
-  }
-  return(i);
-}
+#undef _F_TYPE
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
+
+#define _F_TYPE float
+#define _C_TYPE _Complex float
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
+
+#include "mrblk_body.c"
+
+#undef _F_TYPE
+#undef _C_TYPE
+#undef _PSWITCH
+#undef _PTSWITCH
diff --git a/solver/mr.h b/solver/mr.h
index df82f3c84..bce24d5d4 100644
--- a/solver/mr.h
+++ b/solver/mr.h
@@ -50,9 +50,14 @@ int mr(spinor * const P, spinor * const Q,
        const int rel_prec, const int N, 
        const int parallel, matrix_mult f);
 
-int mrblk(spinor * const P, spinor * const Q,
+int mrblk(spinor * const P, spinor * const Q, spinor * const s_,
 	  const int max_iter, const double eps_sq,
 	  const int rel_prec, const int N, 
 	  matrix_mult_blk f, const int blk);
 
+int mrblk_32(spinor32 * const P, spinor32 * const Q, spinor32 * const s_,
+	     const int max_iter, const double eps_sq,
+	     const int rel_prec, const int N, 
+	     matrix_mult_blk32 f, const int blk);
+
 #endif
diff --git a/solver/mr4complex.c b/solver/mr4complex.c
new file mode 100644
index 000000000..ddc24d781
--- /dev/null
+++ b/solver/mr4complex.c
@@ -0,0 +1,89 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *               2010 claude Tadonki
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include<stdlib.h>
+#include<stdio.h>
+#include<math.h>
+#include<string.h>
+#include"global.h"
+#include"su3.h"
+#include"linalg_eo.h"
+#include"gcr4complex.h"
+#include"mr4complex.h"
+
+int mr4complex(_Complex double * const P, _Complex double * const Q,
+	       const int max_iter, const double eps_sq,
+	       const int rel_prec, const int N, 
+	       const int parallel, const int lda,
+	       c_matrix_mult f){
+  /* MR iteration ("lMR" in the log output) for f(P) = Q, always started from P = 0.
+   * N entries enter the linear algebra, lda is the spacing of the three work vectors and
+   * parallel is handed on to the norm/scalar-product helpers. With rel_prec != 0 the target
+   * is eps_sq*|Q|^2, otherwise eps_sq. Returns the step count, or -1 if not converged. */
+  int i=0;
+  double norm_r, norm_sq, beta;
+  _Complex double alpha;
+  _Complex double *w_f[3], * _w_f, *r;
+
+  _w_f = malloc(3*lda*sizeof(_Complex double));
+  if(_w_f == NULL) return(-1); /* no workspace: report failure instead of dereferencing NULL */
+  w_f[0] = _w_f; w_f[1] = _w_f+lda; w_f[2] = _w_f+2*lda;
+  r = w_f[0];
+  for(int j = 0; j < N; j++) {
+    P[j] = 0.;
+    w_f[2][j] = 0.;
+    r[j] = Q[j];
+  }
+  norm_sq = lsquare_norm(Q, N, parallel);
+  norm_r = lsquare_norm(r, N, parallel);
+  if(g_proc_id == g_stdio_proc && g_debug_level > 2) {
+    printf("lMR iteration number: %d, |res|^2 = %e of %e %d\n", i, norm_r, eps_sq, rel_prec); 
+    fflush( stdout );
+  }
+  while(((norm_r > eps_sq && !rel_prec) || ((norm_r > eps_sq*norm_sq && rel_prec))) && (i < max_iter)) {
+    i++;
+    f(w_f[1], r);
+    alpha=lscalar_prod(w_f[1], r, N, parallel);
+    beta=lsquare_norm(w_f[1], N, parallel);
+    alpha /= beta;
+    lassign_add_mul(P, r, alpha, N);
+    /* w_f[2] is meant to hold f(P): updated via w_f[1] and alpha, recomputed from P every 50th step */
+    if(i%50 == 0){
+      f(w_f[2], P);
+    }
+    else{
+      lassign_add_mul(w_f[2], w_f[1], alpha, N);
+    }
+    ldiff(r, Q, w_f[2], N);
+    norm_r=lsquare_norm(r, N, parallel);
+    if(g_proc_id == g_stdio_proc && g_debug_level > 2) {
+      printf("# lMR iteration= %d  |res|^2= %g\n", i, norm_r); 
+      fflush(stdout);
+    }
+  }
+  free(_w_f);
+  if((norm_r > eps_sq && !rel_prec) || (norm_r > eps_sq*norm_sq && rel_prec)){ /* same criterion as the loop */
+    return(-1);
+  }
+  return(i);
+}
diff --git a/solver/mr4complex.h b/solver/mr4complex.h
new file mode 100644
index 000000000..7c3d7b668
--- /dev/null
+++ b/solver/mr4complex.h
@@ -0,0 +1,14 @@
+#ifndef _MR4COMPLEX_H
+#define _MR4COMPLEX_H
+
+#include"su3.h"
+#include"solver/matrix_mult_typedef.h"
+#include"solver/gcr4complex.h"
+
+int mr4complex(_Complex double * const P, _Complex double * const Q,
+	       const int max_iter, const double eps_sq,
+	       const int rel_prec, const int N, 
+	       const int parallel, const int lda,
+	       c_matrix_mult f);
+
+#endif
diff --git a/solver/mrblk_body.c b/solver/mrblk_body.c
new file mode 100644
index 000000000..4e901b31e
--- /dev/null
+++ b/solver/mrblk_body.c
@@ -0,0 +1,73 @@
+/***********************************************************************
+ * Copyright (C) 2016 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+
+int _PSWITCH(mrblk)(_PTSWITCH(spinor) * const P, _PTSWITCH(spinor) * const Q, 
+                    _PTSWITCH(spinor) * const s_,
+                    const int max_iter, const double eps_sq,
+                    const int rel_prec, const int N, 
+                    _PTSWITCH(matrix_mult_blk) f, const int blk) {
+  /* Precision-templated (_PSWITCH/_PTSWITCH/_F_TYPE/_C_TYPE) block MR solver: iterates on
+   * f(., ., blk) from P = 0 until |r|^2 <= eps_sq or max_iter steps; rel_prec is not used here.
+   * s_ is caller workspace addressed at offsets 0, N+1 and 2(N+1). Returns the step count or -1. */
+  int i = 0;
+  _F_TYPE norm_r, beta;
+  _C_TYPE alpha;
+  _PTSWITCH(spinor) * r;
+  _PTSWITCH(spinor) * s[3];
+  const int parallel = 0;
+  s[0] = s_;
+  s[1] = s[0] + N + 1;
+  s[2] = s[1] + N + 1;
+  r = s[0];
+
+  /* residual of the zero start vector: r = Q - f(P) with P = 0 */
+  _PSWITCH(zero_spinor_field)(P, N);
+  f(s[2], P, blk);
+  _PSWITCH(diff_ts)(r, Q, s[2], N);
+  norm_r = _PSWITCH(square_norm_ts)(r, N, parallel);
+  if(g_proc_id == g_stdio_proc && g_debug_level > 2 && blk == 0) {
+    printf("MRblk iteration= %d  |res|^2= %e\n", i, norm_r);
+    fflush( stdout );
+  }
+  while((norm_r > eps_sq) && (i < max_iter)){
+    i++;
+    f(s[1], r, blk);
+    alpha = _PSWITCH(scalar_prod_ts)(s[1], r, N, parallel);
+    beta = _PSWITCH(square_norm_ts)(s[1], N, parallel);
+    alpha /= beta;
+    _PSWITCH(assign_add_mul_ts)(P, r, alpha, N);
+    if(i%50 == 0) {
+      f(s[2], P, blk);
+    }
+    else{
+      _PSWITCH(assign_add_mul_ts)(s[2], s[1], alpha, N);
+    }
+    _PSWITCH(diff_ts)(r, Q, s[2], N);
+    norm_r = _PSWITCH(square_norm_ts)(r, N, parallel);
+    if(g_proc_id == g_stdio_proc && g_debug_level > 2 && blk == 0) {
+      printf("MRblk iteration= %d  |res|^2= %g\n", i, norm_r);
+      fflush(stdout);
+    }
+  }
+  if(norm_r > eps_sq){
+    return(-1);
+  }
+  return(i);
+}
diff --git a/solver/ortho.c b/solver/ortho.c
index ddbb18307..0b0bed14c 100644
--- a/solver/ortho.c
+++ b/solver/ortho.c
@@ -27,13 +27,13 @@
  * Gram-Shmidt orthogonalization
  ****************************************************************************/
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -55,7 +55,7 @@ int ortho_new_vectors(spinor **Vecs, int N, int nv_old, int nv_new, double ortht
     int nadded=0;
     double tmpd;
     
-    #ifdef MPI
+    #ifdef TM_USE_MPI
     parallel=1;
     #else
     parallel=0;
diff --git a/solver/pcg_her.c b/solver/pcg_her.c
index a4d4235a5..c985ad901 100644
--- a/solver/pcg_her.c
+++ b/solver/pcg_her.c
@@ -19,7 +19,7 @@
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -29,8 +29,10 @@
 #include "linalg_eo.h"
 #include "start.h"
 #include "solver/matrix_mult_typedef.h"
+#include "solver/cg_her.h"
 #include "sub_low_ev.h"
 #include "solver_field.h"
+#include "dfl_projector.h"
 #include "pcg_her.h"
 
 /* P output = solution , Q input = source */
@@ -48,50 +50,39 @@ int pcg_her(spinor * const P, spinor * const Q, const int max_iter,
     init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
   }
   squarenorm = square_norm(Q, N, 1);
-  /*        !!!!   INITIALIZATION    !!!! */
+  /* x_0 = P */
   assign(solver_field[0], P, N);
-  /*        (r_0,r_0)  =  normsq         */
   normsp = square_norm(P, N, 1);
 
-  assign(solver_field[3], Q, N);
   /* initialize residue r and search vector p */
-  if(normsp==0){
-    /* if a starting solution vector equal to zero is chosen */
-    /* r0 */
-    assign(solver_field[1], solver_field[3], N);
-    /* p0 */
-  }
-  else{
-    /* if a starting solution vector different from zero is chosen */
-    /* r0 = b - A x0 */
-    f(solver_field[2], solver_field[0]);
-    diff(solver_field[1], solver_field[3], solver_field[2], N);
-  }
-  /* z0 = M^-1 r0 */
-  invert_eigenvalue_part(solver_field[3], solver_field[1], 10, N);
-  /* p0 = z0 */
+  /* r0 = b - A x0 */
+  f(solver_field[2], solver_field[0]);
+  diff(solver_field[1], Q, solver_field[2], N);
+  /* z_0 = M^-1 r_0 */
+  // here we could have a preconditioner for Q^2
+  assign(solver_field[3], solver_field[1], N);
+  /* p_0 = z_0 */
   assign(solver_field[2], solver_field[3], N);
-
-  /* Is this really real? */
+  /* (r_0, z_0) */
   pro2 = scalar_prod_r(solver_field[1], solver_field[3], N, 1);  
   /* main loop */
   for(iteration = 0; iteration < max_iter; iteration++) {
-    /* A p */
+    /* w_i = A p_i */
     f(solver_field[4], solver_field[2]);
-
+    /* (p_i, w_i) */
     pro = scalar_prod_r(solver_field[2], solver_field[4], N, 1);
-    /*  Compute alpha_cg(i+1)   */
-    alpha_cg=pro2/pro;
+    /*  Compute alpha_cg   */
+    alpha_cg = pro2 / pro;
      
     /*  Compute x_(i+1) = x_i + alpha_cg(i+1) p_i    */
     assign_add_mul_r(solver_field[0], solver_field[2],  alpha_cg, N);
-    /*  Compute r_(i+1) = r_i - alpha_cg(i+1) Qp_i   */
+    /*  Compute r_(i+1) = r_i - alpha_cg(i+1) A p_i   */
     assign_add_mul_r(solver_field[1], solver_field[4], -alpha_cg, N);
 
     /* Check whether the precision is reached ... */
-    err=square_norm(solver_field[1], N, 1);
-    if(g_debug_level > 1 && g_proc_id == g_stdio_proc) {
-      printf("%d\t%g\n",iteration,err); fflush( stdout);
+    err = square_norm(solver_field[1], N, 1);
+    if(g_debug_level > 2 && g_proc_id == g_stdio_proc) {
+      printf("PCG %d\t%g\n",iteration,err); fflush( stdout);
     }
 
     if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) {
@@ -100,28 +91,25 @@ int pcg_her(spinor * const P, spinor * const Q, const int max_iter,
       finalize_solver(solver_field, nr_sf);
       return(iteration+1);
     }
-#ifdef _USE_HALFSPINOR
-    if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*squarenorm) && (rel_prec == 1)) || iteration > 1400) {
-      g_sloppy_precision = 1;
-      if(g_debug_level > 2 && g_proc_id == g_stdio_proc) {
-	printf("sloppy precision on\n"); fflush( stdout);
-      }
-    }
-#endif
-    /* z_j */
-    beta_cg = 1/pro2;
-/*     invert_eigenvalue_part(solver_field[3], solver_field[1], 10, N); */
-    /* Compute beta_cg(i+1)
-       Compute p_(i+1) = r_i+1 + beta_(i+1) p_i     */
+
+    /* z_(i+1) = M^-1 r_(i+1) */
+    // here we could have a preconditioner for Q^2
+    //mg_Qsq_precon(solver_field[3], solver_field[1]);
+    assign(solver_field[3], solver_field[1], N);
+    /* Compute beta_cg(i+1) */
+    beta_cg = 1. / pro2;
+    // here we might use Polak-Ribiere formula instead of the standard one
+    // beta = (z_i+1,r_i+1 - r_i) / (z_i,r_i)
+    //pro2 = -alpha_cg*scalar_prod_r(solver_field[4], solver_field[3], N, 1);
+    // standard choice
     pro2 = scalar_prod_r(solver_field[1], solver_field[3], N, 1);
     beta_cg *= pro2;
+    /* p_(i+1) = z_(i+1) + beta_cg p_i */
     assign_mul_add_r(solver_field[2], beta_cg, solver_field[3], N);
   }
   assign(P, solver_field[0], N);
-  g_sloppy_precision = 0;
-/*   return(-1); */
   finalize_solver(solver_field, nr_sf);
-  return(1);
+  return(-11);
 }
 
 
diff --git a/solver/poly_precon.c b/solver/poly_precon.c
index 249278217..bce478681 100644
--- a/solver/poly_precon.c
+++ b/solver/poly_precon.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/quicksort.c b/solver/quicksort.c
index c9aa471b3..a76ab8eb7 100644
--- a/solver/quicksort.c
+++ b/solver/quicksort.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include<stdio.h>
diff --git a/solver/restart_X.c b/solver/restart_X.c
index 821ab2bf0..121527c48 100644
--- a/solver/restart_X.c
+++ b/solver/restart_X.c
@@ -45,13 +45,13 @@
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
diff --git a/solver/rg_mixed_cg_her.c b/solver/rg_mixed_cg_her.c
new file mode 100644
index 000000000..afa708531
--- /dev/null
+++ b/solver/rg_mixed_cg_her.c
@@ -0,0 +1,353 @@
+/***********************************************************************
+ * Copyright (C) 2015 Bartosz Kostrzewa, Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *******************
+ * rg_mixed_cg_her *
+ *******************
+ *
+ * Mixed precision solver which uses true reliable updates and has a double
+ * precision fail-safe mechanism. The Polak-Ribiere computation of beta is
+ * implemented but currently not used because the extra scalar product is
+ * more expensive than the gain from the self-stabilisation as far as has 
+ * been tested.
+ *
+ * in:
+ *   Q: source
+ * inout:
+ *   P: result (initial guess currently not supported)
+ *
+ * POSSIBLE IMPROVEMENTS
+ * There are still quite a few things that can be tried to make it better,
+ * the most significant of which would be to guide the search direction
+ * using the previous one upon restart. However, it seems that for the number of
+ * non-zero entries in the Dirac operator and usual lattice sizes, the
+ * requisite projection 
+ *
+ *   p' = r - <r,Ap>/<p,Ap> p
+ *
+ * cannot be computed with sufficient precision in 64 bit arithmetic. It should
+ * be noted that for L < 24 in general, this does work and produces
+ * a mixed solver which converges at the same rate as a double solver, but it's
+ * not generally useable... For point sources, it also works for larger lattice 
+ * volumes. Might be introduced as an optional mode in the future with some
+ * fail-safe mechanism which detects if the search direction begins to diverge.
+ *
+ **************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "operator/tm_operators_32.h"
+#include "operator/clovertm_operators_32.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_params.h"
+#include "read_input.h"
+
+#include "solver_field.h"
+#include "solver/rg_mixed_cg_her.h"
+#include "gettime.h"
+
+static void output_flops(const double seconds, const unsigned int N, const unsigned int iter_out, const unsigned int iter_in_sp, const unsigned int iter_in_dp, const double eps_sq);
+
+static inline unsigned int inner_loop_high(spinor * const x, spinor * const p, spinor * const q, spinor * const r, double * const rho1, const double delta,
+                                           matrix_mult f, const double eps_sq, const unsigned int N, const unsigned int iter, const unsigned int max_iter ){
+  /* Double-precision CG inner loop ("DP_inner CG"): updates x, r and the search direction p in
+   * place, with q as scratch for f(q,p). *rho1 carries |r|^2 in and out. Returns the steps taken. */
+  double alpha, beta;
+  double rho = *rho1;
+  double rhomax = *rho1;
+  unsigned int j = 0;
+
+  /* break out of inner loop if iterated residual goes below some fraction of the maximum observed
+  * iterated residual since the last update or if the target precision has been reached 
+  * enforce convergence more strictly by a factor of 1.3 to avoid unnecessary restarts 
+  * if the real residual is still a bit too large */
+  while( rho > delta*rhomax && j+iter <= max_iter ){
+    ++j;
+    f(q,p);
+    alpha = rho/scalar_prod_r(p,q,N,1);
+    assign_add_mul_r(x, p, alpha, N);
+    assign_add_mul_r(r, q, -alpha, N);
+    rho = square_norm(r,N,1);
+    beta = rho / *rho1;
+    *rho1 = rho;
+    assign_mul_add_r(p, beta, r, N);
+
+    if( 1.3*rho < eps_sq ) break;
+    if( rho > rhomax ) rhomax = rho;
+
+    if(g_debug_level > 2 && g_proc_id == 0) {
+      printf("DP_inner CG: %u res^2 %g\t\n", j+iter, rho);
+    }
+  }
+
+  return j;
+}
+
+static inline unsigned int inner_loop(spinor32 * const x, spinor32 * const p, spinor32 * const q, spinor32 * const r, float * const rho1, const float delta,
+                                      matrix_mult32 f32, const float eps_sq, const unsigned int N, const unsigned int iter, const unsigned max_iter,
+                                      float alpha, float beta, MCG_PIPELINED_TYPE pipelined, MCG_PR_TYPE pr ){
+  /* Single-precision CG inner loop on (x, p, q, r); *rho1 carries |r|^2 in and out. The incoming
+   * alpha/beta are only read by the pipelined variant. Returns the number of steps taken. */
+  float rho = *rho1;
+  float rhomax = *rho1;
+  float pro;
+  unsigned int j = 0;
+
+  if(pipelined==MCG_NO_PIPELINED){
+    /* break out of inner loop if iterated residual goes below some fraction of the maximum observed
+    * iterated residual since the last update */ 
+    while( rho > delta*rhomax && j+iter <= max_iter ){
+      ++j;
+      f32(q,p);
+      pro = scalar_prod_r_32(p,q,N,1);
+      alpha = rho/pro;
+      assign_add_mul_r_32(x, p, alpha, N);
+      assign_add_mul_r_32(r, q, -alpha, N);
+      rho = square_norm_32(r,N,1);
+      // Polak-Ribiere computation of beta, claimed to be self-stabilising, positive effect so far not observed or required
+      if(pr==MCG_PR){
+        beta = alpha*(alpha*square_norm_32(q,N,1)-pro) / *rho1;
+      }else{
+        beta = rho / *rho1;
+      }
+      *rho1 = rho;
+      assign_mul_add_r_32(p, beta, r, N);
+      if(g_debug_level > 2 && g_proc_id == 0) {
+        printf("SP_inner CG: %u res^2 %g\t\n", j+iter, rho);
+      }
+       /* enforce convergence more strictly by a factor of 1.3 to avoid unnecessary restarts 
+       * if the real residual is still a bit too large */
+      if( 1.3*rho < eps_sq ) break;
+      if( rho > rhomax ) rhomax = rho;
+    }
+  }else{
+    // pipelined cg requires one more scalar product but may allow optimisations to be made
+    // e.g.: one could do the collective communication for sqrnrm(r) while other stuff is being computed
+    // it is also self-initialising (alpha=0, beta=0 will work)
+    while( rho > delta*rhomax && j+iter <= max_iter ){
+      ++j;
+      assign_add_mul_r_32(x, p, alpha, N);
+      assign_add_mul_r_32(r, q, -alpha, N);
+      assign_mul_add_r_32(p, beta, r, N);
+      f32(q,p);
+
+      rho = square_norm_32(r,N,1);
+      pro = scalar_prod_r_32(p,q,N,1);
+      alpha = rho/pro;
+      if(pr==MCG_PR){
+        beta = alpha*(alpha*square_norm_32(q,N,1)-pro)/rho;
+      }else{
+        beta = rho/ *rho1;
+      }
+      *rho1=rho;
+
+      if(g_debug_level > 2 && g_proc_id == 0) {
+        printf("SP_inner CG: %u res^2 %g\t\n", j+iter, rho);
+      }
+      if( 1.3*rho < eps_sq ) break;
+      if( rho > rhomax ) rhomax = rho;
+    }
+  }
+
+  return j;
+}
+
+
+/* Mixed-precision CG driving |Q - f(P)|^2 below the target: P output = solution (always started from zero), Q input = source */
+int rg_mixed_cg_her(spinor * const P, spinor * const Q, solver_params_t solver_params,
+                    const int max_iter, const double eps_sq, const int rel_prec,
+                    const int N, matrix_mult f, matrix_mult32 f32) {
+  /* Returns iter_in_sp+iter_in_dp+iter_out on success, -1 if max_iter is reached or the outer loop runs out. */
+  int iter_in_sp = 0, iter_in_dp = 0, iter_out = 0;
+  float rho_sp, delta = solver_params.mcg_delta;
+  double beta_dp, rho_dp;
+  double sourcesquarenorm, target_eps_sq;
+
+  spinor *xhigh, *rhigh, *qhigh, *phigh;
+  spinor32 *x, *p, *q, *r;
+
+  spinor ** solver_field = NULL;
+  spinor32 ** solver_field32 = NULL;  
+  const int nr_sf = 4;
+  const int nr_sf32 = 4;
+  
+  int high_control = 0; // 0: single-precision defect correction, 1: double-precision fallback
+
+  double atime, etime, flops; // NOTE(review): flops is never used in this function
+  
+  if(N == VOLUME) {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);    
+    init_solver_field_32(&solver_field32, VOLUMEPLUSRAND, nr_sf32);
+  }
+  else {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
+    init_solver_field_32(&solver_field32, VOLUMEPLUSRAND/2, nr_sf32);    
+  }
+
+  atime = gettime();
+
+  // we could get away with using fewer fields, of course
+  phigh = solver_field[3];
+  xhigh = solver_field[2];
+  rhigh = solver_field[1];
+  qhigh = solver_field[0];
+
+  x = solver_field32[3];
+  r = solver_field32[2];
+  p = solver_field32[1];
+  q = solver_field32[0];
+
+  // we always want to apply the full precision operator in double; the flag is restored before every return
+  int save_sloppy = g_sloppy_precision_flag;
+  g_sloppy_precision_flag = 0;
+
+  sourcesquarenorm = square_norm(Q,N,1);
+  if( rel_prec == 1 ) {
+    target_eps_sq = eps_sq*sourcesquarenorm;
+    if(g_debug_level > 0 && g_proc_id==0) 
+      printf("#RG_Mixed CG: Using relative precision! eps_sq: %.6g target_eps_sq: %.6g \n",eps_sq,target_eps_sq);
+  }else{
+    target_eps_sq = eps_sq;
+  }
+ 
+  // compute maximum expected number of outer iterations based on expected reduction 
+  // of the residual at each run of the inner solver
+  int N_outer = (int)ceil(log10( sourcesquarenorm*delta/target_eps_sq ));
+  if(g_debug_level > 0 && g_proc_id==0) 
+    printf("#RG_Mixed CG: N_outer: %d \n", N_outer);
+  // NOTE(review): for N_outer <= 1 the outer loop body never runs, x is never added to P and -1 is returned - confirm intended
+  // should compute real residual here and solve subtracted problem with initial guess
+  // for now we always use a zero guess
+  zero_spinor_field_32(x,N);
+  zero_spinor_field(P,N);
+  assign(phigh,Q,N);
+  assign(rhigh,Q,N);
+  
+  rho_dp = square_norm(rhigh,N,1);
+  assign_to_32(r,rhigh,N);
+  rho_sp = rho_dp;
+  assign_32(p,r,N);
+  // first single-precision run, started from x = 0 with r = p = (truncated) Q
+  iter_in_sp += inner_loop(x, p, q, r, &rho_sp, delta, f32, (float)target_eps_sq, 
+                           N, iter_out+iter_in_sp+iter_in_dp, max_iter, 0.0, 0.0, MCG_NO_PIPELINED, MCG_NO_PR);
+
+  for(iter_out = 1; iter_out < N_outer; ++iter_out) {
+
+    // prepare for defect correction
+    // update high precision solution 
+    if(high_control==0) {
+      // accumulate solution (sp -> dp) 
+      addto_32(P,x,N);
+      // compute real residual
+      f(qhigh,P);
+      diff(rhigh,Q,qhigh,N);
+      beta_dp = 1/rho_dp;
+      rho_dp = square_norm(rhigh,N,1);
+      beta_dp *= rho_dp;
+    }
+    
+    // high_control was set below once iter_out >= N_outer-2: let's try to save the day using double precision
+    if( high_control==1 ) {
+      assign(phigh,rhigh,N);
+      zero_spinor_field(xhigh,N);
+      beta_dp = 1/rho_dp;
+      iter_in_dp += inner_loop_high(xhigh, phigh, qhigh, rhigh, &rho_dp, delta, f, 
+                                    target_eps_sq, N, iter_out+iter_in_sp+iter_in_dp, max_iter);
+      rho_sp = rho_dp;
+      // accumulate solution
+      add(P,P,xhigh,N);
+      // compute real residual
+      f(qhigh,P);
+      diff(rhigh,Q,qhigh,N);
+      rho_dp = square_norm(rhigh,N,1);
+      beta_dp *= rho_dp;
+    }
+
+    if(g_debug_level > 2 && g_proc_id == 0) {
+      printf("RG_mixed CG last inner residue:       %17g\n", rho_sp);
+      printf("RG_mixed CG true residue:             %6d %10g\n", iter_in_sp+iter_in_dp+iter_out, rho_dp);
+      printf("RG_mixed CG residue reduction factor: %6d %10g\n", iter_in_sp+iter_in_dp+iter_out, beta_dp); fflush(stdout);
+    }
+
+    if( rho_dp <= target_eps_sq || (iter_in_sp+iter_in_dp+iter_out) >= max_iter ) {
+      etime = gettime();
+      output_flops(etime-atime, N, iter_out, iter_in_sp, iter_in_dp, eps_sq);
+      
+      g_sloppy_precision_flag = save_sloppy;
+      finalize_solver(solver_field, nr_sf);
+      finalize_solver_32(solver_field32, nr_sf32);
+      if( (iter_in_sp+iter_in_dp+iter_out) >= max_iter ){ // checked first: -1 even if rho_dp already meets the target
+        return(-1);
+      } else {
+        return(iter_in_sp+iter_in_dp+iter_out);
+      }
+    }
+
+    // if it seems like we're stuck and reaching the iteration limit, we skip this correction and proceed in full precision above
+    if( iter_out >= (N_outer-2) ){
+      if(g_proc_id==0) printf("mixed CG: Reaching iteration limit, switching to DP!\n");
+      high_control = 1;
+      continue;
+    }else{
+      // correct defect
+      assign_to_32(r,rhigh,N);
+      rho_sp = rho_dp; // not sure if it's fine to truncate this or whether one should calculate it in SP directly, it seems to work fine though
+      assign_32(p,r,N);
+    }
+
+    zero_spinor_field_32(x,N);
+    iter_in_sp += inner_loop(x, p, q, r, &rho_sp, delta, f32, (float)target_eps_sq, 
+                             N, iter_out+iter_in_sp+iter_in_dp, max_iter, 0.0, 0.0, MCG_NO_PIPELINED, MCG_NO_PR);
+  }
+  
+  // convergence failure: the N_outer bound was exhausted without meeting the target
+  g_sloppy_precision_flag = save_sloppy;
+  finalize_solver(solver_field, nr_sf);
+  finalize_solver_32(solver_field32, nr_sf32);
+  return -1; 
+}
+
+/* Debug summary (g_debug_level > 0, process 0 only): iteration counts, wall time and a rough
+ * flop estimate which the FIXME line printed below itself declares unreliable. */
+static void output_flops(const double seconds, const unsigned int N, const unsigned int iter_out, const unsigned int iter_in_sp, const unsigned int iter_in_dp, const double eps_sq){
+  // TODO: compute real number of flops...
+  const unsigned int total_it = iter_in_sp+iter_in_dp+iter_out;
+  double flops;
+  if(g_debug_level > 0 && g_proc_id == 0) {
+    printf("# RG_mixed CG: iter_out: %u iter_in_sp: %u iter_in_dp: %u\n",iter_out,iter_in_sp,iter_in_dp);
+    if(N != VOLUME){
+      /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */
+      /* 2*1608.0 because the linalg is over VOLUME/2 */
+      flops = (2*(2*1608.0+2*3*4) + 2*3*4 + total_it*(2.*(2*1608.0+2*3*4) + 10*3*4))*N/1.0e6;
+    }else{
+      /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */
+      flops = (2*(1608.0+2*3*4) + 2*3*4 + total_it*(2.*(1608.0+2*3*4) + 10*3*4))*N/1.0e6;
+    }
+    printf("#RG_mixed CG: iter: %u eps_sq: %1.4e t/s: %1.4e\n", total_it, eps_sq, seconds);
+    printf("# FIXME: note the following flop counts are wrong! Consider only the time to solution!\n");
+    printf("#RG_mixed CG: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", seconds, flops/(seconds), g_nproc*flops/(seconds));
+  }
+}
diff --git a/solver/rg_mixed_cg_her.h b/solver/rg_mixed_cg_her.h
new file mode 100644
index 000000000..dade21af9
--- /dev/null
+++ b/solver/rg_mixed_cg_her.h
@@ -0,0 +1,32 @@
+/***********************************************************************
+ * Copyright (C) 2015 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+#ifndef _RG_MIXED_CG_HER_H
+#define _RG_MIXED_CG_HER_H
+
+#include "operator/tm_operators_32.h"
+#include "solver/rg_mixed_cg_typedef.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_params.h"
+#include "su3.h"
+
+int rg_mixed_cg_her(spinor * const P, spinor * const Q, solver_params_t solver_params,
+                    const int max_iter, const double eps_sq, const int rel_prec,
+                    const int N, matrix_mult f, matrix_mult32 f32);
+
+#endif
diff --git a/solver/rg_mixed_cg_her_nd.c b/solver/rg_mixed_cg_her_nd.c
new file mode 100644
index 000000000..8490b6090
--- /dev/null
+++ b/solver/rg_mixed_cg_her_nd.c
@@ -0,0 +1,370 @@
+/***********************************************************************
+ * Copyright (C) 2016 Bartosz Kostrzewa, Florian Burger
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ **********************
+ * rg_mixed_cg_her_nd *
+ **********************
+ *
+ * Mixed precision solver which uses true reliable updates and has a double
+ * precision fail-safe mechanism. The Polak-Ribiere computation of beta is
+ * implemented but currently not used because the extra scalar product is
+ * more expensive than the gain from the self-stabilisation as far as has 
+ * been tested.
+ *
+ * in:
+ *   Q: source
+ * in/out:
+ *   P: initial guess on entry (used when use_initial_guess is set), result on exit
+ *
+ * POSSIBLE IMPROVEMENTS
+ * There are still quite a few things that can be tried to make it better,
+ * the most significant of which would be to guide the search direction
+ * using the previous one upon restart. However, it seems that for the number
+ * of non-zero entries in the Dirac operator and usual lattice sizes, the
+ * requisite projection 
+ *
+ *   p' = r - <r,Ap>/<p,Ap> p
+ *
+ * cannot be computed with sufficient precision in 64 bit arithmetic. It should
+ * be noted that for L < 24 in general, this does work and produces
+ * a mixed solver which converges at the same rate as a double solver, but it's
+ * not generally useable... For point sources, it also works for larger lattice 
+ * volumes. Might be introduced as an optional mode in the future with some
+ * fail-safe mechanism which detects if the search direction begins to diverge.
+ *
+ **************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "operator/tm_operators_32.h"
+#include "operator/clovertm_operators_32.h"
+#include "solver/matrix_mult_typedef_nd.h"
+#include "solver/solver_params.h"
+#include "read_input.h"
+
+#include "solver_field.h"
+#include "solver/rg_mixed_cg_her.h"
+#include "gettime.h"
+
+static void output_flops(const double seconds, const unsigned int N, const unsigned int iter_out, 
+                  const unsigned int iter_in_sp, const unsigned int iter_in_dp, const double eps_sq);
+
+/* Double precision CG inner loop for the flavour pair (up,dn). On entry r is expected to be the
+ * residual with squared norm *rho1 and p the search direction; q is scratch for f applied to p.
+ * Runs until rho <= delta*rhomax (rhomax: largest iterated rho seen in this call), 1.3*rho < eps_sq
+ * or j+iter > max_iter. Returns the iterations done; *rho1 ends at the last iterated rho. */
+static inline unsigned int inner_loop_high(spinor * const x_up, spinor * const x_dn, 
+                                           spinor * const p_up, spinor * const p_dn,
+                                           spinor * const q_up, spinor * const q_dn,
+                                           spinor * const r_up, spinor * const r_dn, 
+                                           double * const rho1, const double delta,
+                                           matrix_mult_nd f, const double eps_sq, const unsigned int N, const unsigned int iter, const unsigned int max_iter ){
+  /* plain automatic locals: no state needs to survive between calls */
+  unsigned int j = 0;
+  double rho = *rho1;
+  double rhomax = *rho1;
+
+  while( rho > delta*rhomax && j+iter <= max_iter ){
+    ++j;
+    f(q_up,q_dn,p_up,p_dn);
+    const double alpha = rho/( scalar_prod_r(p_up,q_up,N,1) + scalar_prod_r(p_dn,q_dn,N,1) );
+    assign_add_mul_r(x_up, p_up, alpha, N); assign_add_mul_r(x_dn, p_dn, alpha, N);
+    assign_add_mul_r(r_up, q_up, -alpha, N); assign_add_mul_r(r_dn, q_dn, -alpha, N);
+    rho = ( square_norm(r_up,N,1) + square_norm(r_dn,N,1) );
+    const double beta = rho / *rho1;
+    *rho1 = rho;
+    assign_mul_add_r(p_up, beta, r_up, N); assign_mul_add_r(p_dn, beta, r_dn, N);
+
+    /* enforce convergence more strictly by a factor of 1.3 to avoid unnecessary restarts
+     * if the real residual is still a bit too large */
+    if( 1.3*rho < eps_sq ) break;
+    if( rho > rhomax ) rhomax = rho;
+
+    if(g_debug_level > 2 && g_proc_id == 0) {
+      printf("DP_inner CG: %u res^2 %g\t\n", j+iter, rho);
+    }
+  }
+
+  return j;
+}
+
+/* Single precision CG inner loop for the flavour pair (up,dn). Runs until rho <= delta*rhomax
+ * (rhomax: largest iterated rho seen in this call), 1.3*rho < eps_sq or j+iter > max_iter.
+ * pipelined selects the loop ordering, pr the Polak-Ribiere form of beta; alpha and beta are only
+ * read as start values by the pipelined loop. Returns the iterations done; *rho1 ends at the last rho. */
+static inline unsigned int inner_loop(spinor32 * const x_up, spinor32 * const x_dn, 
+                                      spinor32 * const p_up, spinor32 * const p_dn, 
+                                      spinor32 * const q_up, spinor32 * const q_dn,
+                                      spinor32 * const r_up, spinor32 * const r_dn,
+                                      float * const rho1, const float delta,
+                                      matrix_mult_nd32 f32, const float eps_sq, const unsigned int N, const unsigned int iter, const unsigned int max_iter,
+                                      float alpha, float beta, MCG_PIPELINED_TYPE pipelined, MCG_PR_TYPE pr ){
+  unsigned int j = 0;
+  float rho = *rho1, rhomax = *rho1, pro; /* automatic: nothing has to persist between calls */
+
+  if(pipelined==MCG_NO_PIPELINED){
+    /* break out of inner loop if iterated residual goes below some fraction of the maximum observed
+    * iterated residual since the last update */ 
+    while( rho > delta*rhomax && j+iter <= max_iter ){
+      ++j;
+      f32(q_up,q_dn,p_up,p_dn);
+      pro = ( scalar_prod_r_32(p_up,q_up,N,1) + scalar_prod_r_32(p_dn,q_dn,N,1) );
+      alpha = rho/pro;
+      assign_add_mul_r_32(x_up, p_up, alpha, N); assign_add_mul_r_32(x_dn, p_dn, alpha, N);
+      assign_add_mul_r_32(r_up, q_up, -alpha, N); assign_add_mul_r_32(r_dn, q_dn, -alpha, N);
+      rho = ( square_norm_32(r_up,N,1) + square_norm_32(r_dn,N,1) );
+      // Polak-Ribiere computation of beta, claimed to be self-stabilising, positive effect so far not observed or required
+      if(pr==MCG_PR){
+        beta = alpha*(alpha*(square_norm_32(q_up,N,1)+square_norm_32(q_dn,N,1)) - pro) / *rho1;
+      }else{
+        beta = rho / *rho1;
+      }
+      *rho1 = rho;
+      assign_mul_add_r_32(p_up, beta, r_up, N); assign_mul_add_r_32(p_dn, beta, r_dn, N);
+      if(g_debug_level > 2 && g_proc_id == 0) {
+        printf("SP_inner CG_ND: %u res^2 %g\t\n", j+iter, rho);
+      }
+       /* enforce convergence more strictly by a factor of 1.3 to avoid unnecessary restarts 
+       * if the real residual is still a bit too large */
+      if( 1.3*rho < eps_sq ) break;
+      if( rho > rhomax ) rhomax = rho;
+    }
+  }else{
+    // pipelined cg requires one more scalar product but may allow optimisations to be made
+    // e.g.: one could do the collective communication for sqrnrm(r) while other stuff is being computed
+    // it is also self-initialising (alpha=0, beta=0 will work)
+    while( rho > delta*rhomax && j+iter <= max_iter ){
+      ++j;
+      assign_add_mul_r_32(x_up, p_up, alpha, N); assign_add_mul_r_32(x_dn, p_dn, alpha, N);
+      assign_add_mul_r_32(r_up, q_up, -alpha, N); assign_add_mul_r_32(r_dn, q_dn, -alpha, N);
+      assign_mul_add_r_32(p_up, beta, r_up, N); assign_mul_add_r_32(p_dn, beta, r_dn, N);
+      f32(q_up,q_dn,p_up,p_dn);
+  
+      rho = ( square_norm_32(r_up,N,1) + square_norm_32(r_dn,N,1) );
+      pro = ( scalar_prod_r_32(p_up,q_up,N,1) + scalar_prod_r_32(p_dn,q_dn,N,1) );
+      alpha = rho/pro;
+      if(pr==MCG_PR){
+        beta = alpha*(alpha*(square_norm_32(q_up,N,1)+square_norm_32(q_dn,N,1))-pro)/rho;
+      }else{
+        beta = rho/ *rho1;
+      }
+      *rho1=rho;
+
+      if(g_debug_level > 2 && g_proc_id == 0) {
+        printf("SP_inner CG_ND: %u res^2 %g\t\n", j+iter, rho);
+      }
+      if( 1.3*rho < eps_sq ) break;
+      if( rho > rhomax ) rhomax = rho;
+    }
+  }
+
+  return j;
+}
+
+/* Mixed-precision CG with reliable updates for the flavour doublet system f(P) = Q.
+ * P_up/P_dn: solution, accumulated in place; read as the starting guess if
+ *   solver_params.use_initial_guess != 0. Q_up/Q_dn: source. eps_sq: target squared residual,
+ *   multiplied by |Q|^2 if rel_prec == 1. f/f32: the operator in double/single precision.
+ * Returns the total iteration count (outer + single + double precision inner iterations) on
+ * convergence and -1 if max_iter or the outer iteration budget N_outer is exhausted. */
+int rg_mixed_cg_her_nd(spinor * const P_up, spinor * const P_dn, spinor * const Q_up, spinor * const Q_dn, 
+                       solver_params_t solver_params, const int max_iter, const double eps_sq, const int rel_prec,
+                       const int N, matrix_mult_nd f, matrix_mult_nd32 f32) {
+  int iter_in_sp = 0, iter_in_dp = 0, iter_out = 0;
+  float rho_sp, delta = solver_params.mcg_delta;
+  double beta_dp, rho_dp;
+  double sourcesquarenorm, target_eps_sq;
+  spinor *xhigh_up, *xhigh_dn, *rhigh_up, *rhigh_dn, *qhigh_up, *qhigh_dn, *phigh_up, *phigh_dn;
+  spinor32 *x_up, *x_dn, *p_up, *p_dn, *q_up, *q_dn, *r_up, *r_dn;
+
+  spinor ** solver_field = NULL;
+  spinor32 ** solver_field32 = NULL;
+  const int nr_sf = 8;
+  const int nr_sf32 = 8;
+  int high_control = 0;   // set to 1 once the double precision fail-safe takes over
+  double atime, etime;
+
+  if(N == VOLUME) {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+    init_solver_field_32(&solver_field32, VOLUMEPLUSRAND, nr_sf32);
+  }
+  else {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
+    init_solver_field_32(&solver_field32, VOLUMEPLUSRAND/2, nr_sf32);
+  }
+
+  atime = gettime();
+
+  // we could get away with using fewer fields, of course
+  phigh_up = solver_field[7]; phigh_dn = solver_field[6];
+  xhigh_up = solver_field[5]; xhigh_dn = solver_field[4];
+  rhigh_up = solver_field[3]; rhigh_dn = solver_field[2];
+  qhigh_up = solver_field[1]; qhigh_dn = solver_field[0];
+
+  x_up = solver_field32[7]; x_dn = solver_field32[6];
+  r_up = solver_field32[5]; r_dn = solver_field32[4];
+  p_up = solver_field32[3]; p_dn = solver_field32[2];
+  q_up = solver_field32[1]; q_dn = solver_field32[0];
+
+  // we always want to apply the full precision operator in double
+  int save_sloppy = g_sloppy_precision_flag;
+  g_sloppy_precision_flag = 0;
+
+  sourcesquarenorm = ( square_norm(Q_up,N,1) + square_norm(Q_dn,N,1) );
+  if( rel_prec == 1 ) {
+    target_eps_sq = eps_sq*sourcesquarenorm;
+    if(g_debug_level > 0 && g_proc_id==0) 
+      printf("#RG_Mixed CG_ND: Using relative precision! eps_sq: %.6g target_eps_sq: %.6g \n",eps_sq,target_eps_sq);
+  }else{
+    target_eps_sq = eps_sq;
+  }
+
+  // compute the maximum number of outer iterations based on the expected reduction
+  // of the residual at each run of the inner solver
+  int N_outer = (int)ceil(log10( sourcesquarenorm*delta/target_eps_sq ));
+  if(g_debug_level > 0 && g_proc_id==0) 
+    printf("#RG_Mixed CG_ND: N_outer: %d \n", N_outer);
+
+  zero_spinor_field_32(x_up,N); zero_spinor_field_32(x_dn,N);
+  // NOTE(review): without an initial guess the residual is set to Q, i.e. P is assumed zero on entry - confirm callers clear P
+  if(solver_params.use_initial_guess == 0) {
+    assign(phigh_up,Q_up,N); assign(phigh_dn,Q_dn,N);
+    assign(rhigh_up,Q_up,N); assign(rhigh_dn,Q_dn,N);
+    rho_dp = sourcesquarenorm;
+  } else {
+    // residual of the initial guess: r = Q - f(P)
+    f(rhigh_up,rhigh_dn,P_up,P_dn);
+    diff(rhigh_up,Q_up,rhigh_up,N); diff(rhigh_dn,Q_dn,rhigh_dn,N);
+    assign(phigh_up,rhigh_up,N); assign(phigh_dn,rhigh_dn,N);
+    rho_dp = ( square_norm(rhigh_up,N,1) + square_norm(rhigh_dn,N,1) );
+  }
+
+  assign_to_32(r_up,rhigh_up,N); assign_to_32(r_dn,rhigh_dn,N);
+  rho_sp = rho_dp;
+  assign_32(p_up,r_up,N); assign_32(p_dn,r_dn,N);
+
+  iter_in_sp += inner_loop(x_up, x_dn, p_up, p_dn, q_up, q_dn, r_up, r_dn, &rho_sp, delta, 
+                           f32, (float)target_eps_sq, 
+                           N, iter_out+iter_in_sp+iter_in_dp, max_iter, 0.0, 0.0, MCG_NO_PIPELINED, MCG_NO_PR);
+
+  for(iter_out = 1; iter_out < N_outer; ++iter_out ) {
+
+    // prepare for defect correction
+    // update high precision solution 
+    if(high_control==0) {
+      // accumulate solution (sp -> dp) 
+      addto_32(P_up,x_up,N); addto_32(P_dn,x_dn,N);
+      // compute real residual
+      f(qhigh_up,qhigh_dn,P_up,P_dn);
+      diff(rhigh_up,Q_up,qhigh_up,N); diff(rhigh_dn,Q_dn,qhigh_dn,N);
+      beta_dp = 1/rho_dp;
+      rho_dp = ( square_norm(rhigh_up,N,1) + square_norm(rhigh_dn,N,1) );
+      beta_dp *= rho_dp;
+    }
+
+    // the iteration limit was reached in the previous iteration, let's try to save the day using double precision
+    if( high_control==1 ) {
+      assign(phigh_up,rhigh_up,N); assign(phigh_dn,rhigh_dn,N);
+      zero_spinor_field(xhigh_up,N); zero_spinor_field(xhigh_dn,N);
+      beta_dp = 1/rho_dp;
+      iter_in_dp += inner_loop_high(xhigh_up, xhigh_dn, phigh_up, phigh_dn,
+                                    qhigh_up, qhigh_dn, rhigh_up, rhigh_dn, &rho_dp, delta, f, 
+                                    target_eps_sq, N, iter_out+iter_in_sp+iter_in_dp, max_iter);
+      rho_sp = rho_dp; // last iterated residual, only used for the debug output below
+      // accumulate solution
+      add(P_up,P_up,xhigh_up,N); add(P_dn, P_dn, xhigh_dn, N);
+      // compute real residual
+      f(qhigh_up, qhigh_dn, P_up, P_dn);
+      diff(rhigh_up,Q_up,qhigh_up,N); diff(rhigh_dn,Q_dn,qhigh_dn,N);
+      rho_dp = ( square_norm(rhigh_up,N,1) + square_norm(rhigh_dn,N,1) );
+      beta_dp *= rho_dp;
+    }
+
+    if(g_debug_level > 2 && g_proc_id == 0) {
+      printf("RG_mixed CG_ND last inner residue:       %17g\n", rho_sp);
+      printf("RG_mixed CG_ND true residue:             %6d %10g\n", iter_in_sp+iter_in_dp+iter_out, rho_dp);
+      printf("RG_mixed CG_ND residue reduction factor: %6d %10g\n", iter_in_sp+iter_in_dp+iter_out, beta_dp); fflush(stdout);
+    }
+
+    if( rho_dp <= target_eps_sq || (iter_in_sp+iter_in_dp+iter_out) >= max_iter ) {
+      etime = gettime();
+      output_flops(etime-atime, N, iter_out, iter_in_sp, iter_in_dp, eps_sq);
+
+      g_sloppy_precision_flag = save_sloppy;
+      finalize_solver(solver_field, nr_sf);
+      finalize_solver_32(solver_field32, nr_sf32); 
+      if( (iter_in_sp+iter_in_dp+iter_out) >= max_iter ){
+        return(-1);
+      } else {
+        return(iter_in_sp+iter_in_dp+iter_out);
+      }
+    }
+
+    // if it seems like we're stuck and reaching the iteration limit, we skip this correction and proceed in full precision above
+    if( iter_out >= (N_outer-2) ){
+      if(g_proc_id==0) printf("RG_mixed CG_ND: Reaching iteration limit, switching to DP!\n");
+      high_control = 1;
+      continue;
+    }else{
+      // correct defect
+      assign_to_32(r_up,rhigh_up,N); assign_to_32(r_dn,rhigh_dn,N);
+      rho_sp = rho_dp; // not sure if it's fine to truncate this or whether one should calculate it in SP directly, it seems to work fine though
+      assign_32(p_up,r_up,N); assign_32(p_dn,r_dn,N);
+    }
+
+    zero_spinor_field_32(x_up,N); zero_spinor_field_32(x_dn,N);
+    iter_in_sp += inner_loop(x_up, x_dn, p_up, p_dn, q_up, q_dn, r_up, r_dn, &rho_sp, delta, f32, (float)target_eps_sq, 
+                             N, iter_out+iter_in_sp+iter_in_dp, max_iter, 0.0, 0.0, MCG_NO_PIPELINED, MCG_NO_PR);
+  }
+
+  // convergence failure...
+  g_sloppy_precision_flag = save_sloppy;
+  finalize_solver(solver_field, nr_sf);
+  finalize_solver_32(solver_field32, nr_sf32);
+  return -1; 
+}
+
+/* Print iteration counts and a rough MFlop/s estimate for a finished solve; output only happens
+ * if g_debug_level > 0 and g_proc_id == 0. The flop formula is flagged as wrong in the output itself. */
+static void output_flops(const double seconds, const unsigned int N, const unsigned int iter_out, const unsigned int iter_in_sp, const unsigned int iter_in_dp, const double eps_sq){
+  double flops;
+  // TODO: compute real number of flops...
+  int total_it = iter_in_sp+iter_in_dp+iter_out;
+  if(g_debug_level > 0 && g_proc_id == 0) {
+    printf("# RG_mixed CG_ND: iter_out: %u iter_in_sp: %u iter_in_dp: %u\n",iter_out,iter_in_sp,iter_in_dp);
+    /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */
+    if(N != VOLUME){
+      /* 2*1608.0 because the linalg is over VOLUME/2 */
+      flops = 2*(2*(2*1608.0+2*3*4) + 2*3*4 + total_it*(2.*(2*1608.0+2*3*4) + 10*3*4))*N/1.0e6f;
+    }else{
+      flops = 2*(2*(1608.0+2*3*4) + 2*3*4 + total_it*(2.*(1608.0+2*3*4) + 10*3*4))*N/1.0e6f;
+    }
+    printf("#RG_mixed CG_ND: iter: %d eps_sq: %1.4e t/s: %1.4e\n", total_it, eps_sq, seconds);
+    printf("# FIXME: note the following flop counts are wrong! Consider only the time to solution!\n");
+    printf("#RG_mixed CG_ND: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n",
+           seconds, flops/(seconds), g_nproc*flops/(seconds));
+  }
+}
diff --git a/solver/rg_mixed_cg_her_nd.h b/solver/rg_mixed_cg_her_nd.h
new file mode 100644
index 000000000..4a093091f
--- /dev/null
+++ b/solver/rg_mixed_cg_her_nd.h
@@ -0,0 +1,32 @@
+/***********************************************************************
+ * Copyright (C) 2016 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+#ifndef _RG_MIXED_CG_HER_ND_H
+#define _RG_MIXED_CG_HER_ND_H
+
+#include "operator/tm_operators_32.h"
+#include "solver/matrix_mult_typedef_nd.h"
+#include "solver/solver_params.h"
+#include "solver/rg_mixed_cg_typedef.h"
+#include "su3.h"
+
+int rg_mixed_cg_her_nd(spinor * const Pup, spinor * const Pdn, spinor * const Qup, spinor * const Qdn,
+                    solver_params_t solver_params, const int max_iter, const double eps_sq, const int rel_prec,
+                    const int N, matrix_mult_nd f, matrix_mult_nd32 f32); /* returns the total iteration count, -1 on failure */
+
+#endif
diff --git a/solver/rg_mixed_cg_typedef.h b/solver/rg_mixed_cg_typedef.h
new file mode 100644
index 000000000..61c367342
--- /dev/null
+++ b/solver/rg_mixed_cg_typedef.h
@@ -0,0 +1,21 @@
+#ifndef _RG_MIXED_CG_HER_TYPEDEF_H
+#define _RG_MIXED_CG_HER_TYPEDEF_H
+
+typedef enum MCG_PR_TYPE {
+  MCG_NO_PR=0,          // beta is the ratio of the new to the previous squared residual
+  MCG_PR                // beta is computed in its Polak-Ribiere form
+} MCG_PR_TYPE;
+
+typedef enum MCG_PIPELINED_TYPE {
+  MCG_NO_PIPELINED=0,   // inner loop applies the operator first, then updates x, r and p
+  MCG_PIPELINED         // inner loop updates x, r and p first, then applies the operator
+} MCG_PIPELINED_TYPE;
+
+// currently not used
+typedef enum MCG_RESGUIDE_TYPE {
+  MCG_NO_RESGUIDE=0,
+  MCG_RESGUIDE
+} MCG_RESGUIDE_TYPE;
+     
+
+#endif
diff --git a/solver/solver.h b/solver/solver.h
index 5f0b97051..0591dfbc0 100644
--- a/solver/solver.h
+++ b/solver/solver.h
@@ -20,52 +20,19 @@
 #ifndef _SOLVER_H
 #define _SOLVER_H
 
-#define BICGSTAB 0
-#define CG 1
-#define GMRES 2
-#define CGS 3
-#define MR 4
-#define BICGSTABELL 5
-#define FGMRES 6
-#define GCR 7
-#define GMRESDR 8
-#define PCG 9
-#define DFLGCR 10
-#define DFLFGMRES 11
-#define CGMMS 12
-#define MIXEDCG 13
-#define CGMMSND 14
-#define INCREIGCG 15
 
+#include"solver/solver_types.h"
 #include"solver/matrix_mult_typedef.h"
 #include "solver/matrix_mult_typedef_bi.h"
 #include "solver/matrix_mult_typedef_nd.h"
 
-typedef struct {
-  // solver type
-  int type;
-  // maximal number of iterations
-  int max_iter;
-  // use relative precision
-  int rel_prec;
-  // number of shifts in multi shift solvers
-  int no_shifts;
-  // dimension of spinors
-  int sdim;
-  // squared desired residue
-  double squared_solver_prec;
-  // single flavour matrix to invert
-  matrix_mult M_psi;
-  // flavour doublet matrix to invert
-  matrix_mult_nd M_ndpsi;
-  // pointer to array of shifts
-  double * shifts;
-} solver_pm_t;
+#include "solver/solver_params.h"
 
 #include"solver/gmres.h"
 #include"solver/gmres_dr.h"
 #include"solver/fgmres.h"
 #include"solver/bicgstab_complex.h"
+#include"solver/bicg_complex.h"
 #include"solver/cgs_real.h"
 #include"solver/bicgstabell.h"
 #include"solver/bicgstab2.h"
@@ -77,6 +44,9 @@ typedef struct {
 #include"solver/eigenvalues.h"
 #include"solver/cg_mms_tm.h"
 #include"solver/mixed_cg_her.h"
+#include"solver/mcr.h"
+#include"solver/cr.h"
+#include "solver/rg_mixed_cg_her.h"
 
 #include"solver/sub_low_ev.h"
 #include"solver/gmres_precon.h"
@@ -86,9 +56,14 @@ typedef struct {
 #include "solver/cg_her_bi.h"
 
 #include "solver/cg_her_nd.h"
+#include "solver/rg_mixed_cg_her_nd.h"
 #include"solver/cg_mms_tm_nd.h"
+#include"solver/mixed_cg_mms_tm_nd.h"
 
 #include "solver/generate_dfl_subspace.h"
 
+#include "solver/sumr.h"
+
+#include "solver/monomial_solve.h"
 
 #endif
diff --git a/solver/solver_field.c b/solver/solver_field.c
index a369ae78c..f52897a3c 100644
--- a/solver/solver_field.c
+++ b/solver/solver_field.c
@@ -20,7 +20,7 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include<stdlib.h>
 #include<errno.h>
@@ -72,6 +72,52 @@ void finalize_solver(spinor ** solver_field, const int nr){
 }
 
 
+
+
+
+/* Allocate nr single precision spinor fields of V sites each in one contiguous chunk.
+ * On success returns 0 with (*solver_field)[0..nr-1] pointing at the fields and (*solver_field)[nr]
+ * at the raw allocation; returns 2 (table) or 1 (chunk) with *solver_field == NULL on failure. */
+int init_solver_field_32(spinor32 *** const solver_field, const int V, const int nr) {
+  /* allocate nr+1 to save the linear field in solver_field[nr] */
+  *solver_field = malloc((nr+1)*sizeof(spinor32*));
+  if(*solver_field == NULL) {
+    fprintf (stderr, "malloc errno in init_solver_field_32: %d\n",errno);
+    errno = 0;
+    return(2);
+  }
+
+  /* allocate the full chunk of memory to solver_field[nr] */
+#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+  spinor32 *chunk = shmalloc((nr*V+1)*sizeof(spinor32));
+#else
+  spinor32 *chunk = calloc(nr*V+1, sizeof(spinor32));
+#endif
+  if(chunk == NULL) {
+    fprintf (stderr, "malloc errno in init_solver_field_32: %d\n",errno);
+    errno = 0;
+    /* do not leak the pointer table; hand back a NULL handle instead */
+    free(*solver_field);
+    *solver_field = NULL;
+    return(1);
+  }
+  (*solver_field)[nr] = chunk;
+
+  /* now cut in pieces and distribute to solver_field[0]-solver_field[nr-1];
+   * the start is adjusted with the ALIGN_BASE32 mask in every build configuration */
+  (*solver_field)[0] = (spinor32*)(((unsigned long int)chunk+ALIGN_BASE32)&~ALIGN_BASE32);
+  for(int i = 1; i < nr; i++){
+    (*solver_field)[i] = (*solver_field)[i-1]+V;
+  }
+  return(0);
+}
+
+/* Release a field set obtained from init_solver_field_32. */
+void finalize_solver_32(spinor32 ** solver_field, const int nr){
+  free(solver_field[nr]);  /* the data chunk; NOTE(review): shmalloc'ed under _USE_SHMEM - is free() right there? */
+  free(solver_field);      /* the pointer table; the caller's own pointer is not reset */
+}
+
 int init_bisolver_field(bispinor *** const solver_field, const int V, const int nr) {
   int i=0;
 
@@ -106,3 +152,92 @@ void finalize_bisolver(bispinor ** solver_field, const int nr) {
   free(solver_field);
   solver_field = NULL;
 }
+
+
+/* little solver fields (deflation): allocate nr fields of V complex doubles in one chunk.
+ * On success returns 0 with (*solver_field)[0..nr-1] pointing at the fields and (*solver_field)[nr]
+ * at the raw allocation; returns 2 (table) or 1 (chunk) with *solver_field == NULL on failure. */
+int init_lsolver_field(_Complex double *** const solver_field, const int V, const int nr) {
+  /* allocate nr+1 to save the linear field in solver_field[nr] */
+  *solver_field = malloc((nr+1)*sizeof(_Complex double*));
+  if(*solver_field == NULL) {
+    fprintf (stderr, "malloc errno in init_lsolver_field: %d\n",errno);
+    errno = 0;
+    return(2);
+  }
+
+  /* allocate the full chunk of memory to solver_field[nr] */
+#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+  _Complex double *chunk = shmalloc((nr*V+1)*sizeof(_Complex double));
+#else
+  _Complex double *chunk = calloc(nr*V+1, sizeof(_Complex double));
+#endif
+  if(chunk == NULL) {
+    fprintf (stderr, "malloc errno in init_lsolver_field: %d\n",errno);
+    errno = 0;
+    free(*solver_field); *solver_field = NULL; /* do not leak the pointer table */
+    return(1);
+  }
+  (*solver_field)[nr] = chunk;
+
+  /* now cut in pieces and distribute to solver_field[0]-solver_field[nr-1] */
+#if ( defined SSE || defined SSE2 || defined SSE3)
+  (*solver_field)[0] = (_Complex double*)(((unsigned long int)chunk+ALIGN_BASE)&~ALIGN_BASE);
+#else
+  (*solver_field)[0] = chunk;
+#endif
+  for(int i = 1; i < nr; i++){
+    (*solver_field)[i] = (*solver_field)[i-1]+V;
+  }
+  return(0);
+}
+
+/* Release a field set obtained from init_lsolver_field. */
+void finalize_lsolver(_Complex double ** solver_field, const int nr){
+  free(solver_field[nr]);  /* the data chunk; NOTE(review): shmalloc'ed under _USE_SHMEM - is free() right there? */
+  free(solver_field);      /* the pointer table; the caller's own pointer is not reset */
+}
+
+/* little solver fields (deflation): allocate nr fields of V complex floats in one chunk.
+ * On success returns 0 with (*solver_field)[0..nr-1] pointing at the fields and (*solver_field)[nr]
+ * at the raw allocation; returns 2 (table) or 1 (chunk) with *solver_field == NULL on failure. */
+int init_lsolver_field_32(_Complex float *** const solver_field, const int V, const int nr) {
+  /* allocate nr+1 to save the linear field in solver_field[nr] */
+  *solver_field = malloc((nr+1)*sizeof(_Complex float*));
+  if(*solver_field == NULL) {
+    fprintf (stderr, "malloc errno in init_lsolver_field_32: %d\n",errno);
+    errno = 0;
+    return(2);
+  }
+
+  /* allocate the full chunk of memory to solver_field[nr] */
+#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+  _Complex float *chunk = shmalloc((nr*V+1)*sizeof(_Complex float));
+#else
+  _Complex float *chunk = calloc(nr*V+1, sizeof(_Complex float));
+#endif
+  if(chunk == NULL) {
+    fprintf (stderr, "malloc errno in init_lsolver_field_32: %d\n",errno);
+    errno = 0;
+    free(*solver_field); *solver_field = NULL; /* do not leak the pointer table */
+    return(1);
+  }
+  (*solver_field)[nr] = chunk;
+
+  /* now cut in pieces and distribute to solver_field[0]-solver_field[nr-1] */
+#if ( defined SSE || defined SSE2 || defined SSE3)
+  (*solver_field)[0] = (_Complex float*)(((unsigned long int)chunk+ALIGN_BASE)&~ALIGN_BASE);
+#else
+  (*solver_field)[0] = chunk;
+#endif
+  for(int i = 1; i < nr; i++){
+    (*solver_field)[i] = (*solver_field)[i-1]+V;
+  }
+  return(0);
+}
+
+/* Release a field set obtained from init_lsolver_field_32. */
+void finalize_lsolver_32(_Complex float ** solver_field, const int nr){
+  free(solver_field[nr]);  /* the data chunk; NOTE(review): shmalloc'ed under _USE_SHMEM - is free() right there? */
+  free(solver_field);      /* the pointer table; the caller's own pointer is not reset */
+}
diff --git a/solver/solver_field.h b/solver/solver_field.h
index 2488d41dc..22c1e7ec4 100644
--- a/solver/solver_field.h
+++ b/solver/solver_field.h
@@ -26,6 +26,14 @@
 
 int init_solver_field(spinor *** const solver_field, const int V, const int nr);
 void finalize_solver(spinor ** solver_field, const int nr);
+int init_solver_field_32(spinor32 *** const solver_field, const int V, const int nr);
+void finalize_solver_32(spinor32 ** solver_field, const int nr);
 int init_bisolver_field(bispinor *** const solver_field, const int V, const int nr);
 void finalize_bisolver(bispinor ** solver_field, const int nr);
+/* little solver fields (deflation) */
+int init_lsolver_field(_Complex double *** const solver_field, const int V, const int nr);
+void finalize_lsolver(_Complex double ** solver_field, const int nr);
+int init_lsolver_field_32(_Complex float *** const solver_field, const int V, const int nr);
+void finalize_lsolver_32(_Complex float ** solver_field, const int nr);
+
 #endif
diff --git a/solver/solver_params.h b/solver/solver_params.h
index 8ca0569a5..8c301e46d 100644
--- a/solver/solver_params.h
+++ b/solver/solver_params.h
@@ -1,5 +1,6 @@
 /***************************************************************************
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *               2017                               Bartosz Kostrzewa
  *
  * This file is part of tmLQCD.
  *
@@ -27,8 +28,18 @@
  ****************************************************************************/
 
 
-#ifndef _SOLVER_PARAMS_H
-#define _SOLVER_PARAMS_H
+#ifndef SOLVER_PARAMS_H
+#define SOLVER_PARAMS_H
+
+#include "solver/matrix_mult_typedef.h"
+#include "solver/matrix_mult_typedef_nd.h"
+
+#include "misc_types.h"
+
+typedef enum solution_type_t {
+  TM_SOLUTION_M_MDAG = 0,
+  TM_SOLUTION_M
+} solution_type_t;
 
 typedef struct {
 
@@ -49,11 +60,44 @@ typedef struct {
                              Example, to solve the linear systems to squared residual 1e-16, one chooses eigcg_restolsq=1e-8 or smaller 
                              This will specify how many times deflated CG restaretd in the second phase (after eigenvectors has been computed)*/
   int eigcg_rand_guess_opt; /*set to 0 to use 0 initial guesses or non-zero values if you want to use random initial guess as a volume source */
-} solver_params_t;
 
+  /* factor below which the iterated residual has to drop to trigger a
+     reliable update in the mixed solver:
+       if( <r,r> < delta * max( <r,r> ) )
+     where the maximum is over the iterated residuals since the last update */
+  float mcg_delta; 
 
+  // solver type
+  int type;
+  // maximal number of iterations
+  int max_iter;
+  // use relative precision
+  int rel_prec;
+  // number of shifts in multi shift solvers
+  int no_shifts;
+  // dimension of spinors
+  int sdim;
+  // squared desired residue
+  double squared_solver_prec;
+  // single flavour matrix to invert
+  matrix_mult M_psi;
+  // 32bit single flavour matrix to invert
+  matrix_mult32 M_psi32;  
+  // flavour doublet matrix to invert
+  matrix_mult_nd M_ndpsi;
+  // 32bit flavour doublet matrix to invert
+  matrix_mult_nd32 M_ndpsi32;  
+  // pointer to array of shifts
+  double * shifts;
+  
+  solution_type_t solution_type;
+  
+  CompressionType compression_type;
+  SloppyPrecision sloppy_precision;
+  ExternalInverter external_inverter;
 
-#endif
-
+  int use_initial_guess;  
+} solver_params_t;
 
+#endif
  
diff --git a/solver/solver_types.c b/solver/solver_types.c
new file mode 100644
index 000000000..e28a88b3e
--- /dev/null
+++ b/solver/solver_types.c
@@ -0,0 +1,27 @@
+/***************************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *               2017                               Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "solver/solver_types.h"
+
+int solver_is_mixed( const int solver_type ){ /* 1 for the mixed-precision solver types, else 0 */
+  if( solver_type == MIXEDCG || solver_type == RGMIXEDCG ) return 1;
+  return( solver_type == MIXEDCGMMSND || solver_type == MIXEDBICGSTAB );
+}
+
diff --git a/solver/solver_types.h b/solver/solver_types.h
new file mode 100644
index 000000000..0c80f52e5
--- /dev/null
+++ b/solver/solver_types.h
@@ -0,0 +1,54 @@
+/***************************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *               2017                               Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#ifndef _SOLVER_TYPES_H
+#define _SOLVER_TYPES_H
+
+typedef enum SOLVER_TYPE { /* values written out; same as the implicit numbering from 0 */
+ BICGSTAB      = 0,
+ CG            = 1,
+ GMRES         = 2,
+ CGS           = 3,
+ MR            = 4,
+ BICGSTABELL   = 5,
+ FGMRES        = 6,
+ GCR           = 7,
+ GMRESDR       = 8,
+ PCG           = 9,
+ DFLGCR        = 10,
+ DFLFGMRES     = 11,
+ CGMMS         = 12,
+ MIXEDCG       = 13,
+ RGMIXEDCG     = 14,
+ CGMMSND       = 15,
+ INCREIGCG     = 16,
+ MIXEDCGMMSND  = 17,
+ SUMR          = 18,
+ MCR           = 19,
+ CR            = 20,
+ BICG          = 21,
+ MG            = 22,
+ MIXEDBICGSTAB = 23,
+ DUMMYHERMTEST = 24
+} SOLVER_TYPE;
+
+int solver_is_mixed( const int solver_type );
+
+#endif
diff --git a/solver/spectral_proj.c b/solver/spectral_proj.c
index 8ffbcea8c..ec8e90e76 100644
--- a/solver/spectral_proj.c
+++ b/solver/spectral_proj.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/solver/sub_low_ev.c b/solver/sub_low_ev.c
index e29449707..7923eb4e5 100644
--- a/solver/sub_low_ev.c
+++ b/solver/sub_low_ev.c
@@ -66,7 +66,7 @@
  ********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <math.h>
 #include <stdlib.h>
diff --git a/solver/sumr.c b/solver/sumr.c
index e96ab4112..5683991c5 100644
--- a/solver/sumr.c
+++ b/solver/sumr.c
@@ -39,7 +39,7 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/source_generation.c b/source_generation.c
index f5d7b6984..d9d36a03e 100644
--- a/source_generation.c
+++ b/source_generation.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -124,7 +124,7 @@ void gaussian_volume_source(spinor * const P, spinor * const Q,
 
 void extended_pion_source(spinor * const P, spinor * const Q,
 			  spinor * const R, spinor * const S,
-			  const int t0,
+			  const int t0, const int ts,
 			  const double px, const double py, const double pz) {
   int lt, lx, ly, lz, i, x, y, z, id=0, t;
   int coords[4];
@@ -134,7 +134,7 @@ void extended_pion_source(spinor * const P, spinor * const Q,
   zero_spinor_field(P,VOLUME/2);
   zero_spinor_field(Q,VOLUME/2);
   
-  t=((g_nproc_t*T)/2+t0)%(g_nproc_t*T);
+  t=(ts + t0)%(g_nproc_t*T);
   lt = t - g_proc_coords[0]*T;
   coords[0] = t / T;
   for(x = 0; x < LX*g_nproc_x; x++) {
@@ -146,7 +146,7 @@ void extended_pion_source(spinor * const P, spinor * const Q,
       for(z = 0; z < LZ*g_nproc_z; z++) {
 	lz = z - g_proc_coords[3]*LZ;
 	coords[3] = z / LZ;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	if(g_cart_id == id) {
@@ -172,8 +172,8 @@ void extended_pion_source(spinor * const P, spinor * const Q,
 }
 
 void source_generation_pion_only(spinor * const P, spinor * const Q,
-				 const int t,
-				 const int sample, const int nstore) {
+				 const int t, const int sample, 
+                                 const int nstore, const unsigned int _seed) {
 
   int reset = 0, i, x, y, z, is, ic, lt, lx, ly, lz, id=0;
   int coords[4], seed, r;
@@ -192,7 +192,7 @@ void source_generation_pion_only(spinor * const P, spinor * const Q,
   }
 
   /* Compute the seed */
-  seed =(int) abs(1 + sample + t*10*97 + nstore*100*53);
+  seed =(int) abs(_seed + sample + t*10*97 + nstore*100*53);
 
   rlxd_init(2, seed);
 
@@ -207,7 +207,7 @@ void source_generation_pion_only(spinor * const P, spinor * const Q,
       for(z = 0; z < LZ*g_nproc_z; z++) {
 	lz = z - g_proc_coords[3]*LZ;
 	coords[3] = z / LZ;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	for(is = 0; is < 4; is++) {
@@ -294,7 +294,7 @@ void source_generation_pion_zdir(spinor * const P, spinor * const Q,
       ly = y - g_proc_coords[2]*LY;
       coords[2] = y / LY;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
         MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
         for(is = 0; is < 4; is++) {
@@ -394,7 +394,7 @@ void source_generation_nucleon(spinor * const P, spinor * const Q,
 	for(zz = 0; zz < LZ*g_nproc_z; zz+=nx) {
 	  lz = zz - g_proc_coords[3]*LZ;
 	  coords[3] = zz / LZ;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	  MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	  ranlxd(&rnumber, 1);
diff --git a/source_generation.h b/source_generation.h
index f733ea0aa..6cf3cf6f0 100644
--- a/source_generation.h
+++ b/source_generation.h
@@ -23,8 +23,8 @@ void gaussian_volume_source(spinor * const P, spinor * const Q,
 			    const int sample, const int nstore, const int f);
 
 void source_generation_pion_only(spinor * const P, spinor * const Q,
-				 const int t,
-				 const int sample, const int nstore);
+				 const int t, const int sample, 
+                                 const int nstore, const unsigned int _seed);
 
 void source_generation_nucleon(spinor * const P, spinor * const Q, 
 			       const int is, const int ic,
@@ -34,7 +34,7 @@ void source_generation_nucleon(spinor * const P, spinor * const Q,
 
 void extended_pion_source(spinor * const P, spinor * const Q,
 			  spinor * const R, spinor * const S,
-			  const int t0,
+			  const int t0, const int ts,
 			  const double px, const double py, const double pz);
 
 void source_generation_pion_zdir(spinor * const P, spinor * const Q,
diff --git a/spinor_fft.c b/spinor_fft.c
index 6fe193a68..1cb8861b8 100644
--- a/spinor_fft.c
+++ b/spinor_fft.c
@@ -1,13 +1,13 @@
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
 #include <assert.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -30,7 +30,7 @@
 
 void  spinor_fft_print_reduct_dims(int *remaining_dims,FILE *logFile);
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 void check_mpi_comm_membership(MPI_Comm commself,MPI_Comm commcheck,const char *name_a,const char *name_b,FILE *logFile);
 #endif
 
@@ -48,7 +48,7 @@ void spinor_fft_transpose_xp_t(spinor *fieldout,spinor* fieldin,int dim0,int dim
  */
 void spinor_fft_reduce_2d(spinor *localSpinorField,int *collectionRank,spinor*** field_collection,spinor **membuff){
   /* this implementation is intended for four dimensional parallelisation */
-#if (defined  PARALLELXYZT  && defined MPI && defined HAVE_FFTW)
+#if (defined  PARALLELXYZT  && defined TM_USE_MPI && defined HAVE_FFTW)
 
   int sendRecvCoord[4];
   int i;
@@ -199,7 +199,7 @@ void spinor_fft_reduce_2d(spinor *localSpinorField,int *collectionRank,spinor***
  */
 void spinor_fft_redist_2d(spinor *localSpinorField,int collectionRank,spinor** field_collection,spinor *membuff){
   /* this implementation is intended for four dimensional parallelisation */
-#if ( defined PARALLELXYZT && defined MPI && defined HAVE_FFTW)
+#if ( defined PARALLELXYZT && defined TM_USE_MPI && defined HAVE_FFTW)
 
   int sendRecvCoord[4];
   int i;
@@ -457,7 +457,7 @@ void spinor_fft_transpose_xp_t(spinor *fieldout,spinor* fieldin,int dim0,int dim
 }
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 void check_mpi_comm_membership(MPI_Comm commself,MPI_Comm commcheck,const char *name_a,const char *name_b,FILE *logFile){
   int result;
   fprintf(logFile,"checking %s against %s : \n" , name_a,name_b);
diff --git a/start.c b/start.c
index 65cfe0474..798ae71a8 100644
--- a/start.c
+++ b/start.c
@@ -61,13 +61,13 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -118,6 +118,15 @@ static void z2_vector(double *v, const int N) {
   return;
 }
 
+// fill v with N numbers from ranlxd, each x then mapped to 2*x - 1 (intended range [-1,1])
+static void pm1_unit(double * v, const int N) {
+  int idx = 0;
+  ranlxd(v,N);
+  for (; idx < N; idx++) {
+    v[idx] = 2*v[idx] - 1.;
+  }
+}
+
 static su3 unit_su3(void)
 {
    su3 u = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
@@ -229,7 +238,7 @@ void random_spinor_field_lexic(spinor * const k, const int repro, const enum RN_
 
   _rn_switch(rn_type,random_vector)
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   int rlxd_state[105];
   int rlxd_state_backup[105];
 #endif
@@ -238,7 +247,7 @@ void random_spinor_field_lexic(spinor * const k, const int repro, const enum RN_
   double v[24];
 
   if(repro) {
-#ifdef MPI
+#ifdef TM_USE_MPI
     if(g_proc_id != 0) {
       rlxd_get(rlxd_state_backup);
     } else if(g_proc_id == 0) {
@@ -261,7 +270,7 @@ void random_spinor_field_lexic(spinor * const k, const int repro, const enum RN_
 	  for(z = 0; z < g_nproc_z*LZ; z++) {
 	    Z = z - g_proc_coords[3]*LZ;
 	    coords[3] = z / LZ;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	    MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	    if(g_cart_id == id) {
@@ -275,7 +284,7 @@ void random_spinor_field_lexic(spinor * const k, const int repro, const enum RN_
 	}
       }
     }
-#ifdef MPI
+#ifdef TM_USE_MPI
     if(g_proc_id != 0) {
       rlxd_reset(rlxd_state_backup);
     }
@@ -301,7 +310,7 @@ void random_spinor_field_eo(spinor * const k, const int repro, const enum RN_TYP
 
   _rn_switch(rn_type,random_vector)
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   int rlxd_state[105];
   int rlxd_state_backup[105];
 #endif
@@ -310,7 +319,7 @@ void random_spinor_field_eo(spinor * const k, const int repro, const enum RN_TYP
   double v[24];
 
   if(repro) {
-#ifdef MPI
+#ifdef TM_USE_MPI
     if(g_proc_id != 0) {
       rlxd_get(rlxd_state_backup);
     } else if(g_proc_id == 0) {
@@ -333,7 +342,7 @@ void random_spinor_field_eo(spinor * const k, const int repro, const enum RN_TYP
 	  for(z = 0; z < g_nproc_z*LZ; z++) {
 	    coords[3] = z / LZ;
 	    Z = z - g_proc_coords[3]*LZ;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	    MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	    if((t0+x+y+z)%2 == 0) {
@@ -347,7 +356,7 @@ void random_spinor_field_eo(spinor * const k, const int repro, const enum RN_TYP
 	}
       }
     }
-#ifdef MPI
+#ifdef TM_USE_MPI
     if(g_proc_id != 0) {
       rlxd_reset(rlxd_state_backup);
     }
@@ -369,6 +378,13 @@ void zero_spinor_field(spinor * const k, const int N)
   memset(k, 0, sizeof(spinor) * N);
 }
 
+/* Sets all bytes of the N spinor32 entries starting at k to zero (memset) */
+void zero_spinor_field_32(spinor32 * const k, const int N)
+{
+  memset(k, 0, N * sizeof *k);
+}
+
+
 /* Function provides a constant spinor field of length N */
 void constant_spinor_field(spinor * const k, const int p, const int N)
 {
@@ -452,13 +468,13 @@ void random_gauge_field(const int repro, su3 ** const gf) {
   int id = 0; /* May not be initialized for scalar builds! */
   int coords[4];
   su3 ALIGN tmp;
-#ifdef MPI
+#ifdef TM_USE_MPI
   int rlxd_state[105];
   int rlxd_state_backup[105];
 #endif
 
   if(repro) {
-#ifdef MPI
+#ifdef TM_USE_MPI
     if(g_proc_id != 0) {
       rlxd_get(rlxd_state_backup);
     } else if(g_proc_id == 0) {
@@ -479,7 +495,7 @@ void random_gauge_field(const int repro, su3 ** const gf) {
 	  for(z = 0; z < g_nproc_z*LZ; z++) {
 	    Z = z - g_proc_coords[3]*LZ;
 	    coords[3] = z / LZ;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	    MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	    for(mu = 0; mu < 4; mu++) {
@@ -495,9 +511,9 @@ void random_gauge_field(const int repro, su3 ** const gf) {
 	}
       }
     }
-#ifdef MPI
+#ifdef TM_USE_MPI
     if(g_proc_id != 0) {
-      rlxd_get(rlxd_state_backup);
+      rlxd_reset(rlxd_state_backup);
     }
 #endif
   }
@@ -519,7 +535,7 @@ double random_su3adj_field(const int repro, su3adj ** const momenta) {
   su3adj *xm;
   int i, mu, t0, x, y, z, X, Y, Z, t, id = 0;
   int coords[4];
-#ifdef MPI
+#ifdef TM_USE_MPI
   int k;
   int rlxd_state[105];
   int rlxd_state_backup[105];
@@ -528,7 +544,7 @@ double random_su3adj_field(const int repro, su3adj ** const momenta) {
   double ALIGN tt, tr, ts, kc = 0., ks = 0., sum;
   
   if(repro) {
-#ifdef MPI
+#ifdef TM_USE_MPI
     if(g_proc_id != 0) {
       rlxd_get(rlxd_state_backup);
     } else if(g_proc_id == 0) {
@@ -549,7 +565,7 @@ double random_su3adj_field(const int repro, su3adj ** const momenta) {
 	  for(z = 0; z < g_nproc_z*LZ; z++) {
 	    Z = z - g_proc_coords[3]*LZ;
 	    coords[3] = z / LZ;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	    MPI_Cart_rank(g_cart_grid, coords, &id);
 #endif
 	    if(g_cart_id == id) i = g_ipt[t][X][Y][Z];
@@ -583,7 +599,7 @@ double random_su3adj_field(const int repro, su3adj ** const momenta) {
       }
     }
     kc=0.5*(ks+kc);
-#ifdef MPI
+#ifdef TM_USE_MPI
     if(g_proc_id != 0) {
       rlxd_reset(rlxd_state_backup);
     }
@@ -616,7 +632,7 @@ double random_su3adj_field(const int repro, su3adj ** const momenta) {
     }
     kc=0.5*(ks+kc);
   }
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   return ks;
 #endif
@@ -784,7 +800,7 @@ void source_spinor_field_point_from_file(spinor * const P, spinor * const Q, int
   source_pe_coord[2] = source_coord[2]/LY;
   source_pe_coord[3] = source_coord[3]/LZ;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Cart_rank(g_cart_grid, source_pe_coord, &source_pe_indx);
 #else
   source_pe_indx=0;
@@ -851,6 +867,24 @@ void start_ranlux(int level, int seed)
    loc_seed = (seed + step*max_seed) % 2147483647;
 
    if(loc_seed == 0) loc_seed++;
+
+   #ifdef TM_USE_MPI
+   unsigned int * seeds = calloc(g_nproc,sizeof(unsigned int));
+   if(seeds == NULL) fatal_error("Memory allocation for seeds buffer failed!","start_ranlux");  
+   MPI_Gather(&loc_seed,1,MPI_UNSIGNED,seeds,1,MPI_UNSIGNED,0,MPI_COMM_WORLD);
+   if(g_proc_id == 0) {
+     for(int i = 0; i < g_nproc; ++i) {
+       for(int j = i+1; j < g_nproc; ++j) {
+         if( seeds[i] == seeds[j] ) {
+           char error_message[100];
+           snprintf(error_message,100,"Process %d and %d have the same seed. Aborting!",i,j);
+           fatal_error(error_message,"start_ranlux");
+         }
+       }
+     }
+   }
+   free(seeds);
+   #endif 
  
    if(g_debug_level > 3) {
      printf("Local seed is %d  proc_id = %d\n", loc_seed, g_proc_id);
diff --git a/start.h b/start.h
index 8b140a221..558a4c5b9 100644
--- a/start.h
+++ b/start.h
@@ -25,28 +25,33 @@
    with the first argument set to a random number type as defined below and a function pointer
    (see start.c for examples) */
 
-#define _rn_switch(type,rn_fn_ptr) \
-  switch( type ) { \
-    case RN_Z2: \
-      rn_fn_ptr = z2_vector; \
-      break; \
-    case RN_UNIF: \
-      rn_fn_ptr = ranlxd; \
-      break; \
-    case RN_GAUSS: \
-    default: \
+#define _rn_switch(type,rn_fn_ptr)		\
+  switch( type ) {				\
+  case RN_Z2:					\
+    rn_fn_ptr = z2_vector;			\
+    break;					\
+  case RN_UNIF:					\
+    rn_fn_ptr = ranlxd;				\
+    break;					\
+  case RN_PM1UNIF:				\
+    rn_fn_ptr = pm1_unit;			\
+    break;					\
+  case RN_GAUSS:				\
+  default:					\
       rn_fn_ptr = gauss_vector; \
       break; \
   } \
 
 /* RN_GAUSS: gaussian ditributed random numbers
    RN_UNIF:  random numbers drawn from a uniform distribution (this is a simple call to ranlxd!)
+   RN_PM1UNIF: random numbers drawn from a uniform distribution in [-1,1]
    RN_Z2:    z2 noise */
 
-enum RN_TYPE { RN_GAUSS, RN_UNIF, RN_Z2 };
+enum RN_TYPE { RN_GAUSS = 0, RN_UNIF = 1, RN_Z2 = 2, RN_PM1UNIF = 3 };
 
 void unit_spinor_field(const int k);
 void zero_spinor_field(spinor * const k, const int N);
+void zero_spinor_field_32(spinor32 * const k, const int N);
 void constant_spinor_field(spinor * const k, const int p, const int N);
 
 void unit_spinor_field_lexic(spinor * const k);
diff --git a/struct_accessors.h b/struct_accessors.h
new file mode 100644
index 000000000..22158b587
--- /dev/null
+++ b/struct_accessors.h
@@ -0,0 +1,193 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifndef STRUCT_ACCESSORS_H
+#define STRUCT_ACCESSORS_H
+
+#include "su3.h"
+#include <stdlib.h>
+
+/* Read one real number out of a su3 matrix.
+ * cc picks the entry in the order c00,c01,c02,c10,c11,c12,c20,c21,c22
+ * (cc = 0..8); reim == 0 returns its real part, any other reim its
+ * imaginary part. A cc outside 0..8 ends the program via exit(-222).
+ * The entry is accessed through a pointer to double, element [0] being
+ * used as the real and element [1] as the imaginary part. */
+static inline double su3_get_elem_linear(const su3* const matrix, int cc, int reim){
+  const double* parts;
+  switch(cc){
+    case 0:
+      parts = (const double*)(&matrix->c00);
+      break;
+    case 1:
+      parts = (const double*)(&matrix->c01);
+      break;
+    case 2:
+      parts = (const double*)(&matrix->c02);
+      break;
+    case 3:
+      parts = (const double*)(&matrix->c10);
+      break;
+    case 4:
+      parts = (const double*)(&matrix->c11);
+      break;
+    case 5:
+      parts = (const double*)(&matrix->c12);
+      break;
+    case 6:
+      parts = (const double*)(&matrix->c20);
+      break;
+    case 7:
+      parts = (const double*)(&matrix->c21);
+      break;
+    case 8:
+      parts = (const double*)(&matrix->c22);
+      break;
+    default:
+      exit(-222);
+  }
+  /* only reached for a valid cc: the default branch never returns */
+  return( reim==0 ? parts[0] : parts[1] );
+}
+
+static inline double su3_get_elem(const su3* const matrix, int c0, int c1, int reim){
+  return su3_get_elem_linear(matrix, c1 + 3*c0, reim);  /* (c0,c1) -> linear index 3*c0+c1 */
+}
+
+/* Read one real number out of a spinor (the parameter is named "matrix"
+ * but has type spinor).
+ * sc selects the component in the order
+ *   s0.c0, s0.c1, s0.c2, s1.c0, s1.c1, s1.c2,
+ *   s2.c0, s2.c1, s2.c2, s3.c0, s3.c1, s3.c2
+ * for sc = 0..11; reim == 0 returns its real part, any other reim its
+ * imaginary part. An sc outside 0..11 ends the program via exit(-223).
+ * The component is accessed through a pointer to double, element [0]
+ * being used as the real and element [1] as the imaginary part. */
+static inline double spinor_get_elem_linear(const spinor* const matrix, int sc, int reim){
+  const double* parts;
+  switch(sc){
+    case 0:
+      parts = (const double*)(&matrix->s0.c0);
+      break;
+    case 1:
+      parts = (const double*)(&matrix->s0.c1);
+      break;
+    case 2:
+      parts = (const double*)(&matrix->s0.c2);
+      break;
+    case 3:
+      parts = (const double*)(&matrix->s1.c0);
+      break;
+    case 4:
+      parts = (const double*)(&matrix->s1.c1);
+      break;
+    case 5:
+      parts = (const double*)(&matrix->s1.c2);
+      break;
+    case 6:
+      parts = (const double*)(&matrix->s2.c0);
+      break;
+    case 7:
+      parts = (const double*)(&matrix->s2.c1);
+      break;
+    case 8:
+      parts = (const double*)(&matrix->s2.c2);
+      break;
+    case 9:
+      parts = (const double*)(&matrix->s3.c0);
+      break;
+    case 10:
+      parts = (const double*)(&matrix->s3.c1);
+      break;
+    case 11:
+      parts = (const double*)(&matrix->s3.c2);
+      break;
+    default:
+      exit(-223);
+  }
+  /* only reached for a valid sc: the default branch never returns */
+  return( reim==0 ? parts[0] : parts[1] );
+}
+
+static inline double spinor_get_elem(const spinor* const matrix, int s, int c, int reim){
+  return spinor_get_elem_linear(matrix, c + 3*s, reim);  /* (s,c) -> linear index 3*s+c */
+}
+
+/* Overwrite one component of a spinor (the parameter is named "matrix"
+ * but has type spinor).
+ * sc selects the component in the order
+ *   s0.c0, s0.c1, s0.c2, s1.c0, s1.c1, s1.c2,
+ *   s2.c0, s2.c1, s2.c2, s3.c0, s3.c1, s3.c2
+ * for sc = 0..11; rein is stored as its first and imin as its second
+ * double. An sc outside 0..11 ends the program via exit(-224) and
+ * nothing is written. */
+static inline void spinor_set_elem_linear(spinor* const matrix, int sc, const double rein, const double imin){
+  double* parts;
+  switch(sc){
+    case 0:
+      parts = (double*)(&(matrix->s0.c0));
+      break;
+    case 1:
+      parts = (double*)(&(matrix->s0.c1));
+      break;
+    case 2:
+      parts = (double*)(&(matrix->s0.c2));
+      break;
+    case 3:
+      parts = (double*)(&(matrix->s1.c0));
+      break;
+    case 4:
+      parts = (double*)(&(matrix->s1.c1));
+      break;
+    case 5:
+      parts = (double*)(&(matrix->s1.c2));
+      break;
+    case 6:
+      parts = (double*)(&(matrix->s2.c0));
+      break;
+    case 7:
+      parts = (double*)(&(matrix->s2.c1));
+      break;
+    case 8:
+      parts = (double*)(&(matrix->s2.c2));
+      break;
+    case 9:
+      parts = (double*)(&(matrix->s3.c0));
+      break;
+    case 10:
+      parts = (double*)(&(matrix->s3.c1));
+      break;
+    case 11:
+      parts = (double*)(&(matrix->s3.c2));
+      break;
+    default:
+      exit(-224);
+  }
+  /* only reached for a valid sc: the default branch never returns */
+  parts[0] = rein;
+  parts[1] = imin;
+}
+
+static inline void spinor_set_elem(spinor* const matrix, int s, int c, const double rein, const double imin){
+  spinor_set_elem_linear(matrix, c + 3*s, rein, imin);  /* (s,c) -> linear index 3*s+c */
+}
+
+#endif
diff --git a/su3.h b/su3.h
index d91021992..ef258b5be 100644
--- a/su3.h
+++ b/su3.h
@@ -42,6 +42,11 @@ typedef struct
    _Complex double c00, c01, c02, c10, c11, c12, c20, c21, c22;
 } su3;
 
+typedef struct /* same nine entry names as su3, each stored as _Complex float */
+{
+   _Complex float c00, c01, c02, c10, c11, c12, c20, c21, c22;
+} su3_32;
+
 typedef struct
 {
    _Complex double c0,c1,c2;
@@ -57,6 +62,11 @@ typedef struct
    su3_vector s0,s1,s2,s3;
 } spinor;
 
+typedef struct /* same member names as spinor, each of type su3_vector32 */
+{
+   su3_vector32 s0,s1,s2,s3;
+} spinor32;
+
 typedef struct
 {
   su3_vector s0, s1;
@@ -123,6 +133,13 @@ typedef double scalar;
    _vector_assign((r).s2,(s).s2);\
    _vector_assign((r).s3,(s).s3);
 
+#define _spinor_add_assign(r,s) /* applies _vector_add_assign to members s0..s3 of r and s */ \
+   _vector_add_assign((r).s0,(s).s0);\
+   _vector_add_assign((r).s1,(s).s1);\
+   _vector_add_assign((r).s2,(s).s2);\
+   _vector_add_assign((r).s3,(s).s3);
+
+
 #define _vector_norm_square(r) \
    conj((r).c0) * (r).c0 + conj((r).c1) * (r).c1 + conj((r).c2) * (r).c2
    
@@ -141,6 +158,12 @@ typedef double scalar;
   (r).c1 += (c) * (s).c1;			\
   (r).c2 += (c) * (s).c2;
 
+#define _vector_sub_mul(r,c,s) /* r.cK -= c * s.cK for K = 0,1,2 */ \
+  (r).c0 -= (c) * (s).c0;                       \
+  (r).c1 -= (c) * (s).c1;                       \
+  (r).c2 -= (c) * (s).c2;
+
+
 #define _spinor_add_mul(r,c,s)                  \
   _vector_add_mul( (r).s0, (c), (s).s0);        \
   _vector_add_mul( (r).s1, (c), (s).s1);        \
@@ -166,6 +189,34 @@ typedef double scalar;
   (r).c1 += I*(c)*(s).c1; \
   (r).c2 += I*(c)*(s).c2;
 
+#define _vector_add_i_assign(r,s) /* r.cK += I * s.cK for K = 0,1,2 */ \
+  (r).c0 += I*(s).c0; \
+  (r).c1 += I*(s).c1; \
+  (r).c2 += I*(s).c2;
+
+#define _vector_sub_i_assign(r,s) /* r.cK -= I * s.cK for K = 0,1,2 */ \
+  (r).c0 -= I*(s).c0; \
+  (r).c1 -= I*(s).c1; \
+  (r).c2 -= I*(s).c2;
+
+#define _vector_i_add_assign(r,s) /* r.cK += I * s.cK for K = 0,1,2 */ \
+  (r).c0 += I * (s).c0;                 \
+  (r).c1 += I * (s).c1;                 \
+  (r).c2 += I * (s).c2;
+
+
+#define _vector_i_sub_assign(r,s) /* r.cK -= I * s.cK for K = 0,1,2 */ \
+  (r).c0 -= I * (s).c0;                         \
+  (r).c1 -= I * (s).c1;                         \
+  (r).c2 -= I * (s).c2;
+
+
+#define _vector_sub_i_mul(r,c,s) /* r.cK -= I * c * s.cK for K = 0,1,2 */ \
+  (r).c0 -= I*(c)*(s).c0; \
+  (r).c1 -= I*(c)*(s).c1; \
+  (r).c2 -= I*(c)*(s).c2;
+
+
 #define _vector_i_mul(r,c,s) \
   (r).c0 = I*(c)*(s).c0; \
   (r).c1 = I*(c)*(s).c1; \
@@ -276,7 +327,7 @@ _sse_store(r);
   (r).c1 -= I * (s).c1;				\
   (r).c2 -= I * (s).c2;
 
-#define complex_times_vector(r,c,s)		\
+#define _complex_times_vector(r,c,s)		\
   (r).c0 = (c) * (s).c0;			\
   (r).c1 = (c) * (s).c1;			\
   (r).c2 = (c) * (s).c2;
@@ -398,6 +449,17 @@ _sse_store_up(r);
   (u).c21 = conj((v).c12);			\
   (u).c22 = conj((v).c22); 
 
+#define _su3_transpose(u,v) /* u = transpose of v; entries of v are read after u is written, so u must not be v */ \
+  (u).c00 = ((v).c00);			\
+  (u).c01 = ((v).c10);			\
+  (u).c02 = ((v).c20);			\
+  (u).c10 = ((v).c01);			\
+  (u).c11 = ((v).c11);			\
+  (u).c12 = ((v).c21);			\
+  (u).c20 = ((v).c02);			\
+  (u).c21 = ((v).c12);			\
+  (u).c22 = ((v).c22);
+
 #define _itimes_su3(u,v)			\
   (u).c00 = I * (v).c00;			\
   (u).c01 = I * (v).c01;			\
@@ -663,6 +725,11 @@ _sse_store_up(r);
 
 #endif
 
+#define _su3_minus_const_times_im_trace_su3(w,c,v) /* diag(w) -= I*c*Im(tr v); v is re-read on every line, so w must not be v */ \
+  (w).c00 -= I*(c)*(cimag((v).c00) + cimag((v).c11) + cimag((v).c22)); \
+  (w).c11 -= I*(c)*(cimag((v).c00) + cimag((v).c11) + cimag((v).c22)); \
+  (w).c22 -= I*(c)*(cimag((v).c00) + cimag((v).c11) + cimag((v).c22));
+
 #define _trace_su3_times_su3d(x,v,w)	\
   x =   (v).c00 * conj((w).c00)		\
       + (v).c01 * conj((w).c01)		\
@@ -684,12 +751,7 @@ _sse_store_up(r);
     + (v).c20 * (w).c02			\
     + (v).c21 * (w).c12			\
     + (v).c22 * (w).c22;
-
-#define _complex_times_vector(x, c, y)	\
-   x.c0 = (c) * (y).c0;			\
-   x.c1 = (c) * (y).c1;			\
-   x.c2 = (c) * (y).c2;
-    
+ 
 #define _vector_tensor_vector(t,u,v)	\
   (t).c00 = (u).c0 * conj((v).c0);	\
   (t).c01 = (u).c0 * conj((v).c1);	\
@@ -724,6 +786,11 @@ _sse_store_up(r);
   (t).c21 = (u).c2 * conj((v).c1) + (w).c2 * conj((z).c1);	\
   (t).c22 = (u).c2 * conj((v).c2) + (w).c2 * conj((z).c2);
 
-
+#define _su3_add_equals_complex_identity(u, c) /* adds c to the diagonal entries c00, c11, c22 of u */ \
+  (u).c00 += (c); \
+  (u).c11 += (c); \
+  (u).c22 += (c);
 
 #endif
+
+
diff --git a/su3adj.h b/su3adj.h
index 91eb39afe..59bfd7ce0 100644
--- a/su3adj.h
+++ b/su3adj.h
@@ -74,6 +74,16 @@ typedef struct
 (r).d7=+creal((a).c21)-creal((a).c12); \
 (r).d8=(-cimag((a).c00)-cimag((a).c11) + 2.0 * cimag((a).c22))*0.577350269189625;
 
+#define _trace_lambda_mul(r,c,a) /* r.dK = c * (K-th component, same formulas as _add_trace_lambda) */ \
+(r).d1=(c)*(-cimag((a).c10)-cimag((a).c01)); \
+(r).d2=(c)*(+creal((a).c10)-creal((a).c01)); \
+(r).d3=(c)*(-cimag((a).c00)+cimag((a).c11)); \
+(r).d4=(c)*(-cimag((a).c20)-cimag((a).c02)); \
+(r).d5=(c)*(+creal((a).c20)-creal((a).c02)); \
+(r).d6=(c)*(-cimag((a).c21)-cimag((a).c12)); \
+(r).d7=(c)*(+creal((a).c21)-creal((a).c12)); \
+(r).d8=(c)*((-cimag((a).c00)-cimag((a).c11) + 2.0 * cimag((a).c22))*0.577350269189625);
+
 #define _add_trace_lambda(r,a) \
 (r).d1+=-cimag((a).c10)-cimag((a).c01); \
 (r).d2+=+creal((a).c10)-creal((a).c01); \
@@ -125,7 +135,7 @@ typedef struct
 (r).d7 -= (+creal((a).c21)-creal((a).c12)); \
 (r).d8 -= ((-cimag((a).c00)-cimag((a).c11) + 2.0 * cimag(a.c22))*0.577350269189625);
 
-#if ( defined OMP )
+#if ( defined TM_USE_OMP )
 
 #define _trace_lambda_mul_add_assign_nonlocal(r,c,a) \
 _Pragma("omp atomic") \
@@ -190,7 +200,6 @@ _Pragma("omp atomic") \
 (r).d7=0.; \
 (r).d8=0.;
 
-
 #if defined SSE2
 #define _su3adj_assign_const_times_su3adj(res,c,in) \
 __asm__ __volatile__ ("movsd %0, %%xmm0 \n\t" \
diff --git a/temporalgauge.c b/temporalgauge.c
index 6dc73e2cc..4e3902238 100644
--- a/temporalgauge.c
+++ b/temporalgauge.c
@@ -1,5 +1,5 @@
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include "global.h"
 #include "GPU/cudadefs.h"
@@ -7,9 +7,11 @@
 #include "geometry_eo.h"
 #include "start.h"
 #include "temporalgauge.h"
+#include "measure_gauge_action.h"
 #include "stdio.h"
 #include "stdlib.h"
-#ifdef MPI
+#include "linalg_eo.h"
+#ifdef TM_USE_MPI
   #include<mpi.h>
   #include "mpi_init.h"
 #endif
@@ -54,7 +56,7 @@ void copy_gauge_field (su3 ** to, su3 ** from)
 */
 int init_temporalgauge_trafo (const int V, su3** gfield) {
 
-#ifndef MPI
+#ifndef TM_USE_MPI
 
    int it, iz, iy, ix;
    
@@ -274,7 +276,7 @@ int init_temporalgauge_trafo (const int V, su3** gfield) {
 
 // MPI implementation									// was merged into init_temporalgauge_without_mpi()
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
 int init_temporalgauge_trafo_mpi (const int V, su3 ** gfield) {				// will initialize  g_trafo[]  as the transformation matrices
 											//	and  g_tempgauge_field  as a copy of  g_gauge_field
@@ -468,7 +470,7 @@ void finalize_temporalgauge() {
   free(tempgauge_field);
   free(g_tempgauge_field);
   
-  #ifdef MPI
+  #ifdef TM_USE_MPI
     free(left);
     free(right);
   #endif
@@ -545,7 +547,7 @@ void apply_gtrafo (su3 ** gfield, su3 * trafofield) {
       for (iy = 0; iy < LY; iy++) {
         for (iz = 0; iz < LZ; iz++) {
         
-          #ifdef MPI				// this is the MPI implementation of the GLOBAL TEMPORALGAUGE
+          #ifdef TM_USE_MPI				// this is the MPI implementation of the GLOBAL TEMPORALGAUGE
           
             pos = g_ipt[it][ix][iy][iz];
             
@@ -959,10 +961,130 @@ void apply_inv_gtrafo_spinor_even (spinor * spin, su3 * trafofield) {
   
 }
 
-
-
-
-
-
-
+void gtrafo_eo_nd(spinor * const Even_s, spinor * const Odd_s, spinor * const Even_c, spinor * const Odd_c,
+                  spinor * const Even_new_s, spinor * const Odd_new_s, spinor * const Even_new_c, spinor * const Odd_new_c,
+                  GTRAFO_TYPE type){
+  /* GTRAFO_APPLY: set up the temporal-gauge trafo, rotate g_gauge_field and the four source spinors (Even/Odd _s/_c).
+   * Any other type: restore the saved gauge field, undo the rotation on all eight spinors and call finalize_temporalgauge(). */
+  int retval;
+  double dret1, dret2;
+  static double plaquette1 = 0.0;
+  static double plaquette2 = 0.0;
+
+  if(type==GTRAFO_APPLY){
+    /* need VOLUME here (not N=VOLUME/2)*/
+    if ((retval = init_temporalgauge_trafo(VOLUME, g_gauge_field)) != 0 ) {  // initializes the transformation matrices and
+      if (g_proc_id == 0) printf("Error while gauge fixing to temporal gauge. Aborting...\n");  // g_tempgauge_field as a copy of g_gauge_field
+      exit(200);
+    }
+
+    /* transform the gauge field; the plaquette is printed before and after as a check */
+    plaquette1 = measure_plaquette(g_gauge_field);
+    apply_gtrafo(g_gauge_field, g_trafo);
+    plaquette2 = measure_plaquette(g_gauge_field);
+    if (g_proc_id == 0) printf("\tPlaquette before gauge fixing: %.16e\n", plaquette1/6./VOLUME);
+    if (g_proc_id == 0) printf("\tPlaquette after gauge fixing:  %.16e\n", plaquette2/6./VOLUME);
+
+    /* transform the odd strange source; the square norm is printed before and after */
+    dret1 = square_norm(Odd_s, VOLUME/2 , 1);
+    apply_gtrafo_spinor_odd(Odd_s, g_trafo);
+    dret2 = square_norm(Odd_s, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
+
+    /* transform the odd charm source */
+    dret1 = square_norm(Odd_c, VOLUME/2 , 1);
+    apply_gtrafo_spinor_odd(Odd_c, g_trafo);
+    dret2 = square_norm(Odd_c, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
+
+    /* transform the even strange source */
+    dret1 = square_norm(Even_s, VOLUME/2 , 1);
+    apply_gtrafo_spinor_even(Even_s, g_trafo);
+    dret2 = square_norm(Even_s, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
+
+    /* transform the even charm source */
+    dret1 = square_norm(Even_c, VOLUME/2 , 1);
+    apply_gtrafo_spinor_even(Even_c, g_trafo);
+    dret2 = square_norm(Even_c, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after gauge fixing:  %.16e\n", dret2);
+  } else {
+    /* undo trafo */
+    /* the gauge field is not rotated back with apply_inv_gtrafo(g_gauge_field, g_trafo); instead we */
+    /* copy back the saved original field located in g_tempgauge_field -> update necessary*/
+    plaquette1 = measure_plaquette(g_gauge_field);
+    copy_gauge_field(g_gauge_field, g_tempgauge_field);
+    g_update_gauge_copy = 1;
+    plaquette2 = measure_plaquette(g_gauge_field);
+    if (g_proc_id == 0) printf("\tPlaquette before inverse gauge fixing: %.16e\n", plaquette1/6./VOLUME);
+    if (g_proc_id == 0) printf("\tPlaquette after inverse gauge fixing:  %.16e\n", plaquette2/6./VOLUME);
+
+    /* undo trafo to source Even_s */
+    dret1 = square_norm(Even_s, VOLUME/2 , 1);
+    apply_inv_gtrafo_spinor_even(Even_s, g_trafo);
+    dret2 = square_norm(Even_s, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before inverse gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after inverse gauge fixing:  %.16e\n", dret2);
+
+
+    /* undo trafo to source Even_c */
+    dret1 = square_norm(Even_c, VOLUME/2 , 1);
+    apply_inv_gtrafo_spinor_even(Even_c, g_trafo);
+    dret2 = square_norm(Even_c, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before inverse gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after inverse gauge fixing:  %.16e\n", dret2);
+
+    /* undo trafo to source Odd_s */
+    dret1 = square_norm(Odd_s, VOLUME/2 , 1);
+    apply_inv_gtrafo_spinor_odd(Odd_s, g_trafo);
+    dret2 = square_norm(Odd_s, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before inverse gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after inverse gauge fixing:  %.16e\n", dret2);
+
+    /* undo trafo to source Odd_c */
+    dret1 = square_norm(Odd_c, VOLUME/2 , 1);
+    apply_inv_gtrafo_spinor_odd(Odd_c, g_trafo);
+    dret2 = square_norm(Odd_c, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before inverse gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after inverse gauge fixing:  %.16e\n", dret2);
+
+    /* the *_new_* fields are only rotated back here; the GTRAFO_APPLY branch never touches them */
+    // Even_new_s
+    dret1 = square_norm(Even_new_s, VOLUME/2 , 1);
+    apply_inv_gtrafo_spinor_even(Even_new_s, g_trafo);
+    dret2 = square_norm(Even_new_s, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before inverse gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after inverse gauge fixing:  %.16e\n", dret2);
+
+    // Even_new_c
+    dret1 = square_norm(Even_new_c, VOLUME/2 , 1);
+    apply_inv_gtrafo_spinor_even(Even_new_c, g_trafo);
+    dret2 = square_norm(Even_new_c, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before inverse gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after inverse gauge fixing:  %.16e\n", dret2);
+
+    // Odd_new_s
+    dret1 = square_norm(Odd_new_s, VOLUME/2 , 1);
+    apply_inv_gtrafo_spinor_odd(Odd_new_s, g_trafo);
+    dret2 = square_norm(Odd_new_s, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before inverse gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after inverse gauge fixing:  %.16e\n", dret2);
+
+    // Odd_new_c
+    dret1 = square_norm(Odd_new_c, VOLUME/2 , 1);
+    apply_inv_gtrafo_spinor_odd(Odd_new_c, g_trafo);
+    dret2 = square_norm(Odd_new_c, VOLUME/2, 1);
+    if (g_proc_id == 0) printf("\tsquare norm before inverse gauge fixing: %.16e\n", dret1);
+    if (g_proc_id == 0) printf("\tsquare norm after inverse gauge fixing:  %.16e\n", dret2);
+
+    finalize_temporalgauge();
+  }
+#    ifdef TM_USE_MPI
+  xchange_gauge(g_gauge_field);  /* runs after either branch, since g_gauge_field was modified in both */
+#    endif
+}
 
diff --git a/temporalgauge.h b/temporalgauge.h
index 128832d7e..3ebb5d687 100644
--- a/temporalgauge.h
+++ b/temporalgauge.h
@@ -20,6 +20,10 @@
 #ifndef _TEMPORALGAUGE_H
 #define _TEMPORALGAUGE_H
 
+typedef enum GTRAFO_TYPE {          /* mode selector passed to gtrafo_eo_nd() / gtrafo_eo() */
+  GTRAFO_APPLY  = 0,                /* gtrafo_eo_nd(): go to temporal gauge */
+  GTRAFO_REVERT = 1 } GTRAFO_TYPE;  /* gtrafo_eo_nd(): undo it (taken for any value != GTRAFO_APPLY) */
+
 int init_temporalgauge_trafo(const int V, su3** gfield);
 void apply_gtrafo(su3 ** gfield, su3 * trafofield);
 void apply_gtrafo_spinor(spinor * spin, su3 * trafofield);
@@ -32,6 +36,12 @@ void apply_inv_gtrafo_spinor_odd(spinor * spin, su3 * trafofield);
 void apply_gtrafo_spinor_even(spinor * spin, su3 * trafofield);
 void apply_inv_gtrafo_spinor_even(spinor * spin, su3 * trafofield);
 
+void gtrafo_eo_nd(spinor * const Even_s, spinor * const Odd_s, spinor * const Even_c, spinor * const Odd_c, 
+                  spinor * const Even_new_s, spinor * const Odd_new_s, spinor * const Even_new_c, spinor * const Odd_new_c,
+                  GTRAFO_TYPE type); /* apply (GTRAFO_APPLY) or undo (otherwise) the temporal-gauge trafo on gauge field and spinors */
+/* NOTE(review): gtrafo_eo is not defined in the part of the patch visible here; presumably the two-spinor variant -- confirm */
+void gtrafo_eo(spinor * const Even, spinor * const Odd, GTRAFO_TYPE type);
+
 void copy_gauge_field(su3** to, su3** from);
 
 #endif
diff --git a/tensors.h b/tensors.h
new file mode 100644
index 000000000..9d92c1d51
--- /dev/null
+++ b/tensors.h
@@ -0,0 +1,95 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2018 Bartosz Kostrzewa 
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ************************************************************************/
+
+#ifndef TENSORS_H
+#define TENSORS_H
+
+typedef struct epsilon4_t {  /* table of the non-zero components of the rank-4 epsilon tensor, filled by new_epsilon4() */
+  int N;                     /* number of table entries; new_epsilon4() sets it to 24 */
+  double eps_val[24];        /* value of each entry, stored as +1 or -1 */
+  int eps_idx[24][4];        /* zero-based index quadruple belonging to each entry */
+} epsilon4_t; 
+
+typedef struct epsilon3_t {  /* table of the non-zero components of the rank-3 epsilon tensor, filled by new_epsilon3() */
+  int N;                     /* number of table entries; new_epsilon3() sets it to 6 */
+  double eps_val[6];         /* value of each entry, stored as +1 or -1 */
+  int eps_idx[6][3];         /* zero-based index triple belonging to each entry */
+} epsilon3_t;
+
+static inline epsilon3_t new_epsilon3(void) {
+  /* Tabulate the six non-vanishing components of the rank-3 epsilon tensor.
+   * Entry n has value eps_val[n] at the zero-based index triple eps_idx[n][0..2];
+   * entries are stored in lexicographic order of the triple. */
+  epsilon3_t eps;
+  int count = 0;
+
+  eps.N = 6;
+  for( int a = 0; a < 3; a++ ){
+    for( int b = 0; b < 3; b++ ){
+      for( int c = 0; c < 3; c++ ){
+        // the product of index differences vanishes when two indices coincide;
+        // for the ordered triple it is (-1)(-2)(-1) = -2, hence the minus sign
+        const int vdm = -(a - b)*(a - c)*(b - c);
+        if( vdm == 0 ) continue;
+        eps.eps_val[count] = ( vdm > 0 ) ? 1 : -1;
+        eps.eps_idx[count][0] = a;
+        eps.eps_idx[count][1] = b;
+        eps.eps_idx[count][2] = c;
+        count++;
+      }
+    }
+  }
+  return eps;
+}
+
+// note that this is the Euclidean eps_ijkl, for which we have eps_1234 = 1,
+// whereas in Minkowski space we have eps_0123 = -1
+static inline epsilon4_t new_epsilon4(void) {
+  /* Tabulate the 24 non-vanishing components of the rank-4 epsilon tensor.
+   * Entry n has value eps_val[n] at the zero-based index quadruple eps_idx[n][0..3];
+   * entries are stored in lexicographic order of the quadruple. */
+  epsilon4_t eps;
+  int count = 0;
+
+  eps.N = 24;
+  for( int a = 0; a < 4; a++ ){
+    for( int b = 0; b < 4; b++ ){
+      for( int c = 0; c < 4; c++ ){
+        for( int d = 0; d < 4; d++ ){
+          // six index differences: for the ordered quadruple the product is
+          // (-1)(-2)(-3)(-1)(-2)(-1) = 12, so NO extra minus sign is needed
+          const int vdm = (a - b)*(a - c)*(a - d)*(b - c)*(b - d)*(c - d);
+          if( vdm == 0 ) continue;
+          eps.eps_val[count] = ( vdm > 0 ) ? 1 : -1;
+          eps.eps_idx[count][0] = a;
+          eps.eps_idx[count][1] = b;
+          eps.eps_idx[count][2] = c;
+          eps.eps_idx[count][3] = d;
+          count++;
+        }
+      }
+    }
+  }
+  return eps;
+}
+
+#endif
+
diff --git a/test/check_geometry.c b/test/check_geometry.c
index eae06a51b..d087aba8d 100644
--- a/test/check_geometry.c
+++ b/test/check_geometry.c
@@ -28,18 +28,18 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include "global.h"
 #include "geometry_eo.h"
 #include "test/check_geometry.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include "mpi_init.h"
 #endif
 
@@ -466,7 +466,7 @@ int check_geometry()
 	    if(iz0 != iy0) {
 	      printf("Edge -t -z has an error\n");
 	      printf("Program aborted\n");
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 	      MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize();
 #  endif
 	      exit(0);
@@ -488,7 +488,7 @@ int check_geometry()
 	    if(iz0 != iy0) {
 	      printf("Edge -t +z has an error\n");
 	      printf("Program aborted\n");
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 	      MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize();
 #  endif
 	      exit(0);
@@ -691,7 +691,7 @@ int check_geometry()
     itest[ix]=0;
   }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   if(g_dbw2rand > 0) {
 
 #if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
@@ -2016,7 +2016,7 @@ return(-1);
     itest[ix]=0;
   }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   if(g_dbw2rand > 0) {
     for (x1 = 0; x1 < LX; x1++) {
       for (x2 = 0; x2 < LY; x2++) {
diff --git a/test/check_nan.c b/test/check_nan.c
index 51e515163..e18320d0c 100644
--- a/test/check_nan.c
+++ b/test/check_nan.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/test/check_overlap.c b/test/check_overlap.c
index 70d1c8b22..e6d02100b 100644
--- a/test/check_overlap.c
+++ b/test/check_overlap.c
@@ -26,7 +26,7 @@
 
 #include"lime.h"
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -34,7 +34,7 @@
 #include <time.h>
 #include <string.h>
 #include <signal.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include "global.h"
@@ -44,7 +44,7 @@
 #include "start.h"
 /*#include "eigenvalues.h"*/
 #include "observables.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include "xchange.h"
 #endif
 #include "io.h"
@@ -127,15 +127,14 @@ int main(int argc, char *argv[])
 
   DUM_DERI = 6;
   /* DUM_DERI + 2 is enough (not 7) */
-  DUM_SOLVER = DUM_DERI + 3;
-  DUM_MATRIX = DUM_SOLVER + 8;
+  DUM_MATRIX = DUM_DERI + 11;
   /* DUM_MATRIX + 2 is enough (not 6) */
   NO_OF_SPINORFIELDS = DUM_MATRIX + 2;
 
   verbose = 0;
   g_use_clover_flag = 0;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Init(&argc, &argv);
 #endif
 
@@ -193,7 +192,7 @@ int main(int argc, char *argv[])
   /* generator                                            */
   start_ranlux(rlxd_level, random_seed);
 
-#ifndef MPI
+#ifndef TM_USE_MPI
   g_dbw2rand = 0;
 #endif
 
@@ -332,7 +331,7 @@ int main(int argc, char *argv[])
       fflush(stdout);
     }
     /*     unit_g_gauge_field(); */
-#ifdef MPI
+#ifdef TM_USE_MPI
     xchange_gauge(g_gauge_field);
 #endif
 
@@ -368,7 +367,7 @@ int main(int argc, char *argv[])
 	}
 
 	if (phmc_compute_evs != 0) {
-#ifdef MPI
+#ifdef TM_USE_MPI
 		MPI_Finalize();
 #endif
 		return (0);
@@ -387,7 +386,7 @@ int main(int argc, char *argv[])
 
     nstore += Nsave;
   }
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Finalize();
 #endif
 
diff --git a/test/check_xchange.c b/test/check_xchange.c
index 4367ccda8..ccdc716c8 100644
--- a/test/check_xchange.c
+++ b/test/check_xchange.c
@@ -26,12 +26,12 @@
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -50,7 +50,7 @@ int check_xchange()
 #pragma execution_frequency(very_low)
 #endif
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   double * x;
   int i,ix, mu, x0, x1, x2, x3, k;
   int mp, pm, mm, pp, di[4];
@@ -2981,7 +2981,7 @@ int check_xchange()
 #pragma execution_frequency(very_low)
 #endif
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   double * x;
   int i,ix, mu, x0, x1, x2, x3 = 0, k;
   int mp, pm, mm, pp, di[4];
diff --git a/test/measure_rectangles.debug.c b/test/measure_rectangles.debug.c
index 8eda8f018..963a44ed1 100644
--- a/test/measure_rectangles.debug.c
+++ b/test/measure_rectangles.debug.c
@@ -32,12 +32,12 @@
  *******************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include "global.h"
@@ -131,7 +131,7 @@ double measure_rectangles() {
 /*   fprintf(debugfile,"###\n"); */
   fclose(debugfile);
   ga=(kc+ks)/3.0;
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Allreduce(&ga, &gas, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   return gas;
 #else
diff --git a/test/overlaptests.c b/test/overlaptests.c
index b3f810752..e3cc32bb9 100644
--- a/test/overlaptests.c
+++ b/test/overlaptests.c
@@ -1,6 +1,6 @@
 #include"lime.h"
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -8,14 +8,14 @@
 #include <time.h>
 #include <string.h>
 #include <signal.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include "global.h"
 #include "su3spinor.h"
 #include "linalg_eo.h"
 #include "start.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include "xchange/xchange.h"
 #endif
 #include "read_input.h"
diff --git a/test/scalar_prod_r_test.c b/test/scalar_prod_r_test.c
index fed7d0e39..db61853d4 100644
--- a/test/scalar_prod_r_test.c
+++ b/test/scalar_prod_r_test.c
@@ -21,7 +21,7 @@
 /* #endif */
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #ifndef _STD_C99_COMPLEX
 #include "complex.h"
diff --git a/test/test_eigenvalues.c b/test/test_eigenvalues.c
index 12aa30665..dab454260 100644
--- a/test/test_eigenvalues.c
+++ b/test/test_eigenvalues.c
@@ -25,7 +25,7 @@
  *******************************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -34,7 +34,7 @@
 #include <sys/time.h>
 #include <string.h>
 #include <signal.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include "global.h"
@@ -47,7 +47,7 @@
 */
 #include "observables.h"
 #include "measure_rectangles.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include "xchange.h"
 #endif
 #include "io.h"
@@ -164,7 +164,7 @@ int main(int argc,char *argv[]) {
   if(g_rgi_C1 == 0.) {
     g_dbw2rand = 0;
   }
-#ifndef MPI
+#ifndef TM_USE_MPI
   g_dbw2rand = 0;
 #endif
 
@@ -335,7 +335,7 @@ int main(int argc,char *argv[]) {
 
     fprintf(parameterfile, "The lattice size is %d x %d x %d x %d\n", (int)(g_nproc_t*T), (int)(g_nproc_x*LX), (int)(LY), (int)(LZ));
     fprintf(parameterfile, "The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY), (int)(LZ));
-    fprintf(parameterfile, "g_beta = %f , g_kappa= %f, g_kappa*csw/8= %f \n",g_beta,g_kappa,g_ka_csw_8);
+    fprintf(parameterfile, "g_beta = %f , g_kappa= %f, c_sw = %f \n",g_beta,g_kappa,g_c_sw);
     fprintf(parameterfile, "boundary of fermion fields (t,x,y,z): %f %f %f %f \n",X0,X1,X2,X3);
     fprintf(parameterfile, "EPS_SQ0=%e, EPS_SQ1=%e EPS_SQ2=%e, EPS_SQ3=%e \n"
 	    ,EPS_SQ0,EPS_SQ1,EPS_SQ2,EPS_SQ3);
@@ -410,13 +410,13 @@ int main(int argc,char *argv[]) {
 	random_gauge_field();
       }
       rlxd_get(rlxd_state);
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Send(&rlxd_state[0], 105, MPI_INT, 1, 99, MPI_COMM_WORLD);
       MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_nproc-1, 99, MPI_COMM_WORLD, &status);
       rlxd_reset(rlxd_state);
 #endif
     }
-#ifdef MPI
+#ifdef TM_USE_MPI
     else {
       MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_proc_id-1, 99, MPI_COMM_WORLD, &status);
       rlxd_reset(rlxd_state);
@@ -448,7 +448,7 @@ int main(int argc,char *argv[]) {
   }
 
   /*For parallelization: exchange the gaugefield */
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_gauge(g_gauge_field);
 #endif
 #ifdef _GAUGE_COPY
@@ -562,7 +562,7 @@ int main(int argc,char *argv[]) {
     fprintf(parameterfile, "Acceptance Rate was: %e Prozent\n", 100.*(double)Rate/(double)Nmeas);
     fclose(parameterfile);
   }
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Finalize();
 #endif
   free_gauge_tmp();
diff --git a/test_Dslash.c b/test_Dslash.c
index 416c351e1..5f04ef2f3 100644
--- a/test_Dslash.c
+++ b/test_Dslash.c
@@ -23,14 +23,14 @@
 #if (defined BGL && !defined BGP)
 #  include <rts.h>
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 # ifdef HAVE_LIBLEMON
 #  include <io/params.h>
 #  include <io/gauge.h>
 # endif
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 # include "init/init_openmp.h"
 #endif
@@ -92,7 +92,7 @@ int main(int argc,char *argv[])
   static double t1,t2,dt,sdt,dts,qdt,sqdt;
   double antioptaway=0.0;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   static double dt2;
 
   DUM_DERI = 6;
@@ -100,7 +100,7 @@ int main(int argc,char *argv[])
   DUM_MATRIX = DUM_SOLVER+6;
   NO_OF_SPINORFIELDS = DUM_MATRIX+2;
 
-#  ifdef OMP
+#  ifdef TM_USE_OMP
   int mpi_thread_provided;
   MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
 #  else
@@ -120,7 +120,7 @@ int main(int argc,char *argv[])
     exit(-1);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   init_openmp();
 #endif
 
@@ -162,7 +162,7 @@ int main(int argc,char *argv[])
     printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
 #  endif
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 #  ifdef _NON_BLOCKING
     printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 #  endif
@@ -247,7 +247,7 @@ int main(int argc,char *argv[])
   start_ranlux(1, 123456);
   random_gauge_field(reproduce_randomnumber_flag, g_gauge_field);
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   /*For parallelization: exchange the gaugefield */
   xchange_gauge(g_gauge_field);
 #endif
@@ -260,7 +260,7 @@ int main(int argc,char *argv[])
 	  random_spinor_field_lexic(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS);
 	}
 
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Barrier(MPI_COMM_WORLD);
 #endif
       t1 = gettime();
@@ -270,7 +270,7 @@ int main(int argc,char *argv[])
 
       t2 = gettime();
       dt=t2-t1;
-#ifdef MPI
+#ifdef TM_USE_MPI
       MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #else
       sdt = dt;
@@ -295,14 +295,14 @@ int main(int argc,char *argv[])
 #endif
 
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   free_omp_accumulators();
 #endif
   free_gauge_field();
   free_geometry_indices();
   free_spinor_field();
   free_moment_field();
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
 #endif
diff --git a/test_Dslash.input b/test_Dslash.input
deleted file mode 100644
index 4c5b942e7..000000000
--- a/test_Dslash.input
+++ /dev/null
@@ -1,7 +0,0 @@
-T=16
-L=8
-NrXProcs = 1
-NrYProcs = 1
-NrZProcs = 1
-
-OMPNumThreads = 1
diff --git a/test_DslashBSM.c b/test_DslashBSM.c
index b4e9df726..c497ddc4d 100644
--- a/test_DslashBSM.c
+++ b/test_DslashBSM.c
@@ -1,6 +1,4 @@
-/***********************************************************************
- *
- * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach,
+/* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach,
  * 2014 Mario Schroeck
  *
  * This file is part of tmLQCD.
@@ -31,21 +29,21 @@
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 # ifdef HAVE_LIBLEMON
 #	include <io/params.h>
 #	include <io/gauge.h>
 # endif
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 # include "init/init_openmp.h"
 #endif
@@ -60,18 +58,24 @@
 #include "start.h"
 #include "boundary.h"
 #include "io/gauge.h"
+#ifdef TM_USE_BSM
 #include "io/scalar.h"
+#endif
 #include "global.h"
 #include "git_hash.h"
 #include "getopt.h"
 #include "xchange/xchange.h"
 #include "init/init.h"
+#ifdef TM_USE_BSM
 #include "init/init_scalar_field.h"
 #include "init/init_bsm_2hop_lookup.h"
+#endif
 #include "test/check_geometry.h"
+#ifdef TM_USE_BSM
 #include "operator/D_psi_BSM2b.h"
 #include "operator/D_psi_BSM2m.h"
 #include "operator/M_psi.h"
+#endif
 #include "mpi_init.h"
 #include "measure_gauge_action.h"
 #include "buffers/utils.h"
@@ -102,7 +106,12 @@
 static void usage();
 static void process_args(int argc, char *argv[], char ** input_filename, char ** filename);
 static void set_default_filenames(char ** input_filename, char ** filename);
-
+#ifndef TM_USE_BSM
+int main(int argc,char *argv[]) {
+  /* placeholder entry point: only prints a notice; falling off the end of main returns 0 */
+  fputs("Works only with BSM operator switched on\n", stdout);
+}
+#else
 int main(int argc,char *argv[])
 {
   FILE *parameterfile = NULL;
@@ -135,15 +144,14 @@ int main(int argc,char *argv[])
 	static double t1,t2,dt,sdt,dts,qdt,sqdt;
 	double antioptaway=0.0;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 	static double dt2;
 
 	DUM_DERI = 6;
-	DUM_SOLVER = DUM_DERI+2;
-	DUM_MATRIX = DUM_SOLVER+6;
+	DUM_MATRIX = DUM_DERI+2;
 	NO_OF_SPINORFIELDS = DUM_MATRIX+2;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 	int mpi_thread_provided;
 	MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
 #else
@@ -170,9 +178,11 @@ int main(int argc,char *argv[])
 		printf("parameter rho_BSM set to %f\n", rho_BSM);
 		printf("parameter eta_BSM set to %f\n", eta_BSM);
 		printf("parameter  m0_BSM set to %f\n",  m0_BSM);
+		printf("parameter mu03_BSM set to %f\n", mu03_BSM);
+		printf("parameter mu01_BSM set to %f\n", mu01_BSM);
 	}
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 	init_openmp();
 #endif
 
@@ -213,7 +223,7 @@ int main(int argc,char *argv[])
 		printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
 #endif
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 	#ifdef _NON_BLOCKING
 		printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 	#endif
@@ -323,6 +333,14 @@ int main(int argc,char *argv[])
 		for( int s=0; s<numbScalarFields; s++ )
 			ranlxd(g_scalar_field[s], VOLUME);
 	}
+	else if( strcmp(scalar_input_filename, "create_trivial_scalarfield") == 0 ) {
+		for(int s=0;s<VOLUME;s++){
+			g_scalar_field[0][s]=1.0;
+			g_scalar_field[1][s]=0;
+			g_scalar_field[2][s]=0;
+			g_scalar_field[3][s]=0;
+		}
+	}
 	else {
 		sprintf(scalar_filename, "%s.%d", scalar_input_filename, nscalar);
 		if (g_cart_id == 0) {
@@ -343,7 +361,7 @@ int main(int argc,char *argv[])
 		}
 	}
 
-#ifdef MPI
+#ifdef TM_USE_MPI
     xchange_gauge(g_gauge_field);
 #endif
 
@@ -355,7 +373,7 @@ int main(int argc,char *argv[])
       fflush(stdout);
     }
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 	for( int s=0; s<numbScalarFields; s++ )
 		generic_exchange(g_scalar_field[s], sizeof(scalar));
 #endif
@@ -370,7 +388,7 @@ int main(int argc,char *argv[])
   // v
 	random_spinor_field_lexic( (spinor*)(g_bispinor_field[5]), reproduce_randomnumber_flag, RN_GAUSS);
 	random_spinor_field_lexic( (spinor*)(g_bispinor_field[5])+VOLUME, reproduce_randomnumber_flag, RN_GAUSS);
-#if defined MPI
+#if defined TM_USE_MPI
 	generic_exchange(g_bispinor_field[4], sizeof(bispinor));
 #endif
 
@@ -405,28 +423,28 @@ int main(int argc,char *argv[])
   /* now apply the operators to the same bispinor field and do various comparisons */
 
   // Marco's operator
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
 #endif
   t_MG = 0.0;
   t1 = gettime();
   D_psi_BSM2m(g_bispinor_field[0], g_bispinor_field[4]);
   t1 = gettime() - t1;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	MPI_Allreduce (&t1, &t_MG, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #else
   t_MG = t1;
 #endif
 
   // Bartek's operator
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
 #endif
   t_BK = 0.0;
   t1 = gettime();
   D_psi_BSM2b(g_bispinor_field[1], g_bispinor_field[4]);
   t1 = gettime() - t1;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	MPI_Allreduce (&t1, &t_BK, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #else
   t_BK = t1;
@@ -516,14 +534,14 @@ int main(int argc,char *argv[])
 	}
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 	free_omp_accumulators();
 #endif
 	free_gauge_field();
 	free_geometry_indices();
 	free_bispinor_field();
 	free_scalar_field();
-#ifdef MPI
+#ifdef TM_USE_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 	MPI_Finalize();
 #endif
@@ -580,4 +598,4 @@ static void set_default_filenames(char ** input_filename, char ** filename) {
     strcpy(*filename,"output");
   }
 }
-
+#endif
diff --git a/test_DslashBSM.input b/test_DslashBSM.input
index 27fa425d0..abf0afca2 100644
--- a/test_DslashBSM.input
+++ b/test_DslashBSM.input
@@ -1,6 +1,19 @@
-L=24
-T=48
+L=4
+T=8
 ompnumthreads=4
+
 GaugeFieldInFile = create_random_gaugefield
-ScalarFieldInFile = create_random_scalarfield
+ScalarFieldInFile = create_trivial_scalarfield
 
+  
+BeginOperator BSM2b
+  rho = 1.
+  m0 = 0.0000
+  mu03 =  1.00000
+  mu01 =  0.00000
+  eta = 0.5
+  solver = cg
+  solverprecision = 1e-24
+  maxsolveriterations = 15000
+  npergauge = 1
+EndOperator
diff --git a/test_DslashBSM2.c b/test_DslashBSM2.c
index 2e4eb6543..4e6ea2716 100644
--- a/test_DslashBSM2.c
+++ b/test_DslashBSM2.c
@@ -27,18 +27,18 @@
 * otherwise a simple application of Dslash on a spinor will be tested.
 *
 *******************************************************************************/
-#define TEST_INVERSION 0
+#define TEST_INVERSION 1
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 # ifdef HAVE_LIBLEMON
 #	include <io/params.h>
@@ -56,19 +56,26 @@
 #include "start.h"
 #include "boundary.h"
 #include "io/gauge.h"
+#ifdef TM_USE_BSM
 #include "io/scalar.h"
+#endif
 #include "global.h"
 #include "getopt.h"
 #include "xchange/xchange.h"
 #include "init/init.h"
+#ifdef TM_USE_BSM
 #include "init/init_scalar_field.h"
 #include "init/init_bsm_2hop_lookup.h"
+#include "buffers/utils_nogauge.h"
+#endif
 #include "test/check_geometry.h"
+#ifdef TM_USE_BSM
 #include "operator/D_psi_BSM2b.h"
 #include "operator/D_psi_BSM2m.h"
 #include "operator/D_psi_BSM2f.h"
-#include "solver/eigenvalues_krylov.h"
 #include "operator/M_psi.h"
+#endif
+#include "solver/eigenvalues_krylov.h"
 #include "mpi_init.h"
 #include "measure_gauge_action.h"
 #include "buffers/utils.h"
@@ -99,7 +106,11 @@
 static void usage();
 static void process_args(int argc, char *argv[], char ** input_filename, char ** filename);
 static void set_default_filenames(char ** input_filename, char ** filename);
-
+#ifndef TM_USE_BSM
+int main(int argc,char *argv[]){  /* placeholder entry point: only prints a notice, then returns 0 implicitly */
+  fputs("Works only with BSM operators switched on\n", stdout);
+}
+#else
 int main(int argc,char *argv[])
 {
   FILE *parameterfile = NULL;
@@ -121,7 +132,7 @@ int main(int argc,char *argv[])
 		even_odd_flag=0;
 		printf("# WARNING: even_odd_flag will be ignored (not supported here).\n");
 	}
-	int j,j_max,k,k_max = 2;
+	int j;
 //	_Complex double * drvsc;
 
 #ifdef HAVE_LIBLEMON
@@ -129,15 +140,12 @@ int main(int argc,char *argv[])
 #endif
 	int status = 0;
 
-	static double t1,t2,dt,sdt,dts,qdt,sqdt;
-	double antioptaway=0.0;
+	static double t1;
 
-#ifdef MPI
-	static double dt2;
+#ifdef TM_USE_MPI
 
 	DUM_DERI = 6;
-	DUM_SOLVER = DUM_DERI+2;
-	DUM_MATRIX = DUM_SOLVER+6;
+	DUM_MATRIX = DUM_DERI+2;
 	NO_OF_SPINORFIELDS = DUM_MATRIX+2;
 
 	MPI_Init(&argc, &argv);
@@ -165,7 +173,7 @@ int main(int argc,char *argv[])
 		printf("parameter  m0_BSM set to %f\n",  m0_BSM);
 	}
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 	init_openmp();
 #endif
 
@@ -206,7 +214,7 @@ int main(int argc,char *argv[])
 		printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
 #endif
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 	#ifdef _NON_BLOCKING
 		printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 	#endif
@@ -312,7 +320,7 @@ int main(int argc,char *argv[])
 	}
 
 	// read scalar field
-if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) {
+	if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) {
 		for( int s=0; s<numbScalarFields; s++ )
 			ranlxd(g_scalar_field[s], VOLUME);
 	}
@@ -336,7 +344,7 @@ if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) {
 		}
 	}
 
-#ifdef MPI
+#ifdef TM_USE_MPI
     xchange_gauge(g_gauge_field);
 #endif
 
@@ -351,7 +359,7 @@ if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) {
     }
    
 
-#ifdef MPI
+#ifdef TM_USE_MPI
         // printf("Starting generic exchange routines for the scalar field\n");
 	for( int s=0; s<numbScalarFields; s++ )
             generic_exchange_nogauge(g_scalar_field[s], sizeof(scalar));
@@ -359,8 +367,6 @@ if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) {
 #endif
 
 	/*initialize the bispinor fields*/
-	j_max=1;
-	sdt=0.;
   // w/
 	unit_spinor_field_lexic( (spinor*)(g_bispinor_field[4]));//, 	  reproduce_randomnumber_flag, RN_GAUSS);
 	unit_spinor_field_lexic( (spinor*)(g_bispinor_field[4])+VOLUME);//, reproduce_randomnumber_flag, RN_GAUSS);
@@ -380,10 +386,12 @@ if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) {
 		printf("\n# square norm of the source: ||w||^2 = %e\n\n", squarenorm);
 		fflush(stdout);
 	}
-
+//initialize BSM2f operator
+ init_D_psi_BSM2f();
 ////  eigmax();
   double t_F;
 	/* inversion needs to be done first because it uses loads of the g_bispinor_fields internally */
+
 #if TEST_INVERSION
   if(g_proc_id==1)
     printf("Testing inversion\n");
@@ -400,14 +408,14 @@ if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) {
   /* now apply the operators to the same bispinor field and do various comparisons */
 
   // Feri's operator
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
 #endif
   t_F = 0.0;
   t1 = gettime();
   D_psi_BSM2f(g_bispinor_field[0], g_bispinor_field[4]);
   t1 = gettime() - t1;
-#ifdef MPI
+#ifdef TM_USE_MPI
 	MPI_Allreduce (&t1, &t_F, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #else
   t_F = t1;
@@ -449,7 +457,7 @@ if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) {
   free_geometry_indices();
   free_bispinor_field();
   free_scalar_field();
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
 #endif
@@ -506,4 +514,4 @@ static void set_default_filenames(char ** input_filename, char ** filename) {
     strcpy(*filename,"output");
   }
 }
-
+#endif
diff --git a/test_DslashBSM3.c b/test_DslashBSM3.c
new file mode 100644
index 000000000..9555da4ab
--- /dev/null
+++ b/test_DslashBSM3.c
@@ -0,0 +1,802 @@
+/* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach,
+ * 2014 Mario Schroeck
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.	If not, see <http://www.gnu.org/licenses/>.
+ *
+ *******************************************************************************/
+
+/*******************************************************************************
+*
+* test program for Frezzotti-Rossi BSM toy model Dslash (D_psi_BSM3)
+* set variable TEST_INVERSION to 1 for testing the inversion,
+* otherwise a simple application of Dslash on a spinor will be tested.
+*
+*******************************************************************************/
+#define TEST_INVERSION 0
+
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <string.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+# ifdef HAVE_LIBLEMON
+#	include <io/params.h>
+#	include <io/gauge.h>
+# endif
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+# include "init/init_openmp.h"
+#endif
+#include "gettime.h"
+#include "su3.h"
+#include "linalg/scalar_prod.h"
+#include "linalg/diff.h"
+#include "su3adj.h"
+#include "ranlxd.h"
+#include "geometry_eo.h"
+#include "read_input.h"
+#include "start.h"
+#include "boundary.h"
+#include "io/gauge.h"
+#ifdef TM_USE_BSM
+#include "io/scalar.h"
+#endif
+#include "global.h"
+#include "git_hash.h"
+#include "getopt.h"
+#include "xchange/xchange.h"
+#include "init/init.h"
+#include "linalg/assign.h"
+#include "operator/D_psi.h"
+#ifdef TM_USE_BSM
+#include "init/init_scalar_field.h"
+#include "init/init_bsm_2hop_lookup.h"
+#endif
+#include "test/check_geometry.h"
+#ifdef TM_USE_BSM
+#include "operator/D_psi_BSM2b.h"
+#include "operator/D_psi_BSM3.h"
+#include "operator/D_psi_BSM3_test.h"
+#include "operator/D_psi_BSM2m.h"
+#include "operator/M_psi.h"
+#include "buffers/utils_nogauge.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
+#endif
+#include "mpi_init.h"
+#include "measure_gauge_action.h"
+#include "buffers/utils.h"
+#include "linalg/square_norm.h"
+#include "linalg/comp_decomp.h"
+#include "linalg/assign_diff_mul.h"
+#include "solver/fgmres4bispinors.h"
+#include "solver/solver.h"
+
+#ifdef PARALLELT
+#	define SLICE (LX*LY*LZ/2)
+#elif defined PARALLELXT
+#	define SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2))
+#elif defined PARALLELXYT
+#	define SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2) + (T*LX*LZ/2))
+#elif defined PARALLELXYZT
+#	define SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2) + (T*LX*LZ/2) + (T*LX*LY/2))
+#elif defined PARALLELX
+#	define SLICE ((LY*LZ*T/2))
+#elif defined PARALLELXY
+#	define SLICE ((LY*LZ*T/2) + (LX*LZ*T/2))
+#elif defined PARALLELXYZ
+#	define SLICE ((LY*LZ*T/2) + (LX*LZ*T/2) + (LX*LY*T/2))
+#endif
+
+int check_xchange();
+
+static void usage();
+static void process_args(int argc, char *argv[], char ** input_filename, char ** filename);
+static void set_default_filenames(char ** input_filename, char ** filename);
+#ifndef TM_USE_BSM
+/* Fallback entry point for builds without TM_USE_BSM: report that and finish. */
+int main(int argc,char *argv[]) {
+  fputs("Works only with BSM operator switched on\n", stdout);
+}
+#else
+/*
+ * Test driver for the BSM Dirac operator D_psi_BSM3 (compiled when TM_USE_BSM is set).
+ *
+ * Reads the input file, sets up the gauge, scalar and bispinor fields and then prints
+ * four comparisons: <v, D w> against <w, D^dagger v>, D_psi_BSM3_test against D_psi_BSM,
+ * and D_psi_BSM3 against D_psi_bispinor first without and then with the clover term.
+ * With TEST_INVERSION set, Q2_psi_BSM3 is additionally inverted with cg_her_bi.
+ *
+ * Returns 0 after all tests ran; a failing setup step ends the process through exit().
+ */
+int main(int argc,char *argv[])
+{
+  char conf_filename[50];
+  char scalar_filename[50];
+  char * input_filename = NULL;
+  char * filename = NULL;
+  double plaquette_energy;
+  int j;
+  int status = 0;
+  static double t1;
+
+#ifdef _USE_HALFSPINOR
+#undef _USE_HALFSPINOR
+  printf("# WARNING: USE_HALFSPINOR will be ignored (not supported here).\n");
+#endif
+
+  /* NOTE(review): this runs before read_input() below, which presumably can set
+   * even_odd_flag again -- confirm the override is effective. */
+  if(even_odd_flag)
+  {
+     even_odd_flag=0;
+     printf("# WARNING: even_odd_flag will be ignored (not supported here).\n");
+  }
+
+  DUM_DERI = 6;
+  DUM_MATRIX = DUM_DERI+5;
+  NO_OF_SPINORFIELDS = DUM_MATRIX+2;
+
+
+#ifdef TM_USE_MPI
+#ifdef TM_USE_OMP
+  int mpi_thread_provided;
+  MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
+#else
+  MPI_Init(&argc, &argv);
+#endif
+  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
+#else
+  g_proc_id = 0;
+#endif
+  g_rgi_C1 = 1.;
+
+  process_args(argc,argv,&input_filename,&filename);
+  set_default_filenames(&input_filename, &filename);
+
+  /* Read the input file */
+  if( (j = read_input(input_filename)) != 0) {
+    fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename);
+    exit(-1);
+  }
+
+  if(g_proc_id==0) {
+    printf("parameter rho_BSM set to %f\n", rho_BSM);
+    printf("parameter eta_BSM set to %f\n", eta_BSM);
+    printf("parameter  m0_BSM set to %f\n",  m0_BSM);
+    printf("parameter mu03_BSM set to %f\n", mu03_BSM);
+    printf("parameter mu01_BSM set to %f\n", mu01_BSM);
+  }
+
+#ifdef TM_USE_OMP
+  init_openmp();
+#endif
+
+
+  tmlqcd_mpi_init(argc, argv);
+
+
+  if(g_proc_id==0) {
+#ifdef SSE
+     printf("# The code was compiled with SSE instructions\n");
+#endif
+#ifdef SSE2
+     printf("# The code was compiled with SSE2 instructions\n");
+#endif
+#ifdef SSE3
+     printf("# The code was compiled with SSE3 instructions\n");
+#endif
+#ifdef P4
+     printf("# The code was compiled for Pentium4\n");
+#endif
+#ifdef OPTERON
+     printf("# The code was compiled for AMD Opteron\n");
+#endif
+#ifdef _GAUGE_COPY
+     printf("# The code was compiled with -D_GAUGE_COPY\n");
+#endif
+#ifdef BGL
+     printf("# The code was compiled for Blue Gene/L\n");
+#endif
+#ifdef BGP
+     printf("# The code was compiled for Blue Gene/P\n");
+#endif
+#ifdef _USE_HALFSPINOR
+     printf("# The code was compiled with -D_USE_HALFSPINOR\n");
+#endif
+#ifdef _USE_SHMEM
+     printf("# The code was compiled with -D_USE_SHMEM\n");
+#ifdef _PERSISTENT
+     printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
+#endif
+#ifdef TM_USE_BSM
+     printf("# The code was compiled for persistent for BSM operators i.e. we are using two gauge fields\n");
+#endif
+#endif
+#ifdef TM_USE_MPI
+#ifdef _NON_BLOCKING
+     printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
+#endif
+#endif
+     printf("\n");
+     fflush(stdout);
+  }
+
+#ifdef TM_USE_BSM
+
+#ifdef _GAUGE_COPY
+  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
+#else
+  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
+#endif
+  init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);
+
+
+  j = init_bispinor_field(VOLUMEPLUSRAND, 12);
+  if ( j!= 0) {
+     fprintf(stderr, "Not enough memory for bispinor fields! Aborting...\n");
+     exit(1);
+  }
+
+  j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS+1);
+  if ( j!= 0) {
+    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
+    exit(1);
+  }
+
+  int numbScalarFields = 4;
+  j = init_scalar_field(VOLUMEPLUSRAND, numbScalarFields);
+  if ( j!= 0) {
+     fprintf(stderr, "Not enough memory for scalar fields! Aborting...\n");
+     exit(1);
+  }
+
+  /* rank 0 reports the number of processes and the global and local
+   * lattice extents */
+  if(g_proc_id == 0) {
+    fprintf(stdout,"# The number of processes is %d \n",g_nproc);
+    fprintf(stdout,"# The lattice size is %d x %d x %d x %d\n",
+            (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ));
+    fprintf(stdout,"# The local lattice size is %d x %d x %d x %d\n",
+            (int)(T), (int)(LX), (int)(LY),(int) LZ);
+
+    fflush(stdout);
+  }
+
+  /* define the geometry */
+  geometry();
+
+  /* define the boundary conditions for the fermion fields */
+  /* for the actual inversion, this is done in invert.c as the operators are iterated through */
+  //
+  // For the BSM operator we don't use kappa normalisation,
+  // as a result, when twisted boundary conditions are applied this needs to be unity.
+  // In addition, unlike in the Wilson case, the hopping term comes with a plus sign.
+  // However, in boundary(), the minus sign for the Wilson case is implicitly included.
+  // We therefore use -1.0 here.
+  boundary(-1.0);
+
+  status = check_geometry();
+  if (status != 0) {
+    fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n");
+    exit(1);
+  }
+  init_dirac_halfspinor();
+  init_D_psi_BSM3();
+//#if (defined MPI && !(defined _USE_SHMEM))
+// fails, we're not using spinor fields
+//	check_xchange();
+//#endif
+
+  start_ranlux(1, 123456);
+
+  // read gauge field
+  if( strcmp(gauge_input_filename, "create_random_gaugefield") == 0 ) {
+
+    //Creating random field for the chitilde breaking part
+    random_gauge_field(reproduce_randomnumber_flag, g_gauge_field);
+
+    //Creating random field for the Wilson part
+    random_gauge_field(reproduce_randomnumber_flag, g_smeared_gauge_field);
+
+  }
+  else {
+    snprintf(conf_filename, 50, "%s.%.4d", gauge_input_filename, nstore);
+    if (g_cart_id == 0) {
+      printf("#\n# Trying to read gauge field from file %s in %s precision.\n",
+             conf_filename, (gauge_precision_read_flag == 32 ? "single" : "double"));
+      fflush(stdout);
+    }
+
+    int i;
+    if( (i = read_gauge_field(conf_filename,g_gauge_field)) !=0) {
+      fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename);
+      exit(-2);
+    }
+
+    snprintf(conf_filename, 50, "%s_smeared.%.4d", gauge_input_filename, nstore);
+    if (g_cart_id == 0) {
+      printf("#\n# Trying to read smeared gauge field from file %s in %s precision.\n",
+             conf_filename, (gauge_precision_read_flag == 32 ? "single" : "double"));
+      fflush(stdout);
+    }
+
+    if( (i = read_gauge_field(conf_filename,g_smeared_gauge_field)) !=0) {
+      fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename);
+      exit(-2);
+    }
+
+
+    if (g_cart_id == 0) {
+      printf("# Finished reading gauge field.\n");
+      fflush(stdout);
+    }
+  }
+
+  // read scalar field
+  if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) {
+    for( int s=0; s<numbScalarFields; s++ )
+      ranlxd(g_scalar_field[s], VOLUME);
+  }
+  else if( strcmp(scalar_input_filename, "create_trivial_scalarfield") == 0 ) {
+    for(int s=0;s<VOLUME;s++){
+      g_scalar_field[0][s]=1.0;
+      g_scalar_field[1][s]=0;
+      g_scalar_field[2][s]=0;
+      g_scalar_field[3][s]=0;
+    }
+  }
+  else {
+    snprintf(scalar_filename, sizeof scalar_filename, "%s.%d", scalar_input_filename, nscalar);
+    if (g_cart_id == 0) {
+      printf("#\n# Trying to read scalar field from file %s in %s precision.\n",
+             scalar_filename, (scalar_precision_read_flag == 32 ? "single" : "double"));
+      fflush(stdout);
+    }
+
+    int i;
+    if( (i = read_scalar_field_parallel(scalar_filename,g_scalar_field)) !=0) {
+      fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename);
+      exit(-2);
+    }
+
+    if (g_cart_id == 0) {
+      printf("# Finished reading scalar field.\n");
+      fflush(stdout);
+    }
+  }
+
+#ifdef TM_USE_MPI
+  xchange_gauge(g_gauge_field);
+  xchange_gauge(g_smeared_gauge_field);
+#endif
+
+  /*compute the energy of the gauge field*/
+  plaquette_energy = measure_plaquette( (const su3**) g_gauge_field);
+
+  if (g_cart_id == 0) {
+    printf("# The computed plaquette for the unsmeared gauge field value is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc));
+    fflush(stdout);
+  }
+
+  plaquette_energy = measure_plaquette( (const su3**) g_smeared_gauge_field);
+
+  if (g_cart_id == 0) {
+    printf("# The computed plaquette for the  smeared gauge field value is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc));
+    fflush(stdout);
+  }
+
+
+#if defined TM_USE_MPI
+  for( int s=0; s<numbScalarFields; s++ )
+    generic_exchange_nogauge(g_scalar_field[s], sizeof(scalar));
+#endif
+
+  if (g_cart_id == 0) {
+    printf("# Scalar send is done\n");
+    fflush(stdout);
+  }
+
+  /*initialize the bispinor fields*/
+  /* field 4 is the source w: both spinor halves are filled by
+   * random_spinor_field_lexic with RN_GAUSS */
+  // w
+  random_spinor_field_lexic( (spinor*)(g_bispinor_field[4]), reproduce_randomnumber_flag, RN_GAUSS);
+  random_spinor_field_lexic( (spinor*)(g_bispinor_field[4])+VOLUME, reproduce_randomnumber_flag, RN_GAUSS);
+
+  if (g_cart_id == 0) {
+   printf("# Random fields for fermion generated\n");
+   fflush(stdout);
+  }
+
+  // for the D^\dagger test:
+  random_spinor_field_lexic( (spinor*)(g_bispinor_field[5])+VOLUME, reproduce_randomnumber_flag, RN_GAUSS);
+#if defined TM_USE_MPI
+  generic_exchange(g_bispinor_field[4], sizeof(bispinor));
+  generic_exchange(g_bispinor_field[5], sizeof(bispinor));
+#endif
+
+  double squarenorm_w;
+
+  init_sw_fields(VOLUME);
+/*************************************************************************************************************
+ *
+ *
+ *           Testing for the correct implementation of dagger of the Dslash
+ *
+ ************************************************************************************************************/
+
+  printf("# [tmlqcd-BSM test] First we test the correct implementation of the dagger of D_psi_BSM3\n");
+
+  double t_FP;
+
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  t_FP = 0.0;
+  t1 = gettime();
+  D_psi_BSM3(g_bispinor_field[2], g_bispinor_field[4]);
+  t1 = gettime() - t1;
+#ifdef TM_USE_MPI
+  MPI_Allreduce (&t1, &t_FP, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+  t_FP = t1;
+#endif
+
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  t_FP = 0.0;
+  t1 = gettime();
+  D_psi_dagger_BSM3(g_bispinor_field[1], g_bispinor_field[5]);
+  t1 = gettime() - t1;
+#ifdef TM_USE_MPI
+  MPI_Allreduce (&t1, &t_FP, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+  t_FP = t1;
+#endif
+
+  _Complex double prod2_FP_vw  = scalar_prod((spinor*)g_bispinor_field[5], (spinor*)g_bispinor_field[2], 2*VOLUME, 1);
+  _Complex double prod2_FP_wdv = scalar_prod((spinor*)g_bispinor_field[4], (spinor*)g_bispinor_field[1], 2*VOLUME, 1);
+
+  if( g_proc_id == 0 ){
+          printf("# [tmlqcd-BSM test] < v, D_FP w > = %.16e + I*(%.16e)\n", creal(prod2_FP_vw), cimag(prod2_FP_vw));
+          printf("# [tmlqcd-BSM test] < w, D_FP^dagger v > = %.16e + I*(%.16e)\n\n", creal(prod2_FP_wdv), cimag(prod2_FP_wdv));
+
+  }
+
+#if TEST_INVERSION
+  if(g_proc_id==0)
+    printf("Testing inversion\n");
+  // Feri's operator
+  assign_add_mul((spinor*)g_bispinor_field[10], (spinor*)g_bispinor_field[5], 1.0, 2*VOLUME);
+  printf("Starting field %e\n",creal(g_bispinor_field[5][0].sp_up.s0.c0));
+  printf("Starting field %e\n",creal(g_bispinor_field[10][0].sp_up.s0.c0));
+  t1 = gettime();
+        cg_her_bi(g_bispinor_field[2], g_bispinor_field[5],
+           25000, 1.0e-14, 0, VOLUME, &Q2_psi_BSM3);
+  t_FP = gettime() - t1;
+
+  if(g_proc_id==0)
+    printf("Operator inversion time: t_FP = %f sec\n\n", t_FP);
+
+
+  Q2_psi_BSM3(g_bispinor_field[9], g_bispinor_field[2]);
+  printf("Starting field %e\n",creal(g_bispinor_field[10][0].sp_up.s0.c0));
+  assign_diff_mul((spinor*)g_bispinor_field[9], (spinor*)g_bispinor_field[10], 1.0, 2*VOLUME);
+
+  double squarenorm_FP = square_norm((spinor*)g_bispinor_field[9], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+     printf("# ||Q2_FP*(Q2_FP)^-1*(b)-b||^2 = %.16e\n\n", squarenorm_FP);
+     fflush(stdout);
+  }
+#endif
+
+  eta_BSM=0.5;
+  rho_BSM=0.5;
+
+  csw_BSM=0;  /* Also original BSM operator is without the clover term */
+  r0_BSM=0;
+  //sw_term( (const su3**) g_smeared_gauge_field, 1, csw_BSM);
+
+  random_spinor_field_lexic( (spinor*)(g_bispinor_field[4]), reproduce_randomnumber_flag, RN_GAUSS);
+  random_spinor_field_lexic( (spinor*)(g_bispinor_field[4])+VOLUME, reproduce_randomnumber_flag, RN_GAUSS);
+
+
+#if defined TM_USE_MPI
+  generic_exchange(g_bispinor_field[4], sizeof(bispinor));
+#endif
+/**************************************************************************************************************************
+ *
+ * Testing the new BSM operator with respect to the old one (r0_BSM=0)
+ *
+ * ************************************************************************************************************************/
+  printf("# [tmlqcd-BSM test] Testing the new BSM operator with respect to the old one at r0_BSM=0\n");
+
+ // print L2-norm of w source:
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[4], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] input source vector for application D_BSM3: square norm of the source: ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+  // Feri's operator
+
+  printf("# [tmlqcd-BSM test] Application D_psi_BSM3 operator\n");
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  t_FP = 0.0;
+  t1 = gettime();
+  printf("r0_BSM = %e\n",r0_BSM);
+  D_psi_BSM3_test(g_bispinor_field[2], g_bispinor_field[4]);
+  t1 = gettime() - t1;
+#ifdef TM_USE_MPI
+  MPI_Allreduce (&t1, &t_FP, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+  t_FP = t1;
+#endif
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[2], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] square norm of the multiplication results with D_BSM3: ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[4], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] square norm of the source for D_psi_BSM : ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+  printf("# [tmlqcd-BSM test] Application of D_psi_BSM  ( Carsten' operator )\n");
+  t_FP = 0.0;
+  t1 = gettime();
+  D_psi_BSM(g_bispinor_field[5], g_bispinor_field[4]);
+  t1 = gettime() - t1;
+#ifdef TM_USE_MPI
+  MPI_Allreduce (&t1, &t_FP, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+  t_FP = t1;
+#endif
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[5], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] square norm of the multiplication results with D_psi_BSM:  ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+  assign_diff_mul((spinor *)g_bispinor_field[5], (spinor *)g_bispinor_field[2],1.0,  2*VOLUME );
+
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[5], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] square norm of the difference D_psi_BSM - D_psi_BSM3:  ||w||^2 = %e\n\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+/***********************************************************************************************************
+ *
+ * Testing D_psi_BSM3 Dirac operator gives the same results as the usual
+ * wilson operator at rho=eta=0,csw=0
+ *
+ *
+ *********************************************************************************************************/
+ // print L2-norm of w source:
+  bispinor_assign(g_bispinor_field[5],g_bispinor_field[4], VOLUME);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] Test the compatibility with the Wilson operator implemented, BSM parameters zero(mu,rho,eta), r0_BSM=1\n");
+   fflush(stdout);
+  }
+
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[4], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] input source vector for application D_BSM3: square norm of the source: ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+  // Feri's operator
+  csw_BSM=0;
+  m0_BSM=0;
+  mu03_BSM=0;
+  eta_BSM=0.;
+  rho_BSM=0;
+
+  printf("# [tmlqcd-BSM test] csw_BSM %e m0_BSM %e mu03_BSM %e eta_BSM %e rho_BSM %e\n", csw_BSM, m0_BSM, mu03_BSM, eta_BSM, rho_BSM);
+  g_mu=0;
+  g_c_sw=0;
+  g_kappa=1;
+
+  printf("# [tmlqcd-BSM test] g_mu %e g_c_sw %e g_kappa %e\n", g_mu,g_c_sw,g_kappa);
+  printf("# [tmlqcd-BSM test] application of D_psi_BSM3 operator\n");
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  t_FP = 0.0;
+  t1 = gettime();
+  D_psi_BSM3(g_bispinor_field[2], g_bispinor_field[4]);
+  t1 = gettime() - t1;
+#ifdef TM_USE_MPI
+  MPI_Allreduce (&t1, &t_FP, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+  t_FP = t1;
+#endif
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[2], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] square norm of the multiplication results with D_BSM3: ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[4], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] input source vector for application D_psi: square norm of the source: ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+
+  printf("# [tmlqcd-BSM test] Application of D_psi ( Carsten' operator ) \n");
+  D_psi_bispinor((bispinor *)(g_bispinor_field[5]), (bispinor *)(g_bispinor_field[4]));
+
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[5], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] square norm of the multiplication results with D_psi_bispinor:  ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+  assign_diff_mul((spinor *)g_bispinor_field[5], (spinor *)g_bispinor_field[2],1.0,  2*VOLUME );
+
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[5], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] square norm of the difference D_psi - D_psi_BSM3:  ||w||^2 = %e\n\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+/******************************************************************************************************************************
+ *
+ *
+ *  Test the compatibility with the Clover Wilson operator implemented, BSM parameters zero(mu,rho,eta), r0_BSM=1,csw=1,kappa=1
+ *
+ *
+ ******************************************************************************************************************************/
+
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] Test the compatibility with the Clover Wilson operator implemented, BSM parameters zero(mu,rho,eta), r0_BSM=1,csw=1,kappa=1\n");
+   fflush(stdout);
+  }
+
+  csw_BSM=1;  /* Also the new BSM operator is with the clover term */
+  sw_term( (const su3**) g_smeared_gauge_field, 1., csw_BSM/2.);
+  bispinor_assign(g_bispinor_field[5],g_bispinor_field[4], VOLUME);
+
+  printf("# [tmlqcd-BSM test] application of D_psi_BSM3 operator\n");
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  t_FP = 0.0;
+  t1 = gettime();
+  D_psi_BSM3(g_bispinor_field[2], g_bispinor_field[4]);
+  t1 = gettime() - t1;
+#ifdef TM_USE_MPI
+  MPI_Allreduce (&t1, &t_FP, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+  t_FP = t1;
+#endif
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[2], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] square norm of the multiplication results with D_BSM3: ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+  g_c_sw=1;
+  g_kappa=1;
+  sw_term( (const su3**) g_gauge_field, 1., g_c_sw/2);
+
+  printf("# [tmlqcd-BSM test] Application of D_psi ( Carsten' operator ) \n");
+
+  D_psi_bispinor((bispinor *)(g_bispinor_field[3]), (bispinor *)(g_bispinor_field[4]));
+
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[3], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] square norm of the multiplication results with D_psi_bispinor:  ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+  assign_diff_mul((spinor *)g_bispinor_field[3], (spinor *)g_bispinor_field[2],1.0,  2*VOLUME );
+
+  squarenorm_w = square_norm((spinor*)g_bispinor_field[3], 2*VOLUME, 1);
+  if(g_proc_id==0) {
+   printf("# [tmlqcd-BSM test] square norm of the difference D_psi - D_psi_BSM3:  ||w||^2 = %e\n", squarenorm_w);
+   fflush(stdout);
+  }
+
+  free_D_psi_BSM3();
+  free_bispinor_field();
+  free_scalar_field();
+
+  free_gauge_field();
+  free_geometry_indices();
+
+#endif /* TM_USE_BSM */
+
+
+#ifdef TM_USE_OMP
+  free_omp_accumulators();
+#endif
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Finalize();
+#endif
+  return(0);
+}
+
+
+/* Print the command-line synopsis to stdout and terminate with status 0. */
+static void usage() {
+  fputs("Options: [-f input-filename]\n", stdout);
+  exit(0);
+}
+
+/* Parse the command line. -f / -o copy the input / output file name into a fresh
+ * zero-filled 200-byte buffer (at most 199 bytes, so it stays NUL-terminated);
+ * -v sets verbose, -V exits, -h/-?/unknown: rank 0 prints the usage and exits. */
+static void process_args(int argc, char *argv[], char ** input_filename, char ** filename) {
+  int c;
+  while ((c = getopt(argc, argv, "h?vVf:o:")) != -1) {
+    switch (c) {
+      case 'f':
+        *input_filename = calloc(200, sizeof(char));
+        strncpy(*input_filename, optarg, 199);
+        break;
+      case 'o':
+        *filename = calloc(200, sizeof(char));
+        strncpy(*filename, optarg, 199);
+        break;
+      case 'v':
+        verbose = 1;
+        break;
+      case 'V':
+        exit(0);
+        break;
+      case 'h':
+      case '?':
+      default:
+        if( g_proc_id == 0 ) {
+          usage();
+        }
+        break;
+    }
+  }
+}
+
+/* Supply the default names "invert.input" and "output" for any name still unset. */
+static void set_default_filenames(char ** input_filename, char ** filename) {
+  if( NULL == *input_filename ) {
+    *input_filename = calloc(sizeof "invert.input", sizeof(char));
+    memcpy(*input_filename, "invert.input", sizeof "invert.input");
+  }
+  if( NULL == *filename ) {
+    *filename = calloc(sizeof "output", sizeof(char));
+    memcpy(*filename, "output", sizeof "output");
+  }
+}
+#endif
+
diff --git a/test_lemon.c b/test_lemon.c
index 1a50a6243..636f4333e 100644
--- a/test_lemon.c
+++ b/test_lemon.c
@@ -25,7 +25,7 @@
 
 #include <lime.h>
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -35,7 +35,7 @@
 #if (defined BGL && !defined BGP)
 #  include <rts.h>
 #endif
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "su3.h"
@@ -61,7 +61,7 @@ int main(int argc,char *argv[]) {
   paramsXlfInfo *xlfInfo;
   
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   
   MPI_Init(&argc, &argv);
 #endif
@@ -99,7 +99,7 @@ int main(int argc,char *argv[]) {
   start_ranlux(1, 123456);
   random_gauge_field(reproduce_randomnumber_flag, g_gauge_field);
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   /*For parallelization: exchange the gaugefield */
   xchange_gauge(g_gauge_field);
 #endif
@@ -165,7 +165,7 @@ int main(int argc,char *argv[]) {
   }
 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Finalize();
 #endif
   free_gauge_field();
diff --git a/tests/test_buffers.c b/tests/test_buffers.c
index 98ac13c52..aea3b7d2e 100644
--- a/tests/test_buffers.c
+++ b/tests/test_buffers.c
@@ -1,8 +1,8 @@
 
 #include <global.h>
-#include <config.h>
+#include <tmlqcd_config.h>
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 
@@ -14,7 +14,7 @@ TEST_SUITES {
 };
 
 int main(int argc,char *argv[]){
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Init(&argc, &argv);
   MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
 #else
@@ -24,7 +24,7 @@ int main(int argc,char *argv[]){
   CU_SET_OUT_PREFIX("regressions/");
   CU_RUN(argc,argv);
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Finalize();
 #endif
 
diff --git a/tests/test_buffers_gauge.c b/tests/test_buffers_gauge.c
index 393bf7c12..f7062afae 100644
--- a/tests/test_buffers_gauge.c
+++ b/tests/test_buffers_gauge.c
@@ -1,4 +1,4 @@
-#include <config.h>
+#include <tmlqcd_config.h>
 #include <global.h>
 
 #include <cu/cu.h>
diff --git a/tests/test_clover_six_invert.c b/tests/test_clover_six_invert.c
index fa95c8ef6..d53f89739 100644
--- a/tests/test_clover_six_invert.c
+++ b/tests/test_clover_six_invert.c
@@ -1,5 +1,5 @@
 #include <stdio.h>
-#include <config.h>
+#include <tmlqcd_config.h>
 #include <complex.h>
 #include <cu/cu.h>
 
diff --git a/tests/test_linalg.c b/tests/test_linalg.c
index 51ceb0cf1..c5f5cad8a 100644
--- a/tests/test_linalg.c
+++ b/tests/test_linalg.c
@@ -1,6 +1,6 @@
 
 #if HAVE_CONFIG_H
-#include<config.h>
+#include<tmlqcd_config.h>
 #endif
 #include "../global.h"
 #include "test_linalg_spinor.h"
diff --git a/tests/test_linalg_spinor.c b/tests/test_linalg_spinor.c
index fd8df823d..5b5cc6758 100644
--- a/tests/test_linalg_spinor.c
+++ b/tests/test_linalg_spinor.c
@@ -1,9 +1,9 @@
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
-#include <config.h>
+#include <tmlqcd_config.h>
 #include <complex.h>
 #include <time.h>
 #include <cu/cu.h>
diff --git a/tests/test_qpx_algebra.c b/tests/test_qpx_algebra.c
index 7de1685af..85934a77f 100644
--- a/tests/test_qpx_algebra.c
+++ b/tests/test_qpx_algebra.c
@@ -1,5 +1,5 @@
 #include <stdio.h>
-#include <config.h>
+#include <tmlqcd_config.h>
 #include <complex.h>
 #include <cu/cu.h>
 #if (defined SSE || defined SSE2 || defined SSE3)
diff --git a/tests/test_rat.c b/tests/test_rat.c
index 754ef0a2d..f86c3a0b1 100644
--- a/tests/test_rat.c
+++ b/tests/test_rat.c
@@ -1,6 +1,6 @@
 
 #if HAVE_CONFIG_H
-#include<config.h>
+#include<tmlqcd_config.h>
 #endif
 #include "../global.h"
 #include "test_rat_init.h"
diff --git a/tests/test_rat_init.c b/tests/test_rat_init.c
index fa6e8e0b1..bad051c81 100644
--- a/tests/test_rat_init.c
+++ b/tests/test_rat_init.c
@@ -1,6 +1,6 @@
 #include <stdlib.h>
 #include <stdio.h>
-#include <config.h>
+#include <tmlqcd_config.h>
 #include <complex.h>
 #include <cu/cu.h>
 #include <complex.h>
diff --git a/tests/test_su3_algebra.c b/tests/test_su3_algebra.c
index 25265bcb8..3090f5229 100644
--- a/tests/test_su3_algebra.c
+++ b/tests/test_su3_algebra.c
@@ -1,5 +1,5 @@
 #include <stdio.h>
-#include <config.h>
+#include <tmlqcd_config.h>
 #include <complex.h>
 #include <cu/cu.h>
 #if (defined SSE || defined SSE2 || defined SSE3)
diff --git a/tm_debug_printf.c b/tm_debug_printf.c
new file mode 100644
index 000000000..7bebd8493
--- /dev/null
+++ b/tm_debug_printf.c
@@ -0,0 +1,39 @@
+/***********************************************************************
+ *  
+ * Copyright (C) 2018 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#include "global.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+/* printf-like output that is produced only when g_proc_id == proc_id and
+ * g_debug_level >= dbg_level_threshold; otherwise nothing is printed. */
+void tm_debug_printf(const int proc_id, const int dbg_level_threshold,
+                     const char * format, ...)
+{
+  if( g_proc_id != proc_id || g_debug_level < dbg_level_threshold ) return;
+
+  va_list args;
+  va_start(args, format);
+  vprintf(format, args);
+  va_end(args);
+}
+
diff --git a/tm_debug_printf.h b/tm_debug_printf.h
new file mode 100644
index 000000000..862a99948
--- /dev/null
+++ b/tm_debug_printf.h
@@ -0,0 +1,34 @@
+/***********************************************************************
+ *  
+ * Copyright (C) 2018 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef TM_DEBUG_PRINTF_H
+#define TM_DEBUG_PRINTF_H
+
+/* printf-style debug output, emitted only by the MPI task whose g_proc_id
+ * equals proc_id, and only when g_debug_level >= dbg_level_threshold.
+ * To get (unordered) output from every MPI task, pass g_proc_id as proc_id.
+ * format and the variadic arguments are forwarded to vprintf. */
+
+void tm_debug_printf(const int proc_id,
+                     const int dbg_level_threshold,
+                     const char * format,
+                     ...);
+
+#endif
diff --git a/travis-ci.sh b/travis-ci.sh
new file mode 100755
index 000000000..9fd73afee
--- /dev/null
+++ b/travis-ci.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+########################################################################
+#
+# Copyright (C) 2017 Martin Ueding <dev@martin-ueding.de>
+#
+# This file is part of tmLQCD.
+#
+# tmLQCD is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# 
+# tmLQCD is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+########################################################################
+
+# Compiles and tests tmLQCD on the [Travis CI](https://travis-ci.org/) infrastructure.
+
+# Abort on errors and on unset variables; echo every command into the CI log.
+set -eux
+
+# Compile C-LIME.
+pushd ..
+git clone https://github.com/usqcd-software/c-lime.git
+pushd c-lime
+./autogen.sh
+./configure
+make -j "$(nproc)"
+popd
+popd
+
+# Create a placeholder git_hash.h when the file does not exist.
+if ! [[ -f git_hash.h ]]; then
+    echo "#ifndef _GIT_HASH_H" > git_hash.h
+    echo "#define _GIT_HASH_H" >> git_hash.h
+    echo "const char git_hash[] = {\"travisbuild\"};" >> git_hash.h
+    echo "#endif /* _GIT_HASH_H */" >> git_hash.h
+fi
+
+# Install build dependencies, then configure and compile tmLQCD.
+sudo apt-get update
+sudo apt-get install -y flex libblas-dev liblapack-dev gfortran
+
+autoconf
+
+./configure \
+    --disable-mpi \
+    --with-lapack='-llapack -lblas' \
+    --with-limedir="$PWD/../c-lime" \
+    CC=/usr/bin/gcc \
+    CXX=/usr/bin/g++ \
+    CFLAGS='-O2 --std=c99 -fopenmp -g -fPIC' \
+    CXXFLAGS='-O2 --std=c++11 -fopenmp -g -fPIC' \
+    LIBS='-fopenmp' \
+    || ( echo; echo '###############################################################################'; echo '#                                Configure Log                                #'; echo '###############################################################################'; echo; set -x; cat config.log; exit 1)
+
+make -j "$(nproc)"
+
+# Run some tests.
+cp sample-input/sample-hmc0.input travis.input
+sed -i 's/Measurements = 1000/Measurements = 1/' travis.input
+./hmc_tm -f travis.input
diff --git a/update_backward_gauge.c b/update_backward_gauge.c
index 68e50334b..3028d4571 100644
--- a/update_backward_gauge.c
+++ b/update_backward_gauge.c
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include "global.h"
@@ -28,14 +28,14 @@
 
 #if defined _USE_HALFSPINOR
 void update_backward_gauge(su3 ** const gf) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   int ix=0, kb=0, iy=0;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif 
   for(ix = 0; ix < VOLUME/2; ix++) {
@@ -59,7 +59,7 @@ void update_backward_gauge(su3 ** const gf) {
     _su3_assign(g_gauge_field_copy[1][ix][3], gf[kb][3]);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
@@ -67,17 +67,69 @@ void update_backward_gauge(su3 ** const gf) {
   return;
 }
 
+/* Refresh the 32-bit backward gauge copy g_gauge_field_copy_32 from gf
+ * (half-spinor layout) and clear g_update_gauge_copy_32.
+ * For every ix in [0, VOLUME/2) and direction mu, slot [0][ix][mu] receives
+ * gf[g_idn[g_eo2lexic[iy]][mu]][mu] and slot [1][ix][mu] the same with ix for iy.
+ * Orphaned variant: it contains OpenMP worksharing constructs but opens no
+ * parallel region itself. */
+void update_backward_gauge_32_orphaned(su3_32 ** const gf) {
+  int ix = 0;
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(ix = 0; ix < VOLUME/2; ix++) {
+    /* ix addresses the [1] slot directly; the [0] slot is filled from the
+     * site with index iy = (VOLUME+RAND)/2+ix */
+    const int iy = (VOLUME+RAND)/2+ix;
+    const int lex_iy = g_eo2lexic[iy];
+    const int lex_ix = g_eo2lexic[ix];
+
+    for(int mu = 0; mu < 4; mu++) {
+      /* direction-mu link stored at the g_idn neighbour of each site */
+      const int kb_iy = g_idn[lex_iy][mu];
+      const int kb_ix = g_idn[lex_ix][mu];
+      _su3_assign(g_gauge_field_copy_32[0][ix][mu], gf[kb_iy][mu]);
+      _su3_assign(g_gauge_field_copy_32[1][ix][mu], gf[kb_ix][mu]);
+    }
+  }
+
+  /* One thread clears the global flag; the barrier implied at the end of
+   * the single construct holds every thread until that is done. */
+#ifdef TM_USE_OMP
+#pragma omp single
+  {
+#endif
+    g_update_gauge_copy_32 = 0;
+#ifdef TM_USE_OMP
+  }
+#endif
+}
+
+/* Stand-alone driver for the 32-bit backward gauge copy update.
+ * When TM_USE_OMP is defined the call below becomes the structured block
+ * of a parallel region, so every thread of the team enters the orphaned
+ * worker and takes part in its worksharing loop; otherwise it is a plain
+ * serial call. */
+void update_backward_gauge_32(su3_32 ** const gf) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+#endif
+  update_backward_gauge_32_orphaned(gf);
+}
+
 #elif _USE_TSPLITPAR 
 
 void update_backward_gauge(su3 ** const gf) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   int ix=0, kb=0, kb2=0;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(ix = 0; ix < VOLUME/2;ix++) {
@@ -98,7 +150,7 @@ void update_backward_gauge(su3 ** const gf) {
     kb=g_idn[g_eo2lexic[ix]][3];
     _su3_assign(g_gauge_field_copys[ix][5],gf[kb][3]);
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(ix = (VOLUME+RAND)/2; ix < (VOLUME+RAND)/2+VOLUME/2;ix++) {
@@ -120,7 +172,7 @@ void update_backward_gauge(su3 ** const gf) {
     _su3_assign(g_gauge_field_copys[ix][5],gf[kb][3]);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
@@ -131,14 +183,14 @@ void update_backward_gauge(su3 ** const gf) {
 #else
 
 void update_backward_gauge(su3 ** const gf) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
 
   int ix=0, kb=0, kb2=0;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(ix = 0; ix < VOLUME/2; ix++) {
@@ -159,7 +211,7 @@ void update_backward_gauge(su3 ** const gf) {
     kb=g_idn[g_eo2lexic[ix]][3];
     _su3_assign(g_gauge_field_copy[ix][7],gf[kb][3]);
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(ix = (VOLUME+RAND)/2; ix < (VOLUME+RAND)/2+VOLUME/2; ix++) {
@@ -181,7 +233,7 @@ void update_backward_gauge(su3 ** const gf) {
     _su3_assign(g_gauge_field_copy[ix][7],gf[kb][3]);
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
@@ -189,4 +241,72 @@ void update_backward_gauge(su3 ** const gf) {
   return;
 }
 
+/* Refresh the 32-bit gauge copy g_gauge_field_copy_32 from gf (layout used
+ * when neither _USE_HALFSPINOR nor _USE_TSPLITPAR is selected) and clear
+ * g_update_gauge_copy_32.
+ *
+ * For every treated index ix, with lex = g_eo2lexic[ix], and every direction mu:
+ *   g_gauge_field_copy_32[ix][2*mu]   = gf[lex][mu]
+ *   g_gauge_field_copy_32[ix][2*mu+1] = gf[g_idn[lex][mu]][mu]
+ * Two index ranges of length VOLUME/2 are treated, starting at 0 and at
+ * (VOLUME+RAND)/2.
+ *
+ * Orphaned variant: uses OpenMP worksharing constructs without opening a
+ * parallel region of its own. */
+void update_backward_gauge_32_orphaned(su3_32 ** const gf) {
+  int ix = 0;
+
+  /* first index range: [0, VOLUME/2) */
+#ifdef TM_USE_OMP
+#pragma omp for nowait
+#endif
+  for(ix = 0; ix < VOLUME/2; ix++) {
+    const int lex = g_eo2lexic[ix];
+    for(int mu = 0; mu < 4; mu++) {
+      /* even slot: link stored at lex; odd slot: link stored at g_idn[lex][mu] */
+      const int nb = g_idn[lex][mu];
+      _su3_assign(g_gauge_field_copy_32[ix][2*mu], gf[lex][mu]);
+      _su3_assign(g_gauge_field_copy_32[ix][2*mu+1], gf[nb][mu]);
+    }
+  }
+
+  /* second index range: [(VOLUME+RAND)/2, (VOLUME+RAND)/2 + VOLUME/2) */
+#ifdef TM_USE_OMP
+#pragma omp for nowait
+#endif
+  for(ix = (VOLUME+RAND)/2; ix < (VOLUME+RAND)/2+VOLUME/2; ix++) {
+    const int lex = g_eo2lexic[ix];
+    for(int mu = 0; mu < 4; mu++) {
+      /* even slot: link stored at lex; odd slot: link stored at g_idn[lex][mu] */
+      const int nb = g_idn[lex][mu];
+      _su3_assign(g_gauge_field_copy_32[ix][2*mu], gf[lex][mu]);
+      _su3_assign(g_gauge_field_copy_32[ix][2*mu+1], gf[nb][mu]);
+    }
+  }
+
+  /* Both loops are nowait, so threads arrive here independently. One of
+   * them clears the flag; the barrier implied at the end of the single
+   * construct is where the whole team is collected again. */
+#ifdef TM_USE_OMP
+#pragma omp single
+  {
+#endif
+    g_update_gauge_copy_32 = 0;
+#ifdef TM_USE_OMP
+  }
+#endif
+}
+
+/* Stand-alone driver for the 32-bit gauge copy update.
+ * When TM_USE_OMP is defined the call below becomes the structured block
+ * of a parallel region, so every thread of the team enters the orphaned
+ * worker and takes part in its worksharing loops; otherwise it is a plain
+ * serial call. */
+void update_backward_gauge_32(su3_32 ** const gf) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+#endif
+  update_backward_gauge_32_orphaned(gf);
+}
+
 #endif
diff --git a/update_backward_gauge.h b/update_backward_gauge.h
index 911386200..e06d5655e 100644
--- a/update_backward_gauge.h
+++ b/update_backward_gauge.h
@@ -23,5 +23,7 @@
 #include "su3.h"
 
 void update_backward_gauge(su3 ** const gf);
+void update_backward_gauge_32_orphaned(su3_32 ** const gf);
+void update_backward_gauge_32(su3_32 ** const gf);
 
 #endif
diff --git a/update_gauge.c b/update_gauge.c
index dae2a9f3c..8742494bf 100644
--- a/update_gauge.c
+++ b/update_gauge.c
@@ -22,7 +22,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -38,19 +38,24 @@
 #include "xchange/xchange.h"
 #include "hamiltonian_field.h"
 #include "update_gauge.h"
-
-
+#include "init/init_gauge_field.h"
+#ifdef DDalphaAMG
+#include "DDalphaAMG_interface.h"
+#endif
 /*******************************************************
  *
  * Updates the gauge field corresponding to the momenta
  *
  *******************************************************/
 
-
 void update_gauge(const double step, hamiltonian_field_t * const hf) {
   double atime, etime;
   atime = gettime();
-#ifdef OMP
+#ifdef DDalphaAMG
+  MG_update_gauge(step);
+#endif
+
+#ifdef TM_USE_OMP
 #define static
 #pragma omp parallel
   {
@@ -64,11 +69,11 @@ void update_gauge(const double step, hamiltonian_field_t * const hf) {
 #pragma pomp inst begin(updategauge)
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #undef static
 #endif
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(i = 0; i < VOLUME; i++) { 
@@ -80,24 +85,31 @@ void update_gauge(const double step, hamiltonian_field_t * const hf) {
       exposu3(&w,&deriv);
       restoresu3(&v,&w);
       _su3_times_su3(w, v, *z);
-      _su3_assign(*z, w);
+      restoresu3(&v,&w);
+      _su3_assign(*z, v);
     }
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP parallel closing brace */
 #endif
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   /* for parallelization */
   xchange_gauge(hf->gaugefield);
 #endif
+  
+  /*Convert to a 32 bit gauge field, after xchange*/
+  convert_32_gauge_field(g_gauge_field_32, hf->gaugefield, VOLUMEPLUSRAND + g_dbw2rand);
+  
   /*
    * The backward copy of the gauge field
    * is not updated here!
    */
   hf->update_gauge_copy = 1;
   g_update_gauge_copy = 1;
+  g_update_gauge_copy_32 = 1;
+
   etime = gettime();
   if(g_debug_level > 1 && g_proc_id == 0) {
     printf("# Time gauge update: %e s\n", etime-atime); 
diff --git a/update_momenta.c b/update_momenta.c
index e9f8bcf9e..bac54957f 100644
--- a/update_momenta.c
+++ b/update_momenta.c
@@ -20,7 +20,7 @@
  ***********************************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
@@ -42,7 +42,7 @@
 void update_momenta(int * mnllist, double step, const int no, 
 		    hamiltonian_field_t * const hf) {
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
@@ -57,11 +57,11 @@ void update_momenta(int * mnllist, double step, const int no,
     }
   }
   
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_deri(hf->derivative);
 #endif
     
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
   for(int i = 0; i < VOLUME; i++) {
diff --git a/update_momenta_fg.c b/update_momenta_fg.c
new file mode 100644
index 000000000..e291d50a5
--- /dev/null
+++ b/update_momenta_fg.c
@@ -0,0 +1,225 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017 Jacob Finkenrath
+ *               2018 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#include <lime.h>
+#include "sighandler.h"
+#include "read_input.h"
+#include "monomial/monomial.h"
+#include "init/init_gauge_fg.h"
+#include "operator/clover_leaf.h"
+
+#include "global.h"
+#include "gettime.h"
+#include "su3.h"
+#include "su3adj.h"
+#include "su3spinor.h"
+#include "expo.h"
+#include "sse.h"
+#include "xchange/xchange.h"
+#include "hamiltonian_field.h"
+#include "init/init_gauge_field.h"
+#ifdef DDalphaAMG
+#include "DDalphaAMG_interface.h"
+#endif
+
+/* Store hf->gaugefield in gauge_fg, then update every link of hf->gaugefield using step_fg * hf->derivative.
+ * static: a plain C99 "inline" definition provides no external definition, so a non-inlined call can fail to link. */
+static inline void calculate_fg(const double step_fg, hamiltonian_field_t * const hf){
+#ifdef TM_USE_OMP
+#define static
+#pragma omp parallel
+  {
+#endif
+  /* under TM_USE_OMP "static" expands to nothing here, making v, w and deriv per-thread automatics */
+  static su3 v,w;
+  su3 *z, *ztmp;
+  static su3adj deriv;
+  su3adj *Fm;
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int i = 0; i < VOLUME; i++) { 
+    for(int mu = 0; mu < 4; mu++){
+      /* Copy gauge field to be temporarily updated */
+      z = &hf->gaugefield[i][mu];
+      ztmp = &gauge_fg[i][mu];
+      _su3_assign(*ztmp,*z);  
+ 
+      /* Calculate approximated force gradient term and update temporary gauge field */
+      Fm = &hf->derivative[i][mu];
+      _zero_su3adj(deriv);
+      _su3adj_assign_const_times_su3adj(deriv, step_fg, *Fm);
+      exposu3(&w,&deriv);
+      restoresu3(&v,&w);
+      _su3_times_su3(w, v, *z);
+      restoresu3(&v,&w);
+      _su3_assign(*z, v);
+    }
+  }
+#ifdef TM_USE_OMP
+  } // OpenMP parallel section closing brace
+#undef static
+#endif
+}
+
+/* Update hf->momenta with step * hf->derivative, then restore hf->gaugefield from the copy kept in gauge_fg.
+ * static: a plain C99 "inline" definition provides no external definition, so a non-inlined call can fail to link. */
+static inline void fg_update_momenta_reset_gaugefield(const double step,
+                                                      hamiltonian_field_t * const hf){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  su3 *z, *ztmp;
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int i = 0; i < VOLUME; i++) { 
+    for(int mu = 0; mu < 4; mu++){
+      /* Update momenta (the minus comes from an extra minus in trace_lambda)
+       and restore initial gauge field */
+      _su3adj_minus_const_times_su3adj(hf->momenta[i][mu], step, hf->derivative[i][mu]);
+  
+      z = &hf->gaugefield[i][mu];
+      ztmp = &gauge_fg[i][mu];
+      _su3_assign(*z,*ztmp);
+    }
+  }
+#ifdef TM_USE_OMP
+  } // OpenMP parallel section closing brace
+#endif
+}
+
+/*******************************************************
+ * Force-gradient momentum update: the derivative of the monomials in
+ * mnllist[0..no-1] is computed, the gauge field is temporarily shifted by
+ * step_fg = -step0*step0/24 times that derivative (calculate_fg), the
+ * derivative is recomputed on the shifted field, and finally the momenta
+ * are updated with step while the original gauge field is restored.
+ *******************************************************/
+void update_momenta_fg(int * mnllist, double step, const int no,
+		       hamiltonian_field_t * const hf, double step0) {
+  double atime, etime;
+  atime = gettime();
+#ifdef DDalphaAMG
+  MG_update_gauge(0.0);
+#endif
+  if (g_exposu3_no_c == 0) init_exposu3();
+  // coefficient of the temporary gauge-field shift applied by calculate_fg
+  double step_fg=-step0*step0/24;
+  // zero hf->derivative for all VOLUMEPLUSRAND + g_dbw2rand sites
+#ifdef TM_USE_OMP
+#pragma omp parallel for
+#endif
+  for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) {
+    for(int mu=0;mu<4;mu++) {
+      _zero_su3adj(hf->derivative[i][mu]);
+    }
+  }
+
+  // calculate derivatives to estimate force gradient
+  for(int k = 0; k < no; k++) {
+    if(monomial_list[ mnllist[k] ].derivativefunction != NULL) {
+      monomial_list[ mnllist[k] ].derivativefunction(mnllist[k], hf);
+    }
+  }
+
+#ifdef TM_USE_MPI
+  xchange_deri(hf->derivative);
+#endif
+  // estimate force gradient and propagate to gauge field
+  calculate_fg(step_fg, hf);
+
+#ifdef TM_USE_MPI
+     /* for parallelization */
+     xchange_gauge(hf->gaugefield);
+#endif
+#ifdef DDalphaAMG
+     MG_update_gauge(0.0);
+#endif
+
+   /*Convert to a 32 bit gauge field, after xchange*/
+   convert_32_gauge_field(g_gauge_field_32, hf->gaugefield, VOLUMEPLUSRAND + g_dbw2rand);
+
+   /* The backward copy of gaugefield is not updated here! */
+   hf->update_gauge_copy = 1;
+   g_update_gauge_copy = 1;
+   g_update_gauge_copy_32 = 1;
+
+  // calculate forces with force-gradient updated gauge field
+#ifdef TM_USE_OMP
+#pragma omp parallel for
+#endif
+  for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) {
+    for(int mu=0;mu<4;mu++) {
+      _zero_su3adj(hf->derivative[i][mu]);
+    }
+  }
+
+  for(int k = 0; k < no; k++) {
+    if(monomial_list[ mnllist[k] ].derivativefunction != NULL) {
+      monomial_list[ mnllist[k] ].derivativefunction(mnllist[k], hf);
+    }
+  }
+
+#ifdef TM_USE_MPI
+  xchange_deri(hf->derivative);
+#endif
+  
+  // and finally update the momenta and reset the gauge field 
+  fg_update_momenta_reset_gaugefield(step, hf);
+
+#ifdef TM_USE_MPI
+  /* for parallelization */
+  xchange_gauge(hf->gaugefield);
+#endif
+#ifdef DDalphaAMG
+  MG_update_gauge(0.0);
+#endif
+
+  /*Convert to a 32 bit gauge field, after xchange*/
+  convert_32_gauge_field(g_gauge_field_32, hf->gaugefield, VOLUMEPLUSRAND + g_dbw2rand);
+  
+  /*
+   * The backward copy of the gauge field
+   * is not updated here!
+   */
+  hf->update_gauge_copy = 1;
+  g_update_gauge_copy = 1;
+  g_update_gauge_copy_32 = 1;
+
+  // NOTE(review): the message below says "gauge update" although this routine updates the momenta
+  etime = gettime();
+  if(g_debug_level > 1 && g_proc_id == 0) {
+    printf("# Time gauge update: %e s\n", etime-atime); 
+  } 
+  return;
+}
diff --git a/update_momenta_fg.h b/update_momenta_fg.h
new file mode 100644
index 000000000..27264255c
--- /dev/null
+++ b/update_momenta_fg.h
@@ -0,0 +1,26 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+#ifndef _UPDATE_MOMENTA_FG_H
+#define _UPDATE_MOMENTA_FG_H
+
+#include "hamiltonian_field.h"
+/* step scales the momentum update; step0 is the step entering the force-gradient coefficient -step0*step0/24 */
+void update_momenta_fg(int * mnllist, double step, const int no, hamiltonian_field_t * const hf, double step0);
+
+#endif
diff --git a/update_tm.c b/update_tm.c
index cc18d0538..05959f376 100644
--- a/update_tm.c
+++ b/update_tm.c
@@ -29,16 +29,16 @@
 
 #include <lime.h>
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -60,12 +60,15 @@
 #include "hamiltonian_field.h"
 #include "update_tm.h"
 #include "gettime.h"
+#ifdef DDalphaAMG
+#include "DDalphaAMG_interface.h"
+#endif
 
 extern su3 ** g_gauge_field_saved;
 
 int update_tm(double *plaquette_energy, double *rectangle_energy, 
               char * filename, const int return_check, const int acctest, 
-	      const int traj_counter) {
+              const int traj_counter) {
 
   su3 *v, *w;
   int accept, i=0, j=0, iostatus=0;
@@ -106,7 +109,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
   /* 
    *  copy the gauge field to gauge_tmp 
    */
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for private(w,v)
 #endif
   for(int ix=0;ix<VOLUME;ix++) { 
@@ -117,6 +120,10 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
     }
   }
 
+#ifdef DDalphaAMG
+  MG_reset();
+#endif
+
   /* heatbath for all monomials */
   for(i = 0; i < Integrator.no_timescales; i++) {
     for(j = 0; j < Integrator.no_mnls_per_ts[i]; j++) {
@@ -127,13 +134,13 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
   if(Integrator.monitor_forces) monitor_forces(&hf);
   /* initialize the momenta  */
   enep = random_su3adj_field(reproduce_randomnumber_flag, hf.momenta);
-
+  
   g_sloppy_precision = 1;
 
   /* run the trajectory */
   if(Integrator.n_int[Integrator.no_timescales-1] > 0) {
     Integrator.integrate[Integrator.no_timescales-1](Integrator.tau, 
-						     Integrator.no_timescales-1, 1);
+                 Integrator.no_timescales-1, 1, Integrator.tau);
   }
 
   g_sloppy_precision = 0;
@@ -164,17 +171,8 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
   /* the random number is only taken at node zero and then distributed to 
      the other sites */
   ranlxd(yy,1);
-  if(g_proc_id==0) {
-#ifdef MPI
-    for(i = 1; i < g_nproc; i++) {
-      MPI_Send(&yy[0], 1, MPI_DOUBLE, i, 31, MPI_COMM_WORLD);
-    }
-#endif
-  }
-#ifdef MPI
-  else{
-    MPI_Recv(&yy[0], 1, MPI_DOUBLE, 0, 31, MPI_COMM_WORLD, &status);
-  }
+#ifdef TM_USE_MPI
+  MPI_Bcast(&yy[0], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
 #endif
 
   /* when acctest is 0 (i.e. do not perform acceptance test), the trajectory is accepted whatever the energy difference */
@@ -190,7 +188,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
     }
     if(accept) {
       /* save gauge file to disk before performing reversibility check */
-      xlfInfo = construct_paramsXlfInfo((*plaquette_energy)/(6.*VOLUME*g_nproc), -1);
+      xlfInfo = construct_paramsXlfInfo((new_plaquette_energy)/(6.*VOLUME*g_nproc), traj_counter);
       // Should write this to temporary file first, and then check
       if(g_proc_id == 0 && g_debug_level > 0) {
         fprintf(stdout, "# Writing gauge field to file %s.\n", tmp_filename);
@@ -207,11 +205,15 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
       }
       free(xlfInfo);
     }
+
+#ifdef DDalphaAMG
+    MG_reset();
+#endif
+
     g_sloppy_precision = 1;
     /* run the trajectory back */
     Integrator.integrate[Integrator.no_timescales-1](-Integrator.tau, 
-                         Integrator.no_timescales-1, 1);
-
+                         Integrator.no_timescales-1, 1, -Integrator.tau);
     g_sloppy_precision = 0;
 
     /*   compute the energy contributions from the pseudo-fermions  */
@@ -231,13 +233,13 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
     ks = 0.;
     kc = 0.;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel private(w,v,tt,tr,ts,ds,ks,kc)
     {
     int thread_num = omp_get_thread_num();
 #endif
     su3 ALIGN v0;
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
     for(int ix = 0; ix < VOLUME; ++ix)
@@ -246,8 +248,8 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
       {
         v=&hf.gaugefield[ix][mu];
         w=&gauge_tmp[ix][mu];
-	_su3_minus_su3(v0, *v, *w);
-	_su3_square_norm(ds, v0);
+        _su3_minus_su3(v0, *v, *w);
+        _su3_square_norm(ds, v0);
 
         tr = sqrt(ds) + kc;
         ts = tr + ks;
@@ -257,7 +259,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
       }
     }
     kc=ks+kc;
-#ifdef OMP
+#ifdef TM_USE_OMP
     g_omp_acc_re[thread_num] = kc;
       
     } /* OpenMP parallel section closing brace */
@@ -269,7 +271,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
     ret_gauge_diff = kc;
 #endif
 
-#ifdef MPI
+#ifdef TM_USE_MPI
     tmp = ret_gauge_diff;
     MPI_Reduce(&tmp, &ret_gauge_diff, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
 #endif
@@ -283,8 +285,8 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
     /* Output */
     if(g_proc_id == 0) {
       ret_check_file = fopen("return_check.data","a");
-      fprintf(ret_check_file,"ddh = %1.4e ddU= %1.4e ddh/H = %1.4e\n",
-              ret_dh, ret_gauge_diff/4./((double)(VOLUME*g_nproc))/3., ret_dh/tmp);
+      fprintf(ret_check_file,"%08d ddh = %1.4e ddh/dh = %1.4e ddh/H = %1.4e ddU= %1.4e\n", traj_counter,
+              ret_dh, ret_dh/dh, ret_dh/tmp, ret_gauge_diff/4./((double)(VOLUME*g_nproc))/3.);
       fclose(ret_check_file);
     }
 
@@ -313,7 +315,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
     *rectangle_energy = new_rectangle_energy;
     /* put the links back to SU(3) group */
     if (!bc_flag) { /* periodic boundary conditions */
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for private(v)
 #endif
       for(int ix=0;ix<VOLUME;ix++) { 
@@ -325,7 +327,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
     }
   }
   else { /* reject: copy gauge_tmp to hf.gaugefield */
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel for private(w) private(v)
 #endif
     for(int ix=0;ix<VOLUME;ix++) {
@@ -335,12 +337,20 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
         _su3_assign(*v,*w);
       }
     }
+#ifdef DDalphaAMG
+    MG_reset();
+#endif
   }
   hf.update_gauge_copy = 1;
   g_update_gauge_copy = 1;
-#ifdef MPI
+  g_update_gauge_copy_32 = 1;  
+#ifdef TM_USE_MPI
   xchange_gauge(hf.gaugefield);
 #endif
+  
+  /*Convert to a 32 bit gauge field, after xchange*/
+  convert_32_gauge_field(g_gauge_field_32, hf.gaugefield, VOLUMEPLUSRAND + g_dbw2rand); 
+  
   etime=gettime();
 
   /* printing data in the .data file */
@@ -353,11 +363,11 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
     for(i = 0; i < Integrator.no_timescales; i++) {
       for(j = 0; j < Integrator.no_mnls_per_ts[i]; j++) {
         if(monomial_list[ Integrator.mnls_per_ts[i][j] ].type != GAUGE
-	   && monomial_list[ Integrator.mnls_per_ts[i][j] ].type != SFGAUGE 
-	   && monomial_list[ Integrator.mnls_per_ts[i][j] ].type != NDPOLY
-	   && monomial_list[ Integrator.mnls_per_ts[i][j] ].type != NDCLOVER
-	   && monomial_list[ Integrator.mnls_per_ts[i][j] ].type != CLOVERNDTRLOG
-	   && monomial_list[ Integrator.mnls_per_ts[i][j] ].type != CLOVERTRLOG ) {
+            && monomial_list[ Integrator.mnls_per_ts[i][j] ].type != SFGAUGE 
+            && monomial_list[ Integrator.mnls_per_ts[i][j] ].type != NDPOLY
+            && monomial_list[ Integrator.mnls_per_ts[i][j] ].type != NDCLOVER
+            && monomial_list[ Integrator.mnls_per_ts[i][j] ].type != CLOVERNDTRLOG
+            && monomial_list[ Integrator.mnls_per_ts[i][j] ].type != CLOVERTRLOG ) {
           fprintf(datafile,"%d %d ",  monomial_list[ Integrator.mnls_per_ts[i][j] ].iter0, 
                   monomial_list[ Integrator.mnls_per_ts[i][j] ].iter1);
         }
diff --git a/util/tmlqcd-indent b/util/tmlqcd-indent
new file mode 100755
index 000000000..b5dc8b818
--- /dev/null
+++ b/util/tmlqcd-indent
@@ -0,0 +1,2 @@
+#!/bin/bash
+clang-format -style=file -i "$@"  # reformat the given files in place, style read from a .clang-format file
diff --git a/wrapper/Makefile.in b/wrapper/Makefile.in
index bbff117e5..cb7ced023 100644
--- a/wrapper/Makefile.in
+++ b/wrapper/Makefile.in
@@ -58,10 +58,10 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-${libwrapper_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${libwrapper_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${OPTARGS} -c $<
 
-${libwrapper_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${libwrapper_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${SOPTARGS} -c $<
 
 # rule to make libwrapper
diff --git a/wrapper/lib_wrapper.c b/wrapper/lib_wrapper.c
old mode 100755
new mode 100644
index a4d540f91..597e5f947
--- a/wrapper/lib_wrapper.c
+++ b/wrapper/lib_wrapper.c
@@ -24,16 +24,17 @@
  *
  *******************************************************************************/
 
+#include "tmlqcd_config.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
 #include <string.h>
 #include <signal.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -41,9 +42,12 @@
 #include "getopt.h"
 #include "linalg_eo.h"
 #include "geometry_eo.h"
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include "xchange/xchange.h"
 #endif
+#ifdef TM_USE_QUDA
+#include "quda_interface.h"
+#endif
 #include <io/utils.h>
 #include <io/gauge.h>
 #include "read_input.h"
@@ -54,8 +58,10 @@
 #include "invert_eo.h"
 #include "start.h"
 #include "operator.h"
+#include "measure_gauge_action.h"
 #include "linalg/convert_eo_to_lexic.h"
 #include "include/tmLQCD.h"
+#include "fatal_error.h"
 
 #ifdef HAVE_GPU
 extern void init_mixedsolve_eo(su3** gf);
@@ -69,20 +75,25 @@ extern void finalize_gpu_fields();
 #  endif
 #endif
 
+#define CONF_FILENAME_LENGTH 500
 
 static int tmLQCD_invert_initialised = 0;
 
-int tmLQCD_invert_init(int argc, char *argv[], const int _verbose) {
+int tmLQCD_invert_init(int argc, char *argv[], const int _verbose, const int external_id) {
 
   DUM_DERI = 8;
   DUM_MATRIX = DUM_DERI + 5;
   NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
+  //6 extra fields (corresponding to DUM_MATRIX+0..5) for deg. and ND matrix mult.
+  NO_OF_SPINORFIELDS_32 = 6;
+
+  NO_OF_SPINORFIELDS_32 = 6;
 
   // in read_input.h
   verbose = _verbose;
   g_use_clover_flag = 0;
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
 #else
   g_proc_id = 0;
@@ -91,9 +102,14 @@ int tmLQCD_invert_init(int argc, char *argv[], const int _verbose) {
   /* Read the input file */
   if( (read_input("invert.input")) != 0) {
     fprintf(stderr, "tmLQCD_init_invert: Could not find input file: invert.input\nAborting...");
+    return(-1);
   }
 
-#ifdef OMP
+#ifndef TM_USE_MPI
+  if(subprocess_flag) g_external_id = external_id;
+#endif  
+
+#ifdef TM_USE_OMP
   init_openmp();
 #endif
 
@@ -103,8 +119,10 @@ int tmLQCD_invert_init(int argc, char *argv[], const int _verbose) {
 
 #ifdef _GAUGE_COPY
   int j = init_gauge_field(VOLUMEPLUSRAND, 1);
+  j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
   int j = init_gauge_field(VOLUMEPLUSRAND, 0);
+  j += init_gauge_field_32(VOLUMEPLUSRAND, 0);
 #endif
   if (j != 0) {
     fprintf(stderr, "tmLQCD_init_invert: Not enough memory for gauge_fields! Aborting...\n");
@@ -115,16 +133,28 @@ int tmLQCD_invert_init(int argc, char *argv[], const int _verbose) {
     fprintf(stderr, "tmLQCD_init_invert: Not enough memory for geometry indices! Aborting...\n");
     return(-1);
   }
-  if (even_odd_flag) {
-    j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS);
-  }
-  else {
-    j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
+  if(!lowmem_flag){
+    if (even_odd_flag) {
+      j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS);
+      j += init_spinor_field_32(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS_32);
+    }
+    else {
+      j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
+      j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32);
+    } 
+    if (j != 0) {
+      fprintf(stderr, "tmLQCD_init_invert: Not enough memory for spinor fields! Aborting...\n");
+      return(-1);
+    }
   }
-  if (j != 0) {
-    fprintf(stderr, "tmLQCD_init_invert: Not enough memory for spinor fields! Aborting...\n");
-    return(-1);
+
+  if(g_cart_id == 0) { /* only the process with g_cart_id == 0 writes the parameter file */
+    FILE *parameterfile = fopen("tmLQCD-libwrapper.para", "w"); /* NOTE(review): result is not checked for NULL */
+    write_first_messages(parameterfile, "tmLQCD lib-wrapper", git_hash);
+    fclose(parameterfile);
+  }
+
+
   // define the geometry
   geometry();
 
@@ -155,35 +185,47 @@ int tmLQCD_invert_init(int argc, char *argv[], const int _verbose) {
 
 
 #ifdef _USE_HALFSPINOR
-  j = init_dirac_halfspinor();
-  if (j != 0) {
-    fprintf(stderr, "tmLQCD_init_invert: Not enough memory for halffield! Aborting...\n");
-    return(-1);
-  }
-  if (g_sloppy_precision_flag == 1) {
+  if(!lowmem_flag){
+    j = init_dirac_halfspinor();
+    if (j != 0) {
+      fprintf(stderr, "tmLQCD_init_invert: Not enough memory for halffield! Aborting...\n");
+      return(-1);
+    }
     j = init_dirac_halfspinor32();
     if (j != 0) {
       fprintf(stderr, "tmLQCD_init_invert: Not enough memory for 32-bit halffield! Aborting...\n");
       return(-1);
     }
-  }
-#  if (defined _PERSISTENT)
-  if (even_odd_flag)
-    init_xchange_halffield();
+#    if (defined _PERSISTENT)
+    if (even_odd_flag)
+      init_xchange_halffield();
 #  endif
+  }
 #endif
-  tmLQCD_invert_initialised = 1;  
+  tmLQCD_invert_initialised = 1;
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif 
   return(0);
 }
 
 int tmLQCD_read_gauge(const int nconfig) {
-  char conf_filename[500];
+  char conf_filename[CONF_FILENAME_LENGTH];
   if(!tmLQCD_invert_initialised) {
     fprintf(stderr, "tmLQCD_read_gauge: tmLQCD_inver_init must be called first. Aborting...\n");
     return(-1);
   }
 
-  sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nconfig);
+  int n_written = snprintf(conf_filename, CONF_FILENAME_LENGTH, "%s.%.4d", gauge_input_filename, nconfig);
+  if( n_written < 0 || n_written >= CONF_FILENAME_LENGTH ){
+    char error_message[500];
+    snprintf(error_message,
+             500,
+             "Encoding error or gauge configuration filename "
+             "longer than %d characters! See wrapper/lib_wrapper.c CONF_FILENAME_LENGTH\n", 
+             CONF_FILENAME_LENGTH);
+    fatal_error(error_message, "tmLQCD_read_gauge");
+  }
   int j=0;
   if (g_cart_id == 0) {
     printf("#\n# Trying to read gauge field from file %s.\n",
@@ -198,9 +240,23 @@ int tmLQCD_read_gauge(const int nconfig) {
     printf("# Finished reading gauge field.\n");
     fflush(stdout);
   }
-#ifdef MPI
-  xchange_gauge(g_gauge_field);
+
+  // set the global nstore parameter
+  nstore = nconfig;
+
+#ifdef TM_USE_MPI
+  if(!lowmem_flag){
+    xchange_gauge(g_gauge_field);
+  }
 #endif
+  if(!lowmem_flag){
+    convert_32_gauge_field(g_gauge_field_32, g_gauge_field, VOLUMEPLUSRAND);
+  }
+
+  double plaquette = measure_plaquette( (const su3** const) g_gauge_field)/(6.*VOLUME*g_nproc);
+  if (g_cart_id == 0) {
+    printf("# The computed plaquette value is %.16e.\n", plaquette);
+  }
   return(0);
 }
 
@@ -210,6 +266,10 @@ int tmLQCD_invert(double * const propagator, double * const source,
   unsigned int index_start = 0;
   g_mu = 0.;
 
+  if(lowmem_flag && g_proc_id==0){
+    printf("!!! WARNING: you are calling tmLQCD_invert in \'lowmem\' mode.\n Did you make sure that all required fields are allocated and initialised??\n");
+  }
+
   if(!tmLQCD_invert_initialised) {
     fprintf(stderr, "tmLQCD_invert: tmLQCD_inver_init must be called first. Aborting...\n");
     return(-1);
@@ -243,7 +303,7 @@ int tmLQCD_invert(double * const propagator, double * const source,
 
 int tmLQCD_finalise() {
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   free_omp_accumulators();
 #endif
 
@@ -258,13 +318,21 @@ int tmLQCD_finalise() {
 #  endif
   }
 #endif
+
+#ifdef TM_USE_QUDA
+  _endQuda();
+#endif
   
   free_gauge_field();
   free_geometry_indices();
-  free_spinor_field();
-  free_moment_field();
-  free_chi_spinor_field();
-#ifdef MPI
+  if(!lowmem_flag){
+    free_gauge_field_32();
+    free_spinor_field();
+    free_spinor_field_32();
+    free_moment_field();
+    free_chi_spinor_field();
+  }
+#ifdef TM_USE_MPI
   MPI_Barrier(MPI_COMM_WORLD);
 #endif
   return(0);
@@ -306,20 +374,25 @@ int tmLQCD_get_mpi_params(tmLQCD_mpi_params * params) {
   params->proc_coords[1] = g_proc_coords[1];
   params->proc_coords[2] = g_proc_coords[2];
   params->proc_coords[3] = g_proc_coords[3];
-
+#ifdef TM_USE_MPI
+  params->cart_grid = g_cart_grid;
+#endif
   return(0);
 }
 
-int tmLQCD_get_gauge_field_pointer(double * gf) {
+int tmLQCD_get_gauge_field_pointer(double ** gf) {
   if(!tmLQCD_invert_initialised) {
-    fprintf(stderr, "tmLQCD_get_mpi_params: tmLQCD_inver_init must be called first. Aborting...\n");
+    fprintf(stderr, "tmLQCD_get_gauge_field_pointer: tmLQCD_invert_init must be called first. Aborting...\n");
     return(-1);
   }
-#ifdef MPI
+#ifdef TM_USE_MPI
   xchange_gauge(g_gauge_field);
 #endif
+  if(!lowmem_flag){
+    convert_32_gauge_field(g_gauge_field_32, g_gauge_field, VOLUMEPLUSRAND);
+  }
 
-  gf = (double*) g_gauge_field[0];
+  *gf = (double*) g_gauge_field[0];
 
   return(0);
 }
diff --git a/xchange/Makefile.in b/xchange/Makefile.in
index bfea6a61b..a598daba4 100644
--- a/xchange/Makefile.in
+++ b/xchange/Makefile.in
@@ -32,7 +32,7 @@ COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS}
 LIBRARIES = libxchange
 libxchange_TARGETS = xchange_deri xchange_field xchange_gauge xchange_halffield \
 	xchange_lexicfield xchange_2fields xchange_field_tslice \
-	xchange_jacobi
+	xchange_jacobi little_field_gather
 
 libxchange_STARGETS = 
 
@@ -60,10 +60,10 @@ include ${top_srcdir}/Makefile.global
 
 # rule to compile objects
 
-${libxchange_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${libxchange_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${OPTARGS} -c $<
 
-${libxchange_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h
+${libxchange_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config.h
 	$(COMPILE) ${SOPTARGS} -c $<
 
 # rule to make libxchange
diff --git a/xchange/little_field_gather.c b/xchange/little_field_gather.c
new file mode 100644
index 000000000..36457f5e5
--- /dev/null
+++ b/xchange/little_field_gather.c
@@ -0,0 +1,79 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2008 Albert Deuzeman, Siebren Reker, Carsten Urbach
+ *               2010 Claude Tadonki, Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#include "global.h"
+#include <complex.h>
+#include "block.h"
+#include "little_field_gather.h"
+
+enum{ /* direction codes: used as MPI message tags and as switch selector in little_field_gather_body.c */
+  T_UP = 0,
+  T_DN = 1,
+  X_UP = 2,
+  X_DN = 3,
+  Y_UP = 4,
+  Y_DN = 5,
+  Z_UP = 6,
+  Z_DN = 7
+}; /* no declarator: only the constants are needed, no global object is defined */
+
+
+#ifdef TM_USE_MPI
+MPI_Request lrequests[16];
+MPI_Status lstatus[16];
+int waitcount = 0;
+#endif
+
+
+#define _PSWITCH(s) s 
+#define _PTSWITCH(s) s 
+#define _C_TYPE _Complex double
+#define _MPI_C_TYPE MPI_DOUBLE_COMPLEX
+
+#include"little_field_gather_body.c"
+
+#undef _PSWITCH
+#undef _PTSWITCH
+#undef _C_TYPE
+#undef _MPI_C_TYPE
+
+#define _PSWITCH(s) s ## _32
+#define _PTSWITCH(s) s ## 32
+#define _C_TYPE _Complex float
+#define _MPI_C_TYPE MPI_COMPLEX
+
+#include"little_field_gather_body.c"
+
+#undef _PSWITCH
+#undef _PTSWITCH
+#undef _C_TYPE
+#undef _MPI_C_TYPE
diff --git a/xchange/little_field_gather.h b/xchange/little_field_gather.h
new file mode 100644
index 000000000..e31c8ebd7
--- /dev/null
+++ b/xchange/little_field_gather.h
@@ -0,0 +1,32 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2008 Albert Deuzeman, Siebren Reker, Carsten Urbach
+ *               2010 Claude Tadonki, Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _LITTLE_FIELD_GATHER_H
+#define _LITTLE_FIELD_GATHER_H
+
+
+void little_field_gather(_Complex double * w);
+void little_field_gather_eo(const int eo, _Complex double * w);
+
+void little_field_gather_32(_Complex float * w);
+void little_field_gather_eo_32(const int eo, _Complex float * w);
+
+#endif
diff --git a/xchange/little_field_gather_body.c b/xchange/little_field_gather_body.c
new file mode 100644
index 000000000..9a64b1cc7
--- /dev/null
+++ b/xchange/little_field_gather_body.c
@@ -0,0 +1,455 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2008 Albert Deuzeman, Siebren Reker, Carsten Urbach
+ *               2010 Claude Tadonki, Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+void _PSWITCH(little_field_gather)(_C_TYPE * w) {
+  int ib;
+  _C_TYPE * wt_buf=NULL, * wx_buf=NULL, * wy_buf=NULL, * wz_buf=NULL, 
+    * w_buf=NULL, * w_source=NULL, * w_dest=NULL;
+  _C_TYPE * wt=NULL, * wx=NULL, * wy=NULL, * wz=NULL;
+
+  /************************************************************************/
+  /* This routine has been extended for multi_dimensional blocking        */
+  /* by Claude Tadonki (claude.tadonki@u-psud.fr) from PetaQCD project    */
+  /* June 2010                                                            */
+  /************************************************************************/
+
+  wt = w + ( 0*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction t starts
+  wx = w + ( 1*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction x starts
+  wy = w + ( 2*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction y starts
+  wz = w + ( 3*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction z starts
+
+#ifdef TM_USE_MPI
+  int request = 0;
+  int err;
+  w_buf = calloc(8 * nb_blocks * g_N_s, sizeof(_C_TYPE)); // +-t +-x +-y +-z
+
+  wt_buf = w_buf + ( 0*(2*nb_blocks)) * g_N_s; // Where data in the direction t starts
+  wx_buf = w_buf + ( 1*(2*nb_blocks)) * g_N_s; // Where data in the direction x starts
+  wy_buf = w_buf + ( 2*(2*nb_blocks)) * g_N_s; // Where data in the direction y starts
+  wz_buf = w_buf + ( 3*(2*nb_blocks)) * g_N_s; // Where data in the direction z starts
+
+  /* We first exchange the fields regardless of block considerations                   */
+  /* The data need to be received in an intermediate buffer because of later shuffling */
+
+  if(g_nproc_t > 1) {
+    /* Send t up */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_t_up, T_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wt_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_t_dn, T_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    /* Send t down */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_t_dn, T_DN, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wt_buf, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_t_up, T_DN, g_cart_grid, &lrequests[request]);
+    request++;
+  }
+  if(g_nproc_x > 1) {
+    /* Send x up */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_x_up, X_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wx_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_x_dn, X_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    /* Send x down */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_x_dn, X_DN, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wx_buf, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_x_up, X_DN, g_cart_grid, &lrequests[request]);
+    request++;
+  }
+  if(g_nproc_y > 1) {
+    /* Send y up */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_y_up, Y_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wy_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_y_dn, Y_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    /* Send y down */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_y_dn, Y_DN, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wy_buf, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_y_up, Y_DN, g_cart_grid, &lrequests[request]);
+    request++;
+  }
+  if(g_nproc_z > 1) {
+    /* Send z up */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_z_up, Z_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wz_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_z_dn, Z_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    /* Send z down */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_z_dn, Z_DN, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wz_buf, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_z_up, Z_DN, g_cart_grid, &lrequests[request]);
+    request++;
+  }
+  err = MPI_Waitall(request, lrequests, lstatus);
+
+#endif
+  
+  /* We now correct the field according to block partitioning                */
+  /* We could have avoided the previous corresponding MPI communication      */
+  /* We proceed like this for code simplicity; may be optimized later        */
+  
+  for(int pm = 0; pm < 8; pm++) {
+    for(int bt = 0; bt < nblks_t; bt++) {
+      for(int bx = 0; bx < nblks_x; bx++) {
+        for(int by = 0; by < nblks_y; by++) {
+          for(int bz = 0; bz < nblks_z; bz++) {
+            ib = block_index(bt, bx, by, bz) * g_N_s;
+            switch(pm){ 
+            case T_UP: /* Direction +t */
+              w_dest = wt + ib;
+              if( bt == nblks_t - 1 ) {
+                ib = block_index(0, bx, by, bz) * g_N_s; 
+                if(g_nproc_t > 1) w_source = wt_buf + ib;
+                else w_source = w + ib;
+              }
+              // got it from the MPI exchange
+              else  {
+                ib = block_index(bt + 1, bx, by, bz) * g_N_s; 
+                w_source = w + ib;
+              }
+              // got it from the diagonal block
+              break; 
+            case T_DN: /* Direction -t */
+              w_dest = wt + ib + nb_blocks * g_N_s;
+              if( bt == 0 ) {
+                ib = block_index(nblks_t - 1, bx, by, bz) * g_N_s; 
+                if(g_nproc_t > 1) w_source = wt_buf + ib + nb_blocks * g_N_s;
+                else w_source = w + ib;
+              }
+              // got it from the MPI exchange
+              else  {
+                ib = block_index(bt - 1, bx, by, bz) * g_N_s;
+                w_source = w + ib;
+              }
+              // got it from the diagonal block
+              break; 
+            case X_UP: /* Direction +x */
+              w_dest = wx + ib;
+              if( bx == nblks_x - 1 ) {
+                ib = block_index(bt, 0, by, bz) * g_N_s; 
+                if(g_nproc_x > 1) w_source = wx_buf + ib;
+                else w_source = w + ib;
+              }
+              // got it from the MPI exchange
+              else  {
+                ib = block_index(bt, bx + 1, by, bz) * g_N_s; 
+                w_source = w + ib;
+              }
+              // got it from the diagonal block
+              break; 
+            case X_DN: /* Direction -x */
+              w_dest = wx + ib + nb_blocks * g_N_s;
+              if( bx == 0 ) {
+                ib = block_index(bt, nblks_x - 1, by, bz) * g_N_s; 
+                if(g_nproc_x > 1) w_source = wx_buf + ib + nb_blocks * g_N_s;
+                else w_source = w + ib;
+              }
+              // got it from the MPI exchange
+              else  {
+                ib = block_index(bt, bx - 1, by, bz) * g_N_s;
+                w_source = w + ib;
+              }
+              // got it from the diagonal block
+              break; 
+            case Y_UP: /* Direction +y */
+              w_dest = wy + ib;
+              if( by == nblks_y - 1 ) {
+                ib = block_index(bt, bx, 0, bz) * g_N_s; 
+                if(g_nproc_y > 1) w_source = wy_buf + ib;
+                else w_source = w + ib;
+              }
+              // got it from the MPI exchange
+              else  {
+                ib = block_index(bt, bx, by + 1, bz) * g_N_s; 
+                w_source = w + ib;
+              }
+              // got it from the diagonal block
+              break; 
+            case Y_DN: /* Direction -y */
+              w_dest = wy + ib + nb_blocks * g_N_s;
+              if( by == 0 ) {
+                ib = block_index(bt, bx, nblks_y - 1, bz) * g_N_s; 
+                if(g_nproc_y > 1) w_source = wy_buf + ib + nb_blocks * g_N_s;
+                else w_source = w + ib;
+              }
+              // got it from the MPI exchange
+              else  {
+                ib = block_index(bt, bx, by - 1, bz) * g_N_s;
+                w_source = w + ib;
+              }
+              // got it from the diagonal block
+              break; 
+            case Z_UP: /* Direction +z */
+              w_dest = wz + ib;
+              if( bz == nblks_z - 1 ) {
+                ib = block_index(bt, bx, by, 0) * g_N_s; 
+                if(g_nproc_z > 1) w_source = wz_buf + ib;
+                else w_source = w + ib;
+              }
+              // got it from the MPI exchange
+              else  {
+                ib = block_index(bt, bx, by, bz + 1) * g_N_s; 
+                w_source = w + ib;
+              }
+              // got it from the diagonal block
+              break; 
+            case Z_DN: /* Direction -z */
+              w_dest = wz + ib + nb_blocks * g_N_s;
+              if( bz == 0 ) {
+                ib = block_index(bt, bx, by, nblks_z - 1) * g_N_s; 
+                if(g_nproc_z > 1) w_source = wz_buf + ib + nb_blocks * g_N_s;
+                else w_source = w + ib;
+              }
+              // got it from the MPI exchange
+              else  {
+                ib = block_index(bt, bx, by, bz - 1) * g_N_s; 
+                w_source = w + ib; 
+              }
+              // got it from the diagonal block
+              break; 
+              
+            default: 
+              w_dest = NULL;
+              w_source = NULL;
+            }
+            memcpy(w_dest, w_source, g_N_s * sizeof(_C_TYPE));
+          }
+        }
+      }
+    }
+  }
+  free(w_buf);
+  
+  return;
+}
+
+void _PSWITCH(little_field_gather_eo)(const int eo, _C_TYPE * w) {
+
+  int ib, ib2;
+  _C_TYPE *wt = NULL, *wx = NULL, *wy = NULL, *wz = NULL;
+  _C_TYPE *wt_buf = NULL, *wx_buf = NULL, *wy_buf = NULL, *wz_buf = NULL, *w_buf = NULL, *w_source = NULL, *w_dest = NULL;
+
+  
+  wt = w + ( 0*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction t starts
+  wx = w + ( 1*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction x starts
+  wy = w + ( 2*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction y starts
+  wz = w + ( 3*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction z starts
+
+#ifdef TM_USE_MPI
+  int request = 0;
+  int err;
+  w_buf = calloc(8 * nb_blocks * g_N_s, sizeof(_C_TYPE)); // +-t +-x +-y +-z
+
+  wt_buf = w_buf + ( 0*(2*nb_blocks)) * g_N_s; // Where data in the direction t starts
+  wx_buf = w_buf + ( 1*(2*nb_blocks)) * g_N_s; // Where data in the direction x starts
+  wy_buf = w_buf + ( 2*(2*nb_blocks)) * g_N_s; // Where data in the direction y starts
+  wz_buf = w_buf + ( 3*(2*nb_blocks)) * g_N_s; // Where data in the direction z starts
+
+  /* We first exchange the fields regardless of block considerations                   */
+  /* The data need to be received in an intermediate buffer because of later shuffling */
+
+  if(g_nproc_t > 1) {
+    /* Send t up */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_t_up, T_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wt_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_t_dn, T_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    /* Send t down */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_t_dn, T_DN, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wt_buf, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_t_up, T_DN, g_cart_grid, &lrequests[request]);
+    request++;
+  }
+  if(g_nproc_x > 1) {
+    /* Send x up */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_x_up, X_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wx_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_x_dn, X_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    /* Send x down */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_x_dn, X_DN, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wx_buf, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_x_up, X_DN, g_cart_grid, &lrequests[request]);
+    request++;
+  }
+  if(g_nproc_y > 1) {
+    /* Send y up */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_y_up, Y_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wy_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_y_dn, Y_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    /* Send y down */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_y_dn, Y_DN, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wy_buf, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_y_up, Y_DN, g_cart_grid, &lrequests[request]);
+    request++;
+  }
+  if(g_nproc_z > 1) {
+    /* Send z up */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_z_up, Z_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wz_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_z_dn, Z_UP, g_cart_grid, &lrequests[request]);
+    request++;
+    /* Send z down */
+    MPI_Isend(w, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_z_dn, Z_DN, g_cart_grid, &lrequests[request]);
+    request++;
+    MPI_Irecv(wz_buf, nb_blocks * g_N_s, _MPI_C_TYPE, g_nb_z_up, Z_DN, g_cart_grid, &lrequests[request]);
+    request++;
+  }
+  err = MPI_Waitall(request, lrequests, lstatus);
+
+#endif
+  
+  /* We now correct the field according to block partitioning                */
+
+  for(int pm = 0; pm < 8; pm++) {
+    ib2=0;
+    for(int bt = 0; bt < nblks_t; bt++) {
+      for(int bx = 0; bx < nblks_x; bx++) {
+        for(int by = 0; by < nblks_y; by++) {
+          for(int bz = 0; bz < nblks_z; bz++) {
+            if ((bt+bx+by+bz)%2==eo) {
+              ib2 = index_block_eo[block_index(bt, bx, by, bz)] * g_N_s;
+              
+              switch(pm){ 
+              case T_UP: /* Direction +t */
+                w_dest = wt + ib2;
+                if( bt == nblks_t - 1 ) {
+                  ib = index_block_eo[block_index(0,bx, by,bz)] * g_N_s; 
+                  if(g_nproc_t > 1) w_source = wt_buf + ib;     
+                  else w_source = w + ib;
+                }
+                // got it from the MPI exchange
+                else  {
+                  ib = index_block_eo[block_index(bt+1, bx, by, bz)] * g_N_s; 
+                  w_source = w + ib;    
+                }
+                // got it from the diagonal block
+                break; 
+              case T_DN: /* Direction -t */
+                w_dest = wt + ib2 + nb_blocks * g_N_s;
+                if( bt == 0) {
+                  ib = index_block_eo[block_index(nblks_t-1, bx,by,bz)] * g_N_s; 
+                  if(g_nproc_t > 1) w_source = wt_buf + ib + nb_blocks * g_N_s;
+                  else w_source = w + ib;
+                } // got it from the MPI exchange
+                else  {
+                  ib = index_block_eo[block_index(bt-1,bx, by, bz)] * g_N_s; 
+                  w_source = w + ib; 
+                }
+                // got it from the diagonal block
+                break; 
+              case X_UP: /* Direction +x */
+                w_dest = wx + ib2;
+                if( bx == nblks_x - 1 ) {
+                  ib = index_block_eo[block_index(bt, 0, by,bz)] * g_N_s; 
+                  if(g_nproc_x > 1) w_source = wx_buf + ib;     
+                  else w_source = w + ib;
+                }
+                // got it from the MPI exchange
+                else  {
+                  ib = index_block_eo[block_index(bt, bx+1, by, bz)] * g_N_s; 
+                  w_source = w + ib;
+                }
+                // got it from the diagonal block
+                break; 
+              case X_DN: /* Direction -x */
+                w_dest = wx + ib2 + nb_blocks * g_N_s;
+                if( bx == 0) {ib = index_block_eo[block_index(bt, nblks_x-1, by,bz)] * g_N_s;
+                  if(g_nproc_x > 1) w_source = wx_buf + ib + nb_blocks * g_N_s;
+                  else w_source = w + ib;
+                }
+                // got it from the MPI exchange
+                else  {
+                  ib = index_block_eo[block_index(bt, bx-1, by, bz)] * g_N_s; 
+                  w_source = w + ib;
+                }
+                // got it from the diagonal block
+                break; 
+              case Y_UP: /* Direction +y */
+                w_dest = wy + ib2;
+                if( by == nblks_y - 1 ) {
+                  ib = index_block_eo[block_index(bt, bx, 0,bz)] * g_N_s; 
+                  if(g_nproc_y > 1) w_source = wy_buf + ib;
+                  else w_source = w + ib;
+                }
+                // got it from the MPI exchange
+                else  {
+                  ib = index_block_eo[block_index(bt, bx, by+1, bz)] * g_N_s;
+                  w_source = w + ib;
+                }
+                // got it from the diagonal block
+                break; 
+              case Y_DN: /* Direction -y */
+                w_dest = wy + ib2 + nb_blocks * g_N_s;
+                if( by == 0) {
+                  ib = index_block_eo[block_index(bt, bx, nblks_y-1, bz)] * g_N_s;
+                  if(g_nproc_y > 1) w_source = wy_buf + ib + nb_blocks * g_N_s;
+                  else w_source = w + ib;
+                }
+                // got it from the MPI exchange
+                else  {
+                  ib = index_block_eo[block_index(bt, bx, by-1, bz)] * g_N_s;
+                  w_source = w + ib; 
+                }
+                // got it from the diagonal block
+                break; 
+              case Z_UP: /* Direction +z */
+                w_dest = wz + ib2;
+                if( bz == nblks_z - 1 ) {
+                  ib = index_block_eo[block_index(bt, bx, by, 0)] * g_N_s;
+                  if(g_nproc_z > 1) w_source = wz_buf + ib;
+                  else w_source = w + ib;
+                }
+                // got it from the MPI exchange
+                else  {
+                  ib = index_block_eo[block_index(bt, bx, by, bz + 1)] * g_N_s;
+                  w_source = w + ib;
+                }
+                // got it from the diagonal block
+                break; 
+              case Z_DN: /* Direction -z */
+                w_dest = wz + ib2 + nb_blocks * g_N_s;
+                if( bz == 0) {
+                  ib = index_block_eo[block_index(bt, bx, by, nblks_z - 1)] * g_N_s;
+                  if(g_nproc_z > 1) w_source = wz_buf + ib + nb_blocks * g_N_s;
+                  else w_source = w + ib;
+                }
+                // got it from the MPI exchange
+                else  {
+                  ib = index_block_eo[block_index(bt, bx, by, bz - 1)] * g_N_s;
+                  w_source = w + ib;
+                }
+                // got it from the diagonal block
+                break; 
+              default:
+                w_dest = NULL;
+                w_source = NULL;
+              }
+              memcpy(w_dest, w_source, g_N_s * sizeof(_C_TYPE));
+            }
+          }
+        }
+      }
+    }
+  }
+  free(w_buf);
+  return;
+}
diff --git a/xchange/xchange_2fields.c b/xchange/xchange_2fields.c
index 9e083cd6e..0227d395e 100644
--- a/xchange/xchange_2fields.c
+++ b/xchange/xchange_2fields.c
@@ -26,13 +26,13 @@
  **********************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 
@@ -59,7 +59,7 @@
 
 void xchange_2fields(spinor * const l, spinor * const k, const int ieo) {
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Request requests[32];
   MPI_Status status[32];
 #endif
@@ -72,7 +72,7 @@ void xchange_2fields(spinor * const l, spinor * const k, const int ieo) {
 #pragma pomp inst begin(xchange2fields)
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
 #  if (defined BGL && defined XLC)
   __alignx(16, l);
@@ -226,7 +226,7 @@ void xchange_2fields(spinor * const l, spinor * const k, const int ieo) {
 #pragma pomp inst begin(xchange2fields)
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
 #  if (defined BGL && defined XLC)
 #    ifdef PARALLELXYZT
diff --git a/xchange/xchange_deri.c b/xchange/xchange_deri.c
index 7112a0f7d..253273f79 100644
--- a/xchange/xchange_deri.c
+++ b/xchange/xchange_deri.c
@@ -24,12 +24,12 @@
 
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -38,7 +38,7 @@
 #include "su3adj.h"
 #include "xchange_deri.h"
 
-inline void addup_ddummy(su3adj** const df, const int ix, const int iy) {
+static inline void addup_ddummy(su3adj** const df, const int ix, const int iy) {
   for(int mu = 0; mu < 4; mu++) {
     df[ix][mu].d1 += ddummy[iy][mu].d1;
     df[ix][mu].d2 += ddummy[iy][mu].d2;
@@ -57,7 +57,7 @@ inline void addup_ddummy(su3adj** const df, const int ix, const int iy) {
 
 void xchange_deri(su3adj ** const df)
 {
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   int ix,mu, t, y, z, x;
   MPI_Status status;
 
@@ -186,7 +186,7 @@ void xchange_deri(su3adj ** const df)
 
 void xchange_deri(su3adj ** const df)
 {
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   int ix,iy, t, y, z, x;
   MPI_Status status;
 #    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
diff --git a/xchange/xchange_field.c b/xchange/xchange_field.c
index 7138d30e7..527e78e51 100644
--- a/xchange/xchange_field.c
+++ b/xchange/xchange_field.c
@@ -26,13 +26,13 @@
  **********************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #ifdef _USE_SHMEM
@@ -60,7 +60,7 @@
 
 void xchange_field(spinor * const l, const int ieo) {
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Request requests[16];
   MPI_Status status[16];
 #endif
@@ -83,7 +83,7 @@ void xchange_field(spinor * const l, const int ieo) {
   __alignx(16, l);
 #  endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
 
   /* In 4 dimensions there are two processors sharing the   */
@@ -268,7 +268,7 @@ void xchange_field(spinor * const l, const int ieo) {
 
 void xchange_field(spinor * const l, const int ieo) {
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Request requests[16];
   MPI_Status status[16];
 #endif
@@ -294,7 +294,7 @@ void xchange_field(spinor * const l, const int ieo) {
   __alignx(16, l);
 #  endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
 
   /* In 4 dimensions there are two processors sharing the   */
@@ -473,7 +473,7 @@ void xchange_field(spinor * const l, const int ieo) {
 /* exchanges the field  l */
 void xchange_field(spinor * const l, const int ieo) {
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   int i,ix, mu, x0, x1, x2, x3, k;
 
 #ifdef _KOJAK_INST
@@ -589,7 +589,7 @@ void xchange_field(spinor * const l, const int ieo) {
 #pragma pomp inst begin(xchangefield)
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
     
   MPI_Status status;
 
@@ -683,7 +683,7 @@ void xchange_field(spinor * const l, const int ieo) {
 #pragma pomp inst begin(xchangefield)
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
     
   MPI_Status status;
   /* send the data to the neighbour on the left */
diff --git a/xchange/xchange_field_tslice.c b/xchange/xchange_field_tslice.c
index 83f7d4f92..7f6e00d63 100644
--- a/xchange/xchange_field_tslice.c
+++ b/xchange/xchange_field_tslice.c
@@ -7,13 +7,13 @@
  **********************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #ifdef _USE_SHMEM
@@ -28,7 +28,7 @@
 #include "su3.h"
 #include "xchange_field_tslice.h"
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 # ifdef _USE_TSPLITPAR
 void xchange_field_open(spinor * const l, const int ieo, const int x0, MPI_Request * requests, 
 			MPI_Status * status) {
@@ -40,7 +40,7 @@ void xchange_field_open(spinor * const l, const int ieo, const int x0, MPI_Reque
   __alignx(16, l); /* ?!? */
 #  endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
 #    if (defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
   /* send the data to the neighbour on the left in x direction */
@@ -145,7 +145,7 @@ void xchange_field_slice(spinor * const l, const int ieo, const int x0) {
   __alignx(16, l); /* ?!? */
 #  endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
     MPI_Status status;
 
diff --git a/xchange/xchange_field_tslice.h b/xchange/xchange_field_tslice.h
index 724a45c9f..841434b7f 100644
--- a/xchange/xchange_field_tslice.h
+++ b/xchange/xchange_field_tslice.h
@@ -12,7 +12,7 @@
 #define EVEN 1 
 #define  ODD 0 
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 void xchange_field_open(spinor * const , const int , const int , MPI_Request * , MPI_Status *);  
 void xchange_field_close(MPI_Request * , MPI_Status * , int );
 void xchange_field_slice(spinor * const , const int , const int );
diff --git a/xchange/xchange_gauge.c b/xchange/xchange_gauge.c
index f80a0c9b5..66ace3005 100644
--- a/xchange/xchange_gauge.c
+++ b/xchange/xchange_gauge.c
@@ -24,12 +24,12 @@
  **********************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -45,7 +45,7 @@
 
 void xchange_gauge(su3 ** const gf) {
   int cntr=0;
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   MPI_Request request[105];
   MPI_Status status[105];
 
@@ -606,7 +606,7 @@ void xchange_gauge(su3 ** const gf) {
 
 void xchange_gauge(su3 ** const gf) {
   int cntr=0;
-#  ifdef MPI
+#  ifdef TM_USE_MPI
   MPI_Request request[105];
   MPI_Status status[105];
 
@@ -1195,7 +1195,7 @@ void xchange_gauge(su3 ** const gf) {
 
 void xchange_gauge(su3 ** const gf) {
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
   MPI_Status status;
 #    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
@@ -1621,7 +1621,7 @@ void xchange_gauge(su3 ** const gf) {
 # else /* _INDEX_INDEP_GEOM */
 void xchange_gauge(su3 ** const gf) {
 
-#ifdef MPI
+#ifdef TM_USE_MPI
 
   MPI_Status status;
   /* send the data to the neighbour on the left */
diff --git a/xchange/xchange_halffield.c b/xchange/xchange_halffield.c
index 73106a26e..50b30279c 100644
--- a/xchange/xchange_halffield.c
+++ b/xchange/xchange_halffield.c
@@ -26,13 +26,13 @@
  **********************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 #include "global.h"
@@ -53,7 +53,7 @@ MPI_Request prequests[16];
 /* 2. */
 void init_xchange_halffield() {
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
 #  ifdef PARALLELT
   int reqcount = 4;
@@ -145,7 +145,7 @@ void init_xchange_halffield() {
 
 /* 3. */
 void xchange_halffield() {
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
   MPI_Status status[16];
 #    ifdef PARALLELT
@@ -175,7 +175,7 @@ void xchange_halffield() {
 /* 4. -IIG */
 void xchange_halffield() {
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
   MPI_Request requests[16];
   MPI_Status status[16];
@@ -267,7 +267,7 @@ void xchange_halffield() {
 /* 4. */
 void xchange_halffield() {
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
   MPI_Request requests[16];
   MPI_Status status[16];
@@ -370,7 +370,7 @@ void xchange_halffield() {
 /* 32-2. */
 void xchange_halffield32() {
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
   MPI_Request requests[16];
   MPI_Status status[16];
diff --git a/xchange/xchange_jacobi.c b/xchange/xchange_jacobi.c
index de1985af2..0d38c36e0 100644
--- a/xchange/xchange_jacobi.c
+++ b/xchange/xchange_jacobi.c
@@ -25,13 +25,13 @@
  **********************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 
@@ -53,7 +53,7 @@ void xchange_jacobi(su3_vector * const l) {
 #pragma pomp inst begin(xchange_jacobi)
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
   MPI_Status status;
 #    if (defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
diff --git a/xchange/xchange_lexicfield.c b/xchange/xchange_lexicfield.c
index 6f3bd4e84..eaa4d7973 100644
--- a/xchange/xchange_lexicfield.c
+++ b/xchange/xchange_lexicfield.c
@@ -27,13 +27,13 @@
  **********************************************************/
 
 #ifdef HAVE_CONFIG_H
-# include<config.h>
+# include<tmlqcd_config.h>
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 # include <mpi.h>
 #endif
 
@@ -54,7 +54,7 @@
 
 void xchange_lexicfield(spinor * const l) {
 
-#ifdef MPI
+#ifdef TM_USE_MPI
   MPI_Request requests[16];
   MPI_Status status[16];
 #endif
@@ -77,7 +77,7 @@ void xchange_lexicfield(spinor * const l) {
   __alignx(16, l);
 #  endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
 
   ireq=0;
@@ -178,7 +178,7 @@ void xchange_lexicfield(spinor * const l) {
   __alignx(16, l);
 #  endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
 
 
   /* send the data to the neighbour on the left */
@@ -263,7 +263,7 @@ void xchange_lexicfield(spinor * const l) {
 #pragma pomp inst begin(xchange_lexicfield)
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
     
 #    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
   MPI_Status status;
@@ -342,7 +342,7 @@ void xchange_lexicfield(spinor * const l) {
 #pragma pomp inst begin(xchange_lexicfield)
 #endif
 
-#  ifdef MPI
+#  ifdef TM_USE_MPI
     
   MPI_Status status;
   /* send the data to the neighbour on the left */
@@ -420,6 +420,384 @@ void xchange_lexicfield(spinor * const l) {
 
 
 
+/***********************************************************************
+ ****************            32 bit versions        ********************
+ ***********************************************************************/
+
+
+
+
+
+/* this version uses non-blocking MPI calls */
+#if (defined _NON_BLOCKING)
+
+/* this is the version independent of the content of the function Index (only available with non-blocking) */
+/* this #if will be removed in the future, when _INDEX_INDEP_GEOM becomes the default */
+# if defined _INDEX_INDEP_GEOM
+
+void xchange_lexicfield32(spinor32 * const l) {
+  /* Exchanges the boundary slices of the 32-bit lexicographic field l with the neighbouring processes (non-blocking MPI). */
+#ifdef TM_USE_MPI
+  MPI_Request requests[16];
+  MPI_Status status[16];
+  int ireq; /* first request slot used by the next direction; only the MPI path needs it */
+#endif
+  /* each parallelised direction posts four requests: two sends and two receives */
+#  if ( defined PARALLELT || defined PARALLELX )
+  int reqcount = 4;
+#  elif ( defined PARALLELXT || defined PARALLELXY )
+  int reqcount = 8;
+#  elif ( defined PARALLELXYT || defined PARALLELXYZ )
+  int reqcount = 12;
+#  elif defined PARALLELXYZT
+  int reqcount = 16;
+#  endif
+
+#ifdef _KOJAK_INST
+#pragma pomp inst begin(xchange_lexicfield32)
+#endif
+#  if (defined BGL && defined XLC)
+  __alignx(16, l);
+#  endif
+
+#  ifdef TM_USE_MPI
+
+  /* first pass: requests ireq and ireq+1 of every enabled direction, starting at slot 0 */
+  ireq=0;
+
+#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
+  /* send the data to the neighbour on the left */
+  /* receive the data from the neighbour on the right */
+  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, g_cart_grid, &requests[ireq]);
+  MPI_Irecv((void*)(l+gI_L_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_up, 5081, g_cart_grid, &requests[ireq+1]);
+  ireq=ireq+4;
+#    endif
+
+
+#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
+  /* send the data to the neighbour on the left in x direction */
+  /* receive the data from the neighbour on the right in x direction */
+  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_x_slice_gath32, g_nb_x_dn, 5091, g_cart_grid,  &requests[ireq]);
+  MPI_Irecv((void*)(l+gI_0_L_0_0), 1, lfield_x_slice_cont32, g_nb_x_up, 5091, g_cart_grid, &requests[ireq+1]);
+  ireq=ireq+4;
+#    endif
+
+#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
+  /* send the data to the neighbour on the left in y direction */
+  /* receive the data from the neighbour on the right in y direction */
+  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_y_slice_gath32, g_nb_y_dn, 5101, g_cart_grid, &requests[ireq]);
+  MPI_Irecv((void*)(l+gI_0_0_L_0), 1, lfield_y_slice_cont32, g_nb_y_up, 5101, g_cart_grid, &requests[ireq+1]);
+  ireq=ireq+4;
+#    endif
+
+#    if (defined PARALLELXYZT || defined PARALLELXYZ )
+  /* send the data to the neighbour on the left in z direction */
+  /* receive the data from the neighbour on the right in z direction */
+  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_z_slice_gath32, g_nb_z_dn, 5503, g_cart_grid, &requests[ireq]);
+  MPI_Irecv((void*)(l+gI_0_0_0_L), 1, lfield_z_slice_cont32, g_nb_z_up, 5503, g_cart_grid, &requests[ireq+1]);
+  ireq=ireq+4;
+#    endif
+  /* second pass: requests ireq and ireq+1 starting at slot 2, i.e. the remaining two slots of every direction */
+  ireq=2;
+
+#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
+  /* send the data to the neighbour on the right */
+  /* receive the data from the neighbour on the left */
+  MPI_Isend((void*)(l+gI_Lm1_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_up, 5082, g_cart_grid, &requests[ireq]);
+  MPI_Irecv((void*)(l+gI_m1_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082, g_cart_grid, &requests[ireq+1]);
+  ireq=ireq+4;
+#    endif
+
+#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
+  /* send the data to the neighbour on the right in x direction */
+  /* receive the data from the neighbour on the left in x direction */
+  MPI_Isend((void*)(l+gI_0_Lm1_0_0), 1, lfield_x_slice_gath32, g_nb_x_up, 5092, g_cart_grid, &requests[ireq]);
+  MPI_Irecv((void*)(l+gI_0_m1_0_0), 1, lfield_x_slice_cont32, g_nb_x_dn, 5092, g_cart_grid, &requests[ireq+1]);
+  ireq=ireq+4;
+#    endif
+
+#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
+  /* send the data to the neighbour on the right in y direction */
+  /* receive the data from the neighbour on the left in y direction */
+  MPI_Isend((void*)(l+gI_0_0_Lm1_0), 1, lfield_y_slice_gath32, g_nb_y_up, 5102, g_cart_grid, &requests[ireq]);
+  MPI_Irecv((void*)(l+gI_0_0_m1_0), 1, lfield_y_slice_cont32, g_nb_y_dn, 5102, g_cart_grid, &requests[ireq+1]);
+  ireq=ireq+4;
+#    endif
+
+#    if ( defined PARALLELXYZT || defined PARALLELXYZ )
+  /* send the data to the neighbour on the right in z direction */
+  /* receive the data from the neighbour on the left in z direction */
+  MPI_Isend((void*)(l+gI_0_0_0_Lm1), 1, lfield_z_slice_gath32, g_nb_z_up, 5504, g_cart_grid, &requests[ireq]);
+  MPI_Irecv((void*)(l+gI_0_0_0_m1), 1, lfield_z_slice_cont32, g_nb_z_dn, 5504, g_cart_grid, &requests[ireq+1]);
+#    endif
+
+  MPI_Waitall(reqcount, requests, status);
+
+#  endif
+  return;
+#ifdef _KOJAK_INST
+#pragma pomp inst end(xchange_lexicfield32)
+#endif
+}
+# else /* _INDEX_INDEP_GEOM */
+
+void xchange_lexicfield32(spinor32 * const l) {
+  /* Exchanges the boundary slices of the 32-bit lexicographic field l (non-blocking MPI, hard-coded request slots). */
+#ifdef TM_USE_MPI
+  MPI_Request requests[16];
+  MPI_Status status[16];
+#endif
+#  ifdef PARALLELT
+  int reqcount = 4;
+#  elif defined PARALLELXT
+  int reqcount = 8;
+#  elif defined PARALLELXYT
+  int reqcount = 12;
+#  elif defined PARALLELXYZT
+  int reqcount = 16;
+#  endif
+#ifdef _KOJAK_INST
+#pragma pomp inst begin(xchange_lexicfield32)
+#endif
+#  if (defined BGL && defined XLC)
+  __alignx(16, l);
+#  endif
+
+#  ifdef TM_USE_MPI
+
+  /* send the data to the neighbour on the left */
+  /* receive the data from the neighbour on the right */
+  MPI_Isend((void*)l, 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
+  MPI_Irecv((void*)(l+VOLUME), 1, lfield_time_slice_cont32, g_nb_t_up, 5081, g_cart_grid, &requests[1]);
+#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+  /* send the data to the neighbour on the left in x direction */
+  /* receive the data from the neighbour on the right in x direction */
+  MPI_Isend((void*)l, 1, lfield_x_slice_gath32, g_nb_x_dn, 5091, g_cart_grid,  &requests[4]);
+  MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ), 1, lfield_x_slice_cont32, g_nb_x_up, 5091, g_cart_grid, &requests[5]);
+#    endif
+
+#    if (defined PARALLELXYT || defined PARALLELXYZT)
+  /* send the data to the neighbour on the left in y direction */
+  /* receive the data from the neighbour on the right in y direction */
+  MPI_Isend((void*)l, 1, lfield_y_slice_gath32, g_nb_y_dn, 5101, g_cart_grid, &requests[8]);
+  MPI_Irecv((void*)(l + VOLUME + 2*LZ*(LX*LY + T*LY)), 1, lfield_y_slice_cont32, g_nb_y_up, 5101, g_cart_grid, &requests[9]);
+#    endif
+
+#    if (defined PARALLELXYZT)
+
+  /* send the data to the neighbour on the left in z direction */
+  /* receive the data from the neighbour on the right in z direction */
+  MPI_Isend((void*)l, 1, lfield_z_slice_gath32, g_nb_z_dn, 5503, g_cart_grid, &requests[12]);
+  MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX), 1, lfield_z_slice_cont32, g_nb_z_up, 5503, g_cart_grid, &requests[13]);
+#    endif
+  /* send the data to the neighbour on the right */
+  /* receive the data from the neighbour on the left */
+  MPI_Isend((void*)(l+(T-1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_up, 5082, g_cart_grid, &requests[2]);
+  MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082, g_cart_grid, &requests[3]);
+
+#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+  /* send the data to the neighbour on the right in x direction */
+  /* receive the data from the neighbour on the left in x direction */
+  MPI_Isend((void*)(l+(LX-1)*LY*LZ), 1, lfield_x_slice_gath32, g_nb_x_up, 5092, g_cart_grid, &requests[6]);
+  MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)), 1, lfield_x_slice_cont32, g_nb_x_dn, 5092, g_cart_grid, &requests[7]);
+#    endif
+
+#    if (defined PARALLELXYT || defined PARALLELXYZT)
+  /* send the data to the neighbour on the right in y direction */
+  /* receive the data from the neighbour on the left in y direction */
+  MPI_Isend((void*)(l+(LY-1)*LZ), 1, lfield_y_slice_gath32, g_nb_y_up, 5102, g_cart_grid, &requests[10]);
+  MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + T*LX*LZ), 1, lfield_y_slice_cont32, g_nb_y_dn, 5102, g_cart_grid, &requests[11]);
+#    endif
+
+#    if defined PARALLELXYZT
+
+  /* send the data to the neighbour on the right in z direction */
+  /* receive the data from the neighbour on the left in z direction */
+  MPI_Isend((void*)(l+LZ-1), 1, lfield_z_slice_gath32, g_nb_z_up, 5504, g_cart_grid, &requests[14]);
+  MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + T*LX*LY), 1, lfield_z_slice_cont32, g_nb_z_dn, 5504, g_cart_grid, &requests[15]);
+#    endif
+
+  MPI_Waitall(reqcount, requests, status);
+
+#  endif
+  return;
+#ifdef _KOJAK_INST
+#pragma pomp inst end(xchange_lexicfield32)
+#endif
+}
+
+# endif /* _INDEX_INDEP_GEOM */
+
+/* Here comes the naive version */  
+/* Using MPI_Sendrecv */
+#else /* _NON_BLOCKING */
+
+/* this is the blocking (MPI_Sendrecv) version independent of the content of the function Index */
+/* this #if will be removed in the future, when _INDEX_INDEP_GEOM becomes the default */
+# if defined _INDEX_INDEP_GEOM
+
+/*
+ * Exchanges the boundary slices of the 32-bit lexicographic field l with the
+ * neighbouring processes using blocking MPI_Sendrecv calls (index-independent geometry).
+ */
+void xchange_lexicfield32(spinor32 * const l) {
+
+#ifdef _KOJAK_INST
+#pragma pomp inst begin(xchange_lexicfield32)
+#endif
+
+#  ifdef TM_USE_MPI
+  /* declared for every parallelisation: the x/y/z branches below are also compiled for PARALLELX/XY/XYZ */
+  MPI_Status status;
+#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+  /* send the data to the neighbour on the left */
+  /* receive the data from the neighbour on the right */
+  MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_dn, 5081,
+               (void*)(l+gI_L_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_up, 5081,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right */
+  /* receive the data from the neighbour on the left */
+  MPI_Sendrecv((void*)(l+gI_Lm1_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_up, 5082,
+               (void*)(l+gI_m1_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082,
+               g_cart_grid, &status);
+#    endif
+#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
+  /* send the data to the neighbour on the left in x direction */
+  /* receive the data from the neighbour on the right in x direction */
+  MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_x_slice_gath32, g_nb_x_dn, 5091,
+               (void*)(l+gI_0_L_0_0), 1, lfield_x_slice_cont32, g_nb_x_up, 5091,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in x direction */
+  /* receive the data from the neighbour on the left in x direction */
+  MPI_Sendrecv((void*)(l+gI_0_Lm1_0_0), 1, lfield_x_slice_gath32, g_nb_x_up, 5092,
+               (void*)(l+gI_0_m1_0_0), 1, lfield_x_slice_cont32, g_nb_x_dn, 5092,
+               g_cart_grid, &status);
+
+#    endif
+
+#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
+  /* send the data to the neighbour on the left in y direction */
+  /* receive the data from the neighbour on the right in y direction */
+  MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_y_slice_gath32, g_nb_y_dn, 5101,
+               (void*)(l+gI_0_0_L_0), 1, lfield_y_slice_cont32, g_nb_y_up, 5101,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in y direction */
+  /* receive the data from the neighbour on the left in y direction */
+  MPI_Sendrecv((void*)(l+gI_0_0_Lm1_0), 1, lfield_y_slice_gath32, g_nb_y_up, 5102,
+               (void*)(l+gI_0_0_m1_0), 1, lfield_y_slice_cont32, g_nb_y_dn, 5102,
+               g_cart_grid, &status);
+
+#    endif
+
+#    if (defined PARALLELXYZT || defined PARALLELXYZ )
+  /* send the data to the neighbour on the left in z direction */
+  /* receive the data from the neighbour on the right in z direction */
+  MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_z_slice_gath32, g_nb_z_dn, 5503,
+               (void*)(l+gI_0_0_0_L), 1, lfield_z_slice_cont32, g_nb_z_up, 5503,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in z direction */
+  /* receive the data from the neighbour on the left in z direction */
+  MPI_Sendrecv((void*)(l+gI_0_0_0_Lm1), 1, lfield_z_slice_gath32, g_nb_z_up, 5504,
+               (void*)(l+gI_0_0_0_m1), 1, lfield_z_slice_cont32, g_nb_z_dn, 5504,
+               g_cart_grid, &status);
+
+#    endif
+#  endif
+  return;
+#ifdef _KOJAK_INST
+#pragma pomp inst end(xchange_lexicfield32)
+#endif
+}
+
+# else // _INDEX_INDEP_GEOM
+
+/*
+ * Exchanges the boundary slices of the 32-bit lexicographic field l with the
+ * neighbouring processes using blocking MPI_Sendrecv calls and explicit offsets into l.
+ */
+void xchange_lexicfield32(spinor32 * const l) {
+
+#ifdef _KOJAK_INST
+#pragma pomp inst begin(xchange_lexicfield32)
+#endif
+
+#  ifdef TM_USE_MPI
+
+  MPI_Status status;
+  /* send the data to the neighbour on the left */
+  /* receive the data from the neighbour on the right */
+  MPI_Sendrecv((void*)l,                1, lfield_time_slice_cont32, g_nb_t_dn, 5081,
+               (void*)(l+T*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_up, 5081,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right */
+  /* receive the data from the neighbour on the left */
+  MPI_Sendrecv((void*)(l+(T-1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_up, 5082,
+               (void*)(l+(T+1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082,
+               g_cart_grid, &status);
+
+#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+  /* send the data to the neighbour on the left in x direction */
+  /* receive the data from the neighbour on the right in x direction */
+  MPI_Sendrecv((void*)l,                    1, lfield_x_slice_gath32, g_nb_x_dn, 5091,
+               (void*)(l+(T+2)*LX*LY*LZ), 1, lfield_x_slice_cont32, g_nb_x_up, 5091,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in x direction */
+  /* receive the data from the neighbour on the left in x direction */
+  MPI_Sendrecv((void*)(l+(LX-1)*LY*LZ),               1, lfield_x_slice_gath32, g_nb_x_up, 5092,
+               (void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)), 1, lfield_x_slice_cont32, g_nb_x_dn, 5092,
+               g_cart_grid, &status);
+
+#    endif
+
+#    if (defined PARALLELXYT || defined PARALLELXYZT)
+  /* send the data to the neighbour on the left in y direction */
+  /* receive the data from the neighbour on the right in y direction */
+  MPI_Sendrecv((void*)l,                                1, lfield_y_slice_gath32, g_nb_y_dn, 5101,
+               (void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)), 1, lfield_y_slice_cont32, g_nb_y_up, 5101,
+               g_cart_grid, &status);
+  /* the send offset is (LY-1)*LZ, as in the non-blocking variant: l is a full field, not an even/odd half field */
+  /* send the data to the neighbour on the right in y direction */
+  /* receive the data from the neighbour on the left in y direction */
+  MPI_Sendrecv((void*)(l+(LY-1)*LZ),                              1, lfield_y_slice_gath32, g_nb_y_up, 5102,
+               (void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)), 1, lfield_y_slice_cont32, g_nb_y_dn, 5102,
+               g_cart_grid, &status);
+
+#    endif
+
+#    if (defined PARALLELXYZT)
+  /* send the data to the neighbour on the left in z direction */
+  /* receive the data from the neighbour on the right in z direction */
+  MPI_Sendrecv((void*)l,
+               1, lfield_z_slice_gath32, g_nb_z_dn, 5503,
+               (void*)(l + VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX),
+               1, lfield_z_slice_cont32, g_nb_z_up, 5503,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in z direction */
+  /* receive the data from the neighbour on the left in z direction */
+  MPI_Sendrecv((void*)(l+LZ-1),
+               1, lfield_z_slice_gath32, g_nb_z_up, 5504,
+               (void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)),
+               1, lfield_z_slice_cont32, g_nb_z_dn, 5504,
+               g_cart_grid, &status);
+
+#    endif
+#  endif
+  return;
+#ifdef _KOJAK_INST
+#pragma pomp inst end(xchange_lexicfield32)
+#endif
+}
+
+# endif // _INDEX_INDEP_GEOM
+
+#endif
 
 
 
diff --git a/xchange/xchange_lexicfield.h b/xchange/xchange_lexicfield.h
index d7e0b41e7..2da8f804a 100644
--- a/xchange/xchange_lexicfield.h
+++ b/xchange/xchange_lexicfield.h
@@ -21,5 +21,6 @@
 #define _XCHANGE_LEXICFIELD_H
 
 void xchange_lexicfield(spinor * const s);
+void xchange_lexicfield32(spinor32 * const s);
 
 #endif
diff --git a/xlc_prefetch.h b/xlc_prefetch.h
index f550deefc..1beb78fbf 100644
--- a/xlc_prefetch.h
+++ b/xlc_prefetch.h
@@ -28,10 +28,22 @@
 
 #ifdef XLC
 
+#define _prefetch_halfspinor(addr)		\
+  __dcbt(((char*)((unsigned long int)(addr))));
+
 #define _prefetch_spinor(addr)			    \
   __dcbt(((char*)((unsigned long int)(addr))));	    \
   __dcbt(((char*)((unsigned long int)(addr)))+128); 
 
+#define _prefetch_spinor_32(addr)			    \
+  __dcbt(((char*)((unsigned long int)(addr))));
+//#define _prefetch_spinor_32(addr)
+
+
+#define _prefetch_su3_32(addr)			    \
+  __dcbt(((char*)((unsigned long int)(addr))));
+//#define _prefetch_su3_32(addr)
+
 #define _prefetch_su3(addr)			    \
   __dcbt(((char*)((unsigned long int)(addr))));	    \
   __dcbt(((char*)((unsigned long int)(addr)))+128); 
@@ -54,10 +66,16 @@ __prefetch_by_load((void*)(addr2));
 
 #else
 
+#define _prefetch_halfspinor(addr)
+
 #define _prefetch_spinor(addr)
 
 #define _prefetch_su3(addr)
 
+#define _prefetch_spinor_32(addr)
+
+#define _prefetch_su3_32(addr)
+
 #endif
 
 #endif