From 8770e571b01199bca333313fa4f301f0728f6406 Mon Sep 17 00:00:00 2001
From: "j.m.o.rantaharju"
Date: Mon, 8 Oct 2018 16:18:56 +0100
Subject: [PATCH] Patch to reproduce changes in openQCD 1.6

---
 AVX512_patch | 4959 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 4959 insertions(+)
 create mode 100644 AVX512_patch

diff --git a/AVX512_patch b/AVX512_patch
new file mode 100644
index 0000000..1dde314
--- /dev/null
+++ b/AVX512_patch
@@ -0,0 +1,4959 @@
+From 7b114a257f936ac26be81aab8af96d586c38c4e0 Mon Sep 17 00:00:00 2001
+From: "j.m.o.rantaharju"
+Date: Thu, 19 Apr 2018 14:39:10 +0100
+Subject: [PATCH] This patch extends openQCD-1.6 with an implementation of the
+ Dirac operator with Intel intrinsic operations in order to use the full
+ vector width on Intel Skylake, Knights Landing and other processors.
+
+Add patch file to reproduce changes
+
+Removing assembly files
+
+Compilation of AVX code with ICC is now possible
+---
+ CITATION.cff | 39 +
+ README-AVX512 | 24 +
+ devel/archive/Makefile | 37 +-
+ devel/block/Makefile | 36 +-
+ devel/dfl/Makefile | 36 +-
+ devel/dirac/Makefile | 39 +-
+ devel/forces/Makefile | 36 +-
+ devel/linalg/Makefile | 17 +-
+ devel/little/Makefile | 36 +-
+ devel/sap/Makefile | 36 +-
+ devel/sflds/Makefile | 34 +-
+ devel/sw_term/Makefile | 21 +-
+ devel/update/Makefile | 37 +-
+ devel/vflds/Makefile | 33 +-
+ include/avx512.h | 978 +++++++++++++++
+ include/sw_term.h | 1 +
+ main/Makefile | 36 +-
+ modules/dirac/Dw.c | 22 +-
+ modules/dirac/Dw_dble.c | 74 +-
+ modules/dirac/avx512/Dw_avx512.c | 221 +++++
+ modules/dirac/avx512/Dw_dble_avx512.c | 261 ++++++
+ modules/linalg/avx512/salg_avx512.c | 154 ++++
+ modules/linalg/avx512/salg_dble_avx512.c | 402 +++++++++
+ modules/linalg/salg.c | 82 +-
+ modules/linalg/salg_dble.c | 165 +++-
+ modules/sw_term/avx512/pauli_avx512.c | 235 +++++
+ modules/sw_term/avx512/pauli_dble_avx512.c | 489 +++++++++
+ modules/sw_term/pauli.c | 213 ++---
+ modules/sw_term/pauli_dble.c | 48 +-
+ 29 files changed, 3512 insertions(+), 330 deletions(-)
+ create mode 100644 CITATION.cff
+ create mode 100644 README-AVX512
+ create mode 100644 include/avx512.h
+ create mode 100644 modules/dirac/avx512/Dw_avx512.c
+ create mode 100644 modules/dirac/avx512/Dw_dble_avx512.c
+ create mode 100644 modules/linalg/avx512/salg_avx512.c
+ create mode 100644 modules/linalg/avx512/salg_dble_avx512.c
+ create mode 100644 modules/sw_term/avx512/pauli_avx512.c
+ create mode 100644 modules/sw_term/avx512/pauli_dble_avx512.c
+
+diff --git a/CITATION.cff b/CITATION.cff
+new file mode 100644
+index 0000000..c2e855d
+--- /dev/null
++++ b/CITATION.cff
+@@ -0,0 +1,39 @@
++
++cff-version: 1.0.3
++message: "Please cite the following works when using this software."
++
++title: OpenQCD AVX512 Extension
++version: 1.0
++date-released: 19 January 2018
++authors:
++  - name: "Swansea Academy of Advanced Computing"
++    website: http://www.swansea.ac.uk/iss/sa2c/
++    email: sa2c-support@swansea.ac.uk
++  - family-names: Rantaharju
++    given-names: Jarno
++    orcid: https://orcid.org/0000-0002-0072-7707
++  - family-names: Messiti
++    given-names: Michele
++
++references:
++  - type: software-code
++    name: OpenQCD
++    version: 1.6
++    website: luscher.web.cern.ch/luscher/openQCD/
++    authors:
++      - family-names: Lüscher
++        given-names: Martin
++      - family-names: Schaefer
++        given-names: Stefan
++      - family-names: Bulava
++        given-names: John
++      - family-names: Del Debbio
++        given-names: Luigi
++      - family-names: Giusti
++        given-names: Leonardo
++      - family-names: Leder
++        given-names: Björn
++      - family-names: Palombi
++        given-names: Filippo
++    date-released: 22 April 2014
++    email: luscher@mail.cern.ch
+diff --git a/README-AVX512 b/README-AVX512
+new file mode 100644
+index 0000000..25a47ff
+--- /dev/null
++++ b/README-AVX512
+@@ -0,0 +1,24 @@
++An optimized version of openQCD-1.6 for Intel processors with 512-bit vector width
++==================================================================================
++
++DESCRIPTION
++
++  This code extends openQCD-1.6 with an implementation of the Dirac operator
++  using Intel intrinsic operations in order to use the full vector width on Intel
++  Skylake, Knights Landing and other processors.
++
++  The extension is enabled with the -DAVX512 flag in CFLAGS in the Makefiles.
++
++
++
++AUTHORS
++
++  This patch extends the openQCD code written by Martin Luscher, Stefan
++  Schaefer and other contributors (see http://luscher.web.cern.ch/luscher/openQCD/).
++
++  The patch was written and tested by Jarno Rantaharju and Michele Messiti.
++ ++LICENSE ++ ++ The code may be used under the terms of the GNU General Public License (GPL) ++ http://www.fsf.org/licensing/licenses/gpl.html +diff --git a/devel/archive/Makefile b/devel/archive/Makefile +index 8c94a12..c5e949a 100644 +--- a/devel/archive/Makefile ++++ b/devel/archive/Makefile +@@ -39,9 +39,13 @@ UFLDS = plaq_sum uflds udcom + + UTILS = endian error hsum mutils utils wspace + +-MODULES = $(FLAGS) $(LATTICE) $(ARCHIVE) $(LINALG) $(RANDOM) $(UFLDS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(ARCHIVE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SFLDS) $(SU3FCTS) $(UTILS) + ++AVX512_MODULES = salg_dble_avx512 ++ ++AVX512_ASM_MODULES = salg_dble_avx512_asm ++ + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -54,7 +58,7 @@ MDIR = ../../modules + + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/sflds:\ +- $(MDIR)/su3fcts:$(MDIR)/utils ++ $(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/linalg/avx512: + + + # additional include directories +@@ -72,10 +76,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -84,25 +88,40 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + +-# rule to make dependencies + ++ ++# rule to make dependencies + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + +- + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/block/Makefile b/devel/block/Makefile +index ad90f62..3856581 100644 +--- a/devel/block/Makefile ++++ b/devel/block/Makefile +@@ -45,9 +45,15 @@ UFLDS = uflds udcom shift + + UTILS = endian error hsum mutils utils wspace + +-MODULES = $(FLAGS) $(RANDOM) $(LATTICE) $(BLOCK) $(UFLDS) $(SFLDS) \ ++STD_MODULES = $(FLAGS) $(RANDOM) $(LATTICE) $(BLOCK) $(UFLDS) $(SFLDS) \ + $(LINALG) $(SU3FCTS) $(UTILS) $(TCHARGE) $(SW_TERM) $(SAP) + ++AVX512_MODULES = salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = salg_avx512_asm salg_dble_avx512_asm \ ++ pauli_avx512_asm pauli_dble_avx512_asm ++ + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -60,7 +66,7 @@ MDIR = ../../modules + + VPATH = 
.:$(MDIR)/flags:$(MDIR)/random:$(MDIR)/lattice:$(MDIR)/block:\ + $(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ +- $(MDIR)/linalg:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/sap ++ $(MDIR)/linalg:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/sap:$(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -78,10 +84,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -90,25 +96,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/dfl/Makefile b/devel/dfl/Makefile +index 9def70f..0a6ce2e 100644 +--- a/devel/dfl/Makefile ++++ b/devel/dfl/Makefile +@@ -57,10 +57,15 @@ UTILS = endian error hsum mutils utils wspace + + VFLDS = vflds vinit vcom vdcom + +-MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) $(LITTLE) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -75,7 +80,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ +- $(MDIR)/vflds:$(MDIR)/little ++ $(MDIR)/vflds:$(MDIR)/little\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + # additional include directories + +@@ -92,10 +98,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall 
-Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + # -DFGCR_DBG -DFGCR4VD_DBG -DDFL_MODES_DBG + +@@ -106,25 +112,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/dirac/Makefile b/devel/dirac/Makefile +index 8fac63d..f467889 100644 +--- a/devel/dirac/Makefile ++++ b/devel/dirac/Makefile +@@ -12,6 +12,7 @@ + # Dependencies on included files are automatically taken care of + # + ################################################################################ ++################################################################################ + + all: rmxeq mkdep mkxeq + .PHONY: all +@@ -48,10 +49,15 @@ UFLDS = plaq_sum shift uflds udcom + + UTILS = endian error hsum mutils utils wspace + +-MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) \ + $(DIRAC) $(BLOCK) $(SAP) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -65,7 +71,7 @@ MDIR = ../../modules + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/random:\ + $(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/sflds:\ + $(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:$(MDIR)/block:\ +- $(MDIR)/sap ++ $(MDIR)/sap:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512:$(MDIR)/linalg/avx512 + + # additional include directories + +@@ -82,11 +88,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM +- +-LFLAGS = ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 -DAVX -DPM + ++LFLAGS = $(CFLAGS) + + ############################## do not change ################################### + +@@ -94,30 +99,46 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++ ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring 
AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + ++ + # produce executables + + mkxeq: $(MAIN) +diff --git a/devel/forces/Makefile b/devel/forces/Makefile +index e6955fb..7bf668c 100644 +--- a/devel/forces/Makefile ++++ b/devel/forces/Makefile +@@ -68,11 +68,16 @@ UTILS = endian error hsum mutils utils wspace + + VFLDS = vflds vinit vcom vdcom + +-MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ ++STD_MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -88,7 +93,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ +- $(MDIR)/update:$(MDIR)/ratfcts ++ $(MDIR)/update:$(MDIR)/ratfcts\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -106,10 +112,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + # -DCGNE_DBG -DFGCR_DBG -DMSCG_DBG + # -DDFL_MODES_DBG +@@ -121,25 +127,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix 
.o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/linalg/Makefile b/devel/linalg/Makefile +index 76ff701..e13bd0b 100644 +--- a/devel/linalg/Makefile ++++ b/devel/linalg/Makefile +@@ -25,7 +25,7 @@ FLAGS = flags lat_parms dfl_parms + + LATTICE = bcnds uidx geometry + +-LINALG = cmatrix_dble liealg salg salg_dble valg valg_dble ++LINALG = cmatrix_dble liealg salg salg_dble valg valg_dble salg_avx512 salg_dble_avx512 + + RANDOM = ranlux ranlxs ranlxd gauss + +@@ -54,7 +54,7 @@ MDIR = ../../modules + + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/linalg:\ + $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts:\ +- $(MDIR)/vflds ++ $(MDIR)/vflds:$(MDIR)/linalg/avx512 + + + # additional include directories +@@ -72,10 +72,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 -DAVX -DPM + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -90,24 +90,19 @@ PGMS= $(MAIN) $(MODULES) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs +- +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++$(addsuffix .o,$(MAIN) $(MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +- + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + +- + # produce executables + + mkxeq: $(MAIN) +diff --git a/devel/little/Makefile b/devel/little/Makefile +index 3ab8aee..295f92c 100644 +--- a/devel/little/Makefile ++++ b/devel/little/Makefile +@@ -57,10 +57,15 @@ UTILS = endian error hsum mutils utils wspace + + VFLDS = vflds vinit vcom vdcom + +-MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) $(LITTLE) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -75,7 +80,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ +- $(MDIR)/vflds:$(MDIR)/little ++ $(MDIR)/vflds:$(MDIR)/little\ ++ 
$(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -93,10 +99,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -105,25 +111,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/sap/Makefile b/devel/sap/Makefile +index f56144d..b39f521 100644 +--- a/devel/sap/Makefile ++++ b/devel/sap/Makefile +@@ -51,10 +51,15 @@ UFLDS = plaq_sum shift uflds udcom + + UTILS = endian error hsum mutils utils wspace + +-MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -68,7 +73,8 @@ MDIR = ../../modules + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ +- $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive: ++ $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -86,10 +92,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + # -DFGCR_DBG + +@@ -100,25 +106,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= 
$(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/sflds/Makefile b/devel/sflds/Makefile +index 111e826..b424540 100644 +--- a/devel/sflds/Makefile ++++ b/devel/sflds/Makefile +@@ -37,9 +37,12 @@ UFLDS = uflds + + UTILS = endian error hsum mutils utils wspace + +-MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(LINALG) $(UTILS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(LINALG) $(UTILS) \ + $(UFLDS) $(SFLDS) $(SU3FCTS) + ++AVX512_MODULES = salg_avx512 salg_dble_avx512 ++ ++AVX512_ASM_MODULES = salg_avx512_asm salg_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -51,7 +54,8 @@ LOGOPTION = + MDIR = ../../modules + + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/linalg:\ +- $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts ++ $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -69,10 +73,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -81,25 +85,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + 
$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/sw_term/Makefile b/devel/sw_term/Makefile +index b9c80c4..c7f455f 100644 +--- a/devel/sw_term/Makefile ++++ b/devel/sw_term/Makefile +@@ -25,7 +25,7 @@ FLAGS = flags lat_parms dfl_parms + + LATTICE = bcnds uidx ftidx geometry + +-LINALG = salg_dble liealg cmatrix_dble ++LINALG = salg_dble liealg cmatrix_dble salg_dble_avx512 + + RANDOM = ranlux ranlxs ranlxd gauss + +@@ -33,7 +33,7 @@ SFLDS = sflds + + SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +-SW_TERM = pauli pauli_dble swflds sw_term ++SW_TERM = pauli pauli_dble swflds sw_term pauli_avx512 pauli_dble_avx512 + + TCHARGE = ftcom ftensor + +@@ -44,7 +44,6 @@ UTILS = endian error hsum mutils utils wspace + MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) + +- + # Logging option (-mpilog or -mpitrace or -mpianim) + + LOGOPTION = +@@ -56,7 +55,8 @@ MDIR = ../../modules + + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/random:\ + $(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/sflds:\ +- $(MDIR)/tcharge:$(MDIR)/sw_term ++ $(MDIR)/tcharge:$(MDIR)/sw_term:\ ++ $(MDIR)/linalg/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -74,10 +74,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 -DAVX -DPM + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + ############################## do not change ################################### + +@@ -91,24 +91,19 @@ PGMS= $(MAIN) $(MODULES) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs +- +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++$(addsuffix .o,$(MAIN) $(MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +- + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + +- + # produce executables + + mkxeq: $(MAIN) +diff --git a/devel/update/Makefile b/devel/update/Makefile +index d3e8910..749ade4 100644 +--- a/devel/update/Makefile ++++ b/devel/update/Makefile +@@ -67,11 +67,16 @@ UTILS = endian error hsum mutils utils wspace + + VFLDS = vflds vinit vcom vdcom + +-MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ ++STD_MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -87,7 +92,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + 
$(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ +- $(MDIR)/update:$(MDIR)/ratfcts ++ $(MDIR)/update:$(MDIR)/ratfcts\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -105,10 +111,11 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 ++ ++LFLAGS = $(CFLAGS) + +-LFLAGS = + + # -DMDINT_DBG -DRWRAT_DBG + +@@ -119,25 +126,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/vflds/Makefile b/devel/vflds/Makefile +index 11778a2..a29b334 100644 +--- a/devel/vflds/Makefile ++++ b/devel/vflds/Makefile +@@ -59,7 +59,11 @@ MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 + ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + # Logging option (-mpilog or -mpitrace or -mpianim) + + LOGOPTION = +@@ -73,7 +77,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ +- $(MDIR)/vflds ++ $(MDIR)/vflds\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + # additional include directories + +@@ -90,10 +95,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -102,25 +107,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active 
++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/include/avx512.h b/include/avx512.h +new file mode 100644 +index 0000000..ac73c60 +--- /dev/null ++++ b/include/avx512.h +@@ -0,0 +1,978 @@ ++ ++/******************************************************************************* ++* ++* File avx512.h ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* Macros for operating on SU(3) vectors and matrices using Intel intrinsic ++* operations for AVX512 compatible processors ++* ++*******************************************************************************/ ++ ++#ifndef AVX512_H ++#define AVX512_H ++ ++#ifndef SSE2_H ++#include "sse2.h" ++#endif ++ ++#include "immintrin.h" ++ ++ ++ ++/* Macros for single precision floating point numbers */ ++ ++/* Write 6 color weyl vectors as a spinor */ ++#define _avx512_write_6_hwv_f( c1,c2,c3, a ){ \ ++ __m256 t256; __m128 t128; \ ++ t256 = _mm256_shuffle_ps( c1, c2, 0b01000100 ); \ ++ t128 = _mm256_castps256_ps128( t256 ); \ ++ _mm_storeu_ps( a, t128 ); \ ++ t128 = _mm256_extractf128_ps( t256, 1 ); \ ++ _mm_storeu_ps( a+12, t128 ); \ ++ \ ++ t256 = _mm256_shuffle_ps( c3, c1, 0b11100100 ); \ ++ t128 = _mm256_castps256_ps128( t256 ); \ ++ _mm_storeu_ps( a+4, t128 ); \ ++ t128 = _mm256_extractf128_ps( t256, 1 ); \ ++ _mm_storeu_ps( a+16, t128 ); \ ++ \ ++ t256 = _mm256_shuffle_ps( c2, c3, 0b11101110 ); \ ++ t128 = _mm256_castps256_ps128( t256 ); \ ++ _mm_storeu_ps( a+8, t128 ); \ ++ t128 = _mm256_extractf128_ps( t256, 1 ); \ ++ _mm_storeu_ps( a+20, t128 ); \ ++} ++ ++ ++/* Load 6 color weyl vectors from a spinor */ ++#define _avx512_load_6_hwv_f( c1,c2,c3, a ){ \ ++ __m256 t1,t2,t3; __m128 t11, t14; \ ++ t11 = _mm_loadu_ps( a ); \ ++ t14 = _mm_loadu_ps( a+12 ); \ ++ t1 = _mm256_castps128_ps256( t11 ); \ ++ t1 = _mm256_insertf128_ps( t1, t14, 1 ); \ ++ \ ++ t11 = _mm_loadu_ps( a+4 ); \ ++ t14 = _mm_loadu_ps( a+16 ); \ ++ t2 = _mm256_castps128_ps256( t11 ); \ ++ t2 = _mm256_insertf128_ps( t2, t14, 1 ); \ ++ \ ++ t11 = _mm_loadu_ps( a+8 ); \ ++ t14 = _mm_loadu_ps( a+20 ); \ ++ t3 = _mm256_castps128_ps256( t11 ); \ ++ t3 = _mm256_insertf128_ps( t3, t14, 1 ); \ ++ \ ++ c1 = _mm256_shuffle_ps( t1, t2, 0b11100100 ); \ ++ c2 = _mm256_shuffle_ps( t1, t3, 0b01001110 ); \ ++ c3 = _mm256_shuffle_ps( t2, t3, 0b11100100 ); \ ++} ++ ++ ++ ++/* Load 4x2 complex numbers into an aray */ ++#define 
_avx512_load_4_cf( r, c1,c2,c3,c4 ){ \ ++ __m128 t128; \ ++ t128 = _mm_loadu_ps( c1 ); \ ++ r = _mm512_castps128_ps512( t128 ); \ ++ t128 = _mm_loadu_ps( c2 ); \ ++ r = _mm512_insertf32x4 ( r, t128, 1); \ ++ t128 = _mm_loadu_ps( c3 ); \ ++ r = _mm512_insertf32x4 ( r, t128, 2); \ ++ t128 = _mm_loadu_ps( c4 ); \ ++ r = _mm512_insertf32x4 ( r, t128, 3); \ ++} ++ ++/* Load 4 half-spinors and organize colorwise into vectors r1, r2 and r3 */ ++#define _avx512_load_4_halfspinor_f( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ _avx512_load_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_load_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_load_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++ \ ++ r1 = _mm512_shuffle_ps(t512a,t512b, 0b11100100); \ ++ r2 = _mm512_shuffle_ps(t512a,t512c, 0b01001110); \ ++ r3 = _mm512_shuffle_ps(t512b,t512c, 0b11100100); \ ++} ++ ++/* Load 4 half-spinors reversing the second two spinor indeces and ++ * organize colorwise into vectors r1, r2 and r3 */ ++#define _avx512_load_4_halfspinor_f_reverse_up( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ __m512i idx; \ ++ _avx512_load_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_load_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_load_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++ \ ++ idx = _mm512_setr_epi32( 0,1,16+2,16+3, 4,5,16+6,16+7, \ ++ 16+10,16+11,8,9, 16+14,16+15,12,13 ); \ ++ r1 = _mm512_permutex2var_ps( t512a, idx, t512b ); \ ++ idx = _mm512_setr_epi32( 2,3,16+0,16+1, 6,7,16+4,16+5, \ ++ 16+8,16+9,10,11, 16+12,16+13,14,15 ); \ ++ r2 = _mm512_permutex2var_ps( t512a, idx, t512c ); \ ++ idx = _mm512_setr_epi32( 0,1,16+2,16+3, 4,5,16+6,16+7, \ ++ 16+10,16+11,8,9, 16+14,16+15,12,13 ); \ ++ r3 = _mm512_permutex2var_ps( t512b, idx, t512c ); \ ++} ++ ++/* Load 4 half-spinors reversing first two the spinor indeces and ++ * organize colorwise into vectors r1, r2 and r3 */ ++#define _avx512_load_4_halfspinor_f_reverse_dn( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ __m512i idx; \ ++ _avx512_load_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_load_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_load_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++ \ ++ idx = _mm512_setr_epi32( 16+2,16+3,0,1, 16+6,16+7,4,5, \ ++ 8,9,16+10,16+11, 12,13,16+14,16+15 ); \ ++ r1 = _mm512_permutex2var_ps( t512a, idx, t512b ); \ ++ idx = _mm512_setr_epi32( 16+0,16+1,2,3, 16+4,16+5,6,7, \ ++ 10,11,16+8,16+9, 14,15,16+12,16+13 ); \ ++ r2 = _mm512_permutex2var_ps( t512a, idx, t512c ); \ ++ idx = _mm512_setr_epi32( 16+2,16+3,0,1, 16+6,16+7,4,5, \ ++ 8,9,16+10,16+11, 12,13,16+14,16+15 ); \ ++ r3 = _mm512_permutex2var_ps( t512b, idx, t512c ); \ ++} ++ ++/* Load 4x2 complex numbers into an aray */ ++#define _avx512_write_4_cf( r, c1,c2,c3,c4 ){ \ ++ __m512 t512; \ ++ __m128 t128 = _mm512_castps512_ps128( r ); \ ++ _mm_storeu_ps( c1, t128 ); \ ++ t128 = _mm512_extractf32x4_ps( r, 1 ); \ ++ _mm_storeu_ps( c2, t128 ); \ ++ t128 = _mm512_extractf32x4_ps( r, 2 ); \ ++ _mm_storeu_ps( c3, t128 ); \ ++ t128 = _mm512_extractf32x4_ps( r, 3 ); \ ++ _mm_storeu_ps( c4, t128 ); \ ++} ++ ++/* Store 4 half-spinors from color vectors */ ++#define _avx512_write_4_halfspinor_f( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ \ ++ t512a = _mm512_shuffle_ps(r1,r2, 0b01000100); \ ++ t512b = _mm512_shuffle_ps(r3,r1, 0b11100100); \ ++ t512c = _mm512_shuffle_ps(r2,r3, 0b11101110); \ ++ \ ++ _avx512_write_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_write_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_write_4_cf( 
t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++} ++ ++/* Store 4 half-spinors from color vectors reversing the first two Dirac indeces */ ++#define _avx512_write_4_halfspinor_f_reverse_up( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ __m512i idx; \ ++ idx = _mm512_setr_epi32( 0,1,16+0,16+1, 4,5,16+4,16+5, \ ++ 10,11,16+10,16+11, 14,15,16+14,16+15 ); \ ++ t512a = _mm512_permutex2var_ps( r1, idx, r2 ); \ ++ idx = _mm512_setr_epi32( 0,1,16+2,16+3, 4,5,16+6,16+7, \ ++ 10,11,16+8,16+9, 14,15,16+12,16+13 ); \ ++ t512b = _mm512_permutex2var_ps( r3, idx, r1 ); \ ++ idx = _mm512_setr_epi32( 2,3,16+2,16+3, 6,7,16+6,16+7, \ ++ 8,9,16+8,16+9, 12,13,16+12,16+13 ); \ ++ t512c = _mm512_permutex2var_ps( r2, idx, r3 ); \ ++ \ ++ _avx512_write_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_write_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_write_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++} ++ ++ ++/* Store 4 half-spinors from color vectors reversing the second two Dirac indeces */ ++#define _avx512_write_4_halfspinor_f_reverse_dn( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ __m512i idx; \ ++ idx = _mm512_setr_epi32( 2,3,16+2,16+3, 6,7,16+6,16+7, \ ++ 8,9,16+8,16+9, 12,13,16+12,16+13 ); \ ++ t512a = _mm512_permutex2var_ps( r1, idx, r2 ); \ ++ idx = _mm512_setr_epi32( 2,3,16+0,16+1, 6,7,16+4,16+5, \ ++ 8,9,16+10,16+11, 12,13,16+14,16+15 ); \ ++ t512b = _mm512_permutex2var_ps( r3, idx, r1 ); \ ++ idx = _mm512_setr_epi32( 0,1,16+0,16+1, 4,5,16+4,16+5, \ ++ 10,11,16+10,16+11, 14,15,16+14,16+15 ); \ ++ t512c = _mm512_permutex2var_ps( r2, idx, r3 ); \ ++ \ ++ _avx512_write_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_write_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_write_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++} ++ ++ ++ ++ ++ ++ ++ ++ ++/* Combine Dirac spinors to half-spinors in deo and doe. 
++ */ ++#define _avx512_dirac_combine_f_1( a, b ) \ ++{ \ ++ __m512i indexes; \ ++ __m512 c; \ ++ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ ++ 9, 8, 11, 10, 13, 12, 15, 14 ); \ ++ c = _mm512_permutexvar_ps( indexes, b); \ ++ a = _mm512_mask_add_ps( a, 0b0101101000001111, a, c ); \ ++ a = _mm512_mask_sub_ps( a, 0b1010010111110000, a, c ); \ ++} ++ ++#define _avx512_dirac_combine_f_2( a, b ) \ ++{ \ ++ __m512i indexes; \ ++ __m512 c; \ ++ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ ++ 9, 8, 11, 10, 13, 12, 15, 14 ); \ ++ c = _mm512_permutexvar_ps( indexes, b); \ ++ a = _mm512_mask_add_ps( a, 0b1001011011000011, a, c ); \ ++ a = _mm512_mask_sub_ps( a, 0b0110100100111100, a, c ); \ ++} ++ ++ ++#define _avx512_dirac_combine_f_3( a, b ) \ ++{ \ ++ __m512i indexes; \ ++ __m512 c; \ ++ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ ++ 9, 8, 11, 10, 13, 12, 15, 14 ); \ ++ c = _mm512_permutexvar_ps( indexes, b); \ ++ a = _mm512_mask_add_ps( a, 0b1010010100001111, a, c ); \ ++ a = _mm512_mask_sub_ps( a, 0b0101101011110000, a, c ); \ ++} ++ ++ ++#define _avx512_dirac_combine_f_4( a, b ) \ ++{ \ ++ __m512i indexes; \ ++ __m512 c; \ ++ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ ++ 9, 8, 11, 10, 13, 12, 15, 14 ); \ ++ c = _mm512_permutexvar_ps( indexes, b); \ ++ a = _mm512_mask_add_ps( a, 0b0110100111000011, a, c ); \ ++ a = _mm512_mask_sub_ps( a, 0b1001011000111100, a, c ); \ ++} ++ ++ ++ ++ ++/* Multiply 4 vectors with a su(3) matrices, taking the inverse of every ++ * second matrix ++ */ ++#define avx512_su3_mixed_multiply_8( u1, um1, u2, um2, b1,b2,b3, a1,a2,a3 ) \ ++ { \ ++ __m512 ut11, ut21, ut31, ut41, ut12, ut22, ut32, ut42; \ ++ __m512 ut1, ut2, sign, c; \ ++ __m512 t1,t2,t3; \ ++ __m512i indexes; \ ++ ut11 = _mm512_loadu_ps( &(u1).c11.re ); ut12 = _mm512_loadu_ps( &(u1).c33.re ); \ ++ ut21 = _mm512_loadu_ps( &(um1).c11.re ); ut22 = _mm512_loadu_ps( &(um1).c33.re ); \ ++ ut31 = _mm512_loadu_ps( &(u2).c11.re ); ut32 = _mm512_loadu_ps( &(u2).c33.re ); \ ++ ut41 = _mm512_loadu_ps( &(um2).c11.re ); ut42 = _mm512_loadu_ps( &(um2).c33.re ); \ ++ \ ++ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 8, 9, 6, 7, \ ++ 16, 17, 18, 19, 24, 25, 22, 23 ); \ ++ ut1 = _mm512_permutex2var_ps( ut11, indexes, ut21 ); \ ++ ut2 = _mm512_permutex2var_ps( ut31, indexes, ut41 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 0,0,0,0, 8,8,8,8, 16,16,16,16, 24,24,24,24 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_mul_ps ( a1, c ); \ ++ \ ++ indexes = _mm512_setr_epi32( 4,4,4,4, 12,12,12,12, 20,20,20,20, 28,28,28,28 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_mul_ps ( a2, c ); \ ++ \ ++ indexes = _mm512_setr_epi32( 2,2,2,2, 14,14,14,14, 18,18,18,18, 30,30,30,30 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_fmadd_ps ( a2, c, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 6,6,6,6, 10,10,10,10, 22,22,22,22, 26,26,26,26 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_fmadd_ps ( a1, c, b2 ); \ ++ \ ++ \ ++ sign = _mm512_set_ps( -1,1,-1,1, 1,-1,1,-1, -1,1,-1,1, 1,-1,1,-1 ); \ ++ t1 = _mm512_permute_ps( a1, 0b10110001 ); \ ++ t1 = _mm512_mul_ps( t1, sign ); \ ++ t2 = _mm512_permute_ps( a2, 0b10110001 ); \ ++ t2 = _mm512_mul_ps( t2, sign ); \ ++ \ ++ indexes = _mm512_setr_epi32( 1,1,1,1, 9,9,9,9, 17,17,17,17, 25,25,25,25 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_fmadd_ps ( t1, c, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 5,5,5,5, 13,13,13,13, 21,21,21,21, 
29,29,29,29 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_fmadd_ps ( t2, c, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 3,3,3,3, 15,15,15,15, 19,19,19,19, 31,31,31,31 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_fmadd_ps ( t2, c, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 7,7,7,7, 11,11,11,11, 23,23,23,23, 27,27,27,27 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_fmadd_ps ( t1, c, b2 ); \ ++ \ ++ \ ++ indexes = _mm512_setr_epi32( 4, 5, 12, 13, 10, 11, 14, 15, \ ++ 20, 21, 28, 29, 26, 27, 30, 31 ); \ ++ ut1 = _mm512_permutex2var_ps( ut11, indexes, ut21 ); \ ++ ut2 = _mm512_permutex2var_ps( ut31, indexes, ut41 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 0,0,0,0, 10,10,10,10, 16,16,16,16, 26,26,26,26 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_fmadd_ps ( a3, c, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 2,2,2,2, 8,8,8,8, 18,18,18,18, 24,24,24,24 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_mul_ps ( a1, c ); \ ++ \ ++ indexes = _mm512_setr_epi32( 4,4,4,4, 14,14,14,14, 20,20,20,20, 30,30,30,30 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_fmadd_ps ( a3, c, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 6,6,6,6, 12,12,12,12, 22,22,22,22, 28,28,28,28 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_fmadd_ps ( a2, c, b3 ); \ ++ \ ++ \ ++ t3 = _mm512_permute_ps( a3, 0b10110001 ); \ ++ t3 = _mm512_mul_ps( t3, sign ); \ ++ \ ++ indexes = _mm512_setr_epi32( 1,1,1,1, 11,11,11,11, 17,17,17,17, 27,27,27,27 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_fmadd_ps ( t3, c, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 3,3,3,3, 9,9,9,9, 19,19,19,19, 25,25,25,25 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_fmadd_ps ( t1, c, b3 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 5,5,5,5, 15,15,15,15, 21,21,21,21, 31,31,31,31 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_fmadd_ps ( t3, c, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 7,7,7,7, 13,13,13,13, 23,23,23,23, 29,29,29,29 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_fmadd_ps ( t2, c, b3 ); \ ++ \ ++ \ ++ indexes = _mm512_setr_epi32( 0, 1, 16, 17, 0,0,0,0, 0,0,0,0, 0,0,0,0 ); \ ++ ut1 = _mm512_permutex2var_ps( ut12, indexes, ut22 ); \ ++ ut2 = _mm512_permutex2var_ps( ut32, indexes, ut42 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 0,0,0,0, 2,2,2,2, 16,16,16,16, 18,18,18,18 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_fmadd_ps ( a3, c, b3 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 1,1,1,1, 3,3,3,3, 17,17,17,17, 19,19,19,19 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_fmadd_ps ( t3, c, b3 ); \ ++} ++ ++ ++ ++/* Insert 256 bits into a 512 bit single precision vector */ ++#define _avx512_insert_256_h_f( a, t ) \ ++{ \ ++ __m512d td512; \ ++ __m256d td256; \ ++ td512 = _mm512_castps_pd( a ); \ ++ td256 = _mm256_castps_pd( t ); \ ++ td512 = _mm512_insertf64x4( td512, td256, 1 ); \ ++ a = _mm512_castpd_ps( td512 ); \ ++} ++ ++/* Extract 256 bits from a 512 bit single precision vector */ ++#define _avx512_extract_256_h_f( t, a ) \ ++{ \ ++ __m512d td512; \ ++ __m256d td256; \ ++ td512 = _mm512_castps_pd( a ); \ ++ td256 = _mm256_castps_pd( t ); \ ++ td256 = _mm512_extractf64x4_pd( td512, 1 ); \ ++ t = _mm256_castpd_ps( td256 ); \ ++} ++ ++ ++ ++/* Accumulate elements of Dirac vectors into a Weyl vector in deo and 
doe */ ++#define _avx512_to_weyl_f_12( c, b ){ \ ++ __m256 w, sign, t5,t6; \ ++ __m256i idx; \ ++ t5 = _mm512_castps512_ps256( b ); \ ++ \ ++ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ ++ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ ++ c = _mm256_fmadd_ps( t5, sign, t6 ); \ ++ \ ++ _avx512_extract_256_h_f( t5, b ); \ ++ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ ++ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ ++ w = _mm256_fmadd_ps( t6, sign, t5 ); \ ++ idx = _mm256_setr_epi32( 0, 1, 2, 3, 3, 2, 1, 0 ); \ ++ w = _mm256_permutevar_ps( w, idx ); \ ++ sign = _mm256_set_ps( 1,-1,1,-1, 1,1,1,1 ); \ ++ c = _mm256_fmadd_ps( w, sign, c ); \ ++} ++ ++#define _avx512_to_weyl_f_34( c, b ){ \ ++ __m256 w, sign, t5,t6; \ ++ __m256i idx; \ ++ t5 = _mm512_castps512_ps256( b ); \ ++ \ ++ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ ++ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ ++ w = _mm256_fmadd_ps( t6, sign, t5 ); \ ++ idx = _mm256_setr_epi32( 0, 1, 2, 3, 2, 3, 0, 1 ); \ ++ w = _mm256_permutevar_ps( w, idx ); \ ++ sign = _mm256_set_ps( -1,-1,1,1, 1,1,1,1 ); \ ++ c = _mm256_fmadd_ps( w, sign, c ); \ ++ \ ++ _avx512_extract_256_h_f( t5, b ); \ ++ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ ++ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ ++ w = _mm256_fmadd_ps( t6, sign, t5 ); \ ++ idx = _mm256_setr_epi32( 0, 1, 2, 3, 1, 0, 3, 2 ); \ ++ w = _mm256_permutevar_ps( w, idx ); \ ++ sign = _mm256_set_ps( -1,1,1,-1, 1,1,1,1 ); \ ++ c = _mm256_fmadd_ps( w, sign, c ); \ ++} ++ ++ ++/* Expand a Weyl vector into a Dirac vector in deo and doe */ ++#define _avx512_to_dirac_f_1( a1, c1 ) \ ++{ \ ++ __m256 t1,t2, sign; \ ++ __m256i idx; \ ++ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ t1 = _mm256_permutevar8x32_ps( c1, idx ); \ ++ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ ++ t2 = _mm256_fmadd_ps( c1, sign, t1 ); \ ++ a1 = _mm512_castps256_ps512( t2 ); \ ++} ++ ++ ++#define _avx512_to_dirac_f_2( a1, c1 ) \ ++{ \ ++ __m256 t1,t2, sign; \ ++ __m256i idx; \ ++ idx = _mm256_setr_epi32( 7, 6, 5, 4, 7, 6, 5, 4 ); \ ++ t1 = _mm256_permutevar8x32_ps( c1, idx ); \ ++ idx = _mm256_setr_epi32( 0, 1, 2, 3, 0, 1, 2, 3 ); \ ++ t2 = _mm256_permutevar8x32_ps( c1, idx ); \ ++ sign = _mm256_set_ps( -1,1,-1,1, 1,-1,1,-1 ); \ ++ t2 = _mm256_fmadd_ps( t1, sign, t2 ); \ ++ _avx512_insert_256_h_f( a1, t2 ); \ ++} ++ ++#define _avx512_to_dirac_f_3( a1, c1 ) \ ++{ \ ++ __m512 t5,t6,t7; \ ++ __m512i idx; \ ++ t5 = _mm512_castps256_ps512( c1 ); \ ++ idx = _mm512_setr_epi32( 6,7,4,5, 6,7,4,5, 5,4,7,6, 5,4,7,6 ); \ ++ t6 = _mm512_permutexvar_ps( idx, t5 ); \ ++ idx = _mm512_setr_epi32( 0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3 ); \ ++ t7 = _mm512_permutexvar_ps( idx, t5 ); \ ++ a1 = _mm512_maskz_add_ps( 0b1001011011000011, t7, t6 ); \ ++ a1 = _mm512_mask_sub_ps( a1, 0b0110100100111100, t7, t6 ); \ ++} ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++/* Macros for double precision numbers */ ++ ++/* Load 2 half-spinors and organize colorwise into vectors s1, s2 and s3 */ ++#define _avx512_load_2_halfspinor_d( s1, s2, s3, sp, sm ) \ ++{ \ ++ __m512d a1,a2,a3,a4,a5,a6; \ ++ __m512i idx; \ ++ a1 = _mm512_loadu_pd( sm ); \ ++ a2 = _mm512_loadu_pd( sm+8 ); \ ++ a3 = _mm512_loadu_pd( sp ); \ ++ a4 = _mm512_loadu_pd( sp+8 ); \ ++ \ ++ idx = _mm512_setr_epi64( 8,9,14,15, 0,1,6,7 ); \ ++ s1 = _mm512_permutex2var_pd( a1, idx, a3 ); \ ++ \ ++ idx = _mm512_setr_epi64( 
4,5,10,11, 2,3,8,9 ); \ ++ a5 = _mm512_permutex2var_pd( a1, idx, a2 ); \ ++ idx = _mm512_setr_epi64( 2,3,8,9, 4,5,10,11 ); \ ++ a6 = _mm512_permutex2var_pd( a3, idx, a4 ); \ ++ idx = _mm512_setr_epi64( 0,1,2,3, 12,13,14,15 ); \ ++ s2 = _mm512_permutex2var_pd( a6, idx, a5 ); \ ++ \ ++ idx = _mm512_setr_epi64( 4,5,6,7, 8,9,10,11 ); \ ++ s3 = _mm512_permutex2var_pd( a6, idx, a5 ); \ ++} ++ ++/* Load 2 half-spinors reversing the spinor indeces and ++ * organize colorwise into vectors s1, s2 and s3 */ ++#define _avx512_load_2_halfspinor_d_reverse( s1, s2, s3, sp, sm ) \ ++{ \ ++ __m512d a1,a2,a3,a4,a5,a6; \ ++ __m512i idx; \ ++ a1 = _mm512_loadu_pd( sm ); \ ++ a2 = _mm512_loadu_pd( sm+8 ); \ ++ a3 = _mm512_loadu_pd( sp ); \ ++ a4 = _mm512_loadu_pd( sp+8 ); \ ++ \ ++ idx = _mm512_setr_epi64( 14,15,8,9, 6,7,0,1 ); \ ++ s1 = _mm512_permutex2var_pd( a1, idx, a3 ); \ ++ \ ++ idx = _mm512_setr_epi64( 10,11,4,5, 8,9,2,3 ); \ ++ a5 = _mm512_permutex2var_pd( a1, idx, a2 ); \ ++ idx = _mm512_setr_epi64( 8,9,2,3, 10,11,4,5 ); \ ++ a6 = _mm512_permutex2var_pd( a3, idx, a4 ); \ ++ idx = _mm512_setr_epi64( 0,1,2,3, 12,13,14,15 ); \ ++ s2 = _mm512_permutex2var_pd( a6, idx, a5 ); \ ++ \ ++ idx = _mm512_setr_epi64( 4,5,6,7, 8,9,10,11 ); \ ++ s3 = _mm512_permutex2var_pd( a6, idx, a5 ); \ ++} ++ ++/* Write 2 half-spinors from three color vectors */ ++#define _avx512_store_2_halfspinor_d( s1, s2, s3, sp, sm ) \ ++{ \ ++ __m512d a1,a2,a3,a4,a5,a6; \ ++ __m256d l; \ ++ __m512i idx; \ ++ idx = _mm512_setr_epi64( 0,1,8,9, 4,5,12,13 ); \ ++ a1 = _mm512_permutex2var_pd( s1, idx, s2 ); \ ++ idx = _mm512_setr_epi64( 0,1,10,11, 4,5,14,15 ); \ ++ a2 = _mm512_permutex2var_pd( s3, idx, s1 ); \ ++ idx = _mm512_setr_epi64( 2,3,10,11, 6,7,14,15 ); \ ++ a3 = _mm512_permutex2var_pd( s2, idx, s3 ); \ ++ \ ++ l = _mm512_castpd512_pd256( a1 ); \ ++ _mm256_storeu_pd( sp, l ); \ ++ l = _mm512_castpd512_pd256( a2 ); \ ++ _mm256_storeu_pd( sp+4, l ); \ ++ l = _mm512_castpd512_pd256( a3 ); \ ++ _mm256_storeu_pd( sp+8, l ); \ ++ \ ++ l = _mm512_extractf64x4_pd( a1, 1 ); \ ++ _mm256_storeu_pd( sm, l ); \ ++ l = _mm512_extractf64x4_pd( a2, 1 ); \ ++ _mm256_storeu_pd( sm+4, l ); \ ++ l = _mm512_extractf64x4_pd( a3, 1 ); \ ++ _mm256_storeu_pd( sm+8, l ); \ ++} ++ ++ ++ ++/* Multiply the lower half of the color vectors distributed in c1, c2 and c3 ++ * by the su3 matrix u and the upper half by the conjugate of um ++ * Store in b1, b2 and b3 ++ */ ++#define avx512_su3_mul_quad_dble( u, um, b1, b2, b3, c1, c2, c3 ) \ ++{ \ ++ __m512d tu1, tu2, tu3, tum1, tum2, tum3; \ ++ __m512d u1; \ ++ __m512d t1, t2, t3, sign; \ ++ __m512i indexes; \ ++ tu1 = _mm512_loadu_pd( &(u).c11.re ); \ ++ tu2 = _mm512_loadu_pd( &(u).c22.re ); \ ++ tu3 = _mm512_loadu_pd( &(u).c33.re ); \ ++ tum1 = _mm512_loadu_pd( &(um).c11.re ); \ ++ tum2 = _mm512_loadu_pd( &(um).c22.re ); \ ++ tum3 = _mm512_loadu_pd( &(um).c33.re ); \ ++ \ ++ sign = _mm512_set_pd( -1,1,-1,1, 1,-1,1,-1 ); \ ++ t1 = _mm512_permute_pd( c1, 0b01010101 ); \ ++ t2 = _mm512_permute_pd( c2, 0b01010101 ); \ ++ t3 = _mm512_permute_pd( c3, 0b01010101 ); \ ++ t1 = _mm512_mul_pd( t1, sign ); \ ++ t2 = _mm512_mul_pd( t2, sign ); \ ++ t3 = _mm512_mul_pd( t3, sign ); \ ++ \ ++ indexes = _mm512_setr_epi64( 0, 0, 0, 0, 8, 8, 8, 8 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b1 = _mm512_mul_pd ( u1, c1 ); \ ++ indexes = _mm512_setr_epi64( 1, 1, 1, 1, 9, 9, 9, 9 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b1 = _mm512_fmadd_pd ( u1, t1, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 2, 
2, 2, 2, 14, 14, 14, 14 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b1 = _mm512_fmadd_pd ( u1, c2, b1 ); \ ++ indexes = _mm512_setr_epi64( 3, 3, 3, 3, 15, 15, 15, 15 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b1 = _mm512_fmadd_pd ( u1, t2, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 4, 4, 4, 4, 12, 12, 12, 12 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum2 ); \ ++ b1 = _mm512_fmadd_pd ( u1, c3, b1 ); \ ++ indexes = _mm512_setr_epi64( 5, 5, 5, 5, 13, 13, 13, 13 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum2 ); \ ++ b1 = _mm512_fmadd_pd ( u1, t3, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 6, 6, 6, 6, 10, 10, 10, 10 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b2 = _mm512_mul_pd ( u1, c1 ); \ ++ indexes = _mm512_setr_epi64( 7, 7, 7, 7, 11, 11, 11, 11 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b2 = _mm512_fmadd_pd ( u1, t1, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 0, 0, 0, 0, 8, 8, 8, 8 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b2 = _mm512_fmadd_pd ( u1, c2, b2 ); \ ++ indexes = _mm512_setr_epi64( 1, 1, 1, 1, 9, 9, 9, 9 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b2 = _mm512_fmadd_pd ( u1, t2, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 2, 2, 2, 2, 14, 14, 14, 14 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b2 = _mm512_fmadd_pd ( u1, c3, b2 ); \ ++ indexes = _mm512_setr_epi64( 3, 3, 3, 3, 15, 15, 15, 15 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b2 = _mm512_fmadd_pd ( u1, t3, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 4, 4, 4, 4, 12, 12, 12, 12 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum1 ); \ ++ b3 = _mm512_mul_pd ( u1, c1 ); \ ++ indexes = _mm512_setr_epi64( 5, 5, 5, 5, 13, 13, 13, 13 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum1 ); \ ++ b3 = _mm512_fmadd_pd ( u1, t1, b3 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 6, 6, 6, 6, 10, 10, 10, 10 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b3 = _mm512_fmadd_pd ( u1, c2, b3 ); \ ++ indexes = _mm512_setr_epi64( 7, 7, 7, 7, 11, 11, 11, 11 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b3 = _mm512_fmadd_pd ( u1, t2, b3 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 0, 0, 0, 0, 8, 8, 8, 8 ); \ ++ u1 = _mm512_permutex2var_pd( tu3, indexes, tum3 ); \ ++ b3 = _mm512_fmadd_pd ( u1, c3, b3 ); \ ++ indexes = _mm512_setr_epi64( 1, 1, 1, 1, 9, 9, 9, 9 ); \ ++ u1 = _mm512_permutex2var_pd( tu3, indexes, tum3 ); \ ++ b3 = _mm512_fmadd_pd ( u1, t3, b3 ); \ ++} ++ ++ ++ ++ ++ ++ ++/* Combine spinor entries into 2 weyl vectors ++ stored in high and low entries of a spinor ++ */ ++#define _avx512_to_weyl_1( w, b ){ \ ++ __m512i indexes; \ ++ __m512d _t; \ ++ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ _t = _mm512_permutexvar_pd( indexes, (b) ); \ ++ w = _mm512_maskz_add_pd( 0b00001111, _t, (b) ); \ ++ w = _mm512_mask_sub_pd( w, 0b11110000, _t, (b) ); \ ++} ++ ++#define _avx512_to_weyl_2( w, b ){ \ ++ __m512i indexes; \ ++ __m512d _t; \ ++ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ _t = _mm512_permutexvar_pd( indexes, (b) ); \ ++ _t = _mm512_mask_add_pd( _t, 0b00001111, _t, (b) ); \ ++ _t = _mm512_mask_sub_pd( _t, 0b11110000, (b), _t ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 7, 6, 5, 4 ); \ ++ _t = _mm512_permutexvar_pd( indexes, _t ); \ ++ w = _mm512_mask_add_pd( w, 0b10101111, w, _t ); \ ++ w = _mm512_mask_sub_pd( w, 0b01010000, w, _t ); \ ++} ++ ++#define _avx512_to_weyl_3( w, b ){ \ ++ 
__m512i indexes; \ ++ __m512d _t; \ ++ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ _t = _mm512_permutexvar_pd( indexes, (b) ); \ ++ _t = _mm512_mask_add_pd( _t, 0b00001111, _t, (b) ); \ ++ _t = _mm512_mask_sub_pd( _t, 0b11110000, (b), _t ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 6, 7, 4, 5 ); \ ++ _t = _mm512_permutexvar_pd( indexes, _t ); \ ++ w = _mm512_mask_add_pd( w, 0b00111111, w, _t ); \ ++ w = _mm512_mask_sub_pd( w, 0b11000000, w, _t ); \ ++} ++ ++#define _avx512_to_weyl_4( w, b ){ \ ++ __m512i indexes; \ ++ __m512d _t; \ ++ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ _t = _mm512_permutexvar_pd( indexes, (b) ); \ ++ _t = _mm512_mask_add_pd( _t, 0b00001111, _t, (b) ); \ ++ _t = _mm512_mask_sub_pd( _t, 0b11110000, (b), _t ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 5, 4, 7, 6 ); \ ++ _t = _mm512_permutexvar_pd( indexes, _t ); \ ++ w = _mm512_mask_add_pd( w, 0b01101111, w, _t ); \ ++ w = _mm512_mask_sub_pd( w, 0b10010000, w, _t ); \ ++} ++ ++ ++ ++ ++/* Create a full Dirac vector by adding and subtracting the indeces of ++ * a weyl vector */ ++#define _avx512_expand_weyl( a, w ){ \ ++ __m512i indexes; \ ++ __m512d _t; \ ++ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ _t = _mm512_permutexvar_pd( indexes, (w) ); \ ++ a = _mm512_maskz_add_pd( 0b00001111, _t, w ); \ ++ a = _mm512_mask_sub_pd( a, 0b11110000, _t, w ); \ ++} ++ ++#define _avx512_expand_weyl_2( a, w ){ \ ++ __m512i indexes; \ ++ __m512d _t1, _t2; \ ++ indexes = _mm512_setr_epi64( 7, 6, 5, 4, 7, 6, 5, 4 ); \ ++ _t1 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 0, 1, 2, 3 ); \ ++ _t2 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ a = _mm512_maskz_add_pd( 0b01011010, _t2, _t1 ); \ ++ a = _mm512_mask_sub_pd( a, 0b10100101, _t2, _t1 ); \ ++} ++ ++#define _avx512_expand_weyl_3( a, w ){ \ ++ __m512i indexes; \ ++ __m512d _t1, _t2; \ ++ indexes = _mm512_setr_epi64( 6, 7, 4, 5, 6, 7, 4, 5 ); \ ++ _t1 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 0, 1, 2, 3 ); \ ++ _t2 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ a = _mm512_maskz_add_pd( 0b11000011, _t2, _t1 ); \ ++ a = _mm512_mask_sub_pd( a, 0b00111100, _t2, _t1 ); \ ++} ++ ++#define _avx512_expand_weyl_4( a, w ){ \ ++ __m512i indexes; \ ++ __m512d _t1, _t2; \ ++ indexes = _mm512_setr_epi64( 5, 4, 7, 6, 5, 4, 7, 6 ); \ ++ _t1 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 0, 1, 2, 3 ); \ ++ _t2 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ a = _mm512_maskz_add_pd( 0b10010110, _t2, _t1 ); \ ++ a = _mm512_mask_sub_pd( a, 0b01101001, _t2, _t1 ); \ ++} ++ ++ ++ ++ ++ ++/* Load four complex numbers. 
*/ ++#define _avx512_load_4_d( v, c1, c2, c3, c4 ) \ ++{ \ ++ __m128d t128l, t128u; \ ++ __m256d t256l ,t256u; \ ++ t128l = _mm_loadu_pd( &(c1).re ); \ ++ t128u = _mm_loadu_pd( &(c2).re ); \ ++ t256l = _mm256_castpd128_pd256( t128l ); \ ++ t256l = _mm256_insertf128_pd( t256l, t128u, 1 ); \ ++ t128l = _mm_loadu_pd( &(c3).re ); \ ++ t128u = _mm_loadu_pd( &(c4).re ); \ ++ t256u = _mm256_castpd128_pd256( t128l ); \ ++ t256u = _mm256_insertf128_pd( t256u, t128u, 1 ); \ ++ v = _mm512_castpd256_pd512( t256l ); \ ++ v = _mm512_insertf64x4( v, t256u, 1 ); \ ++} ++ ++/* Store four complex numbers */ ++#define _avx512_store_4_d( r, v1, v2, v3, v4 ) \ ++{ \ ++ __m256d t256; \ ++ __m128d t128; \ ++ t256 = _mm512_extractf64x4_pd( r, 1 ); \ ++ t128 = _mm256_extractf128_pd( t256, 1 ); \ ++ _mm_storeu_pd( &(v4).re, t128 ); \ ++ t128 = _mm256_castpd256_pd128( t256 ); \ ++ _mm_storeu_pd( &(v3).re, t128 ); \ ++ t256 = _mm512_castpd512_pd256( r ); \ ++ t128 = _mm256_extractf128_pd( t256, 1 ); \ ++ _mm_storeu_pd( &(v2).re, t128 ); \ ++ t128 = _mm256_castpd256_pd128( t256 ); \ ++ _mm_storeu_pd( &(v1).re, t128 ); \ ++} ++ ++ ++ ++ ++/* Adding a vectors to a spinors */ ++ ++/* Load half spinors and organize for adding */ ++#define _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ) \ ++ s1 = _mm512_loadu_pd( (sm) ); \ ++ s2 = _mm512_loadu_pd( (sm)+8 ); \ ++ s3 = _mm512_loadu_pd( (sp) ); \ ++ s4 = _mm512_loadu_pd( (sp)+8 ); \ ++ \ ++ idx = _mm512_setr_epi64( 0,1,2,3, 8,9,10,11 ); \ ++ t1 = _mm512_permutex2var_pd( s1, idx, s3 ); \ ++ idx = _mm512_setr_epi64( 4,5,6,7, 12,13,14,15 ); \ ++ t2 = _mm512_permutex2var_pd( s1, idx, s3 ); \ ++ idx = _mm512_setr_epi64( 0,1,2,3, 8,9,10,11 ); \ ++ t3 = _mm512_permutex2var_pd( s2, idx, s4 ); \ ++ ++/* reorganize weyl vectors for adding */ ++#define _avx512_reorganize_a_ud_d( b1,b2,b3, a1,a2,a3 ) \ ++ idx = _mm512_setr_epi64( 0,1,8,9, 4,5,12,13 ); \ ++ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ ++ idx = _mm512_setr_epi64( 0,1,10,11, 4,5,14,15 ); \ ++ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ ++ idx = _mm512_setr_epi64( 2,3,10,11, 6,7,14,15 ); \ ++ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ ++ ++/* store after adding */ ++#define _avx512_write_a_ud_d( a1, a2, a3, sp, sm ){ \ ++ __m256d l; \ ++ l = _mm512_castpd512_pd256( a1 ); \ ++ _mm256_storeu_pd( (sm), l ); \ ++ l = _mm512_castpd512_pd256( a2 ); \ ++ _mm256_storeu_pd( (sm)+4, l ); \ ++ l = _mm512_castpd512_pd256( a3 ); \ ++ _mm256_storeu_pd( (sm)+8, l ); \ ++ \ ++ l = _mm512_extractf64x4_pd( a1, 1 ); \ ++ _mm256_storeu_pd( (sp), l ); \ ++ l = _mm512_extractf64x4_pd( a2, 1 ); \ ++ _mm256_storeu_pd( (sp)+4, l ); \ ++ l = _mm512_extractf64x4_pd( a3, 1 ); \ ++ _mm256_storeu_pd( (sp)+8, l ); \ ++} ++ ++ ++#define _avx512_add_to_spinors( b1, b2, b3, sp, sm ) \ ++{ \ ++ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ ++ __m512i idx; \ ++ \ ++ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ ++ _avx512_reorganize_a_ud_d( b1,b2,b3, a1,a2,a3 ); \ ++ \ ++ t1 = _mm512_add_pd( a1, t1 ); \ ++ t2 = _mm512_add_pd( a2, t2 ); \ ++ t3 = _mm512_add_pd( a3, t3 ); \ ++ \ ++ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ ++} ++ ++#define _avx512_add_to_spinors_2( b1, b2, b3, sp, sm ) \ ++{ \ ++ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ ++ __m512i idx; \ ++ \ ++ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ ++ _avx512_reorganize_a_ud_d( b1,b2,b3, a1,a2,a3 ); \ ++ \ ++ t1 = _mm512_mask_add_pd( t1, 0b00001111, t1, a1 ); \ ++ t1 = _mm512_mask_sub_pd( t1, 0b11110000, t1, a1 ); \ ++ t2 = _mm512_mask_add_pd( t2, 0b00001111, t2, a2 ); 
\ ++ t2 = _mm512_mask_sub_pd( t2, 0b11110000, t2, a2 ); \ ++ t3 = _mm512_mask_add_pd( t3, 0b00001111, t3, a3 ); \ ++ t3 = _mm512_mask_sub_pd( t3, 0b11110000, t3, a3 ); \ ++ \ ++ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ ++} ++ ++#define _avx512_add_to_spinors_3( b1, b2, b3, sp, sm ) \ ++{ \ ++ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ ++ __m512i idx; \ ++ \ ++ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ ++ idx = _mm512_setr_epi64( 3,2,11,10, 7,6,15,14 ); \ ++ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ ++ idx = _mm512_setr_epi64( 3,2,9,8, 7,6,13,12 ); \ ++ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ ++ idx = _mm512_setr_epi64( 1,0,9,8, 5,4,13,12 ); \ ++ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ ++ \ ++ t1 = _mm512_mask_add_pd( t1, 0b10100101, t1, a1 ); \ ++ t1 = _mm512_mask_sub_pd( t1, 0b01011010, t1, a1 ); \ ++ t2 = _mm512_mask_add_pd( t2, 0b10100101, t2, a2 ); \ ++ t2 = _mm512_mask_sub_pd( t2, 0b01011010, t2, a2 ); \ ++ t3 = _mm512_mask_add_pd( t3, 0b10100101, t3, a3 ); \ ++ t3 = _mm512_mask_sub_pd( t3, 0b01011010, t3, a3 ); \ ++ \ ++ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ ++} ++ ++#define _avx512_add_to_spinors_4( b1, b2, b3, sp, sm ) \ ++{ \ ++ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ ++ __m512i idx; \ ++ idx = _mm512_setr_epi64( 2,3,0,1, 6,7,4,5 ); \ ++ b1 = _mm512_permutexvar_pd( idx, b1 ); \ ++ b2 = _mm512_permutexvar_pd( idx, b2 ); \ ++ b3 = _mm512_permutexvar_pd( idx, b3 ); \ ++ \ ++ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ ++ idx = _mm512_setr_epi64( 0,1,8,9, 4,5,12,13 ); \ ++ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ ++ idx = _mm512_setr_epi64( 0,1,10,11, 4,5,14,15 ); \ ++ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ ++ idx = _mm512_setr_epi64( 2,3,10,11, 6,7,14,15 ); \ ++ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ ++ \ ++ t1 = _mm512_mask_add_pd( t1, 0b11110000, t1, a1 ); \ ++ t1 = _mm512_mask_sub_pd( t1, 0b00001111, t1, a1 ); \ ++ t2 = _mm512_mask_add_pd( t2, 0b00111100, t2, a2 ); \ ++ t2 = _mm512_mask_sub_pd( t2, 0b11000011, t2, a2 ); \ ++ t3 = _mm512_mask_add_pd( t3, 0b00001111, t3, a3 ); \ ++ t3 = _mm512_mask_sub_pd( t3, 0b11110000, t3, a3 ); \ ++ \ ++ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ ++} ++ ++ ++#define _avx512_add_to_spinors_5( b1, b2, b3, sp, sm ) \ ++{ \ ++ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ ++ __m512i idx; \ ++ \ ++ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ ++ idx = _mm512_setr_epi64( 1,0,9,8, 5,4,13,12 ); \ ++ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ ++ idx = _mm512_setr_epi64( 1,0,11,10, 5,4,15,14 ); \ ++ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ ++ idx = _mm512_setr_epi64( 3,2,11,10, 7,6,15,14 ); \ ++ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ ++ \ ++ t1 = _mm512_mask_add_pd( t1, 0b10100101, t1, a1 ); \ ++ t1 = _mm512_mask_sub_pd( t1, 0b01011010, t1, a1 ); \ ++ t2 = _mm512_mask_add_pd( t2, 0b01101001, t2, a2 ); \ ++ t2 = _mm512_mask_sub_pd( t2, 0b10010110, t2, a2 ); \ ++ t3 = _mm512_mask_add_pd( t3, 0b01011010, t3, a3 ); \ ++ t3 = _mm512_mask_sub_pd( t3, 0b10100101, t3, a3 ); \ ++ \ ++ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ ++} ++ ++ ++ ++ ++ ++#endif +diff --git a/include/sw_term.h b/include/sw_term.h +index f1ef622..b47d0a7 100644 +--- a/include/sw_term.h ++++ b/include/sw_term.h +@@ -29,6 +29,7 @@ extern void apply_sw(int vol,float mu,pauli *m,spinor *s,spinor *r); + + /* PAULI_DBLE_C */ + extern void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r); ++void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r); + extern 
int inv_pauli_dble(double mu,pauli_dble *m,pauli_dble *im); + extern complex_dble det_pauli_dble(double mu,pauli_dble *m); + extern void apply_sw_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, +diff --git a/main/Makefile b/main/Makefile +index 6dc4e1b..8c739d5 100644 +--- a/main/Makefile ++++ b/main/Makefile +@@ -26,7 +26,7 @@ BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + + DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +-DIRAC = Dw_dble Dw Dw_bnd ++DIRAC = Dw_dble Dw Dw_bnd Dw_avx512 Dw_dble_avx512 + + FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + mdint_parms rat_parms rw_parms sap_parms solver_parms +@@ -36,7 +36,8 @@ FORCES = force0 force1 force2 force3 force4 force5 \ + + LATTICE = bcnds uidx ftidx geometry + +-LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix ++LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix \ ++ salg_avx512 salg_dble_avx512 + + LINSOLV = cgne mscg fgcr fgcr4vd + +@@ -54,7 +55,7 @@ SFLDS = sflds scom sdcom Pbnd Pbnd_dble + + SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +-SW_TERM = pauli pauli_dble swflds sw_term ++SW_TERM = pauli pauli_dble swflds sw_term pauli_avx512 pauli_dble_avx512 + + TCHARGE = ftcom ftensor tcharge ym_action + +@@ -68,12 +69,13 @@ VFLDS = vflds vinit vcom vdcom + + WFLOW = wflow + +-MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ ++STD_MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) $(WFLOW) + + ++ + # Logging option (-mpilog or -mpitrace or -mpianim) + + LOGOPTION = +@@ -88,7 +90,9 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ +- $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts ++ $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts:\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 ++ + + + # additional include directories +@@ -103,16 +107,17 @@ LIBS = m + LIBPATH = $(MPI_HOME)/lib + + +-# scheduling and optimization options ++# compilation, scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 -DAVX -DPM ++ ++LFLAGS = $(CFLAGS) + +-LFLAGS = + + # See the README in the top directory for alternative choices of the +-# optimization options -O -mno-avx -Dx64 -DPM. ++# optimization options -O2 -DAVX512. 
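[Editor's note, not part of the patch] The -DAVX512 entry added to CFLAGS here is what activates the new intrinsics modules: the avx512/ sources are always listed in the module variables above, but each of them is wrapped in an #ifdef AVX512 guard and compiles to an empty object when the macro is absent. A minimal sketch of the compile-time dispatch used in Dw.c and Dw_dble.c (the actual hunks appear just below); only the preprocessor flags named in this Makefile are assumed:

/* Sketch only: how the kernel implementation is selected at compile time.
   With -DAVX512 the doe()/deo() wrappers forward to the AVX512 kernels;
   without it the pre-existing branches are compiled instead. */
#if (defined AVX512)
   /* thin wrappers calling doe_avx512()/deo_avx512() from dirac/avx512/ */
#elif (defined AVX)
   /* the original AVX macros from avx.h */
#else
   /* the remaining SSE/portable C branches of openQCD-1.6 */
#endif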
+ # + # The available debugging flags are + # +@@ -127,25 +132,24 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++ ++#Check CFLAGS to find which AVX512 flags are active ++MODULES= $(STD_MODULES) ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + +- + # rule to compile source programs +- +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +- + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/modules/dirac/Dw.c b/modules/dirac/Dw.c +index 3900283..bb18a72 100644 +--- a/modules/dirac/Dw.c ++++ b/modules/dirac/Dw.c +@@ -122,7 +122,27 @@ static const spinor s0={{{0.0f,0.0f},{0.0f,0.0f},{0.0f,0.0f}}, + {{0.0f,0.0f},{0.0f,0.0f},{0.0f,0.0f}}}; + static spin_t rs ALIGNED32; + +-#if (defined AVX) ++ ++#if ( defined AVX512 ) ++ ++#include "avx512.h" ++#include "sse.h" ++ ++void doe_avx512(int *piup, int *pidn, su3 *u, spinor *pk, float coe, spin_t *rs); ++static void doe(int *piup, int *pidn, su3 *u, spinor *pk) ++{ ++ doe_avx512( piup, pidn, u, pk, coe, &rs ); ++} ++ ++void deo_avx512(int *piup, int *pidn, su3 *u, spinor *pl, float ceo, spin_t *rs); ++static void deo(int *piup, int *pidn, su3 *u, spinor *pl) ++{ ++ deo_avx512( piup, pidn, u, pl, ceo, &rs ); ++} ++ ++ ++ ++#elif ( defined AVX ) + #include "avx.h" + + #define _load_cst(c) \ +diff --git a/modules/dirac/Dw_dble.c b/modules/dirac/Dw_dble.c +index d683cbb..4fc58f5 100644 +--- a/modules/dirac/Dw_dble.c ++++ b/modules/dirac/Dw_dble.c +@@ -1,4 +1,3 @@ +- + /******************************************************************************* + * + * File Dw_dble.c +@@ -121,7 +120,22 @@ static const spinor_dble sd0={{{0.0,0.0},{0.0,0.0},{0.0,0.0}}, + {{0.0,0.0},{0.0,0.0},{0.0,0.0}}}; + static spin_t rs ALIGNED32; + +-#if (defined AVX) ++#if ( defined AVX512 ) ++ ++void doe_dble_avx512(int *piup,int *pidn,su3_dble *u,spinor_dble *pk,double coe, spin_t *rs); ++static void doe(int *piup,int *pidn,su3_dble *u,spinor_dble *pk) ++{ ++ doe_dble_avx512( piup, pidn, u, pk, coe, &rs ); ++} ++ ++void deo_dble_avx512( int *piup, int *pidn, su3_dble *u, spinor_dble *pl, double ceo, spin_t *rs); ++static void deo(int *piup, int *pidn, su3_dble *u, spinor_dble *pl) ++{ ++ deo_dble_avx512( piup, pidn, u, pl, ceo, &rs ); ++} ++ ++ ++#elif ( defined AVX ) + #include "avx.h" + + #define _load_cst(c) \ +@@ -1412,9 +1426,7 @@ void Dw_dble(double mu,spinor_dble *s,spinor_dble *r) + { + doe(piup,pidn,u,s); + +- mul_pauli_dble(mu,m,(*so).w,(*ro).w); +- mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); +- ++ mul_pauli2_dble(mu, m, (*so).w, (*ro).w); + _vector_add_assign((*ro).s.c1,rs.s.c1); + _vector_add_assign((*ro).s.c2,rs.s.c2); + _vector_add_assign((*ro).s.c3,rs.s.c3); +@@ -1442,9 +1454,7 @@ void Dw_dble(double mu,spinor_dble *s,spinor_dble *r) + { + doe(piup,pidn,u,s); + +- mul_pauli_dble(mu,m,(*so).w,(*ro).w); +- mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); +- ++ mul_pauli2_dble(mu, m, (*so).w, (*ro).w); + _vector_add_assign((*ro).s.c1,rs.s.c1); + _vector_add_assign((*ro).s.c2,rs.s.c2); + _vector_add_assign((*ro).s.c3,rs.s.c3); +@@ -1488,8 +1498,7 @@ void Dwee_dble(double mu,spinor_dble *s,spinor_dble *r) + + if 
((t>0)&&((t<(N0-1))||(bc!=0))) + { +- mul_pauli_dble(mu,m,(*se).w,(*re).w); +- mul_pauli_dble(-mu,m+1,(*se).w+1,(*re).w+1); ++ mul_pauli2_dble(mu, m, (*se).w, (*re).w); + } + else + { +@@ -1505,9 +1514,7 @@ void Dwee_dble(double mu,spinor_dble *s,spinor_dble *r) + { + for (;m0)&&((t<(N0-1))||(bc!=0))) + { +- mul_pauli_dble(mu,m,(*so).w,(*ro).w); +- mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); ++ mul_pauli2_dble(mu, m, (*so).w, (*ro).w); + } + else + { +@@ -1559,9 +1565,7 @@ void Dwoo_dble(double mu,spinor_dble *s,spinor_dble *r) + { + for (;m ++#include ++#include ++#include "mpi.h" ++#include "su3.h" ++#include "utils.h" ++#include "flags.h" ++#include "lattice.h" ++#include "uflds.h" ++#include "sflds.h" ++#include "sw_term.h" ++#include "block.h" ++#include "dirac.h" ++#include "global.h" ++ ++#define N0 (NPROC0 * L0) ++ ++typedef union ++{ ++ spinor s; ++ weyl w[2]; ++} spin_t; ++ ++#include "avx512.h" ++void doe_avx512(int *piup, int *pidn, su3 *u, spinor *pk, float coe, spin_t *rs) ++{ ++ spinor *sp, *sm, *sp2, *sm2; ++ su3 *up, *up1, *u1, *up2, *u2; ++ ++ /* 512-bit wide stores for the spinor for each color */ ++ __m512 a1, a2, a3; ++ __m512 b1, b2, b3; ++ __m256 c1, c2, c3; ++ ++ __m256 c256; ++ ++ /******************************* direction 0,1 **********************************/ ++ ++ sp = pk + (*(piup++)); ++ sm = pk + (*(pidn++)); ++ sp2 = pk + (*(piup++)); ++ sm2 = pk + (*(pidn++)); ++ ++ _avx512_load_4_halfspinor_f( a1, a2, a3, ++ &(*sp).c1.c1.re, &(*sm).c1.c1.re, ++ &(*sp2).c1.c1.re, &(*sm2).c1.c1.re ); ++ _avx512_load_4_halfspinor_f_reverse_up( b1, b2, b3, ++ &(*sp).c3.c1.re, &(*sm).c3.c1.re, ++ &(*sp2).c3.c1.re, &(*sm2).c3.c1.re ); ++ ++ sp = pk + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pk + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ sp2 = pk + (*(piup)); ++ _mm_prefetch( (char *) sp2, _MM_HINT_T0 ); ++ sm2 = pk + (*(pidn)); ++ _mm_prefetch( (char *) sm2, _MM_HINT_T0 ); ++ ++ up1 = u; ++ u1 = u+1; ++ up2 = u+2; ++ u2 = u+3; u=u2; ++ _avx512_dirac_combine_f_1( a1, b1 ); ++ _avx512_dirac_combine_f_1( a2, b2 ); ++ _avx512_dirac_combine_f_1( a3, b3 ); ++ ++ avx512_su3_mixed_multiply_8( *up1, *u1, *up2, *u2, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_to_weyl_f_12( c1, b1 ); ++ _avx512_to_weyl_f_12( c2, b2 ); ++ _avx512_to_weyl_f_12( c3, b3 ); ++ ++ /******************************* direction 2,3 *********************************/ ++ ++ _avx512_load_4_halfspinor_f( a1, a2, a3, ++ &(*sp).c1.c1.re,&(*sm).c1.c1.re, ++ &(*sp2).c1.c1.re, &(*sm2).c1.c1.re ); ++ _avx512_load_4_halfspinor_f_reverse_dn( b1, b2, b3, ++ &(*sp).c3.c1.re, &(*sm).c3.c1.re, ++ &(*sp2).c3.c1.re, &(*sm2).c3.c1.re ); ++ ++ _avx512_dirac_combine_f_2( a1, b1 ); ++ _avx512_dirac_combine_f_2( a2, b2 ); ++ _avx512_dirac_combine_f_2( a3, b3 ); ++ ++ up1 = u+1; ++ u1 = u+2; ++ up2 = u+3; ++ u2 = u+4; ++ avx512_su3_mixed_multiply_8( *up1, *u1, *up2, *u2, b1, b2, b3, a1, a2, a3 ); ++ ++ ++ c256 = _mm256_broadcast_ss( &coe ); ++ ++ _avx512_to_weyl_f_34( c1, b1 ); ++ _avx512_to_weyl_f_34( c2, b2 ); ++ _avx512_to_weyl_f_34( c3, b3 ); ++ ++ c1 = _mm256_mul_ps( c1, c256); ++ c2 = _mm256_mul_ps( c2, c256); ++ c3 = _mm256_mul_ps( c3, c256); ++ ++ _avx512_write_6_hwv_f( c1, c2, c3, &rs->s.c1.c1.re); ++} ++ ++void deo_avx512(int *piup, int *pidn, su3 *u, spinor *pl, float ceo, spin_t *rs) ++{ ++ spinor *sp, *sm, *sp2, *sm2; ++ su3 *up, *up1, *u1, *up2, *u2; ++ ++ /* 512-bit wide stores for the spinor for each color */ ++ __m512 a1, a2, a3; ++ __m512 b1, b2, b3; ++ __m256 c1, c2, c3; ++ ++ 
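/* Editor's note (descriptive comment, not part of the patch): deo_avx512 is
 * the counterpart of doe_avx512 above.  doe_avx512 gathers the neighbouring
 * half-spinors four at a time (two directions per pass), applies the four
 * gauge links with avx512_su3_mixed_multiply_8, folds the results into a
 * Weyl pair with the _avx512_to_weyl_f_* macros and stores coe times the sum
 * in *rs.  deo_avx512 runs the same steps in reverse: it loads the Weyl pair
 * from *rs, scales it by ceo, expands it back to Dirac components direction
 * by direction, multiplies with the forward and backward links interchanged,
 * and adds the products to the eight neighbouring spinors of pl addressed
 * through piup and pidn. */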
__m256 c256; ++ ++ ++ /******************************* direction 0 *********************************/ ++ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ sp2 = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp2, _MM_HINT_T0 ); ++ sm2 = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm2, _MM_HINT_T0 ); ++ ++ _avx512_load_6_hwv_f( c1,c2,c3, &rs->s.c1.c1.re ); ++ ++ c256 = _mm256_broadcast_ss( &ceo ); ++ c1 = _mm256_mul_ps( c1, c256 ); ++ c2 = _mm256_mul_ps( c2, c256 ); ++ c3 = _mm256_mul_ps( c3, c256 ); ++ ++ _avx512_to_dirac_f_1( a1, c1 ); ++ _avx512_to_dirac_f_1( a2, c2 ); ++ _avx512_to_dirac_f_1( a3, c3 ); ++ ++ _avx512_to_dirac_f_2( a1, c1 ); ++ _avx512_to_dirac_f_2( a2, c2 ); ++ _avx512_to_dirac_f_2( a3, c3 ); ++ ++ up1 = u; ++ u1 = u+1; ++ up2 = u+2; ++ u2 = u+3; u=u2; ++ avx512_su3_mixed_multiply_8( *u1, *up1, *u2, *up2, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_load_4_halfspinor_f( a1, a2, a3, ++ &(*sm).c1.c1.re, &(*sp).c1.c1.re, ++ &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); ++ a1 = _mm512_add_ps( a1, b1 ); ++ a2 = _mm512_add_ps( a2, b2 ); ++ a3 = _mm512_add_ps( a3, b3 ); ++ _avx512_write_4_halfspinor_f( a1, a2, a3, ++ &(*sm).c1.c1.re, &(*sp).c1.c1.re, ++ &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); ++ ++ _avx512_load_4_halfspinor_f_reverse_up( a1, a2, a3, ++ &(*sm).c3.c1.re, &(*sp).c3.c1.re, ++ &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); ++ _avx512_dirac_combine_f_3( a1, b1 ); ++ _avx512_dirac_combine_f_3( a2, b2 ); ++ _avx512_dirac_combine_f_3( a3, b3 ); ++ _avx512_write_4_halfspinor_f_reverse_up( a1, a2, a3, ++ &(*sm).c3.c1.re, &(*sp).c3.c1.re, ++ &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); ++ ++ /******************************* direction 2 *********************************/ ++ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ sp2 = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp2, _MM_HINT_T0 ); ++ sm2 = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm2, _MM_HINT_T0 ); ++ ++ _avx512_to_dirac_f_3( a1, c1 ); ++ _avx512_to_dirac_f_3( a2, c2 ); ++ _avx512_to_dirac_f_3( a3, c3 ); ++ ++ up1 = u+1; ++ u1 = u+2; ++ up2 = u+3; ++ u2 = u+4; ++ avx512_su3_mixed_multiply_8( *u1, *up1, *u2, *up2, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_load_4_halfspinor_f( a1, a2, a3, &(*sm).c1.c1.re, &(*sp).c1.c1.re, &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); ++ a1 = _mm512_add_ps( a1, b1 ); ++ a2 = _mm512_add_ps( a2, b2 ); ++ a3 = _mm512_add_ps( a3, b3 ); ++ _avx512_write_4_halfspinor_f( a1, a2, a3, &(*sm).c1.c1.re, &(*sp).c1.c1.re, &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); ++ ++ _avx512_load_4_halfspinor_f_reverse_dn( a1, a2, a3, &(*sm).c3.c1.re, &(*sp).c3.c1.re, &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); ++ _avx512_dirac_combine_f_4( a1, b1 ); ++ _avx512_dirac_combine_f_4( a2, b2 ); ++ _avx512_dirac_combine_f_4( a3, b3 ); ++ _avx512_write_4_halfspinor_f_reverse_dn( a1, a2, a3, &(*sm).c3.c1.re, &(*sp).c3.c1.re, &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/dirac/avx512/Dw_dble_avx512.c b/modules/dirac/avx512/Dw_dble_avx512.c +new file mode 100644 +index 0000000..cf07bfa +--- /dev/null ++++ b/modules/dirac/avx512/Dw_dble_avx512.c +@@ -0,0 +1,261 @@ ++/******************************************************************************* ++* ++* File Dw_dble_avx512.c ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* AVX512 implementation of the O(a)-improved 
Wilson-Dirac operator D (double- ++* precision programs). ++* ++* See ../Dw_dble.c for more details and alternative implementations ++ *******************************************************************************/ ++ ++#ifdef AVX512 ++ ++ ++#include ++#include ++#include ++#include "mpi.h" ++#include "su3.h" ++#include "utils.h" ++#include "flags.h" ++#include "lattice.h" ++#include "uflds.h" ++#include "sflds.h" ++#include "sw_term.h" ++#include "dirac.h" ++#include "global.h" ++ ++#define N0 (NPROC0 * L0) ++ ++typedef union ++{ ++ spinor_dble s; ++ weyl_dble w[2]; ++} spin_t; ++ ++#include "avx512.h" ++#include "sse.h" ++ ++void doe_dble_avx512( const int *piup, const int *pidn, const su3_dble *u, const spinor_dble *pk, double coe, spin_t *rs) ++{ ++ const spinor_dble *sp, *sm; ++ const su3_dble *up; ++ ++ /* 512-bit wide stores for the spinor for each color */ ++ __m512d a1, a2, a3; ++ __m512d b1, b2, b3; ++ __m512d w1, w2, w3; ++ __m512d t1, t2, t3, t4, t5, t6; ++ ++ __m128d tc; ++ __m512d c512; ++ ++ /******************************* direction 0 *********************************/ ++ ++ sp = pk + (*(piup++)); ++ sm = pk + (*(pidn++)); ++ ++ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_load_2_halfspinor_d( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ a1 = _mm512_maskz_add_pd( 0b00001111, t1, t4 ); ++ a1 = _mm512_mask_sub_pd( a1, 0b11110000, t1, t4 ); ++ a2 = _mm512_maskz_add_pd( 0b00001111, t2, t5 ); ++ a2 = _mm512_mask_sub_pd( a2, 0b11110000, t2, t5 ); ++ a3 = _mm512_maskz_add_pd( 0b00001111, t3, t6 ); ++ a3 = _mm512_mask_sub_pd( a3, 0b11110000, t3, t6 ); ++ ++ sp = pk + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pk + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ ++ up = u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_to_weyl_1( w1, b1 ); ++ _avx512_to_weyl_1( w2, b2 ); ++ _avx512_to_weyl_1( w3, b3 ); ++ ++ ++ /******************************* direction 1 *********************************/ ++ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_load_2_halfspinor_d_reverse( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ t4 = _mm512_permute_pd ( t4, 0b01010101 ); ++ a1 = _mm512_maskz_add_pd( 0b01011010, t1, t4 ); ++ a1 = _mm512_mask_sub_pd( a1, 0b10100101, t1, t4 ); ++ t5 = _mm512_permute_pd ( t5, 0b01010101 ); ++ a2 = _mm512_maskz_add_pd( 0b01011010, t2, t5 ); ++ a2 = _mm512_mask_sub_pd( a2, 0b10100101, t2, t5 ); ++ t6 = _mm512_permute_pd ( t6, 0b01010101 ); ++ a3 = _mm512_maskz_add_pd( 0b01011010, t3, t6 ); ++ a3 = _mm512_mask_sub_pd( a3, 0b10100101, t3, t6 ); ++ ++ sp = pk + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pk + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ up = ++u; ++ u += 1; ++ ++ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_to_weyl_2( w1, b1 ); ++ _avx512_to_weyl_2( w2, b2 ); ++ _avx512_to_weyl_2( w3, b3 ); ++ ++ /******************************* direction 2 *********************************/ ++ ++ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_load_2_halfspinor_d_reverse( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ a1 = _mm512_maskz_add_pd( 0b11000011, t1, t4 ); ++ a1 = _mm512_mask_sub_pd( a1, 0b00111100, t1, t4 ); ++ a2 = _mm512_maskz_add_pd( 0b11000011, t2, t5 ); ++ a2 = _mm512_mask_sub_pd( a2, 0b00111100, t2, t5 ); ++ a3 = _mm512_maskz_add_pd( 0b11000011, t3, t6 ); ++ a3 = 
_mm512_mask_sub_pd( a3, 0b00111100, t3, t6 ); ++ ++ sp = pk + (*(piup)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pk + (*(pidn)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ up = ++u; ++ u += 1; ++ ++ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_to_weyl_3( w1, b1 ); ++ _avx512_to_weyl_3( w2, b2 ); ++ _avx512_to_weyl_3( w3, b3 ); ++ ++ ++ /******************************* direction 3 *********************************/ ++ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_load_2_halfspinor_d( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ t4 = _mm512_permute_pd ( t4, 0b01010101 ); ++ a1 = _mm512_maskz_add_pd( 0b10010110, t1, t4 ); ++ a1 = _mm512_mask_sub_pd( a1, 0b01101001, t1, t4 ); ++ t5 = _mm512_permute_pd ( t5, 0b01010101 ); ++ a2 = _mm512_maskz_add_pd( 0b10010110, t2, t5 ); ++ a2 = _mm512_mask_sub_pd( a2, 0b01101001, t2, t5 ); ++ t6 = _mm512_permute_pd ( t6, 0b01010101 ); ++ a3 = _mm512_maskz_add_pd( 0b10010110, t3, t6 ); ++ a3 = _mm512_mask_sub_pd( a3, 0b01101001, t3, t6 ); ++ ++ up = ++u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_to_weyl_4( w1, b1 ); ++ _avx512_to_weyl_4( w2, b2 ); ++ _avx512_to_weyl_4( w3, b3 ); ++ ++ tc = _mm_load_sd( &coe ); ++ c512 = _mm512_broadcastsd_pd( tc ); ++ w1 = _mm512_mul_pd( c512, w1 ); ++ w2 = _mm512_mul_pd( c512, w2 ); ++ w3 = _mm512_mul_pd( c512, w3 ); ++ ++ _avx512_store_2_halfspinor_d( w1, w2, w3, &rs->s.c1.c1.re, &rs->s.c3.c1.re ); ++} ++ ++void deo_dble_avx512( const int *piup, const int *pidn, const su3_dble *u, spinor_dble *pl, double ceo, spin_t *rs) ++{ ++ const su3_dble *up; ++ spinor_dble *sp, *sm; ++ ++ /* 512-bit wide stores for the spinor for each color */ ++ __m512d a1, a2, a3; ++ __m512d b1, b2, b3; ++ __m512d w1, w2, w3; ++ ++ __m128d tc; ++ __m512d c512; ++ ++ /******************************* direction 0 *********************************/ ++ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ ++ _avx512_load_2_halfspinor_d( w1, w2, w3, &rs->s.c1.c1.re, &rs->s.c3.c1.re ); ++ ++ tc = _mm_load_sd( &ceo ); ++ c512 = _mm512_broadcastsd_pd( tc ); ++ w1 = _mm512_mul_pd( c512, w1 ); ++ w2 = _mm512_mul_pd( c512, w2 ); ++ w3 = _mm512_mul_pd( c512, w3 ); ++ ++ _avx512_expand_weyl( a1, w1 ) ++ _avx512_expand_weyl( a2, w2 ) ++ _avx512_expand_weyl( a3, w3 ) ++ ++ up = u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_add_to_spinors_2( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ /******************************* direction 1 *********************************/ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ ++ _avx512_expand_weyl_2( a1, w1 ); ++ _avx512_expand_weyl_2( a2, w2 ); ++ _avx512_expand_weyl_2( a3, w3 ); ++ ++ up = ++u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_add_to_spinors_3( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ /******************************* direction 2 *********************************/ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ ++ _avx512_expand_weyl_3( 
a1, w1 ); ++ _avx512_expand_weyl_3( a2, w2 ); ++ _avx512_expand_weyl_3( a3, w3 ); ++ ++ up = ++u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_add_to_spinors_4( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ ++ /******************************* direction 3 *********************************/ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ ++ _avx512_expand_weyl_4( a1, w1 ); ++ _avx512_expand_weyl_4( a2, w2 ); ++ _avx512_expand_weyl_4( a3, w3 ); ++ ++ up = ++u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_add_to_spinors_5( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/linalg/avx512/salg_avx512.c b/modules/linalg/avx512/salg_avx512.c +new file mode 100644 +index 0000000..509e78f +--- /dev/null ++++ b/modules/linalg/avx512/salg_avx512.c +@@ -0,0 +1,154 @@ ++/******************************************************************************* ++* ++* File salg_avx512.c ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* AVX512 implementations of single precision linear algebra ++* functions for spinors. ++* ++* See ../salg.c for more information and alternative ++* implementations. ++* ++*******************************************************************************/ ++ ++#ifdef AVX512 ++ ++#include "global.h" ++#include "linalg.h" ++#include "mpi.h" ++#include "sflds.h" ++ ++#include "avx512.h" ++ ++void mulc_spinor_add(int vol, spinor *s, spinor *r, complex z) ++{ ++ spinor *sm; ++ __m128 tr, ti; ++ __m512 zr, zi, t1, t2; ++ __m512 sign; ++ ++ sign = _mm512_set_ps( -1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1 ); ++ sm = s + vol; ++ ++ tr = _mm_load_ps1( &z.re ); ++ ti = _mm_load_ps1( &z.im ); ++ zr = _mm512_broadcast_f32x4( tr ); ++ zi = _mm512_broadcast_f32x4( ti ); ++ ++ zi = _mm512_mul_ps( zi, sign ); ++ ++ for (; s < sm; s+=2) { ++ t1 = _mm512_loadu_ps( &(*r).c1.c1.re ); ++ t2 = _mm512_mul_ps( zi, t1 ); ++ t2 = _mm512_permute_ps( t2, 0b10110001 ); ++ t2 = _mm512_fmadd_ps( zr, t1, t2 ); ++ t1 = _mm512_loadu_ps( &(*s).c1.c1.re ); ++ t1 = _mm512_add_ps( t1, t2 ); ++ _mm512_storeu_ps( &(*s).c1.c1.re, t1 ); ++ ++ t1 = _mm512_loadu_ps( &(*r).c1.c1.re + 16 ); ++ t2 = _mm512_mul_ps( zi, t1 ); ++ t2 = _mm512_permute_ps( t2, 0b10110001 ); ++ t2 = _mm512_fmadd_ps( zr, t1, t2 ); ++ t1 = _mm512_loadu_ps( &(*s).c1.c1.re + 16 ); ++ t1 = _mm512_add_ps( t1, t2 ); ++ _mm512_storeu_ps( &(*s).c1.c1.re + 16, t1 ); ++ ++ t1 = _mm512_loadu_ps( &(*r).c1.c1.re + 32 ); ++ t2 = _mm512_mul_ps( zi, t1 ); ++ t2 = _mm512_permute_ps( t2, 0b10110001 ); ++ t2 = _mm512_fmadd_ps( zr, t1, t2 ); ++ t1 = _mm512_loadu_ps( &(*s).c1.c1.re + 32 ); ++ t1 = _mm512_add_ps( t1, t2 ); ++ _mm512_storeu_ps( &(*s).c1.c1.re + 32, t1 ); ++ ++ r += 2; ++ } ++} ++ ++#if __GNUC__ < 7 ++/* This function was implemented to gcc 7 */ ++extern __inline double _mm512_reduce_add_ps( __m512 a ) { ++ float * d = (float *) &a; ++ return d[0]+d[1]+d[2]+d[3]+d[4]+d[5]+d[6]+d[7] ++ +d[8]+d[9]+d[10]+d[11]+d[12]+d[13]+d[14]+d[15] ; ++} ++#endif ++ ++complex spinor_prod(int vol, int icom, spinor *s, spinor *r ) ++{ ++ complex z; ++ complex_dble v, w; ++ spinor const *sm, *smb; ++ __m512 tr, ti, s1, s2, s3, r1, r2, 
r3, sign; ++ ++ double x, y; ++ ++ x = 0.0; ++ y = 0.0; ++ sm = s + vol; ++ ++ ++ while (s < sm) { ++ smb = s + 8; ++ if (smb > sm) { ++ smb = sm; ++ } ++ ++ tr = _mm512_setzero_ps(); ++ ti = _mm512_setzero_ps(); ++ ++ for (; s < smb; s+=2) { ++ s1 = _mm512_loadu_ps( &(*s).c1.c1.re ); ++ s2 = _mm512_loadu_ps( &(*s).c1.c1.re+16 ); ++ s3 = _mm512_loadu_ps( &(*s).c1.c1.re+32 ); ++ r1 = _mm512_loadu_ps( &(*r).c1.c1.re ); ++ r2 = _mm512_loadu_ps( &(*r).c1.c1.re+16 ); ++ r3 = _mm512_loadu_ps( &(*r).c1.c1.re+32 ); ++ ++ tr = _mm512_fmadd_ps( s1, r1, tr ); ++ tr = _mm512_fmadd_ps( s2, r2, tr ); ++ tr = _mm512_fmadd_ps( s3, r3, tr ); ++ ++ r1 = _mm512_permute_ps( r1, 0b10110001 ); ++ r2 = _mm512_permute_ps( r2, 0b10110001 ); ++ r3 = _mm512_permute_ps( r3, 0b10110001 ); ++ ++ sign = _mm512_set_ps( -1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1 ); ++ r1 = _mm512_mul_ps( r1, sign ); ++ r2 = _mm512_mul_ps( r2, sign ); ++ r3 = _mm512_mul_ps( r3, sign ); ++ ++ ti = _mm512_fmadd_ps( s1, r1, ti ); ++ ti = _mm512_fmadd_ps( s2, r2, ti ); ++ ti = _mm512_fmadd_ps( s3, r3, ti ); ++ ++ r += 2; ++ } ++ ++ x += (double) _mm512_reduce_add_ps( tr ); ++ y += (double) _mm512_reduce_add_ps( ti ); ++ ++ } ++ ++ v.re = x; ++ v.im = y; ++ ++ if ((icom==1)&&(NPROC>1)) ++ { ++ MPI_Reduce(&v.re,&w.re,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); ++ MPI_Bcast(&w.re,2,MPI_DOUBLE,0,MPI_COMM_WORLD); ++ z.re=(float)(w.re); ++ z.im=(float)(w.im); ++ } ++ else ++ { ++ z.re=(float)(v.re); ++ z.im=(float)(v.im); ++ } ++ return z; ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/linalg/avx512/salg_dble_avx512.c b/modules/linalg/avx512/salg_dble_avx512.c +new file mode 100644 +index 0000000..8fde96c +--- /dev/null ++++ b/modules/linalg/avx512/salg_dble_avx512.c +@@ -0,0 +1,402 @@ ++/******************************************************************************* ++* ++* File salg_dble_avx512.c ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* AVX512 implementations of single precision linear algebra ++* functions for spinors. ++* ++* See ../salg_dble.c for more information and alternative ++* implementations. 
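[Editor's note, not part of the patch] The functions in this file all follow the same pattern: each loop iteration treats one spinor_dble (24 doubles) as three 8-wide vectors, and complex multiplication is built from a permute of the re/im pairs plus fmaddsub. As a plain-C reference for what mulc_spinor_add_dble below computes on every complex component, i.e. s <- s + z*r (illustrative sketch only; the helper name is invented):

/* Illustrative scalar reference, not part of the patch. */
static void mulc_add_component(double *s_re, double *s_im,
                               double r_re, double r_im,
                               double z_re, double z_im)
{
   *s_re += z_re*r_re - z_im*r_im;   /* matches the fmaddsub even lanes */
   *s_im += z_re*r_im + z_im*r_re;   /* matches the fmaddsub odd lanes  */
}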
++* ++*******************************************************************************/ ++ ++#ifdef AVX512 ++ ++#include "global.h" ++#include "linalg.h" ++#include "mpi.h" ++#include "sflds.h" ++ ++#include "avx512.h" ++#if __GNUC__ < 7 ++/* This function was implemented to gcc 7 */ ++extern __inline double _mm512_reduce_add_pd( __m512d a ) { ++ double * d = (double *) &a; ++ return d[0]+d[1]+d[2]+d[3]+d[4]+d[5]+d[6]+d[7] ; ++} ++#endif ++ ++complex_dble spinor_prod_dble_avx512( spinor_dble const *s, spinor_dble const *smb, spinor_dble const *r) ++{ ++ __m512d tr, ti, s1, s2, s3, r1, r2, r3, sign; ++ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); ++ complex_dble z; ++ ++ tr = _mm512_setzero_pd(); ++ ti = _mm512_setzero_pd(); ++ for (; s < smb; s++) { ++ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ r1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ r2 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ r3 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ tr = _mm512_fmadd_pd( s1, r1, tr ); ++ tr = _mm512_fmadd_pd( s2, r2, tr ); ++ tr = _mm512_fmadd_pd( s3, r3, tr ); ++ r1 = _mm512_permute_pd( r1, 0b01010101 ); ++ r2 = _mm512_permute_pd( r2, 0b01010101 ); ++ r3 = _mm512_permute_pd( r3, 0b01010101 ); ++ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); ++ r1 = _mm512_mul_pd( r1, sign ); ++ r2 = _mm512_mul_pd( r2, sign ); ++ r3 = _mm512_mul_pd( r3, sign ); ++ ti = _mm512_fmadd_pd( s1, r1, ti ); ++ ti = _mm512_fmadd_pd( s2, r2, ti ); ++ ti = _mm512_fmadd_pd( s3, r3, ti ); ++ r += 1; ++ } ++ z.re = _mm512_reduce_add_pd( tr ); ++ z.im = _mm512_reduce_add_pd( ti ); ++ return z; ++} ++ ++double spinor_prod_re_dble_avx512( spinor_dble const *s, spinor_dble const *smb, spinor_dble const *r) ++{ ++ __m512d tr, ti, s1, s2, s3, r1, r2, r3; ++ float c; ++ tr = _mm512_setzero_pd(); ++ ti = _mm512_setzero_pd(); ++ for (; s < smb; s++) { ++ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ r1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ r2 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ r3 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ tr = _mm512_fmadd_pd( s1, r1, tr ); ++ tr = _mm512_fmadd_pd( s2, r2, tr ); ++ tr = _mm512_fmadd_pd( s3, r3, tr ); ++ r1 = _mm512_permute_pd( r1, 0b01010101 ); ++ r2 = _mm512_permute_pd( r2, 0b01010101 ); ++ r3 = _mm512_permute_pd( r3, 0b01010101 ); ++ r += 1; ++ } ++ c = _mm512_reduce_add_pd( tr ); ++ return c; ++} ++ ++complex_dble spinor_prod5_dble_avx512(spinor_dble const *s, spinor_dble const *smb, spinor_dble const *r) ++{ ++ __m512d tr, ti, s1, s2, s3, r1, r2, r3, sign; ++ complex_dble z; ++ ++ tr = _mm512_setzero_pd(); ++ ti = _mm512_setzero_pd(); ++ for (; s < smb; s++) { ++ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ r1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ r2 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ r3 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ sign = _mm512_set_pd( -1,-1,-1,-1,1,1,1,1 ); ++ s2 = _mm512_mul_pd( s2, sign ); ++ tr = _mm512_fmadd_pd( s1, r1, tr ); ++ tr = _mm512_fmadd_pd( s2, r2, tr ); ++ tr = _mm512_fnmadd_pd( s3, r3, tr ); ++ r1 = _mm512_permute_pd( r1, 0b01010101 ); ++ r2 = _mm512_permute_pd( r2, 0b01010101 ); ++ r3 = _mm512_permute_pd( r3, 0b01010101 ); ++ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); ++ r1 = _mm512_mul_pd( r1, sign ); ++ r2 = _mm512_mul_pd( r2, sign ); ++ r3 = _mm512_mul_pd( r3, sign ); ++ ti = _mm512_fmadd_pd( s1, 
r1, ti ); ++ ti = _mm512_fmadd_pd( s2, r2, ti ); ++ ti = _mm512_fnmadd_pd( s3, r3, ti ); ++ r += 1; ++ } ++ z.re = _mm512_reduce_add_pd( tr ); ++ z.im = _mm512_reduce_add_pd( ti ); ++ return z; ++} ++ ++double norm_square_dble_avx512(spinor_dble const *s, spinor_dble const *smb) ++{ ++ __m512d tmp, s1, s2, s3; ++ tmp = _mm512_setzero_pd(); ++ for (; s < smb; s++) { ++ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ tmp = _mm512_fmadd_pd( s1, s1, tmp ); ++ tmp = _mm512_fmadd_pd( s2, s2, tmp ); ++ tmp = _mm512_fmadd_pd( s3, s3, tmp ); ++ } ++ return _mm512_reduce_add_pd( tmp ); ++} ++ ++ ++ ++void mulc_spinor_add_dble(int vol, spinor_dble *s, spinor_dble *r, ++ complex_dble z) ++{ ++ spinor_dble *sm; ++ __m128d tr, ti; ++ __m512d zr, zi, t1, t2; ++ ++ tr = _mm_load_sd( &z.re ); ++ ti = _mm_load_sd( &z.im ); ++ zr = _mm512_broadcastsd_pd( tr ); ++ zi = _mm512_broadcastsd_pd( ti ); ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t2 = _mm512_fmaddsub_pd( zr, t1, t2 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t2 = _mm512_fmaddsub_pd( zr, t1, t2 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t2 = _mm512_fmaddsub_pd( zr, t1, t2 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); ++ ++ r += 1; ++ } ++} ++ ++ ++void mulr_spinor_add_dble(int vol, spinor_dble *s, spinor_dble *r, ++ double c) ++{ ++ spinor_dble *sm; ++ __m128d t128; ++ __m512d tc, t1, t2; ++ ++ t128 = _mm_load_sd( &c ); ++ tc = _mm512_broadcastsd_pd( t128 ); ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ t2 = _mm512_mul_pd( tc, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ t2 = _mm512_mul_pd( tc, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ t2 = _mm512_mul_pd( tc, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); ++ ++ r += 1; ++ } ++} ++ ++ ++void combine_spinor_dble(int vol, spinor_dble *s, spinor_dble *r, ++ double cs, double cr) ++{ ++ spinor_dble *sm; ++ __m128d ts128, tr128; ++ __m512d tcs, tcr, t1, t2; ++ ++ ts128 = _mm_load_sd( &cs ); ++ tr128 = _mm_load_sd( &cr ); ++ tcs = _mm512_broadcastsd_pd( ts128 ); ++ tcr = _mm512_broadcastsd_pd( tr128 ); ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ t2 = _mm512_mul_pd( tcr, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ t1 = _mm512_fmadd_pd( tcs, t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ t2 = _mm512_mul_pd( tcr, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ t1 = _mm512_fmadd_pd( tcs, t1, t2 
); ++ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ t2 = _mm512_mul_pd( tcr, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ t1 = _mm512_fmadd_pd( tcs, t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); ++ ++ r += 1; ++ } ++} ++ ++ ++void scale_dble(int vol, double c, spinor_dble *s) ++{ ++ spinor_dble *sm; ++ __m128d t128; ++ __m512d tc, t1; ++ ++ t128 = _mm_load_sd( &c ); ++ tc = _mm512_broadcastsd_pd( t128 ); ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ t1 = _mm512_mul_pd( tc, t1 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ t1 = _mm512_mul_pd( tc, t1 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ t1 = _mm512_mul_pd( tc, t1 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); ++ } ++} ++ ++void rotate_dble_avx512(int n, int ix, spinor_dble **ppk, spinor_dble *psi, complex_dble const *v) ++{ ++ spinor_dble *pk, *pj; ++ complex_dble const *z; ++ int k,j; ++ ++ for (k = 0; k < n; k++) { ++ __m128d tr, ti; ++ __m512d zr,zi, t1,t2, p1,p2,p3, sign; ++ ++ pk = psi + k; ++ pj = ppk[0] + ix; ++ z = v + k; ++ ++ tr = _mm_load_pd1( &z->re ); ++ ti = _mm_load_pd1( &z->im ); ++ zr = _mm512_broadcastsd_pd( tr ); ++ zi = _mm512_broadcastsd_pd( ti ); ++ ++ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); ++ zi = _mm512_mul_pd( zi, sign ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ p1 = _mm512_fmadd_pd( zr, t1, t2 ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+8 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ p2 = _mm512_fmadd_pd( zr, t1, t2 ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+16 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ p3 = _mm512_fmadd_pd( zr, t1, t2 ); ++ ++ for (j = 1; j < n; j++) { ++ pj = ppk[j] + ix; ++ z += n; ++ ++ tr = _mm_load_pd1( &z->re ); ++ ti = _mm_load_pd1( &z->im ); ++ zr = _mm512_broadcastsd_pd( tr ); ++ zi = _mm512_broadcastsd_pd( ti ); ++ zi = _mm512_mul_pd( zi, sign ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t1 = _mm512_fmadd_pd( zr, t1, t2 ); ++ p1 = _mm512_add_pd( p1, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+8 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t1 = _mm512_fmadd_pd( zr, t1, t2 ); ++ p2 = _mm512_add_pd( p2, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+16 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t1 = _mm512_fmadd_pd( zr, t1, t2 ); ++ p3 = _mm512_add_pd( p3, t1 ); ++ } ++ ++ _mm512_storeu_pd( &(*pk).c1.c1.re, p1 ); ++ _mm512_storeu_pd( &(*pk).c1.c1.re+8, p2 ); ++ _mm512_storeu_pd( &(*pk).c1.c1.re+16, p3 ); ++ } ++} ++ ++ ++void mulg5_dble(int vol, spinor_dble *s) ++{ ++ spinor_dble *sm; ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ __m512d s1; ++ __m256d s2; ++ ++ s1 = _mm512_loadu_pd( &(*s).c1.c1.re+12 ); ++ s1 = _mm512_sub_pd( _mm512_setzero_pd(), s1 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+12, s1 ); ++ ++ s2 = _mm256_loadu_pd( &(*s).c1.c1.re+20 ); ++ s2 = _mm256_sub_pd( _mm256_setzero_pd(), s2 ); ++ _mm256_storeu_pd( &(*s).c1.c1.re+20, s2 ); ++ } ++} ++ ++void mulmg5_dble(int vol, spinor_dble *s) ++{ ++ spinor_dble *sm; ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ __m512d s1; ++ __m256d s2; ++ ++ s1 = 
_mm512_loadu_pd( &(*s).c1.c1.re ); ++ s1 = _mm512_sub_pd( _mm512_setzero_pd(), s1 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re, s1 ); ++ ++ s2 = _mm256_loadu_pd( &(*s).c1.c1.re+8 ); ++ s2 = _mm256_sub_pd( _mm256_setzero_pd(), s2 ); ++ _mm256_storeu_pd( &(*s).c1.c1.re+8, s2 ); ++ } ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/linalg/salg.c b/modules/linalg/salg.c +index 2be05c1..f58aea8 100644 +--- a/modules/linalg/salg.c ++++ b/modules/linalg/salg.c +@@ -89,9 +89,12 @@ static void alloc_wrotate(int n) + nrot=n; + } + ++ ++ + #if (defined AVX) + #include "avx.h" + ++#ifndef AVX512 + #if (defined FMA3) + + complex spinor_prod(int vol,int icom,spinor *s,spinor *r) +@@ -323,9 +326,30 @@ complex spinor_prod(int vol,int icom,spinor *s,spinor *r) + + return z; + } ++#endif ++ ++void mulc_spinor_add(int vol, spinor *s, spinor *r, complex z) ++{ ++ spinor *sm; ++ ++ _avx_load_cmplx_up(z); ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ _avx_spinor_load(*s); ++ _avx_mulc_spinor_add(*r); ++ _avx_spinor_store(*s); ++ ++ r += 1; ++ } ++ ++ _avx_zeroupper(); ++} + + #endif + ++ ++ + float spinor_prod_re(int vol,int icom,spinor *s,spinor *r) + { + double x,y; +@@ -473,25 +497,6 @@ float norm_square(int vol,int icom,spinor *s) + } + + +-void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z) +-{ +- spinor *sm; +- +- _avx_load_cmplx_up(z); +- sm=s+vol; +- +- for (;s sm) { ++ smb = sm; ++ } ++ smz = spinor_prod_dble_avx512( s, smb, r ); ++ s = smb; r+=8; ++ add_to_hsum(isz, (double *)(&smz)); ++ } ++ ++ if ((icom == 1) && (NPROC > 1)) { ++ global_hsum(isz, (double *)(&smz)); ++ } else { ++ local_hsum(isz, (double *)(&smz)); ++ } ++ ++ return smz; ++} ++ ++double spinor_prod_re_dble_avx512( spinor_dble *s, spinor_dble *smb, spinor_dble *r); ++double spinor_prod_re_dble(int vol, int icom, spinor_dble *s, ++ spinor_dble *r) ++{ ++ spinor_dble *sm, *smb; ++ ++ if (init == 0) { ++ isx = init_hsum(1); ++ isz = init_hsum(2); ++ init = 1; ++ } ++ ++ reset_hsum(isx); ++ sm = s + vol; ++ ++ while (s < sm) { ++ smb = s + 8; ++ if (smb > sm) { ++ smb = sm; ++ } ++ smx = spinor_prod_re_dble_avx512( s, smb, r ); ++ s = smb; r+=8; ++ ++ add_to_hsum(isx, &smx); ++ } ++ ++ if ((icom == 1) && (NPROC > 1)) { ++ global_hsum(isx, &smx); ++ } else { ++ local_hsum(isx, &smx); ++ } ++ ++ return smx; ++} ++ ++complex_dble spinor_prod5_dble_avx512(spinor_dble *s, spinor_dble *smb, spinor_dble *r); ++complex_dble spinor_prod5_dble(int vol, int icom, spinor_dble *s, ++ spinor_dble *r) ++{ ++ spinor_dble *sm, *smb; ++ ++ if (init == 0) { ++ isx = init_hsum(1); ++ isz = init_hsum(2); ++ init = 1; ++ } ++ ++ reset_hsum(isz); ++ sm = s + vol; ++ ++ while (s < sm) { ++ smb = s + 8; ++ if (smb > sm) { ++ smb = sm; ++ } ++ ++ smz = spinor_prod5_dble_avx512( s, smb, r ); ++ s = smb; r+=8; ++ ++ add_to_hsum(isz, (double *)(&smz)); ++ } ++ ++ if ((icom == 1) && (NPROC > 1)) { ++ global_hsum(isz, (double *)(&smz)); ++ } else { ++ local_hsum(isz, (double *)(&smz)); ++ } ++ ++ return smz; ++} ++ ++double norm_square_dble_avx512(spinor_dble *s, spinor_dble *smb); ++double norm_square_dble(int vol, int icom, spinor_dble *s) ++{ ++ spinor_dble *sm, *smb; ++ ++ if (init == 0) { ++ isx = init_hsum(1); ++ isz = init_hsum(2); ++ init = 1; ++ } ++ ++ reset_hsum(isx); ++ sm = s + vol; ++ ++ while (s < sm) { ++ smb = s + 8; ++ if (smb > sm) { ++ smb = sm; ++ } ++ ++ smx = norm_square_dble_avx512( s, smb ); ++ s = smb; ++ ++ add_to_hsum(isx, &smx); ++ } ++ ++ if ((icom == 1) && (NPROC > 1)) { ++ global_hsum(isx, &smx); ++ } else { ++ 
local_hsum(isx, &smx); ++ } ++ ++ return smx; ++} ++ ++ ++void rotate_dble_avx512(int n, int ix, spinor_dble **ppk, spinor_dble *psi, complex_dble *v); ++void rotate_dble(int vol, int n, spinor_dble **ppk, complex_dble *v) ++{ ++ int ix,k; ++ ++ if (n > nrot) { ++ alloc_wrotate(n); ++ } ++ ++ for (ix = 0; ix < vol; ix++) { ++ rotate_dble_avx512( n, ix, ppk, psi, v ); ++ ++ for (k = 0; k < n; k++) { ++ *(ppk[k] + ix) = psi[k]; ++ } ++ } ++} ++ ++ ++#elif (defined AVX) + #include "avx.h" + + #if (defined FMA3) +diff --git a/modules/sw_term/avx512/pauli_avx512.c b/modules/sw_term/avx512/pauli_avx512.c +new file mode 100644 +index 0000000..89e9d50 +--- /dev/null ++++ b/modules/sw_term/avx512/pauli_avx512.c +@@ -0,0 +1,235 @@ ++/******************************************************************************* ++* ++* File pauli_avx512.c ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* AVX512 implementations of the clover term multiplication in ++* single precision. ++* ++* See ../pauli_avx512.c for more information and alternative ++* implementations. ++* ++*******************************************************************************/ ++ ++#ifdef AVX512 ++ ++#include ++#include ++#include ++#include "su3.h" ++#include "sw_term.h" ++typedef union ++{ ++ spinor s; ++ weyl w[2]; ++} spin_t; ++ ++#include "avx512.h" ++ ++ ++void mul_pauli2(float mu, pauli *m, spinor *source, spinor *res ) ++{ ++ spin_t *ps, *pr; ++ float const *u, *u2; ++ __m512i idx; ++ __m512 tr1,tr2,tr3; ++ __m512 ts1, ts2, ts3, tsi1, tsi2, tsi3, u512; ++ __m512 tu11, tu12, tu13, tu21, tu22, tu23; ++ __m512 tu1, tu2, tu3, tu4; ++ __m512 umu; ++ __m256 t256; ++ __m128 t128a, t128b, tmu; ++ ++ ps = (spin_t *)(source); ++ pr = (spin_t *)(res); ++ ++ u = (*m).u; ++ u2 = (m+1)->u; ++ ++ weyl * s = (*ps).w; ++ weyl * r = (*pr).w; ++ weyl * s2 = (*ps).w+1; ++ weyl * r2 = (*pr).w+1; ++ ++ s += 4; ++ _prefetch_spinor(s); ++ s -= 4; ++ ++ tr1 = _mm512_loadu_ps( &(*s).c1.c1.re ); ++ tr2 = _mm512_castps256_ps512( _mm256_loadu_ps( &(*s).c1.c1.re+16 ) ); ++ idx = _mm512_setr_epi32( 0,1,2,3,6,7,8,9, 12,13,14,15,18,19,20,21 ); ++ ts1 = _mm512_permutex2var_ps( tr1, idx, tr2 ); ++ idx = _mm512_setr_epi32( 2,3,4,5,8,9,10,11, 14,15,16,17,20,21,22,23 ); ++ ts2 = _mm512_permutex2var_ps( tr1, idx, tr2 ); ++ idx = _mm512_setr_epi32( 4,5,0,1,10,11,6,7, 16,17,12,13,22,23,18,19 ); ++ ts3 = _mm512_permutex2var_ps( tr1, idx, tr2 ); ++ ++ tu11 = _mm512_loadu_ps( u ); ++ tu12 = _mm512_loadu_ps( u+16 ); ++ tu13 = _mm512_loadu_ps( u+32 ); ++ tu21 = _mm512_loadu_ps( u2 ); ++ tu22 = _mm512_loadu_ps( u2+16 ); ++ tu23 = _mm512_loadu_ps( u2+32 ); ++ ++ ++ tsi1 = _mm512_permute_ps ( ts1, 0b10110001 ); ++ tsi2 = _mm512_permute_ps ( ts2, 0b10110001 ); ++ tsi3 = _mm512_permute_ps ( ts3, 0b10110001 ); ++ ++ ++ tmu = _mm_load_ps1( &mu ); ++ umu = _mm512_broadcastss_ps( tmu ); ++ ++ ++ idx = _mm512_setr_epi32( 0,1,10,11,4,5,2,3, 16,17,26,27,20,21,18,19 ); ++ tu1 = _mm512_permutex2var_ps( tu11, idx, tu21 ); ++ idx = _mm512_setr_epi32( 4,5,0,1,12,13,6,7, 20,21,16,17,28,29,22,23); ++ tu2 = _mm512_permutex2var_ps( tu12, idx, tu22 ); ++ ++ idx = _mm512_setr_epi32( 0,0,1,1,2,3,16+0,16+1, 8,8,9,9,10,11,16+8,16+9 ); ++ tu4 = _mm512_permutex2var_ps( tu1, idx, tu2 ); ++ u512 = _mm512_permute_ps( tu4, 0b10100000 ); ++ tr1 = _mm512_mul_ps( ts1, u512 ); ++ ++ idx = _mm512_setr_epi32( 0,1,2,3,16+5,16+5,16+7,16+7, ++ 8,9,10,11,16+13,16+13,16+15,16+15 ); ++ u512 = _mm512_permutex2var_ps( umu, idx, tu4 ); ++ u512 = _mm512_mul_ps( 
u512, tsi1 ); ++ tr1 = _mm512_mask_add_ps( tr1, 0b1010010110101010, tr1, u512 ); ++ tr1 = _mm512_mask_sub_ps( tr1, 0b0101101001010101, tr1, u512 ); ++ ++ ++ idx = _mm512_setr_epi32( 0,0,4,4,16+4,16+4,16+5,16+5, ++ 8,8,12,12,16+12,16+12,16+13,16+13 ); ++ u512 = _mm512_permutex2var_ps( tu2, idx, tu1 ); ++ tr2 = _mm512_mul_ps( ts2, u512 ); ++ ++ idx = _mm512_setr_epi32( 1,1,5,5,16+4,16+5,16+6,16+7, ++ 9,9,13,13,16+12,16+13,16+14,16+15 ); ++ u512 = _mm512_permutex2var_ps( tu2, idx, umu ); ++ u512 = _mm512_mul_ps( u512, tsi2 ); ++ tr2 = _mm512_mask_add_ps( tr2, 0b0101010110100101, tr2, u512 ); ++ tr2 = _mm512_mask_sub_ps( tr2, 0b1010101001011010, tr2, u512 ); ++ ++ ++ idx = _mm512_setr_epi32( 6,6,2,3,16+4,16+5,7,7, ++ 14,14,10,11,16+12,16+13,15,15 ); ++ tu4 = _mm512_permutex2var_ps( tu1, idx, tu2 ); ++ u512 = _mm512_permute_ps( tu4, 0b10100000 ); ++ tr3 = _mm512_mul_ps( ts3, u512 ); ++ ++ idx = _mm512_setr_epi32( 0,1,16+3,16+3,16+5,16+5,6,7, ++ 8,9,16+11,16+11,16+13,16+13,14,15 ); ++ u512 = _mm512_permutex2var_ps( umu, idx, tu4 ); ++ u512 = _mm512_mul_ps( u512, tsi3 ); ++ tr3 = _mm512_mask_add_ps( tr3, 0b0110010110100110, tr3, u512 ); ++ tr3 = _mm512_mask_sub_ps( tr3, 0b1001101001011001, tr3, u512 ); ++ ++ ++ ++ idx = _mm512_setr_epi32( 8,9,6,7,14,15,12,13, ++ 24,25,22,23,30,31,28,29 ); ++ tu1 = _mm512_permutex2var_ps( tu11, idx, tu21 ); ++ ++ u512 = _mm512_shuffle_ps( tu1, tu2, 0b10101010 ); ++ tr1 = _mm512_fmadd_ps( ts2, u512, tr1 ); ++ ++ u512 = _mm512_shuffle_ps( tu1, tu2, 0b11111111 ); ++ u512 = _mm512_mul_ps( u512, tsi2 ); ++ tr1 = _mm512_mask_add_ps( tr1, 0b1010101010101010, tr1, u512 ); ++ tr1 = _mm512_mask_sub_ps( tr1, 0b0101010101010101, tr1, u512 ); ++ ++ m += 4; ++ _prefetch_pauli_dble(m); ++ m -= 4; ++ ++ idx = _mm512_setr_epi32( 0,1,2,3,0,1,2,3, 16,17,18,19,16,17,18,19 ); ++ tu13 = _mm512_permutex2var_ps( tu13, idx, tu23 ); ++ idx = _mm512_setr_epi32( 6,7,2,3,8,9,14,15, 22,23,18,19,24,25,30,31 ); ++ tu2 = _mm512_permutex2var_ps( tu12, idx, tu22 ); ++ ++ idx = _mm512_setr_epi32( 6,7,16+0,16+1,16+6,16+7,0,0, ++ 14,15,16+8,16+9,16+14,16+15,0,0 ); ++ tu3 = _mm512_permutex2var_ps( tu1, idx, tu2 ); ++ tu4 = _mm512_permute_ps( tu3, 0b10100000 ); ++ u512 = _mm512_mask_shuffle_ps( tu4, 0b1111000011110000, tu4, tu13, 0b10100100 ); ++ tr2 = _mm512_fmadd_ps( ts1, u512, tr2 ); ++ ++ tu4 = _mm512_permute_ps( tu3, 0b11110101 ); ++ u512 = _mm512_mask_shuffle_ps( tu4, 0b1111000011110000, tu4, tu13, 0b11110100 ); ++ u512 = _mm512_mul_ps( u512, tsi1 ); ++ tr2 = _mm512_mask_add_ps( tr2, 0b0101010101010101, tr2, u512 ); ++ tr2 = _mm512_mask_sub_ps( tr2, 0b1010101010101010, tr2, u512 ); ++ ++ tu3 = _mm512_mask_shuffle_ps( tu2, 0b0000111100001111, tu1, tu2, 0b11100100 ); ++ u512 = _mm512_permute_ps( tu3, 0b10100000 ); ++ tr3 = _mm512_fmadd_ps( ts1, u512, tr3 ); ++ ++ u512 = _mm512_permute_ps( tu3, 0b11110101 ); ++ u512 = _mm512_mul_ps( u512, tsi1 ); ++ tr3 = _mm512_mask_add_ps( tr3, 0b1010010110100101, tr3, u512 ); ++ tr3 = _mm512_mask_sub_ps( tr3, 0b0101101001011010, tr3, u512 ); ++ ++ idx = _mm512_setr_epi32( 0,1,2,3, 4,5,16+2,16+3, 8,9,10,11,12,13,16+10,16+11 ); ++ tu3 = _mm512_permutex2var_ps( tu1, idx, tu2 ); ++ u512 = _mm512_permute_ps( tu3, 0b10100000 ); ++ tr1 = _mm512_fmadd_ps( ts3, u512, tr1 ); ++ ++ u512 = _mm512_permute_ps( tu3, 0b11110101 ); ++ u512 = _mm512_mul_ps( u512, tsi3 ); ++ tr1 = _mm512_mask_add_ps( tr1, 0b1010011010100110, tr1, u512 ); ++ tr1 = _mm512_mask_sub_ps( tr1, 0b0101100101011001, tr1, u512 ); ++ ++ ++ idx = _mm512_setr_epi32( 0,1,8,9,10,11,-1,-1, 16,17,24,25,26,27,-1,-1 
); ++ tu2 = _mm512_permutex2var_ps( tu12, idx, tu22 ); ++ ++ idx = _mm512_setr_epi32( 4,5,16+4,16+5,0,0,0,0, 12,13,16+12,16+13,0,0,0,0 ); ++ tu3 = _mm512_permutex2var_ps( tu2, idx, tu1 ); ++ u512 = _mm512_permute_ps( tu3, 0b10100000 ); ++ u512 = _mm512_mask_permute_ps( u512, 0b1111000011110000, tu13, 0b00001010 ); ++ tr2 = _mm512_fmadd_ps( ts3, u512, tr2 ); ++ ++ u512 = _mm512_permute_ps( tu3, 0b11110101 ); ++ u512 = _mm512_mask_permute_ps( u512, 0b1111000011110000, tu13, 0b01011111 ); ++ u512 = _mm512_mul_ps( u512, tsi3 ); ++ tr2 = _mm512_mask_add_ps( tr2, 0b0110010101100101, tr2, u512 ); ++ tr2 = _mm512_mask_sub_ps( tr2, 0b1001101010011010, tr2, u512 ); ++ ++ tu3 = _mm512_mask_shuffle_ps( tu2, 0b1111000011110000, tu2, tu13, 0b01000100 ); ++ u512 = _mm512_permute_ps( tu3, 0b10100000 ); ++ tr3 = _mm512_fmadd_ps( ts2, u512, tr3 ); ++ ++ u512 = _mm512_permute_ps( tu3, 0b11110101 ); ++ u512 = _mm512_mul_ps( u512, tsi2 ); ++ tr3 = _mm512_mask_add_ps( tr3, 0b1010010110100101, tr3, u512 ); ++ tr3 = _mm512_mask_sub_ps( tr3, 0b0101101001011010, tr3, u512 ); ++ ++ ++ ++ idx = _mm512_setr_epi32( 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27 ); ++ ts1 = _mm512_permutex2var_ps( tr1, idx, tr3 ); ++ idx = _mm512_setr_epi32( 4,5,6,7, 20,21,22,23, 12,13,14,15, 28,29,30,31 ); ++ ts2 = _mm512_permutex2var_ps( tr1, idx, tr3 ); ++ ts3 = _mm512_add_ps( ts1, ts2 ); ++ ++ t256 = _mm512_castps512_ps256( ts3 ); ++ _mm256_storeu_ps( &(*r).c1.c1.re, t256 ); ++ ++ t128a = _mm512_castps512_ps128( tr2 ); ++ t128b = _mm512_extractf32x4_ps( tr2, 1 ); ++ t128b = _mm_add_ps( t128a, t128b ); ++ _mm_storeu_ps( &(*r).c2.c2.re, t128b ); ++ ++ t256 = _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd(ts3), 1 ) ); ++ _mm256_storeu_ps( &(*r2).c1.c1.re, t256 ); ++ ++ t128a = _mm512_extractf32x4_ps( tr2, 2 ); ++ t128b = _mm512_extractf32x4_ps( tr2, 3 ); ++ t128b = _mm_add_ps( t128a, t128b ); ++ _mm_storeu_ps( &(*r2).c2.c2.re, t128b ); ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/sw_term/avx512/pauli_dble_avx512.c b/modules/sw_term/avx512/pauli_dble_avx512.c +new file mode 100644 +index 0000000..5eb11fb +--- /dev/null ++++ b/modules/sw_term/avx512/pauli_dble_avx512.c +@@ -0,0 +1,489 @@ ++/******************************************************************************* ++* ++* File pauli_avx512.c ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* AVX512 implementations of the clover term multiplication in ++* double precision. ++* ++* See ../pauli_dble_avx512.c for more information and alternative ++* implementations. 
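*
* The function mul_pauli2_dble() defined below applies two packed 6x6
* Hermitian matrices to the two Weyl components of a spinor.  Up to
* vectorization it is intended to be equivalent to the scalar fallback
* that this patch adds to ../pauli_dble.c, i.e. to
*
*   mul_pauli_dble( mu, m, s, r );
*   mul_pauli_dble( -mu, m+1, s+1, r+1 );
*
* where the first matrix is applied with a +i*mu shift on its diagonal
* and the second one with -i*mu.  As in the scalar code, each matrix is
* stored in the packed openQCD layout, with the six real diagonal
* elements in u[0..5] followed by the re/im pairs of the off-diagonal
* elements.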
++* ++*******************************************************************************/ ++ ++#ifdef AVX512 ++ ++#include ++#include ++#include ++#include ++#include "su3.h" ++#include "linalg.h" ++#include "sw_term.h" ++#define DELTA 1.0e-04 ++ ++typedef union ++{ ++ spinor_dble s; ++ weyl_dble w[2]; ++ complex_dble c[12]; ++} spin_t; ++ ++#include "avx512.h" ++ ++ ++void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r) ++{ ++ double const *u = m->u, *u2 = (m+1)->u; ++ ++ __m512d r1,r2,r3; ++ __m512d s1,s2,s3,s4,s5,s6, si1,si2,si3,si4,si5,si6, s512, u512; ++ __m512d t1,t2,t3,t4,t5,t6; ++ __m512d tu11, tu12, tu13, tu14, tu15, tu21, tu22, tu23, tu24, tu25; ++ __m512d tu1, tu2, tu3, tu4, tu5, tu6; ++ __m512d umu; ++ __m512i idx; ++ ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ t2 = _mm512_loadu_pd( &(*s).c1.c1.re + 8 ); ++ t3 = _mm512_loadu_pd( &(*s).c1.c1.re + 16 ); ++ ++ tu11 = _mm512_loadu_pd( u ); ++ tu12 = _mm512_loadu_pd( u+8 ); ++ tu13 = _mm512_loadu_pd( u+16 ); ++ tu14 = _mm512_loadu_pd( u+24 ); ++ tu15 = _mm512_loadu_pd( u+32 ); ++ tu21 = _mm512_loadu_pd( u2 ); ++ tu22 = _mm512_loadu_pd( u2+8 ); ++ tu23 = _mm512_loadu_pd( u2+16 ); ++ tu24 = _mm512_loadu_pd( u2+24 ); ++ tu25 = _mm512_loadu_pd( u2+32 ); ++ ++ umu = _mm512_loadu_pd( &mu ); ++ ++ ++ idx = _mm512_setr_epi64( 0,1,12,13,0,1,12,13 ); ++ s1 = _mm512_permutex2var_pd( t1, idx, t2 ); ++ idx = _mm512_setr_epi64( 2,3,14,15,2,3,14,15 ); ++ s2 = _mm512_permutex2var_pd( t1, idx, t2 ); ++ idx = _mm512_setr_epi64( 4,5,8,9,4,5,8,9 ); ++ s3 = _mm512_permutex2var_pd( t1, idx, t3 ); ++ idx = _mm512_setr_epi64( 0,1,12,13,0,1,12,13 ); ++ s4 = _mm512_permutex2var_pd( t2, idx, t3 ); ++ idx = _mm512_setr_epi64( 6,7,10,11,6,7,10,11 ); ++ s5 = _mm512_permutex2var_pd( t1, idx, t3 ); ++ idx = _mm512_setr_epi64( 2,3,14,15,2,3,14,15 ); ++ s6 = _mm512_permutex2var_pd( t2, idx, t3 ); ++ ++ si1 = _mm512_permute_pd ( s1, 0b01010101 ); ++ si2 = _mm512_permute_pd ( s2, 0b01010101 ); ++ si3 = _mm512_permute_pd ( s3, 0b01010101 ); ++ si4 = _mm512_permute_pd ( s4, 0b01010101 ); ++ si5 = _mm512_permute_pd ( s5, 0b01010101 ); ++ si6 = _mm512_permute_pd ( s6, 0b01010101 ); ++ ++ ++ idx = _mm512_setr_epi64( 0,1,8,9,6,7,14,15 ); ++ tu1 = _mm512_permutex2var_pd( tu11, idx, tu21 ); ++ idx = _mm512_setr_epi64( 4,5,12,13,2,3,10,11 ); ++ tu2 = _mm512_permutex2var_pd( tu12, idx, tu22 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu2 ); ++ r1 = _mm512_mul_pd( u512, s1 ); ++ idx = _mm512_setr_epi64( 0,0,0,0,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( umu, idx, tu2 ); ++ u512 = _mm512_mul_pd( u512, si1 ); ++ r1 = _mm512_mask_add_pd( r1, 0b01010110, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b10101001, r1, u512 ); ++ ++ idx = _mm512_setr_epi64( 2,3,10,11,4,5,12,13 ); ++ tu6 = _mm512_permutex2var_pd( tu11, idx, tu21 ); ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu6 ); ++ r1 = _mm512_fmadd_pd( u512, s5, r1 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8,8,8,8 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, umu ); ++ u512 = _mm512_mul_pd( u512, si5 ); ++ r1 = _mm512_mask_add_pd( r1, 0b01101010, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b10010101, r1, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 2,3,8+2,8+3,4,5,8+4,8+5 ); ++ tu3 = _mm512_permutex2var_pd( tu13, idx, tu23 ); ++ ++ idx = _mm512_setr_epi64( 1,1,3,3,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu3 ); ++ r2 = _mm512_mul_pd( u512, s2 ); ++ idx = 
_mm512_setr_epi64( 0,0,0,0,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( umu, idx, tu3 ); ++ u512 = _mm512_mul_pd( u512, si2 ); ++ r2 = _mm512_mask_add_pd( r2, 0b01010110, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b10101001, r2, u512 ); ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu6 ); ++ r2 = _mm512_fmadd_pd( u512, s4, r2 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8,8,8,8 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, umu ); ++ u512 = _mm512_mul_pd( u512, si4 ); ++ r2 = _mm512_mask_add_pd( r2, 0b01101010, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b10010101, r2, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 4,5,8+4,8+5,6,7,8+6,8+7 ); ++ tu4 = _mm512_permutex2var_pd( tu14, idx, tu24 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+0,8+0,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu6, idx, tu4 ); ++ r3 = _mm512_mul_pd( u512, s3 ); ++ idx = _mm512_setr_epi64( 0,0,0,0,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( umu, idx, tu4 ); ++ u512 = _mm512_mul_pd( u512, si3 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01010110, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 0b10101001, r3, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, tu6 ); ++ r3 = _mm512_fmadd_pd( u512, s6, r3 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8,8,8,8 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, umu ); ++ u512 = _mm512_mul_pd( u512, si6 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01101010, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 0b10010101, r3, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8,8,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu2 ); ++ r2 = _mm512_fmadd_pd( u512, s1, r2 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,9,9,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu2 ); ++ u512 = _mm512_mul_pd( u512, si1 ); ++ r2 = _mm512_mask_add_pd( r2, 0b01010101, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b10101010, r2, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); ++ r1 = _mm512_fmadd_pd( u512, s4, r1 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); ++ u512 = _mm512_mul_pd( u512, si4 ); ++ r1 = _mm512_mask_add_pd( r1, 0b10101010, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b01010101, r1, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu3 ); ++ r1 = _mm512_fmadd_pd( u512, s2, r1 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu3 ); ++ u512 = _mm512_mul_pd( u512, si2 ); ++ r1 = _mm512_mask_add_pd( r1, 0b01011010, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b10100101, r1, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); ++ r2 = _mm512_fmadd_pd( u512, s5, r2 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); ++ u512 = _mm512_mul_pd( u512, si5 ); ++ r2 = _mm512_mask_add_pd( r2, 0b01011010, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b10100101, r2, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,8,8,6,6,14,14 ); ++ u512 = _mm512_permutex2var_pd( tu12, idx, tu22 ); ++ r3 = _mm512_fmadd_pd( u512, s1, r3 ); ++ idx = _mm512_setr_epi64( 1,1,9,9,7,7,15,15 ); ++ u512 = _mm512_permutex2var_pd( tu12, idx, tu22 ); ++ u512 = _mm512_mul_pd( u512, si1 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01010101, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 
0b10101010, r3, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 0,0,8,8,6,6,14,14 ); ++ u512 = _mm512_permutex2var_pd( tu13, idx, tu23 ); ++ r3 = _mm512_fmadd_pd( u512, s2, r3 ); ++ idx = _mm512_setr_epi64( 1,1,9,9,7,7,15,15 ); ++ u512 = _mm512_permutex2var_pd( tu13, idx, tu23 ); ++ u512 = _mm512_mul_pd( u512, si2 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01010101, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 0b10101010, r3, u512 ); ++ ++ ++ ++ idx = _mm512_setr_epi64( 0,1,8+0,8+1,6,7,8+6,8+7 ); ++ tu2 = _mm512_permutex2var_pd( tu12, idx, tu22 ); ++ idx = _mm512_setr_epi64( 2,3,8+2,8+3,0,1,8+0,8+1 ); ++ tu4 = _mm512_permutex2var_pd( tu14, idx, tu24 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); ++ r1 = _mm512_fmadd_pd( u512, s3, r1 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); ++ u512 = _mm512_mul_pd( u512, si3 ); ++ r1 = _mm512_mask_add_pd( r1, 0b01011010, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b10100101, r1, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 0,1,8+0,8+1,2,3,8+2,8+3 ); ++ tu5 = _mm512_permutex2var_pd( tu15, idx, tu25 ); ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); ++ r3 = _mm512_fmadd_pd( u512, s5, r3 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); ++ u512 = _mm512_mul_pd( u512, si5 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01011010, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 0b10100101, r3, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); ++ r3 = _mm512_fmadd_pd( u512, s4, r3 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); ++ u512 = _mm512_mul_pd( u512, si4 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01011010, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 0b10100101, r3, u512 ); ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu5 ); ++ r1 = _mm512_fmadd_pd( u512, s6, r1 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu5 ); ++ u512 = _mm512_mul_pd( u512, si6 ); ++ r1 = _mm512_mask_add_pd( r1, 0b10101010, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b01010101, r1, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 6,7,8+6,8+7,0,1,8+0,8+1 ); ++ tu3 = _mm512_permutex2var_pd( tu13, idx, tu23 ); ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); ++ r2 = _mm512_fmadd_pd( u512, s3, r2 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); ++ u512 = _mm512_mul_pd( u512, si3 ); ++ r2 = _mm512_mask_add_pd( r2, 0b01011010, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b10100101, r2, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu5 ); ++ r2 = _mm512_fmadd_pd( u512, s6, r2 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu5 ); ++ u512 = _mm512_mul_pd( u512, si6 ); ++ r2 = _mm512_mask_add_pd( r2, 0b10101010, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b01010101, r2, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,1,8,9,2,3,10,11 ); ++ t1 = _mm512_permutex2var_pd( r1, idx, r2 ); ++ idx = _mm512_setr_epi64( 2,3,14,15,0,1,12,13 ); ++ t2 = _mm512_permutex2var_pd( r3, idx, r1 ); ++ idx = _mm512_setr_epi64( 4,5,12,13,6,7,14,15 
); ++ t3 = _mm512_permutex2var_pd( r2, idx, r3 ); ++ r1 = _mm512_mask_blend_pd( 0b11110000, t1, t2 ); ++ r2 = _mm512_mask_blend_pd( 0b11110000, t3, t1 ); ++ r3 = _mm512_mask_blend_pd( 0b11110000, t2, t3 ); ++ ++ _mm512_storeu_pd( &r[0].c1.c1.re, r1 ); ++ _mm512_storeu_pd( &r[0].c2.c2.re, r2 ); ++ _mm512_storeu_pd( &r[1].c1.c3.re, r3 ); ++} ++ ++ ++int fwd_house_avx512(double eps, complex_dble *aa, complex_dble *dd, double * rr ) ++{ ++ int i, j, k, ifail; ++ double r1, r2, r3; ++ complex_dble z; ++ ++ ifail = 0; ++ ++ for (k = 0; k < 5; k++) { ++ r1 = aa[6 * k + k].re * aa[6 * k + k].re + ++ aa[6 * k + k].im * aa[6 * k + k].im; ++ r2 = sqrt(r1); ++ ++ for (j = (k + 1); j < 6; j++) ++ r1 += (aa[6 * j + k].re * aa[6 * j + k].re + ++ aa[6 * j + k].im * aa[6 * j + k].im); ++ ++ if (r1 >= eps) ++ r1 = sqrt(r1); ++ else { ++ ifail = 1; ++ r1 = 1.0; ++ } ++ ++ if (r2 >= (DBL_EPSILON * r1)) { ++ r3 = 1.0 / r2; ++ z.re = r3 * aa[6 * k + k].re; ++ z.im = r3 * aa[6 * k + k].im; ++ } else { ++ z.re = 1.0; ++ z.im = 0.0; ++ } ++ ++ aa[6 * k + k].re += r1 * z.re; ++ aa[6 * k + k].im += r1 * z.im; ++ ++ r3 = 1.0 / (r1 * (r1 + r2)); ++ rr[k] = r3; ++ dd[k].re = -(r1 + r2) * r3 * z.re; ++ dd[k].im = (r1 + r2) * r3 * z.im; ++ ++ for (j = (k + 1); j < 6; j++) { ++ complex_dble z, *ak, *aj; ++ __m128d mz, t1, t2, t3; ++ mz = _mm_setzero_pd(); ++ ++ ak = aa + 6 * k + k; ++ aj = aa + 6 * k + j; ++ ++ for (i = k; i < 6; i++) { ++ t1 = _mm_loaddup_pd(&ak->re); ++ t2 = _mm_loaddup_pd(&ak->im); ++ t3 = _mm_load_pd(&aj->re); ++ t2 = _mm_mul_pd( t2, t3 ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmsubadd_pd( t1, t3, t2 ); ++ mz = _mm_add_pd( mz, t1 ); ++ ak += 6; ++ aj += 6; ++ } ++ ++ t1 = _mm_loaddup_pd(&r3); ++ mz = _mm_mul_pd( mz, t1 ); ++ _mm_storeu_pd( &z.re, mz ); ++ ++ ak = aa + 6 * k + k; ++ aj = aa + 6 * k + j; ++ for (i = k; i < 6; i++) { ++ t1 = _mm_loaddup_pd(&ak->re); ++ t2 = _mm_loaddup_pd(&ak->im); ++ t3 = _mm_load_pd(&aj->re); ++ t2 = _mm_mul_pd( mz, t2 ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmaddsub_pd( mz,t1, t2 ); ++ t3 = _mm_sub_pd( t3, t1 ); ++ _mm_storeu_pd( &aj->re, t3 ); ++ ak += 6; ++ aj += 6; ++ } ++ } ++ } ++ ++ r1 = aa[35].re * aa[35].re + aa[35].im * aa[35].im; ++ ++ if (r1 >= eps) ++ r1 = 1.0 / r1; ++ else { ++ ifail = 1; ++ r1 = 1.0; ++ } ++ ++ dd[5].re = r1 * aa[35].re; ++ dd[5].im = -r1 * aa[35].im; ++ ++ return ifail; ++} ++ ++ ++void solv_sys_avx512( complex_dble *aa, complex_dble *dd ) ++{ ++ int i, j, k; ++ complex_dble z; ++ __m128d mz, t1, t2, t3; ++ ++ for (k = 5; k > 0; k--) { ++ for (i = (k - 1); i >= 0; i--) { ++ t1 = _mm_loaddup_pd(&aa[6 * i + k].re); ++ t2 = _mm_loaddup_pd(&aa[6 * i + k].im); ++ t3 = _mm_load_pd(&dd[k].re); ++ t2 = _mm_mul_pd( t2, t3 ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ mz = _mm_fmaddsub_pd( t1, t3, t2 ); ++ ++ for (j = (k - 1); j > i; j--) { ++ t1 = _mm_loaddup_pd(&aa[6 * i + j].re); ++ t2 = _mm_loaddup_pd(&aa[6 * i + j].im); ++ t3 = _mm_load_pd(&aa[6 * j + k].re); ++ t2 = _mm_mul_pd( t2, t3 ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmaddsub_pd( t1, t3, t2 ); ++ mz = _mm_add_pd( mz, t1 ); ++ } ++ ++ t1 = _mm_loaddup_pd(&dd[i].re); ++ t2 = _mm_loaddup_pd(&dd[i].im); ++ t2 = _mm_mul_pd( t2, mz ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmaddsub_pd( t1, mz, t2 ); ++ t1 = _mm_sub_pd( _mm_setzero_pd(), t1); /* this line flips the sign of t1 */ ++ _mm_storeu_pd( &aa[6 * i + k].re, t1 ); ++ } ++ } ++} ++ ++void bck_house_avx512( complex_dble *aa, complex_dble *dd, double * rr ) ++{ ++ int i, j, k; ++ complex_dble z; ++ 
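  /*
   * The complex multiply-accumulate steps below (and in
   * fwd_house_avx512() and solv_sys_avx512() above) use a standard SSE
   * idiom for the product z = a*b of two complex doubles held as
   * [re,im] pairs in 128-bit registers, roughly:
   *
   *   t1 = _mm_loaddup_pd( &a.re );        t1 = [ a.re, a.re ]
   *   t2 = _mm_loaddup_pd( &a.im );        t2 = [ a.im, a.im ]
   *   t3 = _mm_load_pd( &b.re );           t3 = [ b.re, b.im ]
   *   t2 = _mm_mul_pd( t2, t3 );           t2 = [ a.im*b.re, a.im*b.im ]
   *   t2 = _mm_permute_pd( t2, 0b01 );     t2 = [ a.im*b.im, a.im*b.re ]
   *   t1 = _mm_fmaddsub_pd( t1, t3, t2 );  t1 = [ a.re*b.re - a.im*b.im,
   *                                               a.re*b.im + a.im*b.re ]
   *
   * so that t1 finally holds a*b.  Replacing the last step by
   * _mm_fmsubadd_pd() yields conj(a)*b instead, the variant used in
   * fwd_house_avx512() when accumulating the scalar product of two
   * matrix columns.
   */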
++ aa[35].re = dd[5].re; ++ aa[35].im = dd[5].im; ++ ++ for (k = 4; k >= 0; k--) { ++ z.re = dd[k].re; ++ z.im = dd[k].im; ++ dd[k].re = aa[6 * k + k].re; ++ dd[k].im = aa[6 * k + k].im; ++ aa[6 * k + k].re = z.re; ++ aa[6 * k + k].im = z.im; ++ ++ for (j = (k + 1); j < 6; j++) { ++ dd[j].re = aa[6 * j + k].re; ++ dd[j].im = aa[6 * j + k].im; ++ aa[6 * j + k].re = 0.0; ++ aa[6 * j + k].im = 0.0; ++ } ++ ++ for (i = 0; i < 6; i++) { ++ __m128d mz, t1, t2, t3; ++ mz = _mm_setzero_pd(); ++ ++ for (j = k; j < 6; j++) { ++ t1 = _mm_loaddup_pd(&aa[6 * i + j].re); ++ t2 = _mm_loaddup_pd(&aa[6 * i + j].im); ++ t3 = _mm_load_pd(&dd[j].re); ++ t2 = _mm_mul_pd( t2, t3 ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmaddsub_pd( t1, t3, t2 ); ++ mz = _mm_add_pd( mz, t1 ); ++ } ++ ++ t1 = _mm_loaddup_pd( rr+k ); ++ mz = _mm_mul_pd( mz, t1 ); ++ ++ for (j = k; j < 6; j++) { ++ t1 = _mm_loaddup_pd( &dd[j].re ); ++ t2 = _mm_loaddup_pd( &dd[j].im ); ++ t2 = _mm_mul_pd( t2, mz ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmsubadd_pd( t1, mz, t2 ); ++ ++ t2 = _mm_load_pd( &aa[6 * i + j].re ); ++ t1 = _mm_sub_pd( t2, t1 ); ++ _mm_storeu_pd( &aa[6 * i + j].re, t1 ); ++ } ++ } ++ } ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/sw_term/pauli.c b/modules/sw_term/pauli.c +index 7c8a7b6..8e14bee 100644 +--- a/modules/sw_term/pauli.c ++++ b/modules/sw_term/pauli.c +@@ -409,7 +409,96 @@ void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) + "xmm6", "xmm7"); + } + +-#if (defined AVX) ++ ++#else ++ ++static weyl rs; ++ ++void mul_pauli(float mu, pauli *m, weyl *s, weyl *r) ++{ ++ float const *u; ++ ++ u = (*m).u; ++ ++ rs.c1.c1.re = ++ u[0] * (*s).c1.c1.re - mu * (*s).c1.c1.im + u[6] * (*s).c1.c2.re - ++ u[7] * (*s).c1.c2.im + u[8] * (*s).c1.c3.re - u[9] * (*s).c1.c3.im + ++ u[10] * (*s).c2.c1.re - u[11] * (*s).c2.c1.im + u[12] * (*s).c2.c2.re - ++ u[13] * (*s).c2.c2.im + u[14] * (*s).c2.c3.re - u[15] * (*s).c2.c3.im; ++ ++ rs.c1.c1.im = ++ u[0] * (*s).c1.c1.im + mu * (*s).c1.c1.re + u[6] * (*s).c1.c2.im + ++ u[7] * (*s).c1.c2.re + u[8] * (*s).c1.c3.im + u[9] * (*s).c1.c3.re + ++ u[10] * (*s).c2.c1.im + u[11] * (*s).c2.c1.re + u[12] * (*s).c2.c2.im + ++ u[13] * (*s).c2.c2.re + u[14] * (*s).c2.c3.im + u[15] * (*s).c2.c3.re; ++ ++ rs.c1.c2.re = ++ u[6] * (*s).c1.c1.re + u[7] * (*s).c1.c1.im + u[1] * (*s).c1.c2.re - ++ mu * (*s).c1.c2.im + u[16] * (*s).c1.c3.re - u[17] * (*s).c1.c3.im + ++ u[18] * (*s).c2.c1.re - u[19] * (*s).c2.c1.im + u[20] * (*s).c2.c2.re - ++ u[21] * (*s).c2.c2.im + u[22] * (*s).c2.c3.re - u[23] * (*s).c2.c3.im; ++ ++ rs.c1.c2.im = ++ u[6] * (*s).c1.c1.im - u[7] * (*s).c1.c1.re + u[1] * (*s).c1.c2.im + ++ mu * (*s).c1.c2.re + u[16] * (*s).c1.c3.im + u[17] * (*s).c1.c3.re + ++ u[18] * (*s).c2.c1.im + u[19] * (*s).c2.c1.re + u[20] * (*s).c2.c2.im + ++ u[21] * (*s).c2.c2.re + u[22] * (*s).c2.c3.im + u[23] * (*s).c2.c3.re; ++ ++ rs.c1.c3.re = ++ u[8] * (*s).c1.c1.re + u[9] * (*s).c1.c1.im + u[16] * (*s).c1.c2.re + ++ u[17] * (*s).c1.c2.im + u[2] * (*s).c1.c3.re - mu * (*s).c1.c3.im + ++ u[24] * (*s).c2.c1.re - u[25] * (*s).c2.c1.im + u[26] * (*s).c2.c2.re - ++ u[27] * (*s).c2.c2.im + u[28] * (*s).c2.c3.re - u[29] * (*s).c2.c3.im; ++ ++ rs.c1.c3.im = ++ u[8] * (*s).c1.c1.im - u[9] * (*s).c1.c1.re + u[16] * (*s).c1.c2.im - ++ u[17] * (*s).c1.c2.re + u[2] * (*s).c1.c3.im + mu * (*s).c1.c3.re + ++ u[24] * (*s).c2.c1.im + u[25] * (*s).c2.c1.re + u[26] * (*s).c2.c2.im + ++ u[27] * (*s).c2.c2.re + u[28] * (*s).c2.c3.im + u[29] * (*s).c2.c3.re; ++ ++ rs.c2.c1.re = ++ u[10] * 
(*s).c1.c1.re + u[11] * (*s).c1.c1.im + u[18] * (*s).c1.c2.re + ++ u[19] * (*s).c1.c2.im + u[24] * (*s).c1.c3.re + u[25] * (*s).c1.c3.im + ++ u[3] * (*s).c2.c1.re - mu * (*s).c2.c1.im + u[30] * (*s).c2.c2.re - ++ u[31] * (*s).c2.c2.im + u[32] * (*s).c2.c3.re - u[33] * (*s).c2.c3.im; ++ ++ rs.c2.c1.im = ++ u[10] * (*s).c1.c1.im - u[11] * (*s).c1.c1.re + u[18] * (*s).c1.c2.im - ++ u[19] * (*s).c1.c2.re + u[24] * (*s).c1.c3.im - u[25] * (*s).c1.c3.re + ++ u[3] * (*s).c2.c1.im + mu * (*s).c2.c1.re + u[30] * (*s).c2.c2.im + ++ u[31] * (*s).c2.c2.re + u[32] * (*s).c2.c3.im + u[33] * (*s).c2.c3.re; ++ ++ rs.c2.c2.re = ++ u[12] * (*s).c1.c1.re + u[13] * (*s).c1.c1.im + u[20] * (*s).c1.c2.re + ++ u[21] * (*s).c1.c2.im + u[26] * (*s).c1.c3.re + u[27] * (*s).c1.c3.im + ++ u[30] * (*s).c2.c1.re + u[31] * (*s).c2.c1.im + u[4] * (*s).c2.c2.re - ++ mu * (*s).c2.c2.im + u[34] * (*s).c2.c3.re - u[35] * (*s).c2.c3.im; ++ ++ rs.c2.c2.im = ++ u[12] * (*s).c1.c1.im - u[13] * (*s).c1.c1.re + u[20] * (*s).c1.c2.im - ++ u[21] * (*s).c1.c2.re + u[26] * (*s).c1.c3.im - u[27] * (*s).c1.c3.re + ++ u[30] * (*s).c2.c1.im - u[31] * (*s).c2.c1.re + u[4] * (*s).c2.c2.im + ++ mu * (*s).c2.c2.re + u[34] * (*s).c2.c3.im + u[35] * (*s).c2.c3.re; ++ ++ rs.c2.c3.re = ++ u[14] * (*s).c1.c1.re + u[15] * (*s).c1.c1.im + u[22] * (*s).c1.c2.re + ++ u[23] * (*s).c1.c2.im + u[28] * (*s).c1.c3.re + u[29] * (*s).c1.c3.im + ++ u[32] * (*s).c2.c1.re + u[33] * (*s).c2.c1.im + u[34] * (*s).c2.c2.re + ++ u[35] * (*s).c2.c2.im + u[5] * (*s).c2.c3.re - mu * (*s).c2.c3.im; ++ ++ rs.c2.c3.im = ++ u[14] * (*s).c1.c1.im - u[15] * (*s).c1.c1.re + u[22] * (*s).c1.c2.im - ++ u[23] * (*s).c1.c2.re + u[28] * (*s).c1.c3.im - u[29] * (*s).c1.c3.re + ++ u[32] * (*s).c2.c1.im - u[33] * (*s).c2.c1.re + u[34] * (*s).c2.c2.im - ++ u[35] * (*s).c2.c2.re + u[5] * (*s).c2.c3.im + mu * (*s).c2.c3.re; ++ ++ (*r) = rs; ++} ++ ++#endif ++ ++#ifndef AVX512 ++#ifdef AVX + #include "avx.h" + + void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) +@@ -952,128 +1041,6 @@ void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) + } + + #endif +-#else +- +-static weyl rs; +- +- +-void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) +-{ +- float *u; +- +- u=(*m).u; +- +- rs.c1.c1.re= +- u[ 0]*(*s).c1.c1.re- mu*(*s).c1.c1.im+ +- u[ 6]*(*s).c1.c2.re-u[ 7]*(*s).c1.c2.im+ +- u[ 8]*(*s).c1.c3.re-u[ 9]*(*s).c1.c3.im+ +- u[10]*(*s).c2.c1.re-u[11]*(*s).c2.c1.im+ +- u[12]*(*s).c2.c2.re-u[13]*(*s).c2.c2.im+ +- u[14]*(*s).c2.c3.re-u[15]*(*s).c2.c3.im; +- +- rs.c1.c1.im= +- u[ 0]*(*s).c1.c1.im+ mu*(*s).c1.c1.re+ +- u[ 6]*(*s).c1.c2.im+u[ 7]*(*s).c1.c2.re+ +- u[ 8]*(*s).c1.c3.im+u[ 9]*(*s).c1.c3.re+ +- u[10]*(*s).c2.c1.im+u[11]*(*s).c2.c1.re+ +- u[12]*(*s).c2.c2.im+u[13]*(*s).c2.c2.re+ +- u[14]*(*s).c2.c3.im+u[15]*(*s).c2.c3.re; +- +- rs.c1.c2.re= +- u[ 6]*(*s).c1.c1.re+u[ 7]*(*s).c1.c1.im+ +- u[ 1]*(*s).c1.c2.re- mu*(*s).c1.c2.im+ +- u[16]*(*s).c1.c3.re-u[17]*(*s).c1.c3.im+ +- u[18]*(*s).c2.c1.re-u[19]*(*s).c2.c1.im+ +- u[20]*(*s).c2.c2.re-u[21]*(*s).c2.c2.im+ +- u[22]*(*s).c2.c3.re-u[23]*(*s).c2.c3.im; +- +- rs.c1.c2.im= +- u[ 6]*(*s).c1.c1.im-u[ 7]*(*s).c1.c1.re+ +- u[ 1]*(*s).c1.c2.im+ mu*(*s).c1.c2.re+ +- u[16]*(*s).c1.c3.im+u[17]*(*s).c1.c3.re+ +- u[18]*(*s).c2.c1.im+u[19]*(*s).c2.c1.re+ +- u[20]*(*s).c2.c2.im+u[21]*(*s).c2.c2.re+ +- u[22]*(*s).c2.c3.im+u[23]*(*s).c2.c3.re; +- +- rs.c1.c3.re= +- u[ 8]*(*s).c1.c1.re+u[ 9]*(*s).c1.c1.im+ +- u[16]*(*s).c1.c2.re+u[17]*(*s).c1.c2.im+ +- u[ 2]*(*s).c1.c3.re- mu*(*s).c1.c3.im+ +- u[24]*(*s).c2.c1.re-u[25]*(*s).c2.c1.im+ +- 
u[26]*(*s).c2.c2.re-u[27]*(*s).c2.c2.im+ +- u[28]*(*s).c2.c3.re-u[29]*(*s).c2.c3.im; +- +- rs.c1.c3.im= +- u[ 8]*(*s).c1.c1.im-u[ 9]*(*s).c1.c1.re+ +- u[16]*(*s).c1.c2.im-u[17]*(*s).c1.c2.re+ +- u[ 2]*(*s).c1.c3.im+ mu*(*s).c1.c3.re+ +- u[24]*(*s).c2.c1.im+u[25]*(*s).c2.c1.re+ +- u[26]*(*s).c2.c2.im+u[27]*(*s).c2.c2.re+ +- u[28]*(*s).c2.c3.im+u[29]*(*s).c2.c3.re; +- +- rs.c2.c1.re= +- u[10]*(*s).c1.c1.re+u[11]*(*s).c1.c1.im+ +- u[18]*(*s).c1.c2.re+u[19]*(*s).c1.c2.im+ +- u[24]*(*s).c1.c3.re+u[25]*(*s).c1.c3.im+ +- u[ 3]*(*s).c2.c1.re- mu*(*s).c2.c1.im+ +- u[30]*(*s).c2.c2.re-u[31]*(*s).c2.c2.im+ +- u[32]*(*s).c2.c3.re-u[33]*(*s).c2.c3.im; +- +- rs.c2.c1.im= +- u[10]*(*s).c1.c1.im-u[11]*(*s).c1.c1.re+ +- u[18]*(*s).c1.c2.im-u[19]*(*s).c1.c2.re+ +- u[24]*(*s).c1.c3.im-u[25]*(*s).c1.c3.re+ +- u[ 3]*(*s).c2.c1.im+ mu*(*s).c2.c1.re+ +- u[30]*(*s).c2.c2.im+u[31]*(*s).c2.c2.re+ +- u[32]*(*s).c2.c3.im+u[33]*(*s).c2.c3.re; +- +- rs.c2.c2.re= +- u[12]*(*s).c1.c1.re+u[13]*(*s).c1.c1.im+ +- u[20]*(*s).c1.c2.re+u[21]*(*s).c1.c2.im+ +- u[26]*(*s).c1.c3.re+u[27]*(*s).c1.c3.im+ +- u[30]*(*s).c2.c1.re+u[31]*(*s).c2.c1.im+ +- u[ 4]*(*s).c2.c2.re- mu*(*s).c2.c2.im+ +- u[34]*(*s).c2.c3.re-u[35]*(*s).c2.c3.im; +- +- rs.c2.c2.im= +- u[12]*(*s).c1.c1.im-u[13]*(*s).c1.c1.re+ +- u[20]*(*s).c1.c2.im-u[21]*(*s).c1.c2.re+ +- u[26]*(*s).c1.c3.im-u[27]*(*s).c1.c3.re+ +- u[30]*(*s).c2.c1.im-u[31]*(*s).c2.c1.re+ +- u[ 4]*(*s).c2.c2.im+ mu*(*s).c2.c2.re+ +- u[34]*(*s).c2.c3.im+u[35]*(*s).c2.c3.re; +- +- rs.c2.c3.re= +- u[14]*(*s).c1.c1.re+u[15]*(*s).c1.c1.im+ +- u[22]*(*s).c1.c2.re+u[23]*(*s).c1.c2.im+ +- u[28]*(*s).c1.c3.re+u[29]*(*s).c1.c3.im+ +- u[32]*(*s).c2.c1.re+u[33]*(*s).c2.c1.im+ +- u[34]*(*s).c2.c2.re+u[35]*(*s).c2.c2.im+ +- u[ 5]*(*s).c2.c3.re- mu*(*s).c2.c3.im; +- +- rs.c2.c3.im= +- u[14]*(*s).c1.c1.im-u[15]*(*s).c1.c1.re+ +- u[22]*(*s).c1.c2.im-u[23]*(*s).c1.c2.re+ +- u[28]*(*s).c1.c3.im-u[29]*(*s).c1.c3.re+ +- u[32]*(*s).c2.c1.im-u[33]*(*s).c2.c1.re+ +- u[34]*(*s).c2.c2.im-u[35]*(*s).c2.c2.re+ +- u[ 5]*(*s).c2.c3.im+ mu*(*s).c2.c3.re; +- +- (*r)=rs; +-} +- +- +-void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) +-{ +- spin_t *ps,*pr; +- +- ps=(spin_t*)(s); +- pr=(spin_t*)(r); +- +- mul_pauli(mu,m,(*ps).w,(*pr).w); +- mul_pauli(-mu,m+1,(*ps).w+1,(*pr).w+1); +-} +- + #endif + + void assign_pauli(int vol,pauli_dble *md,pauli *m) +diff --git a/modules/sw_term/pauli_dble.c b/modules/sw_term/pauli_dble.c +index 39be1d5..65f7376 100644 +--- a/modules/sw_term/pauli_dble.c ++++ b/modules/sw_term/pauli_dble.c +@@ -87,6 +87,39 @@ static complex_dble aa[36] ALIGNED16; + static complex_dble cc[6] ALIGNED16; + static complex_dble dd[6] ALIGNED16; + ++ ++ ++#if (defined AVX512) ++ ++int fwd_house_avx512(double eps, complex_dble *aa, complex_dble *dd, double * rr ); ++static int fwd_house(double eps ){ ++ return fwd_house_avx512( eps, aa, dd, rr ); ++} ++ ++void solv_sys_avx512( complex_dble *aa, complex_dble *dd ); ++static void solv_sys(void){ ++ solv_sys_avx512( aa, dd ); ++} ++ ++void bck_house_avx512( complex_dble *aa, complex_dble *dd, double * rr ); ++static void bck_house(void){ ++ bck_house_avx512( aa, dd, rr ); ++} ++ ++#else ++ ++void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r) ++{ ++ mul_pauli_dble( mu, m, s, r ); ++ mul_pauli_dble( -mu, m+1, s+1, r+1 ); ++} ++#endif ++ ++ ++ ++ ++ ++ + #if (defined x64) + #include "sse2.h" + +@@ -997,6 +1030,7 @@ void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) + + #endif + ++#ifndef AVX512 + static int fwd_house(double eps) + 
{ + int i,j,k,ifail; +@@ -1313,10 +1347,12 @@ static void bck_house(void) + } + } + ++#endif ++ + #else + +-static weyl_dble rs; + ++static weyl_dble rs; + + void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) + { +@@ -1423,7 +1459,7 @@ void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) + (*r)=rs; + } + +- ++#ifndef AVX512 + static int fwd_house(double eps) + { + int i,j,k,ifail; +@@ -1582,6 +1618,8 @@ static void bck_house(void) + + #endif + ++#endif ++ + static double set_aa(double mu,pauli_dble *m) + { + int i,j; +@@ -1780,10 +1818,8 @@ void apply_sw_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, + + for (;ps