From 8770e571b01199bca333313fa4f301f0728f6406 Mon Sep 17 00:00:00 2001
From: "j.m.o.rantaharju"
Date: Mon, 8 Oct 2018 16:18:56 +0100
Subject: [PATCH] Patch to reproduce changes in openQCD 1.6

---
 AVX512_patch | 4959 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 4959 insertions(+)
 create mode 100644 AVX512_patch

diff --git a/AVX512_patch b/AVX512_patch
new file mode 100644
index 0000000..1dde314
--- /dev/null
+++ b/AVX512_patch
@@ -0,0 +1,4959 @@
+From 7b114a257f936ac26be81aab8af96d586c38c4e0 Mon Sep 17 00:00:00 2001
+From: "j.m.o.rantaharju"
+Date: Thu, 19 Apr 2018 14:39:10 +0100
+Subject: [PATCH] This patch extends openQCD-1.6 with an implementation of the
+ Dirac operator with Intel intrinsic operations in order to use the full
+ vector width on Intel Skylake, Knights Landing and other processors.
+
+Add patch file to reproduce changes
+
+Removing assembly files
+
+Compilation of AVX code with ICC is now possible
+---
+ CITATION.cff | 39 +
+ README-AVX512 | 24 +
+ devel/archive/Makefile | 37 +-
+ devel/block/Makefile | 36 +-
+ devel/dfl/Makefile | 36 +-
+ devel/dirac/Makefile | 39 +-
+ devel/forces/Makefile | 36 +-
+ devel/linalg/Makefile | 17 +-
+ devel/little/Makefile | 36 +-
+ devel/sap/Makefile | 36 +-
+ devel/sflds/Makefile | 34 +-
+ devel/sw_term/Makefile | 21 +-
+ devel/update/Makefile | 37 +-
+ devel/vflds/Makefile | 33 +-
+ include/avx512.h | 978 +++++++++++++++
+ include/sw_term.h | 1 +
+ main/Makefile | 36 +-
+ modules/dirac/Dw.c | 22 +-
+ modules/dirac/Dw_dble.c | 74 +-
+ modules/dirac/avx512/Dw_avx512.c | 221 +++++
+ modules/dirac/avx512/Dw_dble_avx512.c | 261 ++++++
+ modules/linalg/avx512/salg_avx512.c | 154 ++++
+ modules/linalg/avx512/salg_dble_avx512.c | 402 +++++++++
+ modules/linalg/salg.c | 82 +-
+ modules/linalg/salg_dble.c | 165 +++-
+ modules/sw_term/avx512/pauli_avx512.c | 235 +++++
+ modules/sw_term/avx512/pauli_dble_avx512.c | 489 +++++++++
+ modules/sw_term/pauli.c | 213 ++---
+ modules/sw_term/pauli_dble.c | 48 +-
+ 29 files changed, 3512 insertions(+), 330 deletions(-)
+ create mode 100644 CITATION.cff
+ create mode 100644 README-AVX512
+ create mode 100644 include/avx512.h
+ create mode 100644 modules/dirac/avx512/Dw_avx512.c
+ create mode 100644 modules/dirac/avx512/Dw_dble_avx512.c
+ create mode 100644 modules/linalg/avx512/salg_avx512.c
+ create mode 100644 modules/linalg/avx512/salg_dble_avx512.c
+ create mode 100644 modules/sw_term/avx512/pauli_avx512.c
+ create mode 100644 modules/sw_term/avx512/pauli_dble_avx512.c
+
+diff --git a/CITATION.cff b/CITATION.cff
+new file mode 100644
+index 0000000..c2e855d
+--- /dev/null
++++ b/CITATION.cff
+@@ -0,0 +1,39 @@
++
++cff-version: 1.0.3
++message: "Please cite the following works when using this software."
++
++title: OpenQCD AVX512 Extension
++version: 1.0
++date-released: 19 January 2018
++authors:
++  - name: "Swansea Academy of Advanced Computing"
++    website: http://www.swansea.ac.uk/iss/sa2c/
++    email: sa2c-support@swansea.ac.uk
++  - family-names: Rantaharju
++    given-names: Jarno
++    orcid: https://orcid.org/0000-0002-0072-7707
++  - family-names: Messiti
++    given-names: Michele
++
++references:
++  - type: software-code
++    name: OpenQCD
++    version: 1.6
++    website: luscher.web.cern.ch/luscher/openQCD/
++    authors:
++      - family-names: Lüscher
++        given-names: Martin
++      - family-names: Schaefer
++        given-names: Stefan
++      - family-names: Bulava
++        given-names: John
++      - family-names: Del Debbio
++        given-names: Luigi
++      - family-names: Giusti
++        given-names: Leonardo
++      - family-names: Leder
++        given-names: Björn
++      - family-names: Palombi
++        given-names: Filippo
++    date-released: 22 April 2014
++    email: luscher@mail.cern.ch
+diff --git a/README-AVX512 b/README-AVX512
+new file mode 100644
+index 0000000..25a47ff
+--- /dev/null
++++ b/README-AVX512
+@@ -0,0 +1,24 @@
++An optimized version of openQCD-1.6 for Intel processors with 512-bit vector width
++==================================================================================
++
++DESCRIPTION
++
++  This code extends openQCD-1.6 with an implementation of the Dirac operator
++  using Intel intrinsic operations in order to use the full vector width on Intel
++  Skylake, Knights Landing and other processors.
++
++  The extension is enabled with the -DAVX512 flag in CFLAGS in the Makefiles.
++
++
++
++AUTHORS
++
++  This patch extends the openQCD code written by Martin Luscher, Stefan
++  Schaefer and other contributors (see http://luscher.web.cern.ch/luscher/openQCD/).
++
++  The patch was written and tested by Jarno Rantaharju and Michele Messiti.
++ ++LICENSE ++ ++ The code may be used under the terms of the GNU General Public License (GPL) ++ http://www.fsf.org/licensing/licenses/gpl.html +diff --git a/devel/archive/Makefile b/devel/archive/Makefile +index 8c94a12..c5e949a 100644 +--- a/devel/archive/Makefile ++++ b/devel/archive/Makefile +@@ -39,9 +39,13 @@ UFLDS = plaq_sum uflds udcom + + UTILS = endian error hsum mutils utils wspace + +-MODULES = $(FLAGS) $(LATTICE) $(ARCHIVE) $(LINALG) $(RANDOM) $(UFLDS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(ARCHIVE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SFLDS) $(SU3FCTS) $(UTILS) + ++AVX512_MODULES = salg_dble_avx512 ++ ++AVX512_ASM_MODULES = salg_dble_avx512_asm ++ + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -54,7 +58,7 @@ MDIR = ../../modules + + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/sflds:\ +- $(MDIR)/su3fcts:$(MDIR)/utils ++ $(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/linalg/avx512: + + + # additional include directories +@@ -72,10 +76,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -84,25 +88,40 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + +-# rule to make dependencies + ++ ++# rule to make dependencies + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + +- + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/block/Makefile b/devel/block/Makefile +index ad90f62..3856581 100644 +--- a/devel/block/Makefile ++++ b/devel/block/Makefile +@@ -45,9 +45,15 @@ UFLDS = uflds udcom shift + + UTILS = endian error hsum mutils utils wspace + +-MODULES = $(FLAGS) $(RANDOM) $(LATTICE) $(BLOCK) $(UFLDS) $(SFLDS) \ ++STD_MODULES = $(FLAGS) $(RANDOM) $(LATTICE) $(BLOCK) $(UFLDS) $(SFLDS) \ + $(LINALG) $(SU3FCTS) $(UTILS) $(TCHARGE) $(SW_TERM) $(SAP) + ++AVX512_MODULES = salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = salg_avx512_asm salg_dble_avx512_asm \ ++ pauli_avx512_asm pauli_dble_avx512_asm ++ + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -60,7 +66,7 @@ MDIR = ../../modules + + VPATH = 
.:$(MDIR)/flags:$(MDIR)/random:$(MDIR)/lattice:$(MDIR)/block:\ + $(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ +- $(MDIR)/linalg:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/sap ++ $(MDIR)/linalg:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/sap:$(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -78,10 +84,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -90,25 +96,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/dfl/Makefile b/devel/dfl/Makefile +index 9def70f..0a6ce2e 100644 +--- a/devel/dfl/Makefile ++++ b/devel/dfl/Makefile +@@ -57,10 +57,15 @@ UTILS = endian error hsum mutils utils wspace + + VFLDS = vflds vinit vcom vdcom + +-MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) $(LITTLE) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -75,7 +80,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ +- $(MDIR)/vflds:$(MDIR)/little ++ $(MDIR)/vflds:$(MDIR)/little\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + # additional include directories + +@@ -92,10 +98,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall 
-Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + # -DFGCR_DBG -DFGCR4VD_DBG -DDFL_MODES_DBG + +@@ -106,25 +112,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/dirac/Makefile b/devel/dirac/Makefile +index 8fac63d..f467889 100644 +--- a/devel/dirac/Makefile ++++ b/devel/dirac/Makefile +@@ -12,6 +12,7 @@ + # Dependencies on included files are automatically taken care of + # + ################################################################################ ++################################################################################ + + all: rmxeq mkdep mkxeq + .PHONY: all +@@ -48,10 +49,15 @@ UFLDS = plaq_sum shift uflds udcom + + UTILS = endian error hsum mutils utils wspace + +-MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) \ + $(DIRAC) $(BLOCK) $(SAP) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -65,7 +71,7 @@ MDIR = ../../modules + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/random:\ + $(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/sflds:\ + $(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:$(MDIR)/block:\ +- $(MDIR)/sap ++ $(MDIR)/sap:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512:$(MDIR)/linalg/avx512 + + # additional include directories + +@@ -82,11 +88,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM +- +-LFLAGS = ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 -DAVX -DPM + ++LFLAGS = $(CFLAGS) + + ############################## do not change ################################### + +@@ -94,30 +99,46 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++ ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring 
AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + ++ + # produce executables + + mkxeq: $(MAIN) +diff --git a/devel/forces/Makefile b/devel/forces/Makefile +index e6955fb..7bf668c 100644 +--- a/devel/forces/Makefile ++++ b/devel/forces/Makefile +@@ -68,11 +68,16 @@ UTILS = endian error hsum mutils utils wspace + + VFLDS = vflds vinit vcom vdcom + +-MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ ++STD_MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -88,7 +93,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ +- $(MDIR)/update:$(MDIR)/ratfcts ++ $(MDIR)/update:$(MDIR)/ratfcts\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -106,10 +112,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + # -DCGNE_DBG -DFGCR_DBG -DMSCG_DBG + # -DDFL_MODES_DBG +@@ -121,25 +127,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix 
.o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/linalg/Makefile b/devel/linalg/Makefile +index 76ff701..e13bd0b 100644 +--- a/devel/linalg/Makefile ++++ b/devel/linalg/Makefile +@@ -25,7 +25,7 @@ FLAGS = flags lat_parms dfl_parms + + LATTICE = bcnds uidx geometry + +-LINALG = cmatrix_dble liealg salg salg_dble valg valg_dble ++LINALG = cmatrix_dble liealg salg salg_dble valg valg_dble salg_avx512 salg_dble_avx512 + + RANDOM = ranlux ranlxs ranlxd gauss + +@@ -54,7 +54,7 @@ MDIR = ../../modules + + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/linalg:\ + $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts:\ +- $(MDIR)/vflds ++ $(MDIR)/vflds:$(MDIR)/linalg/avx512 + + + # additional include directories +@@ -72,10 +72,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 -DAVX -DPM + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -90,24 +90,19 @@ PGMS= $(MAIN) $(MODULES) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs +- +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++$(addsuffix .o,$(MAIN) $(MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +- + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + +- + # produce executables + + mkxeq: $(MAIN) +diff --git a/devel/little/Makefile b/devel/little/Makefile +index 3ab8aee..295f92c 100644 +--- a/devel/little/Makefile ++++ b/devel/little/Makefile +@@ -57,10 +57,15 @@ UTILS = endian error hsum mutils utils wspace + + VFLDS = vflds vinit vcom vdcom + +-MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) $(LITTLE) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -75,7 +80,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ +- $(MDIR)/vflds:$(MDIR)/little ++ $(MDIR)/vflds:$(MDIR)/little\ ++ 
$(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -93,10 +99,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -105,25 +111,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/sap/Makefile b/devel/sap/Makefile +index f56144d..b39f521 100644 +--- a/devel/sap/Makefile ++++ b/devel/sap/Makefile +@@ -51,10 +51,15 @@ UFLDS = plaq_sum shift uflds udcom + + UTILS = endian error hsum mutils utils wspace + +-MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -68,7 +73,8 @@ MDIR = ../../modules + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ +- $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive: ++ $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -86,10 +92,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + # -DFGCR_DBG + +@@ -100,25 +106,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= 
$(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/sflds/Makefile b/devel/sflds/Makefile +index 111e826..b424540 100644 +--- a/devel/sflds/Makefile ++++ b/devel/sflds/Makefile +@@ -37,9 +37,12 @@ UFLDS = uflds + + UTILS = endian error hsum mutils utils wspace + +-MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(LINALG) $(UTILS) \ ++STD_MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(LINALG) $(UTILS) \ + $(UFLDS) $(SFLDS) $(SU3FCTS) + ++AVX512_MODULES = salg_avx512 salg_dble_avx512 ++ ++AVX512_ASM_MODULES = salg_avx512_asm salg_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -51,7 +54,8 @@ LOGOPTION = + MDIR = ../../modules + + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/linalg:\ +- $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts ++ $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -69,10 +73,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -81,25 +85,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + 
$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/sw_term/Makefile b/devel/sw_term/Makefile +index b9c80c4..c7f455f 100644 +--- a/devel/sw_term/Makefile ++++ b/devel/sw_term/Makefile +@@ -25,7 +25,7 @@ FLAGS = flags lat_parms dfl_parms + + LATTICE = bcnds uidx ftidx geometry + +-LINALG = salg_dble liealg cmatrix_dble ++LINALG = salg_dble liealg cmatrix_dble salg_dble_avx512 + + RANDOM = ranlux ranlxs ranlxd gauss + +@@ -33,7 +33,7 @@ SFLDS = sflds + + SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +-SW_TERM = pauli pauli_dble swflds sw_term ++SW_TERM = pauli pauli_dble swflds sw_term pauli_avx512 pauli_dble_avx512 + + TCHARGE = ftcom ftensor + +@@ -44,7 +44,6 @@ UTILS = endian error hsum mutils utils wspace + MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) + +- + # Logging option (-mpilog or -mpitrace or -mpianim) + + LOGOPTION = +@@ -56,7 +55,8 @@ MDIR = ../../modules + + VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/random:\ + $(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/sflds:\ +- $(MDIR)/tcharge:$(MDIR)/sw_term ++ $(MDIR)/tcharge:$(MDIR)/sw_term:\ ++ $(MDIR)/linalg/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -74,10 +74,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 -DAVX -DPM + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + ############################## do not change ################################### + +@@ -91,24 +91,19 @@ PGMS= $(MAIN) $(MODULES) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs +- +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++$(addsuffix .o,$(MAIN) $(MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +- + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + +- + # produce executables + + mkxeq: $(MAIN) +diff --git a/devel/update/Makefile b/devel/update/Makefile +index d3e8910..749ade4 100644 +--- a/devel/update/Makefile ++++ b/devel/update/Makefile +@@ -67,11 +67,16 @@ UTILS = endian error hsum mutils utils wspace + + VFLDS = vflds vinit vcom vdcom + +-MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ ++STD_MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 ++ ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + + # Logging option (-mpilog or -mpitrace or -mpianim) + +@@ -87,7 +92,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + 
$(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ +- $(MDIR)/update:$(MDIR)/ratfcts ++ $(MDIR)/update:$(MDIR)/ratfcts\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + + # additional include directories +@@ -105,10 +111,11 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 ++ ++LFLAGS = $(CFLAGS) + +-LFLAGS = + + # -DMDINT_DBG -DRWRAT_DBG + +@@ -119,25 +126,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active ++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/devel/vflds/Makefile b/devel/vflds/Makefile +index 11778a2..a29b334 100644 +--- a/devel/vflds/Makefile ++++ b/devel/vflds/Makefile +@@ -59,7 +59,11 @@ MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) + ++AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ ++ pauli_avx512 pauli_dble_avx512 + ++AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ ++ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm + # Logging option (-mpilog or -mpitrace or -mpianim) + + LOGOPTION = +@@ -73,7 +77,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ +- $(MDIR)/vflds ++ $(MDIR)/vflds\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 + + # additional include directories + +@@ -90,10 +95,10 @@ LIBPATH = $(MPI_HOME)/lib + # scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 + +-LFLAGS = ++LFLAGS = $(CFLAGS) + + + ############################## do not change ################################### +@@ -102,25 +107,39 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++#Check CFLAGS to find which AVX512 flags are active 
++ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) ++else ++ifneq (,$(findstring AVX512,$(CFLAGS))) ++MODULES= $(STD_MODULES) $(AVX512_MODULES) ++else ++MODULES= $(STD_MODULES) ++endif ++endif ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + + # rule to compile source programs ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++# pattern to compile files in the avx512 directiories ++$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + ++$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile ++ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/include/avx512.h b/include/avx512.h +new file mode 100644 +index 0000000..ac73c60 +--- /dev/null ++++ b/include/avx512.h +@@ -0,0 +1,978 @@ ++ ++/******************************************************************************* ++* ++* File avx512.h ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* Macros for operating on SU(3) vectors and matrices using Intel intrinsic ++* operations for AVX512 compatible processors ++* ++*******************************************************************************/ ++ ++#ifndef AVX512_H ++#define AVX512_H ++ ++#ifndef SSE2_H ++#include "sse2.h" ++#endif ++ ++#include "immintrin.h" ++ ++ ++ ++/* Macros for single precision floating point numbers */ ++ ++/* Write 6 color weyl vectors as a spinor */ ++#define _avx512_write_6_hwv_f( c1,c2,c3, a ){ \ ++ __m256 t256; __m128 t128; \ ++ t256 = _mm256_shuffle_ps( c1, c2, 0b01000100 ); \ ++ t128 = _mm256_castps256_ps128( t256 ); \ ++ _mm_storeu_ps( a, t128 ); \ ++ t128 = _mm256_extractf128_ps( t256, 1 ); \ ++ _mm_storeu_ps( a+12, t128 ); \ ++ \ ++ t256 = _mm256_shuffle_ps( c3, c1, 0b11100100 ); \ ++ t128 = _mm256_castps256_ps128( t256 ); \ ++ _mm_storeu_ps( a+4, t128 ); \ ++ t128 = _mm256_extractf128_ps( t256, 1 ); \ ++ _mm_storeu_ps( a+16, t128 ); \ ++ \ ++ t256 = _mm256_shuffle_ps( c2, c3, 0b11101110 ); \ ++ t128 = _mm256_castps256_ps128( t256 ); \ ++ _mm_storeu_ps( a+8, t128 ); \ ++ t128 = _mm256_extractf128_ps( t256, 1 ); \ ++ _mm_storeu_ps( a+20, t128 ); \ ++} ++ ++ ++/* Load 6 color weyl vectors from a spinor */ ++#define _avx512_load_6_hwv_f( c1,c2,c3, a ){ \ ++ __m256 t1,t2,t3; __m128 t11, t14; \ ++ t11 = _mm_loadu_ps( a ); \ ++ t14 = _mm_loadu_ps( a+12 ); \ ++ t1 = _mm256_castps128_ps256( t11 ); \ ++ t1 = _mm256_insertf128_ps( t1, t14, 1 ); \ ++ \ ++ t11 = _mm_loadu_ps( a+4 ); \ ++ t14 = _mm_loadu_ps( a+16 ); \ ++ t2 = _mm256_castps128_ps256( t11 ); \ ++ t2 = _mm256_insertf128_ps( t2, t14, 1 ); \ ++ \ ++ t11 = _mm_loadu_ps( a+8 ); \ ++ t14 = _mm_loadu_ps( a+20 ); \ ++ t3 = _mm256_castps128_ps256( t11 ); \ ++ t3 = _mm256_insertf128_ps( t3, t14, 1 ); \ ++ \ ++ c1 = _mm256_shuffle_ps( t1, t2, 0b11100100 ); \ ++ c2 = _mm256_shuffle_ps( t1, t3, 0b01001110 ); \ ++ c3 = _mm256_shuffle_ps( t2, t3, 0b11100100 ); \ ++} ++ ++ ++ ++/* Load 4x2 complex numbers into an aray */ ++#define 
_avx512_load_4_cf( r, c1,c2,c3,c4 ){ \ ++ __m128 t128; \ ++ t128 = _mm_loadu_ps( c1 ); \ ++ r = _mm512_castps128_ps512( t128 ); \ ++ t128 = _mm_loadu_ps( c2 ); \ ++ r = _mm512_insertf32x4 ( r, t128, 1); \ ++ t128 = _mm_loadu_ps( c3 ); \ ++ r = _mm512_insertf32x4 ( r, t128, 2); \ ++ t128 = _mm_loadu_ps( c4 ); \ ++ r = _mm512_insertf32x4 ( r, t128, 3); \ ++} ++ ++/* Load 4 half-spinors and organize colorwise into vectors r1, r2 and r3 */ ++#define _avx512_load_4_halfspinor_f( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ _avx512_load_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_load_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_load_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++ \ ++ r1 = _mm512_shuffle_ps(t512a,t512b, 0b11100100); \ ++ r2 = _mm512_shuffle_ps(t512a,t512c, 0b01001110); \ ++ r3 = _mm512_shuffle_ps(t512b,t512c, 0b11100100); \ ++} ++ ++/* Load 4 half-spinors reversing the second two spinor indeces and ++ * organize colorwise into vectors r1, r2 and r3 */ ++#define _avx512_load_4_halfspinor_f_reverse_up( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ __m512i idx; \ ++ _avx512_load_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_load_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_load_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++ \ ++ idx = _mm512_setr_epi32( 0,1,16+2,16+3, 4,5,16+6,16+7, \ ++ 16+10,16+11,8,9, 16+14,16+15,12,13 ); \ ++ r1 = _mm512_permutex2var_ps( t512a, idx, t512b ); \ ++ idx = _mm512_setr_epi32( 2,3,16+0,16+1, 6,7,16+4,16+5, \ ++ 16+8,16+9,10,11, 16+12,16+13,14,15 ); \ ++ r2 = _mm512_permutex2var_ps( t512a, idx, t512c ); \ ++ idx = _mm512_setr_epi32( 0,1,16+2,16+3, 4,5,16+6,16+7, \ ++ 16+10,16+11,8,9, 16+14,16+15,12,13 ); \ ++ r3 = _mm512_permutex2var_ps( t512b, idx, t512c ); \ ++} ++ ++/* Load 4 half-spinors reversing first two the spinor indeces and ++ * organize colorwise into vectors r1, r2 and r3 */ ++#define _avx512_load_4_halfspinor_f_reverse_dn( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ __m512i idx; \ ++ _avx512_load_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_load_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_load_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++ \ ++ idx = _mm512_setr_epi32( 16+2,16+3,0,1, 16+6,16+7,4,5, \ ++ 8,9,16+10,16+11, 12,13,16+14,16+15 ); \ ++ r1 = _mm512_permutex2var_ps( t512a, idx, t512b ); \ ++ idx = _mm512_setr_epi32( 16+0,16+1,2,3, 16+4,16+5,6,7, \ ++ 10,11,16+8,16+9, 14,15,16+12,16+13 ); \ ++ r2 = _mm512_permutex2var_ps( t512a, idx, t512c ); \ ++ idx = _mm512_setr_epi32( 16+2,16+3,0,1, 16+6,16+7,4,5, \ ++ 8,9,16+10,16+11, 12,13,16+14,16+15 ); \ ++ r3 = _mm512_permutex2var_ps( t512b, idx, t512c ); \ ++} ++ ++/* Load 4x2 complex numbers into an aray */ ++#define _avx512_write_4_cf( r, c1,c2,c3,c4 ){ \ ++ __m512 t512; \ ++ __m128 t128 = _mm512_castps512_ps128( r ); \ ++ _mm_storeu_ps( c1, t128 ); \ ++ t128 = _mm512_extractf32x4_ps( r, 1 ); \ ++ _mm_storeu_ps( c2, t128 ); \ ++ t128 = _mm512_extractf32x4_ps( r, 2 ); \ ++ _mm_storeu_ps( c3, t128 ); \ ++ t128 = _mm512_extractf32x4_ps( r, 3 ); \ ++ _mm_storeu_ps( c4, t128 ); \ ++} ++ ++/* Store 4 half-spinors from color vectors */ ++#define _avx512_write_4_halfspinor_f( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ \ ++ t512a = _mm512_shuffle_ps(r1,r2, 0b01000100); \ ++ t512b = _mm512_shuffle_ps(r3,r1, 0b11100100); \ ++ t512c = _mm512_shuffle_ps(r2,r3, 0b11101110); \ ++ \ ++ _avx512_write_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_write_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_write_4_cf( 
t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++} ++ ++/* Store 4 half-spinors from color vectors reversing the first two Dirac indeces */ ++#define _avx512_write_4_halfspinor_f_reverse_up( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ __m512i idx; \ ++ idx = _mm512_setr_epi32( 0,1,16+0,16+1, 4,5,16+4,16+5, \ ++ 10,11,16+10,16+11, 14,15,16+14,16+15 ); \ ++ t512a = _mm512_permutex2var_ps( r1, idx, r2 ); \ ++ idx = _mm512_setr_epi32( 0,1,16+2,16+3, 4,5,16+6,16+7, \ ++ 10,11,16+8,16+9, 14,15,16+12,16+13 ); \ ++ t512b = _mm512_permutex2var_ps( r3, idx, r1 ); \ ++ idx = _mm512_setr_epi32( 2,3,16+2,16+3, 6,7,16+6,16+7, \ ++ 8,9,16+8,16+9, 12,13,16+12,16+13 ); \ ++ t512c = _mm512_permutex2var_ps( r2, idx, r3 ); \ ++ \ ++ _avx512_write_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_write_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_write_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++} ++ ++ ++/* Store 4 half-spinors from color vectors reversing the second two Dirac indeces */ ++#define _avx512_write_4_halfspinor_f_reverse_dn( r1, r2, r3, s1, s2, s3, s4 ) \ ++{ \ ++ __m512 t512a, t512b, t512c; \ ++ __m512i idx; \ ++ idx = _mm512_setr_epi32( 2,3,16+2,16+3, 6,7,16+6,16+7, \ ++ 8,9,16+8,16+9, 12,13,16+12,16+13 ); \ ++ t512a = _mm512_permutex2var_ps( r1, idx, r2 ); \ ++ idx = _mm512_setr_epi32( 2,3,16+0,16+1, 6,7,16+4,16+5, \ ++ 8,9,16+10,16+11, 12,13,16+14,16+15 ); \ ++ t512b = _mm512_permutex2var_ps( r3, idx, r1 ); \ ++ idx = _mm512_setr_epi32( 0,1,16+0,16+1, 4,5,16+4,16+5, \ ++ 10,11,16+10,16+11, 14,15,16+14,16+15 ); \ ++ t512c = _mm512_permutex2var_ps( r2, idx, r3 ); \ ++ \ ++ _avx512_write_4_cf( t512a, s1,s2,s3,s4 ); \ ++ _avx512_write_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ ++ _avx512_write_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ ++} ++ ++ ++ ++ ++ ++ ++ ++ ++/* Combine Dirac spinors to half-spinors in deo and doe. 
++ */ ++#define _avx512_dirac_combine_f_1( a, b ) \ ++{ \ ++ __m512i indexes; \ ++ __m512 c; \ ++ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ ++ 9, 8, 11, 10, 13, 12, 15, 14 ); \ ++ c = _mm512_permutexvar_ps( indexes, b); \ ++ a = _mm512_mask_add_ps( a, 0b0101101000001111, a, c ); \ ++ a = _mm512_mask_sub_ps( a, 0b1010010111110000, a, c ); \ ++} ++ ++#define _avx512_dirac_combine_f_2( a, b ) \ ++{ \ ++ __m512i indexes; \ ++ __m512 c; \ ++ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ ++ 9, 8, 11, 10, 13, 12, 15, 14 ); \ ++ c = _mm512_permutexvar_ps( indexes, b); \ ++ a = _mm512_mask_add_ps( a, 0b1001011011000011, a, c ); \ ++ a = _mm512_mask_sub_ps( a, 0b0110100100111100, a, c ); \ ++} ++ ++ ++#define _avx512_dirac_combine_f_3( a, b ) \ ++{ \ ++ __m512i indexes; \ ++ __m512 c; \ ++ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ ++ 9, 8, 11, 10, 13, 12, 15, 14 ); \ ++ c = _mm512_permutexvar_ps( indexes, b); \ ++ a = _mm512_mask_add_ps( a, 0b1010010100001111, a, c ); \ ++ a = _mm512_mask_sub_ps( a, 0b0101101011110000, a, c ); \ ++} ++ ++ ++#define _avx512_dirac_combine_f_4( a, b ) \ ++{ \ ++ __m512i indexes; \ ++ __m512 c; \ ++ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ ++ 9, 8, 11, 10, 13, 12, 15, 14 ); \ ++ c = _mm512_permutexvar_ps( indexes, b); \ ++ a = _mm512_mask_add_ps( a, 0b0110100111000011, a, c ); \ ++ a = _mm512_mask_sub_ps( a, 0b1001011000111100, a, c ); \ ++} ++ ++ ++ ++ ++/* Multiply 4 vectors with a su(3) matrices, taking the inverse of every ++ * second matrix ++ */ ++#define avx512_su3_mixed_multiply_8( u1, um1, u2, um2, b1,b2,b3, a1,a2,a3 ) \ ++ { \ ++ __m512 ut11, ut21, ut31, ut41, ut12, ut22, ut32, ut42; \ ++ __m512 ut1, ut2, sign, c; \ ++ __m512 t1,t2,t3; \ ++ __m512i indexes; \ ++ ut11 = _mm512_loadu_ps( &(u1).c11.re ); ut12 = _mm512_loadu_ps( &(u1).c33.re ); \ ++ ut21 = _mm512_loadu_ps( &(um1).c11.re ); ut22 = _mm512_loadu_ps( &(um1).c33.re ); \ ++ ut31 = _mm512_loadu_ps( &(u2).c11.re ); ut32 = _mm512_loadu_ps( &(u2).c33.re ); \ ++ ut41 = _mm512_loadu_ps( &(um2).c11.re ); ut42 = _mm512_loadu_ps( &(um2).c33.re ); \ ++ \ ++ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 8, 9, 6, 7, \ ++ 16, 17, 18, 19, 24, 25, 22, 23 ); \ ++ ut1 = _mm512_permutex2var_ps( ut11, indexes, ut21 ); \ ++ ut2 = _mm512_permutex2var_ps( ut31, indexes, ut41 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 0,0,0,0, 8,8,8,8, 16,16,16,16, 24,24,24,24 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_mul_ps ( a1, c ); \ ++ \ ++ indexes = _mm512_setr_epi32( 4,4,4,4, 12,12,12,12, 20,20,20,20, 28,28,28,28 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_mul_ps ( a2, c ); \ ++ \ ++ indexes = _mm512_setr_epi32( 2,2,2,2, 14,14,14,14, 18,18,18,18, 30,30,30,30 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_fmadd_ps ( a2, c, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 6,6,6,6, 10,10,10,10, 22,22,22,22, 26,26,26,26 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_fmadd_ps ( a1, c, b2 ); \ ++ \ ++ \ ++ sign = _mm512_set_ps( -1,1,-1,1, 1,-1,1,-1, -1,1,-1,1, 1,-1,1,-1 ); \ ++ t1 = _mm512_permute_ps( a1, 0b10110001 ); \ ++ t1 = _mm512_mul_ps( t1, sign ); \ ++ t2 = _mm512_permute_ps( a2, 0b10110001 ); \ ++ t2 = _mm512_mul_ps( t2, sign ); \ ++ \ ++ indexes = _mm512_setr_epi32( 1,1,1,1, 9,9,9,9, 17,17,17,17, 25,25,25,25 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_fmadd_ps ( t1, c, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 5,5,5,5, 13,13,13,13, 21,21,21,21, 
29,29,29,29 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_fmadd_ps ( t2, c, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 3,3,3,3, 15,15,15,15, 19,19,19,19, 31,31,31,31 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_fmadd_ps ( t2, c, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 7,7,7,7, 11,11,11,11, 23,23,23,23, 27,27,27,27 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_fmadd_ps ( t1, c, b2 ); \ ++ \ ++ \ ++ indexes = _mm512_setr_epi32( 4, 5, 12, 13, 10, 11, 14, 15, \ ++ 20, 21, 28, 29, 26, 27, 30, 31 ); \ ++ ut1 = _mm512_permutex2var_ps( ut11, indexes, ut21 ); \ ++ ut2 = _mm512_permutex2var_ps( ut31, indexes, ut41 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 0,0,0,0, 10,10,10,10, 16,16,16,16, 26,26,26,26 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_fmadd_ps ( a3, c, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 2,2,2,2, 8,8,8,8, 18,18,18,18, 24,24,24,24 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_mul_ps ( a1, c ); \ ++ \ ++ indexes = _mm512_setr_epi32( 4,4,4,4, 14,14,14,14, 20,20,20,20, 30,30,30,30 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_fmadd_ps ( a3, c, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 6,6,6,6, 12,12,12,12, 22,22,22,22, 28,28,28,28 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_fmadd_ps ( a2, c, b3 ); \ ++ \ ++ \ ++ t3 = _mm512_permute_ps( a3, 0b10110001 ); \ ++ t3 = _mm512_mul_ps( t3, sign ); \ ++ \ ++ indexes = _mm512_setr_epi32( 1,1,1,1, 11,11,11,11, 17,17,17,17, 27,27,27,27 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b1 = _mm512_fmadd_ps ( t3, c, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 3,3,3,3, 9,9,9,9, 19,19,19,19, 25,25,25,25 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_fmadd_ps ( t1, c, b3 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 5,5,5,5, 15,15,15,15, 21,21,21,21, 31,31,31,31 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b2 = _mm512_fmadd_ps ( t3, c, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 7,7,7,7, 13,13,13,13, 23,23,23,23, 29,29,29,29 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_fmadd_ps ( t2, c, b3 ); \ ++ \ ++ \ ++ indexes = _mm512_setr_epi32( 0, 1, 16, 17, 0,0,0,0, 0,0,0,0, 0,0,0,0 ); \ ++ ut1 = _mm512_permutex2var_ps( ut12, indexes, ut22 ); \ ++ ut2 = _mm512_permutex2var_ps( ut32, indexes, ut42 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 0,0,0,0, 2,2,2,2, 16,16,16,16, 18,18,18,18 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_fmadd_ps ( a3, c, b3 ); \ ++ \ ++ indexes = _mm512_setr_epi32( 1,1,1,1, 3,3,3,3, 17,17,17,17, 19,19,19,19 ); \ ++ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ ++ b3 = _mm512_fmadd_ps ( t3, c, b3 ); \ ++} ++ ++ ++ ++/* Insert 256 bits into a 512 bit single precision vector */ ++#define _avx512_insert_256_h_f( a, t ) \ ++{ \ ++ __m512d td512; \ ++ __m256d td256; \ ++ td512 = _mm512_castps_pd( a ); \ ++ td256 = _mm256_castps_pd( t ); \ ++ td512 = _mm512_insertf64x4( td512, td256, 1 ); \ ++ a = _mm512_castpd_ps( td512 ); \ ++} ++ ++/* Extract 256 bits from a 512 bit single precision vector */ ++#define _avx512_extract_256_h_f( t, a ) \ ++{ \ ++ __m512d td512; \ ++ __m256d td256; \ ++ td512 = _mm512_castps_pd( a ); \ ++ td256 = _mm256_castps_pd( t ); \ ++ td256 = _mm512_extractf64x4_pd( td512, 1 ); \ ++ t = _mm256_castpd_ps( td256 ); \ ++} ++ ++ ++ ++/* Accumulate elements of Dirac vectors into a Weyl vector in deo and 
doe */ ++#define _avx512_to_weyl_f_12( c, b ){ \ ++ __m256 w, sign, t5,t6; \ ++ __m256i idx; \ ++ t5 = _mm512_castps512_ps256( b ); \ ++ \ ++ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ ++ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ ++ c = _mm256_fmadd_ps( t5, sign, t6 ); \ ++ \ ++ _avx512_extract_256_h_f( t5, b ); \ ++ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ ++ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ ++ w = _mm256_fmadd_ps( t6, sign, t5 ); \ ++ idx = _mm256_setr_epi32( 0, 1, 2, 3, 3, 2, 1, 0 ); \ ++ w = _mm256_permutevar_ps( w, idx ); \ ++ sign = _mm256_set_ps( 1,-1,1,-1, 1,1,1,1 ); \ ++ c = _mm256_fmadd_ps( w, sign, c ); \ ++} ++ ++#define _avx512_to_weyl_f_34( c, b ){ \ ++ __m256 w, sign, t5,t6; \ ++ __m256i idx; \ ++ t5 = _mm512_castps512_ps256( b ); \ ++ \ ++ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ ++ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ ++ w = _mm256_fmadd_ps( t6, sign, t5 ); \ ++ idx = _mm256_setr_epi32( 0, 1, 2, 3, 2, 3, 0, 1 ); \ ++ w = _mm256_permutevar_ps( w, idx ); \ ++ sign = _mm256_set_ps( -1,-1,1,1, 1,1,1,1 ); \ ++ c = _mm256_fmadd_ps( w, sign, c ); \ ++ \ ++ _avx512_extract_256_h_f( t5, b ); \ ++ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ ++ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ ++ w = _mm256_fmadd_ps( t6, sign, t5 ); \ ++ idx = _mm256_setr_epi32( 0, 1, 2, 3, 1, 0, 3, 2 ); \ ++ w = _mm256_permutevar_ps( w, idx ); \ ++ sign = _mm256_set_ps( -1,1,1,-1, 1,1,1,1 ); \ ++ c = _mm256_fmadd_ps( w, sign, c ); \ ++} ++ ++ ++/* Expand a Weyl vector into a Dirac vector in deo and doe */ ++#define _avx512_to_dirac_f_1( a1, c1 ) \ ++{ \ ++ __m256 t1,t2, sign; \ ++ __m256i idx; \ ++ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ t1 = _mm256_permutevar8x32_ps( c1, idx ); \ ++ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ ++ t2 = _mm256_fmadd_ps( c1, sign, t1 ); \ ++ a1 = _mm512_castps256_ps512( t2 ); \ ++} ++ ++ ++#define _avx512_to_dirac_f_2( a1, c1 ) \ ++{ \ ++ __m256 t1,t2, sign; \ ++ __m256i idx; \ ++ idx = _mm256_setr_epi32( 7, 6, 5, 4, 7, 6, 5, 4 ); \ ++ t1 = _mm256_permutevar8x32_ps( c1, idx ); \ ++ idx = _mm256_setr_epi32( 0, 1, 2, 3, 0, 1, 2, 3 ); \ ++ t2 = _mm256_permutevar8x32_ps( c1, idx ); \ ++ sign = _mm256_set_ps( -1,1,-1,1, 1,-1,1,-1 ); \ ++ t2 = _mm256_fmadd_ps( t1, sign, t2 ); \ ++ _avx512_insert_256_h_f( a1, t2 ); \ ++} ++ ++#define _avx512_to_dirac_f_3( a1, c1 ) \ ++{ \ ++ __m512 t5,t6,t7; \ ++ __m512i idx; \ ++ t5 = _mm512_castps256_ps512( c1 ); \ ++ idx = _mm512_setr_epi32( 6,7,4,5, 6,7,4,5, 5,4,7,6, 5,4,7,6 ); \ ++ t6 = _mm512_permutexvar_ps( idx, t5 ); \ ++ idx = _mm512_setr_epi32( 0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3 ); \ ++ t7 = _mm512_permutexvar_ps( idx, t5 ); \ ++ a1 = _mm512_maskz_add_ps( 0b1001011011000011, t7, t6 ); \ ++ a1 = _mm512_mask_sub_ps( a1, 0b0110100100111100, t7, t6 ); \ ++} ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++/* Macros for double precision numbers */ ++ ++/* Load 2 half-spinors and organize colorwise into vectors s1, s2 and s3 */ ++#define _avx512_load_2_halfspinor_d( s1, s2, s3, sp, sm ) \ ++{ \ ++ __m512d a1,a2,a3,a4,a5,a6; \ ++ __m512i idx; \ ++ a1 = _mm512_loadu_pd( sm ); \ ++ a2 = _mm512_loadu_pd( sm+8 ); \ ++ a3 = _mm512_loadu_pd( sp ); \ ++ a4 = _mm512_loadu_pd( sp+8 ); \ ++ \ ++ idx = _mm512_setr_epi64( 8,9,14,15, 0,1,6,7 ); \ ++ s1 = _mm512_permutex2var_pd( a1, idx, a3 ); \ ++ \ ++ idx = _mm512_setr_epi64( 
4,5,10,11, 2,3,8,9 ); \ ++ a5 = _mm512_permutex2var_pd( a1, idx, a2 ); \ ++ idx = _mm512_setr_epi64( 2,3,8,9, 4,5,10,11 ); \ ++ a6 = _mm512_permutex2var_pd( a3, idx, a4 ); \ ++ idx = _mm512_setr_epi64( 0,1,2,3, 12,13,14,15 ); \ ++ s2 = _mm512_permutex2var_pd( a6, idx, a5 ); \ ++ \ ++ idx = _mm512_setr_epi64( 4,5,6,7, 8,9,10,11 ); \ ++ s3 = _mm512_permutex2var_pd( a6, idx, a5 ); \ ++} ++ ++/* Load 2 half-spinors reversing the spinor indeces and ++ * organize colorwise into vectors s1, s2 and s3 */ ++#define _avx512_load_2_halfspinor_d_reverse( s1, s2, s3, sp, sm ) \ ++{ \ ++ __m512d a1,a2,a3,a4,a5,a6; \ ++ __m512i idx; \ ++ a1 = _mm512_loadu_pd( sm ); \ ++ a2 = _mm512_loadu_pd( sm+8 ); \ ++ a3 = _mm512_loadu_pd( sp ); \ ++ a4 = _mm512_loadu_pd( sp+8 ); \ ++ \ ++ idx = _mm512_setr_epi64( 14,15,8,9, 6,7,0,1 ); \ ++ s1 = _mm512_permutex2var_pd( a1, idx, a3 ); \ ++ \ ++ idx = _mm512_setr_epi64( 10,11,4,5, 8,9,2,3 ); \ ++ a5 = _mm512_permutex2var_pd( a1, idx, a2 ); \ ++ idx = _mm512_setr_epi64( 8,9,2,3, 10,11,4,5 ); \ ++ a6 = _mm512_permutex2var_pd( a3, idx, a4 ); \ ++ idx = _mm512_setr_epi64( 0,1,2,3, 12,13,14,15 ); \ ++ s2 = _mm512_permutex2var_pd( a6, idx, a5 ); \ ++ \ ++ idx = _mm512_setr_epi64( 4,5,6,7, 8,9,10,11 ); \ ++ s3 = _mm512_permutex2var_pd( a6, idx, a5 ); \ ++} ++ ++/* Write 2 half-spinors from three color vectors */ ++#define _avx512_store_2_halfspinor_d( s1, s2, s3, sp, sm ) \ ++{ \ ++ __m512d a1,a2,a3,a4,a5,a6; \ ++ __m256d l; \ ++ __m512i idx; \ ++ idx = _mm512_setr_epi64( 0,1,8,9, 4,5,12,13 ); \ ++ a1 = _mm512_permutex2var_pd( s1, idx, s2 ); \ ++ idx = _mm512_setr_epi64( 0,1,10,11, 4,5,14,15 ); \ ++ a2 = _mm512_permutex2var_pd( s3, idx, s1 ); \ ++ idx = _mm512_setr_epi64( 2,3,10,11, 6,7,14,15 ); \ ++ a3 = _mm512_permutex2var_pd( s2, idx, s3 ); \ ++ \ ++ l = _mm512_castpd512_pd256( a1 ); \ ++ _mm256_storeu_pd( sp, l ); \ ++ l = _mm512_castpd512_pd256( a2 ); \ ++ _mm256_storeu_pd( sp+4, l ); \ ++ l = _mm512_castpd512_pd256( a3 ); \ ++ _mm256_storeu_pd( sp+8, l ); \ ++ \ ++ l = _mm512_extractf64x4_pd( a1, 1 ); \ ++ _mm256_storeu_pd( sm, l ); \ ++ l = _mm512_extractf64x4_pd( a2, 1 ); \ ++ _mm256_storeu_pd( sm+4, l ); \ ++ l = _mm512_extractf64x4_pd( a3, 1 ); \ ++ _mm256_storeu_pd( sm+8, l ); \ ++} ++ ++ ++ ++/* Multiply the lower half of the color vectors distributed in c1, c2 and c3 ++ * by the su3 matrix u and the upper half by the conjugate of um ++ * Store in b1, b2 and b3 ++ */ ++#define avx512_su3_mul_quad_dble( u, um, b1, b2, b3, c1, c2, c3 ) \ ++{ \ ++ __m512d tu1, tu2, tu3, tum1, tum2, tum3; \ ++ __m512d u1; \ ++ __m512d t1, t2, t3, sign; \ ++ __m512i indexes; \ ++ tu1 = _mm512_loadu_pd( &(u).c11.re ); \ ++ tu2 = _mm512_loadu_pd( &(u).c22.re ); \ ++ tu3 = _mm512_loadu_pd( &(u).c33.re ); \ ++ tum1 = _mm512_loadu_pd( &(um).c11.re ); \ ++ tum2 = _mm512_loadu_pd( &(um).c22.re ); \ ++ tum3 = _mm512_loadu_pd( &(um).c33.re ); \ ++ \ ++ sign = _mm512_set_pd( -1,1,-1,1, 1,-1,1,-1 ); \ ++ t1 = _mm512_permute_pd( c1, 0b01010101 ); \ ++ t2 = _mm512_permute_pd( c2, 0b01010101 ); \ ++ t3 = _mm512_permute_pd( c3, 0b01010101 ); \ ++ t1 = _mm512_mul_pd( t1, sign ); \ ++ t2 = _mm512_mul_pd( t2, sign ); \ ++ t3 = _mm512_mul_pd( t3, sign ); \ ++ \ ++ indexes = _mm512_setr_epi64( 0, 0, 0, 0, 8, 8, 8, 8 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b1 = _mm512_mul_pd ( u1, c1 ); \ ++ indexes = _mm512_setr_epi64( 1, 1, 1, 1, 9, 9, 9, 9 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b1 = _mm512_fmadd_pd ( u1, t1, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 2, 
2, 2, 2, 14, 14, 14, 14 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b1 = _mm512_fmadd_pd ( u1, c2, b1 ); \ ++ indexes = _mm512_setr_epi64( 3, 3, 3, 3, 15, 15, 15, 15 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b1 = _mm512_fmadd_pd ( u1, t2, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 4, 4, 4, 4, 12, 12, 12, 12 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum2 ); \ ++ b1 = _mm512_fmadd_pd ( u1, c3, b1 ); \ ++ indexes = _mm512_setr_epi64( 5, 5, 5, 5, 13, 13, 13, 13 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum2 ); \ ++ b1 = _mm512_fmadd_pd ( u1, t3, b1 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 6, 6, 6, 6, 10, 10, 10, 10 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b2 = _mm512_mul_pd ( u1, c1 ); \ ++ indexes = _mm512_setr_epi64( 7, 7, 7, 7, 11, 11, 11, 11 ); \ ++ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ ++ b2 = _mm512_fmadd_pd ( u1, t1, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 0, 0, 0, 0, 8, 8, 8, 8 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b2 = _mm512_fmadd_pd ( u1, c2, b2 ); \ ++ indexes = _mm512_setr_epi64( 1, 1, 1, 1, 9, 9, 9, 9 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b2 = _mm512_fmadd_pd ( u1, t2, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 2, 2, 2, 2, 14, 14, 14, 14 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b2 = _mm512_fmadd_pd ( u1, c3, b2 ); \ ++ indexes = _mm512_setr_epi64( 3, 3, 3, 3, 15, 15, 15, 15 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b2 = _mm512_fmadd_pd ( u1, t3, b2 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 4, 4, 4, 4, 12, 12, 12, 12 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum1 ); \ ++ b3 = _mm512_mul_pd ( u1, c1 ); \ ++ indexes = _mm512_setr_epi64( 5, 5, 5, 5, 13, 13, 13, 13 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum1 ); \ ++ b3 = _mm512_fmadd_pd ( u1, t1, b3 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 6, 6, 6, 6, 10, 10, 10, 10 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b3 = _mm512_fmadd_pd ( u1, c2, b3 ); \ ++ indexes = _mm512_setr_epi64( 7, 7, 7, 7, 11, 11, 11, 11 ); \ ++ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ ++ b3 = _mm512_fmadd_pd ( u1, t2, b3 ); \ ++ \ ++ indexes = _mm512_setr_epi64( 0, 0, 0, 0, 8, 8, 8, 8 ); \ ++ u1 = _mm512_permutex2var_pd( tu3, indexes, tum3 ); \ ++ b3 = _mm512_fmadd_pd ( u1, c3, b3 ); \ ++ indexes = _mm512_setr_epi64( 1, 1, 1, 1, 9, 9, 9, 9 ); \ ++ u1 = _mm512_permutex2var_pd( tu3, indexes, tum3 ); \ ++ b3 = _mm512_fmadd_pd ( u1, t3, b3 ); \ ++} ++ ++ ++ ++ ++ ++ ++/* Combine spinor entries into 2 weyl vectors ++ stored in high and low entries of a spinor ++ */ ++#define _avx512_to_weyl_1( w, b ){ \ ++ __m512i indexes; \ ++ __m512d _t; \ ++ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ _t = _mm512_permutexvar_pd( indexes, (b) ); \ ++ w = _mm512_maskz_add_pd( 0b00001111, _t, (b) ); \ ++ w = _mm512_mask_sub_pd( w, 0b11110000, _t, (b) ); \ ++} ++ ++#define _avx512_to_weyl_2( w, b ){ \ ++ __m512i indexes; \ ++ __m512d _t; \ ++ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ _t = _mm512_permutexvar_pd( indexes, (b) ); \ ++ _t = _mm512_mask_add_pd( _t, 0b00001111, _t, (b) ); \ ++ _t = _mm512_mask_sub_pd( _t, 0b11110000, (b), _t ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 7, 6, 5, 4 ); \ ++ _t = _mm512_permutexvar_pd( indexes, _t ); \ ++ w = _mm512_mask_add_pd( w, 0b10101111, w, _t ); \ ++ w = _mm512_mask_sub_pd( w, 0b01010000, w, _t ); \ ++} ++ ++#define _avx512_to_weyl_3( w, b ){ \ ++ 
__m512i indexes; \ ++ __m512d _t; \ ++ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ _t = _mm512_permutexvar_pd( indexes, (b) ); \ ++ _t = _mm512_mask_add_pd( _t, 0b00001111, _t, (b) ); \ ++ _t = _mm512_mask_sub_pd( _t, 0b11110000, (b), _t ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 6, 7, 4, 5 ); \ ++ _t = _mm512_permutexvar_pd( indexes, _t ); \ ++ w = _mm512_mask_add_pd( w, 0b00111111, w, _t ); \ ++ w = _mm512_mask_sub_pd( w, 0b11000000, w, _t ); \ ++} ++ ++#define _avx512_to_weyl_4( w, b ){ \ ++ __m512i indexes; \ ++ __m512d _t; \ ++ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ _t = _mm512_permutexvar_pd( indexes, (b) ); \ ++ _t = _mm512_mask_add_pd( _t, 0b00001111, _t, (b) ); \ ++ _t = _mm512_mask_sub_pd( _t, 0b11110000, (b), _t ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 5, 4, 7, 6 ); \ ++ _t = _mm512_permutexvar_pd( indexes, _t ); \ ++ w = _mm512_mask_add_pd( w, 0b01101111, w, _t ); \ ++ w = _mm512_mask_sub_pd( w, 0b10010000, w, _t ); \ ++} ++ ++ ++ ++ ++/* Create a full Dirac vector by adding and subtracting the indeces of ++ * a weyl vector */ ++#define _avx512_expand_weyl( a, w ){ \ ++ __m512i indexes; \ ++ __m512d _t; \ ++ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ ++ _t = _mm512_permutexvar_pd( indexes, (w) ); \ ++ a = _mm512_maskz_add_pd( 0b00001111, _t, w ); \ ++ a = _mm512_mask_sub_pd( a, 0b11110000, _t, w ); \ ++} ++ ++#define _avx512_expand_weyl_2( a, w ){ \ ++ __m512i indexes; \ ++ __m512d _t1, _t2; \ ++ indexes = _mm512_setr_epi64( 7, 6, 5, 4, 7, 6, 5, 4 ); \ ++ _t1 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 0, 1, 2, 3 ); \ ++ _t2 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ a = _mm512_maskz_add_pd( 0b01011010, _t2, _t1 ); \ ++ a = _mm512_mask_sub_pd( a, 0b10100101, _t2, _t1 ); \ ++} ++ ++#define _avx512_expand_weyl_3( a, w ){ \ ++ __m512i indexes; \ ++ __m512d _t1, _t2; \ ++ indexes = _mm512_setr_epi64( 6, 7, 4, 5, 6, 7, 4, 5 ); \ ++ _t1 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 0, 1, 2, 3 ); \ ++ _t2 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ a = _mm512_maskz_add_pd( 0b11000011, _t2, _t1 ); \ ++ a = _mm512_mask_sub_pd( a, 0b00111100, _t2, _t1 ); \ ++} ++ ++#define _avx512_expand_weyl_4( a, w ){ \ ++ __m512i indexes; \ ++ __m512d _t1, _t2; \ ++ indexes = _mm512_setr_epi64( 5, 4, 7, 6, 5, 4, 7, 6 ); \ ++ _t1 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 0, 1, 2, 3 ); \ ++ _t2 = _mm512_permutexvar_pd( indexes, (w) ); \ ++ a = _mm512_maskz_add_pd( 0b10010110, _t2, _t1 ); \ ++ a = _mm512_mask_sub_pd( a, 0b01101001, _t2, _t1 ); \ ++} ++ ++ ++ ++ ++ ++/* Load four complex numbers. 
*/ ++#define _avx512_load_4_d( v, c1, c2, c3, c4 ) \ ++{ \ ++ __m128d t128l, t128u; \ ++ __m256d t256l ,t256u; \ ++ t128l = _mm_loadu_pd( &(c1).re ); \ ++ t128u = _mm_loadu_pd( &(c2).re ); \ ++ t256l = _mm256_castpd128_pd256( t128l ); \ ++ t256l = _mm256_insertf128_pd( t256l, t128u, 1 ); \ ++ t128l = _mm_loadu_pd( &(c3).re ); \ ++ t128u = _mm_loadu_pd( &(c4).re ); \ ++ t256u = _mm256_castpd128_pd256( t128l ); \ ++ t256u = _mm256_insertf128_pd( t256u, t128u, 1 ); \ ++ v = _mm512_castpd256_pd512( t256l ); \ ++ v = _mm512_insertf64x4( v, t256u, 1 ); \ ++} ++ ++/* Store four complex numbers */ ++#define _avx512_store_4_d( r, v1, v2, v3, v4 ) \ ++{ \ ++ __m256d t256; \ ++ __m128d t128; \ ++ t256 = _mm512_extractf64x4_pd( r, 1 ); \ ++ t128 = _mm256_extractf128_pd( t256, 1 ); \ ++ _mm_storeu_pd( &(v4).re, t128 ); \ ++ t128 = _mm256_castpd256_pd128( t256 ); \ ++ _mm_storeu_pd( &(v3).re, t128 ); \ ++ t256 = _mm512_castpd512_pd256( r ); \ ++ t128 = _mm256_extractf128_pd( t256, 1 ); \ ++ _mm_storeu_pd( &(v2).re, t128 ); \ ++ t128 = _mm256_castpd256_pd128( t256 ); \ ++ _mm_storeu_pd( &(v1).re, t128 ); \ ++} ++ ++ ++ ++ ++/* Adding a vectors to a spinors */ ++ ++/* Load half spinors and organize for adding */ ++#define _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ) \ ++ s1 = _mm512_loadu_pd( (sm) ); \ ++ s2 = _mm512_loadu_pd( (sm)+8 ); \ ++ s3 = _mm512_loadu_pd( (sp) ); \ ++ s4 = _mm512_loadu_pd( (sp)+8 ); \ ++ \ ++ idx = _mm512_setr_epi64( 0,1,2,3, 8,9,10,11 ); \ ++ t1 = _mm512_permutex2var_pd( s1, idx, s3 ); \ ++ idx = _mm512_setr_epi64( 4,5,6,7, 12,13,14,15 ); \ ++ t2 = _mm512_permutex2var_pd( s1, idx, s3 ); \ ++ idx = _mm512_setr_epi64( 0,1,2,3, 8,9,10,11 ); \ ++ t3 = _mm512_permutex2var_pd( s2, idx, s4 ); \ ++ ++/* reorganize weyl vectors for adding */ ++#define _avx512_reorganize_a_ud_d( b1,b2,b3, a1,a2,a3 ) \ ++ idx = _mm512_setr_epi64( 0,1,8,9, 4,5,12,13 ); \ ++ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ ++ idx = _mm512_setr_epi64( 0,1,10,11, 4,5,14,15 ); \ ++ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ ++ idx = _mm512_setr_epi64( 2,3,10,11, 6,7,14,15 ); \ ++ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ ++ ++/* store after adding */ ++#define _avx512_write_a_ud_d( a1, a2, a3, sp, sm ){ \ ++ __m256d l; \ ++ l = _mm512_castpd512_pd256( a1 ); \ ++ _mm256_storeu_pd( (sm), l ); \ ++ l = _mm512_castpd512_pd256( a2 ); \ ++ _mm256_storeu_pd( (sm)+4, l ); \ ++ l = _mm512_castpd512_pd256( a3 ); \ ++ _mm256_storeu_pd( (sm)+8, l ); \ ++ \ ++ l = _mm512_extractf64x4_pd( a1, 1 ); \ ++ _mm256_storeu_pd( (sp), l ); \ ++ l = _mm512_extractf64x4_pd( a2, 1 ); \ ++ _mm256_storeu_pd( (sp)+4, l ); \ ++ l = _mm512_extractf64x4_pd( a3, 1 ); \ ++ _mm256_storeu_pd( (sp)+8, l ); \ ++} ++ ++ ++#define _avx512_add_to_spinors( b1, b2, b3, sp, sm ) \ ++{ \ ++ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ ++ __m512i idx; \ ++ \ ++ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ ++ _avx512_reorganize_a_ud_d( b1,b2,b3, a1,a2,a3 ); \ ++ \ ++ t1 = _mm512_add_pd( a1, t1 ); \ ++ t2 = _mm512_add_pd( a2, t2 ); \ ++ t3 = _mm512_add_pd( a3, t3 ); \ ++ \ ++ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ ++} ++ ++#define _avx512_add_to_spinors_2( b1, b2, b3, sp, sm ) \ ++{ \ ++ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ ++ __m512i idx; \ ++ \ ++ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ ++ _avx512_reorganize_a_ud_d( b1,b2,b3, a1,a2,a3 ); \ ++ \ ++ t1 = _mm512_mask_add_pd( t1, 0b00001111, t1, a1 ); \ ++ t1 = _mm512_mask_sub_pd( t1, 0b11110000, t1, a1 ); \ ++ t2 = _mm512_mask_add_pd( t2, 0b00001111, t2, a2 ); 
\ ++ t2 = _mm512_mask_sub_pd( t2, 0b11110000, t2, a2 ); \ ++ t3 = _mm512_mask_add_pd( t3, 0b00001111, t3, a3 ); \ ++ t3 = _mm512_mask_sub_pd( t3, 0b11110000, t3, a3 ); \ ++ \ ++ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ ++} ++ ++#define _avx512_add_to_spinors_3( b1, b2, b3, sp, sm ) \ ++{ \ ++ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ ++ __m512i idx; \ ++ \ ++ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ ++ idx = _mm512_setr_epi64( 3,2,11,10, 7,6,15,14 ); \ ++ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ ++ idx = _mm512_setr_epi64( 3,2,9,8, 7,6,13,12 ); \ ++ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ ++ idx = _mm512_setr_epi64( 1,0,9,8, 5,4,13,12 ); \ ++ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ ++ \ ++ t1 = _mm512_mask_add_pd( t1, 0b10100101, t1, a1 ); \ ++ t1 = _mm512_mask_sub_pd( t1, 0b01011010, t1, a1 ); \ ++ t2 = _mm512_mask_add_pd( t2, 0b10100101, t2, a2 ); \ ++ t2 = _mm512_mask_sub_pd( t2, 0b01011010, t2, a2 ); \ ++ t3 = _mm512_mask_add_pd( t3, 0b10100101, t3, a3 ); \ ++ t3 = _mm512_mask_sub_pd( t3, 0b01011010, t3, a3 ); \ ++ \ ++ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ ++} ++ ++#define _avx512_add_to_spinors_4( b1, b2, b3, sp, sm ) \ ++{ \ ++ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ ++ __m512i idx; \ ++ idx = _mm512_setr_epi64( 2,3,0,1, 6,7,4,5 ); \ ++ b1 = _mm512_permutexvar_pd( idx, b1 ); \ ++ b2 = _mm512_permutexvar_pd( idx, b2 ); \ ++ b3 = _mm512_permutexvar_pd( idx, b3 ); \ ++ \ ++ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ ++ idx = _mm512_setr_epi64( 0,1,8,9, 4,5,12,13 ); \ ++ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ ++ idx = _mm512_setr_epi64( 0,1,10,11, 4,5,14,15 ); \ ++ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ ++ idx = _mm512_setr_epi64( 2,3,10,11, 6,7,14,15 ); \ ++ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ ++ \ ++ t1 = _mm512_mask_add_pd( t1, 0b11110000, t1, a1 ); \ ++ t1 = _mm512_mask_sub_pd( t1, 0b00001111, t1, a1 ); \ ++ t2 = _mm512_mask_add_pd( t2, 0b00111100, t2, a2 ); \ ++ t2 = _mm512_mask_sub_pd( t2, 0b11000011, t2, a2 ); \ ++ t3 = _mm512_mask_add_pd( t3, 0b00001111, t3, a3 ); \ ++ t3 = _mm512_mask_sub_pd( t3, 0b11110000, t3, a3 ); \ ++ \ ++ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ ++} ++ ++ ++#define _avx512_add_to_spinors_5( b1, b2, b3, sp, sm ) \ ++{ \ ++ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ ++ __m512i idx; \ ++ \ ++ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ ++ idx = _mm512_setr_epi64( 1,0,9,8, 5,4,13,12 ); \ ++ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ ++ idx = _mm512_setr_epi64( 1,0,11,10, 5,4,15,14 ); \ ++ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ ++ idx = _mm512_setr_epi64( 3,2,11,10, 7,6,15,14 ); \ ++ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ ++ \ ++ t1 = _mm512_mask_add_pd( t1, 0b10100101, t1, a1 ); \ ++ t1 = _mm512_mask_sub_pd( t1, 0b01011010, t1, a1 ); \ ++ t2 = _mm512_mask_add_pd( t2, 0b01101001, t2, a2 ); \ ++ t2 = _mm512_mask_sub_pd( t2, 0b10010110, t2, a2 ); \ ++ t3 = _mm512_mask_add_pd( t3, 0b01011010, t3, a3 ); \ ++ t3 = _mm512_mask_sub_pd( t3, 0b10100101, t3, a3 ); \ ++ \ ++ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ ++} ++ ++ ++ ++ ++ ++#endif +diff --git a/include/sw_term.h b/include/sw_term.h +index f1ef622..b47d0a7 100644 +--- a/include/sw_term.h ++++ b/include/sw_term.h +@@ -29,6 +29,7 @@ extern void apply_sw(int vol,float mu,pauli *m,spinor *s,spinor *r); + + /* PAULI_DBLE_C */ + extern void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r); ++void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r); + extern 
int inv_pauli_dble(double mu,pauli_dble *m,pauli_dble *im); + extern complex_dble det_pauli_dble(double mu,pauli_dble *m); + extern void apply_sw_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, +diff --git a/main/Makefile b/main/Makefile +index 6dc4e1b..8c739d5 100644 +--- a/main/Makefile ++++ b/main/Makefile +@@ -26,7 +26,7 @@ BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + + DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +-DIRAC = Dw_dble Dw Dw_bnd ++DIRAC = Dw_dble Dw Dw_bnd Dw_avx512 Dw_dble_avx512 + + FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + mdint_parms rat_parms rw_parms sap_parms solver_parms +@@ -36,7 +36,8 @@ FORCES = force0 force1 force2 force3 force4 force5 \ + + LATTICE = bcnds uidx ftidx geometry + +-LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix ++LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix \ ++ salg_avx512 salg_dble_avx512 + + LINSOLV = cgne mscg fgcr fgcr4vd + +@@ -54,7 +55,7 @@ SFLDS = sflds scom sdcom Pbnd Pbnd_dble + + SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +-SW_TERM = pauli pauli_dble swflds sw_term ++SW_TERM = pauli pauli_dble swflds sw_term pauli_avx512 pauli_dble_avx512 + + TCHARGE = ftcom ftensor tcharge ym_action + +@@ -68,12 +69,13 @@ VFLDS = vflds vinit vcom vdcom + + WFLOW = wflow + +-MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ ++STD_MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) $(WFLOW) + + ++ + # Logging option (-mpilog or -mpitrace or -mpianim) + + LOGOPTION = +@@ -88,7 +90,9 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ +- $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts ++ $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts:\ ++ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 ++ + + + # additional include directories +@@ -103,16 +107,17 @@ LIBS = m + LIBPATH = $(MPI_HOME)/lib + + +-# scheduling and optimization options ++# compilation, scheduling and optimization options + + CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ +- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ +- -O -mno-avx -Dx64 -DPM ++ -Wall -Wno-long-long -Wstrict-prototypes \ ++ -O2 -DAVX512 -DAVX -DPM ++ ++LFLAGS = $(CFLAGS) + +-LFLAGS = + + # See the README in the top directory for alternative choices of the +-# optimization options -O -mno-avx -Dx64 -DPM. ++# optimization options -O2 -DAVX512. 
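[Editor's note, not part of the patch] The -DAVX512 entry added to CFLAGS here is what activates the new intrinsics modules: the avx512/ sources are always listed in the module variables above, but each of them is wrapped in an #ifdef AVX512 guard and compiles to an empty object when the macro is absent. A minimal sketch of the compile-time dispatch used in Dw.c and Dw_dble.c (the actual hunks appear just below); only the preprocessor flags named in this Makefile are assumed:

/* Sketch only: how the kernel implementation is selected at compile time.
   With -DAVX512 the doe()/deo() wrappers forward to the AVX512 kernels;
   without it the pre-existing branches are compiled instead. */
#if (defined AVX512)
   /* thin wrappers calling doe_avx512()/deo_avx512() from dirac/avx512/ */
#elif (defined AVX)
   /* the original AVX macros from avx.h */
#else
   /* the remaining SSE/portable C branches of openQCD-1.6 */
#endif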
+ # + # The available debugging flags are + # +@@ -127,25 +132,24 @@ SHELL=/bin/bash + CC=mpicc + CLINKER=$(CC) + ++ ++#Check CFLAGS to find which AVX512 flags are active ++MODULES= $(STD_MODULES) ++ + PGMS= $(MAIN) $(MODULES) + + -include $(addsuffix .d,$(PGMS)) + + + # rule to make dependencies +- + $(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + +- + # rule to compile source programs +- +-$(addsuffix .o,$(PGMS)): %.o: %.c Makefile ++$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + +- + # rule to link object files +- + $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ +diff --git a/modules/dirac/Dw.c b/modules/dirac/Dw.c +index 3900283..bb18a72 100644 +--- a/modules/dirac/Dw.c ++++ b/modules/dirac/Dw.c +@@ -122,7 +122,27 @@ static const spinor s0={{{0.0f,0.0f},{0.0f,0.0f},{0.0f,0.0f}}, + {{0.0f,0.0f},{0.0f,0.0f},{0.0f,0.0f}}}; + static spin_t rs ALIGNED32; + +-#if (defined AVX) ++ ++#if ( defined AVX512 ) ++ ++#include "avx512.h" ++#include "sse.h" ++ ++void doe_avx512(int *piup, int *pidn, su3 *u, spinor *pk, float coe, spin_t *rs); ++static void doe(int *piup, int *pidn, su3 *u, spinor *pk) ++{ ++ doe_avx512( piup, pidn, u, pk, coe, &rs ); ++} ++ ++void deo_avx512(int *piup, int *pidn, su3 *u, spinor *pl, float ceo, spin_t *rs); ++static void deo(int *piup, int *pidn, su3 *u, spinor *pl) ++{ ++ deo_avx512( piup, pidn, u, pl, ceo, &rs ); ++} ++ ++ ++ ++#elif ( defined AVX ) + #include "avx.h" + + #define _load_cst(c) \ +diff --git a/modules/dirac/Dw_dble.c b/modules/dirac/Dw_dble.c +index d683cbb..4fc58f5 100644 +--- a/modules/dirac/Dw_dble.c ++++ b/modules/dirac/Dw_dble.c +@@ -1,4 +1,3 @@ +- + /******************************************************************************* + * + * File Dw_dble.c +@@ -121,7 +120,22 @@ static const spinor_dble sd0={{{0.0,0.0},{0.0,0.0},{0.0,0.0}}, + {{0.0,0.0},{0.0,0.0},{0.0,0.0}}}; + static spin_t rs ALIGNED32; + +-#if (defined AVX) ++#if ( defined AVX512 ) ++ ++void doe_dble_avx512(int *piup,int *pidn,su3_dble *u,spinor_dble *pk,double coe, spin_t *rs); ++static void doe(int *piup,int *pidn,su3_dble *u,spinor_dble *pk) ++{ ++ doe_dble_avx512( piup, pidn, u, pk, coe, &rs ); ++} ++ ++void deo_dble_avx512( int *piup, int *pidn, su3_dble *u, spinor_dble *pl, double ceo, spin_t *rs); ++static void deo(int *piup, int *pidn, su3_dble *u, spinor_dble *pl) ++{ ++ deo_dble_avx512( piup, pidn, u, pl, ceo, &rs ); ++} ++ ++ ++#elif ( defined AVX ) + #include "avx.h" + + #define _load_cst(c) \ +@@ -1412,9 +1426,7 @@ void Dw_dble(double mu,spinor_dble *s,spinor_dble *r) + { + doe(piup,pidn,u,s); + +- mul_pauli_dble(mu,m,(*so).w,(*ro).w); +- mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); +- ++ mul_pauli2_dble(mu, m, (*so).w, (*ro).w); + _vector_add_assign((*ro).s.c1,rs.s.c1); + _vector_add_assign((*ro).s.c2,rs.s.c2); + _vector_add_assign((*ro).s.c3,rs.s.c3); +@@ -1442,9 +1454,7 @@ void Dw_dble(double mu,spinor_dble *s,spinor_dble *r) + { + doe(piup,pidn,u,s); + +- mul_pauli_dble(mu,m,(*so).w,(*ro).w); +- mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); +- ++ mul_pauli2_dble(mu, m, (*so).w, (*ro).w); + _vector_add_assign((*ro).s.c1,rs.s.c1); + _vector_add_assign((*ro).s.c2,rs.s.c2); + _vector_add_assign((*ro).s.c3,rs.s.c3); +@@ -1488,8 +1498,7 @@ void Dwee_dble(double mu,spinor_dble *s,spinor_dble *r) + + if 
((t>0)&&((t<(N0-1))||(bc!=0))) + { +- mul_pauli_dble(mu,m,(*se).w,(*re).w); +- mul_pauli_dble(-mu,m+1,(*se).w+1,(*re).w+1); ++ mul_pauli2_dble(mu, m, (*se).w, (*re).w); + } + else + { +@@ -1505,9 +1514,7 @@ void Dwee_dble(double mu,spinor_dble *s,spinor_dble *r) + { + for (;m0)&&((t<(N0-1))||(bc!=0))) + { +- mul_pauli_dble(mu,m,(*so).w,(*ro).w); +- mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); ++ mul_pauli2_dble(mu, m, (*so).w, (*ro).w); + } + else + { +@@ -1559,9 +1565,7 @@ void Dwoo_dble(double mu,spinor_dble *s,spinor_dble *r) + { + for (;m ++#include ++#include ++#include "mpi.h" ++#include "su3.h" ++#include "utils.h" ++#include "flags.h" ++#include "lattice.h" ++#include "uflds.h" ++#include "sflds.h" ++#include "sw_term.h" ++#include "block.h" ++#include "dirac.h" ++#include "global.h" ++ ++#define N0 (NPROC0 * L0) ++ ++typedef union ++{ ++ spinor s; ++ weyl w[2]; ++} spin_t; ++ ++#include "avx512.h" ++void doe_avx512(int *piup, int *pidn, su3 *u, spinor *pk, float coe, spin_t *rs) ++{ ++ spinor *sp, *sm, *sp2, *sm2; ++ su3 *up, *up1, *u1, *up2, *u2; ++ ++ /* 512-bit wide stores for the spinor for each color */ ++ __m512 a1, a2, a3; ++ __m512 b1, b2, b3; ++ __m256 c1, c2, c3; ++ ++ __m256 c256; ++ ++ /******************************* direction 0,1 **********************************/ ++ ++ sp = pk + (*(piup++)); ++ sm = pk + (*(pidn++)); ++ sp2 = pk + (*(piup++)); ++ sm2 = pk + (*(pidn++)); ++ ++ _avx512_load_4_halfspinor_f( a1, a2, a3, ++ &(*sp).c1.c1.re, &(*sm).c1.c1.re, ++ &(*sp2).c1.c1.re, &(*sm2).c1.c1.re ); ++ _avx512_load_4_halfspinor_f_reverse_up( b1, b2, b3, ++ &(*sp).c3.c1.re, &(*sm).c3.c1.re, ++ &(*sp2).c3.c1.re, &(*sm2).c3.c1.re ); ++ ++ sp = pk + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pk + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ sp2 = pk + (*(piup)); ++ _mm_prefetch( (char *) sp2, _MM_HINT_T0 ); ++ sm2 = pk + (*(pidn)); ++ _mm_prefetch( (char *) sm2, _MM_HINT_T0 ); ++ ++ up1 = u; ++ u1 = u+1; ++ up2 = u+2; ++ u2 = u+3; u=u2; ++ _avx512_dirac_combine_f_1( a1, b1 ); ++ _avx512_dirac_combine_f_1( a2, b2 ); ++ _avx512_dirac_combine_f_1( a3, b3 ); ++ ++ avx512_su3_mixed_multiply_8( *up1, *u1, *up2, *u2, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_to_weyl_f_12( c1, b1 ); ++ _avx512_to_weyl_f_12( c2, b2 ); ++ _avx512_to_weyl_f_12( c3, b3 ); ++ ++ /******************************* direction 2,3 *********************************/ ++ ++ _avx512_load_4_halfspinor_f( a1, a2, a3, ++ &(*sp).c1.c1.re,&(*sm).c1.c1.re, ++ &(*sp2).c1.c1.re, &(*sm2).c1.c1.re ); ++ _avx512_load_4_halfspinor_f_reverse_dn( b1, b2, b3, ++ &(*sp).c3.c1.re, &(*sm).c3.c1.re, ++ &(*sp2).c3.c1.re, &(*sm2).c3.c1.re ); ++ ++ _avx512_dirac_combine_f_2( a1, b1 ); ++ _avx512_dirac_combine_f_2( a2, b2 ); ++ _avx512_dirac_combine_f_2( a3, b3 ); ++ ++ up1 = u+1; ++ u1 = u+2; ++ up2 = u+3; ++ u2 = u+4; ++ avx512_su3_mixed_multiply_8( *up1, *u1, *up2, *u2, b1, b2, b3, a1, a2, a3 ); ++ ++ ++ c256 = _mm256_broadcast_ss( &coe ); ++ ++ _avx512_to_weyl_f_34( c1, b1 ); ++ _avx512_to_weyl_f_34( c2, b2 ); ++ _avx512_to_weyl_f_34( c3, b3 ); ++ ++ c1 = _mm256_mul_ps( c1, c256); ++ c2 = _mm256_mul_ps( c2, c256); ++ c3 = _mm256_mul_ps( c3, c256); ++ ++ _avx512_write_6_hwv_f( c1, c2, c3, &rs->s.c1.c1.re); ++} ++ ++void deo_avx512(int *piup, int *pidn, su3 *u, spinor *pl, float ceo, spin_t *rs) ++{ ++ spinor *sp, *sm, *sp2, *sm2; ++ su3 *up, *up1, *u1, *up2, *u2; ++ ++ /* 512-bit wide stores for the spinor for each color */ ++ __m512 a1, a2, a3; ++ __m512 b1, b2, b3; ++ __m256 c1, c2, c3; ++ ++ 
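/* Editor's note (descriptive comment, not part of the patch): deo_avx512 is
 * the counterpart of doe_avx512 above.  doe_avx512 gathers the neighbouring
 * half-spinors four at a time (two directions per pass), applies the four
 * gauge links with avx512_su3_mixed_multiply_8, folds the results into a
 * Weyl pair with the _avx512_to_weyl_f_* macros and stores coe times the sum
 * in *rs.  deo_avx512 runs the same steps in reverse: it loads the Weyl pair
 * from *rs, scales it by ceo, expands it back to Dirac components direction
 * by direction, multiplies with the forward and backward links interchanged,
 * and adds the products to the eight neighbouring spinors of pl addressed
 * through piup and pidn. */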
__m256 c256; ++ ++ ++ /******************************* direction 0 *********************************/ ++ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ sp2 = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp2, _MM_HINT_T0 ); ++ sm2 = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm2, _MM_HINT_T0 ); ++ ++ _avx512_load_6_hwv_f( c1,c2,c3, &rs->s.c1.c1.re ); ++ ++ c256 = _mm256_broadcast_ss( &ceo ); ++ c1 = _mm256_mul_ps( c1, c256 ); ++ c2 = _mm256_mul_ps( c2, c256 ); ++ c3 = _mm256_mul_ps( c3, c256 ); ++ ++ _avx512_to_dirac_f_1( a1, c1 ); ++ _avx512_to_dirac_f_1( a2, c2 ); ++ _avx512_to_dirac_f_1( a3, c3 ); ++ ++ _avx512_to_dirac_f_2( a1, c1 ); ++ _avx512_to_dirac_f_2( a2, c2 ); ++ _avx512_to_dirac_f_2( a3, c3 ); ++ ++ up1 = u; ++ u1 = u+1; ++ up2 = u+2; ++ u2 = u+3; u=u2; ++ avx512_su3_mixed_multiply_8( *u1, *up1, *u2, *up2, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_load_4_halfspinor_f( a1, a2, a3, ++ &(*sm).c1.c1.re, &(*sp).c1.c1.re, ++ &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); ++ a1 = _mm512_add_ps( a1, b1 ); ++ a2 = _mm512_add_ps( a2, b2 ); ++ a3 = _mm512_add_ps( a3, b3 ); ++ _avx512_write_4_halfspinor_f( a1, a2, a3, ++ &(*sm).c1.c1.re, &(*sp).c1.c1.re, ++ &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); ++ ++ _avx512_load_4_halfspinor_f_reverse_up( a1, a2, a3, ++ &(*sm).c3.c1.re, &(*sp).c3.c1.re, ++ &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); ++ _avx512_dirac_combine_f_3( a1, b1 ); ++ _avx512_dirac_combine_f_3( a2, b2 ); ++ _avx512_dirac_combine_f_3( a3, b3 ); ++ _avx512_write_4_halfspinor_f_reverse_up( a1, a2, a3, ++ &(*sm).c3.c1.re, &(*sp).c3.c1.re, ++ &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); ++ ++ /******************************* direction 2 *********************************/ ++ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ sp2 = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp2, _MM_HINT_T0 ); ++ sm2 = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm2, _MM_HINT_T0 ); ++ ++ _avx512_to_dirac_f_3( a1, c1 ); ++ _avx512_to_dirac_f_3( a2, c2 ); ++ _avx512_to_dirac_f_3( a3, c3 ); ++ ++ up1 = u+1; ++ u1 = u+2; ++ up2 = u+3; ++ u2 = u+4; ++ avx512_su3_mixed_multiply_8( *u1, *up1, *u2, *up2, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_load_4_halfspinor_f( a1, a2, a3, &(*sm).c1.c1.re, &(*sp).c1.c1.re, &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); ++ a1 = _mm512_add_ps( a1, b1 ); ++ a2 = _mm512_add_ps( a2, b2 ); ++ a3 = _mm512_add_ps( a3, b3 ); ++ _avx512_write_4_halfspinor_f( a1, a2, a3, &(*sm).c1.c1.re, &(*sp).c1.c1.re, &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); ++ ++ _avx512_load_4_halfspinor_f_reverse_dn( a1, a2, a3, &(*sm).c3.c1.re, &(*sp).c3.c1.re, &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); ++ _avx512_dirac_combine_f_4( a1, b1 ); ++ _avx512_dirac_combine_f_4( a2, b2 ); ++ _avx512_dirac_combine_f_4( a3, b3 ); ++ _avx512_write_4_halfspinor_f_reverse_dn( a1, a2, a3, &(*sm).c3.c1.re, &(*sp).c3.c1.re, &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/dirac/avx512/Dw_dble_avx512.c b/modules/dirac/avx512/Dw_dble_avx512.c +new file mode 100644 +index 0000000..cf07bfa +--- /dev/null ++++ b/modules/dirac/avx512/Dw_dble_avx512.c +@@ -0,0 +1,261 @@ ++/******************************************************************************* ++* ++* File Dw_dble_avx512.c ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* AVX512 implementation of the O(a)-improved 
Wilson-Dirac operator D (double- ++* precision programs). ++* ++* See ../Dw_dble.c for more details and alternative implementations ++ *******************************************************************************/ ++ ++#ifdef AVX512 ++ ++ ++#include ++#include ++#include ++#include "mpi.h" ++#include "su3.h" ++#include "utils.h" ++#include "flags.h" ++#include "lattice.h" ++#include "uflds.h" ++#include "sflds.h" ++#include "sw_term.h" ++#include "dirac.h" ++#include "global.h" ++ ++#define N0 (NPROC0 * L0) ++ ++typedef union ++{ ++ spinor_dble s; ++ weyl_dble w[2]; ++} spin_t; ++ ++#include "avx512.h" ++#include "sse.h" ++ ++void doe_dble_avx512( const int *piup, const int *pidn, const su3_dble *u, const spinor_dble *pk, double coe, spin_t *rs) ++{ ++ const spinor_dble *sp, *sm; ++ const su3_dble *up; ++ ++ /* 512-bit wide stores for the spinor for each color */ ++ __m512d a1, a2, a3; ++ __m512d b1, b2, b3; ++ __m512d w1, w2, w3; ++ __m512d t1, t2, t3, t4, t5, t6; ++ ++ __m128d tc; ++ __m512d c512; ++ ++ /******************************* direction 0 *********************************/ ++ ++ sp = pk + (*(piup++)); ++ sm = pk + (*(pidn++)); ++ ++ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_load_2_halfspinor_d( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ a1 = _mm512_maskz_add_pd( 0b00001111, t1, t4 ); ++ a1 = _mm512_mask_sub_pd( a1, 0b11110000, t1, t4 ); ++ a2 = _mm512_maskz_add_pd( 0b00001111, t2, t5 ); ++ a2 = _mm512_mask_sub_pd( a2, 0b11110000, t2, t5 ); ++ a3 = _mm512_maskz_add_pd( 0b00001111, t3, t6 ); ++ a3 = _mm512_mask_sub_pd( a3, 0b11110000, t3, t6 ); ++ ++ sp = pk + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pk + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ ++ up = u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_to_weyl_1( w1, b1 ); ++ _avx512_to_weyl_1( w2, b2 ); ++ _avx512_to_weyl_1( w3, b3 ); ++ ++ ++ /******************************* direction 1 *********************************/ ++ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_load_2_halfspinor_d_reverse( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ t4 = _mm512_permute_pd ( t4, 0b01010101 ); ++ a1 = _mm512_maskz_add_pd( 0b01011010, t1, t4 ); ++ a1 = _mm512_mask_sub_pd( a1, 0b10100101, t1, t4 ); ++ t5 = _mm512_permute_pd ( t5, 0b01010101 ); ++ a2 = _mm512_maskz_add_pd( 0b01011010, t2, t5 ); ++ a2 = _mm512_mask_sub_pd( a2, 0b10100101, t2, t5 ); ++ t6 = _mm512_permute_pd ( t6, 0b01010101 ); ++ a3 = _mm512_maskz_add_pd( 0b01011010, t3, t6 ); ++ a3 = _mm512_mask_sub_pd( a3, 0b10100101, t3, t6 ); ++ ++ sp = pk + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pk + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ up = ++u; ++ u += 1; ++ ++ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_to_weyl_2( w1, b1 ); ++ _avx512_to_weyl_2( w2, b2 ); ++ _avx512_to_weyl_2( w3, b3 ); ++ ++ /******************************* direction 2 *********************************/ ++ ++ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_load_2_halfspinor_d_reverse( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ a1 = _mm512_maskz_add_pd( 0b11000011, t1, t4 ); ++ a1 = _mm512_mask_sub_pd( a1, 0b00111100, t1, t4 ); ++ a2 = _mm512_maskz_add_pd( 0b11000011, t2, t5 ); ++ a2 = _mm512_mask_sub_pd( a2, 0b00111100, t2, t5 ); ++ a3 = _mm512_maskz_add_pd( 0b11000011, t3, t6 ); ++ a3 = 
_mm512_mask_sub_pd( a3, 0b00111100, t3, t6 ); ++ ++ sp = pk + (*(piup)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pk + (*(pidn)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ up = ++u; ++ u += 1; ++ ++ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_to_weyl_3( w1, b1 ); ++ _avx512_to_weyl_3( w2, b2 ); ++ _avx512_to_weyl_3( w3, b3 ); ++ ++ ++ /******************************* direction 3 *********************************/ ++ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_load_2_halfspinor_d( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ t4 = _mm512_permute_pd ( t4, 0b01010101 ); ++ a1 = _mm512_maskz_add_pd( 0b10010110, t1, t4 ); ++ a1 = _mm512_mask_sub_pd( a1, 0b01101001, t1, t4 ); ++ t5 = _mm512_permute_pd ( t5, 0b01010101 ); ++ a2 = _mm512_maskz_add_pd( 0b10010110, t2, t5 ); ++ a2 = _mm512_mask_sub_pd( a2, 0b01101001, t2, t5 ); ++ t6 = _mm512_permute_pd ( t6, 0b01010101 ); ++ a3 = _mm512_maskz_add_pd( 0b10010110, t3, t6 ); ++ a3 = _mm512_mask_sub_pd( a3, 0b01101001, t3, t6 ); ++ ++ up = ++u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_to_weyl_4( w1, b1 ); ++ _avx512_to_weyl_4( w2, b2 ); ++ _avx512_to_weyl_4( w3, b3 ); ++ ++ tc = _mm_load_sd( &coe ); ++ c512 = _mm512_broadcastsd_pd( tc ); ++ w1 = _mm512_mul_pd( c512, w1 ); ++ w2 = _mm512_mul_pd( c512, w2 ); ++ w3 = _mm512_mul_pd( c512, w3 ); ++ ++ _avx512_store_2_halfspinor_d( w1, w2, w3, &rs->s.c1.c1.re, &rs->s.c3.c1.re ); ++} ++ ++void deo_dble_avx512( const int *piup, const int *pidn, const su3_dble *u, spinor_dble *pl, double ceo, spin_t *rs) ++{ ++ const su3_dble *up; ++ spinor_dble *sp, *sm; ++ ++ /* 512-bit wide stores for the spinor for each color */ ++ __m512d a1, a2, a3; ++ __m512d b1, b2, b3; ++ __m512d w1, w2, w3; ++ ++ __m128d tc; ++ __m512d c512; ++ ++ /******************************* direction 0 *********************************/ ++ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ ++ _avx512_load_2_halfspinor_d( w1, w2, w3, &rs->s.c1.c1.re, &rs->s.c3.c1.re ); ++ ++ tc = _mm_load_sd( &ceo ); ++ c512 = _mm512_broadcastsd_pd( tc ); ++ w1 = _mm512_mul_pd( c512, w1 ); ++ w2 = _mm512_mul_pd( c512, w2 ); ++ w3 = _mm512_mul_pd( c512, w3 ); ++ ++ _avx512_expand_weyl( a1, w1 ) ++ _avx512_expand_weyl( a2, w2 ) ++ _avx512_expand_weyl( a3, w3 ) ++ ++ up = u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_add_to_spinors_2( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ /******************************* direction 1 *********************************/ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ ++ _avx512_expand_weyl_2( a1, w1 ); ++ _avx512_expand_weyl_2( a2, w2 ); ++ _avx512_expand_weyl_2( a3, w3 ); ++ ++ up = ++u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_add_to_spinors_3( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ /******************************* direction 2 *********************************/ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ ++ _avx512_expand_weyl_3( 
a1, w1 ); ++ _avx512_expand_weyl_3( a2, w2 ); ++ _avx512_expand_weyl_3( a3, w3 ); ++ ++ up = ++u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_add_to_spinors_4( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++ ++ ++ /******************************* direction 3 *********************************/ ++ sp = pl + (*(piup++)); ++ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); ++ sm = pl + (*(pidn++)); ++ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); ++ ++ _avx512_expand_weyl_4( a1, w1 ); ++ _avx512_expand_weyl_4( a2, w2 ); ++ _avx512_expand_weyl_4( a3, w3 ); ++ ++ up = ++u; ++ u += 1; ++ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); ++ ++ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); ++ _avx512_add_to_spinors_5( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/linalg/avx512/salg_avx512.c b/modules/linalg/avx512/salg_avx512.c +new file mode 100644 +index 0000000..509e78f +--- /dev/null ++++ b/modules/linalg/avx512/salg_avx512.c +@@ -0,0 +1,154 @@ ++/******************************************************************************* ++* ++* File salg_avx512.c ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* AVX512 implementations of single precision linear algebra ++* functions for spinors. ++* ++* See ../salg.c for more information and alternative ++* implementations. ++* ++*******************************************************************************/ ++ ++#ifdef AVX512 ++ ++#include "global.h" ++#include "linalg.h" ++#include "mpi.h" ++#include "sflds.h" ++ ++#include "avx512.h" ++ ++void mulc_spinor_add(int vol, spinor *s, spinor *r, complex z) ++{ ++ spinor *sm; ++ __m128 tr, ti; ++ __m512 zr, zi, t1, t2; ++ __m512 sign; ++ ++ sign = _mm512_set_ps( -1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1 ); ++ sm = s + vol; ++ ++ tr = _mm_load_ps1( &z.re ); ++ ti = _mm_load_ps1( &z.im ); ++ zr = _mm512_broadcast_f32x4( tr ); ++ zi = _mm512_broadcast_f32x4( ti ); ++ ++ zi = _mm512_mul_ps( zi, sign ); ++ ++ for (; s < sm; s+=2) { ++ t1 = _mm512_loadu_ps( &(*r).c1.c1.re ); ++ t2 = _mm512_mul_ps( zi, t1 ); ++ t2 = _mm512_permute_ps( t2, 0b10110001 ); ++ t2 = _mm512_fmadd_ps( zr, t1, t2 ); ++ t1 = _mm512_loadu_ps( &(*s).c1.c1.re ); ++ t1 = _mm512_add_ps( t1, t2 ); ++ _mm512_storeu_ps( &(*s).c1.c1.re, t1 ); ++ ++ t1 = _mm512_loadu_ps( &(*r).c1.c1.re + 16 ); ++ t2 = _mm512_mul_ps( zi, t1 ); ++ t2 = _mm512_permute_ps( t2, 0b10110001 ); ++ t2 = _mm512_fmadd_ps( zr, t1, t2 ); ++ t1 = _mm512_loadu_ps( &(*s).c1.c1.re + 16 ); ++ t1 = _mm512_add_ps( t1, t2 ); ++ _mm512_storeu_ps( &(*s).c1.c1.re + 16, t1 ); ++ ++ t1 = _mm512_loadu_ps( &(*r).c1.c1.re + 32 ); ++ t2 = _mm512_mul_ps( zi, t1 ); ++ t2 = _mm512_permute_ps( t2, 0b10110001 ); ++ t2 = _mm512_fmadd_ps( zr, t1, t2 ); ++ t1 = _mm512_loadu_ps( &(*s).c1.c1.re + 32 ); ++ t1 = _mm512_add_ps( t1, t2 ); ++ _mm512_storeu_ps( &(*s).c1.c1.re + 32, t1 ); ++ ++ r += 2; ++ } ++} ++ ++#if __GNUC__ < 7 ++/* This function was implemented to gcc 7 */ ++extern __inline double _mm512_reduce_add_ps( __m512 a ) { ++ float * d = (float *) &a; ++ return d[0]+d[1]+d[2]+d[3]+d[4]+d[5]+d[6]+d[7] ++ +d[8]+d[9]+d[10]+d[11]+d[12]+d[13]+d[14]+d[15] ; ++} ++#endif ++ ++complex spinor_prod(int vol, int icom, spinor *s, spinor *r ) ++{ ++ complex z; ++ complex_dble v, w; ++ spinor const *sm, *smb; ++ __m512 tr, ti, s1, s2, s3, r1, r2, 
r3, sign; ++ ++ double x, y; ++ ++ x = 0.0; ++ y = 0.0; ++ sm = s + vol; ++ ++ ++ while (s < sm) { ++ smb = s + 8; ++ if (smb > sm) { ++ smb = sm; ++ } ++ ++ tr = _mm512_setzero_ps(); ++ ti = _mm512_setzero_ps(); ++ ++ for (; s < smb; s+=2) { ++ s1 = _mm512_loadu_ps( &(*s).c1.c1.re ); ++ s2 = _mm512_loadu_ps( &(*s).c1.c1.re+16 ); ++ s3 = _mm512_loadu_ps( &(*s).c1.c1.re+32 ); ++ r1 = _mm512_loadu_ps( &(*r).c1.c1.re ); ++ r2 = _mm512_loadu_ps( &(*r).c1.c1.re+16 ); ++ r3 = _mm512_loadu_ps( &(*r).c1.c1.re+32 ); ++ ++ tr = _mm512_fmadd_ps( s1, r1, tr ); ++ tr = _mm512_fmadd_ps( s2, r2, tr ); ++ tr = _mm512_fmadd_ps( s3, r3, tr ); ++ ++ r1 = _mm512_permute_ps( r1, 0b10110001 ); ++ r2 = _mm512_permute_ps( r2, 0b10110001 ); ++ r3 = _mm512_permute_ps( r3, 0b10110001 ); ++ ++ sign = _mm512_set_ps( -1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1 ); ++ r1 = _mm512_mul_ps( r1, sign ); ++ r2 = _mm512_mul_ps( r2, sign ); ++ r3 = _mm512_mul_ps( r3, sign ); ++ ++ ti = _mm512_fmadd_ps( s1, r1, ti ); ++ ti = _mm512_fmadd_ps( s2, r2, ti ); ++ ti = _mm512_fmadd_ps( s3, r3, ti ); ++ ++ r += 2; ++ } ++ ++ x += (double) _mm512_reduce_add_ps( tr ); ++ y += (double) _mm512_reduce_add_ps( ti ); ++ ++ } ++ ++ v.re = x; ++ v.im = y; ++ ++ if ((icom==1)&&(NPROC>1)) ++ { ++ MPI_Reduce(&v.re,&w.re,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); ++ MPI_Bcast(&w.re,2,MPI_DOUBLE,0,MPI_COMM_WORLD); ++ z.re=(float)(w.re); ++ z.im=(float)(w.im); ++ } ++ else ++ { ++ z.re=(float)(v.re); ++ z.im=(float)(v.im); ++ } ++ return z; ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/linalg/avx512/salg_dble_avx512.c b/modules/linalg/avx512/salg_dble_avx512.c +new file mode 100644 +index 0000000..8fde96c +--- /dev/null ++++ b/modules/linalg/avx512/salg_dble_avx512.c +@@ -0,0 +1,402 @@ ++/******************************************************************************* ++* ++* File salg_dble_avx512.c ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* AVX512 implementations of single precision linear algebra ++* functions for spinors. ++* ++* See ../salg_dble.c for more information and alternative ++* implementations. 
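[Editor's note, not part of the patch] The functions in this file all follow the same pattern: each loop iteration treats one spinor_dble (24 doubles) as three 8-wide vectors, and complex multiplication is built from a permute of the re/im pairs plus fmaddsub. As a plain-C reference for what mulc_spinor_add_dble below computes on every complex component, i.e. s <- s + z*r (illustrative sketch only; the helper name is invented):

/* Illustrative scalar reference, not part of the patch. */
static void mulc_add_component(double *s_re, double *s_im,
                               double r_re, double r_im,
                               double z_re, double z_im)
{
   *s_re += z_re*r_re - z_im*r_im;   /* matches the fmaddsub even lanes */
   *s_im += z_re*r_im + z_im*r_re;   /* matches the fmaddsub odd lanes  */
}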
++* ++*******************************************************************************/ ++ ++#ifdef AVX512 ++ ++#include "global.h" ++#include "linalg.h" ++#include "mpi.h" ++#include "sflds.h" ++ ++#include "avx512.h" ++#if __GNUC__ < 7 ++/* This function was implemented to gcc 7 */ ++extern __inline double _mm512_reduce_add_pd( __m512d a ) { ++ double * d = (double *) &a; ++ return d[0]+d[1]+d[2]+d[3]+d[4]+d[5]+d[6]+d[7] ; ++} ++#endif ++ ++complex_dble spinor_prod_dble_avx512( spinor_dble const *s, spinor_dble const *smb, spinor_dble const *r) ++{ ++ __m512d tr, ti, s1, s2, s3, r1, r2, r3, sign; ++ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); ++ complex_dble z; ++ ++ tr = _mm512_setzero_pd(); ++ ti = _mm512_setzero_pd(); ++ for (; s < smb; s++) { ++ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ r1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ r2 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ r3 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ tr = _mm512_fmadd_pd( s1, r1, tr ); ++ tr = _mm512_fmadd_pd( s2, r2, tr ); ++ tr = _mm512_fmadd_pd( s3, r3, tr ); ++ r1 = _mm512_permute_pd( r1, 0b01010101 ); ++ r2 = _mm512_permute_pd( r2, 0b01010101 ); ++ r3 = _mm512_permute_pd( r3, 0b01010101 ); ++ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); ++ r1 = _mm512_mul_pd( r1, sign ); ++ r2 = _mm512_mul_pd( r2, sign ); ++ r3 = _mm512_mul_pd( r3, sign ); ++ ti = _mm512_fmadd_pd( s1, r1, ti ); ++ ti = _mm512_fmadd_pd( s2, r2, ti ); ++ ti = _mm512_fmadd_pd( s3, r3, ti ); ++ r += 1; ++ } ++ z.re = _mm512_reduce_add_pd( tr ); ++ z.im = _mm512_reduce_add_pd( ti ); ++ return z; ++} ++ ++double spinor_prod_re_dble_avx512( spinor_dble const *s, spinor_dble const *smb, spinor_dble const *r) ++{ ++ __m512d tr, ti, s1, s2, s3, r1, r2, r3; ++ float c; ++ tr = _mm512_setzero_pd(); ++ ti = _mm512_setzero_pd(); ++ for (; s < smb; s++) { ++ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ r1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ r2 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ r3 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ tr = _mm512_fmadd_pd( s1, r1, tr ); ++ tr = _mm512_fmadd_pd( s2, r2, tr ); ++ tr = _mm512_fmadd_pd( s3, r3, tr ); ++ r1 = _mm512_permute_pd( r1, 0b01010101 ); ++ r2 = _mm512_permute_pd( r2, 0b01010101 ); ++ r3 = _mm512_permute_pd( r3, 0b01010101 ); ++ r += 1; ++ } ++ c = _mm512_reduce_add_pd( tr ); ++ return c; ++} ++ ++complex_dble spinor_prod5_dble_avx512(spinor_dble const *s, spinor_dble const *smb, spinor_dble const *r) ++{ ++ __m512d tr, ti, s1, s2, s3, r1, r2, r3, sign; ++ complex_dble z; ++ ++ tr = _mm512_setzero_pd(); ++ ti = _mm512_setzero_pd(); ++ for (; s < smb; s++) { ++ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ r1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ r2 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ r3 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ sign = _mm512_set_pd( -1,-1,-1,-1,1,1,1,1 ); ++ s2 = _mm512_mul_pd( s2, sign ); ++ tr = _mm512_fmadd_pd( s1, r1, tr ); ++ tr = _mm512_fmadd_pd( s2, r2, tr ); ++ tr = _mm512_fnmadd_pd( s3, r3, tr ); ++ r1 = _mm512_permute_pd( r1, 0b01010101 ); ++ r2 = _mm512_permute_pd( r2, 0b01010101 ); ++ r3 = _mm512_permute_pd( r3, 0b01010101 ); ++ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); ++ r1 = _mm512_mul_pd( r1, sign ); ++ r2 = _mm512_mul_pd( r2, sign ); ++ r3 = _mm512_mul_pd( r3, sign ); ++ ti = _mm512_fmadd_pd( s1, 
r1, ti ); ++ ti = _mm512_fmadd_pd( s2, r2, ti ); ++ ti = _mm512_fnmadd_pd( s3, r3, ti ); ++ r += 1; ++ } ++ z.re = _mm512_reduce_add_pd( tr ); ++ z.im = _mm512_reduce_add_pd( ti ); ++ return z; ++} ++ ++double norm_square_dble_avx512(spinor_dble const *s, spinor_dble const *smb) ++{ ++ __m512d tmp, s1, s2, s3; ++ tmp = _mm512_setzero_pd(); ++ for (; s < smb; s++) { ++ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ tmp = _mm512_fmadd_pd( s1, s1, tmp ); ++ tmp = _mm512_fmadd_pd( s2, s2, tmp ); ++ tmp = _mm512_fmadd_pd( s3, s3, tmp ); ++ } ++ return _mm512_reduce_add_pd( tmp ); ++} ++ ++ ++ ++void mulc_spinor_add_dble(int vol, spinor_dble *s, spinor_dble *r, ++ complex_dble z) ++{ ++ spinor_dble *sm; ++ __m128d tr, ti; ++ __m512d zr, zi, t1, t2; ++ ++ tr = _mm_load_sd( &z.re ); ++ ti = _mm_load_sd( &z.im ); ++ zr = _mm512_broadcastsd_pd( tr ); ++ zi = _mm512_broadcastsd_pd( ti ); ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t2 = _mm512_fmaddsub_pd( zr, t1, t2 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t2 = _mm512_fmaddsub_pd( zr, t1, t2 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t2 = _mm512_fmaddsub_pd( zr, t1, t2 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); ++ ++ r += 1; ++ } ++} ++ ++ ++void mulr_spinor_add_dble(int vol, spinor_dble *s, spinor_dble *r, ++ double c) ++{ ++ spinor_dble *sm; ++ __m128d t128; ++ __m512d tc, t1, t2; ++ ++ t128 = _mm_load_sd( &c ); ++ tc = _mm512_broadcastsd_pd( t128 ); ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ t2 = _mm512_mul_pd( tc, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ t2 = _mm512_mul_pd( tc, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ t2 = _mm512_mul_pd( tc, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ t1 = _mm512_add_pd( t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); ++ ++ r += 1; ++ } ++} ++ ++ ++void combine_spinor_dble(int vol, spinor_dble *s, spinor_dble *r, ++ double cs, double cr) ++{ ++ spinor_dble *sm; ++ __m128d ts128, tr128; ++ __m512d tcs, tcr, t1, t2; ++ ++ ts128 = _mm_load_sd( &cs ); ++ tr128 = _mm_load_sd( &cr ); ++ tcs = _mm512_broadcastsd_pd( ts128 ); ++ tcr = _mm512_broadcastsd_pd( tr128 ); ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re ); ++ t2 = _mm512_mul_pd( tcr, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ t1 = _mm512_fmadd_pd( tcs, t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); ++ t2 = _mm512_mul_pd( tcr, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ t1 = _mm512_fmadd_pd( tcs, t1, t2 
); ++ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); ++ t2 = _mm512_mul_pd( tcr, t1 ); ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ t1 = _mm512_fmadd_pd( tcs, t1, t2 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); ++ ++ r += 1; ++ } ++} ++ ++ ++void scale_dble(int vol, double c, spinor_dble *s) ++{ ++ spinor_dble *sm; ++ __m128d t128; ++ __m512d tc, t1; ++ ++ t128 = _mm_load_sd( &c ); ++ tc = _mm512_broadcastsd_pd( t128 ); ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ t1 = _mm512_mul_pd( tc, t1 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); ++ t1 = _mm512_mul_pd( tc, t1 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); ++ t1 = _mm512_mul_pd( tc, t1 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); ++ } ++} ++ ++void rotate_dble_avx512(int n, int ix, spinor_dble **ppk, spinor_dble *psi, complex_dble const *v) ++{ ++ spinor_dble *pk, *pj; ++ complex_dble const *z; ++ int k,j; ++ ++ for (k = 0; k < n; k++) { ++ __m128d tr, ti; ++ __m512d zr,zi, t1,t2, p1,p2,p3, sign; ++ ++ pk = psi + k; ++ pj = ppk[0] + ix; ++ z = v + k; ++ ++ tr = _mm_load_pd1( &z->re ); ++ ti = _mm_load_pd1( &z->im ); ++ zr = _mm512_broadcastsd_pd( tr ); ++ zi = _mm512_broadcastsd_pd( ti ); ++ ++ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); ++ zi = _mm512_mul_pd( zi, sign ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ p1 = _mm512_fmadd_pd( zr, t1, t2 ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+8 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ p2 = _mm512_fmadd_pd( zr, t1, t2 ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+16 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ p3 = _mm512_fmadd_pd( zr, t1, t2 ); ++ ++ for (j = 1; j < n; j++) { ++ pj = ppk[j] + ix; ++ z += n; ++ ++ tr = _mm_load_pd1( &z->re ); ++ ti = _mm_load_pd1( &z->im ); ++ zr = _mm512_broadcastsd_pd( tr ); ++ zi = _mm512_broadcastsd_pd( ti ); ++ zi = _mm512_mul_pd( zi, sign ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t1 = _mm512_fmadd_pd( zr, t1, t2 ); ++ p1 = _mm512_add_pd( p1, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+8 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t1 = _mm512_fmadd_pd( zr, t1, t2 ); ++ p2 = _mm512_add_pd( p2, t1 ); ++ ++ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+16 ); ++ t2 = _mm512_mul_pd( zi, t1 ); ++ t2 = _mm512_permute_pd( t2, 0b01010101 ); ++ t1 = _mm512_fmadd_pd( zr, t1, t2 ); ++ p3 = _mm512_add_pd( p3, t1 ); ++ } ++ ++ _mm512_storeu_pd( &(*pk).c1.c1.re, p1 ); ++ _mm512_storeu_pd( &(*pk).c1.c1.re+8, p2 ); ++ _mm512_storeu_pd( &(*pk).c1.c1.re+16, p3 ); ++ } ++} ++ ++ ++void mulg5_dble(int vol, spinor_dble *s) ++{ ++ spinor_dble *sm; ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ __m512d s1; ++ __m256d s2; ++ ++ s1 = _mm512_loadu_pd( &(*s).c1.c1.re+12 ); ++ s1 = _mm512_sub_pd( _mm512_setzero_pd(), s1 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re+12, s1 ); ++ ++ s2 = _mm256_loadu_pd( &(*s).c1.c1.re+20 ); ++ s2 = _mm256_sub_pd( _mm256_setzero_pd(), s2 ); ++ _mm256_storeu_pd( &(*s).c1.c1.re+20, s2 ); ++ } ++} ++ ++void mulmg5_dble(int vol, spinor_dble *s) ++{ ++ spinor_dble *sm; ++ ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ __m512d s1; ++ __m256d s2; ++ ++ s1 = 
_mm512_loadu_pd( &(*s).c1.c1.re ); ++ s1 = _mm512_sub_pd( _mm512_setzero_pd(), s1 ); ++ _mm512_storeu_pd( &(*s).c1.c1.re, s1 ); ++ ++ s2 = _mm256_loadu_pd( &(*s).c1.c1.re+8 ); ++ s2 = _mm256_sub_pd( _mm256_setzero_pd(), s2 ); ++ _mm256_storeu_pd( &(*s).c1.c1.re+8, s2 ); ++ } ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/linalg/salg.c b/modules/linalg/salg.c +index 2be05c1..f58aea8 100644 +--- a/modules/linalg/salg.c ++++ b/modules/linalg/salg.c +@@ -89,9 +89,12 @@ static void alloc_wrotate(int n) + nrot=n; + } + ++ ++ + #if (defined AVX) + #include "avx.h" + ++#ifndef AVX512 + #if (defined FMA3) + + complex spinor_prod(int vol,int icom,spinor *s,spinor *r) +@@ -323,9 +326,30 @@ complex spinor_prod(int vol,int icom,spinor *s,spinor *r) + + return z; + } ++#endif ++ ++void mulc_spinor_add(int vol, spinor *s, spinor *r, complex z) ++{ ++ spinor *sm; ++ ++ _avx_load_cmplx_up(z); ++ sm = s + vol; ++ ++ for (; s < sm; s++) { ++ _avx_spinor_load(*s); ++ _avx_mulc_spinor_add(*r); ++ _avx_spinor_store(*s); ++ ++ r += 1; ++ } ++ ++ _avx_zeroupper(); ++} + + #endif + ++ ++ + float spinor_prod_re(int vol,int icom,spinor *s,spinor *r) + { + double x,y; +@@ -473,25 +497,6 @@ float norm_square(int vol,int icom,spinor *s) + } + + +-void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z) +-{ +- spinor *sm; +- +- _avx_load_cmplx_up(z); +- sm=s+vol; +- +- for (;s sm) { ++ smb = sm; ++ } ++ smz = spinor_prod_dble_avx512( s, smb, r ); ++ s = smb; r+=8; ++ add_to_hsum(isz, (double *)(&smz)); ++ } ++ ++ if ((icom == 1) && (NPROC > 1)) { ++ global_hsum(isz, (double *)(&smz)); ++ } else { ++ local_hsum(isz, (double *)(&smz)); ++ } ++ ++ return smz; ++} ++ ++double spinor_prod_re_dble_avx512( spinor_dble *s, spinor_dble *smb, spinor_dble *r); ++double spinor_prod_re_dble(int vol, int icom, spinor_dble *s, ++ spinor_dble *r) ++{ ++ spinor_dble *sm, *smb; ++ ++ if (init == 0) { ++ isx = init_hsum(1); ++ isz = init_hsum(2); ++ init = 1; ++ } ++ ++ reset_hsum(isx); ++ sm = s + vol; ++ ++ while (s < sm) { ++ smb = s + 8; ++ if (smb > sm) { ++ smb = sm; ++ } ++ smx = spinor_prod_re_dble_avx512( s, smb, r ); ++ s = smb; r+=8; ++ ++ add_to_hsum(isx, &smx); ++ } ++ ++ if ((icom == 1) && (NPROC > 1)) { ++ global_hsum(isx, &smx); ++ } else { ++ local_hsum(isx, &smx); ++ } ++ ++ return smx; ++} ++ ++complex_dble spinor_prod5_dble_avx512(spinor_dble *s, spinor_dble *smb, spinor_dble *r); ++complex_dble spinor_prod5_dble(int vol, int icom, spinor_dble *s, ++ spinor_dble *r) ++{ ++ spinor_dble *sm, *smb; ++ ++ if (init == 0) { ++ isx = init_hsum(1); ++ isz = init_hsum(2); ++ init = 1; ++ } ++ ++ reset_hsum(isz); ++ sm = s + vol; ++ ++ while (s < sm) { ++ smb = s + 8; ++ if (smb > sm) { ++ smb = sm; ++ } ++ ++ smz = spinor_prod5_dble_avx512( s, smb, r ); ++ s = smb; r+=8; ++ ++ add_to_hsum(isz, (double *)(&smz)); ++ } ++ ++ if ((icom == 1) && (NPROC > 1)) { ++ global_hsum(isz, (double *)(&smz)); ++ } else { ++ local_hsum(isz, (double *)(&smz)); ++ } ++ ++ return smz; ++} ++ ++double norm_square_dble_avx512(spinor_dble *s, spinor_dble *smb); ++double norm_square_dble(int vol, int icom, spinor_dble *s) ++{ ++ spinor_dble *sm, *smb; ++ ++ if (init == 0) { ++ isx = init_hsum(1); ++ isz = init_hsum(2); ++ init = 1; ++ } ++ ++ reset_hsum(isx); ++ sm = s + vol; ++ ++ while (s < sm) { ++ smb = s + 8; ++ if (smb > sm) { ++ smb = sm; ++ } ++ ++ smx = norm_square_dble_avx512( s, smb ); ++ s = smb; ++ ++ add_to_hsum(isx, &smx); ++ } ++ ++ if ((icom == 1) && (NPROC > 1)) { ++ global_hsum(isx, &smx); ++ } else { ++ 
local_hsum(isx, &smx); ++ } ++ ++ return smx; ++} ++ ++ ++void rotate_dble_avx512(int n, int ix, spinor_dble **ppk, spinor_dble *psi, complex_dble *v); ++void rotate_dble(int vol, int n, spinor_dble **ppk, complex_dble *v) ++{ ++ int ix,k; ++ ++ if (n > nrot) { ++ alloc_wrotate(n); ++ } ++ ++ for (ix = 0; ix < vol; ix++) { ++ rotate_dble_avx512( n, ix, ppk, psi, v ); ++ ++ for (k = 0; k < n; k++) { ++ *(ppk[k] + ix) = psi[k]; ++ } ++ } ++} ++ ++ ++#elif (defined AVX) + #include "avx.h" + + #if (defined FMA3) +diff --git a/modules/sw_term/avx512/pauli_avx512.c b/modules/sw_term/avx512/pauli_avx512.c +new file mode 100644 +index 0000000..89e9d50 +--- /dev/null ++++ b/modules/sw_term/avx512/pauli_avx512.c +@@ -0,0 +1,235 @@ ++/******************************************************************************* ++* ++* File pauli_avx512.c ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* AVX512 implementations of the clover term multiplication in ++* single precision. ++* ++* See ../pauli_avx512.c for more information and alternative ++* implementations. ++* ++*******************************************************************************/ ++ ++#ifdef AVX512 ++ ++#include ++#include ++#include ++#include "su3.h" ++#include "sw_term.h" ++typedef union ++{ ++ spinor s; ++ weyl w[2]; ++} spin_t; ++ ++#include "avx512.h" ++ ++ ++void mul_pauli2(float mu, pauli *m, spinor *source, spinor *res ) ++{ ++ spin_t *ps, *pr; ++ float const *u, *u2; ++ __m512i idx; ++ __m512 tr1,tr2,tr3; ++ __m512 ts1, ts2, ts3, tsi1, tsi2, tsi3, u512; ++ __m512 tu11, tu12, tu13, tu21, tu22, tu23; ++ __m512 tu1, tu2, tu3, tu4; ++ __m512 umu; ++ __m256 t256; ++ __m128 t128a, t128b, tmu; ++ ++ ps = (spin_t *)(source); ++ pr = (spin_t *)(res); ++ ++ u = (*m).u; ++ u2 = (m+1)->u; ++ ++ weyl * s = (*ps).w; ++ weyl * r = (*pr).w; ++ weyl * s2 = (*ps).w+1; ++ weyl * r2 = (*pr).w+1; ++ ++ s += 4; ++ _prefetch_spinor(s); ++ s -= 4; ++ ++ tr1 = _mm512_loadu_ps( &(*s).c1.c1.re ); ++ tr2 = _mm512_castps256_ps512( _mm256_loadu_ps( &(*s).c1.c1.re+16 ) ); ++ idx = _mm512_setr_epi32( 0,1,2,3,6,7,8,9, 12,13,14,15,18,19,20,21 ); ++ ts1 = _mm512_permutex2var_ps( tr1, idx, tr2 ); ++ idx = _mm512_setr_epi32( 2,3,4,5,8,9,10,11, 14,15,16,17,20,21,22,23 ); ++ ts2 = _mm512_permutex2var_ps( tr1, idx, tr2 ); ++ idx = _mm512_setr_epi32( 4,5,0,1,10,11,6,7, 16,17,12,13,22,23,18,19 ); ++ ts3 = _mm512_permutex2var_ps( tr1, idx, tr2 ); ++ ++ tu11 = _mm512_loadu_ps( u ); ++ tu12 = _mm512_loadu_ps( u+16 ); ++ tu13 = _mm512_loadu_ps( u+32 ); ++ tu21 = _mm512_loadu_ps( u2 ); ++ tu22 = _mm512_loadu_ps( u2+16 ); ++ tu23 = _mm512_loadu_ps( u2+32 ); ++ ++ ++ tsi1 = _mm512_permute_ps ( ts1, 0b10110001 ); ++ tsi2 = _mm512_permute_ps ( ts2, 0b10110001 ); ++ tsi3 = _mm512_permute_ps ( ts3, 0b10110001 ); ++ ++ ++ tmu = _mm_load_ps1( &mu ); ++ umu = _mm512_broadcastss_ps( tmu ); ++ ++ ++ idx = _mm512_setr_epi32( 0,1,10,11,4,5,2,3, 16,17,26,27,20,21,18,19 ); ++ tu1 = _mm512_permutex2var_ps( tu11, idx, tu21 ); ++ idx = _mm512_setr_epi32( 4,5,0,1,12,13,6,7, 20,21,16,17,28,29,22,23); ++ tu2 = _mm512_permutex2var_ps( tu12, idx, tu22 ); ++ ++ idx = _mm512_setr_epi32( 0,0,1,1,2,3,16+0,16+1, 8,8,9,9,10,11,16+8,16+9 ); ++ tu4 = _mm512_permutex2var_ps( tu1, idx, tu2 ); ++ u512 = _mm512_permute_ps( tu4, 0b10100000 ); ++ tr1 = _mm512_mul_ps( ts1, u512 ); ++ ++ idx = _mm512_setr_epi32( 0,1,2,3,16+5,16+5,16+7,16+7, ++ 8,9,10,11,16+13,16+13,16+15,16+15 ); ++ u512 = _mm512_permutex2var_ps( umu, idx, tu4 ); ++ u512 = _mm512_mul_ps( 
u512, tsi1 ); ++ tr1 = _mm512_mask_add_ps( tr1, 0b1010010110101010, tr1, u512 ); ++ tr1 = _mm512_mask_sub_ps( tr1, 0b0101101001010101, tr1, u512 ); ++ ++ ++ idx = _mm512_setr_epi32( 0,0,4,4,16+4,16+4,16+5,16+5, ++ 8,8,12,12,16+12,16+12,16+13,16+13 ); ++ u512 = _mm512_permutex2var_ps( tu2, idx, tu1 ); ++ tr2 = _mm512_mul_ps( ts2, u512 ); ++ ++ idx = _mm512_setr_epi32( 1,1,5,5,16+4,16+5,16+6,16+7, ++ 9,9,13,13,16+12,16+13,16+14,16+15 ); ++ u512 = _mm512_permutex2var_ps( tu2, idx, umu ); ++ u512 = _mm512_mul_ps( u512, tsi2 ); ++ tr2 = _mm512_mask_add_ps( tr2, 0b0101010110100101, tr2, u512 ); ++ tr2 = _mm512_mask_sub_ps( tr2, 0b1010101001011010, tr2, u512 ); ++ ++ ++ idx = _mm512_setr_epi32( 6,6,2,3,16+4,16+5,7,7, ++ 14,14,10,11,16+12,16+13,15,15 ); ++ tu4 = _mm512_permutex2var_ps( tu1, idx, tu2 ); ++ u512 = _mm512_permute_ps( tu4, 0b10100000 ); ++ tr3 = _mm512_mul_ps( ts3, u512 ); ++ ++ idx = _mm512_setr_epi32( 0,1,16+3,16+3,16+5,16+5,6,7, ++ 8,9,16+11,16+11,16+13,16+13,14,15 ); ++ u512 = _mm512_permutex2var_ps( umu, idx, tu4 ); ++ u512 = _mm512_mul_ps( u512, tsi3 ); ++ tr3 = _mm512_mask_add_ps( tr3, 0b0110010110100110, tr3, u512 ); ++ tr3 = _mm512_mask_sub_ps( tr3, 0b1001101001011001, tr3, u512 ); ++ ++ ++ ++ idx = _mm512_setr_epi32( 8,9,6,7,14,15,12,13, ++ 24,25,22,23,30,31,28,29 ); ++ tu1 = _mm512_permutex2var_ps( tu11, idx, tu21 ); ++ ++ u512 = _mm512_shuffle_ps( tu1, tu2, 0b10101010 ); ++ tr1 = _mm512_fmadd_ps( ts2, u512, tr1 ); ++ ++ u512 = _mm512_shuffle_ps( tu1, tu2, 0b11111111 ); ++ u512 = _mm512_mul_ps( u512, tsi2 ); ++ tr1 = _mm512_mask_add_ps( tr1, 0b1010101010101010, tr1, u512 ); ++ tr1 = _mm512_mask_sub_ps( tr1, 0b0101010101010101, tr1, u512 ); ++ ++ m += 4; ++ _prefetch_pauli_dble(m); ++ m -= 4; ++ ++ idx = _mm512_setr_epi32( 0,1,2,3,0,1,2,3, 16,17,18,19,16,17,18,19 ); ++ tu13 = _mm512_permutex2var_ps( tu13, idx, tu23 ); ++ idx = _mm512_setr_epi32( 6,7,2,3,8,9,14,15, 22,23,18,19,24,25,30,31 ); ++ tu2 = _mm512_permutex2var_ps( tu12, idx, tu22 ); ++ ++ idx = _mm512_setr_epi32( 6,7,16+0,16+1,16+6,16+7,0,0, ++ 14,15,16+8,16+9,16+14,16+15,0,0 ); ++ tu3 = _mm512_permutex2var_ps( tu1, idx, tu2 ); ++ tu4 = _mm512_permute_ps( tu3, 0b10100000 ); ++ u512 = _mm512_mask_shuffle_ps( tu4, 0b1111000011110000, tu4, tu13, 0b10100100 ); ++ tr2 = _mm512_fmadd_ps( ts1, u512, tr2 ); ++ ++ tu4 = _mm512_permute_ps( tu3, 0b11110101 ); ++ u512 = _mm512_mask_shuffle_ps( tu4, 0b1111000011110000, tu4, tu13, 0b11110100 ); ++ u512 = _mm512_mul_ps( u512, tsi1 ); ++ tr2 = _mm512_mask_add_ps( tr2, 0b0101010101010101, tr2, u512 ); ++ tr2 = _mm512_mask_sub_ps( tr2, 0b1010101010101010, tr2, u512 ); ++ ++ tu3 = _mm512_mask_shuffle_ps( tu2, 0b0000111100001111, tu1, tu2, 0b11100100 ); ++ u512 = _mm512_permute_ps( tu3, 0b10100000 ); ++ tr3 = _mm512_fmadd_ps( ts1, u512, tr3 ); ++ ++ u512 = _mm512_permute_ps( tu3, 0b11110101 ); ++ u512 = _mm512_mul_ps( u512, tsi1 ); ++ tr3 = _mm512_mask_add_ps( tr3, 0b1010010110100101, tr3, u512 ); ++ tr3 = _mm512_mask_sub_ps( tr3, 0b0101101001011010, tr3, u512 ); ++ ++ idx = _mm512_setr_epi32( 0,1,2,3, 4,5,16+2,16+3, 8,9,10,11,12,13,16+10,16+11 ); ++ tu3 = _mm512_permutex2var_ps( tu1, idx, tu2 ); ++ u512 = _mm512_permute_ps( tu3, 0b10100000 ); ++ tr1 = _mm512_fmadd_ps( ts3, u512, tr1 ); ++ ++ u512 = _mm512_permute_ps( tu3, 0b11110101 ); ++ u512 = _mm512_mul_ps( u512, tsi3 ); ++ tr1 = _mm512_mask_add_ps( tr1, 0b1010011010100110, tr1, u512 ); ++ tr1 = _mm512_mask_sub_ps( tr1, 0b0101100101011001, tr1, u512 ); ++ ++ ++ idx = _mm512_setr_epi32( 0,1,8,9,10,11,-1,-1, 16,17,24,25,26,27,-1,-1 
); ++ tu2 = _mm512_permutex2var_ps( tu12, idx, tu22 ); ++ ++ idx = _mm512_setr_epi32( 4,5,16+4,16+5,0,0,0,0, 12,13,16+12,16+13,0,0,0,0 ); ++ tu3 = _mm512_permutex2var_ps( tu2, idx, tu1 ); ++ u512 = _mm512_permute_ps( tu3, 0b10100000 ); ++ u512 = _mm512_mask_permute_ps( u512, 0b1111000011110000, tu13, 0b00001010 ); ++ tr2 = _mm512_fmadd_ps( ts3, u512, tr2 ); ++ ++ u512 = _mm512_permute_ps( tu3, 0b11110101 ); ++ u512 = _mm512_mask_permute_ps( u512, 0b1111000011110000, tu13, 0b01011111 ); ++ u512 = _mm512_mul_ps( u512, tsi3 ); ++ tr2 = _mm512_mask_add_ps( tr2, 0b0110010101100101, tr2, u512 ); ++ tr2 = _mm512_mask_sub_ps( tr2, 0b1001101010011010, tr2, u512 ); ++ ++ tu3 = _mm512_mask_shuffle_ps( tu2, 0b1111000011110000, tu2, tu13, 0b01000100 ); ++ u512 = _mm512_permute_ps( tu3, 0b10100000 ); ++ tr3 = _mm512_fmadd_ps( ts2, u512, tr3 ); ++ ++ u512 = _mm512_permute_ps( tu3, 0b11110101 ); ++ u512 = _mm512_mul_ps( u512, tsi2 ); ++ tr3 = _mm512_mask_add_ps( tr3, 0b1010010110100101, tr3, u512 ); ++ tr3 = _mm512_mask_sub_ps( tr3, 0b0101101001011010, tr3, u512 ); ++ ++ ++ ++ idx = _mm512_setr_epi32( 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27 ); ++ ts1 = _mm512_permutex2var_ps( tr1, idx, tr3 ); ++ idx = _mm512_setr_epi32( 4,5,6,7, 20,21,22,23, 12,13,14,15, 28,29,30,31 ); ++ ts2 = _mm512_permutex2var_ps( tr1, idx, tr3 ); ++ ts3 = _mm512_add_ps( ts1, ts2 ); ++ ++ t256 = _mm512_castps512_ps256( ts3 ); ++ _mm256_storeu_ps( &(*r).c1.c1.re, t256 ); ++ ++ t128a = _mm512_castps512_ps128( tr2 ); ++ t128b = _mm512_extractf32x4_ps( tr2, 1 ); ++ t128b = _mm_add_ps( t128a, t128b ); ++ _mm_storeu_ps( &(*r).c2.c2.re, t128b ); ++ ++ t256 = _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd(ts3), 1 ) ); ++ _mm256_storeu_ps( &(*r2).c1.c1.re, t256 ); ++ ++ t128a = _mm512_extractf32x4_ps( tr2, 2 ); ++ t128b = _mm512_extractf32x4_ps( tr2, 3 ); ++ t128b = _mm_add_ps( t128a, t128b ); ++ _mm_storeu_ps( &(*r2).c2.c2.re, t128b ); ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/sw_term/avx512/pauli_dble_avx512.c b/modules/sw_term/avx512/pauli_dble_avx512.c +new file mode 100644 +index 0000000..5eb11fb +--- /dev/null ++++ b/modules/sw_term/avx512/pauli_dble_avx512.c +@@ -0,0 +1,489 @@ ++/******************************************************************************* ++* ++* File pauli_avx512.c ++* ++* This software is distributed under the terms of the GNU General Public ++* License (GPL) ++* ++* AVX512 implementations of the clover term multiplication in ++* double precision. ++* ++* See ../pauli_dble_avx512.c for more information and alternative ++* implementations. 
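*
* The function mul_pauli2_dble() defined below applies two packed 6x6
* Hermitian matrices to the two Weyl components of a spinor.  Up to
* vectorization it is intended to be equivalent to the scalar fallback
* that this patch adds to ../pauli_dble.c, i.e. to
*
*   mul_pauli_dble( mu, m, s, r );
*   mul_pauli_dble( -mu, m+1, s+1, r+1 );
*
* where the first matrix is applied with a +i*mu shift on its diagonal
* and the second one with -i*mu.  As in the scalar code, each matrix is
* stored in the packed openQCD layout, with the six real diagonal
* elements in u[0..5] followed by the re/im pairs of the off-diagonal
* elements.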
++* ++*******************************************************************************/ ++ ++#ifdef AVX512 ++ ++#include ++#include ++#include ++#include ++#include "su3.h" ++#include "linalg.h" ++#include "sw_term.h" ++#define DELTA 1.0e-04 ++ ++typedef union ++{ ++ spinor_dble s; ++ weyl_dble w[2]; ++ complex_dble c[12]; ++} spin_t; ++ ++#include "avx512.h" ++ ++ ++void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r) ++{ ++ double const *u = m->u, *u2 = (m+1)->u; ++ ++ __m512d r1,r2,r3; ++ __m512d s1,s2,s3,s4,s5,s6, si1,si2,si3,si4,si5,si6, s512, u512; ++ __m512d t1,t2,t3,t4,t5,t6; ++ __m512d tu11, tu12, tu13, tu14, tu15, tu21, tu22, tu23, tu24, tu25; ++ __m512d tu1, tu2, tu3, tu4, tu5, tu6; ++ __m512d umu; ++ __m512i idx; ++ ++ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); ++ t2 = _mm512_loadu_pd( &(*s).c1.c1.re + 8 ); ++ t3 = _mm512_loadu_pd( &(*s).c1.c1.re + 16 ); ++ ++ tu11 = _mm512_loadu_pd( u ); ++ tu12 = _mm512_loadu_pd( u+8 ); ++ tu13 = _mm512_loadu_pd( u+16 ); ++ tu14 = _mm512_loadu_pd( u+24 ); ++ tu15 = _mm512_loadu_pd( u+32 ); ++ tu21 = _mm512_loadu_pd( u2 ); ++ tu22 = _mm512_loadu_pd( u2+8 ); ++ tu23 = _mm512_loadu_pd( u2+16 ); ++ tu24 = _mm512_loadu_pd( u2+24 ); ++ tu25 = _mm512_loadu_pd( u2+32 ); ++ ++ umu = _mm512_loadu_pd( &mu ); ++ ++ ++ idx = _mm512_setr_epi64( 0,1,12,13,0,1,12,13 ); ++ s1 = _mm512_permutex2var_pd( t1, idx, t2 ); ++ idx = _mm512_setr_epi64( 2,3,14,15,2,3,14,15 ); ++ s2 = _mm512_permutex2var_pd( t1, idx, t2 ); ++ idx = _mm512_setr_epi64( 4,5,8,9,4,5,8,9 ); ++ s3 = _mm512_permutex2var_pd( t1, idx, t3 ); ++ idx = _mm512_setr_epi64( 0,1,12,13,0,1,12,13 ); ++ s4 = _mm512_permutex2var_pd( t2, idx, t3 ); ++ idx = _mm512_setr_epi64( 6,7,10,11,6,7,10,11 ); ++ s5 = _mm512_permutex2var_pd( t1, idx, t3 ); ++ idx = _mm512_setr_epi64( 2,3,14,15,2,3,14,15 ); ++ s6 = _mm512_permutex2var_pd( t2, idx, t3 ); ++ ++ si1 = _mm512_permute_pd ( s1, 0b01010101 ); ++ si2 = _mm512_permute_pd ( s2, 0b01010101 ); ++ si3 = _mm512_permute_pd ( s3, 0b01010101 ); ++ si4 = _mm512_permute_pd ( s4, 0b01010101 ); ++ si5 = _mm512_permute_pd ( s5, 0b01010101 ); ++ si6 = _mm512_permute_pd ( s6, 0b01010101 ); ++ ++ ++ idx = _mm512_setr_epi64( 0,1,8,9,6,7,14,15 ); ++ tu1 = _mm512_permutex2var_pd( tu11, idx, tu21 ); ++ idx = _mm512_setr_epi64( 4,5,12,13,2,3,10,11 ); ++ tu2 = _mm512_permutex2var_pd( tu12, idx, tu22 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu2 ); ++ r1 = _mm512_mul_pd( u512, s1 ); ++ idx = _mm512_setr_epi64( 0,0,0,0,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( umu, idx, tu2 ); ++ u512 = _mm512_mul_pd( u512, si1 ); ++ r1 = _mm512_mask_add_pd( r1, 0b01010110, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b10101001, r1, u512 ); ++ ++ idx = _mm512_setr_epi64( 2,3,10,11,4,5,12,13 ); ++ tu6 = _mm512_permutex2var_pd( tu11, idx, tu21 ); ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu6 ); ++ r1 = _mm512_fmadd_pd( u512, s5, r1 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8,8,8,8 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, umu ); ++ u512 = _mm512_mul_pd( u512, si5 ); ++ r1 = _mm512_mask_add_pd( r1, 0b01101010, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b10010101, r1, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 2,3,8+2,8+3,4,5,8+4,8+5 ); ++ tu3 = _mm512_permutex2var_pd( tu13, idx, tu23 ); ++ ++ idx = _mm512_setr_epi64( 1,1,3,3,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu3 ); ++ r2 = _mm512_mul_pd( u512, s2 ); ++ idx = 
_mm512_setr_epi64( 0,0,0,0,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( umu, idx, tu3 ); ++ u512 = _mm512_mul_pd( u512, si2 ); ++ r2 = _mm512_mask_add_pd( r2, 0b01010110, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b10101001, r2, u512 ); ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu6 ); ++ r2 = _mm512_fmadd_pd( u512, s4, r2 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8,8,8,8 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, umu ); ++ u512 = _mm512_mul_pd( u512, si4 ); ++ r2 = _mm512_mask_add_pd( r2, 0b01101010, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b10010101, r2, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 4,5,8+4,8+5,6,7,8+6,8+7 ); ++ tu4 = _mm512_permutex2var_pd( tu14, idx, tu24 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+0,8+0,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu6, idx, tu4 ); ++ r3 = _mm512_mul_pd( u512, s3 ); ++ idx = _mm512_setr_epi64( 0,0,0,0,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( umu, idx, tu4 ); ++ u512 = _mm512_mul_pd( u512, si3 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01010110, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 0b10101001, r3, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, tu6 ); ++ r3 = _mm512_fmadd_pd( u512, s6, r3 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8,8,8,8 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, umu ); ++ u512 = _mm512_mul_pd( u512, si6 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01101010, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 0b10010101, r3, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8,8,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu2 ); ++ r2 = _mm512_fmadd_pd( u512, s1, r2 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,9,9,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu2 ); ++ u512 = _mm512_mul_pd( u512, si1 ); ++ r2 = _mm512_mask_add_pd( r2, 0b01010101, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b10101010, r2, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); ++ r1 = _mm512_fmadd_pd( u512, s4, r1 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); ++ u512 = _mm512_mul_pd( u512, si4 ); ++ r1 = _mm512_mask_add_pd( r1, 0b10101010, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b01010101, r1, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu3 ); ++ r1 = _mm512_fmadd_pd( u512, s2, r1 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu1, idx, tu3 ); ++ u512 = _mm512_mul_pd( u512, si2 ); ++ r1 = _mm512_mask_add_pd( r1, 0b01011010, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b10100101, r1, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); ++ r2 = _mm512_fmadd_pd( u512, s5, r2 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); ++ u512 = _mm512_mul_pd( u512, si5 ); ++ r2 = _mm512_mask_add_pd( r2, 0b01011010, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b10100101, r2, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,8,8,6,6,14,14 ); ++ u512 = _mm512_permutex2var_pd( tu12, idx, tu22 ); ++ r3 = _mm512_fmadd_pd( u512, s1, r3 ); ++ idx = _mm512_setr_epi64( 1,1,9,9,7,7,15,15 ); ++ u512 = _mm512_permutex2var_pd( tu12, idx, tu22 ); ++ u512 = _mm512_mul_pd( u512, si1 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01010101, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 
0b10101010, r3, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 0,0,8,8,6,6,14,14 ); ++ u512 = _mm512_permutex2var_pd( tu13, idx, tu23 ); ++ r3 = _mm512_fmadd_pd( u512, s2, r3 ); ++ idx = _mm512_setr_epi64( 1,1,9,9,7,7,15,15 ); ++ u512 = _mm512_permutex2var_pd( tu13, idx, tu23 ); ++ u512 = _mm512_mul_pd( u512, si2 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01010101, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 0b10101010, r3, u512 ); ++ ++ ++ ++ idx = _mm512_setr_epi64( 0,1,8+0,8+1,6,7,8+6,8+7 ); ++ tu2 = _mm512_permutex2var_pd( tu12, idx, tu22 ); ++ idx = _mm512_setr_epi64( 2,3,8+2,8+3,0,1,8+0,8+1 ); ++ tu4 = _mm512_permutex2var_pd( tu14, idx, tu24 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); ++ r1 = _mm512_fmadd_pd( u512, s3, r1 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); ++ u512 = _mm512_mul_pd( u512, si3 ); ++ r1 = _mm512_mask_add_pd( r1, 0b01011010, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b10100101, r1, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 0,1,8+0,8+1,2,3,8+2,8+3 ); ++ tu5 = _mm512_permutex2var_pd( tu15, idx, tu25 ); ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); ++ r3 = _mm512_fmadd_pd( u512, s5, r3 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); ++ u512 = _mm512_mul_pd( u512, si5 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01011010, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 0b10100101, r3, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); ++ r3 = _mm512_fmadd_pd( u512, s4, r3 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); ++ u512 = _mm512_mul_pd( u512, si4 ); ++ r3 = _mm512_mask_add_pd( r3, 0b01011010, r3, u512 ); ++ r3 = _mm512_mask_sub_pd( r3, 0b10100101, r3, u512 ); ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu5 ); ++ r1 = _mm512_fmadd_pd( u512, s6, r1 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu2, idx, tu5 ); ++ u512 = _mm512_mul_pd( u512, si6 ); ++ r1 = _mm512_mask_add_pd( r1, 0b10101010, r1, u512 ); ++ r1 = _mm512_mask_sub_pd( r1, 0b01010101, r1, u512 ); ++ ++ ++ idx = _mm512_setr_epi64( 6,7,8+6,8+7,0,1,8+0,8+1 ); ++ tu3 = _mm512_permutex2var_pd( tu13, idx, tu23 ); ++ ++ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); ++ r2 = _mm512_fmadd_pd( u512, s3, r2 ); ++ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); ++ u512 = _mm512_mul_pd( u512, si3 ); ++ r2 = _mm512_mask_add_pd( r2, 0b01011010, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b10100101, r2, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu5 ); ++ r2 = _mm512_fmadd_pd( u512, s6, r2 ); ++ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); ++ u512 = _mm512_permutex2var_pd( tu3, idx, tu5 ); ++ u512 = _mm512_mul_pd( u512, si6 ); ++ r2 = _mm512_mask_add_pd( r2, 0b10101010, r2, u512 ); ++ r2 = _mm512_mask_sub_pd( r2, 0b01010101, r2, u512 ); ++ ++ idx = _mm512_setr_epi64( 0,1,8,9,2,3,10,11 ); ++ t1 = _mm512_permutex2var_pd( r1, idx, r2 ); ++ idx = _mm512_setr_epi64( 2,3,14,15,0,1,12,13 ); ++ t2 = _mm512_permutex2var_pd( r3, idx, r1 ); ++ idx = _mm512_setr_epi64( 4,5,12,13,6,7,14,15 
); ++ t3 = _mm512_permutex2var_pd( r2, idx, r3 ); ++ r1 = _mm512_mask_blend_pd( 0b11110000, t1, t2 ); ++ r2 = _mm512_mask_blend_pd( 0b11110000, t3, t1 ); ++ r3 = _mm512_mask_blend_pd( 0b11110000, t2, t3 ); ++ ++ _mm512_storeu_pd( &r[0].c1.c1.re, r1 ); ++ _mm512_storeu_pd( &r[0].c2.c2.re, r2 ); ++ _mm512_storeu_pd( &r[1].c1.c3.re, r3 ); ++} ++ ++ ++int fwd_house_avx512(double eps, complex_dble *aa, complex_dble *dd, double * rr ) ++{ ++ int i, j, k, ifail; ++ double r1, r2, r3; ++ complex_dble z; ++ ++ ifail = 0; ++ ++ for (k = 0; k < 5; k++) { ++ r1 = aa[6 * k + k].re * aa[6 * k + k].re + ++ aa[6 * k + k].im * aa[6 * k + k].im; ++ r2 = sqrt(r1); ++ ++ for (j = (k + 1); j < 6; j++) ++ r1 += (aa[6 * j + k].re * aa[6 * j + k].re + ++ aa[6 * j + k].im * aa[6 * j + k].im); ++ ++ if (r1 >= eps) ++ r1 = sqrt(r1); ++ else { ++ ifail = 1; ++ r1 = 1.0; ++ } ++ ++ if (r2 >= (DBL_EPSILON * r1)) { ++ r3 = 1.0 / r2; ++ z.re = r3 * aa[6 * k + k].re; ++ z.im = r3 * aa[6 * k + k].im; ++ } else { ++ z.re = 1.0; ++ z.im = 0.0; ++ } ++ ++ aa[6 * k + k].re += r1 * z.re; ++ aa[6 * k + k].im += r1 * z.im; ++ ++ r3 = 1.0 / (r1 * (r1 + r2)); ++ rr[k] = r3; ++ dd[k].re = -(r1 + r2) * r3 * z.re; ++ dd[k].im = (r1 + r2) * r3 * z.im; ++ ++ for (j = (k + 1); j < 6; j++) { ++ complex_dble z, *ak, *aj; ++ __m128d mz, t1, t2, t3; ++ mz = _mm_setzero_pd(); ++ ++ ak = aa + 6 * k + k; ++ aj = aa + 6 * k + j; ++ ++ for (i = k; i < 6; i++) { ++ t1 = _mm_loaddup_pd(&ak->re); ++ t2 = _mm_loaddup_pd(&ak->im); ++ t3 = _mm_load_pd(&aj->re); ++ t2 = _mm_mul_pd( t2, t3 ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmsubadd_pd( t1, t3, t2 ); ++ mz = _mm_add_pd( mz, t1 ); ++ ak += 6; ++ aj += 6; ++ } ++ ++ t1 = _mm_loaddup_pd(&r3); ++ mz = _mm_mul_pd( mz, t1 ); ++ _mm_storeu_pd( &z.re, mz ); ++ ++ ak = aa + 6 * k + k; ++ aj = aa + 6 * k + j; ++ for (i = k; i < 6; i++) { ++ t1 = _mm_loaddup_pd(&ak->re); ++ t2 = _mm_loaddup_pd(&ak->im); ++ t3 = _mm_load_pd(&aj->re); ++ t2 = _mm_mul_pd( mz, t2 ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmaddsub_pd( mz,t1, t2 ); ++ t3 = _mm_sub_pd( t3, t1 ); ++ _mm_storeu_pd( &aj->re, t3 ); ++ ak += 6; ++ aj += 6; ++ } ++ } ++ } ++ ++ r1 = aa[35].re * aa[35].re + aa[35].im * aa[35].im; ++ ++ if (r1 >= eps) ++ r1 = 1.0 / r1; ++ else { ++ ifail = 1; ++ r1 = 1.0; ++ } ++ ++ dd[5].re = r1 * aa[35].re; ++ dd[5].im = -r1 * aa[35].im; ++ ++ return ifail; ++} ++ ++ ++void solv_sys_avx512( complex_dble *aa, complex_dble *dd ) ++{ ++ int i, j, k; ++ complex_dble z; ++ __m128d mz, t1, t2, t3; ++ ++ for (k = 5; k > 0; k--) { ++ for (i = (k - 1); i >= 0; i--) { ++ t1 = _mm_loaddup_pd(&aa[6 * i + k].re); ++ t2 = _mm_loaddup_pd(&aa[6 * i + k].im); ++ t3 = _mm_load_pd(&dd[k].re); ++ t2 = _mm_mul_pd( t2, t3 ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ mz = _mm_fmaddsub_pd( t1, t3, t2 ); ++ ++ for (j = (k - 1); j > i; j--) { ++ t1 = _mm_loaddup_pd(&aa[6 * i + j].re); ++ t2 = _mm_loaddup_pd(&aa[6 * i + j].im); ++ t3 = _mm_load_pd(&aa[6 * j + k].re); ++ t2 = _mm_mul_pd( t2, t3 ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmaddsub_pd( t1, t3, t2 ); ++ mz = _mm_add_pd( mz, t1 ); ++ } ++ ++ t1 = _mm_loaddup_pd(&dd[i].re); ++ t2 = _mm_loaddup_pd(&dd[i].im); ++ t2 = _mm_mul_pd( t2, mz ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmaddsub_pd( t1, mz, t2 ); ++ t1 = _mm_sub_pd( _mm_setzero_pd(), t1); /* this line flips the sign of t1 */ ++ _mm_storeu_pd( &aa[6 * i + k].re, t1 ); ++ } ++ } ++} ++ ++void bck_house_avx512( complex_dble *aa, complex_dble *dd, double * rr ) ++{ ++ int i, j, k; ++ complex_dble z; ++ 
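  /*
   * The complex multiply-accumulate steps below (and in
   * fwd_house_avx512() and solv_sys_avx512() above) use a standard SSE
   * idiom for the product z = a*b of two complex doubles held as
   * [re,im] pairs in 128-bit registers, roughly:
   *
   *   t1 = _mm_loaddup_pd( &a.re );        t1 = [ a.re, a.re ]
   *   t2 = _mm_loaddup_pd( &a.im );        t2 = [ a.im, a.im ]
   *   t3 = _mm_load_pd( &b.re );           t3 = [ b.re, b.im ]
   *   t2 = _mm_mul_pd( t2, t3 );           t2 = [ a.im*b.re, a.im*b.im ]
   *   t2 = _mm_permute_pd( t2, 0b01 );     t2 = [ a.im*b.im, a.im*b.re ]
   *   t1 = _mm_fmaddsub_pd( t1, t3, t2 );  t1 = [ a.re*b.re - a.im*b.im,
   *                                               a.re*b.im + a.im*b.re ]
   *
   * so that t1 finally holds a*b.  Replacing the last step by
   * _mm_fmsubadd_pd() yields conj(a)*b instead, the variant used in
   * fwd_house_avx512() when accumulating the scalar product of two
   * matrix columns.
   */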
++ aa[35].re = dd[5].re; ++ aa[35].im = dd[5].im; ++ ++ for (k = 4; k >= 0; k--) { ++ z.re = dd[k].re; ++ z.im = dd[k].im; ++ dd[k].re = aa[6 * k + k].re; ++ dd[k].im = aa[6 * k + k].im; ++ aa[6 * k + k].re = z.re; ++ aa[6 * k + k].im = z.im; ++ ++ for (j = (k + 1); j < 6; j++) { ++ dd[j].re = aa[6 * j + k].re; ++ dd[j].im = aa[6 * j + k].im; ++ aa[6 * j + k].re = 0.0; ++ aa[6 * j + k].im = 0.0; ++ } ++ ++ for (i = 0; i < 6; i++) { ++ __m128d mz, t1, t2, t3; ++ mz = _mm_setzero_pd(); ++ ++ for (j = k; j < 6; j++) { ++ t1 = _mm_loaddup_pd(&aa[6 * i + j].re); ++ t2 = _mm_loaddup_pd(&aa[6 * i + j].im); ++ t3 = _mm_load_pd(&dd[j].re); ++ t2 = _mm_mul_pd( t2, t3 ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmaddsub_pd( t1, t3, t2 ); ++ mz = _mm_add_pd( mz, t1 ); ++ } ++ ++ t1 = _mm_loaddup_pd( rr+k ); ++ mz = _mm_mul_pd( mz, t1 ); ++ ++ for (j = k; j < 6; j++) { ++ t1 = _mm_loaddup_pd( &dd[j].re ); ++ t2 = _mm_loaddup_pd( &dd[j].im ); ++ t2 = _mm_mul_pd( t2, mz ); ++ t2 = _mm_permute_pd( t2, 0b01 ); ++ t1 = _mm_fmsubadd_pd( t1, mz, t2 ); ++ ++ t2 = _mm_load_pd( &aa[6 * i + j].re ); ++ t1 = _mm_sub_pd( t2, t1 ); ++ _mm_storeu_pd( &aa[6 * i + j].re, t1 ); ++ } ++ } ++ } ++} ++ ++#endif +\ No newline at end of file +diff --git a/modules/sw_term/pauli.c b/modules/sw_term/pauli.c +index 7c8a7b6..8e14bee 100644 +--- a/modules/sw_term/pauli.c ++++ b/modules/sw_term/pauli.c +@@ -409,7 +409,96 @@ void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) + "xmm6", "xmm7"); + } + +-#if (defined AVX) ++ ++#else ++ ++static weyl rs; ++ ++void mul_pauli(float mu, pauli *m, weyl *s, weyl *r) ++{ ++ float const *u; ++ ++ u = (*m).u; ++ ++ rs.c1.c1.re = ++ u[0] * (*s).c1.c1.re - mu * (*s).c1.c1.im + u[6] * (*s).c1.c2.re - ++ u[7] * (*s).c1.c2.im + u[8] * (*s).c1.c3.re - u[9] * (*s).c1.c3.im + ++ u[10] * (*s).c2.c1.re - u[11] * (*s).c2.c1.im + u[12] * (*s).c2.c2.re - ++ u[13] * (*s).c2.c2.im + u[14] * (*s).c2.c3.re - u[15] * (*s).c2.c3.im; ++ ++ rs.c1.c1.im = ++ u[0] * (*s).c1.c1.im + mu * (*s).c1.c1.re + u[6] * (*s).c1.c2.im + ++ u[7] * (*s).c1.c2.re + u[8] * (*s).c1.c3.im + u[9] * (*s).c1.c3.re + ++ u[10] * (*s).c2.c1.im + u[11] * (*s).c2.c1.re + u[12] * (*s).c2.c2.im + ++ u[13] * (*s).c2.c2.re + u[14] * (*s).c2.c3.im + u[15] * (*s).c2.c3.re; ++ ++ rs.c1.c2.re = ++ u[6] * (*s).c1.c1.re + u[7] * (*s).c1.c1.im + u[1] * (*s).c1.c2.re - ++ mu * (*s).c1.c2.im + u[16] * (*s).c1.c3.re - u[17] * (*s).c1.c3.im + ++ u[18] * (*s).c2.c1.re - u[19] * (*s).c2.c1.im + u[20] * (*s).c2.c2.re - ++ u[21] * (*s).c2.c2.im + u[22] * (*s).c2.c3.re - u[23] * (*s).c2.c3.im; ++ ++ rs.c1.c2.im = ++ u[6] * (*s).c1.c1.im - u[7] * (*s).c1.c1.re + u[1] * (*s).c1.c2.im + ++ mu * (*s).c1.c2.re + u[16] * (*s).c1.c3.im + u[17] * (*s).c1.c3.re + ++ u[18] * (*s).c2.c1.im + u[19] * (*s).c2.c1.re + u[20] * (*s).c2.c2.im + ++ u[21] * (*s).c2.c2.re + u[22] * (*s).c2.c3.im + u[23] * (*s).c2.c3.re; ++ ++ rs.c1.c3.re = ++ u[8] * (*s).c1.c1.re + u[9] * (*s).c1.c1.im + u[16] * (*s).c1.c2.re + ++ u[17] * (*s).c1.c2.im + u[2] * (*s).c1.c3.re - mu * (*s).c1.c3.im + ++ u[24] * (*s).c2.c1.re - u[25] * (*s).c2.c1.im + u[26] * (*s).c2.c2.re - ++ u[27] * (*s).c2.c2.im + u[28] * (*s).c2.c3.re - u[29] * (*s).c2.c3.im; ++ ++ rs.c1.c3.im = ++ u[8] * (*s).c1.c1.im - u[9] * (*s).c1.c1.re + u[16] * (*s).c1.c2.im - ++ u[17] * (*s).c1.c2.re + u[2] * (*s).c1.c3.im + mu * (*s).c1.c3.re + ++ u[24] * (*s).c2.c1.im + u[25] * (*s).c2.c1.re + u[26] * (*s).c2.c2.im + ++ u[27] * (*s).c2.c2.re + u[28] * (*s).c2.c3.im + u[29] * (*s).c2.c3.re; ++ ++ rs.c2.c1.re = ++ u[10] * 
(*s).c1.c1.re + u[11] * (*s).c1.c1.im + u[18] * (*s).c1.c2.re + ++ u[19] * (*s).c1.c2.im + u[24] * (*s).c1.c3.re + u[25] * (*s).c1.c3.im + ++ u[3] * (*s).c2.c1.re - mu * (*s).c2.c1.im + u[30] * (*s).c2.c2.re - ++ u[31] * (*s).c2.c2.im + u[32] * (*s).c2.c3.re - u[33] * (*s).c2.c3.im; ++ ++ rs.c2.c1.im = ++ u[10] * (*s).c1.c1.im - u[11] * (*s).c1.c1.re + u[18] * (*s).c1.c2.im - ++ u[19] * (*s).c1.c2.re + u[24] * (*s).c1.c3.im - u[25] * (*s).c1.c3.re + ++ u[3] * (*s).c2.c1.im + mu * (*s).c2.c1.re + u[30] * (*s).c2.c2.im + ++ u[31] * (*s).c2.c2.re + u[32] * (*s).c2.c3.im + u[33] * (*s).c2.c3.re; ++ ++ rs.c2.c2.re = ++ u[12] * (*s).c1.c1.re + u[13] * (*s).c1.c1.im + u[20] * (*s).c1.c2.re + ++ u[21] * (*s).c1.c2.im + u[26] * (*s).c1.c3.re + u[27] * (*s).c1.c3.im + ++ u[30] * (*s).c2.c1.re + u[31] * (*s).c2.c1.im + u[4] * (*s).c2.c2.re - ++ mu * (*s).c2.c2.im + u[34] * (*s).c2.c3.re - u[35] * (*s).c2.c3.im; ++ ++ rs.c2.c2.im = ++ u[12] * (*s).c1.c1.im - u[13] * (*s).c1.c1.re + u[20] * (*s).c1.c2.im - ++ u[21] * (*s).c1.c2.re + u[26] * (*s).c1.c3.im - u[27] * (*s).c1.c3.re + ++ u[30] * (*s).c2.c1.im - u[31] * (*s).c2.c1.re + u[4] * (*s).c2.c2.im + ++ mu * (*s).c2.c2.re + u[34] * (*s).c2.c3.im + u[35] * (*s).c2.c3.re; ++ ++ rs.c2.c3.re = ++ u[14] * (*s).c1.c1.re + u[15] * (*s).c1.c1.im + u[22] * (*s).c1.c2.re + ++ u[23] * (*s).c1.c2.im + u[28] * (*s).c1.c3.re + u[29] * (*s).c1.c3.im + ++ u[32] * (*s).c2.c1.re + u[33] * (*s).c2.c1.im + u[34] * (*s).c2.c2.re + ++ u[35] * (*s).c2.c2.im + u[5] * (*s).c2.c3.re - mu * (*s).c2.c3.im; ++ ++ rs.c2.c3.im = ++ u[14] * (*s).c1.c1.im - u[15] * (*s).c1.c1.re + u[22] * (*s).c1.c2.im - ++ u[23] * (*s).c1.c2.re + u[28] * (*s).c1.c3.im - u[29] * (*s).c1.c3.re + ++ u[32] * (*s).c2.c1.im - u[33] * (*s).c2.c1.re + u[34] * (*s).c2.c2.im - ++ u[35] * (*s).c2.c2.re + u[5] * (*s).c2.c3.im + mu * (*s).c2.c3.re; ++ ++ (*r) = rs; ++} ++ ++#endif ++ ++#ifndef AVX512 ++#ifdef AVX + #include "avx.h" + + void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) +@@ -952,128 +1041,6 @@ void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) + } + + #endif +-#else +- +-static weyl rs; +- +- +-void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) +-{ +- float *u; +- +- u=(*m).u; +- +- rs.c1.c1.re= +- u[ 0]*(*s).c1.c1.re- mu*(*s).c1.c1.im+ +- u[ 6]*(*s).c1.c2.re-u[ 7]*(*s).c1.c2.im+ +- u[ 8]*(*s).c1.c3.re-u[ 9]*(*s).c1.c3.im+ +- u[10]*(*s).c2.c1.re-u[11]*(*s).c2.c1.im+ +- u[12]*(*s).c2.c2.re-u[13]*(*s).c2.c2.im+ +- u[14]*(*s).c2.c3.re-u[15]*(*s).c2.c3.im; +- +- rs.c1.c1.im= +- u[ 0]*(*s).c1.c1.im+ mu*(*s).c1.c1.re+ +- u[ 6]*(*s).c1.c2.im+u[ 7]*(*s).c1.c2.re+ +- u[ 8]*(*s).c1.c3.im+u[ 9]*(*s).c1.c3.re+ +- u[10]*(*s).c2.c1.im+u[11]*(*s).c2.c1.re+ +- u[12]*(*s).c2.c2.im+u[13]*(*s).c2.c2.re+ +- u[14]*(*s).c2.c3.im+u[15]*(*s).c2.c3.re; +- +- rs.c1.c2.re= +- u[ 6]*(*s).c1.c1.re+u[ 7]*(*s).c1.c1.im+ +- u[ 1]*(*s).c1.c2.re- mu*(*s).c1.c2.im+ +- u[16]*(*s).c1.c3.re-u[17]*(*s).c1.c3.im+ +- u[18]*(*s).c2.c1.re-u[19]*(*s).c2.c1.im+ +- u[20]*(*s).c2.c2.re-u[21]*(*s).c2.c2.im+ +- u[22]*(*s).c2.c3.re-u[23]*(*s).c2.c3.im; +- +- rs.c1.c2.im= +- u[ 6]*(*s).c1.c1.im-u[ 7]*(*s).c1.c1.re+ +- u[ 1]*(*s).c1.c2.im+ mu*(*s).c1.c2.re+ +- u[16]*(*s).c1.c3.im+u[17]*(*s).c1.c3.re+ +- u[18]*(*s).c2.c1.im+u[19]*(*s).c2.c1.re+ +- u[20]*(*s).c2.c2.im+u[21]*(*s).c2.c2.re+ +- u[22]*(*s).c2.c3.im+u[23]*(*s).c2.c3.re; +- +- rs.c1.c3.re= +- u[ 8]*(*s).c1.c1.re+u[ 9]*(*s).c1.c1.im+ +- u[16]*(*s).c1.c2.re+u[17]*(*s).c1.c2.im+ +- u[ 2]*(*s).c1.c3.re- mu*(*s).c1.c3.im+ +- u[24]*(*s).c2.c1.re-u[25]*(*s).c2.c1.im+ +- 
u[26]*(*s).c2.c2.re-u[27]*(*s).c2.c2.im+ +- u[28]*(*s).c2.c3.re-u[29]*(*s).c2.c3.im; +- +- rs.c1.c3.im= +- u[ 8]*(*s).c1.c1.im-u[ 9]*(*s).c1.c1.re+ +- u[16]*(*s).c1.c2.im-u[17]*(*s).c1.c2.re+ +- u[ 2]*(*s).c1.c3.im+ mu*(*s).c1.c3.re+ +- u[24]*(*s).c2.c1.im+u[25]*(*s).c2.c1.re+ +- u[26]*(*s).c2.c2.im+u[27]*(*s).c2.c2.re+ +- u[28]*(*s).c2.c3.im+u[29]*(*s).c2.c3.re; +- +- rs.c2.c1.re= +- u[10]*(*s).c1.c1.re+u[11]*(*s).c1.c1.im+ +- u[18]*(*s).c1.c2.re+u[19]*(*s).c1.c2.im+ +- u[24]*(*s).c1.c3.re+u[25]*(*s).c1.c3.im+ +- u[ 3]*(*s).c2.c1.re- mu*(*s).c2.c1.im+ +- u[30]*(*s).c2.c2.re-u[31]*(*s).c2.c2.im+ +- u[32]*(*s).c2.c3.re-u[33]*(*s).c2.c3.im; +- +- rs.c2.c1.im= +- u[10]*(*s).c1.c1.im-u[11]*(*s).c1.c1.re+ +- u[18]*(*s).c1.c2.im-u[19]*(*s).c1.c2.re+ +- u[24]*(*s).c1.c3.im-u[25]*(*s).c1.c3.re+ +- u[ 3]*(*s).c2.c1.im+ mu*(*s).c2.c1.re+ +- u[30]*(*s).c2.c2.im+u[31]*(*s).c2.c2.re+ +- u[32]*(*s).c2.c3.im+u[33]*(*s).c2.c3.re; +- +- rs.c2.c2.re= +- u[12]*(*s).c1.c1.re+u[13]*(*s).c1.c1.im+ +- u[20]*(*s).c1.c2.re+u[21]*(*s).c1.c2.im+ +- u[26]*(*s).c1.c3.re+u[27]*(*s).c1.c3.im+ +- u[30]*(*s).c2.c1.re+u[31]*(*s).c2.c1.im+ +- u[ 4]*(*s).c2.c2.re- mu*(*s).c2.c2.im+ +- u[34]*(*s).c2.c3.re-u[35]*(*s).c2.c3.im; +- +- rs.c2.c2.im= +- u[12]*(*s).c1.c1.im-u[13]*(*s).c1.c1.re+ +- u[20]*(*s).c1.c2.im-u[21]*(*s).c1.c2.re+ +- u[26]*(*s).c1.c3.im-u[27]*(*s).c1.c3.re+ +- u[30]*(*s).c2.c1.im-u[31]*(*s).c2.c1.re+ +- u[ 4]*(*s).c2.c2.im+ mu*(*s).c2.c2.re+ +- u[34]*(*s).c2.c3.im+u[35]*(*s).c2.c3.re; +- +- rs.c2.c3.re= +- u[14]*(*s).c1.c1.re+u[15]*(*s).c1.c1.im+ +- u[22]*(*s).c1.c2.re+u[23]*(*s).c1.c2.im+ +- u[28]*(*s).c1.c3.re+u[29]*(*s).c1.c3.im+ +- u[32]*(*s).c2.c1.re+u[33]*(*s).c2.c1.im+ +- u[34]*(*s).c2.c2.re+u[35]*(*s).c2.c2.im+ +- u[ 5]*(*s).c2.c3.re- mu*(*s).c2.c3.im; +- +- rs.c2.c3.im= +- u[14]*(*s).c1.c1.im-u[15]*(*s).c1.c1.re+ +- u[22]*(*s).c1.c2.im-u[23]*(*s).c1.c2.re+ +- u[28]*(*s).c1.c3.im-u[29]*(*s).c1.c3.re+ +- u[32]*(*s).c2.c1.im-u[33]*(*s).c2.c1.re+ +- u[34]*(*s).c2.c2.im-u[35]*(*s).c2.c2.re+ +- u[ 5]*(*s).c2.c3.im+ mu*(*s).c2.c3.re; +- +- (*r)=rs; +-} +- +- +-void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) +-{ +- spin_t *ps,*pr; +- +- ps=(spin_t*)(s); +- pr=(spin_t*)(r); +- +- mul_pauli(mu,m,(*ps).w,(*pr).w); +- mul_pauli(-mu,m+1,(*ps).w+1,(*pr).w+1); +-} +- + #endif + + void assign_pauli(int vol,pauli_dble *md,pauli *m) +diff --git a/modules/sw_term/pauli_dble.c b/modules/sw_term/pauli_dble.c +index 39be1d5..65f7376 100644 +--- a/modules/sw_term/pauli_dble.c ++++ b/modules/sw_term/pauli_dble.c +@@ -87,6 +87,39 @@ static complex_dble aa[36] ALIGNED16; + static complex_dble cc[6] ALIGNED16; + static complex_dble dd[6] ALIGNED16; + ++ ++ ++#if (defined AVX512) ++ ++int fwd_house_avx512(double eps, complex_dble *aa, complex_dble *dd, double * rr ); ++static int fwd_house(double eps ){ ++ return fwd_house_avx512( eps, aa, dd, rr ); ++} ++ ++void solv_sys_avx512( complex_dble *aa, complex_dble *dd ); ++static void solv_sys(void){ ++ solv_sys_avx512( aa, dd ); ++} ++ ++void bck_house_avx512( complex_dble *aa, complex_dble *dd, double * rr ); ++static void bck_house(void){ ++ bck_house_avx512( aa, dd, rr ); ++} ++ ++#else ++ ++void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r) ++{ ++ mul_pauli_dble( mu, m, s, r ); ++ mul_pauli_dble( -mu, m+1, s+1, r+1 ); ++} ++#endif ++ ++ ++ ++ ++ ++ + #if (defined x64) + #include "sse2.h" + +@@ -997,6 +1030,7 @@ void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) + + #endif + ++#ifndef AVX512 + static int fwd_house(double eps) + 
{ + int i,j,k,ifail; +@@ -1313,10 +1347,12 @@ static void bck_house(void) + } + } + ++#endif ++ + #else + +-static weyl_dble rs; + ++static weyl_dble rs; + + void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) + { +@@ -1423,7 +1459,7 @@ void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) + (*r)=rs; + } + +- ++#ifndef AVX512 + static int fwd_house(double eps) + { + int i,j,k,ifail; +@@ -1582,6 +1618,8 @@ static void bck_house(void) + + #endif + ++#endif ++ + static double set_aa(double mu,pauli_dble *m) + { + int i,j; +@@ -1780,10 +1818,8 @@ void apply_sw_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, + + for (;ps