From 855f2b86fff8b51c9b61a0e487135c50fa8ae987 Mon Sep 17 00:00:00 2001
From: "j.m.o.rantaharju"
Date: Mon, 8 Oct 2018 14:59:10 +0100
Subject: [PATCH] Removing assembly files

Compilation of the AVX512 code with ICC is now possible.
---
 AVX512_patch                                 | 9978 -----------------
 README-AVX512                                |    2 -
 devel/dirac/Makefile                         |    2 +-
 devel/linalg/Makefile                        |   27 +-
 devel/sw_term/Makefile                       |   36 +-
 main/Makefile                                |   38 +-
 modules/dirac/avx512/Dw_avx512.c             |    4 +
 modules/dirac/avx512/Dw_avx512_asm.s         | 1064 --
 modules/dirac/avx512/Dw_dble_avx512.c        |    5 +
 modules/dirac/avx512/Dw_dble_avx512_asm.s    | 1306 ---
 modules/linalg/avx512/salg_avx512.c          |   30 +-
 modules/linalg/avx512/salg_avx512_asm.s      |  230 -
 modules/linalg/avx512/salg_dble_avx512.c     |   23 +-
 modules/linalg/avx512/salg_dble_avx512_asm.s |  768 --
 modules/linalg/salg.c                        |   36 +-
 modules/linalg/salg_dble.c                   |   43 -
 modules/sw_term/avx512/pauli_avx512.c        |    7 +-
 modules/sw_term/avx512/pauli_avx512_asm.s    |  295 -
 modules/sw_term/avx512/pauli_dble_avx512.c   |    8 +-
 .../sw_term/avx512/pauli_dble_avx512_asm.s   | 1235 --
 modules/sw_term/pauli.c                      |   14 +-
 modules/sw_term/pauli_dble.c                 |    6 -
 22 files changed, 84 insertions(+), 15073 deletions(-)
 delete mode 100644 AVX512_patch
 delete mode 100644 modules/dirac/avx512/Dw_avx512_asm.s
 delete mode 100644 modules/dirac/avx512/Dw_dble_avx512_asm.s
 delete mode 100644 modules/linalg/avx512/salg_avx512_asm.s
 delete mode 100644 modules/linalg/avx512/salg_dble_avx512_asm.s
 delete mode 100644 modules/sw_term/avx512/pauli_avx512_asm.s
 delete mode 100644 modules/sw_term/avx512/pauli_dble_avx512_asm.s

diff --git a/AVX512_patch b/AVX512_patch
deleted file mode 100644
index 5effb90..0000000
--- a/AVX512_patch
+++ /dev/null
@@ -1,9978 +0,0 @@
-From e5490198de82ef164b0448bff2b330519ece823d Mon Sep 17 00:00:00 2001
-From: "j.m.o.rantaharju"
-Date: Thu, 19 Apr 2018 14:39:10 +0100
-Subject: [PATCH] This patch extends openQCD-1.6 with an implementation of the
- Dirac operator with intel Intrincic operations in order to use the full
- vector width on Intel Skylake, Knights Landing and other processors.
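Which kernels end up in a build is decided at compile time by the -DAVX512 and -DAVX512_ASM flags described in README-AVX512 below (the latter selected the pre-generated assembly that this commit removes). The stand-alone probe below is not part of openQCD; the file name and messages are illustrative only, but the macro names are the ones the Makefiles test for. Compiling it with the same CFLAGS, e.g. "mpicc -O2 -DAVX512 avx512_probe.c", reports which path that setting would select.

  /* avx512_probe.c -- illustrative only, not part of openQCD.
     Reports which code path a given set of -D flags selects. */
  #include <stdio.h>

  int main(void)
  {
  #if (defined AVX512_ASM)
     printf("pre-generated AVX512 assembly kernels selected\n");
  #elif (defined AVX512)
     printf("AVX512 intrinsics kernels selected\n");
  #else
     printf("generic C kernels selected\n");
  #endif
     return 0;
  }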
- ---- - CITATION.cff | 39 + - README-AVX512 | 26 + - devel/archive/Makefile | 37 +- - devel/block/Makefile | 36 +- - devel/dfl/Makefile | 36 +- - devel/dirac/Makefile | 39 +- - devel/forces/Makefile | 36 +- - devel/linalg/Makefile | 32 +- - devel/little/Makefile | 36 +- - devel/sap/Makefile | 36 +- - devel/sflds/Makefile | 34 +- - devel/sw_term/Makefile | 37 +- - devel/update/Makefile | 37 +- - devel/vflds/Makefile | 33 +- - include/avx512.h | 978 ++++++++++++ - include/sw_term.h | 1 + - main/Makefile | 50 +- - modules/dirac/Dw.c | 22 +- - modules/dirac/Dw_dble.c | 74 +- - modules/dirac/avx512/Dw_avx512.c | 217 +++ - modules/dirac/avx512/Dw_avx512_asm.s | 1064 ++++++++++++++ - modules/dirac/avx512/Dw_dble_avx512.c | 256 ++++ - modules/dirac/avx512/Dw_dble_avx512_asm.s | 1306 +++++++++++++++++ - modules/linalg/avx512/salg_avx512.c | 138 ++ - modules/linalg/avx512/salg_avx512_asm.s | 230 +++ - modules/linalg/avx512/salg_dble_avx512.c | 391 +++++ - modules/linalg/avx512/salg_dble_avx512_asm.s | 768 ++++++++++ - modules/linalg/salg.c | 116 +- - modules/linalg/salg_dble.c | 208 ++- - modules/sw_term/avx512/pauli_avx512.c | 230 +++ - modules/sw_term/avx512/pauli_avx512_asm.s | 295 ++++ - modules/sw_term/avx512/pauli_dble_avx512.c | 483 ++++++ - .../sw_term/avx512/pauli_dble_avx512_asm.s | 1235 ++++++++++++++++ - modules/sw_term/pauli.c | 223 ++- - modules/sw_term/pauli_dble.c | 54 +- - 35 files changed, 8513 insertions(+), 320 deletions(-) - create mode 100644 CITATION.cff - create mode 100644 README-AVX512 - create mode 100644 include/avx512.h - create mode 100644 modules/dirac/avx512/Dw_avx512.c - create mode 100644 modules/dirac/avx512/Dw_avx512_asm.s - create mode 100644 modules/dirac/avx512/Dw_dble_avx512.c - create mode 100644 modules/dirac/avx512/Dw_dble_avx512_asm.s - create mode 100644 modules/linalg/avx512/salg_avx512.c - create mode 100644 modules/linalg/avx512/salg_avx512_asm.s - create mode 100644 modules/linalg/avx512/salg_dble_avx512.c - create mode 100644 modules/linalg/avx512/salg_dble_avx512_asm.s - create mode 100644 modules/sw_term/avx512/pauli_avx512.c - create mode 100644 modules/sw_term/avx512/pauli_avx512_asm.s - create mode 100644 modules/sw_term/avx512/pauli_dble_avx512.c - create mode 100644 modules/sw_term/avx512/pauli_dble_avx512_asm.s - -diff --git a/CITATION.cff b/CITATION.cff -new file mode 100644 -index 0000000..c2e855d ---- /dev/null -+++ b/CITATION.cff -@@ -0,0 +1,39 @@ -+ -+cff-version: 1.0.3 -+message: "Please cite the following works when using this software." -+ -+title: OpenQCD AVX512 Extension -+version: 1.0 -+date-released: 19 January 2018 -+authors: -+ - name: "Swansea Academy of Advanced Computing" -+ website: http://www.swansea.ac.uk/iss/sa2c/ -+ email: sa2c-support@swansea.ac.uk -+ - family-names: Rantaharju -+ given-names: Jarno -+ orcid: https://orcid.org/0000-0002-0072-7707 -+ - family-names: Messiti -+ given-names: Michele -+ -+references: -+ - type: software-code -+ name: OpenQCD -+ version: 1.6 -+ website: luscher.web.cern.ch/luscher/openQCD/ -+ authors: -+ - family-names: Lüscher -+ given-names: Martin -+ - family-names: Schaefer -+ given-names: Stefan -+ - family-names: Bulava -+ given-names: John -+ - family-names: Del Debbio -+ given-names: Luigi -+ - family-names: Giusti -+ given-names: Leonardo -+ - family-names: Leder -+ given-names: Björn -+ - family-names: Palombi -+ given-names: Filippo -+ date-released: 22. 
April 2014 -+ email: luscher@mail.cern.ch -diff --git a/README-AVX512 b/README-AVX512 -new file mode 100644 -index 0000000..04a7c53 ---- /dev/null -+++ b/README-AVX512 -@@ -0,0 +1,26 @@ -+An optimized version of openQCD-1.6 for Intel processors with 512 bit vector width -+================================================================================== -+ -+DESCRIPTION -+ -+ This code extends openQCD-1.6 with an implementation of the Dirac operator -+ with intel Intrincic operations in order to use the full vector width on Intel -+ Skylake, Knight Landing and other processors. -+ -+ The extension is enabled with the -DAVX521 flag in CFLAGS in the Makefiles. -+ -+ To enable assembly versions pre-compiled for Skylake Xeon Scalable CPUs also include -DAVX512_ASM in CFLAGS. -+ -+ -+ -+AUTHORS -+ -+ This patch extends the openQCD code written by Martin Luscher, Stefan -+ Schaefer and other contributors (see http://luscher.web.cern.ch/luscher/openQCD/). -+ -+ The batch has been written by and tested by Jarno Rantaharju and Michele Messiti. -+ -+LICENSE -+ -+ The code may be used under the terms of the GNU General Public License (GPL) -+ http://www.fsf.org/licensing/licenses/gpl.html -diff --git a/devel/archive/Makefile b/devel/archive/Makefile -index 8c94a12..c5e949a 100644 ---- a/devel/archive/Makefile -+++ b/devel/archive/Makefile -@@ -39,9 +39,13 @@ UFLDS = plaq_sum uflds udcom - - UTILS = endian error hsum mutils utils wspace - --MODULES = $(FLAGS) $(LATTICE) $(ARCHIVE) $(LINALG) $(RANDOM) $(UFLDS) \ -+STD_MODULES = $(FLAGS) $(LATTICE) $(ARCHIVE) $(LINALG) $(RANDOM) $(UFLDS) \ - $(SFLDS) $(SU3FCTS) $(UTILS) - -+AVX512_MODULES = salg_dble_avx512 -+ -+AVX512_ASM_MODULES = salg_dble_avx512_asm -+ - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -54,7 +58,7 @@ MDIR = ../../modules - - VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ - $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/sflds:\ -- $(MDIR)/su3fcts:$(MDIR)/utils -+ $(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/linalg/avx512: - - - # additional include directories -@@ -72,10 +76,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - --LFLAGS = -+LFLAGS = $(CFLAGS) - - - ############################## do not change ################################### -@@ -84,25 +88,40 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - --# rule to make dependencies - -+ -+# rule to make dependencies - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - -- - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) 
$(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ -diff --git a/devel/block/Makefile b/devel/block/Makefile -index ad90f62..3856581 100644 ---- a/devel/block/Makefile -+++ b/devel/block/Makefile -@@ -45,9 +45,15 @@ UFLDS = uflds udcom shift - - UTILS = endian error hsum mutils utils wspace - --MODULES = $(FLAGS) $(RANDOM) $(LATTICE) $(BLOCK) $(UFLDS) $(SFLDS) \ -+STD_MODULES = $(FLAGS) $(RANDOM) $(LATTICE) $(BLOCK) $(UFLDS) $(SFLDS) \ - $(LINALG) $(SU3FCTS) $(UTILS) $(TCHARGE) $(SW_TERM) $(SAP) - -+AVX512_MODULES = salg_avx512 salg_dble_avx512 \ -+ pauli_avx512 pauli_dble_avx512 -+ -+AVX512_ASM_MODULES = salg_avx512_asm salg_dble_avx512_asm \ -+ pauli_avx512_asm pauli_dble_avx512_asm -+ - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -60,7 +66,7 @@ MDIR = ../../modules - - VPATH = .:$(MDIR)/flags:$(MDIR)/random:$(MDIR)/lattice:$(MDIR)/block:\ - $(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ -- $(MDIR)/linalg:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/sap -+ $(MDIR)/linalg:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/sap:$(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 - - - # additional include directories -@@ -78,10 +84,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - --LFLAGS = -+LFLAGS = $(CFLAGS) - - - ############################## do not change ################################### -@@ -90,25 +96,39 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ -diff --git a/devel/dfl/Makefile b/devel/dfl/Makefile -index 9def70f..0a6ce2e 100644 ---- a/devel/dfl/Makefile -+++ b/devel/dfl/Makefile -@@ -57,10 +57,15 @@ UTILS = endian error hsum mutils utils wspace - - VFLDS = vflds vinit vcom vdcom - --MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ -+STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ - $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ - $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) 
$(LITTLE) - -+AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ -+ pauli_avx512 pauli_dble_avx512 -+ -+AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ -+ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -75,7 +80,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ - $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ - $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ - $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ -- $(MDIR)/vflds:$(MDIR)/little -+ $(MDIR)/vflds:$(MDIR)/little\ -+ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 - - # additional include directories - -@@ -92,10 +98,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - --LFLAGS = -+LFLAGS = $(CFLAGS) - - # -DFGCR_DBG -DFGCR4VD_DBG -DDFL_MODES_DBG - -@@ -106,25 +112,39 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ -diff --git a/devel/dirac/Makefile b/devel/dirac/Makefile -index 8fac63d..a75c752 100644 ---- a/devel/dirac/Makefile -+++ b/devel/dirac/Makefile -@@ -12,6 +12,7 @@ - # Dependencies on included files are automatically taken care of - # - ################################################################################ -+################################################################################ - - all: rmxeq mkdep mkxeq - .PHONY: all -@@ -48,10 +49,15 @@ UFLDS = plaq_sum shift uflds udcom - - UTILS = endian error hsum mutils utils wspace - --MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ -+STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ - $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) \ - $(DIRAC) $(BLOCK) $(SAP) - -+AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ -+ pauli_avx512 pauli_dble_avx512 -+ -+AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ -+ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -65,7 +71,7 @@ MDIR = ../../modules - VPATH = 
.:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/random:\ - $(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/sflds:\ - $(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:$(MDIR)/block:\ -- $(MDIR)/sap -+ $(MDIR)/sap:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512:$(MDIR)/linalg/avx512 - - # additional include directories - -@@ -82,11 +88,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -- --LFLAGS = -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - -+LFLAGS = $(CFLAGS) - - ############################## do not change ################################### - -@@ -94,30 +99,46 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+ -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ - - -+ - # produce executables - - mkxeq: $(MAIN) -diff --git a/devel/forces/Makefile b/devel/forces/Makefile -index e6955fb..7bf668c 100644 ---- a/devel/forces/Makefile -+++ b/devel/forces/Makefile -@@ -68,11 +68,16 @@ UTILS = endian error hsum mutils utils wspace - - VFLDS = vflds vinit vcom vdcom - --MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ -+STD_MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ - $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ - $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ - $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) - -+AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ -+ pauli_avx512 pauli_dble_avx512 -+ -+AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ -+ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -88,7 +93,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ - $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ - $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ - $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ -- $(MDIR)/update:$(MDIR)/ratfcts -+ $(MDIR)/update:$(MDIR)/ratfcts\ -+ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 - - - # additional include directories -@@ -106,10 +112,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall 
-Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - --LFLAGS = -+LFLAGS = $(CFLAGS) - - # -DCGNE_DBG -DFGCR_DBG -DMSCG_DBG - # -DDFL_MODES_DBG -@@ -121,25 +127,39 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ -diff --git a/devel/linalg/Makefile b/devel/linalg/Makefile -index 76ff701..b3e555f 100644 ---- a/devel/linalg/Makefile -+++ b/devel/linalg/Makefile -@@ -42,6 +42,9 @@ VFLDS = vflds vinit - MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(LINALG) $(UTILS) \ - $(UFLDS) $(SFLDS) $(SU3FCTS) $(VFLDS) - -+AVX512_MODULES = salg_avx512 salg_dble_avx512 -+ -+AVX512_ASM_MODULES = salg_avx512_asm salg_dble_avx512_asm - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -54,7 +57,7 @@ MDIR = ../../modules - - VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/linalg:\ - $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts:\ -- $(MDIR)/vflds -+ $(MDIR)/vflds:$(MDIR)/linalg/avx512 - - - # additional include directories -@@ -72,10 +75,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - --LFLAGS = -+LFLAGS = $(CFLAGS) - - - ############################## do not change ################################### -@@ -84,30 +87,43 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c 
$(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ - -- - # produce executables - - mkxeq: $(MAIN) -diff --git a/devel/little/Makefile b/devel/little/Makefile -index 3ab8aee..295f92c 100644 ---- a/devel/little/Makefile -+++ b/devel/little/Makefile -@@ -57,10 +57,15 @@ UTILS = endian error hsum mutils utils wspace - - VFLDS = vflds vinit vcom vdcom - --MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ -+STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ - $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ - $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) $(LITTLE) - -+AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ -+ pauli_avx512 pauli_dble_avx512 -+ -+AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ -+ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -75,7 +80,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ - $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ - $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ - $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ -- $(MDIR)/vflds:$(MDIR)/little -+ $(MDIR)/vflds:$(MDIR)/little\ -+ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 - - - # additional include directories -@@ -93,10 +99,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - --LFLAGS = -+LFLAGS = $(CFLAGS) - - - ############################## do not change ################################### -@@ -105,25 +111,39 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ -diff --git a/devel/sap/Makefile b/devel/sap/Makefile -index f56144d..b39f521 100644 ---- a/devel/sap/Makefile -+++ b/devel/sap/Makefile 
-@@ -51,10 +51,15 @@ UFLDS = plaq_sum shift uflds udcom - - UTILS = endian error hsum mutils utils wspace - --MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ -+STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ - $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ - $(BLOCK) $(SAP) $(ARCHIVE) - -+AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ -+ pauli_avx512 pauli_dble_avx512 -+ -+AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ -+ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -68,7 +73,8 @@ MDIR = ../../modules - VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ - $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ - $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ -- $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive: -+ $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:\ -+ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 - - - # additional include directories -@@ -86,10 +92,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - --LFLAGS = -+LFLAGS = $(CFLAGS) - - # -DFGCR_DBG - -@@ -100,25 +106,39 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ -diff --git a/devel/sflds/Makefile b/devel/sflds/Makefile -index 111e826..b424540 100644 ---- a/devel/sflds/Makefile -+++ b/devel/sflds/Makefile -@@ -37,9 +37,12 @@ UFLDS = uflds - - UTILS = endian error hsum mutils utils wspace - --MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(LINALG) $(UTILS) \ -+STD_MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(LINALG) $(UTILS) \ - $(UFLDS) $(SFLDS) $(SU3FCTS) - -+AVX512_MODULES = salg_avx512 salg_dble_avx512 -+ -+AVX512_ASM_MODULES = salg_avx512_asm salg_dble_avx512_asm - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -51,7 +54,8 @@ LOGOPTION = - MDIR = ../../modules - - VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/linalg:\ -- $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts -+ $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts\ -+ 
$(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 - - - # additional include directories -@@ -69,10 +73,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - --LFLAGS = -+LFLAGS = $(CFLAGS) - - - ############################## do not change ################################### -@@ -81,25 +85,39 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ -diff --git a/devel/sw_term/Makefile b/devel/sw_term/Makefile -index b9c80c4..877265e 100644 ---- a/devel/sw_term/Makefile -+++ b/devel/sw_term/Makefile -@@ -41,9 +41,14 @@ UFLDS = plaq_sum shift uflds udcom - - UTILS = endian error hsum mutils utils wspace - --MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ -+STD_MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ - $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) - -+AVX512_MODULES = salg_dble_avx512 \ -+ pauli_avx512 pauli_dble_avx512 -+ -+AVX512_ASM_MODULES = salg_dble_avx512_asm \ -+ pauli_avx512_asm pauli_dble_avx512_asm - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -56,7 +61,8 @@ MDIR = ../../modules - - VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/random:\ - $(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/sflds:\ -- $(MDIR)/tcharge:$(MDIR)/sw_term -+ $(MDIR)/tcharge:$(MDIR)/sw_term:\ -+ $(MDIR)/linalg/avx512:$(MDIR)/sw_term/avx512 - - - # additional include directories -@@ -74,10 +80,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - --LFLAGS = -+LFLAGS = $(CFLAGS) - - ############################## do not change ################################### - -@@ -85,30 +91,43 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif 
-+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ - -- - # produce executables - - mkxeq: $(MAIN) -diff --git a/devel/update/Makefile b/devel/update/Makefile -index d3e8910..749ade4 100644 ---- a/devel/update/Makefile -+++ b/devel/update/Makefile -@@ -67,11 +67,16 @@ UTILS = endian error hsum mutils utils wspace - - VFLDS = vflds vinit vcom vdcom - --MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ -+STD_MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ - $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ - $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ - $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) - -+AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ -+ pauli_avx512 pauli_dble_avx512 -+ -+AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ -+ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm - - # Logging option (-mpilog or -mpitrace or -mpianim) - -@@ -87,7 +92,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ - $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ - $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ - $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ -- $(MDIR)/update:$(MDIR)/ratfcts -+ $(MDIR)/update:$(MDIR)/ratfcts\ -+ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 - - - # additional include directories -@@ -105,10 +111,11 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 -+ -+LFLAGS = $(CFLAGS) - --LFLAGS = - - # -DMDINT_DBG -DRWRAT_DBG - -@@ -119,25 +126,39 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c 
Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ -diff --git a/devel/vflds/Makefile b/devel/vflds/Makefile -index 11778a2..a29b334 100644 ---- a/devel/vflds/Makefile -+++ b/devel/vflds/Makefile -@@ -59,7 +59,11 @@ MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ - $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ - $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) - -+AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ -+ pauli_avx512 pauli_dble_avx512 - -+AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ -+ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm - # Logging option (-mpilog or -mpitrace or -mpianim) - - LOGOPTION = -@@ -73,7 +77,8 @@ VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ - $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ - $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ - $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ -- $(MDIR)/vflds -+ $(MDIR)/vflds\ -+ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 - - # additional include directories - -@@ -90,10 +95,10 @@ LIBPATH = $(MPI_HOME)/lib - # scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 - --LFLAGS = -+LFLAGS = $(CFLAGS) - - - ############################## do not change ################################### -@@ -102,25 +107,39 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ -diff --git a/include/avx512.h b/include/avx512.h -new file mode 100644 -index 0000000..ac73c60 ---- /dev/null -+++ b/include/avx512.h -@@ -0,0 +1,978 @@ -+ -+/******************************************************************************* -+* -+* File avx512.h -+* -+* This software is distributed under the terms of the GNU General Public -+* License (GPL) 
-+* -+* Macros for operating on SU(3) vectors and matrices using Intel intrinsic -+* operations for AVX512 compatible processors -+* -+*******************************************************************************/ -+ -+#ifndef AVX512_H -+#define AVX512_H -+ -+#ifndef SSE2_H -+#include "sse2.h" -+#endif -+ -+#include "immintrin.h" -+ -+ -+ -+/* Macros for single precision floating point numbers */ -+ -+/* Write 6 color weyl vectors as a spinor */ -+#define _avx512_write_6_hwv_f( c1,c2,c3, a ){ \ -+ __m256 t256; __m128 t128; \ -+ t256 = _mm256_shuffle_ps( c1, c2, 0b01000100 ); \ -+ t128 = _mm256_castps256_ps128( t256 ); \ -+ _mm_storeu_ps( a, t128 ); \ -+ t128 = _mm256_extractf128_ps( t256, 1 ); \ -+ _mm_storeu_ps( a+12, t128 ); \ -+ \ -+ t256 = _mm256_shuffle_ps( c3, c1, 0b11100100 ); \ -+ t128 = _mm256_castps256_ps128( t256 ); \ -+ _mm_storeu_ps( a+4, t128 ); \ -+ t128 = _mm256_extractf128_ps( t256, 1 ); \ -+ _mm_storeu_ps( a+16, t128 ); \ -+ \ -+ t256 = _mm256_shuffle_ps( c2, c3, 0b11101110 ); \ -+ t128 = _mm256_castps256_ps128( t256 ); \ -+ _mm_storeu_ps( a+8, t128 ); \ -+ t128 = _mm256_extractf128_ps( t256, 1 ); \ -+ _mm_storeu_ps( a+20, t128 ); \ -+} -+ -+ -+/* Load 6 color weyl vectors from a spinor */ -+#define _avx512_load_6_hwv_f( c1,c2,c3, a ){ \ -+ __m256 t1,t2,t3; __m128 t11, t14; \ -+ t11 = _mm_loadu_ps( a ); \ -+ t14 = _mm_loadu_ps( a+12 ); \ -+ t1 = _mm256_castps128_ps256( t11 ); \ -+ t1 = _mm256_insertf128_ps( t1, t14, 1 ); \ -+ \ -+ t11 = _mm_loadu_ps( a+4 ); \ -+ t14 = _mm_loadu_ps( a+16 ); \ -+ t2 = _mm256_castps128_ps256( t11 ); \ -+ t2 = _mm256_insertf128_ps( t2, t14, 1 ); \ -+ \ -+ t11 = _mm_loadu_ps( a+8 ); \ -+ t14 = _mm_loadu_ps( a+20 ); \ -+ t3 = _mm256_castps128_ps256( t11 ); \ -+ t3 = _mm256_insertf128_ps( t3, t14, 1 ); \ -+ \ -+ c1 = _mm256_shuffle_ps( t1, t2, 0b11100100 ); \ -+ c2 = _mm256_shuffle_ps( t1, t3, 0b01001110 ); \ -+ c3 = _mm256_shuffle_ps( t2, t3, 0b11100100 ); \ -+} -+ -+ -+ -+/* Load 4x2 complex numbers into an aray */ -+#define _avx512_load_4_cf( r, c1,c2,c3,c4 ){ \ -+ __m128 t128; \ -+ t128 = _mm_loadu_ps( c1 ); \ -+ r = _mm512_castps128_ps512( t128 ); \ -+ t128 = _mm_loadu_ps( c2 ); \ -+ r = _mm512_insertf32x4 ( r, t128, 1); \ -+ t128 = _mm_loadu_ps( c3 ); \ -+ r = _mm512_insertf32x4 ( r, t128, 2); \ -+ t128 = _mm_loadu_ps( c4 ); \ -+ r = _mm512_insertf32x4 ( r, t128, 3); \ -+} -+ -+/* Load 4 half-spinors and organize colorwise into vectors r1, r2 and r3 */ -+#define _avx512_load_4_halfspinor_f( r1, r2, r3, s1, s2, s3, s4 ) \ -+{ \ -+ __m512 t512a, t512b, t512c; \ -+ _avx512_load_4_cf( t512a, s1,s2,s3,s4 ); \ -+ _avx512_load_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ -+ _avx512_load_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ -+ \ -+ r1 = _mm512_shuffle_ps(t512a,t512b, 0b11100100); \ -+ r2 = _mm512_shuffle_ps(t512a,t512c, 0b01001110); \ -+ r3 = _mm512_shuffle_ps(t512b,t512c, 0b11100100); \ -+} -+ -+/* Load 4 half-spinors reversing the second two spinor indeces and -+ * organize colorwise into vectors r1, r2 and r3 */ -+#define _avx512_load_4_halfspinor_f_reverse_up( r1, r2, r3, s1, s2, s3, s4 ) \ -+{ \ -+ __m512 t512a, t512b, t512c; \ -+ __m512i idx; \ -+ _avx512_load_4_cf( t512a, s1,s2,s3,s4 ); \ -+ _avx512_load_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ -+ _avx512_load_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ -+ \ -+ idx = _mm512_setr_epi32( 0,1,16+2,16+3, 4,5,16+6,16+7, \ -+ 16+10,16+11,8,9, 16+14,16+15,12,13 ); \ -+ r1 = _mm512_permutex2var_ps( t512a, idx, t512b ); \ -+ idx = _mm512_setr_epi32( 2,3,16+0,16+1, 6,7,16+4,16+5, \ -+ 16+8,16+9,10,11, 
16+12,16+13,14,15 ); \ -+ r2 = _mm512_permutex2var_ps( t512a, idx, t512c ); \ -+ idx = _mm512_setr_epi32( 0,1,16+2,16+3, 4,5,16+6,16+7, \ -+ 16+10,16+11,8,9, 16+14,16+15,12,13 ); \ -+ r3 = _mm512_permutex2var_ps( t512b, idx, t512c ); \ -+} -+ -+/* Load 4 half-spinors reversing first two the spinor indeces and -+ * organize colorwise into vectors r1, r2 and r3 */ -+#define _avx512_load_4_halfspinor_f_reverse_dn( r1, r2, r3, s1, s2, s3, s4 ) \ -+{ \ -+ __m512 t512a, t512b, t512c; \ -+ __m512i idx; \ -+ _avx512_load_4_cf( t512a, s1,s2,s3,s4 ); \ -+ _avx512_load_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ -+ _avx512_load_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ -+ \ -+ idx = _mm512_setr_epi32( 16+2,16+3,0,1, 16+6,16+7,4,5, \ -+ 8,9,16+10,16+11, 12,13,16+14,16+15 ); \ -+ r1 = _mm512_permutex2var_ps( t512a, idx, t512b ); \ -+ idx = _mm512_setr_epi32( 16+0,16+1,2,3, 16+4,16+5,6,7, \ -+ 10,11,16+8,16+9, 14,15,16+12,16+13 ); \ -+ r2 = _mm512_permutex2var_ps( t512a, idx, t512c ); \ -+ idx = _mm512_setr_epi32( 16+2,16+3,0,1, 16+6,16+7,4,5, \ -+ 8,9,16+10,16+11, 12,13,16+14,16+15 ); \ -+ r3 = _mm512_permutex2var_ps( t512b, idx, t512c ); \ -+} -+ -+/* Load 4x2 complex numbers into an aray */ -+#define _avx512_write_4_cf( r, c1,c2,c3,c4 ){ \ -+ __m512 t512; \ -+ __m128 t128 = _mm512_castps512_ps128( r ); \ -+ _mm_storeu_ps( c1, t128 ); \ -+ t128 = _mm512_extractf32x4_ps( r, 1 ); \ -+ _mm_storeu_ps( c2, t128 ); \ -+ t128 = _mm512_extractf32x4_ps( r, 2 ); \ -+ _mm_storeu_ps( c3, t128 ); \ -+ t128 = _mm512_extractf32x4_ps( r, 3 ); \ -+ _mm_storeu_ps( c4, t128 ); \ -+} -+ -+/* Store 4 half-spinors from color vectors */ -+#define _avx512_write_4_halfspinor_f( r1, r2, r3, s1, s2, s3, s4 ) \ -+{ \ -+ __m512 t512a, t512b, t512c; \ -+ \ -+ t512a = _mm512_shuffle_ps(r1,r2, 0b01000100); \ -+ t512b = _mm512_shuffle_ps(r3,r1, 0b11100100); \ -+ t512c = _mm512_shuffle_ps(r2,r3, 0b11101110); \ -+ \ -+ _avx512_write_4_cf( t512a, s1,s2,s3,s4 ); \ -+ _avx512_write_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ -+ _avx512_write_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ -+} -+ -+/* Store 4 half-spinors from color vectors reversing the first two Dirac indeces */ -+#define _avx512_write_4_halfspinor_f_reverse_up( r1, r2, r3, s1, s2, s3, s4 ) \ -+{ \ -+ __m512 t512a, t512b, t512c; \ -+ __m512i idx; \ -+ idx = _mm512_setr_epi32( 0,1,16+0,16+1, 4,5,16+4,16+5, \ -+ 10,11,16+10,16+11, 14,15,16+14,16+15 ); \ -+ t512a = _mm512_permutex2var_ps( r1, idx, r2 ); \ -+ idx = _mm512_setr_epi32( 0,1,16+2,16+3, 4,5,16+6,16+7, \ -+ 10,11,16+8,16+9, 14,15,16+12,16+13 ); \ -+ t512b = _mm512_permutex2var_ps( r3, idx, r1 ); \ -+ idx = _mm512_setr_epi32( 2,3,16+2,16+3, 6,7,16+6,16+7, \ -+ 8,9,16+8,16+9, 12,13,16+12,16+13 ); \ -+ t512c = _mm512_permutex2var_ps( r2, idx, r3 ); \ -+ \ -+ _avx512_write_4_cf( t512a, s1,s2,s3,s4 ); \ -+ _avx512_write_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ -+ _avx512_write_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ -+} -+ -+ -+/* Store 4 half-spinors from color vectors reversing the second two Dirac indeces */ -+#define _avx512_write_4_halfspinor_f_reverse_dn( r1, r2, r3, s1, s2, s3, s4 ) \ -+{ \ -+ __m512 t512a, t512b, t512c; \ -+ __m512i idx; \ -+ idx = _mm512_setr_epi32( 2,3,16+2,16+3, 6,7,16+6,16+7, \ -+ 8,9,16+8,16+9, 12,13,16+12,16+13 ); \ -+ t512a = _mm512_permutex2var_ps( r1, idx, r2 ); \ -+ idx = _mm512_setr_epi32( 2,3,16+0,16+1, 6,7,16+4,16+5, \ -+ 8,9,16+10,16+11, 12,13,16+14,16+15 ); \ -+ t512b = _mm512_permutex2var_ps( r3, idx, r1 ); \ -+ idx = _mm512_setr_epi32( 0,1,16+0,16+1, 4,5,16+4,16+5, \ -+ 10,11,16+10,16+11, 14,15,16+14,16+15 
); \ -+ t512c = _mm512_permutex2var_ps( r2, idx, r3 ); \ -+ \ -+ _avx512_write_4_cf( t512a, s1,s2,s3,s4 ); \ -+ _avx512_write_4_cf( t512b, s1+4,s2+4,s3+4,s4+4 ); \ -+ _avx512_write_4_cf( t512c, s1+8,s2+8,s3+8,s4+8 ); \ -+} -+ -+ -+ -+ -+ -+ -+ -+ -+/* Combine Dirac spinors to half-spinors in deo and doe. -+ */ -+#define _avx512_dirac_combine_f_1( a, b ) \ -+{ \ -+ __m512i indexes; \ -+ __m512 c; \ -+ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ -+ 9, 8, 11, 10, 13, 12, 15, 14 ); \ -+ c = _mm512_permutexvar_ps( indexes, b); \ -+ a = _mm512_mask_add_ps( a, 0b0101101000001111, a, c ); \ -+ a = _mm512_mask_sub_ps( a, 0b1010010111110000, a, c ); \ -+} -+ -+#define _avx512_dirac_combine_f_2( a, b ) \ -+{ \ -+ __m512i indexes; \ -+ __m512 c; \ -+ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ -+ 9, 8, 11, 10, 13, 12, 15, 14 ); \ -+ c = _mm512_permutexvar_ps( indexes, b); \ -+ a = _mm512_mask_add_ps( a, 0b1001011011000011, a, c ); \ -+ a = _mm512_mask_sub_ps( a, 0b0110100100111100, a, c ); \ -+} -+ -+ -+#define _avx512_dirac_combine_f_3( a, b ) \ -+{ \ -+ __m512i indexes; \ -+ __m512 c; \ -+ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ -+ 9, 8, 11, 10, 13, 12, 15, 14 ); \ -+ c = _mm512_permutexvar_ps( indexes, b); \ -+ a = _mm512_mask_add_ps( a, 0b1010010100001111, a, c ); \ -+ a = _mm512_mask_sub_ps( a, 0b0101101011110000, a, c ); \ -+} -+ -+ -+#define _avx512_dirac_combine_f_4( a, b ) \ -+{ \ -+ __m512i indexes; \ -+ __m512 c; \ -+ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7, \ -+ 9, 8, 11, 10, 13, 12, 15, 14 ); \ -+ c = _mm512_permutexvar_ps( indexes, b); \ -+ a = _mm512_mask_add_ps( a, 0b0110100111000011, a, c ); \ -+ a = _mm512_mask_sub_ps( a, 0b1001011000111100, a, c ); \ -+} -+ -+ -+ -+ -+/* Multiply 4 vectors with a su(3) matrices, taking the inverse of every -+ * second matrix -+ */ -+#define avx512_su3_mixed_multiply_8( u1, um1, u2, um2, b1,b2,b3, a1,a2,a3 ) \ -+ { \ -+ __m512 ut11, ut21, ut31, ut41, ut12, ut22, ut32, ut42; \ -+ __m512 ut1, ut2, sign, c; \ -+ __m512 t1,t2,t3; \ -+ __m512i indexes; \ -+ ut11 = _mm512_loadu_ps( &(u1).c11.re ); ut12 = _mm512_loadu_ps( &(u1).c33.re ); \ -+ ut21 = _mm512_loadu_ps( &(um1).c11.re ); ut22 = _mm512_loadu_ps( &(um1).c33.re ); \ -+ ut31 = _mm512_loadu_ps( &(u2).c11.re ); ut32 = _mm512_loadu_ps( &(u2).c33.re ); \ -+ ut41 = _mm512_loadu_ps( &(um2).c11.re ); ut42 = _mm512_loadu_ps( &(um2).c33.re ); \ -+ \ -+ indexes = _mm512_setr_epi32( 0, 1, 2, 3, 8, 9, 6, 7, \ -+ 16, 17, 18, 19, 24, 25, 22, 23 ); \ -+ ut1 = _mm512_permutex2var_ps( ut11, indexes, ut21 ); \ -+ ut2 = _mm512_permutex2var_ps( ut31, indexes, ut41 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 0,0,0,0, 8,8,8,8, 16,16,16,16, 24,24,24,24 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b1 = _mm512_mul_ps ( a1, c ); \ -+ \ -+ indexes = _mm512_setr_epi32( 4,4,4,4, 12,12,12,12, 20,20,20,20, 28,28,28,28 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b2 = _mm512_mul_ps ( a2, c ); \ -+ \ -+ indexes = _mm512_setr_epi32( 2,2,2,2, 14,14,14,14, 18,18,18,18, 30,30,30,30 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b1 = _mm512_fmadd_ps ( a2, c, b1 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 6,6,6,6, 10,10,10,10, 22,22,22,22, 26,26,26,26 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b2 = _mm512_fmadd_ps ( a1, c, b2 ); \ -+ \ -+ \ -+ sign = _mm512_set_ps( -1,1,-1,1, 1,-1,1,-1, -1,1,-1,1, 1,-1,1,-1 ); \ -+ t1 = _mm512_permute_ps( a1, 0b10110001 ); \ -+ t1 = _mm512_mul_ps( t1, sign ); \ -+ t2 = _mm512_permute_ps( a2, 
0b10110001 ); \ -+ t2 = _mm512_mul_ps( t2, sign ); \ -+ \ -+ indexes = _mm512_setr_epi32( 1,1,1,1, 9,9,9,9, 17,17,17,17, 25,25,25,25 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b1 = _mm512_fmadd_ps ( t1, c, b1 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 5,5,5,5, 13,13,13,13, 21,21,21,21, 29,29,29,29 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b2 = _mm512_fmadd_ps ( t2, c, b2 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 3,3,3,3, 15,15,15,15, 19,19,19,19, 31,31,31,31 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b1 = _mm512_fmadd_ps ( t2, c, b1 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 7,7,7,7, 11,11,11,11, 23,23,23,23, 27,27,27,27 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b2 = _mm512_fmadd_ps ( t1, c, b2 ); \ -+ \ -+ \ -+ indexes = _mm512_setr_epi32( 4, 5, 12, 13, 10, 11, 14, 15, \ -+ 20, 21, 28, 29, 26, 27, 30, 31 ); \ -+ ut1 = _mm512_permutex2var_ps( ut11, indexes, ut21 ); \ -+ ut2 = _mm512_permutex2var_ps( ut31, indexes, ut41 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 0,0,0,0, 10,10,10,10, 16,16,16,16, 26,26,26,26 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b1 = _mm512_fmadd_ps ( a3, c, b1 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 2,2,2,2, 8,8,8,8, 18,18,18,18, 24,24,24,24 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b3 = _mm512_mul_ps ( a1, c ); \ -+ \ -+ indexes = _mm512_setr_epi32( 4,4,4,4, 14,14,14,14, 20,20,20,20, 30,30,30,30 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b2 = _mm512_fmadd_ps ( a3, c, b2 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 6,6,6,6, 12,12,12,12, 22,22,22,22, 28,28,28,28 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b3 = _mm512_fmadd_ps ( a2, c, b3 ); \ -+ \ -+ \ -+ t3 = _mm512_permute_ps( a3, 0b10110001 ); \ -+ t3 = _mm512_mul_ps( t3, sign ); \ -+ \ -+ indexes = _mm512_setr_epi32( 1,1,1,1, 11,11,11,11, 17,17,17,17, 27,27,27,27 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b1 = _mm512_fmadd_ps ( t3, c, b1 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 3,3,3,3, 9,9,9,9, 19,19,19,19, 25,25,25,25 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b3 = _mm512_fmadd_ps ( t1, c, b3 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 5,5,5,5, 15,15,15,15, 21,21,21,21, 31,31,31,31 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b2 = _mm512_fmadd_ps ( t3, c, b2 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 7,7,7,7, 13,13,13,13, 23,23,23,23, 29,29,29,29 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b3 = _mm512_fmadd_ps ( t2, c, b3 ); \ -+ \ -+ \ -+ indexes = _mm512_setr_epi32( 0, 1, 16, 17, 0,0,0,0, 0,0,0,0, 0,0,0,0 ); \ -+ ut1 = _mm512_permutex2var_ps( ut12, indexes, ut22 ); \ -+ ut2 = _mm512_permutex2var_ps( ut32, indexes, ut42 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 0,0,0,0, 2,2,2,2, 16,16,16,16, 18,18,18,18 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b3 = _mm512_fmadd_ps ( a3, c, b3 ); \ -+ \ -+ indexes = _mm512_setr_epi32( 1,1,1,1, 3,3,3,3, 17,17,17,17, 19,19,19,19 ); \ -+ c = _mm512_permutex2var_ps( ut1, indexes, ut2 ); \ -+ b3 = _mm512_fmadd_ps ( t3, c, b3 ); \ -+} -+ -+ -+ -+/* Insert 256 bits into a 512 bit single precision vector */ -+#define _avx512_insert_256_h_f( a, t ) \ -+{ \ -+ __m512d td512; \ -+ __m256d td256; \ -+ td512 = _mm512_castps_pd( a ); \ -+ td256 = _mm256_castps_pd( t ); \ -+ td512 = _mm512_insertf64x4( td512, td256, 1 ); \ -+ a = _mm512_castpd_ps( td512 ); \ -+} -+ -+/* Extract 256 bits from a 512 bit single precision vector */ -+#define 
_avx512_extract_256_h_f( t, a ) \ -+{ \ -+ __m512d td512; \ -+ __m256d td256; \ -+ td512 = _mm512_castps_pd( a ); \ -+ td256 = _mm256_castps_pd( t ); \ -+ td256 = _mm512_extractf64x4_pd( td512, 1 ); \ -+ t = _mm256_castpd_ps( td256 ); \ -+} -+ -+ -+ -+/* Accumulate elements of Dirac vectors into a Weyl vector in deo and doe */ -+#define _avx512_to_weyl_f_12( c, b ){ \ -+ __m256 w, sign, t5,t6; \ -+ __m256i idx; \ -+ t5 = _mm512_castps512_ps256( b ); \ -+ \ -+ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ -+ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ -+ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ -+ c = _mm256_fmadd_ps( t5, sign, t6 ); \ -+ \ -+ _avx512_extract_256_h_f( t5, b ); \ -+ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ -+ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ -+ w = _mm256_fmadd_ps( t6, sign, t5 ); \ -+ idx = _mm256_setr_epi32( 0, 1, 2, 3, 3, 2, 1, 0 ); \ -+ w = _mm256_permutevar_ps( w, idx ); \ -+ sign = _mm256_set_ps( 1,-1,1,-1, 1,1,1,1 ); \ -+ c = _mm256_fmadd_ps( w, sign, c ); \ -+} -+ -+#define _avx512_to_weyl_f_34( c, b ){ \ -+ __m256 w, sign, t5,t6; \ -+ __m256i idx; \ -+ t5 = _mm512_castps512_ps256( b ); \ -+ \ -+ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ -+ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ -+ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ -+ w = _mm256_fmadd_ps( t6, sign, t5 ); \ -+ idx = _mm256_setr_epi32( 0, 1, 2, 3, 2, 3, 0, 1 ); \ -+ w = _mm256_permutevar_ps( w, idx ); \ -+ sign = _mm256_set_ps( -1,-1,1,1, 1,1,1,1 ); \ -+ c = _mm256_fmadd_ps( w, sign, c ); \ -+ \ -+ _avx512_extract_256_h_f( t5, b ); \ -+ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ -+ t6 = _mm256_permutevar8x32_ps( t5, idx ); \ -+ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ -+ w = _mm256_fmadd_ps( t6, sign, t5 ); \ -+ idx = _mm256_setr_epi32( 0, 1, 2, 3, 1, 0, 3, 2 ); \ -+ w = _mm256_permutevar_ps( w, idx ); \ -+ sign = _mm256_set_ps( -1,1,1,-1, 1,1,1,1 ); \ -+ c = _mm256_fmadd_ps( w, sign, c ); \ -+} -+ -+ -+/* Expand a Weyl vector into a Dirac vector in deo and doe */ -+#define _avx512_to_dirac_f_1( a1, c1 ) \ -+{ \ -+ __m256 t1,t2, sign; \ -+ __m256i idx; \ -+ idx = _mm256_setr_epi32( 4, 5, 6, 7, 0, 1, 2, 3 ); \ -+ t1 = _mm256_permutevar8x32_ps( c1, idx ); \ -+ sign = _mm256_set_ps( -1,-1,-1,-1, 1,1,1,1 ); \ -+ t2 = _mm256_fmadd_ps( c1, sign, t1 ); \ -+ a1 = _mm512_castps256_ps512( t2 ); \ -+} -+ -+ -+#define _avx512_to_dirac_f_2( a1, c1 ) \ -+{ \ -+ __m256 t1,t2, sign; \ -+ __m256i idx; \ -+ idx = _mm256_setr_epi32( 7, 6, 5, 4, 7, 6, 5, 4 ); \ -+ t1 = _mm256_permutevar8x32_ps( c1, idx ); \ -+ idx = _mm256_setr_epi32( 0, 1, 2, 3, 0, 1, 2, 3 ); \ -+ t2 = _mm256_permutevar8x32_ps( c1, idx ); \ -+ sign = _mm256_set_ps( -1,1,-1,1, 1,-1,1,-1 ); \ -+ t2 = _mm256_fmadd_ps( t1, sign, t2 ); \ -+ _avx512_insert_256_h_f( a1, t2 ); \ -+} -+ -+#define _avx512_to_dirac_f_3( a1, c1 ) \ -+{ \ -+ __m512 t5,t6,t7; \ -+ __m512i idx; \ -+ t5 = _mm512_castps256_ps512( c1 ); \ -+ idx = _mm512_setr_epi32( 6,7,4,5, 6,7,4,5, 5,4,7,6, 5,4,7,6 ); \ -+ t6 = _mm512_permutexvar_ps( idx, t5 ); \ -+ idx = _mm512_setr_epi32( 0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3 ); \ -+ t7 = _mm512_permutexvar_ps( idx, t5 ); \ -+ a1 = _mm512_maskz_add_ps( 0b1001011011000011, t7, t6 ); \ -+ a1 = _mm512_mask_sub_ps( a1, 0b0110100100111100, t7, t6 ); \ -+} -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+/* Macros for double precision numbers */ -+ -+/* Load 2 half-spinors and organize colorwise into vectors s1, s2 and s3 */ -+#define _avx512_load_2_halfspinor_d( s1, s2, s3, sp, sm ) \ -+{ \ -+ 
__m512d a1,a2,a3,a4,a5,a6; \ -+ __m512i idx; \ -+ a1 = _mm512_loadu_pd( sm ); \ -+ a2 = _mm512_loadu_pd( sm+8 ); \ -+ a3 = _mm512_loadu_pd( sp ); \ -+ a4 = _mm512_loadu_pd( sp+8 ); \ -+ \ -+ idx = _mm512_setr_epi64( 8,9,14,15, 0,1,6,7 ); \ -+ s1 = _mm512_permutex2var_pd( a1, idx, a3 ); \ -+ \ -+ idx = _mm512_setr_epi64( 4,5,10,11, 2,3,8,9 ); \ -+ a5 = _mm512_permutex2var_pd( a1, idx, a2 ); \ -+ idx = _mm512_setr_epi64( 2,3,8,9, 4,5,10,11 ); \ -+ a6 = _mm512_permutex2var_pd( a3, idx, a4 ); \ -+ idx = _mm512_setr_epi64( 0,1,2,3, 12,13,14,15 ); \ -+ s2 = _mm512_permutex2var_pd( a6, idx, a5 ); \ -+ \ -+ idx = _mm512_setr_epi64( 4,5,6,7, 8,9,10,11 ); \ -+ s3 = _mm512_permutex2var_pd( a6, idx, a5 ); \ -+} -+ -+/* Load 2 half-spinors reversing the spinor indeces and -+ * organize colorwise into vectors s1, s2 and s3 */ -+#define _avx512_load_2_halfspinor_d_reverse( s1, s2, s3, sp, sm ) \ -+{ \ -+ __m512d a1,a2,a3,a4,a5,a6; \ -+ __m512i idx; \ -+ a1 = _mm512_loadu_pd( sm ); \ -+ a2 = _mm512_loadu_pd( sm+8 ); \ -+ a3 = _mm512_loadu_pd( sp ); \ -+ a4 = _mm512_loadu_pd( sp+8 ); \ -+ \ -+ idx = _mm512_setr_epi64( 14,15,8,9, 6,7,0,1 ); \ -+ s1 = _mm512_permutex2var_pd( a1, idx, a3 ); \ -+ \ -+ idx = _mm512_setr_epi64( 10,11,4,5, 8,9,2,3 ); \ -+ a5 = _mm512_permutex2var_pd( a1, idx, a2 ); \ -+ idx = _mm512_setr_epi64( 8,9,2,3, 10,11,4,5 ); \ -+ a6 = _mm512_permutex2var_pd( a3, idx, a4 ); \ -+ idx = _mm512_setr_epi64( 0,1,2,3, 12,13,14,15 ); \ -+ s2 = _mm512_permutex2var_pd( a6, idx, a5 ); \ -+ \ -+ idx = _mm512_setr_epi64( 4,5,6,7, 8,9,10,11 ); \ -+ s3 = _mm512_permutex2var_pd( a6, idx, a5 ); \ -+} -+ -+/* Write 2 half-spinors from three color vectors */ -+#define _avx512_store_2_halfspinor_d( s1, s2, s3, sp, sm ) \ -+{ \ -+ __m512d a1,a2,a3,a4,a5,a6; \ -+ __m256d l; \ -+ __m512i idx; \ -+ idx = _mm512_setr_epi64( 0,1,8,9, 4,5,12,13 ); \ -+ a1 = _mm512_permutex2var_pd( s1, idx, s2 ); \ -+ idx = _mm512_setr_epi64( 0,1,10,11, 4,5,14,15 ); \ -+ a2 = _mm512_permutex2var_pd( s3, idx, s1 ); \ -+ idx = _mm512_setr_epi64( 2,3,10,11, 6,7,14,15 ); \ -+ a3 = _mm512_permutex2var_pd( s2, idx, s3 ); \ -+ \ -+ l = _mm512_castpd512_pd256( a1 ); \ -+ _mm256_storeu_pd( sp, l ); \ -+ l = _mm512_castpd512_pd256( a2 ); \ -+ _mm256_storeu_pd( sp+4, l ); \ -+ l = _mm512_castpd512_pd256( a3 ); \ -+ _mm256_storeu_pd( sp+8, l ); \ -+ \ -+ l = _mm512_extractf64x4_pd( a1, 1 ); \ -+ _mm256_storeu_pd( sm, l ); \ -+ l = _mm512_extractf64x4_pd( a2, 1 ); \ -+ _mm256_storeu_pd( sm+4, l ); \ -+ l = _mm512_extractf64x4_pd( a3, 1 ); \ -+ _mm256_storeu_pd( sm+8, l ); \ -+} -+ -+ -+ -+/* Multiply the lower half of the color vectors distributed in c1, c2 and c3 -+ * by the su3 matrix u and the upper half by the conjugate of um -+ * Store in b1, b2 and b3 -+ */ -+#define avx512_su3_mul_quad_dble( u, um, b1, b2, b3, c1, c2, c3 ) \ -+{ \ -+ __m512d tu1, tu2, tu3, tum1, tum2, tum3; \ -+ __m512d u1; \ -+ __m512d t1, t2, t3, sign; \ -+ __m512i indexes; \ -+ tu1 = _mm512_loadu_pd( &(u).c11.re ); \ -+ tu2 = _mm512_loadu_pd( &(u).c22.re ); \ -+ tu3 = _mm512_loadu_pd( &(u).c33.re ); \ -+ tum1 = _mm512_loadu_pd( &(um).c11.re ); \ -+ tum2 = _mm512_loadu_pd( &(um).c22.re ); \ -+ tum3 = _mm512_loadu_pd( &(um).c33.re ); \ -+ \ -+ sign = _mm512_set_pd( -1,1,-1,1, 1,-1,1,-1 ); \ -+ t1 = _mm512_permute_pd( c1, 0b01010101 ); \ -+ t2 = _mm512_permute_pd( c2, 0b01010101 ); \ -+ t3 = _mm512_permute_pd( c3, 0b01010101 ); \ -+ t1 = _mm512_mul_pd( t1, sign ); \ -+ t2 = _mm512_mul_pd( t2, sign ); \ -+ t3 = _mm512_mul_pd( t3, sign ); \ -+ \ -+ indexes = 
_mm512_setr_epi64( 0, 0, 0, 0, 8, 8, 8, 8 ); \ -+ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ -+ b1 = _mm512_mul_pd ( u1, c1 ); \ -+ indexes = _mm512_setr_epi64( 1, 1, 1, 1, 9, 9, 9, 9 ); \ -+ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ -+ b1 = _mm512_fmadd_pd ( u1, t1, b1 ); \ -+ \ -+ indexes = _mm512_setr_epi64( 2, 2, 2, 2, 14, 14, 14, 14 ); \ -+ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ -+ b1 = _mm512_fmadd_pd ( u1, c2, b1 ); \ -+ indexes = _mm512_setr_epi64( 3, 3, 3, 3, 15, 15, 15, 15 ); \ -+ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ -+ b1 = _mm512_fmadd_pd ( u1, t2, b1 ); \ -+ \ -+ indexes = _mm512_setr_epi64( 4, 4, 4, 4, 12, 12, 12, 12 ); \ -+ u1 = _mm512_permutex2var_pd( tu1, indexes, tum2 ); \ -+ b1 = _mm512_fmadd_pd ( u1, c3, b1 ); \ -+ indexes = _mm512_setr_epi64( 5, 5, 5, 5, 13, 13, 13, 13 ); \ -+ u1 = _mm512_permutex2var_pd( tu1, indexes, tum2 ); \ -+ b1 = _mm512_fmadd_pd ( u1, t3, b1 ); \ -+ \ -+ indexes = _mm512_setr_epi64( 6, 6, 6, 6, 10, 10, 10, 10 ); \ -+ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ -+ b2 = _mm512_mul_pd ( u1, c1 ); \ -+ indexes = _mm512_setr_epi64( 7, 7, 7, 7, 11, 11, 11, 11 ); \ -+ u1 = _mm512_permutex2var_pd( tu1, indexes, tum1 ); \ -+ b2 = _mm512_fmadd_pd ( u1, t1, b2 ); \ -+ \ -+ indexes = _mm512_setr_epi64( 0, 0, 0, 0, 8, 8, 8, 8 ); \ -+ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ -+ b2 = _mm512_fmadd_pd ( u1, c2, b2 ); \ -+ indexes = _mm512_setr_epi64( 1, 1, 1, 1, 9, 9, 9, 9 ); \ -+ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ -+ b2 = _mm512_fmadd_pd ( u1, t2, b2 ); \ -+ \ -+ indexes = _mm512_setr_epi64( 2, 2, 2, 2, 14, 14, 14, 14 ); \ -+ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ -+ b2 = _mm512_fmadd_pd ( u1, c3, b2 ); \ -+ indexes = _mm512_setr_epi64( 3, 3, 3, 3, 15, 15, 15, 15 ); \ -+ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ -+ b2 = _mm512_fmadd_pd ( u1, t3, b2 ); \ -+ \ -+ indexes = _mm512_setr_epi64( 4, 4, 4, 4, 12, 12, 12, 12 ); \ -+ u1 = _mm512_permutex2var_pd( tu2, indexes, tum1 ); \ -+ b3 = _mm512_mul_pd ( u1, c1 ); \ -+ indexes = _mm512_setr_epi64( 5, 5, 5, 5, 13, 13, 13, 13 ); \ -+ u1 = _mm512_permutex2var_pd( tu2, indexes, tum1 ); \ -+ b3 = _mm512_fmadd_pd ( u1, t1, b3 ); \ -+ \ -+ indexes = _mm512_setr_epi64( 6, 6, 6, 6, 10, 10, 10, 10 ); \ -+ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ -+ b3 = _mm512_fmadd_pd ( u1, c2, b3 ); \ -+ indexes = _mm512_setr_epi64( 7, 7, 7, 7, 11, 11, 11, 11 ); \ -+ u1 = _mm512_permutex2var_pd( tu2, indexes, tum2 ); \ -+ b3 = _mm512_fmadd_pd ( u1, t2, b3 ); \ -+ \ -+ indexes = _mm512_setr_epi64( 0, 0, 0, 0, 8, 8, 8, 8 ); \ -+ u1 = _mm512_permutex2var_pd( tu3, indexes, tum3 ); \ -+ b3 = _mm512_fmadd_pd ( u1, c3, b3 ); \ -+ indexes = _mm512_setr_epi64( 1, 1, 1, 1, 9, 9, 9, 9 ); \ -+ u1 = _mm512_permutex2var_pd( tu3, indexes, tum3 ); \ -+ b3 = _mm512_fmadd_pd ( u1, t3, b3 ); \ -+} -+ -+ -+ -+ -+ -+ -+/* Combine spinor entries into 2 weyl vectors -+ stored in high and low entries of a spinor -+ */ -+#define _avx512_to_weyl_1( w, b ){ \ -+ __m512i indexes; \ -+ __m512d _t; \ -+ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ -+ _t = _mm512_permutexvar_pd( indexes, (b) ); \ -+ w = _mm512_maskz_add_pd( 0b00001111, _t, (b) ); \ -+ w = _mm512_mask_sub_pd( w, 0b11110000, _t, (b) ); \ -+} -+ -+#define _avx512_to_weyl_2( w, b ){ \ -+ __m512i indexes; \ -+ __m512d _t; \ -+ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ -+ _t = _mm512_permutexvar_pd( indexes, (b) ); \ -+ _t = _mm512_mask_add_pd( _t, 
0b00001111, _t, (b) ); \ -+ _t = _mm512_mask_sub_pd( _t, 0b11110000, (b), _t ); \ -+ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 7, 6, 5, 4 ); \ -+ _t = _mm512_permutexvar_pd( indexes, _t ); \ -+ w = _mm512_mask_add_pd( w, 0b10101111, w, _t ); \ -+ w = _mm512_mask_sub_pd( w, 0b01010000, w, _t ); \ -+} -+ -+#define _avx512_to_weyl_3( w, b ){ \ -+ __m512i indexes; \ -+ __m512d _t; \ -+ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ -+ _t = _mm512_permutexvar_pd( indexes, (b) ); \ -+ _t = _mm512_mask_add_pd( _t, 0b00001111, _t, (b) ); \ -+ _t = _mm512_mask_sub_pd( _t, 0b11110000, (b), _t ); \ -+ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 6, 7, 4, 5 ); \ -+ _t = _mm512_permutexvar_pd( indexes, _t ); \ -+ w = _mm512_mask_add_pd( w, 0b00111111, w, _t ); \ -+ w = _mm512_mask_sub_pd( w, 0b11000000, w, _t ); \ -+} -+ -+#define _avx512_to_weyl_4( w, b ){ \ -+ __m512i indexes; \ -+ __m512d _t; \ -+ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ -+ _t = _mm512_permutexvar_pd( indexes, (b) ); \ -+ _t = _mm512_mask_add_pd( _t, 0b00001111, _t, (b) ); \ -+ _t = _mm512_mask_sub_pd( _t, 0b11110000, (b), _t ); \ -+ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 5, 4, 7, 6 ); \ -+ _t = _mm512_permutexvar_pd( indexes, _t ); \ -+ w = _mm512_mask_add_pd( w, 0b01101111, w, _t ); \ -+ w = _mm512_mask_sub_pd( w, 0b10010000, w, _t ); \ -+} -+ -+ -+ -+ -+/* Create a full Dirac vector by adding and subtracting the indeces of -+ * a weyl vector */ -+#define _avx512_expand_weyl( a, w ){ \ -+ __m512i indexes; \ -+ __m512d _t; \ -+ indexes = _mm512_setr_epi64( 4, 5, 6, 7, 0, 1, 2, 3 ); \ -+ _t = _mm512_permutexvar_pd( indexes, (w) ); \ -+ a = _mm512_maskz_add_pd( 0b00001111, _t, w ); \ -+ a = _mm512_mask_sub_pd( a, 0b11110000, _t, w ); \ -+} -+ -+#define _avx512_expand_weyl_2( a, w ){ \ -+ __m512i indexes; \ -+ __m512d _t1, _t2; \ -+ indexes = _mm512_setr_epi64( 7, 6, 5, 4, 7, 6, 5, 4 ); \ -+ _t1 = _mm512_permutexvar_pd( indexes, (w) ); \ -+ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 0, 1, 2, 3 ); \ -+ _t2 = _mm512_permutexvar_pd( indexes, (w) ); \ -+ a = _mm512_maskz_add_pd( 0b01011010, _t2, _t1 ); \ -+ a = _mm512_mask_sub_pd( a, 0b10100101, _t2, _t1 ); \ -+} -+ -+#define _avx512_expand_weyl_3( a, w ){ \ -+ __m512i indexes; \ -+ __m512d _t1, _t2; \ -+ indexes = _mm512_setr_epi64( 6, 7, 4, 5, 6, 7, 4, 5 ); \ -+ _t1 = _mm512_permutexvar_pd( indexes, (w) ); \ -+ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 0, 1, 2, 3 ); \ -+ _t2 = _mm512_permutexvar_pd( indexes, (w) ); \ -+ a = _mm512_maskz_add_pd( 0b11000011, _t2, _t1 ); \ -+ a = _mm512_mask_sub_pd( a, 0b00111100, _t2, _t1 ); \ -+} -+ -+#define _avx512_expand_weyl_4( a, w ){ \ -+ __m512i indexes; \ -+ __m512d _t1, _t2; \ -+ indexes = _mm512_setr_epi64( 5, 4, 7, 6, 5, 4, 7, 6 ); \ -+ _t1 = _mm512_permutexvar_pd( indexes, (w) ); \ -+ indexes = _mm512_setr_epi64( 0, 1, 2, 3, 0, 1, 2, 3 ); \ -+ _t2 = _mm512_permutexvar_pd( indexes, (w) ); \ -+ a = _mm512_maskz_add_pd( 0b10010110, _t2, _t1 ); \ -+ a = _mm512_mask_sub_pd( a, 0b01101001, _t2, _t1 ); \ -+} -+ -+ -+ -+ -+ -+/* Load four complex numbers. 
*/ -+#define _avx512_load_4_d( v, c1, c2, c3, c4 ) \ -+{ \ -+ __m128d t128l, t128u; \ -+ __m256d t256l ,t256u; \ -+ t128l = _mm_loadu_pd( &(c1).re ); \ -+ t128u = _mm_loadu_pd( &(c2).re ); \ -+ t256l = _mm256_castpd128_pd256( t128l ); \ -+ t256l = _mm256_insertf128_pd( t256l, t128u, 1 ); \ -+ t128l = _mm_loadu_pd( &(c3).re ); \ -+ t128u = _mm_loadu_pd( &(c4).re ); \ -+ t256u = _mm256_castpd128_pd256( t128l ); \ -+ t256u = _mm256_insertf128_pd( t256u, t128u, 1 ); \ -+ v = _mm512_castpd256_pd512( t256l ); \ -+ v = _mm512_insertf64x4( v, t256u, 1 ); \ -+} -+ -+/* Store four complex numbers */ -+#define _avx512_store_4_d( r, v1, v2, v3, v4 ) \ -+{ \ -+ __m256d t256; \ -+ __m128d t128; \ -+ t256 = _mm512_extractf64x4_pd( r, 1 ); \ -+ t128 = _mm256_extractf128_pd( t256, 1 ); \ -+ _mm_storeu_pd( &(v4).re, t128 ); \ -+ t128 = _mm256_castpd256_pd128( t256 ); \ -+ _mm_storeu_pd( &(v3).re, t128 ); \ -+ t256 = _mm512_castpd512_pd256( r ); \ -+ t128 = _mm256_extractf128_pd( t256, 1 ); \ -+ _mm_storeu_pd( &(v2).re, t128 ); \ -+ t128 = _mm256_castpd256_pd128( t256 ); \ -+ _mm_storeu_pd( &(v1).re, t128 ); \ -+} -+ -+ -+ -+ -+/* Adding a vectors to a spinors */ -+ -+/* Load half spinors and organize for adding */ -+#define _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ) \ -+ s1 = _mm512_loadu_pd( (sm) ); \ -+ s2 = _mm512_loadu_pd( (sm)+8 ); \ -+ s3 = _mm512_loadu_pd( (sp) ); \ -+ s4 = _mm512_loadu_pd( (sp)+8 ); \ -+ \ -+ idx = _mm512_setr_epi64( 0,1,2,3, 8,9,10,11 ); \ -+ t1 = _mm512_permutex2var_pd( s1, idx, s3 ); \ -+ idx = _mm512_setr_epi64( 4,5,6,7, 12,13,14,15 ); \ -+ t2 = _mm512_permutex2var_pd( s1, idx, s3 ); \ -+ idx = _mm512_setr_epi64( 0,1,2,3, 8,9,10,11 ); \ -+ t3 = _mm512_permutex2var_pd( s2, idx, s4 ); \ -+ -+/* reorganize weyl vectors for adding */ -+#define _avx512_reorganize_a_ud_d( b1,b2,b3, a1,a2,a3 ) \ -+ idx = _mm512_setr_epi64( 0,1,8,9, 4,5,12,13 ); \ -+ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ -+ idx = _mm512_setr_epi64( 0,1,10,11, 4,5,14,15 ); \ -+ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ -+ idx = _mm512_setr_epi64( 2,3,10,11, 6,7,14,15 ); \ -+ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ -+ -+/* store after adding */ -+#define _avx512_write_a_ud_d( a1, a2, a3, sp, sm ){ \ -+ __m256d l; \ -+ l = _mm512_castpd512_pd256( a1 ); \ -+ _mm256_storeu_pd( (sm), l ); \ -+ l = _mm512_castpd512_pd256( a2 ); \ -+ _mm256_storeu_pd( (sm)+4, l ); \ -+ l = _mm512_castpd512_pd256( a3 ); \ -+ _mm256_storeu_pd( (sm)+8, l ); \ -+ \ -+ l = _mm512_extractf64x4_pd( a1, 1 ); \ -+ _mm256_storeu_pd( (sp), l ); \ -+ l = _mm512_extractf64x4_pd( a2, 1 ); \ -+ _mm256_storeu_pd( (sp)+4, l ); \ -+ l = _mm512_extractf64x4_pd( a3, 1 ); \ -+ _mm256_storeu_pd( (sp)+8, l ); \ -+} -+ -+ -+#define _avx512_add_to_spinors( b1, b2, b3, sp, sm ) \ -+{ \ -+ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ -+ __m512i idx; \ -+ \ -+ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ -+ _avx512_reorganize_a_ud_d( b1,b2,b3, a1,a2,a3 ); \ -+ \ -+ t1 = _mm512_add_pd( a1, t1 ); \ -+ t2 = _mm512_add_pd( a2, t2 ); \ -+ t3 = _mm512_add_pd( a3, t3 ); \ -+ \ -+ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ -+} -+ -+#define _avx512_add_to_spinors_2( b1, b2, b3, sp, sm ) \ -+{ \ -+ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ -+ __m512i idx; \ -+ \ -+ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ -+ _avx512_reorganize_a_ud_d( b1,b2,b3, a1,a2,a3 ); \ -+ \ -+ t1 = _mm512_mask_add_pd( t1, 0b00001111, t1, a1 ); \ -+ t1 = _mm512_mask_sub_pd( t1, 0b11110000, t1, a1 ); \ -+ t2 = _mm512_mask_add_pd( t2, 0b00001111, t2, a2 ); 
\ -+ t2 = _mm512_mask_sub_pd( t2, 0b11110000, t2, a2 ); \ -+ t3 = _mm512_mask_add_pd( t3, 0b00001111, t3, a3 ); \ -+ t3 = _mm512_mask_sub_pd( t3, 0b11110000, t3, a3 ); \ -+ \ -+ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ -+} -+ -+#define _avx512_add_to_spinors_3( b1, b2, b3, sp, sm ) \ -+{ \ -+ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ -+ __m512i idx; \ -+ \ -+ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ -+ idx = _mm512_setr_epi64( 3,2,11,10, 7,6,15,14 ); \ -+ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ -+ idx = _mm512_setr_epi64( 3,2,9,8, 7,6,13,12 ); \ -+ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ -+ idx = _mm512_setr_epi64( 1,0,9,8, 5,4,13,12 ); \ -+ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ -+ \ -+ t1 = _mm512_mask_add_pd( t1, 0b10100101, t1, a1 ); \ -+ t1 = _mm512_mask_sub_pd( t1, 0b01011010, t1, a1 ); \ -+ t2 = _mm512_mask_add_pd( t2, 0b10100101, t2, a2 ); \ -+ t2 = _mm512_mask_sub_pd( t2, 0b01011010, t2, a2 ); \ -+ t3 = _mm512_mask_add_pd( t3, 0b10100101, t3, a3 ); \ -+ t3 = _mm512_mask_sub_pd( t3, 0b01011010, t3, a3 ); \ -+ \ -+ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ -+} -+ -+#define _avx512_add_to_spinors_4( b1, b2, b3, sp, sm ) \ -+{ \ -+ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ -+ __m512i idx; \ -+ idx = _mm512_setr_epi64( 2,3,0,1, 6,7,4,5 ); \ -+ b1 = _mm512_permutexvar_pd( idx, b1 ); \ -+ b2 = _mm512_permutexvar_pd( idx, b2 ); \ -+ b3 = _mm512_permutexvar_pd( idx, b3 ); \ -+ \ -+ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ -+ idx = _mm512_setr_epi64( 0,1,8,9, 4,5,12,13 ); \ -+ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ -+ idx = _mm512_setr_epi64( 0,1,10,11, 4,5,14,15 ); \ -+ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ -+ idx = _mm512_setr_epi64( 2,3,10,11, 6,7,14,15 ); \ -+ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ -+ \ -+ t1 = _mm512_mask_add_pd( t1, 0b11110000, t1, a1 ); \ -+ t1 = _mm512_mask_sub_pd( t1, 0b00001111, t1, a1 ); \ -+ t2 = _mm512_mask_add_pd( t2, 0b00111100, t2, a2 ); \ -+ t2 = _mm512_mask_sub_pd( t2, 0b11000011, t2, a2 ); \ -+ t3 = _mm512_mask_add_pd( t3, 0b00001111, t3, a3 ); \ -+ t3 = _mm512_mask_sub_pd( t3, 0b11110000, t3, a3 ); \ -+ \ -+ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ -+} -+ -+ -+#define _avx512_add_to_spinors_5( b1, b2, b3, sp, sm ) \ -+{ \ -+ __m512d s1,s2,s3,s4, a1,a2,a3, t1,t2,t3; \ -+ __m512i idx; \ -+ \ -+ _avx512_load_s_ud_d( s1,s2,s3,s4, t1,t2,t3, sp, sm ); \ -+ idx = _mm512_setr_epi64( 1,0,9,8, 5,4,13,12 ); \ -+ a1 = _mm512_permutex2var_pd( b1, idx, b2 ); \ -+ idx = _mm512_setr_epi64( 1,0,11,10, 5,4,15,14 ); \ -+ a2 = _mm512_permutex2var_pd( b3, idx, b1 ); \ -+ idx = _mm512_setr_epi64( 3,2,11,10, 7,6,15,14 ); \ -+ a3 = _mm512_permutex2var_pd( b2, idx, b3 ); \ -+ \ -+ t1 = _mm512_mask_add_pd( t1, 0b10100101, t1, a1 ); \ -+ t1 = _mm512_mask_sub_pd( t1, 0b01011010, t1, a1 ); \ -+ t2 = _mm512_mask_add_pd( t2, 0b01101001, t2, a2 ); \ -+ t2 = _mm512_mask_sub_pd( t2, 0b10010110, t2, a2 ); \ -+ t3 = _mm512_mask_add_pd( t3, 0b01011010, t3, a3 ); \ -+ t3 = _mm512_mask_sub_pd( t3, 0b10100101, t3, a3 ); \ -+ \ -+ _avx512_write_a_ud_d( t1, t2, t3, sp, sm ); \ -+} -+ -+ -+ -+ -+ -+#endif -diff --git a/include/sw_term.h b/include/sw_term.h -index f1ef622..b47d0a7 100644 ---- a/include/sw_term.h -+++ b/include/sw_term.h -@@ -29,6 +29,7 @@ extern void apply_sw(int vol,float mu,pauli *m,spinor *s,spinor *r); - - /* PAULI_DBLE_C */ - extern void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r); -+void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r); - extern 
int inv_pauli_dble(double mu,pauli_dble *m,pauli_dble *im); - extern complex_dble det_pauli_dble(double mu,pauli_dble *m); - extern void apply_sw_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, -diff --git a/main/Makefile b/main/Makefile -index 6dc4e1b..64daeaf 100644 ---- a/main/Makefile -+++ b/main/Makefile -@@ -68,12 +68,19 @@ VFLDS = vflds vinit vcom vdcom - - WFLOW = wflow - --MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ -+STD_MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ - $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ - $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ - $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) $(WFLOW) - - -+AVX512_MODULES = Dw_avx512 Dw_dble_avx512 salg_avx512 salg_dble_avx512 \ -+ pauli_avx512 pauli_dble_avx512 -+ -+AVX512_ASM_MODULES = Dw_avx512_asm Dw_dble_avx512_asm salg_avx512_asm \ -+ salg_dble_avx512_asm pauli_avx512_asm pauli_dble_avx512_asm -+ -+ - # Logging option (-mpilog or -mpitrace or -mpianim) - - LOGOPTION = -@@ -86,9 +93,11 @@ MDIR = ../modules - VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ - $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ - $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ -- $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ -- $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ -- $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts -+ $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ -+ $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ -+ $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts:\ -+ $(MDIR)/linalg/avx512:$(MDIR)/dirac/avx512:$(MDIR)/sw_term/avx512 -+ - - - # additional include directories -@@ -103,21 +112,23 @@ LIBS = m - LIBPATH = $(MPI_HOME)/lib - - --# scheduling and optimization options -+# compilation, scheduling and optimization options - - CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ -- -Wall -Wno-long-long -Wstrict-prototypes -Werror \ -- -O -mno-avx -Dx64 -DPM -+ -Wall -Wno-long-long -Wstrict-prototypes \ -+ -O2 -DAVX512 -+ -+LFLAGS = $(CFLAGS) - --LFLAGS = - - # See the README in the top directory for alternative choices of the --# optimization options -O -mno-avx -Dx64 -DPM. -+# optimization options -O2 -DAVX512. - # - # The available debugging flags are - # - # -DCGNE_DBG -DFGCR_DBG -FGCR4VD_DBG -DMSCG_DBG - # -DDFL_MODES_DBG -DMDINT_DBG -DRWRAT_DBG -+# -DAVX512 -DAVX512_ASM - # - # Add these (or some of these) to CFLAGS if debugging output is desired. 
- -@@ -127,25 +138,40 @@ SHELL=/bin/bash - CC=mpicc - CLINKER=$(CC) - -+ -+#Check CFLAGS to find which AVX512 flags are active -+ifneq (,$(findstring AVX512_ASM,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_ASM_MODULES) -+else -+ifneq (,$(findstring AVX512,$(CFLAGS))) -+MODULES= $(STD_MODULES) $(AVX512_MODULES) -+else -+MODULES= $(STD_MODULES) -+endif -+endif -+ - PGMS= $(MAIN) $(MODULES) - - -include $(addsuffix .d,$(PGMS)) - - - # rule to make dependencies -- - $(addsuffix .d,$(PGMS)): %.d: %.c Makefile - @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ - - - # rule to compile source programs -+$(addsuffix .o,$(MAIN) $(STD_MODULES)): %.o: %.c Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - --$(addsuffix .o,$(PGMS)): %.o: %.c Makefile -+# pattern to compile files in the avx512 directiories -+$(addsuffix .o,$(AVX512_MODULES)): %.o: %.c Makefile - $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - -+$(addsuffix .o,$(AVX512_ASM_MODULES)): %.o: %.s Makefile -+ $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) - - # rule to link object files -- - $(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile - $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(LFLAGS) $(LOGOPTION) \ - $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ -diff --git a/modules/dirac/Dw.c b/modules/dirac/Dw.c -index 3900283..bb18a72 100644 ---- a/modules/dirac/Dw.c -+++ b/modules/dirac/Dw.c -@@ -122,7 +122,27 @@ static const spinor s0={{{0.0f,0.0f},{0.0f,0.0f},{0.0f,0.0f}}, - {{0.0f,0.0f},{0.0f,0.0f},{0.0f,0.0f}}}; - static spin_t rs ALIGNED32; - --#if (defined AVX) -+ -+#if ( defined AVX512 ) -+ -+#include "avx512.h" -+#include "sse.h" -+ -+void doe_avx512(int *piup, int *pidn, su3 *u, spinor *pk, float coe, spin_t *rs); -+static void doe(int *piup, int *pidn, su3 *u, spinor *pk) -+{ -+ doe_avx512( piup, pidn, u, pk, coe, &rs ); -+} -+ -+void deo_avx512(int *piup, int *pidn, su3 *u, spinor *pl, float ceo, spin_t *rs); -+static void deo(int *piup, int *pidn, su3 *u, spinor *pl) -+{ -+ deo_avx512( piup, pidn, u, pl, ceo, &rs ); -+} -+ -+ -+ -+#elif ( defined AVX ) - #include "avx.h" - - #define _load_cst(c) \ -diff --git a/modules/dirac/Dw_dble.c b/modules/dirac/Dw_dble.c -index d683cbb..4fc58f5 100644 ---- a/modules/dirac/Dw_dble.c -+++ b/modules/dirac/Dw_dble.c -@@ -1,4 +1,3 @@ -- - /******************************************************************************* - * - * File Dw_dble.c -@@ -121,7 +120,22 @@ static const spinor_dble sd0={{{0.0,0.0},{0.0,0.0},{0.0,0.0}}, - {{0.0,0.0},{0.0,0.0},{0.0,0.0}}}; - static spin_t rs ALIGNED32; - --#if (defined AVX) -+#if ( defined AVX512 ) -+ -+void doe_dble_avx512(int *piup,int *pidn,su3_dble *u,spinor_dble *pk,double coe, spin_t *rs); -+static void doe(int *piup,int *pidn,su3_dble *u,spinor_dble *pk) -+{ -+ doe_dble_avx512( piup, pidn, u, pk, coe, &rs ); -+} -+ -+void deo_dble_avx512( int *piup, int *pidn, su3_dble *u, spinor_dble *pl, double ceo, spin_t *rs); -+static void deo(int *piup, int *pidn, su3_dble *u, spinor_dble *pl) -+{ -+ deo_dble_avx512( piup, pidn, u, pl, ceo, &rs ); -+} -+ -+ -+#elif ( defined AVX ) - #include "avx.h" - - #define _load_cst(c) \ -@@ -1412,9 +1426,7 @@ void Dw_dble(double mu,spinor_dble *s,spinor_dble *r) - { - doe(piup,pidn,u,s); - -- mul_pauli_dble(mu,m,(*so).w,(*ro).w); -- mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); -- -+ mul_pauli2_dble(mu, m, (*so).w, (*ro).w); - _vector_add_assign((*ro).s.c1,rs.s.c1); - _vector_add_assign((*ro).s.c2,rs.s.c2); - _vector_add_assign((*ro).s.c3,rs.s.c3); -@@ 
-1442,9 +1454,7 @@ void Dw_dble(double mu,spinor_dble *s,spinor_dble *r) - { - doe(piup,pidn,u,s); - -- mul_pauli_dble(mu,m,(*so).w,(*ro).w); -- mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); -- -+ mul_pauli2_dble(mu, m, (*so).w, (*ro).w); - _vector_add_assign((*ro).s.c1,rs.s.c1); - _vector_add_assign((*ro).s.c2,rs.s.c2); - _vector_add_assign((*ro).s.c3,rs.s.c3); -@@ -1488,8 +1498,7 @@ void Dwee_dble(double mu,spinor_dble *s,spinor_dble *r) - - if ((t>0)&&((t<(N0-1))||(bc!=0))) - { -- mul_pauli_dble(mu,m,(*se).w,(*re).w); -- mul_pauli_dble(-mu,m+1,(*se).w+1,(*re).w+1); -+ mul_pauli2_dble(mu, m, (*se).w, (*re).w); - } - else - { -@@ -1505,9 +1514,7 @@ void Dwee_dble(double mu,spinor_dble *s,spinor_dble *r) - { - for (;m0)&&((t<(N0-1))||(bc!=0))) - { -- mul_pauli_dble(mu,m,(*so).w,(*ro).w); -- mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); -+ mul_pauli2_dble(mu, m, (*so).w, (*ro).w); - } - else - { -@@ -1559,9 +1565,7 @@ void Dwoo_dble(double mu,spinor_dble *s,spinor_dble *r) - { - for (;m -+#include -+#include -+#include "mpi.h" -+#include "su3.h" -+#include "utils.h" -+#include "flags.h" -+#include "lattice.h" -+#include "uflds.h" -+#include "sflds.h" -+#include "sw_term.h" -+#include "block.h" -+#include "dirac.h" -+#include "global.h" -+ -+#define N0 (NPROC0 * L0) -+ -+typedef union -+{ -+ spinor s; -+ weyl w[2]; -+} spin_t; -+ -+#include "avx512.h" -+void doe_avx512(int *piup, int *pidn, su3 *u, spinor *pk, float coe, spin_t *rs) -+{ -+ spinor *sp, *sm, *sp2, *sm2; -+ su3 *up, *up1, *u1, *up2, *u2; -+ -+ /* 512-bit wide stores for the spinor for each color */ -+ __m512 a1, a2, a3; -+ __m512 b1, b2, b3; -+ __m256 c1, c2, c3; -+ -+ __m256 c256; -+ -+ /******************************* direction 0,1 **********************************/ -+ -+ sp = pk + (*(piup++)); -+ sm = pk + (*(pidn++)); -+ sp2 = pk + (*(piup++)); -+ sm2 = pk + (*(pidn++)); -+ -+ _avx512_load_4_halfspinor_f( a1, a2, a3, -+ &(*sp).c1.c1.re, &(*sm).c1.c1.re, -+ &(*sp2).c1.c1.re, &(*sm2).c1.c1.re ); -+ _avx512_load_4_halfspinor_f_reverse_up( b1, b2, b3, -+ &(*sp).c3.c1.re, &(*sm).c3.c1.re, -+ &(*sp2).c3.c1.re, &(*sm2).c3.c1.re ); -+ -+ sp = pk + (*(piup++)); -+ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); -+ sm = pk + (*(pidn++)); -+ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); -+ sp2 = pk + (*(piup)); -+ _mm_prefetch( (char *) sp2, _MM_HINT_T0 ); -+ sm2 = pk + (*(pidn)); -+ _mm_prefetch( (char *) sm2, _MM_HINT_T0 ); -+ -+ up1 = u; -+ u1 = u+1; -+ up2 = u+2; -+ u2 = u+3; u=u2; -+ _avx512_dirac_combine_f_1( a1, b1 ); -+ _avx512_dirac_combine_f_1( a2, b2 ); -+ _avx512_dirac_combine_f_1( a3, b3 ); -+ -+ avx512_su3_mixed_multiply_8( *up1, *u1, *up2, *u2, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_to_weyl_f_12( c1, b1 ); -+ _avx512_to_weyl_f_12( c2, b2 ); -+ _avx512_to_weyl_f_12( c3, b3 ); -+ -+ /******************************* direction 2,3 *********************************/ -+ -+ _avx512_load_4_halfspinor_f( a1, a2, a3, -+ &(*sp).c1.c1.re,&(*sm).c1.c1.re, -+ &(*sp2).c1.c1.re, &(*sm2).c1.c1.re ); -+ _avx512_load_4_halfspinor_f_reverse_dn( b1, b2, b3, -+ &(*sp).c3.c1.re, &(*sm).c3.c1.re, -+ &(*sp2).c3.c1.re, &(*sm2).c3.c1.re ); -+ -+ _avx512_dirac_combine_f_2( a1, b1 ); -+ _avx512_dirac_combine_f_2( a2, b2 ); -+ _avx512_dirac_combine_f_2( a3, b3 ); -+ -+ up1 = u+1; -+ u1 = u+2; -+ up2 = u+3; -+ u2 = u+4; -+ avx512_su3_mixed_multiply_8( *up1, *u1, *up2, *u2, b1, b2, b3, a1, a2, a3 ); -+ -+ -+ c256 = _mm256_broadcast_ss( &coe ); -+ -+ _avx512_to_weyl_f_34( c1, b1 ); -+ _avx512_to_weyl_f_34( c2, b2 ); -+ _avx512_to_weyl_f_34( c3, b3 ); -+ -+ c1 = 
_mm256_mul_ps( c1, c256); -+ c2 = _mm256_mul_ps( c2, c256); -+ c3 = _mm256_mul_ps( c3, c256); -+ -+ _avx512_write_6_hwv_f( c1, c2, c3, &rs->s.c1.c1.re); -+} -+ -+void deo_avx512(int *piup, int *pidn, su3 *u, spinor *pl, float ceo, spin_t *rs) -+{ -+ spinor *sp, *sm, *sp2, *sm2; -+ su3 *up, *up1, *u1, *up2, *u2; -+ -+ /* 512-bit wide stores for the spinor for each color */ -+ __m512 a1, a2, a3; -+ __m512 b1, b2, b3; -+ __m256 c1, c2, c3; -+ -+ __m256 c256; -+ -+ -+ /******************************* direction 0 *********************************/ -+ -+ sp = pl + (*(piup++)); -+ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); -+ sm = pl + (*(pidn++)); -+ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); -+ sp2 = pl + (*(piup++)); -+ _mm_prefetch( (char *) sp2, _MM_HINT_T0 ); -+ sm2 = pl + (*(pidn++)); -+ _mm_prefetch( (char *) sm2, _MM_HINT_T0 ); -+ -+ _avx512_load_6_hwv_f( c1,c2,c3, &rs->s.c1.c1.re ); -+ -+ c256 = _mm256_broadcast_ss( &ceo ); -+ c1 = _mm256_mul_ps( c1, c256 ); -+ c2 = _mm256_mul_ps( c2, c256 ); -+ c3 = _mm256_mul_ps( c3, c256 ); -+ -+ _avx512_to_dirac_f_1( a1, c1 ); -+ _avx512_to_dirac_f_1( a2, c2 ); -+ _avx512_to_dirac_f_1( a3, c3 ); -+ -+ _avx512_to_dirac_f_2( a1, c1 ); -+ _avx512_to_dirac_f_2( a2, c2 ); -+ _avx512_to_dirac_f_2( a3, c3 ); -+ -+ up1 = u; -+ u1 = u+1; -+ up2 = u+2; -+ u2 = u+3; u=u2; -+ avx512_su3_mixed_multiply_8( *u1, *up1, *u2, *up2, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_load_4_halfspinor_f( a1, a2, a3, -+ &(*sm).c1.c1.re, &(*sp).c1.c1.re, -+ &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); -+ a1 = _mm512_add_ps( a1, b1 ); -+ a2 = _mm512_add_ps( a2, b2 ); -+ a3 = _mm512_add_ps( a3, b3 ); -+ _avx512_write_4_halfspinor_f( a1, a2, a3, -+ &(*sm).c1.c1.re, &(*sp).c1.c1.re, -+ &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); -+ -+ _avx512_load_4_halfspinor_f_reverse_up( a1, a2, a3, -+ &(*sm).c3.c1.re, &(*sp).c3.c1.re, -+ &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); -+ _avx512_dirac_combine_f_3( a1, b1 ); -+ _avx512_dirac_combine_f_3( a2, b2 ); -+ _avx512_dirac_combine_f_3( a3, b3 ); -+ _avx512_write_4_halfspinor_f_reverse_up( a1, a2, a3, -+ &(*sm).c3.c1.re, &(*sp).c3.c1.re, -+ &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); -+ -+ /******************************* direction 2 *********************************/ -+ -+ sp = pl + (*(piup++)); -+ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); -+ sm = pl + (*(pidn++)); -+ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); -+ sp2 = pl + (*(piup++)); -+ _mm_prefetch( (char *) sp2, _MM_HINT_T0 ); -+ sm2 = pl + (*(pidn++)); -+ _mm_prefetch( (char *) sm2, _MM_HINT_T0 ); -+ -+ _avx512_to_dirac_f_3( a1, c1 ); -+ _avx512_to_dirac_f_3( a2, c2 ); -+ _avx512_to_dirac_f_3( a3, c3 ); -+ -+ up1 = u+1; -+ u1 = u+2; -+ up2 = u+3; -+ u2 = u+4; -+ avx512_su3_mixed_multiply_8( *u1, *up1, *u2, *up2, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_load_4_halfspinor_f( a1, a2, a3, &(*sm).c1.c1.re, &(*sp).c1.c1.re, &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); -+ a1 = _mm512_add_ps( a1, b1 ); -+ a2 = _mm512_add_ps( a2, b2 ); -+ a3 = _mm512_add_ps( a3, b3 ); -+ _avx512_write_4_halfspinor_f( a1, a2, a3, &(*sm).c1.c1.re, &(*sp).c1.c1.re, &(*sm2).c1.c1.re, &(*sp2).c1.c1.re ); -+ -+ _avx512_load_4_halfspinor_f_reverse_dn( a1, a2, a3, &(*sm).c3.c1.re, &(*sp).c3.c1.re, &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); -+ _avx512_dirac_combine_f_4( a1, b1 ); -+ _avx512_dirac_combine_f_4( a2, b2 ); -+ _avx512_dirac_combine_f_4( a3, b3 ); -+ _avx512_write_4_halfspinor_f_reverse_dn( a1, a2, a3, &(*sm).c3.c1.re, &(*sp).c3.c1.re, &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); -+} -diff --git a/modules/dirac/avx512/Dw_avx512_asm.s 
b/modules/dirac/avx512/Dw_avx512_asm.s -new file mode 100644 -index 0000000..4ccc5db ---- /dev/null -+++ b/modules/dirac/avx512/Dw_avx512_asm.s -@@ -0,0 +1,1064 @@ -+# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -+# mark_description "-I../../../include -I.. -I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -+# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -+# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -+# mark_description "es -S"; -+ .file "Dw_avx512.c" -+ .text -+..TXTST0: -+# -- Begin doe_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl doe_avx512 -+# --- doe_avx512(int *, int *, su3 *, spinor *, float, spin_t *) -+doe_avx512: -+# parameter 1: %rdi -+# parameter 2: %rsi -+# parameter 3: %rdx -+# parameter 4: %rcx -+# parameter 5: %xmm0 -+# parameter 6: %r8 -+..B1.1: # Preds ..B1.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_doe_avx512.1: -+..L2: -+ #26.1 -+ pushq %rbx #26.1 -+ .cfi_def_cfa_offset 16 -+ movq %rsp, %rbx #26.1 -+ .cfi_def_cfa 3, 16 -+ .cfi_offset 3, -16 -+ andq $-64, %rsp #26.1 -+ pushq %rbp #26.1 -+ pushq %rbp #26.1 -+ movq 8(%rbx), %rbp #26.1 -+ movq %rbp, 8(%rsp) #26.1 -+ movq %rsp, %rbp #26.1 -+ .cfi_escape 0x10, 0x06, 0x02, 0x76, 0x00 -+ movslq (%rdi), %rax #39.16 -+ movslq (%rsi), %r11 #40.16 -+ movslq 4(%rdi), %r10 #41.17 -+ vmovss %xmm0, -16(%rbp) #26.1 -+ lea (%rax,%rax,2), %r9 #39.8 -+ shlq $5, %r9 #39.8 -+ lea (%r11,%r11,2), %r11 #40.8 -+ shlq $5, %r11 #40.8 -+ lea (%r10,%r10,2), %r10 #41.9 -+ movslq 4(%rsi), %rax #42.17 -+ shlq $5, %r10 #41.9 -+ vmovups (%rcx,%r9), %xmm29 #44.3 -+ vmovups 16(%rcx,%r9), %xmm10 #44.3 -+ vmovups 32(%rcx,%r9), %xmm8 #44.3 -+ vmovups 48(%rcx,%r9), %xmm14 #47.3 -+ vmovups 64(%rcx,%r9), %xmm12 #47.3 -+ vmovups 80(%rcx,%r9), %xmm26 #47.3 -+ vmovups .L_2il0floatpacket.10(%rip), %zmm11 #47.3 -+ vmovups .L_2il0floatpacket.11(%rip), %zmm6 #47.3 -+ movslq 8(%rdi), %r9 #51.16 -+ lea (%rax,%rax,2), %rax #42.9 -+ shlq $5, %rax #42.9 -+ vmovaps %zmm11, %zmm25 #47.3 -+ lea (%r9,%r9,2), %r9 #51.8 -+ shlq $5, %r9 #51.8 -+ vinsertf32x4 $1, (%rcx,%r11), %zmm29, %zmm21 #44.3 -+ vinsertf32x4 $2, (%rcx,%r10), %zmm21, %zmm22 #44.3 -+ vinsertf32x4 $3, (%rcx,%rax), %zmm22, %zmm19 #44.3 -+ vinsertf32x4 $1, 16(%rcx,%r11), %zmm10, %zmm16 #44.3 -+ vinsertf32x4 $1, 32(%rcx,%r11), %zmm8, %zmm9 #44.3 -+ vinsertf32x4 $1, 48(%rcx,%r11), %zmm14, %zmm0 #47.3 -+ vinsertf32x4 $1, 64(%rcx,%r11), %zmm12, %zmm7 #47.3 -+ vinsertf32x4 $1, 80(%rcx,%r11), %zmm26, %zmm29 #47.3 -+ vinsertf32x4 $2, 16(%rcx,%r10), %zmm16, %zmm17 #44.3 -+ vinsertf32x4 $2, 32(%rcx,%r10), %zmm9, %zmm18 #44.3 -+ vinsertf32x4 $2, 48(%rcx,%r10), %zmm0, %zmm15 #47.3 -+ vinsertf32x4 $2, 64(%rcx,%r10), %zmm7, %zmm24 #47.3 -+ vinsertf32x4 $2, 80(%rcx,%r10), %zmm29, %zmm5 #47.3 -+ vinsertf32x4 $3, 16(%rcx,%rax), %zmm17, %zmm20 #44.3 -+ vinsertf32x4 $3, 32(%rcx,%rax), %zmm18, %zmm13 #44.3 -+ vinsertf32x4 $3, 48(%rcx,%rax), %zmm15, %zmm22 #47.3 -+ vinsertf32x4 $3, 64(%rcx,%rax), %zmm24, %zmm16 #47.3 -+ vinsertf32x4 $3, 80(%rcx,%rax), %zmm5, %zmm28 #47.3 -+ vshufps $228, %zmm20, %zmm19, %zmm27 #44.3 -+ vshufps $78, %zmm13, %zmm19, %zmm3 #44.3 -+ vshufps $228, %zmm13, %zmm20, %zmm4 #44.3 -+ vpermi2ps %zmm16, %zmm22, %zmm25 #47.3 -+ vpermt2ps %zmm28, %zmm6, %zmm22 #47.3 -+ vpermt2ps %zmm28, 
%zmm11, %zmm16 #47.3 -+ prefetcht0 (%rcx,%r9) #52.3 -+ movslq 8(%rsi), %r10 #53.16 -+ lea (%r10,%r10,2), %rax #53.8 -+ movl $23055, %r10d #64.3 -+ shlq $5, %rax #53.8 -+ kmovw %r10d, %k1 #64.3 -+ movl $42480, %r10d #64.3 -+ kmovw %r10d, %k2 #64.3 -+ movl $38595, %r10d #83.3 -+ kmovw %r10d, %k3 #83.3 -+ movl $26940, %r10d #83.3 -+ kmovw %r10d, %k4 #83.3 -+ prefetcht0 (%rcx,%rax) #54.3 -+ movslq 12(%rdi), %rdi #55.16 -+ lea (%rdi,%rdi,2), %r10 #55.9 -+ shlq $5, %r10 #55.9 -+ prefetcht0 (%rcx,%r10) #56.3 -+ movslq 12(%rsi), %rsi #57.16 -+ lea (%rsi,%rsi,2), %rdi #57.9 -+ shlq $5, %rdi #57.9 -+ prefetcht0 (%rcx,%rdi) #58.3 -+ vmovups .L_2il0floatpacket.12(%rip), %zmm11 #64.3 -+ vmovups (%rdx), %zmm31 #68.3 -+ vmovups 144(%rdx), %zmm0 #68.3 -+ vmovups .L_2il0floatpacket.15(%rip), %zmm12 #68.3 -+ vmovups .L_2il0floatpacket.14(%rip), %zmm7 #68.3 -+ vmovups .L_2il0floatpacket.18(%rip), %zmm15 #68.3 -+ vmovups .L_2il0floatpacket.17(%rip), %zmm14 #68.3 -+ vmovups .L_2il0floatpacket.16(%rip), %zmm13 #68.3 -+ vmovups 64(%rdx), %zmm5 #68.3 -+ vmovups 208(%rdx), %zmm6 #68.3 -+ vpermps %zmm22, %zmm11, %zmm10 #65.3 -+ vpermps %zmm25, %zmm11, %zmm21 #64.3 -+ vpermps %zmm16, %zmm11, %zmm17 #66.3 -+ vaddps %zmm10, %zmm3, %zmm3{%k1} #65.3 -+ vaddps %zmm21, %zmm27, %zmm27{%k1} #64.3 -+ vaddps %zmm17, %zmm4, %zmm4{%k1} #66.3 -+ vsubps %zmm10, %zmm3, %zmm3{%k2} #65.3 -+ vsubps %zmm21, %zmm27, %zmm27{%k2} #64.3 -+ vsubps %zmm17, %zmm4, %zmm4{%k2} #66.3 -+ vmovups .L_2il0floatpacket.13(%rip), %zmm10 #68.3 -+ vmovups .L_2il0floatpacket.20(%rip), %zmm17 #68.3 -+ vmovups .L_2il0floatpacket.19(%rip), %zmm16 #68.3 -+ vmovups .L_2il0floatpacket.27(%rip), %zmm22 #68.3 -+ vmovaps %zmm31, %zmm26 #68.3 -+ vpermt2ps 72(%rdx), %zmm10, %zmm26 #68.3 -+ vmovaps %zmm0, %zmm24 #68.3 -+ vpermt2ps 216(%rdx), %zmm10, %zmm24 #68.3 -+ vmovaps %zmm26, %zmm9 #68.3 -+ vpermt2ps %zmm24, %zmm12, %zmm9 #68.3 -+ vmovaps %zmm26, %zmm8 #68.3 -+ vpermt2ps %zmm24, %zmm7, %zmm8 #68.3 -+ vmulps %zmm9, %zmm3, %zmm28 #68.3 -+ vmulps %zmm8, %zmm27, %zmm25 #68.3 -+ vmovups .L_2il0floatpacket.23(%rip), %zmm9 #68.3 -+ vmovups .L_2il0floatpacket.21(%rip), %zmm8 #68.3 -+ vpermt2ps 72(%rdx), %zmm9, %zmm31 #68.3 -+ vpermt2ps 216(%rdx), %zmm9, %zmm0 #68.3 -+ vpermilps $177, %zmm3, %zmm2 #68.3 -+ vmulps %zmm2, %zmm15, %zmm1 #68.3 -+ vpermilps $177, %zmm27, %zmm20 #68.3 -+ vmovaps %zmm26, %zmm19 #68.3 -+ vmulps %zmm15, %zmm20, %zmm30 #68.3 -+ vmovups .L_2il0floatpacket.25(%rip), %zmm20 #68.3 -+ vpermt2ps %zmm24, %zmm14, %zmm19 #68.3 -+ vmovaps %zmm26, %zmm18 #68.3 -+ vpermt2ps %zmm24, %zmm13, %zmm18 #68.3 -+ vfmadd231ps %zmm27, %zmm19, %zmm28 #68.3 -+ vmovups .L_2il0floatpacket.24(%rip), %zmm19 #68.3 -+ vfmadd231ps %zmm3, %zmm18, %zmm25 #68.3 -+ vmovups .L_2il0floatpacket.22(%rip), %zmm18 #68.3 -+ vmovaps %zmm26, %zmm29 #68.3 -+ vpermt2ps %zmm24, %zmm17, %zmm29 #68.3 -+ vmovaps %zmm26, %zmm23 #68.3 -+ vpermt2ps %zmm24, %zmm16, %zmm23 #68.3 -+ vfmadd231ps %zmm1, %zmm29, %zmm28 #68.3 -+ vfmadd231ps %zmm30, %zmm23, %zmm25 #68.3 -+ vmovaps %zmm26, %zmm21 #68.3 -+ vpermt2ps %zmm24, %zmm18, %zmm26 #68.3 -+ vpermt2ps %zmm24, %zmm8, %zmm21 #68.3 -+ vfmadd231ps %zmm30, %zmm26, %zmm28 #68.3 -+ vfmadd231ps %zmm1, %zmm21, %zmm25 #68.3 -+ vmovups .L_2il0floatpacket.26(%rip), %zmm21 #68.3 -+ vmovaps %zmm31, %zmm26 #68.3 -+ vpermt2ps %zmm0, %zmm20, %zmm26 #68.3 -+ vmulps %zmm26, %zmm27, %zmm26 #68.3 -+ vmovaps %zmm31, %zmm27 #68.3 -+ vmovaps %zmm31, %zmm24 #68.3 -+ vpermt2ps %zmm0, %zmm21, %zmm27 #68.3 -+ vpermt2ps %zmm0, %zmm19, %zmm24 #68.3 -+ vfmadd231ps %zmm4, %zmm27, 
%zmm28 #68.3 -+ vfmadd231ps %zmm4, %zmm24, %zmm25 #68.3 -+ vpermilps $177, %zmm4, %zmm27 #68.3 -+ vmovaps %zmm31, %zmm24 #68.3 -+ vmulps %zmm27, %zmm15, %zmm23 #68.3 -+ vmovups .L_2il0floatpacket.30(%rip), %zmm27 #68.3 -+ vpermt2ps %zmm0, %zmm22, %zmm24 #68.3 -+ vfmadd213ps %zmm26, %zmm24, %zmm3 #68.3 -+ vmovups .L_2il0floatpacket.28(%rip), %zmm24 #68.3 -+ vmovups .L_2il0floatpacket.29(%rip), %zmm26 #68.3 -+ vmovaps %zmm31, %zmm29 #68.3 -+ vmovaps %zmm31, %zmm2 #68.3 -+ vpermt2ps %zmm0, %zmm24, %zmm29 #68.3 -+ vpermt2ps %zmm0, %zmm26, %zmm2 #68.3 -+ vfmadd231ps %zmm23, %zmm29, %zmm25 #68.3 -+ vmovups .L_2il0floatpacket.31(%rip), %zmm29 #68.3 -+ vfmadd213ps %zmm3, %zmm2, %zmm30 #68.3 -+ vmovups .L_2il0floatpacket.32(%rip), %zmm2 #68.3 -+ vmovaps %zmm31, %zmm3 #68.3 -+ vpermt2ps %zmm0, %zmm27, %zmm3 #68.3 -+ vpermt2ps %zmm0, %zmm29, %zmm31 #68.3 -+ vpermt2ps 136(%rdx), %zmm2, %zmm5 #68.3 -+ vpermt2ps 280(%rdx), %zmm2, %zmm6 #68.3 -+ vfmadd231ps %zmm23, %zmm3, %zmm28 #68.3 -+ vmovups .L_2il0floatpacket.33(%rip), %zmm3 #68.3 -+ vfmadd213ps %zmm30, %zmm31, %zmm1 #68.3 -+ vmovups .L_2il0floatpacket.35(%rip), %ymm0 #70.3 -+ vmovaps %zmm5, %zmm31 #68.3 -+ vpermt2ps %zmm6, %zmm3, %zmm31 #68.3 -+ vfmadd213ps %zmm1, %zmm31, %zmm4 #68.3 -+ vmovups .L_2il0floatpacket.34(%rip), %zmm1 #68.3 -+ vpermt2ps %zmm6, %zmm1, %zmm5 #68.3 -+ vfmadd213ps %zmm4, %zmm5, %zmm23 #68.3 -+ vmovups .L_2il0floatpacket.36(%rip), %ymm4 #70.3 -+ vextractf64x4 $1, %zmm25, %ymm5 #70.3 -+ vmovaps %zmm25, %zmm6 #70.3 -+ vpermps %ymm5, %ymm0, %ymm25 #70.3 -+ vpermps %ymm6, %ymm0, %ymm30 #70.3 -+ vfmadd213ps %ymm5, %ymm4, %ymm25 #70.3 -+ vmovups .L_2il0floatpacket.37(%rip), %ymm5 #70.3 -+ vfmadd213ps %ymm30, %ymm4, %ymm6 #70.3 -+ vmovups .L_2il0floatpacket.38(%rip), %ymm30 #70.3 -+ vpermilps %ymm5, %ymm25, %ymm31 #70.3 -+ vfmadd213ps %ymm6, %ymm30, %ymm31 #70.3 -+ vmovups %ymm31, -112(%rbp) #70.3[spill] -+ vmovaps %zmm28, %zmm31 #71.3 -+ vextractf64x4 $1, %zmm28, %ymm28 #71.3 -+ vpermps %ymm31, %ymm0, %ymm6 #71.3 -+ vfmadd213ps %ymm6, %ymm4, %ymm31 #71.3 -+ vpermps %ymm28, %ymm0, %ymm6 #71.3 -+ vfmadd213ps %ymm28, %ymm4, %ymm6 #71.3 -+ vpermilps %ymm5, %ymm6, %ymm25 #71.3 -+ vfmadd213ps %ymm31, %ymm30, %ymm25 #71.3 -+ vmovups %ymm25, -80(%rbp) #71.3[spill] -+ vmovaps %zmm23, %zmm25 #72.3 -+ vextractf64x4 $1, %zmm23, %ymm23 #72.3 -+ vpermps %ymm23, %ymm0, %ymm31 #72.3 -+ vpermps %ymm25, %ymm0, %ymm28 #72.3 -+ vfmadd213ps %ymm23, %ymm4, %ymm31 #72.3 -+ vfmadd213ps %ymm28, %ymm4, %ymm25 #72.3 -+ vpermilps %ymm5, %ymm31, %ymm4 #72.3 -+ vfmadd213ps %ymm25, %ymm30, %ymm4 #72.3 -+ vmovups 16(%rcx,%r9), %xmm25 #76.3 -+ vmovups (%rcx,%r9), %xmm5 #76.3 -+ vmovups 32(%rcx,%r9), %xmm30 #76.3 -+ vmovups %ymm4, -48(%rbp) #72.3[spill] -+ vinsertf32x4 $1, 16(%rcx,%rax), %zmm25, %zmm23 #76.3 -+ vinsertf32x4 $2, 16(%rcx,%r10), %zmm23, %zmm31 #76.3 -+ vinsertf32x4 $1, (%rcx,%rax), %zmm5, %zmm6 #76.3 -+ vinsertf32x4 $2, (%rcx,%r10), %zmm6, %zmm28 #76.3 -+ vinsertf32x4 $3, 16(%rcx,%rdi), %zmm31, %zmm6 #76.3 -+ vmovups 48(%rcx,%r9), %xmm31 #79.3 -+ vinsertf32x4 $3, (%rcx,%rdi), %zmm28, %zmm5 #76.3 -+ vshufps $228, %zmm6, %zmm5, %zmm23 #76.3 -+ vinsertf32x4 $1, 32(%rcx,%rax), %zmm30, %zmm4 #76.3 -+ vinsertf32x4 $2, 32(%rcx,%r10), %zmm4, %zmm28 #76.3 -+ vinsertf32x4 $3, 32(%rcx,%rdi), %zmm28, %zmm25 #76.3 -+ vshufps $78, %zmm25, %zmm5, %zmm5 #76.3 -+ vshufps $228, %zmm25, %zmm6, %zmm6 #76.3 -+ vmovups 64(%rcx,%r9), %xmm25 #79.3 -+ vinsertf32x4 $1, 48(%rcx,%rax), %zmm31, %zmm30 #79.3 -+ vinsertf32x4 $2, 48(%rcx,%r10), %zmm30, %zmm4 #79.3 -+ vinsertf32x4 
$3, 48(%rcx,%rdi), %zmm4, %zmm28 #79.3 -+ vmovups 80(%rcx,%r9), %xmm4 #79.3 -+ vinsertf32x4 $1, 64(%rcx,%rax), %zmm25, %zmm31 #79.3 -+ vinsertf32x4 $2, 64(%rcx,%r10), %zmm31, %zmm30 #79.3 -+ vinsertf32x4 $3, 64(%rcx,%rdi), %zmm30, %zmm30 #79.3 -+ vinsertf32x4 $1, 80(%rcx,%rax), %zmm4, %zmm25 #79.3 -+ vinsertf32x4 $2, 80(%rcx,%r10), %zmm25, %zmm31 #79.3 -+ vmovups .L_2il0floatpacket.39(%rip), %zmm25 #79.3 -+ vinsertf32x4 $3, 80(%rcx,%rdi), %zmm31, %zmm31 #79.3 -+ vmovaps %zmm28, %zmm4 #79.3 -+ vpermt2ps %zmm30, %zmm25, %zmm4 #79.3 -+ vpermt2ps %zmm31, %zmm25, %zmm30 #79.3 -+ vpermps %zmm4, %zmm11, %zmm25 #83.3 -+ vpermps %zmm30, %zmm11, %zmm30 #85.3 -+ vmovups 496(%rdx), %zmm4 #91.3 -+ vaddps %zmm25, %zmm23, %zmm23{%k3} #83.3 -+ vaddps %zmm30, %zmm6, %zmm6{%k3} #85.3 -+ vpermt2ps 568(%rdx), %zmm2, %zmm4 #91.3 -+ vsubps %zmm25, %zmm23, %zmm23{%k4} #83.3 -+ vsubps %zmm30, %zmm6, %zmm6{%k4} #85.3 -+ vmovups .L_2il0floatpacket.40(%rip), %zmm25 #79.3 -+ vpermt2ps %zmm31, %zmm25, %zmm28 #79.3 -+ vmovups 288(%rdx), %zmm25 #91.3 -+ vpermps %zmm28, %zmm11, %zmm11 #84.3 -+ vmovups 352(%rdx), %zmm28 #91.3 -+ vaddps %zmm11, %zmm5, %zmm5{%k3} #84.3 -+ vpermt2ps 424(%rdx), %zmm2, %zmm28 #91.3 -+ vsubps %zmm11, %zmm5, %zmm5{%k4} #84.3 -+ vmovups 432(%rdx), %zmm11 #91.3 -+ vpermi2ps %zmm4, %zmm28, %zmm3 #91.3 -+ vpermt2ps %zmm4, %zmm1, %zmm28 #91.3 -+ vmovaps %zmm25, %zmm2 #91.3 -+ vpermt2ps 360(%rdx), %zmm10, %zmm2 #91.3 -+ vpermi2ps 504(%rdx), %zmm11, %zmm10 #91.3 -+ vpermt2ps 504(%rdx), %zmm9, %zmm11 #91.3 -+ vpermt2ps 360(%rdx), %zmm9, %zmm25 #91.3 -+ vpermi2ps %zmm10, %zmm2, %zmm7 #91.3 -+ vpermi2ps %zmm10, %zmm2, %zmm12 #91.3 -+ vpermi2ps %zmm10, %zmm2, %zmm13 #91.3 -+ vpermi2ps %zmm10, %zmm2, %zmm14 #91.3 -+ vpermi2ps %zmm10, %zmm2, %zmm17 #91.3 -+ vpermi2ps %zmm10, %zmm2, %zmm16 #91.3 -+ vpermi2ps %zmm10, %zmm2, %zmm8 #91.3 -+ vpermt2ps %zmm10, %zmm18, %zmm2 #91.3 -+ vpermi2ps %zmm11, %zmm25, %zmm20 #91.3 -+ vpermi2ps %zmm11, %zmm25, %zmm22 #91.3 -+ vpermi2ps %zmm11, %zmm25, %zmm26 #91.3 -+ vpermi2ps %zmm11, %zmm25, %zmm19 #91.3 -+ vpermi2ps %zmm11, %zmm25, %zmm21 #91.3 -+ vpermi2ps %zmm11, %zmm25, %zmm24 #91.3 -+ vpermi2ps %zmm11, %zmm25, %zmm27 #91.3 -+ vpermt2ps %zmm11, %zmm29, %zmm25 #91.3 -+ vmulps %zmm7, %zmm23, %zmm7 #91.3 -+ vmulps %zmm12, %zmm5, %zmm12 #91.3 -+ vmovups .L_2il0floatpacket.42(%rip), %ymm18 #96.3 -+ vfmadd231ps %zmm5, %zmm13, %zmm7 #91.3 -+ vfmadd231ps %zmm23, %zmm14, %zmm12 #91.3 -+ vpermilps $177, %zmm5, %zmm13 #91.3 -+ vmulps %zmm13, %zmm15, %zmm4 #91.3 -+ vmovups .L_2il0floatpacket.41(%rip), %ymm13 #96.3 -+ vfmadd231ps %zmm4, %zmm17, %zmm12 #91.3 -+ vpermilps $177, %zmm23, %zmm1 #91.3 -+ vmulps %zmm15, %zmm1, %zmm1 #91.3 -+ vfmadd231ps %zmm1, %zmm2, %zmm12 #91.3 -+ vfmadd231ps %zmm1, %zmm16, %zmm7 #91.3 -+ vmulps %zmm20, %zmm23, %zmm2 #91.3 -+ vfmadd231ps %zmm4, %zmm8, %zmm7 #91.3 -+ vfmadd231ps %zmm6, %zmm21, %zmm12 #91.3 -+ vmovups .L_2il0floatpacket.43(%rip), %ymm21 #96.3 -+ vfmadd213ps %zmm2, %zmm22, %zmm5 #91.3 -+ vfmadd231ps %zmm6, %zmm19, %zmm7 #91.3 -+ vmovups .L_2il0floatpacket.36(%rip), %ymm19 #96.3 -+ vmovups .L_2il0floatpacket.44(%rip), %ymm22 #96.3 -+ vfmadd213ps %zmm5, %zmm26, %zmm1 #91.3 -+ vbroadcastss -16(%rbp), %ymm26 #94.10 -+ vfmadd213ps %zmm1, %zmm25, %zmm4 #91.3 -+ vpermilps $177, %zmm6, %zmm8 #91.3 -+ vmulps %zmm8, %zmm15, %zmm15 #91.3 -+ vfmadd213ps %zmm4, %zmm3, %zmm6 #91.3 -+ vfmadd231ps %zmm15, %zmm24, %zmm7 #91.3 -+ vfmadd231ps %zmm15, %zmm27, %zmm12 #91.3 -+ vfmadd213ps %zmm6, %zmm28, %zmm15 #91.3 -+ vpermps %ymm7, %ymm0, %ymm3 #96.3 -+ 
vpermps %ymm12, %ymm0, %ymm10 #97.3 -+ vpermps %ymm15, %ymm0, %ymm17 #98.3 -+ vfmadd213ps %ymm7, %ymm19, %ymm3 #96.3 -+ vfmadd213ps %ymm12, %ymm19, %ymm10 #97.3 -+ vfmadd213ps %ymm15, %ymm19, %ymm17 #98.3 -+ vextractf64x4 $1, %zmm7, %ymm5 #96.3 -+ vextractf64x4 $1, %zmm12, %ymm11 #97.3 -+ vextractf64x4 $1, %zmm15, %ymm20 #98.3 -+ vpermps %ymm5, %ymm0, %ymm6 #96.3 -+ vpermps %ymm11, %ymm0, %ymm14 #97.3 -+ vpermps %ymm20, %ymm0, %ymm0 #98.3 -+ vpermilps %ymm13, %ymm3, %ymm9 #96.3 -+ vfmadd213ps %ymm5, %ymm19, %ymm6 #96.3 -+ vfmadd213ps %ymm11, %ymm19, %ymm14 #97.3 -+ vfmadd213ps %ymm20, %ymm19, %ymm0 #98.3 -+ vfmadd213ps -112(%rbp), %ymm18, %ymm9 #96.3[spill] -+ vpermilps %ymm13, %ymm10, %ymm16 #97.3 -+ vpermilps %ymm13, %ymm17, %ymm23 #98.3 -+ vfmadd213ps -80(%rbp), %ymm18, %ymm16 #97.3[spill] -+ vfmadd213ps -48(%rbp), %ymm18, %ymm23 #98.3[spill] -+ vpermilps %ymm21, %ymm6, %ymm24 #96.3 -+ vpermilps %ymm21, %ymm14, %ymm25 #97.3 -+ vpermilps %ymm21, %ymm0, %ymm27 #98.3 -+ vfmadd213ps %ymm9, %ymm22, %ymm24 #96.3 -+ vfmadd213ps %ymm16, %ymm22, %ymm25 #97.3 -+ vfmadd213ps %ymm23, %ymm22, %ymm27 #98.3 -+ vmulps %ymm26, %ymm24, %ymm29 #100.8 -+ vmulps %ymm25, %ymm26, %ymm31 #101.8 -+ vmulps %ymm27, %ymm26, %ymm0 #102.8 -+ vshufps $68, %ymm31, %ymm29, %ymm28 #104.3 -+ vshufps $228, %ymm29, %ymm0, %ymm30 #104.3 -+ vshufps $238, %ymm0, %ymm31, %ymm1 #104.3 -+ vmovups %xmm28, (%r8) #104.3 -+ vmovups %xmm30, 16(%r8) #104.3 -+ vmovups %xmm1, 32(%r8) #104.3 -+ vextractf32x4 $1, %ymm28, 48(%r8) #104.3 -+ vextractf32x4 $1, %ymm30, 64(%r8) #104.3 -+ vextractf128 $1, %ymm1, 80(%r8) #104.3 -+ vzeroupper #105.1 -+ movq %rbp, %rsp #105.1 -+ popq %rbp #105.1 -+ .cfi_restore 6 -+ movq %rbx, %rsp #105.1 -+ popq %rbx #105.1 -+ .cfi_def_cfa 7, 8 -+ .cfi_restore 3 -+ ret #105.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type doe_avx512,@function -+ .size doe_avx512,.-doe_avx512 -+ .data -+# -- End doe_avx512 -+ .text -+# -- Begin deo_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl deo_avx512 -+# --- deo_avx512(int *, int *, su3 *, spinor *, float, spin_t *) -+deo_avx512: -+# parameter 1: %rdi -+# parameter 2: %rsi -+# parameter 3: %rdx -+# parameter 4: %rcx -+# parameter 5: %xmm0 -+# parameter 6: %r8 -+..B2.1: # Preds ..B2.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_deo_avx512.11: -+..L12: -+ #108.1 -+ pushq %rbp #108.1 -+ .cfi_def_cfa_offset 16 -+ movq %rsp, %rbp #108.1 -+ .cfi_def_cfa 6, 16 -+ .cfi_offset 6, -16 -+ movslq (%rdi), %rax #122.16 -+ vmovss %xmm0, -16(%rbp) #108.1 -+ lea (%rax,%rax,2), %r9 #122.8 -+ shlq $5, %r9 #122.8 -+ prefetcht0 (%rcx,%r9) #123.3 -+ movl $42255, %r10d #165.3 -+ movslq (%rsi), %r11 #124.16 -+ kmovw %r10d, %k1 #165.3 -+ movl $23280, %r10d #165.3 -+ kmovw %r10d, %k2 #165.3 -+ movl $38595, %r10d #183.3 -+ lea (%r11,%r11,2), %rax #124.8 -+ shlq $5, %rax #124.8 -+ kmovw %r10d, %k3 #183.3 -+ movl $26940, %r10d #183.3 -+ kmovw %r10d, %k4 #183.3 -+ prefetcht0 (%rcx,%rax) #125.3 -+ movslq 4(%rdi), %r10 #126.17 -+ lea (%r10,%r10,2), %r11 #126.9 -+ shlq $5, %r11 #126.9 -+ prefetcht0 (%rcx,%r11) #127.3 -+ movslq 4(%rsi), %r10 #128.17 -+ lea (%r10,%r10,2), %r10 #128.9 -+ shlq $5, %r10 #128.9 -+ prefetcht0 (%rcx,%r10) #129.3 -+ vmovups (%r8), %xmm2 #131.3 -+ vmovups 16(%r8), %xmm6 #131.3 -+ vmovups 32(%r8), %xmm4 #131.3 -+ vbroadcastss -16(%rbp), %ymm27 #133.10 -+ vmovups .L_2il0floatpacket.35(%rip), %ymm23 #138.3 -+ vmovups .L_2il0floatpacket.36(%rip), %ymm28 #138.3 -+ vmovups .L_2il0floatpacket.47(%rip), %ymm18 #142.3 -+ vmovups 216(%rdx), 
%zmm9 #150.3 -+ vmovups .L_2il0floatpacket.15(%rip), %zmm26 #150.3 -+ vmovups .L_2il0floatpacket.16(%rip), %zmm8 #150.3 -+ vinsertf128 $1, 48(%r8), %ymm2, %ymm15 #131.3 -+ vinsertf128 $1, 64(%r8), %ymm6, %ymm13 #131.3 -+ vshufps $228, %ymm13, %ymm15, %ymm21 #131.3 -+ vmulps %ymm27, %ymm21, %ymm20 #134.8 -+ vpermps %ymm20, %ymm23, %ymm24 #138.3 -+ vfmadd231ps %ymm20, %ymm28, %ymm24 #138.3 -+ vinsertf128 $1, 80(%r8), %ymm4, %ymm1 #131.3 -+ vshufps $78, %ymm1, %ymm15, %ymm30 #131.3 -+ vshufps $228, %ymm1, %ymm13, %ymm31 #131.3 -+ vmovups .L_2il0floatpacket.45(%rip), %ymm13 #142.3 -+ vmovups .L_2il0floatpacket.46(%rip), %ymm1 #142.3 -+ vmulps %ymm30, %ymm27, %ymm25 #135.8 -+ vmulps %ymm31, %ymm27, %ymm14 #136.8 -+ vpermps %ymm20, %ymm13, %ymm15 #142.3 -+ vpermps %ymm20, %ymm1, %ymm21 #142.3 -+ vfmadd213ps %ymm21, %ymm18, %ymm15 #142.3 -+ vmovups .L_2il0floatpacket.14(%rip), %zmm21 #150.3 -+ vpermps %ymm25, %ymm1, %ymm16 #143.3 -+ vpermps %ymm14, %ymm1, %ymm12 #144.3 -+ vmovaps %zmm9, %zmm1 #150.3 -+ vpermps %ymm14, %ymm23, %ymm29 #140.3 -+ vpermps %ymm14, %ymm13, %ymm30 #144.3 -+ vfmadd231ps %ymm14, %ymm28, %ymm29 #140.3 -+ vfmadd213ps %ymm12, %ymm18, %ymm30 #144.3 -+ vpermps %ymm25, %ymm23, %ymm22 #139.3 -+ vpermps %ymm25, %ymm13, %ymm17 #143.3 -+ vfmadd231ps %ymm25, %ymm28, %ymm22 #139.3 -+ vfmadd213ps %ymm16, %ymm18, %ymm17 #143.3 -+ vmovups .L_2il0floatpacket.25(%rip), %zmm28 #150.3 -+ vmovups 280(%rdx), %zmm13 #150.3 -+ movslq 8(%rdi), %r8 #174.16 -+ vinsertf64x4 $1, %ymm15, %zmm24, %zmm10 #142.3 -+ lea (%r8,%r8,2), %r8 #174.8 -+ vmovups 72(%rdx), %zmm15 #150.3 -+ vmovups .L_2il0floatpacket.13(%rip), %zmm24 #150.3 -+ vmovaps %zmm15, %zmm7 #150.3 -+ vpermt2ps (%rdx), %zmm24, %zmm7 #150.3 -+ vpermt2ps 144(%rdx), %zmm24, %zmm1 #150.3 -+ vmovaps %zmm7, %zmm5 #150.3 -+ vpermt2ps %zmm1, %zmm21, %zmm5 #150.3 -+ vmulps %zmm5, %zmm10, %zmm18 #150.3 -+ vpermilps $177, %zmm10, %zmm31 #150.3 -+ vmovaps %zmm7, %zmm0 #150.3 -+ vmovaps %zmm7, %zmm11 #150.3 -+ vpermt2ps %zmm1, %zmm26, %zmm0 #150.3 -+ vpermt2ps %zmm1, %zmm8, %zmm11 #150.3 -+ vmovaps %zmm7, %zmm6 #150.3 -+ vmovaps %zmm7, %zmm23 #150.3 -+ vmovaps %zmm7, %zmm27 #150.3 -+ vmovaps %zmm7, %zmm4 #150.3 -+ shlq $5, %r8 #174.8 -+ vinsertf64x4 $1, %ymm30, %zmm29, %zmm12 #144.3 -+ vmovups .L_2il0floatpacket.18(%rip), %zmm30 #150.3 -+ vmovups .L_2il0floatpacket.17(%rip), %zmm29 #150.3 -+ vmulps %zmm30, %zmm31, %zmm5 #150.3 -+ vmovups .L_2il0floatpacket.27(%rip), %zmm31 #150.3 -+ vpermt2ps %zmm1, %zmm29, %zmm27 #150.3 -+ vinsertf64x4 $1, %ymm17, %zmm22, %zmm3 #143.3 -+ vmovups .L_2il0floatpacket.21(%rip), %zmm22 #150.3 -+ vmovups 136(%rdx), %zmm17 #150.3 -+ vfmadd231ps %zmm3, %zmm11, %zmm18 #150.3 -+ vmulps %zmm0, %zmm3, %zmm19 #150.3 -+ vpermt2ps %zmm1, %zmm22, %zmm23 #150.3 -+ vfmadd231ps %zmm10, %zmm27, %zmm19 #150.3 -+ vmovups .L_2il0floatpacket.26(%rip), %zmm27 #150.3 -+ vpermilps $177, %zmm3, %zmm2 #150.3 -+ vmulps %zmm2, %zmm30, %zmm16 #150.3 -+ vmovups .L_2il0floatpacket.19(%rip), %zmm2 #150.3 -+ vpermt2ps %zmm1, %zmm2, %zmm6 #150.3 -+ vfmadd231ps %zmm5, %zmm6, %zmm18 #150.3 -+ vmovups .L_2il0floatpacket.20(%rip), %zmm6 #150.3 -+ vfmadd231ps %zmm16, %zmm23, %zmm18 #150.3 -+ vmovups .L_2il0floatpacket.23(%rip), %zmm23 #150.3 -+ vpermt2ps %zmm1, %zmm6, %zmm4 #150.3 -+ vpermt2ps (%rdx), %zmm23, %zmm15 #150.3 -+ vpermt2ps 144(%rdx), %zmm23, %zmm9 #150.3 -+ vfmadd231ps %zmm16, %zmm4, %zmm19 #150.3 -+ vmovups .L_2il0floatpacket.22(%rip), %zmm4 #150.3 -+ vmovaps %zmm15, %zmm0 #150.3 -+ vpermt2ps %zmm9, %zmm28, %zmm0 #150.3 -+ vpermt2ps %zmm1, %zmm4, 
%zmm7 #150.3 -+ vmovups .L_2il0floatpacket.24(%rip), %zmm1 #150.3 -+ vmulps %zmm0, %zmm10, %zmm0 #150.3 -+ vfmadd231ps %zmm5, %zmm7, %zmm19 #150.3 -+ vmovaps %zmm15, %zmm10 #150.3 -+ vmovaps %zmm15, %zmm7 #150.3 -+ vpermt2ps %zmm9, %zmm27, %zmm10 #150.3 -+ vpermt2ps %zmm9, %zmm1, %zmm7 #150.3 -+ vfmadd231ps %zmm12, %zmm10, %zmm19 #150.3 -+ vfmadd231ps %zmm12, %zmm7, %zmm18 #150.3 -+ vpermilps $177, %zmm12, %zmm10 #150.3 -+ vmovaps %zmm15, %zmm7 #150.3 -+ vmulps %zmm10, %zmm30, %zmm11 #150.3 -+ vmovups .L_2il0floatpacket.29(%rip), %zmm10 #150.3 -+ vpermt2ps %zmm9, %zmm31, %zmm7 #150.3 -+ vfmadd213ps %zmm0, %zmm7, %zmm3 #150.3 -+ vmovups .L_2il0floatpacket.28(%rip), %zmm7 #150.3 -+ vmovaps %zmm15, %zmm0 #150.3 -+ vpermt2ps %zmm9, %zmm7, %zmm0 #150.3 -+ vfmadd231ps %zmm11, %zmm0, %zmm18 #150.3 -+ vmovaps %zmm15, %zmm0 #150.3 -+ vpermt2ps %zmm9, %zmm10, %zmm0 #150.3 -+ vfmadd213ps %zmm3, %zmm0, %zmm5 #150.3 -+ vmovups .L_2il0floatpacket.30(%rip), %zmm3 #150.3 -+ vmovaps %zmm15, %zmm0 #150.3 -+ vpermt2ps %zmm9, %zmm3, %zmm0 #150.3 -+ vfmadd231ps %zmm11, %zmm0, %zmm19 #150.3 -+ vmovups .L_2il0floatpacket.31(%rip), %zmm0 #150.3 -+ vpermt2ps %zmm9, %zmm0, %zmm15 #150.3 -+ vmovups .L_2il0floatpacket.32(%rip), %zmm9 #150.3 -+ vfmadd213ps %zmm5, %zmm15, %zmm16 #150.3 -+ vmovups .L_2il0floatpacket.33(%rip), %zmm5 #150.3 -+ vpermt2ps 64(%rdx), %zmm9, %zmm17 #150.3 -+ vpermt2ps 208(%rdx), %zmm9, %zmm13 #150.3 -+ vmovaps %zmm17, %zmm15 #150.3 -+ vpermt2ps %zmm13, %zmm5, %zmm15 #150.3 -+ vfmadd213ps %zmm16, %zmm15, %zmm12 #150.3 -+ vmovups .L_2il0floatpacket.34(%rip), %zmm16 #150.3 -+ vmovups 16(%rcx,%rax), %xmm15 #152.3 -+ vpermt2ps %zmm13, %zmm16, %zmm17 #150.3 -+ vfmadd213ps %zmm12, %zmm17, %zmm11 #150.3 -+ vmovups (%rcx,%rax), %xmm17 #152.3 -+ vinsertf32x4 $1, (%rcx,%r9), %zmm17, %zmm13 #152.3 -+ vinsertf32x4 $2, (%rcx,%r10), %zmm13, %zmm12 #152.3 -+ vinsertf32x4 $3, (%rcx,%r11), %zmm12, %zmm13 #152.3 -+ vmovups 32(%rcx,%rax), %xmm12 #152.3 -+ vinsertf32x4 $1, 16(%rcx,%r9), %zmm15, %zmm16 #152.3 -+ vinsertf32x4 $2, 16(%rcx,%r10), %zmm16, %zmm17 #152.3 -+ vinsertf32x4 $3, 16(%rcx,%r11), %zmm17, %zmm17 #152.3 -+ vinsertf32x4 $1, 32(%rcx,%r9), %zmm12, %zmm15 #152.3 -+ vinsertf32x4 $2, 32(%rcx,%r10), %zmm15, %zmm16 #152.3 -+ vshufps $228, %zmm17, %zmm13, %zmm15 #152.3 -+ vinsertf32x4 $3, 32(%rcx,%r11), %zmm16, %zmm12 #152.3 -+ vshufps $78, %zmm12, %zmm13, %zmm16 #152.3 -+ vshufps $228, %zmm12, %zmm17, %zmm13 #152.3 -+ vaddps %zmm18, %zmm15, %zmm12 #155.8 -+ vaddps %zmm19, %zmm16, %zmm17 #156.8 -+ vaddps %zmm11, %zmm13, %zmm13 #157.8 -+ vshufps $68, %zmm17, %zmm12, %zmm15 #158.3 -+ vshufps $228, %zmm12, %zmm13, %zmm16 #158.3 -+ vshufps $238, %zmm13, %zmm17, %zmm17 #158.3 -+ vmovups %xmm15, (%rcx,%rax) #158.3 -+ vextractf32x4 $1, %zmm15, (%rcx,%r9) #158.3 -+ vextractf32x4 $2, %zmm15, (%rcx,%r10) #158.3 -+ vextractf32x4 $3, %zmm15, (%rcx,%r11) #158.3 -+ vmovups %xmm16, 16(%rcx,%rax) #158.3 -+ vextractf32x4 $1, %zmm16, 16(%rcx,%r9) #158.3 -+ vextractf32x4 $2, %zmm16, 16(%rcx,%r10) #158.3 -+ vextractf32x4 $3, %zmm16, 16(%rcx,%r11) #158.3 -+ vmovups %xmm17, 32(%rcx,%rax) #158.3 -+ vextractf32x4 $1, %zmm17, 32(%rcx,%r9) #158.3 -+ vextractf32x4 $2, %zmm17, 32(%rcx,%r10) #158.3 -+ vextractf32x4 $3, %zmm17, 32(%rcx,%r11) #158.3 -+ vmovups 48(%rcx,%rax), %xmm12 #162.3 -+ vmovups 64(%rcx,%rax), %xmm13 #162.3 -+ vmovups 80(%rcx,%rax), %xmm17 #162.3 -+ vinsertf32x4 $1, 48(%rcx,%r9), %zmm12, %zmm15 #162.3 -+ vinsertf32x4 $2, 48(%rcx,%r10), %zmm15, %zmm16 #162.3 -+ vinsertf32x4 $3, 48(%rcx,%r11), %zmm16, %zmm12 #162.3 -+ 
vinsertf32x4 $1, 64(%rcx,%r9), %zmm13, %zmm15 #162.3 -+ vinsertf32x4 $2, 64(%rcx,%r10), %zmm15, %zmm16 #162.3 -+ vinsertf32x4 $3, 64(%rcx,%r11), %zmm16, %zmm13 #162.3 -+ vinsertf32x4 $1, 80(%rcx,%r9), %zmm17, %zmm15 #162.3 -+ vinsertf32x4 $2, 80(%rcx,%r10), %zmm15, %zmm16 #162.3 -+ vinsertf32x4 $3, 80(%rcx,%r11), %zmm16, %zmm17 #162.3 -+ vmovups .L_2il0floatpacket.10(%rip), %zmm16 #162.3 -+ prefetcht0 (%rcx,%r8) #175.3 -+ vmovaps %zmm12, %zmm15 #162.3 -+ vpermt2ps %zmm13, %zmm16, %zmm15 #162.3 -+ vpermt2ps %zmm17, %zmm16, %zmm13 #162.3 -+ vmovups .L_2il0floatpacket.11(%rip), %zmm16 #162.3 -+ vpermt2ps %zmm17, %zmm16, %zmm12 #162.3 -+ vmovups .L_2il0floatpacket.12(%rip), %zmm16 #165.3 -+ vpermps %zmm18, %zmm16, %zmm18 #165.3 -+ vpermps %zmm11, %zmm16, %zmm17 #167.3 -+ vpermps %zmm19, %zmm16, %zmm19 #166.3 -+ vaddps %zmm18, %zmm15, %zmm15{%k1} #165.3 -+ vaddps %zmm17, %zmm13, %zmm13{%k1} #167.3 -+ vaddps %zmm19, %zmm12, %zmm12{%k1} #166.3 -+ vsubps %zmm18, %zmm15, %zmm15{%k2} #165.3 -+ vsubps %zmm17, %zmm13, %zmm13{%k2} #167.3 -+ vsubps %zmm19, %zmm12, %zmm12{%k2} #166.3 -+ vmovups .L_2il0floatpacket.49(%rip), %zmm11 #168.3 -+ vmovups .L_2il0floatpacket.48(%rip), %zmm17 #168.3 -+ vpermi2ps %zmm15, %zmm13, %zmm11 #168.3 -+ vmovaps %zmm15, %zmm18 #168.3 -+ vmovups .L_2il0floatpacket.50(%rip), %zmm15 #168.3 -+ vpermt2ps %zmm12, %zmm17, %zmm18 #168.3 -+ vpermt2ps %zmm13, %zmm15, %zmm12 #168.3 -+ vmovups %xmm18, 48(%rcx,%rax) #168.3 -+ vextractf32x4 $1, %zmm18, 48(%rcx,%r9) #168.3 -+ vextractf32x4 $2, %zmm18, 48(%rcx,%r10) #168.3 -+ vextractf32x4 $3, %zmm18, 48(%rcx,%r11) #168.3 -+ vmovups %xmm11, 64(%rcx,%rax) #168.3 -+ vextractf32x4 $1, %zmm11, 64(%rcx,%r9) #168.3 -+ vextractf32x4 $2, %zmm11, 64(%rcx,%r10) #168.3 -+ vextractf32x4 $3, %zmm11, 64(%rcx,%r11) #168.3 -+ vmovups %xmm12, 80(%rcx,%rax) #168.3 -+ vextractf32x4 $1, %zmm12, 80(%rcx,%r9) #168.3 -+ vextractf32x4 $2, %zmm12, 80(%rcx,%r10) #168.3 -+ vextractf32x4 $3, %zmm12, 80(%rcx,%r11) #168.3 -+ movslq 8(%rsi), %r9 #176.16 -+ lea (%r9,%r9,2), %rax #176.8 -+ shlq $5, %rax #176.8 -+ prefetcht0 (%rcx,%rax) #177.3 -+ movslq 12(%rdi), %rdi #178.17 -+ lea (%rdi,%rdi,2), %rdi #178.9 -+ shlq $5, %rdi #178.9 -+ prefetcht0 (%rcx,%rdi) #179.3 -+ movslq 12(%rsi), %rsi #180.17 -+ lea (%rsi,%rsi,2), %rsi #180.9 -+ shlq $5, %rsi #180.9 -+ prefetcht0 (%rcx,%rsi) #181.3 -+ vmovups .L_2il0floatpacket.51(%rip), %zmm13 #183.3 -+ vmovups .L_2il0floatpacket.52(%rip), %zmm18 #183.3 -+ vmovups 424(%rdx), %zmm12 #191.3 -+ vpermps %zmm20, %zmm13, %zmm11 #183.3 -+ vpermps %zmm20, %zmm18, %zmm20 #183.3 -+ vpermt2ps 352(%rdx), %zmm9, %zmm12 #191.3 -+ vaddps %zmm11, %zmm20, %zmm19{%k3}{z} #183.3 -+ vsubps %zmm11, %zmm20, %zmm19{%k4} #183.3 -+ vpermps %zmm25, %zmm13, %zmm20 #184.3 -+ vpermps %zmm25, %zmm18, %zmm25 #184.3 -+ vpermps %zmm14, %zmm13, %zmm13 #185.3 -+ vpermps %zmm14, %zmm18, %zmm14 #185.3 -+ vaddps %zmm20, %zmm25, %zmm11{%k3}{z} #184.3 -+ vmovups 568(%rdx), %zmm18 #191.3 -+ vsubps %zmm20, %zmm25, %zmm11{%k4} #184.3 -+ vaddps %zmm13, %zmm14, %zmm20{%k3}{z} #185.3 -+ vpermt2ps 496(%rdx), %zmm9, %zmm18 #191.3 -+ vmovups 360(%rdx), %zmm25 #191.3 -+ vsubps %zmm13, %zmm14, %zmm20{%k4} #185.3 -+ vpermi2ps %zmm18, %zmm12, %zmm5 #191.3 -+ vmovups 504(%rdx), %zmm14 #191.3 -+ vmovaps %zmm25, %zmm9 #191.3 -+ vpermt2ps 288(%rdx), %zmm24, %zmm9 #191.3 -+ vpermi2ps 432(%rdx), %zmm14, %zmm24 #191.3 -+ vpermt2ps 432(%rdx), %zmm23, %zmm14 #191.3 -+ vpermt2ps 288(%rdx), %zmm23, %zmm25 #191.3 -+ vpermi2ps %zmm24, %zmm9, %zmm21 #191.3 -+ vpermi2ps %zmm24, %zmm9, %zmm26 #191.3 
-+ vpermi2ps %zmm24, %zmm9, %zmm8 #191.3 -+ vpermi2ps %zmm24, %zmm9, %zmm29 #191.3 -+ vpermi2ps %zmm24, %zmm9, %zmm6 #191.3 -+ vpermi2ps %zmm24, %zmm9, %zmm2 #191.3 -+ vpermi2ps %zmm24, %zmm9, %zmm22 #191.3 -+ vpermt2ps %zmm24, %zmm4, %zmm9 #191.3 -+ vpermi2ps %zmm14, %zmm25, %zmm28 #191.3 -+ vpermi2ps %zmm14, %zmm25, %zmm31 #191.3 -+ vpermi2ps %zmm14, %zmm25, %zmm1 #191.3 -+ vpermi2ps %zmm14, %zmm25, %zmm10 #191.3 -+ vpermi2ps %zmm14, %zmm25, %zmm27 #191.3 -+ vpermi2ps %zmm14, %zmm25, %zmm7 #191.3 -+ vpermi2ps %zmm14, %zmm25, %zmm3 #191.3 -+ vpermt2ps %zmm14, %zmm0, %zmm25 #191.3 -+ vmovups (%rcx,%rax), %xmm0 #193.3 -+ vmulps %zmm21, %zmm19, %zmm21 #191.3 -+ vmulps %zmm26, %zmm11, %zmm13 #191.3 -+ vmovups 16(%rcx,%rax), %xmm4 #193.3 -+ vfmadd231ps %zmm11, %zmm8, %zmm21 #191.3 -+ vfmadd231ps %zmm19, %zmm29, %zmm13 #191.3 -+ vpermilps $177, %zmm11, %zmm26 #191.3 -+ vmulps %zmm26, %zmm30, %zmm26 #191.3 -+ vpermilps $177, %zmm19, %zmm8 #191.3 -+ vmulps %zmm30, %zmm8, %zmm8 #191.3 -+ vfmadd231ps %zmm26, %zmm6, %zmm13 #191.3 -+ vfmadd231ps %zmm8, %zmm2, %zmm21 #191.3 -+ vfmadd231ps %zmm8, %zmm9, %zmm13 #191.3 -+ vmulps %zmm28, %zmm19, %zmm9 #191.3 -+ vfmadd231ps %zmm26, %zmm22, %zmm21 #191.3 -+ vfmadd231ps %zmm20, %zmm27, %zmm13 #191.3 -+ vfmadd213ps %zmm9, %zmm31, %zmm11 #191.3 -+ vfmadd231ps %zmm20, %zmm1, %zmm21 #191.3 -+ vfmadd213ps %zmm11, %zmm10, %zmm8 #191.3 -+ vpermilps $177, %zmm20, %zmm28 #191.3 -+ vmulps %zmm28, %zmm30, %zmm1 #191.3 -+ vfmadd213ps %zmm8, %zmm25, %zmm26 #191.3 -+ vfmadd231ps %zmm1, %zmm3, %zmm13 #191.3 -+ vmovups .L_2il0floatpacket.34(%rip), %zmm3 #191.3 -+ vfmadd231ps %zmm1, %zmm7, %zmm21 #191.3 -+ vmovups 32(%rcx,%rax), %xmm7 #193.3 -+ vfmadd213ps %zmm26, %zmm5, %zmm20 #191.3 -+ vpermt2ps %zmm18, %zmm3, %zmm12 #191.3 -+ vfmadd213ps %zmm20, %zmm12, %zmm1 #191.3 -+ vinsertf32x4 $1, (%rcx,%r8), %zmm0, %zmm2 #193.3 -+ vinsertf32x4 $2, (%rcx,%rsi), %zmm2, %zmm3 #193.3 -+ vinsertf32x4 $3, (%rcx,%rdi), %zmm3, %zmm2 #193.3 -+ vinsertf32x4 $1, 16(%rcx,%r8), %zmm4, %zmm5 #193.3 -+ vinsertf32x4 $2, 16(%rcx,%rsi), %zmm5, %zmm6 #193.3 -+ vinsertf32x4 $3, 16(%rcx,%rdi), %zmm6, %zmm3 #193.3 -+ vinsertf32x4 $1, 32(%rcx,%r8), %zmm7, %zmm8 #193.3 -+ vinsertf32x4 $2, 32(%rcx,%rsi), %zmm8, %zmm0 #193.3 -+ # LOE rax rcx rbx rsi rdi r8 r12 r13 r14 r15 zmm0 zmm1 zmm2 zmm3 zmm13 zmm15 zmm16 zmm17 zmm21 -+..B2.4: # Preds ..B2.1 -+ # Execution count [1.00e+00] -+ vshufps $228, %zmm3, %zmm2, %zmm4 #193.3 -+ movl $27075, %edx #200.3 -+ vmovups .L_2il0floatpacket.39(%rip), %zmm26 #199.3 -+ vmovups .L_2il0floatpacket.40(%rip), %zmm25 #199.3 -+ vinsertf32x4 $3, 32(%rcx,%rdi), %zmm0, %zmm0 #193.3 -+ vaddps %zmm21, %zmm4, %zmm5 #194.8 -+ vpermps %zmm21, %zmm16, %zmm21 #200.3 -+ vshufps $78, %zmm0, %zmm2, %zmm2 #193.3 -+ vshufps $228, %zmm0, %zmm3, %zmm3 #193.3 -+ kmovw %edx, %k1 #200.3 -+ vaddps %zmm13, %zmm2, %zmm6 #195.8 -+ vaddps %zmm1, %zmm3, %zmm7 #196.8 -+ vpermps %zmm13, %zmm16, %zmm13 #201.3 -+ vpermps %zmm1, %zmm16, %zmm1 #202.3 -+ vshufps $68, %zmm6, %zmm5, %zmm8 #197.3 -+ vshufps $228, %zmm5, %zmm7, %zmm9 #197.3 -+ vshufps $238, %zmm7, %zmm6, %zmm10 #197.3 -+ vmovups .L_2il0floatpacket.53(%rip), %zmm16 #203.3 -+ vmovaps %zmm26, %zmm28 #199.3 -+ movl $38460, %edx #200.3 -+ kmovw %edx, %k2 #200.3 -+ vmovups %xmm8, (%rcx,%rax) #197.3 -+ vextractf32x4 $1, %zmm8, (%rcx,%r8) #197.3 -+ vextractf32x4 $2, %zmm8, (%rcx,%rsi) #197.3 -+ vextractf32x4 $3, %zmm8, (%rcx,%rdi) #197.3 -+ vmovups %xmm9, 16(%rcx,%rax) #197.3 -+ vextractf32x4 $1, %zmm9, 16(%rcx,%r8) #197.3 -+ vextractf32x4 $2, %zmm9, 
16(%rcx,%rsi) #197.3 -+ vextractf32x4 $3, %zmm9, 16(%rcx,%rdi) #197.3 -+ vmovups %xmm10, 32(%rcx,%rax) #197.3 -+ vextractf32x4 $1, %zmm10, 32(%rcx,%r8) #197.3 -+ vextractf32x4 $2, %zmm10, 32(%rcx,%rsi) #197.3 -+ vextractf32x4 $3, %zmm10, 32(%rcx,%rdi) #197.3 -+ vmovups 48(%rcx,%rax), %xmm11 #199.3 -+ vmovups 64(%rcx,%rax), %xmm18 #199.3 -+ vmovups 80(%rcx,%rax), %xmm22 #199.3 -+ vinsertf32x4 $1, 48(%rcx,%r8), %zmm11, %zmm12 #199.3 -+ vinsertf32x4 $1, 64(%rcx,%r8), %zmm18, %zmm19 #199.3 -+ vinsertf32x4 $1, 80(%rcx,%r8), %zmm22, %zmm23 #199.3 -+ vinsertf32x4 $2, 48(%rcx,%rsi), %zmm12, %zmm14 #199.3 -+ vinsertf32x4 $2, 64(%rcx,%rsi), %zmm19, %zmm20 #199.3 -+ vinsertf32x4 $2, 80(%rcx,%rsi), %zmm23, %zmm24 #199.3 -+ vinsertf32x4 $3, 48(%rcx,%rdi), %zmm14, %zmm30 #199.3 -+ vinsertf32x4 $3, 64(%rcx,%rdi), %zmm20, %zmm29 #199.3 -+ vinsertf32x4 $3, 80(%rcx,%rdi), %zmm24, %zmm27 #199.3 -+ vpermi2ps %zmm29, %zmm30, %zmm28 #199.3 -+ vpermt2ps %zmm27, %zmm25, %zmm30 #199.3 -+ vpermt2ps %zmm27, %zmm26, %zmm29 #199.3 -+ vaddps %zmm21, %zmm28, %zmm28{%k1} #200.3 -+ vaddps %zmm13, %zmm30, %zmm30{%k1} #201.3 -+ vaddps %zmm1, %zmm29, %zmm29{%k1} #202.3 -+ vsubps %zmm21, %zmm28, %zmm28{%k2} #200.3 -+ vsubps %zmm13, %zmm30, %zmm30{%k2} #201.3 -+ vsubps %zmm1, %zmm29, %zmm29{%k2} #202.3 -+ vpermi2ps %zmm30, %zmm28, %zmm15 #203.3 -+ vpermi2ps %zmm28, %zmm29, %zmm16 #203.3 -+ vpermt2ps %zmm29, %zmm17, %zmm30 #203.3 -+ vmovups %xmm15, 48(%rcx,%rax) #203.3 -+ vextractf32x4 $1, %zmm15, 48(%rcx,%r8) #203.3 -+ vextractf32x4 $2, %zmm15, 48(%rcx,%rsi) #203.3 -+ vextractf32x4 $3, %zmm15, 48(%rcx,%rdi) #203.3 -+ vmovups %xmm16, 64(%rcx,%rax) #203.3 -+ vextractf32x4 $1, %zmm16, 64(%rcx,%r8) #203.3 -+ vextractf32x4 $2, %zmm16, 64(%rcx,%rsi) #203.3 -+ vextractf32x4 $3, %zmm16, 64(%rcx,%rdi) #203.3 -+ vmovups %xmm30, 80(%rcx,%rax) #203.3 -+ vextractf32x4 $1, %zmm30, 80(%rcx,%r8) #203.3 -+ vextractf32x4 $2, %zmm30, 80(%rcx,%rsi) #203.3 -+ vextractf32x4 $3, %zmm30, 80(%rcx,%rdi) #203.3 -+ vzeroupper #204.1 -+ movq %rbp, %rsp #204.1 -+ popq %rbp #204.1 -+ .cfi_restore 6 -+ ret #204.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type deo_avx512,@function -+ .size deo_avx512,.-deo_avx512 -+ .data -+# -- End deo_avx512 -+ .section .rodata, "a" -+ .align 64 -+ .align 64 -+.L_2il0floatpacket.10: -+ .long 0x00000000,0x00000001,0x00000012,0x00000013,0x00000004,0x00000005,0x00000016,0x00000017,0x0000001a,0x0000001b,0x00000008,0x00000009,0x0000001e,0x0000001f,0x0000000c,0x0000000d -+ .type .L_2il0floatpacket.10,@object -+ .size .L_2il0floatpacket.10,64 -+ .align 64 -+.L_2il0floatpacket.11: -+ .long 0x00000002,0x00000003,0x00000010,0x00000011,0x00000006,0x00000007,0x00000014,0x00000015,0x00000018,0x00000019,0x0000000a,0x0000000b,0x0000001c,0x0000001d,0x0000000e,0x0000000f -+ .type .L_2il0floatpacket.11,@object -+ .size .L_2il0floatpacket.11,64 -+ .align 64 -+.L_2il0floatpacket.12: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007,0x00000009,0x00000008,0x0000000b,0x0000000a,0x0000000d,0x0000000c,0x0000000f,0x0000000e -+ .type .L_2il0floatpacket.12,@object -+ .size .L_2il0floatpacket.12,64 -+ .align 64 -+.L_2il0floatpacket.13: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000008,0x00000009,0x00000006,0x00000007,0x00000010,0x00000011,0x00000012,0x00000013,0x00000018,0x00000019,0x00000016,0x00000017 -+ .type .L_2il0floatpacket.13,@object -+ .size .L_2il0floatpacket.13,64 -+ .align 64 -+.L_2il0floatpacket.14: -+ .long 
0x00000000,0x00000000,0x00000000,0x00000000,0x00000008,0x00000008,0x00000008,0x00000008,0x00000010,0x00000010,0x00000010,0x00000010,0x00000018,0x00000018,0x00000018,0x00000018 -+ .type .L_2il0floatpacket.14,@object -+ .size .L_2il0floatpacket.14,64 -+ .align 64 -+.L_2il0floatpacket.15: -+ .long 0x00000004,0x00000004,0x00000004,0x00000004,0x0000000c,0x0000000c,0x0000000c,0x0000000c,0x00000014,0x00000014,0x00000014,0x00000014,0x0000001c,0x0000001c,0x0000001c,0x0000001c -+ .type .L_2il0floatpacket.15,@object -+ .size .L_2il0floatpacket.15,64 -+ .align 64 -+.L_2il0floatpacket.16: -+ .long 0x00000002,0x00000002,0x00000002,0x00000002,0x0000000e,0x0000000e,0x0000000e,0x0000000e,0x00000012,0x00000012,0x00000012,0x00000012,0x0000001e,0x0000001e,0x0000001e,0x0000001e -+ .type .L_2il0floatpacket.16,@object -+ .size .L_2il0floatpacket.16,64 -+ .align 64 -+.L_2il0floatpacket.17: -+ .long 0x00000006,0x00000006,0x00000006,0x00000006,0x0000000a,0x0000000a,0x0000000a,0x0000000a,0x00000016,0x00000016,0x00000016,0x00000016,0x0000001a,0x0000001a,0x0000001a,0x0000001a -+ .type .L_2il0floatpacket.17,@object -+ .size .L_2il0floatpacket.17,64 -+ .align 64 -+.L_2il0floatpacket.18: -+ .long 0xbf800000,0x3f800000,0xbf800000,0x3f800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000 -+ .type .L_2il0floatpacket.18,@object -+ .size .L_2il0floatpacket.18,64 -+ .align 64 -+.L_2il0floatpacket.19: -+ .long 0x00000001,0x00000001,0x00000001,0x00000001,0x00000009,0x00000009,0x00000009,0x00000009,0x00000011,0x00000011,0x00000011,0x00000011,0x00000019,0x00000019,0x00000019,0x00000019 -+ .type .L_2il0floatpacket.19,@object -+ .size .L_2il0floatpacket.19,64 -+ .align 64 -+.L_2il0floatpacket.20: -+ .long 0x00000005,0x00000005,0x00000005,0x00000005,0x0000000d,0x0000000d,0x0000000d,0x0000000d,0x00000015,0x00000015,0x00000015,0x00000015,0x0000001d,0x0000001d,0x0000001d,0x0000001d -+ .type .L_2il0floatpacket.20,@object -+ .size .L_2il0floatpacket.20,64 -+ .align 64 -+.L_2il0floatpacket.21: -+ .long 0x00000003,0x00000003,0x00000003,0x00000003,0x0000000f,0x0000000f,0x0000000f,0x0000000f,0x00000013,0x00000013,0x00000013,0x00000013,0x0000001f,0x0000001f,0x0000001f,0x0000001f -+ .type .L_2il0floatpacket.21,@object -+ .size .L_2il0floatpacket.21,64 -+ .align 64 -+.L_2il0floatpacket.22: -+ .long 0x00000007,0x00000007,0x00000007,0x00000007,0x0000000b,0x0000000b,0x0000000b,0x0000000b,0x00000017,0x00000017,0x00000017,0x00000017,0x0000001b,0x0000001b,0x0000001b,0x0000001b -+ .type .L_2il0floatpacket.22,@object -+ .size .L_2il0floatpacket.22,64 -+ .align 64 -+.L_2il0floatpacket.23: -+ .long 0x00000004,0x00000005,0x0000000c,0x0000000d,0x0000000a,0x0000000b,0x0000000e,0x0000000f,0x00000014,0x00000015,0x0000001c,0x0000001d,0x0000001a,0x0000001b,0x0000001e,0x0000001f -+ .type .L_2il0floatpacket.23,@object -+ .size .L_2il0floatpacket.23,64 -+ .align 64 -+.L_2il0floatpacket.24: -+ .long 0x00000000,0x00000000,0x00000000,0x00000000,0x0000000a,0x0000000a,0x0000000a,0x0000000a,0x00000010,0x00000010,0x00000010,0x00000010,0x0000001a,0x0000001a,0x0000001a,0x0000001a -+ .type .L_2il0floatpacket.24,@object -+ .size .L_2il0floatpacket.24,64 -+ .align 64 -+.L_2il0floatpacket.25: -+ .long 0x00000002,0x00000002,0x00000002,0x00000002,0x00000008,0x00000008,0x00000008,0x00000008,0x00000012,0x00000012,0x00000012,0x00000012,0x00000018,0x00000018,0x00000018,0x00000018 -+ .type .L_2il0floatpacket.25,@object -+ .size .L_2il0floatpacket.25,64 -+ .align 64 -+.L_2il0floatpacket.26: -+ .long 
0x00000004,0x00000004,0x00000004,0x00000004,0x0000000e,0x0000000e,0x0000000e,0x0000000e,0x00000014,0x00000014,0x00000014,0x00000014,0x0000001e,0x0000001e,0x0000001e,0x0000001e -+ .type .L_2il0floatpacket.26,@object -+ .size .L_2il0floatpacket.26,64 -+ .align 64 -+.L_2il0floatpacket.27: -+ .long 0x00000006,0x00000006,0x00000006,0x00000006,0x0000000c,0x0000000c,0x0000000c,0x0000000c,0x00000016,0x00000016,0x00000016,0x00000016,0x0000001c,0x0000001c,0x0000001c,0x0000001c -+ .type .L_2il0floatpacket.27,@object -+ .size .L_2il0floatpacket.27,64 -+ .align 64 -+.L_2il0floatpacket.28: -+ .long 0x00000001,0x00000001,0x00000001,0x00000001,0x0000000b,0x0000000b,0x0000000b,0x0000000b,0x00000011,0x00000011,0x00000011,0x00000011,0x0000001b,0x0000001b,0x0000001b,0x0000001b -+ .type .L_2il0floatpacket.28,@object -+ .size .L_2il0floatpacket.28,64 -+ .align 64 -+.L_2il0floatpacket.29: -+ .long 0x00000003,0x00000003,0x00000003,0x00000003,0x00000009,0x00000009,0x00000009,0x00000009,0x00000013,0x00000013,0x00000013,0x00000013,0x00000019,0x00000019,0x00000019,0x00000019 -+ .type .L_2il0floatpacket.29,@object -+ .size .L_2il0floatpacket.29,64 -+ .align 64 -+.L_2il0floatpacket.30: -+ .long 0x00000005,0x00000005,0x00000005,0x00000005,0x0000000f,0x0000000f,0x0000000f,0x0000000f,0x00000015,0x00000015,0x00000015,0x00000015,0x0000001f,0x0000001f,0x0000001f,0x0000001f -+ .type .L_2il0floatpacket.30,@object -+ .size .L_2il0floatpacket.30,64 -+ .align 64 -+.L_2il0floatpacket.31: -+ .long 0x00000007,0x00000007,0x00000007,0x00000007,0x0000000d,0x0000000d,0x0000000d,0x0000000d,0x00000017,0x00000017,0x00000017,0x00000017,0x0000001d,0x0000001d,0x0000001d,0x0000001d -+ .type .L_2il0floatpacket.31,@object -+ .size .L_2il0floatpacket.31,64 -+ .align 64 -+.L_2il0floatpacket.32: -+ .long 0x00000000,0x00000001,0x00000010,0x00000011,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000 -+ .type .L_2il0floatpacket.32,@object -+ .size .L_2il0floatpacket.32,64 -+ .align 64 -+.L_2il0floatpacket.33: -+ .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000002,0x00000002,0x00000002,0x00000002,0x00000010,0x00000010,0x00000010,0x00000010,0x00000012,0x00000012,0x00000012,0x00000012 -+ .type .L_2il0floatpacket.33,@object -+ .size .L_2il0floatpacket.33,64 -+ .align 64 -+.L_2il0floatpacket.34: -+ .long 0x00000001,0x00000001,0x00000001,0x00000001,0x00000003,0x00000003,0x00000003,0x00000003,0x00000011,0x00000011,0x00000011,0x00000011,0x00000013,0x00000013,0x00000013,0x00000013 -+ .type .L_2il0floatpacket.34,@object -+ .size .L_2il0floatpacket.34,64 -+ .align 64 -+.L_2il0floatpacket.39: -+ .long 0x00000012,0x00000013,0x00000000,0x00000001,0x00000016,0x00000017,0x00000004,0x00000005,0x00000008,0x00000009,0x0000001a,0x0000001b,0x0000000c,0x0000000d,0x0000001e,0x0000001f -+ .type .L_2il0floatpacket.39,@object -+ .size .L_2il0floatpacket.39,64 -+ .align 64 -+.L_2il0floatpacket.40: -+ .long 0x00000010,0x00000011,0x00000002,0x00000003,0x00000014,0x00000015,0x00000006,0x00000007,0x0000000a,0x0000000b,0x00000018,0x00000019,0x0000000e,0x0000000f,0x0000001c,0x0000001d -+ .type .L_2il0floatpacket.40,@object -+ .size .L_2il0floatpacket.40,64 -+ .align 64 -+.L_2il0floatpacket.48: -+ .long 0x00000000,0x00000001,0x00000010,0x00000011,0x00000004,0x00000005,0x00000014,0x00000015,0x0000000a,0x0000000b,0x0000001a,0x0000001b,0x0000000e,0x0000000f,0x0000001e,0x0000001f -+ .type .L_2il0floatpacket.48,@object -+ .size .L_2il0floatpacket.48,64 -+ .align 64 -+.L_2il0floatpacket.49: -+ .long 
0x00000000,0x00000001,0x00000012,0x00000013,0x00000004,0x00000005,0x00000016,0x00000017,0x0000000a,0x0000000b,0x00000018,0x00000019,0x0000000e,0x0000000f,0x0000001c,0x0000001d -+ .type .L_2il0floatpacket.49,@object -+ .size .L_2il0floatpacket.49,64 -+ .align 64 -+.L_2il0floatpacket.50: -+ .long 0x00000002,0x00000003,0x00000012,0x00000013,0x00000006,0x00000007,0x00000016,0x00000017,0x00000008,0x00000009,0x00000018,0x00000019,0x0000000c,0x0000000d,0x0000001c,0x0000001d -+ .type .L_2il0floatpacket.50,@object -+ .size .L_2il0floatpacket.50,64 -+ .align 64 -+.L_2il0floatpacket.51: -+ .long 0x00000006,0x00000007,0x00000004,0x00000005,0x00000006,0x00000007,0x00000004,0x00000005,0x00000005,0x00000004,0x00000007,0x00000006,0x00000005,0x00000004,0x00000007,0x00000006 -+ .type .L_2il0floatpacket.51,@object -+ .size .L_2il0floatpacket.51,64 -+ .align 64 -+.L_2il0floatpacket.52: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000000,0x00000001,0x00000002,0x00000003,0x00000000,0x00000001,0x00000002,0x00000003,0x00000000,0x00000001,0x00000002,0x00000003 -+ .type .L_2il0floatpacket.52,@object -+ .size .L_2il0floatpacket.52,64 -+ .align 64 -+.L_2il0floatpacket.53: -+ .long 0x00000002,0x00000003,0x00000010,0x00000011,0x00000006,0x00000007,0x00000014,0x00000015,0x00000008,0x00000009,0x0000001a,0x0000001b,0x0000000c,0x0000000d,0x0000001e,0x0000001f -+ .type .L_2il0floatpacket.53,@object -+ .size .L_2il0floatpacket.53,64 -+ .align 32 -+.L_2il0floatpacket.35: -+ .long 0x00000004,0x00000005,0x00000006,0x00000007,0x00000000,0x00000001,0x00000002,0x00000003 -+ .type .L_2il0floatpacket.35,@object -+ .size .L_2il0floatpacket.35,32 -+ .align 32 -+.L_2il0floatpacket.36: -+ .long 0x3f800000,0x3f800000,0x3f800000,0x3f800000,0xbf800000,0xbf800000,0xbf800000,0xbf800000 -+ .type .L_2il0floatpacket.36,@object -+ .size .L_2il0floatpacket.36,32 -+ .align 32 -+.L_2il0floatpacket.37: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000003,0x00000002,0x00000001,0x00000000 -+ .type .L_2il0floatpacket.37,@object -+ .size .L_2il0floatpacket.37,32 -+ .align 32 -+.L_2il0floatpacket.38: -+ .long 0x3f800000,0x3f800000,0x3f800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000 -+ .type .L_2il0floatpacket.38,@object -+ .size .L_2il0floatpacket.38,32 -+ .align 32 -+.L_2il0floatpacket.41: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000002,0x00000003,0x00000000,0x00000001 -+ .type .L_2il0floatpacket.41,@object -+ .size .L_2il0floatpacket.41,32 -+ .align 32 -+.L_2il0floatpacket.42: -+ .long 0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000,0xbf800000,0xbf800000 -+ .type .L_2il0floatpacket.42,@object -+ .size .L_2il0floatpacket.42,32 -+ .align 32 -+.L_2il0floatpacket.43: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000001,0x00000000,0x00000003,0x00000002 -+ .type .L_2il0floatpacket.43,@object -+ .size .L_2il0floatpacket.43,32 -+ .align 32 -+.L_2il0floatpacket.44: -+ .long 0x3f800000,0x3f800000,0x3f800000,0x3f800000,0xbf800000,0x3f800000,0x3f800000,0xbf800000 -+ .type .L_2il0floatpacket.44,@object -+ .size .L_2il0floatpacket.44,32 -+ .align 32 -+.L_2il0floatpacket.45: -+ .long 0x00000007,0x00000006,0x00000005,0x00000004,0x00000007,0x00000006,0x00000005,0x00000004 -+ .type .L_2il0floatpacket.45,@object -+ .size .L_2il0floatpacket.45,32 -+ .align 32 -+.L_2il0floatpacket.46: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000000,0x00000001,0x00000002,0x00000003 -+ .type .L_2il0floatpacket.46,@object -+ .size .L_2il0floatpacket.46,32 -+ .align 32 -+.L_2il0floatpacket.47: -+ 
.long 0xbf800000,0x3f800000,0xbf800000,0x3f800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000 -+ .type .L_2il0floatpacket.47,@object -+ .size .L_2il0floatpacket.47,32 -+ .data -+ .section .note.GNU-stack, "" -+// -- Begin DWARF2 SEGMENT .eh_frame -+ .section .eh_frame,"a",@progbits -+.eh_frame_seg: -+ .align 8 -+# End -diff --git a/modules/dirac/avx512/Dw_dble_avx512.c b/modules/dirac/avx512/Dw_dble_avx512.c -new file mode 100644 -index 0000000..2f490be ---- /dev/null -+++ b/modules/dirac/avx512/Dw_dble_avx512.c -@@ -0,0 +1,256 @@ -+/******************************************************************************* -+* -+* File Dw_dble_avx512.c -+* -+* This software is distributed under the terms of the GNU General Public -+* License (GPL) -+* -+* AVX512 implementation of the O(a)-improved Wilson-Dirac operator D (double- -+* precision programs). -+* -+* See ../Dw_dble.c for more details and alternative implementations -+ *******************************************************************************/ -+ -+#include -+#include -+#include -+#include "mpi.h" -+#include "su3.h" -+#include "utils.h" -+#include "flags.h" -+#include "lattice.h" -+#include "uflds.h" -+#include "sflds.h" -+#include "sw_term.h" -+#include "dirac.h" -+#include "global.h" -+ -+#define N0 (NPROC0 * L0) -+ -+typedef union -+{ -+ spinor_dble s; -+ weyl_dble w[2]; -+} spin_t; -+ -+#include "avx512.h" -+#include "sse.h" -+ -+void doe_dble_avx512( const int *piup, const int *pidn, const su3_dble *u, const spinor_dble *pk, double coe, spin_t *rs) -+{ -+ const spinor_dble *sp, *sm; -+ const su3_dble *up; -+ -+ /* 512-bit wide stores for the spinor for each color */ -+ __m512d a1, a2, a3; -+ __m512d b1, b2, b3; -+ __m512d w1, w2, w3; -+ __m512d t1, t2, t3, t4, t5, t6; -+ -+ __m128d tc; -+ __m512d c512; -+ -+ /******************************* direction 0 *********************************/ -+ -+ sp = pk + (*(piup++)); -+ sm = pk + (*(pidn++)); -+ -+ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); -+ _avx512_load_2_halfspinor_d( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); -+ -+ a1 = _mm512_maskz_add_pd( 0b00001111, t1, t4 ); -+ a1 = _mm512_mask_sub_pd( a1, 0b11110000, t1, t4 ); -+ a2 = _mm512_maskz_add_pd( 0b00001111, t2, t5 ); -+ a2 = _mm512_mask_sub_pd( a2, 0b11110000, t2, t5 ); -+ a3 = _mm512_maskz_add_pd( 0b00001111, t3, t6 ); -+ a3 = _mm512_mask_sub_pd( a3, 0b11110000, t3, t6 ); -+ -+ sp = pk + (*(piup++)); -+ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); -+ sm = pk + (*(pidn++)); -+ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); -+ -+ up = u; -+ u += 1; -+ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_to_weyl_1( w1, b1 ); -+ _avx512_to_weyl_1( w2, b2 ); -+ _avx512_to_weyl_1( w3, b3 ); -+ -+ -+ /******************************* direction 1 *********************************/ -+ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); -+ _avx512_load_2_halfspinor_d_reverse( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); -+ -+ t4 = _mm512_permute_pd ( t4, 0b01010101 ); -+ a1 = _mm512_maskz_add_pd( 0b01011010, t1, t4 ); -+ a1 = _mm512_mask_sub_pd( a1, 0b10100101, t1, t4 ); -+ t5 = _mm512_permute_pd ( t5, 0b01010101 ); -+ a2 = _mm512_maskz_add_pd( 0b01011010, t2, t5 ); -+ a2 = _mm512_mask_sub_pd( a2, 0b10100101, t2, t5 ); -+ t6 = _mm512_permute_pd ( t6, 0b01010101 ); -+ a3 = _mm512_maskz_add_pd( 0b01011010, t3, t6 ); -+ a3 = _mm512_mask_sub_pd( a3, 0b10100101, t3, t6 ); -+ -+ sp = pk + (*(piup++)); -+ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); -+ sm = pk + 
(*(pidn++)); -+ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); -+ up = ++u; -+ u += 1; -+ -+ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_to_weyl_2( w1, b1 ); -+ _avx512_to_weyl_2( w2, b2 ); -+ _avx512_to_weyl_2( w3, b3 ); -+ -+ /******************************* direction 2 *********************************/ -+ -+ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); -+ _avx512_load_2_halfspinor_d_reverse( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); -+ -+ a1 = _mm512_maskz_add_pd( 0b11000011, t1, t4 ); -+ a1 = _mm512_mask_sub_pd( a1, 0b00111100, t1, t4 ); -+ a2 = _mm512_maskz_add_pd( 0b11000011, t2, t5 ); -+ a2 = _mm512_mask_sub_pd( a2, 0b00111100, t2, t5 ); -+ a3 = _mm512_maskz_add_pd( 0b11000011, t3, t6 ); -+ a3 = _mm512_mask_sub_pd( a3, 0b00111100, t3, t6 ); -+ -+ sp = pk + (*(piup)); -+ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); -+ sm = pk + (*(pidn)); -+ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); -+ up = ++u; -+ u += 1; -+ -+ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_to_weyl_3( w1, b1 ); -+ _avx512_to_weyl_3( w2, b2 ); -+ _avx512_to_weyl_3( w3, b3 ); -+ -+ -+ /******************************* direction 3 *********************************/ -+ _avx512_load_2_halfspinor_d( t1, t2, t3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); -+ _avx512_load_2_halfspinor_d( t4, t5, t6, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); -+ -+ t4 = _mm512_permute_pd ( t4, 0b01010101 ); -+ a1 = _mm512_maskz_add_pd( 0b10010110, t1, t4 ); -+ a1 = _mm512_mask_sub_pd( a1, 0b01101001, t1, t4 ); -+ t5 = _mm512_permute_pd ( t5, 0b01010101 ); -+ a2 = _mm512_maskz_add_pd( 0b10010110, t2, t5 ); -+ a2 = _mm512_mask_sub_pd( a2, 0b01101001, t2, t5 ); -+ t6 = _mm512_permute_pd ( t6, 0b01010101 ); -+ a3 = _mm512_maskz_add_pd( 0b10010110, t3, t6 ); -+ a3 = _mm512_mask_sub_pd( a3, 0b01101001, t3, t6 ); -+ -+ up = ++u; -+ u += 1; -+ avx512_su3_mul_quad_dble( *up, *u, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_to_weyl_4( w1, b1 ); -+ _avx512_to_weyl_4( w2, b2 ); -+ _avx512_to_weyl_4( w3, b3 ); -+ -+ tc = _mm_load_sd( &coe ); -+ c512 = _mm512_broadcastsd_pd( tc ); -+ w1 = _mm512_mul_pd( c512, w1 ); -+ w2 = _mm512_mul_pd( c512, w2 ); -+ w3 = _mm512_mul_pd( c512, w3 ); -+ -+ _avx512_store_2_halfspinor_d( w1, w2, w3, &rs->s.c1.c1.re, &rs->s.c3.c1.re ); -+} -+ -+void deo_dble_avx512( const int *piup, const int *pidn, const su3_dble *u, spinor_dble *pl, double ceo, spin_t *rs) -+{ -+ const su3_dble *up; -+ spinor_dble *sp, *sm; -+ -+ /* 512-bit wide stores for the spinor for each color */ -+ __m512d a1, a2, a3; -+ __m512d b1, b2, b3; -+ __m512d w1, w2, w3; -+ -+ __m128d tc; -+ __m512d c512; -+ -+ /******************************* direction 0 *********************************/ -+ -+ sp = pl + (*(piup++)); -+ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); -+ sm = pl + (*(pidn++)); -+ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); -+ -+ _avx512_load_2_halfspinor_d( w1, w2, w3, &rs->s.c1.c1.re, &rs->s.c3.c1.re ); -+ -+ tc = _mm_load_sd( &ceo ); -+ c512 = _mm512_broadcastsd_pd( tc ); -+ w1 = _mm512_mul_pd( c512, w1 ); -+ w2 = _mm512_mul_pd( c512, w2 ); -+ w3 = _mm512_mul_pd( c512, w3 ); -+ -+ _avx512_expand_weyl( a1, w1 ) -+ _avx512_expand_weyl( a2, w2 ) -+ _avx512_expand_weyl( a3, w3 ) -+ -+ up = u; -+ u += 1; -+ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); -+ _avx512_add_to_spinors_2( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); -+ -+ /******************************* direction 1 
*********************************/ -+ sp = pl + (*(piup++)); -+ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); -+ sm = pl + (*(pidn++)); -+ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); -+ -+ _avx512_expand_weyl_2( a1, w1 ); -+ _avx512_expand_weyl_2( a2, w2 ); -+ _avx512_expand_weyl_2( a3, w3 ); -+ -+ up = ++u; -+ u += 1; -+ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); -+ _avx512_add_to_spinors_3( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); -+ -+ /******************************* direction 2 *********************************/ -+ sp = pl + (*(piup++)); -+ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); -+ sm = pl + (*(pidn++)); -+ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); -+ -+ _avx512_expand_weyl_3( a1, w1 ); -+ _avx512_expand_weyl_3( a2, w2 ); -+ _avx512_expand_weyl_3( a3, w3 ); -+ -+ up = ++u; -+ u += 1; -+ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); -+ _avx512_add_to_spinors_4( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); -+ -+ -+ /******************************* direction 3 *********************************/ -+ sp = pl + (*(piup++)); -+ _mm_prefetch( (char *) sp, _MM_HINT_T0 ); -+ sm = pl + (*(pidn++)); -+ _mm_prefetch( (char *) sm, _MM_HINT_T0 ); -+ -+ _avx512_expand_weyl_4( a1, w1 ); -+ _avx512_expand_weyl_4( a2, w2 ); -+ _avx512_expand_weyl_4( a3, w3 ); -+ -+ up = ++u; -+ u += 1; -+ avx512_su3_mul_quad_dble( *u, *up, b1, b2, b3, a1, a2, a3 ); -+ -+ _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); -+ _avx512_add_to_spinors_5( b1, b2, b3, &(*sp).c3.c1.re, &(*sm).c3.c1.re ); -+} -diff --git a/modules/dirac/avx512/Dw_dble_avx512_asm.s b/modules/dirac/avx512/Dw_dble_avx512_asm.s -new file mode 100644 -index 0000000..f76b428 ---- /dev/null -+++ b/modules/dirac/avx512/Dw_dble_avx512_asm.s -@@ -0,0 +1,1306 @@ -+# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -+# mark_description "-I../../../include -I.. 
-I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -+# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -+# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -+# mark_description "es -S"; -+ .file "Dw_dble_avx512.c" -+ .text -+..TXTST0: -+# -- Begin doe_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl doe_dble_avx512 -+# --- doe_dble_avx512(const int *, const int *, const su3_dble *, const spinor_dble *, double, spin_t *) -+doe_dble_avx512: -+# parameter 1: %rdi -+# parameter 2: %rsi -+# parameter 3: %rdx -+# parameter 4: %rcx -+# parameter 5: %xmm0 -+# parameter 6: %r8 -+..B1.1: # Preds ..B1.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_doe_dble_avx512.1: -+..L2: -+ #27.1 -+ pushq %rbp #27.1 -+ .cfi_def_cfa_offset 16 -+ movq %rsp, %rbp #27.1 -+ .cfi_def_cfa 6, 16 -+ .cfi_offset 6, -16 -+ movslq (%rdi), %rax #42.16 -+ movslq (%rsi), %r9 #43.16 -+ vmovups .L_2il0floatpacket.14(%rip), %zmm13 #45.3 -+ vmovups .L_2il0floatpacket.15(%rip), %zmm15 #45.3 -+ vmovups .L_2il0floatpacket.16(%rip), %zmm16 #45.3 -+ vmovups .L_2il0floatpacket.17(%rip), %zmm19 #45.3 -+ vmovups .L_2il0floatpacket.18(%rip), %zmm18 #45.3 -+ vmovsd %xmm0, -16(%rbp) #27.1 -+ vmovaps %zmm13, %zmm28 #45.3 -+ lea (%rax,%rax,2), %r11 #42.8 -+ shlq $6, %r11 #42.8 -+ lea (%r9,%r9,2), %r10 #43.8 -+ shlq $6, %r10 #43.8 -+ movl $15, %eax #48.8 -+ vmovaps %zmm13, %zmm27 #46.3 -+ kmovw %eax, %k4 #48.8 -+ movl $240, %eax #49.8 -+ kmovw %eax, %k3 #49.8 -+ vmovups (%rcx,%r10), %zmm30 #45.3 -+ vmovups (%rcx,%r11), %zmm24 #45.3 -+ vmovups 96(%rcx,%r10), %zmm29 #46.3 -+ vmovups 96(%rcx,%r11), %zmm23 #46.3 -+ vpermi2pd %zmm24, %zmm30, %zmm28 #45.3 -+ vpermt2pd 64(%rcx,%r11), %zmm16, %zmm24 #45.3 -+ vpermt2pd 64(%rcx,%r10), %zmm15, %zmm30 #45.3 -+ vpermi2pd %zmm23, %zmm29, %zmm27 #46.3 -+ vpermt2pd 160(%rcx,%r10), %zmm15, %zmm29 #46.3 -+ vpermt2pd 160(%rcx,%r11), %zmm16, %zmm23 #46.3 -+ vaddpd %zmm27, %zmm28, %zmm7{%k4}{z} #48.8 -+ movslq 4(%rdi), %rax #55.16 -+ vmovaps %zmm19, %zmm26 #45.3 -+ vmovaps %zmm19, %zmm25 #46.3 -+ vpermi2pd %zmm30, %zmm24, %zmm26 #45.3 -+ lea (%rax,%rax,2), %rax #55.8 -+ vpermt2pd %zmm30, %zmm18, %zmm24 #45.3 -+ vpermi2pd %zmm29, %zmm23, %zmm25 #46.3 -+ vpermt2pd %zmm29, %zmm18, %zmm23 #46.3 -+ vsubpd %zmm27, %zmm28, %zmm7{%k3} #49.8 -+ vaddpd %zmm25, %zmm26, %zmm4{%k4}{z} #50.8 -+ vaddpd %zmm23, %zmm24, %zmm14{%k4}{z} #52.8 -+ vsubpd %zmm25, %zmm26, %zmm4{%k3} #51.8 -+ vsubpd %zmm23, %zmm24, %zmm14{%k3} #53.8 -+ shlq $6, %rax #55.8 -+ prefetcht0 (%rcx,%rax) #56.3 -+ movslq 4(%rsi), %r9 #57.16 -+ vpermilpd $85, %zmm7, %zmm22 #62.3 -+ vpermilpd $85, %zmm4, %zmm10 #62.3 -+ vpermilpd $85, %zmm14, %zmm29 #62.3 -+ lea (%r9,%r9,2), %r10 #57.8 -+ movl $90, %r9d #74.8 -+ kmovw %r9d, %k1 #74.8 -+ movl $165, %r9d #75.8 -+ kmovw %r9d, %k2 #75.8 -+ shlq $6, %r10 #57.8 -+ movl $175, %r9d #92.3 -+ kmovw %r9d, %k5 #92.3 -+ movl $80, %r9d #92.3 -+ kmovw %r9d, %k6 #92.3 -+ movl $60, %r9d #102.8 -+ kmovw %r9d, %k7 #102.8 -+ prefetcht0 (%rcx,%r10) #58.3 -+ vmovups .L_2il0floatpacket.19(%rip), %zmm30 #62.3 -+ vmovups (%rdx), %zmm11 #62.3 -+ vmovups .L_2il0floatpacket.25(%rip), %zmm24 #62.3 -+ vmovups 64(%rdx), %zmm17 #62.3 -+ vmovups 128(%rdx), %zmm8 #62.3 -+ vmulpd %zmm10, %zmm30, %zmm21 #62.3 -+ vmulpd %zmm29, %zmm30, %zmm10 #62.3 -+ vmulpd %zmm30, %zmm22, %zmm6 #62.3 -+ vmovups 
.L_2il0floatpacket.20(%rip), %zmm29 #62.3 -+ vmovups .L_2il0floatpacket.27(%rip), %zmm22 #62.3 -+ vmovaps %zmm11, %zmm28 #62.3 -+ vpermt2pd 144(%rdx), %zmm29, %zmm28 #62.3 -+ vmulpd %zmm7, %zmm28, %zmm27 #62.3 -+ vmovups .L_2il0floatpacket.21(%rip), %zmm28 #62.3 -+ vmovaps %zmm11, %zmm26 #62.3 -+ vpermt2pd 144(%rdx), %zmm28, %zmm26 #62.3 -+ vfmadd213pd %zmm27, %zmm6, %zmm26 #62.3 -+ vmovups .L_2il0floatpacket.22(%rip), %zmm27 #62.3 -+ vmovaps %zmm11, %zmm25 #62.3 -+ vpermt2pd 144(%rdx), %zmm27, %zmm25 #62.3 -+ vfmadd213pd %zmm26, %zmm4, %zmm25 #62.3 -+ vmovups .L_2il0floatpacket.23(%rip), %zmm26 #62.3 -+ vmovaps %zmm11, %zmm9 #62.3 -+ vpermt2pd 144(%rdx), %zmm26, %zmm9 #62.3 -+ vfmadd213pd %zmm25, %zmm21, %zmm9 #62.3 -+ vmovups .L_2il0floatpacket.24(%rip), %zmm25 #62.3 -+ vmovaps %zmm11, %zmm23 #62.3 -+ vpermt2pd 208(%rdx), %zmm25, %zmm23 #62.3 -+ vfmadd213pd %zmm9, %zmm14, %zmm23 #62.3 -+ vmovaps %zmm11, %zmm9 #62.3 -+ vpermt2pd 208(%rdx), %zmm24, %zmm9 #62.3 -+ vfmadd213pd %zmm23, %zmm10, %zmm9 #62.3 -+ vmovups .L_2il0floatpacket.26(%rip), %zmm23 #62.3 -+ vmovaps %zmm17, %zmm1 #62.3 -+ vmovaps %zmm11, %zmm3 #62.3 -+ vpermt2pd 144(%rdx), %zmm25, %zmm1 #62.3 -+ vpermt2pd 144(%rdx), %zmm23, %zmm3 #62.3 -+ vpermt2pd 144(%rdx), %zmm22, %zmm11 #62.3 -+ vmulpd %zmm1, %zmm7, %zmm31 #62.3 -+ vmulpd %zmm3, %zmm7, %zmm12 #62.3 -+ vmovups 96(%rcx,%r10), %zmm3 #71.3 -+ vfmadd213pd %zmm12, %zmm6, %zmm11 #62.3 -+ vmovaps %zmm17, %zmm7 #62.3 -+ vpermt2pd 144(%rdx), %zmm24, %zmm7 #62.3 -+ vfmadd213pd %zmm31, %zmm6, %zmm7 #62.3 -+ vmovaps %zmm17, %zmm6 #62.3 -+ vmovaps %zmm17, %zmm5 #62.3 -+ vpermt2pd 208(%rdx), %zmm23, %zmm6 #62.3 -+ vpermt2pd 208(%rdx), %zmm29, %zmm5 #62.3 -+ vfmadd213pd %zmm7, %zmm4, %zmm6 #62.3 -+ vfmadd213pd %zmm11, %zmm4, %zmm5 #62.3 -+ vmovups 96(%rcx,%rax), %zmm7 #71.3 -+ vmovaps %zmm17, %zmm2 #62.3 -+ vmovaps %zmm17, %zmm20 #62.3 -+ vmovaps %zmm17, %zmm11 #62.3 -+ vpermt2pd 208(%rdx), %zmm22, %zmm17 #62.3 -+ vpermt2pd 208(%rdx), %zmm28, %zmm2 #62.3 -+ vpermt2pd 208(%rdx), %zmm27, %zmm20 #62.3 -+ vpermt2pd 208(%rdx), %zmm26, %zmm11 #62.3 -+ vfmadd213pd %zmm6, %zmm21, %zmm17 #62.3 -+ vfmadd213pd %zmm5, %zmm21, %zmm2 #62.3 -+ vmovaps %zmm8, %zmm21 #62.3 -+ vpermt2pd 272(%rdx), %zmm29, %zmm21 #62.3 -+ vpermt2pd 272(%rdx), %zmm28, %zmm8 #62.3 -+ vfmadd213pd %zmm2, %zmm14, %zmm20 #62.3 -+ vfmadd213pd %zmm17, %zmm14, %zmm21 #62.3 -+ vfmadd213pd %zmm20, %zmm10, %zmm11 #62.3 -+ vfmadd213pd %zmm21, %zmm10, %zmm8 #62.3 -+ vmovups .L_2il0floatpacket.28(%rip), %zmm21 #64.3 -+ vmovups .L_2il0floatpacket.31(%rip), %zmm20 #71.3 -+ vpermpd %zmm9, %zmm21, %zmm17 #64.3 -+ vpermpd %zmm11, %zmm21, %zmm14 #65.3 -+ vpermpd %zmm8, %zmm21, %zmm4 #66.3 -+ vaddpd %zmm9, %zmm17, %zmm10{%k4}{z} #64.3 -+ vsubpd %zmm9, %zmm17, %zmm10{%k3} #64.3 -+ vaddpd %zmm11, %zmm14, %zmm9{%k4}{z} #65.3 -+ vmovups .L_2il0floatpacket.29(%rip), %zmm17 #71.3 -+ vsubpd %zmm11, %zmm14, %zmm9{%k3} #65.3 -+ vaddpd %zmm8, %zmm4, %zmm11{%k4}{z} #66.3 -+ vmovups .L_2il0floatpacket.30(%rip), %zmm14 #71.3 -+ vsubpd %zmm8, %zmm4, %zmm11{%k3} #66.3 -+ vmovups (%rcx,%r10), %zmm8 #70.3 -+ vmovups (%rcx,%rax), %zmm4 #70.3 -+ vmovaps %zmm3, %zmm12 #71.3 -+ vmovaps %zmm13, %zmm5 #70.3 -+ vpermt2pd %zmm7, %zmm17, %zmm12 #71.3 -+ vpermt2pd 160(%rcx,%r10), %zmm14, %zmm3 #71.3 -+ vpermt2pd 160(%rcx,%rax), %zmm20, %zmm7 #71.3 -+ vpermi2pd %zmm4, %zmm8, %zmm5 #70.3 -+ vpermt2pd 64(%rcx,%r10), %zmm15, %zmm8 #70.3 -+ vpermt2pd 64(%rcx,%rax), %zmm16, %zmm4 #70.3 -+ vmovaps %zmm19, %zmm0 #71.3 -+ vmovaps %zmm19, %zmm6 #70.3 -+ vpermi2pd %zmm3, %zmm7, 
%zmm0 #71.3 -+ vpermi2pd %zmm8, %zmm4, %zmm6 #70.3 -+ vpermt2pd %zmm8, %zmm18, %zmm4 #70.3 -+ vpermt2pd %zmm3, %zmm18, %zmm7 #71.3 -+ vpermilpd $85, %zmm12, %zmm1 #73.8 -+ vaddpd %zmm1, %zmm5, %zmm2{%k1}{z} #74.8 -+ vpermilpd $85, %zmm0, %zmm8 #76.8 -+ movslq 8(%rdi), %r11 #83.16 -+ vsubpd %zmm1, %zmm5, %zmm2{%k2} #75.8 -+ vaddpd %zmm8, %zmm6, %zmm5{%k1}{z} #77.8 -+ vsubpd %zmm8, %zmm6, %zmm5{%k2} #78.8 -+ lea (%r11,%r11,2), %r10 #83.8 -+ vpermilpd $85, %zmm7, %zmm6 #79.8 -+ shlq $6, %r10 #83.8 -+ vaddpd %zmm6, %zmm4, %zmm8{%k1}{z} #80.8 -+ vsubpd %zmm6, %zmm4, %zmm8{%k2} #81.8 -+ prefetcht0 (%rcx,%r10) #84.3 -+ movslq 8(%rsi), %rax #85.16 -+ vpermilpd $85, %zmm2, %zmm4 #90.3 -+ vpermilpd $85, %zmm5, %zmm12 #90.3 -+ vpermilpd $85, %zmm8, %zmm1 #90.3 -+ lea (%rax,%rax,2), %r9 #85.8 -+ shlq $6, %r9 #85.8 -+ movl $63, %eax #117.3 -+ kmovw %eax, %k1 #117.3 -+ movl $192, %eax #117.3 -+ kmovw %eax, %k2 #117.3 -+ vmulpd %zmm30, %zmm4, %zmm3 #90.3 -+ vmulpd %zmm12, %zmm30, %zmm4 #90.3 -+ vmulpd %zmm1, %zmm30, %zmm7 #90.3 -+ prefetcht0 (%rcx,%r9) #86.3 -+ movl $195, %eax #101.8 -+ vmovups 288(%rdx), %zmm1 #90.3 -+ vmovups 352(%rdx), %zmm12 #90.3 -+ vmovups 416(%rdx), %zmm6 #90.3 -+ vmovaps %zmm1, %zmm31 #90.3 -+ vpermt2pd 432(%rdx), %zmm29, %zmm31 #90.3 -+ vmulpd %zmm2, %zmm31, %zmm0 #90.3 -+ vmovaps %zmm1, %zmm31 #90.3 -+ vpermt2pd 432(%rdx), %zmm28, %zmm31 #90.3 -+ vfmadd213pd %zmm0, %zmm3, %zmm31 #90.3 -+ vmovaps %zmm1, %zmm0 #90.3 -+ vpermt2pd 432(%rdx), %zmm27, %zmm0 #90.3 -+ vfmadd213pd %zmm31, %zmm5, %zmm0 #90.3 -+ vmovaps %zmm1, %zmm31 #90.3 -+ vpermt2pd 432(%rdx), %zmm26, %zmm31 #90.3 -+ vfmadd213pd %zmm0, %zmm4, %zmm31 #90.3 -+ vmovaps %zmm1, %zmm0 #90.3 -+ vpermt2pd 496(%rdx), %zmm25, %zmm0 #90.3 -+ vfmadd213pd %zmm31, %zmm8, %zmm0 #90.3 -+ vmovaps %zmm1, %zmm31 #90.3 -+ vpermt2pd 496(%rdx), %zmm24, %zmm31 #90.3 -+ vfmadd213pd %zmm0, %zmm7, %zmm31 #90.3 -+ vmovaps %zmm1, %zmm0 #90.3 -+ vpermt2pd 432(%rdx), %zmm23, %zmm0 #90.3 -+ vpermt2pd 432(%rdx), %zmm22, %zmm1 #90.3 -+ vmulpd %zmm0, %zmm2, %zmm0 #90.3 -+ vfmadd213pd %zmm0, %zmm3, %zmm1 #90.3 -+ vmovaps %zmm12, %zmm0 #90.3 -+ vpermt2pd 496(%rdx), %zmm29, %zmm0 #90.3 -+ vfmadd213pd %zmm1, %zmm5, %zmm0 #90.3 -+ vmovaps %zmm12, %zmm1 #90.3 -+ vpermt2pd 496(%rdx), %zmm28, %zmm1 #90.3 -+ vfmadd213pd %zmm0, %zmm4, %zmm1 #90.3 -+ vmovaps %zmm12, %zmm0 #90.3 -+ vpermt2pd 496(%rdx), %zmm27, %zmm0 #90.3 -+ vfmadd213pd %zmm1, %zmm8, %zmm0 #90.3 -+ vmovaps %zmm12, %zmm1 #90.3 -+ vpermt2pd 496(%rdx), %zmm26, %zmm1 #90.3 -+ vfmadd213pd %zmm0, %zmm7, %zmm1 #90.3 -+ vmovaps %zmm12, %zmm0 #90.3 -+ vpermt2pd 432(%rdx), %zmm25, %zmm0 #90.3 -+ vmulpd %zmm0, %zmm2, %zmm2 #90.3 -+ vmovaps %zmm12, %zmm0 #90.3 -+ vpermt2pd 432(%rdx), %zmm24, %zmm0 #90.3 -+ vfmadd213pd %zmm2, %zmm3, %zmm0 #90.3 -+ vmovups .L_2il0floatpacket.32(%rip), %zmm2 #92.3 -+ vmovaps %zmm12, %zmm3 #90.3 -+ vpermt2pd 496(%rdx), %zmm23, %zmm3 #90.3 -+ vpermt2pd 496(%rdx), %zmm22, %zmm12 #90.3 -+ vfmadd213pd %zmm0, %zmm5, %zmm3 #90.3 -+ vfmadd213pd %zmm3, %zmm4, %zmm12 #90.3 -+ vpermpd %zmm31, %zmm21, %zmm4 #92.3 -+ vmovaps %zmm6, %zmm5 #90.3 -+ vpermt2pd 560(%rdx), %zmm29, %zmm5 #90.3 -+ vpermt2pd 560(%rdx), %zmm28, %zmm6 #90.3 -+ vaddpd %zmm31, %zmm4, %zmm4{%k4} #92.3 -+ vfmadd213pd %zmm12, %zmm8, %zmm5 #90.3 -+ vsubpd %zmm4, %zmm31, %zmm4{%k3} #92.3 -+ vpermpd %zmm1, %zmm21, %zmm8 #93.3 -+ vfmadd213pd %zmm5, %zmm7, %zmm6 #90.3 -+ vmovups 96(%rcx,%r9), %zmm31 #99.3 -+ vpermpd %zmm4, %zmm2, %zmm7 #92.3 -+ vpermpd %zmm6, %zmm21, %zmm0 #94.3 -+ vaddpd %zmm1, %zmm8, %zmm8{%k4} #93.3 -+ vaddpd 
%zmm7, %zmm10, %zmm10{%k5} #92.3 -+ vaddpd %zmm6, %zmm0, %zmm0{%k4} #94.3 -+ vsubpd %zmm8, %zmm1, %zmm8{%k3} #93.3 -+ vsubpd %zmm7, %zmm10, %zmm10{%k6} #92.3 -+ vsubpd %zmm0, %zmm6, %zmm0{%k3} #94.3 -+ vpermpd %zmm8, %zmm2, %zmm12 #93.3 -+ vmovups (%rcx,%r9), %zmm1 #98.3 -+ vmovups (%rcx,%r10), %zmm7 #98.3 -+ vmovups 96(%rcx,%r10), %zmm8 #99.3 -+ vpermpd %zmm0, %zmm2, %zmm6 #94.3 -+ vaddpd %zmm12, %zmm9, %zmm9{%k5} #93.3 -+ vpermi2pd %zmm8, %zmm31, %zmm17 #99.3 -+ vaddpd %zmm6, %zmm11, %zmm11{%k5} #94.3 -+ vpermt2pd 160(%rcx,%r10), %zmm20, %zmm8 #99.3 -+ vpermt2pd 160(%rcx,%r9), %zmm14, %zmm31 #99.3 -+ vsubpd %zmm6, %zmm11, %zmm11{%k6} #94.3 -+ vsubpd %zmm12, %zmm9, %zmm9{%k6} #93.3 -+ kmovw %eax, %k5 #101.8 -+ vmovaps %zmm13, %zmm3 #98.3 -+ vpermi2pd %zmm7, %zmm1, %zmm3 #98.3 -+ vpermt2pd 64(%rcx,%r9), %zmm15, %zmm1 #98.3 -+ vpermt2pd 64(%rcx,%r10), %zmm16, %zmm7 #98.3 -+ vaddpd %zmm17, %zmm3, %zmm2{%k5}{z} #101.8 -+ movslq 12(%rdi), %rdi #108.15 -+ vmovaps %zmm19, %zmm6 #98.3 -+ vmovaps %zmm19, %zmm20 #99.3 -+ vpermi2pd %zmm1, %zmm7, %zmm6 #98.3 -+ lea (%rdi,%rdi,2), %r9 #108.8 -+ vpermt2pd %zmm1, %zmm18, %zmm7 #98.3 -+ vpermi2pd %zmm31, %zmm8, %zmm20 #99.3 -+ vpermt2pd %zmm31, %zmm18, %zmm8 #99.3 -+ vsubpd %zmm17, %zmm3, %zmm2{%k7} #102.8 -+ vaddpd %zmm20, %zmm6, %zmm3{%k5}{z} #103.8 -+ vaddpd %zmm8, %zmm7, %zmm4{%k5}{z} #105.8 -+ vsubpd %zmm20, %zmm6, %zmm3{%k7} #104.8 -+ vsubpd %zmm8, %zmm7, %zmm4{%k7} #106.8 -+ shlq $6, %r9 #108.8 -+ prefetcht0 (%rcx,%r9) #109.3 -+ movslq 12(%rsi), %rsi #110.15 -+ vpermilpd $85, %zmm2, %zmm6 #115.3 -+ vpermilpd $85, %zmm3, %zmm17 #115.3 -+ vpermilpd $85, %zmm4, %zmm14 #115.3 -+ lea (%rsi,%rsi,2), %rax #110.8 -+ shlq $6, %rax #110.8 -+ movl $150, %esi #127.8 -+ kmovw %esi, %k5 #127.8 -+ movl $105, %esi #128.8 -+ kmovw %esi, %k6 #128.8 -+ vmulpd %zmm30, %zmm6, %zmm5 #115.3 -+ vmulpd %zmm17, %zmm30, %zmm12 #115.3 -+ vmulpd %zmm14, %zmm30, %zmm8 #115.3 -+ prefetcht0 (%rcx,%rax) #111.3 -+ vmovups 576(%rdx), %zmm20 #115.3 -+ vmovups 640(%rdx), %zmm7 #115.3 -+ vmovups 704(%rdx), %zmm6 #115.3 -+ vmovaps %zmm20, %zmm17 #115.3 -+ vpermt2pd 720(%rdx), %zmm29, %zmm17 #115.3 -+ vmulpd %zmm2, %zmm17, %zmm14 #115.3 -+ vmovaps %zmm20, %zmm0 #115.3 -+ vpermt2pd 720(%rdx), %zmm28, %zmm0 #115.3 -+ vfmadd213pd %zmm14, %zmm5, %zmm0 #115.3 -+ vmovaps %zmm20, %zmm1 #115.3 -+ vpermt2pd 720(%rdx), %zmm27, %zmm1 #115.3 -+ vfmadd213pd %zmm0, %zmm3, %zmm1 #115.3 -+ vmovaps %zmm20, %zmm31 #115.3 -+ vmovaps %zmm20, %zmm0 #115.3 -+ vpermt2pd 720(%rdx), %zmm26, %zmm31 #115.3 -+ vpermt2pd 720(%rdx), %zmm23, %zmm0 #115.3 -+ vfmadd213pd %zmm1, %zmm12, %zmm31 #115.3 -+ vmulpd %zmm0, %zmm2, %zmm1 #115.3 -+ vmovaps %zmm20, %zmm17 #115.3 -+ vpermt2pd 784(%rdx), %zmm25, %zmm17 #115.3 -+ vfmadd213pd %zmm31, %zmm4, %zmm17 #115.3 -+ vmovaps %zmm20, %zmm14 #115.3 -+ vpermt2pd 720(%rdx), %zmm22, %zmm20 #115.3 -+ vpermt2pd 784(%rdx), %zmm24, %zmm14 #115.3 -+ vfmadd213pd %zmm1, %zmm5, %zmm20 #115.3 -+ vfmadd213pd %zmm17, %zmm8, %zmm14 #115.3 -+ vmovaps %zmm7, %zmm17 #115.3 -+ vpermt2pd 784(%rdx), %zmm29, %zmm17 #115.3 -+ vfmadd213pd %zmm20, %zmm3, %zmm17 #115.3 -+ vmovaps %zmm7, %zmm20 #115.3 -+ vpermt2pd 784(%rdx), %zmm28, %zmm20 #115.3 -+ vmovaps %zmm7, %zmm1 #115.3 -+ vpermt2pd 720(%rdx), %zmm25, %zmm1 #115.3 -+ vfmadd213pd %zmm17, %zmm12, %zmm20 #115.3 -+ vmulpd %zmm1, %zmm2, %zmm2 #115.3 -+ vmovaps %zmm7, %zmm0 #115.3 -+ vpermt2pd 784(%rdx), %zmm27, %zmm0 #115.3 -+ vfmadd213pd %zmm20, %zmm4, %zmm0 #115.3 -+ vmovaps %zmm7, %zmm17 #115.3 -+ vpermt2pd 784(%rdx), %zmm26, %zmm17 #115.3 -+ 
vfmadd213pd %zmm0, %zmm8, %zmm17 #115.3 -+ vmovaps %zmm7, %zmm0 #115.3 -+ vpermt2pd 720(%rdx), %zmm24, %zmm0 #115.3 -+ vfmadd213pd %zmm2, %zmm5, %zmm0 #115.3 -+ vmovaps %zmm7, %zmm5 #115.3 -+ vpermt2pd 784(%rdx), %zmm23, %zmm5 #115.3 -+ vpermt2pd 784(%rdx), %zmm22, %zmm7 #115.3 -+ vfmadd213pd %zmm0, %zmm3, %zmm5 #115.3 -+ vmovups .L_2il0floatpacket.33(%rip), %zmm0 #117.3 -+ vfmadd213pd %zmm5, %zmm12, %zmm7 #115.3 -+ vmovups 96(%rcx,%rax), %zmm5 #124.3 -+ vmovaps %zmm6, %zmm3 #115.3 -+ vpermt2pd 848(%rdx), %zmm29, %zmm3 #115.3 -+ vpermt2pd 848(%rdx), %zmm28, %zmm6 #115.3 -+ vfmadd213pd %zmm7, %zmm4, %zmm3 #115.3 -+ vpermpd %zmm14, %zmm21, %zmm4 #117.3 -+ vfmadd213pd %zmm3, %zmm8, %zmm6 #115.3 -+ vmovups (%rcx,%rax), %zmm3 #123.3 -+ vaddpd %zmm14, %zmm4, %zmm4{%k4} #117.3 -+ vpermpd %zmm6, %zmm21, %zmm1 #119.3 -+ vsubpd %zmm4, %zmm14, %zmm4{%k3} #117.3 -+ vaddpd %zmm6, %zmm1, %zmm1{%k4} #119.3 -+ vpermpd %zmm4, %zmm0, %zmm12 #117.3 -+ vpermpd %zmm17, %zmm21, %zmm4 #118.3 -+ vsubpd %zmm1, %zmm6, %zmm1{%k3} #119.3 -+ vaddpd %zmm12, %zmm10, %zmm10{%k1} #117.3 -+ vaddpd %zmm17, %zmm4, %zmm4{%k4} #118.3 -+ vpermpd %zmm1, %zmm0, %zmm2 #119.3 -+ vsubpd %zmm12, %zmm10, %zmm10{%k2} #117.3 -+ vsubpd %zmm4, %zmm17, %zmm4{%k3} #118.3 -+ vaddpd %zmm2, %zmm11, %zmm11{%k1} #119.3 -+ vpermpd %zmm4, %zmm0, %zmm12 #118.3 -+ vsubpd %zmm2, %zmm11, %zmm11{%k2} #119.3 -+ vmovups (%rcx,%r9), %zmm4 #123.3 -+ vmovups 96(%rcx,%r9), %zmm2 #124.3 -+ vaddpd %zmm12, %zmm9, %zmm9{%k1} #118.3 -+ vmovaps %zmm13, %zmm6 #123.3 -+ vpermi2pd %zmm4, %zmm3, %zmm6 #123.3 -+ vpermt2pd 64(%rcx,%r9), %zmm16, %zmm4 #123.3 -+ vpermt2pd 64(%rcx,%rax), %zmm15, %zmm3 #123.3 -+ vpermi2pd %zmm2, %zmm5, %zmm13 #124.3 -+ vpermt2pd 160(%rcx,%rax), %zmm15, %zmm5 #124.3 -+ vpermt2pd 160(%rcx,%r9), %zmm16, %zmm2 #124.3 -+ vsubpd %zmm12, %zmm9, %zmm9{%k2} #118.3 -+ vmovaps %zmm19, %zmm0 #123.3 -+ vpermi2pd %zmm3, %zmm4, %zmm0 #123.3 -+ vpermt2pd %zmm3, %zmm18, %zmm4 #123.3 -+ vpermi2pd %zmm5, %zmm2, %zmm19 #124.3 -+ vpermt2pd %zmm5, %zmm18, %zmm2 #124.3 -+ vpermilpd $85, %zmm13, %zmm18 #126.8 -+ vaddpd %zmm18, %zmm6, %zmm13{%k5}{z} #127.8 -+ vpermilpd $85, %zmm19, %zmm1 #129.8 -+ vsubpd %zmm18, %zmm6, %zmm13{%k6} #128.8 -+ vaddpd %zmm1, %zmm0, %zmm12{%k5}{z} #130.8 -+ # LOE rdx rbx r8 r12 r13 r14 r15 zmm0 zmm1 zmm2 zmm4 zmm9 zmm10 zmm11 zmm12 zmm13 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 k3 k4 k5 k6 -+..B1.4: # Preds ..B1.1 -+ # Execution count [1.00e+00] -+ vpermilpd $85, %zmm2, %zmm14 #132.8 -+ movl $111, %eax #140.3 -+ vmovups 864(%rdx), %zmm3 #138.3 -+ vmovups 1008(%rdx), %zmm2 #138.3 -+ vmovups 928(%rdx), %zmm5 #138.3 -+ vmovups 992(%rdx), %zmm7 #138.3 -+ vmovups 1136(%rdx), %zmm6 #138.3 -+ vsubpd %zmm1, %zmm0, %zmm12{%k6} #131.8 -+ vaddpd %zmm14, %zmm4, %zmm8{%k5}{z} #133.8 -+ kmovw %eax, %k1 #140.3 -+ vsubpd %zmm14, %zmm4, %zmm8{%k6} #134.8 -+ vmovups 1072(%rdx), %zmm4 #138.3 -+ vmovaps %zmm29, %zmm18 #138.3 -+ movl $144, %eax #140.3 -+ vpermi2pd %zmm2, %zmm3, %zmm18 #138.3 -+ kmovw %eax, %k2 #140.3 -+ vmulpd %zmm13, %zmm18, %zmm19 #138.3 -+ vpermilpd $85, %zmm13, %zmm15 #138.3 -+ vpermilpd $85, %zmm12, %zmm16 #138.3 -+ vmulpd %zmm30, %zmm15, %zmm1 #138.3 -+ vmulpd %zmm16, %zmm30, %zmm0 #138.3 -+ vmovaps %zmm23, %zmm16 #138.3 -+ vmovaps %zmm25, %zmm15 #138.3 -+ vpermi2pd %zmm2, %zmm3, %zmm16 #138.3 -+ vpermi2pd %zmm2, %zmm5, %zmm25 #138.3 -+ vpermi2pd %zmm4, %zmm3, %zmm15 #138.3 -+ vpermi2pd %zmm4, %zmm5, %zmm23 #138.3 -+ vpermilpd $85, %zmm8, %zmm17 #138.3 -+ vmulpd %zmm17, %zmm30, %zmm30 #138.3 -+ vmulpd %zmm16, 
%zmm13, %zmm17 #138.3 -+ vmulpd %zmm25, %zmm13, %zmm13 #138.3 -+ vmovaps %zmm28, %zmm20 #138.3 -+ vpermi2pd %zmm2, %zmm3, %zmm20 #138.3 -+ vfmadd213pd %zmm19, %zmm1, %zmm20 #138.3 -+ vmovaps %zmm27, %zmm31 #138.3 -+ vmovaps %zmm26, %zmm14 #138.3 -+ vmovaps %zmm24, %zmm19 #138.3 -+ vpermi2pd %zmm2, %zmm3, %zmm31 #138.3 -+ vpermi2pd %zmm2, %zmm3, %zmm14 #138.3 -+ vpermi2pd %zmm4, %zmm3, %zmm19 #138.3 -+ vpermt2pd %zmm2, %zmm22, %zmm3 #138.3 -+ vpermi2pd %zmm2, %zmm5, %zmm24 #138.3 -+ vpermi2pd %zmm4, %zmm5, %zmm27 #138.3 -+ vpermi2pd %zmm4, %zmm5, %zmm26 #138.3 -+ vfmadd213pd %zmm17, %zmm1, %zmm3 #138.3 -+ vfmadd213pd %zmm13, %zmm1, %zmm24 #138.3 -+ vfmadd213pd %zmm20, %zmm12, %zmm31 #138.3 -+ vfmadd213pd %zmm24, %zmm12, %zmm23 #138.3 -+ vfmadd213pd %zmm31, %zmm0, %zmm14 #138.3 -+ vmovups .L_2il0floatpacket.34(%rip), %zmm24 #140.3 -+ vfmadd213pd %zmm14, %zmm8, %zmm15 #138.3 -+ vmovaps %zmm29, %zmm18 #138.3 -+ vpermi2pd %zmm4, %zmm5, %zmm18 #138.3 -+ vpermi2pd %zmm6, %zmm7, %zmm29 #138.3 -+ vpermt2pd %zmm6, %zmm28, %zmm7 #138.3 -+ vfmadd213pd %zmm3, %zmm12, %zmm18 #138.3 -+ vfmadd213pd %zmm15, %zmm30, %zmm19 #138.3 -+ vmovaps %zmm28, %zmm3 #138.3 -+ vpermi2pd %zmm4, %zmm5, %zmm3 #138.3 -+ vpermt2pd %zmm4, %zmm22, %zmm5 #138.3 -+ vpermpd %zmm19, %zmm21, %zmm12 #140.3 -+ vfmadd213pd %zmm18, %zmm0, %zmm3 #138.3 -+ vfmadd213pd %zmm23, %zmm0, %zmm5 #138.3 -+ vmovups .L_2il0floatpacket.35(%rip), %zmm28 #150.3 -+ vaddpd %zmm19, %zmm12, %zmm12{%k4} #140.3 -+ vfmadd213pd %zmm3, %zmm8, %zmm27 #138.3 -+ vfmadd213pd %zmm5, %zmm8, %zmm29 #138.3 -+ vsubpd %zmm12, %zmm19, %zmm12{%k3} #140.3 -+ vfmadd213pd %zmm27, %zmm30, %zmm26 #138.3 -+ vfmadd213pd %zmm29, %zmm30, %zmm7 #138.3 -+ vmovups .L_2il0floatpacket.36(%rip), %zmm29 #150.3 -+ vpermpd %zmm26, %zmm21, %zmm23 #141.3 -+ vpermpd %zmm7, %zmm21, %zmm21 #142.3 -+ vpermpd %zmm12, %zmm24, %zmm22 #140.3 -+ vaddpd %zmm26, %zmm23, %zmm23{%k4} #141.3 -+ vaddpd %zmm7, %zmm21, %zmm21{%k4} #142.3 -+ vaddpd %zmm22, %zmm10, %zmm10{%k1} #140.3 -+ vsubpd %zmm23, %zmm26, %zmm23{%k3} #141.3 -+ vsubpd %zmm21, %zmm7, %zmm21{%k3} #142.3 -+ vsubpd %zmm22, %zmm10, %zmm10{%k2} #140.3 -+ vpermpd %zmm23, %zmm24, %zmm26 #141.3 -+ vpermpd %zmm21, %zmm24, %zmm25 #142.3 -+ vbroadcastsd -16(%rbp), %zmm27 #145.10 -+ vaddpd %zmm26, %zmm9, %zmm9{%k1} #141.3 -+ vaddpd %zmm25, %zmm11, %zmm11{%k1} #142.3 -+ vmulpd %zmm10, %zmm27, %zmm10 #146.8 -+ vsubpd %zmm26, %zmm9, %zmm9{%k2} #141.3 -+ vsubpd %zmm25, %zmm11, %zmm11{%k2} #142.3 -+ vmulpd %zmm9, %zmm27, %zmm0 #147.8 -+ vmulpd %zmm11, %zmm27, %zmm9 #148.8 -+ vmovups .L_2il0floatpacket.37(%rip), %zmm11 #150.3 -+ vpermi2pd %zmm0, %zmm10, %zmm28 #150.3 -+ vpermi2pd %zmm10, %zmm9, %zmm29 #150.3 -+ vpermt2pd %zmm9, %zmm11, %zmm0 #150.3 -+ vmovupd %ymm28, (%r8) #150.3 -+ vmovupd %ymm29, 32(%r8) #150.3 -+ vmovupd %ymm0, 64(%r8) #150.3 -+ vextractf64x4 $1, %zmm28, 96(%r8) #150.3 -+ vextractf64x4 $1, %zmm29, 128(%r8) #150.3 -+ vextractf64x4 $1, %zmm0, 160(%r8) #150.3 -+ vzeroupper #151.1 -+ movq %rbp, %rsp #151.1 -+ popq %rbp #151.1 -+ .cfi_restore 6 -+ ret #151.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type doe_dble_avx512,@function -+ .size doe_dble_avx512,.-doe_dble_avx512 -+ .data -+# -- End doe_dble_avx512 -+ .text -+# -- Begin deo_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl deo_dble_avx512 -+# --- deo_dble_avx512(const int *, const int *, const su3_dble *, spinor_dble *, double, spin_t *) -+deo_dble_avx512: -+# parameter 1: %rdi -+# parameter 2: %rsi -+# parameter 3: %rdx -+# parameter 4: %rcx -+# 
parameter 5: %xmm0 -+# parameter 6: %r8 -+..B2.1: # Preds ..B2.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_deo_dble_avx512.8: -+..L9: -+ #154.1 -+ pushq %rbp #154.1 -+ .cfi_def_cfa_offset 16 -+ movq %rsp, %rbp #154.1 -+ .cfi_def_cfa 6, 16 -+ .cfi_offset 6, -16 -+ andq $-64, %rsp #154.1 -+ movslq (%rdi), %rax #168.16 -+ lea (%rax,%rax,2), %r11 #168.8 -+ shlq $6, %r11 #168.8 -+ prefetcht0 (%rcx,%r11) #169.3 -+ movl $15, %eax #181.3 -+ movslq (%rsi), %r9 #170.16 -+ kmovw %eax, %k5 #181.3 -+ movl $240, %eax #181.3 -+ kmovw %eax, %k6 #181.3 -+ vbroadcastsd %xmm0, %zmm24 #176.10 -+ movl $90, %eax #198.3 -+ lea (%r9,%r9,2), %r10 #170.8 -+ shlq $6, %r10 #170.8 -+ kmovw %eax, %k1 #198.3 -+ movl $165, %eax #198.3 -+ kmovw %eax, %k2 #198.3 -+ movl $195, %eax #215.3 -+ kmovw %eax, %k4 #215.3 -+ movl $60, %eax #215.3 -+ kmovw %eax, %k3 #215.3 -+ prefetcht0 (%rcx,%r10) #171.3 -+ vmovups 96(%r8), %zmm27 #173.3 -+ vmovups (%r8), %zmm23 #173.3 -+ vmovups .L_2il0floatpacket.14(%rip), %zmm26 #173.3 -+ vmovups .L_2il0floatpacket.15(%rip), %zmm30 #173.3 -+ vmovups .L_2il0floatpacket.16(%rip), %zmm29 #173.3 -+ vmovups .L_2il0floatpacket.17(%rip), %zmm25 #173.3 -+ vmovups .L_2il0floatpacket.18(%rip), %zmm28 #173.3 -+ vmovups .L_2il0floatpacket.28(%rip), %zmm20 #181.3 -+ vmovups 144(%rdx), %zmm18 #187.3 -+ vmovups 208(%rdx), %zmm13 #187.3 -+ vmovups 272(%rdx), %zmm1 #187.3 -+ vmovups (%rcx,%r10), %zmm3 #189.3 -+ vpermi2pd %zmm23, %zmm27, %zmm26 #173.3 -+ vpermt2pd 160(%r8), %zmm30, %zmm27 #173.3 -+ vpermt2pd 64(%r8), %zmm29, %zmm23 #173.3 -+ vmulpd %zmm26, %zmm24, %zmm30 #177.8 -+ vpermi2pd %zmm27, %zmm23, %zmm25 #173.3 -+ vpermt2pd %zmm27, %zmm28, %zmm23 #173.3 -+ vpermpd %zmm30, %zmm20, %zmm22 #181.3 -+ vmulpd %zmm25, %zmm24, %zmm29 #178.8 -+ vmulpd %zmm23, %zmm24, %zmm28 #179.8 -+ vmovups .L_2il0floatpacket.19(%rip), %zmm27 #187.3 -+ vpermpd %zmm29, %zmm20, %zmm21 #182.3 -+ vaddpd %zmm30, %zmm22, %zmm6{%k5}{z} #181.3 -+ vpermpd %zmm28, %zmm20, %zmm19 #183.3 -+ vaddpd %zmm29, %zmm21, %zmm10{%k5}{z} #182.3 -+ vsubpd %zmm30, %zmm22, %zmm6{%k6} #181.3 -+ vaddpd %zmm28, %zmm19, %zmm12{%k5}{z} #183.3 -+ vsubpd %zmm29, %zmm21, %zmm10{%k6} #182.3 -+ vsubpd %zmm28, %zmm19, %zmm12{%k6} #183.3 -+ vmovups .L_2il0floatpacket.27(%rip), %zmm19 #187.3 -+ vpermilpd $85, %zmm10, %zmm26 #187.3 -+ vmulpd %zmm26, %zmm27, %zmm16 #187.3 -+ vmovups .L_2il0floatpacket.20(%rip), %zmm26 #187.3 -+ vmovaps %zmm18, %zmm24 #187.3 -+ vpermt2pd (%rdx), %zmm26, %zmm24 #187.3 -+ vpermilpd $85, %zmm6, %zmm15 #187.3 -+ vmulpd %zmm27, %zmm15, %zmm17 #187.3 -+ vmulpd %zmm6, %zmm24, %zmm23 #187.3 -+ vmovups .L_2il0floatpacket.22(%rip), %zmm24 #187.3 -+ vpermilpd $85, %zmm12, %zmm25 #187.3 -+ vmulpd %zmm25, %zmm27, %zmm15 #187.3 -+ vmovups .L_2il0floatpacket.21(%rip), %zmm25 #187.3 -+ vmovaps %zmm18, %zmm22 #187.3 -+ vpermt2pd (%rdx), %zmm25, %zmm22 #187.3 -+ vfmadd213pd %zmm23, %zmm17, %zmm22 #187.3 -+ vmovups .L_2il0floatpacket.23(%rip), %zmm23 #187.3 -+ vmovaps %zmm18, %zmm14 #187.3 -+ vpermt2pd (%rdx), %zmm24, %zmm14 #187.3 -+ vfmadd213pd %zmm22, %zmm10, %zmm14 #187.3 -+ vmovups .L_2il0floatpacket.24(%rip), %zmm22 #187.3 -+ vmovaps %zmm18, %zmm21 #187.3 -+ vpermt2pd (%rdx), %zmm23, %zmm21 #187.3 -+ vfmadd213pd %zmm14, %zmm16, %zmm21 #187.3 -+ vmovaps %zmm18, %zmm20 #187.3 -+ vpermt2pd 64(%rdx), %zmm22, %zmm20 #187.3 -+ vfmadd213pd %zmm21, %zmm12, %zmm20 #187.3 -+ vmovups .L_2il0floatpacket.25(%rip), %zmm21 #187.3 -+ vmovaps %zmm18, %zmm14 #187.3 -+ vpermt2pd 64(%rdx), %zmm21, %zmm14 #187.3 -+ vfmadd213pd %zmm20, %zmm15, 
%zmm14 #187.3 -+ vmovups .L_2il0floatpacket.26(%rip), %zmm20 #187.3 -+ vmovaps %zmm18, %zmm11 #187.3 -+ vpermt2pd (%rdx), %zmm20, %zmm11 #187.3 -+ vpermt2pd (%rdx), %zmm19, %zmm18 #187.3 -+ vmulpd %zmm11, %zmm6, %zmm31 #187.3 -+ vfmadd213pd %zmm31, %zmm17, %zmm18 #187.3 -+ vmovaps %zmm13, %zmm8 #187.3 -+ vmovaps %zmm13, %zmm2 #187.3 -+ vpermt2pd 64(%rdx), %zmm26, %zmm8 #187.3 -+ vpermt2pd (%rdx), %zmm22, %zmm2 #187.3 -+ vfmadd213pd %zmm18, %zmm10, %zmm8 #187.3 -+ vmulpd %zmm2, %zmm6, %zmm18 #187.3 -+ vmovaps %zmm13, %zmm5 #187.3 -+ vpermt2pd (%rdx), %zmm21, %zmm5 #187.3 -+ vfmadd213pd %zmm18, %zmm17, %zmm5 #187.3 -+ vmovups .L_2il0floatpacket.38(%rip), %zmm18 #189.3 -+ vmovaps %zmm13, %zmm17 #187.3 -+ vpermt2pd 64(%rdx), %zmm20, %zmm17 #187.3 -+ vfmadd213pd %zmm5, %zmm10, %zmm17 #187.3 -+ vmovups 64(%rcx,%r10), %zmm10 #189.3 -+ vmovaps %zmm13, %zmm9 #187.3 -+ vmovaps %zmm13, %zmm7 #187.3 -+ vmovaps %zmm13, %zmm11 #187.3 -+ vpermt2pd 64(%rdx), %zmm19, %zmm13 #187.3 -+ vpermt2pd 64(%rdx), %zmm25, %zmm9 #187.3 -+ vpermt2pd 64(%rdx), %zmm24, %zmm7 #187.3 -+ vpermt2pd 64(%rdx), %zmm23, %zmm11 #187.3 -+ vpermt2pd 64(%rcx,%r11), %zmm18, %zmm10 #189.3 -+ vfmadd213pd %zmm17, %zmm16, %zmm13 #187.3 -+ vfmadd213pd %zmm8, %zmm16, %zmm9 #187.3 -+ vmovups 96(%rcx,%r10), %zmm8 #190.3 -+ vmovups .L_2il0floatpacket.39(%rip), %zmm17 #189.3 -+ vfmadd213pd %zmm9, %zmm12, %zmm7 #187.3 -+ vmovups 160(%rcx,%r10), %zmm9 #190.3 -+ vfmadd213pd %zmm7, %zmm15, %zmm11 #187.3 -+ vpermt2pd 160(%rcx,%r11), %zmm18, %zmm9 #190.3 -+ vmovaps %zmm1, %zmm16 #187.3 -+ vpermt2pd 128(%rdx), %zmm26, %zmm16 #187.3 -+ vpermt2pd 128(%rdx), %zmm25, %zmm1 #187.3 -+ vfmadd213pd %zmm13, %zmm12, %zmm16 #187.3 -+ vfmadd213pd %zmm16, %zmm15, %zmm1 #187.3 -+ vmovups .L_2il0floatpacket.36(%rip), %zmm15 #189.3 -+ vmovups .L_2il0floatpacket.35(%rip), %zmm16 #189.3 -+ vmovaps %zmm1, %zmm12 #189.3 -+ vmovaps %zmm14, %zmm13 #189.3 -+ vpermt2pd %zmm14, %zmm15, %zmm12 #189.3 -+ vpermt2pd %zmm11, %zmm16, %zmm13 #189.3 -+ vmovups .L_2il0floatpacket.37(%rip), %zmm14 #189.3 -+ vmovaps %zmm8, %zmm0 #190.3 -+ vpermt2pd %zmm1, %zmm14, %zmm11 #189.3 -+ vpermt2pd 96(%rcx,%r11), %zmm18, %zmm0 #190.3 -+ vpermt2pd 96(%rcx,%r11), %zmm17, %zmm8 #190.3 -+ vaddpd %zmm11, %zmm9, %zmm9{%k5} #190.3 -+ vaddpd %zmm13, %zmm0, %zmm0{%k5} #190.3 -+ vaddpd %zmm12, %zmm8, %zmm8{%k5} #190.3 -+ vaddpd %zmm11, %zmm10, %zmm31 #189.3 -+ vsubpd %zmm13, %zmm0, %zmm0{%k6} #190.3 -+ vsubpd %zmm12, %zmm8, %zmm8{%k6} #190.3 -+ vsubpd %zmm11, %zmm9, %zmm9{%k6} #190.3 -+ vmovaps %zmm3, %zmm4 #189.3 -+ vpermt2pd (%rcx,%r11), %zmm18, %zmm4 #189.3 -+ vpermt2pd (%rcx,%r11), %zmm17, %zmm3 #189.3 -+ vaddpd %zmm13, %zmm4, %zmm6 #189.3 -+ vaddpd %zmm12, %zmm3, %zmm1 #189.3 -+ movslq 4(%rdi), %rax #193.16 -+ vmovupd %ymm6, (%rcx,%r10) #189.3 -+ vmovupd %ymm1, 32(%rcx,%r10) #189.3 -+ vmovupd %ymm31, 64(%rcx,%r10) #189.3 -+ vextractf64x4 $1, %zmm6, (%rcx,%r11) #189.3 -+ vextractf64x4 $1, %zmm1, 32(%rcx,%r11) #189.3 -+ vextractf64x4 $1, %zmm31, 64(%rcx,%r11) #189.3 -+ vmovupd %ymm0, 96(%rcx,%r10) #190.3 -+ vmovupd %ymm8, 128(%rcx,%r10) #190.3 -+ vmovupd %ymm9, 160(%rcx,%r10) #190.3 -+ vextractf64x4 $1, %zmm0, 96(%rcx,%r11) #190.3 -+ vextractf64x4 $1, %zmm8, 128(%rcx,%r11) #190.3 -+ vextractf64x4 $1, %zmm9, 160(%rcx,%r11) #190.3 -+ lea (%rax,%rax,2), %r10 #193.8 -+ shlq $6, %r10 #193.8 -+ prefetcht0 (%rcx,%r10) #194.3 -+ movslq 4(%rsi), %r8 #195.16 -+ lea (%r8,%r8,2), %r9 #195.8 -+ shlq $6, %r9 #195.8 -+ prefetcht0 (%rcx,%r9) #196.3 -+ vmovups .L_2il0floatpacket.40(%rip), %zmm7 #198.3 -+ vmovups 
.L_2il0floatpacket.41(%rip), %zmm4 #198.3 -+ vmovups 432(%rdx), %zmm6 #204.3 -+ vmovups 496(%rdx), %zmm8 #204.3 -+ vmovups 560(%rdx), %zmm9 #204.3 -+ vpermpd %zmm30, %zmm7, %zmm12 #198.3 -+ vpermpd %zmm30, %zmm4, %zmm13 #198.3 -+ vpermpd %zmm29, %zmm7, %zmm11 #199.3 -+ vpermpd %zmm28, %zmm7, %zmm3 #200.3 -+ vaddpd %zmm12, %zmm13, %zmm5{%k1}{z} #198.3 -+ vsubpd %zmm12, %zmm13, %zmm5{%k2} #198.3 -+ vpermpd %zmm29, %zmm4, %zmm12 #199.3 -+ vaddpd %zmm11, %zmm12, %zmm2{%k1}{z} #199.3 -+ vsubpd %zmm11, %zmm12, %zmm2{%k2} #199.3 -+ vpermpd %zmm28, %zmm4, %zmm11 #200.3 -+ vaddpd %zmm3, %zmm11, %zmm7{%k1}{z} #200.3 -+ vsubpd %zmm3, %zmm11, %zmm7{%k2} #200.3 -+ vmovaps %zmm6, %zmm3 #204.3 -+ vpermt2pd 288(%rdx), %zmm26, %zmm3 #204.3 -+ vpermilpd $85, %zmm5, %zmm10 #204.3 -+ vmulpd %zmm27, %zmm10, %zmm0 #204.3 -+ vmulpd %zmm5, %zmm3, %zmm10 #204.3 -+ vpermilpd $85, %zmm2, %zmm1 #204.3 -+ vpermilpd $85, %zmm7, %zmm4 #204.3 -+ vmulpd %zmm1, %zmm27, %zmm31 #204.3 -+ vmulpd %zmm4, %zmm27, %zmm1 #204.3 -+ vmovaps %zmm6, %zmm4 #204.3 -+ vpermt2pd 288(%rdx), %zmm25, %zmm4 #204.3 -+ vfmadd213pd %zmm10, %zmm0, %zmm4 #204.3 -+ vmovaps %zmm6, %zmm10 #204.3 -+ vpermt2pd 288(%rdx), %zmm24, %zmm10 #204.3 -+ vfmadd213pd %zmm4, %zmm2, %zmm10 #204.3 -+ vmovaps %zmm6, %zmm3 #204.3 -+ vpermt2pd 288(%rdx), %zmm23, %zmm3 #204.3 -+ vfmadd213pd %zmm10, %zmm31, %zmm3 #204.3 -+ vmovaps %zmm6, %zmm4 #204.3 -+ vpermt2pd 352(%rdx), %zmm22, %zmm4 #204.3 -+ vfmadd213pd %zmm3, %zmm7, %zmm4 #204.3 -+ vmovaps %zmm6, %zmm10 #204.3 -+ vmovaps %zmm6, %zmm3 #204.3 -+ vpermt2pd 352(%rdx), %zmm21, %zmm10 #204.3 -+ vpermt2pd 288(%rdx), %zmm20, %zmm3 #204.3 -+ vpermt2pd 288(%rdx), %zmm19, %zmm6 #204.3 -+ vfmadd213pd %zmm4, %zmm1, %zmm10 #204.3 -+ vmulpd %zmm3, %zmm5, %zmm4 #204.3 -+ vfmadd213pd %zmm4, %zmm0, %zmm6 #204.3 -+ vmovaps %zmm8, %zmm3 #204.3 -+ vpermt2pd 352(%rdx), %zmm26, %zmm3 #204.3 -+ vfmadd213pd %zmm6, %zmm2, %zmm3 #204.3 -+ vmovaps %zmm8, %zmm6 #204.3 -+ vpermt2pd 352(%rdx), %zmm25, %zmm6 #204.3 -+ vfmadd213pd %zmm3, %zmm31, %zmm6 #204.3 -+ vmovaps %zmm8, %zmm3 #204.3 -+ vpermt2pd 288(%rdx), %zmm22, %zmm3 #204.3 -+ vmovaps %zmm8, %zmm4 #204.3 -+ vpermt2pd 352(%rdx), %zmm24, %zmm4 #204.3 -+ vmulpd %zmm3, %zmm5, %zmm5 #204.3 -+ vfmadd213pd %zmm6, %zmm7, %zmm4 #204.3 -+ vmovaps %zmm8, %zmm6 #204.3 -+ vpermt2pd 352(%rdx), %zmm23, %zmm6 #204.3 -+ vfmadd213pd %zmm4, %zmm1, %zmm6 #204.3 -+ vmovaps %zmm8, %zmm4 #204.3 -+ vpermt2pd 288(%rdx), %zmm21, %zmm4 #204.3 -+ vfmadd213pd %zmm5, %zmm0, %zmm4 #204.3 -+ vmovaps %zmm8, %zmm0 #204.3 -+ vpermt2pd 352(%rdx), %zmm20, %zmm0 #204.3 -+ vpermt2pd 352(%rdx), %zmm19, %zmm8 #204.3 -+ vfmadd213pd %zmm4, %zmm2, %zmm0 #204.3 -+ vfmadd213pd %zmm0, %zmm31, %zmm8 #204.3 -+ vmovups (%rcx,%r9), %zmm0 #206.3 -+ vmovaps %zmm9, %zmm2 #204.3 -+ vpermt2pd 416(%rdx), %zmm26, %zmm2 #204.3 -+ vpermt2pd 416(%rdx), %zmm25, %zmm9 #204.3 -+ vfmadd213pd %zmm8, %zmm7, %zmm2 #204.3 -+ vmovups 64(%rcx,%r9), %zmm7 #206.3 -+ vfmadd213pd %zmm2, %zmm1, %zmm9 #204.3 -+ vpermt2pd 64(%rcx,%r10), %zmm18, %zmm7 #206.3 -+ vmovaps %zmm9, %zmm8 #206.3 -+ vmovaps %zmm0, %zmm1 #206.3 -+ vpermt2pd (%rcx,%r10), %zmm17, %zmm0 #206.3 -+ vpermt2pd %zmm10, %zmm15, %zmm8 #206.3 -+ vpermt2pd (%rcx,%r10), %zmm18, %zmm1 #206.3 -+ vaddpd %zmm8, %zmm0, %zmm4 #206.3 -+ vmovups .L_2il0floatpacket.43(%rip), %zmm0 #207.3 -+ vmovaps %zmm10, %zmm31 #206.3 -+ vmovaps %zmm6, %zmm5 #206.3 -+ vpermt2pd %zmm6, %zmm16, %zmm31 #206.3 -+ vpermt2pd %zmm9, %zmm14, %zmm5 #206.3 -+ vpermi2pd %zmm10, %zmm9, %zmm0 #207.3 -+ vaddpd %zmm31, %zmm1, %zmm2 #206.3 
-+ vaddpd %zmm5, %zmm7, %zmm3 #206.3 -+ vmovups 96(%rcx,%r9), %zmm7 #207.3 -+ vmovups .L_2il0floatpacket.42(%rip), %zmm1 #207.3 -+ vmovaps %zmm10, %zmm31 #207.3 -+ vmovups .L_2il0floatpacket.44(%rip), %zmm10 #207.3 -+ vmovups %zmm1, -64(%rsp) #207.3[spill] -+ vpermt2pd %zmm6, %zmm1, %zmm31 #207.3 -+ vpermt2pd %zmm9, %zmm10, %zmm6 #207.3 -+ vmovaps %zmm7, %zmm8 #207.3 -+ vpermt2pd 96(%rcx,%r10), %zmm18, %zmm8 #207.3 -+ vpermt2pd 96(%rcx,%r10), %zmm17, %zmm7 #207.3 -+ vaddpd %zmm31, %zmm8, %zmm8{%k2} #207.3 -+ vaddpd %zmm0, %zmm7, %zmm7{%k2} #207.3 -+ vsubpd %zmm31, %zmm8, %zmm8{%k1} #207.3 -+ vsubpd %zmm0, %zmm7, %zmm7{%k1} #207.3 -+ movslq 8(%rdi), %r11 #210.16 -+ lea (%r11,%r11,2), %r8 #210.8 -+ vmovupd %ymm2, (%rcx,%r9) #206.3 -+ vmovupd %ymm4, 32(%rcx,%r9) #206.3 -+ shlq $6, %r8 #210.8 -+ vmovupd %ymm3, 64(%rcx,%r9) #206.3 -+ vextractf64x4 $1, %zmm2, (%rcx,%r10) #206.3 -+ vmovups 160(%rcx,%r9), %zmm2 #207.3 -+ vextractf64x4 $1, %zmm4, 32(%rcx,%r10) #206.3 -+ vextractf64x4 $1, %zmm3, 64(%rcx,%r10) #206.3 -+ vpermt2pd 160(%rcx,%r10), %zmm18, %zmm2 #207.3 -+ vaddpd %zmm6, %zmm2, %zmm2{%k2} #207.3 -+ vsubpd %zmm6, %zmm2, %zmm2{%k1} #207.3 -+ vmovupd %ymm8, 96(%rcx,%r9) #207.3 -+ vmovupd %ymm7, 128(%rcx,%r9) #207.3 -+ vmovupd %ymm2, 160(%rcx,%r9) #207.3 -+ vextractf64x4 $1, %zmm8, 96(%rcx,%r10) #207.3 -+ vextractf64x4 $1, %zmm7, 128(%rcx,%r10) #207.3 -+ vextractf64x4 $1, %zmm2, 160(%rcx,%r10) #207.3 -+ prefetcht0 (%rcx,%r8) #211.3 -+ movslq 8(%rsi), %rax #212.16 -+ lea (%rax,%rax,2), %rax #212.8 -+ shlq $6, %rax #212.8 -+ prefetcht0 (%rcx,%rax) #213.3 -+ vmovups .L_2il0floatpacket.45(%rip), %zmm1 #215.3 -+ vpermpd %zmm30, %zmm1, %zmm6 #215.3 -+ vpermpd %zmm29, %zmm1, %zmm9 #216.3 -+ vaddpd %zmm6, %zmm13, %zmm2{%k4}{z} #215.3 -+ vaddpd %zmm9, %zmm12, %zmm5{%k4}{z} #216.3 -+ vsubpd %zmm6, %zmm13, %zmm2{%k3} #215.3 -+ vpermpd %zmm28, %zmm1, %zmm6 #217.3 -+ vsubpd %zmm9, %zmm12, %zmm5{%k3} #216.3 -+ vmovups 720(%rdx), %zmm1 #221.3 -+ vmovups 784(%rdx), %zmm9 #221.3 -+ vaddpd %zmm6, %zmm11, %zmm8{%k4}{z} #217.3 -+ vpermilpd $85, %zmm2, %zmm31 #221.3 -+ vmulpd %zmm27, %zmm31, %zmm3 #221.3 -+ vsubpd %zmm6, %zmm11, %zmm8{%k3} #217.3 -+ vmovups 848(%rdx), %zmm6 #221.3 -+ vmovaps %zmm1, %zmm31 #221.3 -+ vpermt2pd 576(%rdx), %zmm26, %zmm31 #221.3 -+ vpermilpd $85, %zmm5, %zmm0 #221.3 -+ vmulpd %zmm0, %zmm27, %zmm4 #221.3 -+ vmulpd %zmm2, %zmm31, %zmm0 #221.3 -+ vmovaps %zmm1, %zmm31 #221.3 -+ vpermt2pd 576(%rdx), %zmm25, %zmm31 #221.3 -+ vfmadd213pd %zmm0, %zmm3, %zmm31 #221.3 -+ vmovaps %zmm1, %zmm0 #221.3 -+ vpermt2pd 576(%rdx), %zmm24, %zmm0 #221.3 -+ vfmadd213pd %zmm31, %zmm5, %zmm0 #221.3 -+ vmovaps %zmm1, %zmm31 #221.3 -+ vpermt2pd 576(%rdx), %zmm23, %zmm31 #221.3 -+ vpermilpd $85, %zmm8, %zmm7 #221.3 -+ vmulpd %zmm7, %zmm27, %zmm7 #221.3 -+ vfmadd213pd %zmm0, %zmm4, %zmm31 #221.3 -+ vmovaps %zmm1, %zmm0 #221.3 -+ vpermt2pd 640(%rdx), %zmm22, %zmm0 #221.3 -+ vfmadd213pd %zmm31, %zmm8, %zmm0 #221.3 -+ vmovaps %zmm1, %zmm31 #221.3 -+ vpermt2pd 640(%rdx), %zmm21, %zmm31 #221.3 -+ vfmadd213pd %zmm0, %zmm7, %zmm31 #221.3 -+ vmovaps %zmm1, %zmm0 #221.3 -+ vpermt2pd 576(%rdx), %zmm20, %zmm0 #221.3 -+ vpermt2pd 576(%rdx), %zmm19, %zmm1 #221.3 -+ vmulpd %zmm0, %zmm2, %zmm0 #221.3 -+ vfmadd213pd %zmm0, %zmm3, %zmm1 #221.3 -+ vmovaps %zmm9, %zmm0 #221.3 -+ vpermt2pd 640(%rdx), %zmm26, %zmm0 #221.3 -+ vfmadd213pd %zmm1, %zmm5, %zmm0 #221.3 -+ vmovaps %zmm9, %zmm1 #221.3 -+ vpermt2pd 640(%rdx), %zmm25, %zmm1 #221.3 -+ vfmadd213pd %zmm0, %zmm4, %zmm1 #221.3 -+ vmovaps %zmm9, %zmm0 #221.3 -+ vpermt2pd 
640(%rdx), %zmm24, %zmm0 #221.3 -+ vfmadd213pd %zmm1, %zmm8, %zmm0 #221.3 -+ vmovaps %zmm9, %zmm1 #221.3 -+ vpermt2pd 640(%rdx), %zmm23, %zmm1 #221.3 -+ vfmadd213pd %zmm0, %zmm7, %zmm1 #221.3 -+ vmovaps %zmm9, %zmm0 #221.3 -+ vpermt2pd 576(%rdx), %zmm22, %zmm0 #221.3 -+ vmulpd %zmm0, %zmm2, %zmm0 #221.3 -+ vmovaps %zmm9, %zmm2 #221.3 -+ vpermt2pd 576(%rdx), %zmm21, %zmm2 #221.3 -+ vfmadd213pd %zmm0, %zmm3, %zmm2 #221.3 -+ vmovaps %zmm9, %zmm3 #221.3 -+ vpermt2pd 640(%rdx), %zmm20, %zmm3 #221.3 -+ vpermt2pd 640(%rdx), %zmm19, %zmm9 #221.3 -+ vfmadd213pd %zmm2, %zmm5, %zmm3 #221.3 -+ vfmadd213pd %zmm3, %zmm4, %zmm9 #221.3 -+ vmovaps %zmm6, %zmm5 #221.3 -+ vpermt2pd 704(%rdx), %zmm26, %zmm5 #221.3 -+ vpermt2pd 704(%rdx), %zmm25, %zmm6 #221.3 -+ vfmadd213pd %zmm9, %zmm8, %zmm5 #221.3 -+ vmovups (%rcx,%rax), %zmm8 #223.3 -+ vfmadd213pd %zmm5, %zmm7, %zmm6 #221.3 -+ vmovups 64(%rcx,%rax), %zmm5 #223.3 -+ vmovaps %zmm8, %zmm2 #223.3 -+ vmovaps %zmm31, %zmm4 #223.3 -+ vmovaps %zmm6, %zmm3 #223.3 -+ vpermt2pd (%rcx,%r8), %zmm18, %zmm2 #223.3 -+ vpermt2pd (%rcx,%r8), %zmm17, %zmm8 #223.3 -+ vpermt2pd %zmm1, %zmm16, %zmm4 #223.3 -+ vpermt2pd %zmm31, %zmm15, %zmm3 #223.3 -+ vpermt2pd 64(%rcx,%r8), %zmm18, %zmm5 #223.3 -+ vaddpd %zmm4, %zmm2, %zmm0 #223.3 -+ vaddpd %zmm3, %zmm8, %zmm9 #223.3 -+ vmovaps %zmm1, %zmm7 #223.3 -+ vpermt2pd %zmm6, %zmm14, %zmm7 #223.3 -+ vaddpd %zmm7, %zmm5, %zmm5 #223.3 -+ vmovupd %ymm0, (%rcx,%rax) #223.3 -+ vmovupd %ymm9, 32(%rcx,%rax) #223.3 -+ # LOE rax rdx rcx rbx rsi rdi r8 r12 r13 r14 r15 zmm0 zmm1 zmm5 zmm6 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31 k1 k2 k3 k4 k5 k6 -+..B2.4: # Preds ..B2.1 -+ # Execution count [1.00e+00] -+ vmovups .L_2il0floatpacket.46(%rip), %zmm7 #224.3 -+ vmovups 96(%rcx,%r8), %zmm8 #224.3 -+ vpermpd %zmm31, %zmm7, %zmm4 #224.3 -+ vpermpd %zmm1, %zmm7, %zmm2 #224.3 -+ vmovups 160(%rcx,%rax), %zmm31 #224.3 -+ vmovaps %zmm18, %zmm1 #224.3 -+ vmovaps %zmm15, %zmm3 #224.3 -+ vpermt2pd 160(%rcx,%r8), %zmm18, %zmm31 #224.3 -+ vmovupd %ymm5, 64(%rcx,%rax) #223.3 -+ vextractf64x4 $1, %zmm9, 32(%rcx,%r8) #223.3 -+ vextractf64x4 $1, %zmm0, (%rcx,%r8) #223.3 -+ vextractf64x4 $1, %zmm5, 64(%rcx,%r8) #223.3 -+ vpermpd %zmm6, %zmm7, %zmm9 #224.3 -+ vmovups 96(%rcx,%rax), %zmm6 #224.3 -+ vpermi2pd %zmm4, %zmm9, %zmm3 #224.3 -+ vpermi2pd %zmm8, %zmm6, %zmm1 #224.3 -+ vpermt2pd %zmm8, %zmm17, %zmm6 #224.3 -+ vmovaps %zmm16, %zmm0 #224.3 -+ vpermi2pd %zmm2, %zmm4, %zmm0 #224.3 -+ vpermt2pd %zmm9, %zmm14, %zmm2 #224.3 -+ vaddpd %zmm3, %zmm6, %zmm6{%k3} #224.3 -+ vaddpd %zmm0, %zmm1, %zmm1{%k6} #224.3 -+ vaddpd %zmm2, %zmm31, %zmm31{%k5} #224.3 -+ vsubpd %zmm3, %zmm6, %zmm6{%k4} #224.3 -+ vsubpd %zmm0, %zmm1, %zmm1{%k5} #224.3 -+ vsubpd %zmm2, %zmm31, %zmm31{%k6} #224.3 -+ vmovupd %ymm1, 96(%rcx,%rax) #224.3 -+ vmovupd %ymm6, 128(%rcx,%rax) #224.3 -+ vmovupd %ymm31, 160(%rcx,%rax) #224.3 -+ vextractf64x4 $1, %zmm1, 96(%rcx,%r8) #224.3 -+ vextractf64x4 $1, %zmm6, 128(%rcx,%r8) #224.3 -+ vextractf64x4 $1, %zmm31, 160(%rcx,%r8) #224.3 -+ movslq 12(%rdi), %rax #228.16 -+ lea (%rax,%rax,2), %r8 #228.8 -+ shlq $6, %r8 #228.8 -+ prefetcht0 (%rcx,%r8) #229.3 -+ movl $150, %eax #233.3 -+ movslq 12(%rsi), %rsi #230.16 -+ kmovw %eax, %k4 #233.3 -+ movl $105, %eax #233.3 -+ kmovw %eax, %k3 #233.3 -+ lea (%rsi,%rsi,2), %rdi #230.8 -+ shlq $6, %rdi #230.8 -+ prefetcht0 (%rcx,%rdi) #231.3 -+ vmovups .L_2il0floatpacket.47(%rip), %zmm5 #233.3 -+ vmovups 1072(%rdx), %zmm1 #239.3 -+ 
vmovups 928(%rdx), %zmm6 #239.3 -+ vmovups 992(%rdx), %zmm31 #239.3 -+ vmovups 1136(%rdx), %zmm0 #239.3 -+ vpermpd %zmm30, %zmm5, %zmm4 #233.3 -+ vpermpd %zmm28, %zmm5, %zmm28 #235.3 -+ vaddpd %zmm4, %zmm13, %zmm30{%k4}{z} #233.3 -+ vaddpd %zmm28, %zmm11, %zmm2{%k4}{z} #235.3 -+ vsubpd %zmm4, %zmm13, %zmm30{%k3} #233.3 -+ vpermpd %zmm29, %zmm5, %zmm13 #234.3 -+ vsubpd %zmm28, %zmm11, %zmm2{%k3} #235.3 -+ vmovups 1008(%rdx), %zmm29 #239.3 -+ vaddpd %zmm13, %zmm12, %zmm3{%k4}{z} #234.3 -+ vsubpd %zmm13, %zmm12, %zmm3{%k3} #234.3 -+ vmovups 864(%rdx), %zmm13 #239.3 -+ vpermilpd $85, %zmm30, %zmm12 #239.3 -+ vpermilpd $85, %zmm3, %zmm11 #239.3 -+ vpermilpd $85, %zmm2, %zmm7 #239.3 -+ vmulpd %zmm27, %zmm12, %zmm28 #239.3 -+ vmulpd %zmm11, %zmm27, %zmm12 #239.3 -+ vmulpd %zmm7, %zmm27, %zmm11 #239.3 -+ vmovaps %zmm26, %zmm27 #239.3 -+ vpermi2pd %zmm13, %zmm29, %zmm27 #239.3 -+ vmulpd %zmm30, %zmm27, %zmm8 #239.3 -+ vmovaps %zmm25, %zmm27 #239.3 -+ vpermi2pd %zmm13, %zmm29, %zmm27 #239.3 -+ vfmadd213pd %zmm8, %zmm28, %zmm27 #239.3 -+ vmovaps %zmm24, %zmm4 #239.3 -+ vpermi2pd %zmm13, %zmm29, %zmm4 #239.3 -+ vpermi2pd %zmm6, %zmm1, %zmm24 #239.3 -+ vfmadd213pd %zmm27, %zmm3, %zmm4 #239.3 -+ vmovaps %zmm23, %zmm5 #239.3 -+ vmovaps %zmm20, %zmm9 #239.3 -+ vpermi2pd %zmm13, %zmm29, %zmm5 #239.3 -+ vpermi2pd %zmm13, %zmm29, %zmm9 #239.3 -+ vpermi2pd %zmm6, %zmm1, %zmm20 #239.3 -+ vpermi2pd %zmm6, %zmm1, %zmm23 #239.3 -+ vfmadd213pd %zmm4, %zmm12, %zmm5 #239.3 -+ vmulpd %zmm9, %zmm30, %zmm4 #239.3 -+ vmovaps %zmm22, %zmm7 #239.3 -+ vpermi2pd %zmm13, %zmm1, %zmm22 #239.3 -+ vpermi2pd %zmm6, %zmm29, %zmm7 #239.3 -+ vmulpd %zmm22, %zmm30, %zmm22 #239.3 -+ vfmadd213pd %zmm5, %zmm2, %zmm7 #239.3 -+ vmovaps %zmm21, %zmm27 #239.3 -+ vpermi2pd %zmm6, %zmm29, %zmm27 #239.3 -+ vpermt2pd %zmm13, %zmm19, %zmm29 #239.3 -+ vpermi2pd %zmm13, %zmm1, %zmm21 #239.3 -+ vfmadd213pd %zmm7, %zmm11, %zmm27 #239.3 -+ vfmadd213pd %zmm4, %zmm28, %zmm29 #239.3 -+ vfmadd213pd %zmm22, %zmm28, %zmm21 #239.3 -+ vmovaps %zmm26, %zmm5 #239.3 -+ vpermi2pd %zmm6, %zmm1, %zmm5 #239.3 -+ vpermi2pd %zmm31, %zmm0, %zmm26 #239.3 -+ vpermt2pd %zmm31, %zmm25, %zmm0 #239.3 -+ vfmadd213pd %zmm29, %zmm3, %zmm5 #239.3 -+ vfmadd213pd %zmm21, %zmm3, %zmm20 #239.3 -+ vmovups (%rcx,%rdi), %zmm21 #241.3 -+ vmovaps %zmm25, %zmm29 #239.3 -+ vpermi2pd %zmm6, %zmm1, %zmm29 #239.3 -+ vpermt2pd %zmm6, %zmm19, %zmm1 #239.3 -+ vmovups (%rcx,%r8), %zmm19 #241.3 -+ vfmadd213pd %zmm5, %zmm12, %zmm29 #239.3 -+ vfmadd213pd %zmm20, %zmm12, %zmm1 #239.3 -+ vmovups 96(%rcx,%r8), %zmm25 #242.3 -+ vfmadd213pd %zmm29, %zmm2, %zmm24 #239.3 -+ vfmadd213pd %zmm1, %zmm2, %zmm26 #239.3 -+ vmovups 160(%rcx,%rdi), %zmm2 #242.3 -+ vmovups 96(%rcx,%rdi), %zmm1 #242.3 -+ vfmadd213pd %zmm24, %zmm11, %zmm23 #239.3 -+ vmovups 64(%rcx,%rdi), %zmm24 #241.3 -+ vfmadd213pd %zmm26, %zmm11, %zmm0 #239.3 -+ vpermt2pd 160(%rcx,%r8), %zmm18, %zmm2 #242.3 -+ vpermt2pd 64(%rcx,%r8), %zmm18, %zmm24 #241.3 -+ vpermi2pd %zmm23, %zmm27, %zmm16 #241.3 -+ vpermi2pd %zmm0, %zmm23, %zmm14 #241.3 -+ vpermi2pd %zmm23, %zmm27, %zmm10 #242.3 -+ vpermi2pd %zmm27, %zmm0, %zmm15 #241.3 -+ vaddpd %zmm14, %zmm24, %zmm14 #241.3 -+ vmovaps %zmm18, %zmm20 #241.3 -+ vmovaps %zmm18, %zmm26 #242.3 -+ vpermi2pd %zmm19, %zmm21, %zmm20 #241.3 -+ vpermt2pd %zmm19, %zmm17, %zmm21 #241.3 -+ vpermi2pd %zmm25, %zmm1, %zmm26 #242.3 -+ vpermt2pd %zmm25, %zmm17, %zmm1 #242.3 -+ vaddpd %zmm16, %zmm20, %zmm16 #241.3 -+ vaddpd %zmm10, %zmm26, %zmm26{%k2} #242.3 -+ vaddpd %zmm15, %zmm21, %zmm15 #241.3 -+ vmovups 
.L_2il0floatpacket.48(%rip), %zmm18 #242.3 -+ vmovups -64(%rsp), %zmm17 #242.3[spill] -+ vsubpd %zmm10, %zmm26, %zmm26{%k1} #242.3 -+ vpermi2pd %zmm27, %zmm0, %zmm18 #242.3 -+ vpermt2pd %zmm0, %zmm17, %zmm23 #242.3 -+ vaddpd %zmm18, %zmm1, %zmm1{%k3} #242.3 -+ vaddpd %zmm23, %zmm2, %zmm2{%k1} #242.3 -+ vsubpd %zmm18, %zmm1, %zmm1{%k4} #242.3 -+ vsubpd %zmm23, %zmm2, %zmm2{%k2} #242.3 -+ vmovupd %ymm16, (%rcx,%rdi) #241.3 -+ vmovupd %ymm15, 32(%rcx,%rdi) #241.3 -+ vmovupd %ymm14, 64(%rcx,%rdi) #241.3 -+ vextractf64x4 $1, %zmm16, (%rcx,%r8) #241.3 -+ vextractf64x4 $1, %zmm15, 32(%rcx,%r8) #241.3 -+ vextractf64x4 $1, %zmm14, 64(%rcx,%r8) #241.3 -+ vmovupd %ymm26, 96(%rcx,%rdi) #242.3 -+ vmovupd %ymm1, 128(%rcx,%rdi) #242.3 -+ vmovupd %ymm2, 160(%rcx,%rdi) #242.3 -+ vextractf64x4 $1, %zmm26, 96(%rcx,%r8) #242.3 -+ vextractf64x4 $1, %zmm1, 128(%rcx,%r8) #242.3 -+ vextractf64x4 $1, %zmm2, 160(%rcx,%r8) #242.3 -+ vzeroupper #243.1 -+ movq %rbp, %rsp #243.1 -+ popq %rbp #243.1 -+ .cfi_def_cfa 7, 8 -+ .cfi_restore 6 -+ ret #243.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type deo_dble_avx512,@function -+ .size deo_dble_avx512,.-deo_dble_avx512 -+ .data -+# -- End deo_dble_avx512 -+ .section .rodata, "a" -+ .align 64 -+ .align 64 -+.L_2il0floatpacket.14: -+ .long 0x00000008,0x00000000,0x00000009,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000 -+ .type .L_2il0floatpacket.14,@object -+ .size .L_2il0floatpacket.14,64 -+ .align 64 -+.L_2il0floatpacket.15: -+ .long 0x00000004,0x00000000,0x00000005,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000 -+ .type .L_2il0floatpacket.15,@object -+ .size .L_2il0floatpacket.15,64 -+ .align 64 -+.L_2il0floatpacket.16: -+ .long 0x00000002,0x00000000,0x00000003,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 -+ .type .L_2il0floatpacket.16,@object -+ .size .L_2il0floatpacket.16,64 -+ .align 64 -+.L_2il0floatpacket.17: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.17,@object -+ .size .L_2il0floatpacket.17,64 -+ .align 64 -+.L_2il0floatpacket.18: -+ .long 0x00000004,0x00000000,0x00000005,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 -+ .type .L_2il0floatpacket.18,@object -+ .size .L_2il0floatpacket.18,64 -+ .align 64 -+.L_2il0floatpacket.19: -+ .long 0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000 -+ .type .L_2il0floatpacket.19,@object -+ .size .L_2il0floatpacket.19,64 -+ .align 64 -+.L_2il0floatpacket.20: -+ .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000 -+ .type .L_2il0floatpacket.20,@object -+ .size .L_2il0floatpacket.20,64 -+ .align 64 -+.L_2il0floatpacket.21: -+ .long 0x00000001,0x00000000,0x00000001,0x00000000,0x00000001,0x00000000,0x00000001,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000 -+ .type 
.L_2il0floatpacket.21,@object -+ .size .L_2il0floatpacket.21,64 -+ .align 64 -+.L_2il0floatpacket.22: -+ .long 0x00000002,0x00000000,0x00000002,0x00000000,0x00000002,0x00000000,0x00000002,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000 -+ .type .L_2il0floatpacket.22,@object -+ .size .L_2il0floatpacket.22,64 -+ .align 64 -+.L_2il0floatpacket.23: -+ .long 0x00000003,0x00000000,0x00000003,0x00000000,0x00000003,0x00000000,0x00000003,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.23,@object -+ .size .L_2il0floatpacket.23,64 -+ .align 64 -+.L_2il0floatpacket.24: -+ .long 0x00000004,0x00000000,0x00000004,0x00000000,0x00000004,0x00000000,0x00000004,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000 -+ .type .L_2il0floatpacket.24,@object -+ .size .L_2il0floatpacket.24,64 -+ .align 64 -+.L_2il0floatpacket.25: -+ .long 0x00000005,0x00000000,0x00000005,0x00000000,0x00000005,0x00000000,0x00000005,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000 -+ .type .L_2il0floatpacket.25,@object -+ .size .L_2il0floatpacket.25,64 -+ .align 64 -+.L_2il0floatpacket.26: -+ .long 0x00000006,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000a,0x00000000,0x0000000a,0x00000000,0x0000000a,0x00000000 -+ .type .L_2il0floatpacket.26,@object -+ .size .L_2il0floatpacket.26,64 -+ .align 64 -+.L_2il0floatpacket.27: -+ .long 0x00000007,0x00000000,0x00000007,0x00000000,0x00000007,0x00000000,0x00000007,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000 -+ .type .L_2il0floatpacket.27,@object -+ .size .L_2il0floatpacket.27,64 -+ .align 64 -+.L_2il0floatpacket.28: -+ .long 0x00000004,0x00000000,0x00000005,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000 -+ .type .L_2il0floatpacket.28,@object -+ .size .L_2il0floatpacket.28,64 -+ .align 64 -+.L_2il0floatpacket.29: -+ .long 0x0000000e,0x00000000,0x0000000f,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000 -+ .type .L_2il0floatpacket.29,@object -+ .size .L_2il0floatpacket.29,64 -+ .align 64 -+.L_2il0floatpacket.30: -+ .long 0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000 -+ .type .L_2il0floatpacket.30,@object -+ .size .L_2il0floatpacket.30,64 -+ .align 64 -+.L_2il0floatpacket.31: -+ .long 0x00000008,0x00000000,0x00000009,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000 -+ .type .L_2il0floatpacket.31,@object -+ .size .L_2il0floatpacket.31,64 -+ .align 64 -+.L_2il0floatpacket.32: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000 -+ .type .L_2il0floatpacket.32,@object -+ .size .L_2il0floatpacket.32,64 -+ .align 64 -+.L_2il0floatpacket.33: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000 -+ .type 
.L_2il0floatpacket.33,@object -+ .size .L_2il0floatpacket.33,64 -+ .align 64 -+.L_2il0floatpacket.34: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000 -+ .type .L_2il0floatpacket.34,@object -+ .size .L_2il0floatpacket.34,64 -+ .align 64 -+.L_2il0floatpacket.35: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000 -+ .type .L_2il0floatpacket.35,@object -+ .size .L_2il0floatpacket.35,64 -+ .align 64 -+.L_2il0floatpacket.36: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.36,@object -+ .size .L_2il0floatpacket.36,64 -+ .align 64 -+.L_2il0floatpacket.37: -+ .long 0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.37,@object -+ .size .L_2il0floatpacket.37,64 -+ .align 64 -+.L_2il0floatpacket.38: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 -+ .type .L_2il0floatpacket.38,@object -+ .size .L_2il0floatpacket.38,64 -+ .align 64 -+.L_2il0floatpacket.39: -+ .long 0x00000004,0x00000000,0x00000005,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.39,@object -+ .size .L_2il0floatpacket.39,64 -+ .align 64 -+.L_2il0floatpacket.40: -+ .long 0x00000007,0x00000000,0x00000006,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000 -+ .type .L_2il0floatpacket.40,@object -+ .size .L_2il0floatpacket.40,64 -+ .align 64 -+.L_2il0floatpacket.41: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000 -+ .type .L_2il0floatpacket.41,@object -+ .size .L_2il0floatpacket.41,64 -+ .align 64 -+.L_2il0floatpacket.42: -+ .long 0x00000003,0x00000000,0x00000002,0x00000000,0x0000000b,0x00000000,0x0000000a,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000,0x0000000f,0x00000000,0x0000000e,0x00000000 -+ .type .L_2il0floatpacket.42,@object -+ .size .L_2il0floatpacket.42,64 -+ .align 64 -+.L_2il0floatpacket.43: -+ .long 0x00000003,0x00000000,0x00000002,0x00000000,0x00000009,0x00000000,0x00000008,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000,0x0000000d,0x00000000,0x0000000c,0x00000000 -+ .type .L_2il0floatpacket.43,@object -+ .size .L_2il0floatpacket.43,64 -+ .align 64 -+.L_2il0floatpacket.44: -+ .long 0x00000001,0x00000000,0x00000000,0x00000000,0x00000009,0x00000000,0x00000008,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000,0x0000000d,0x00000000,0x0000000c,0x00000000 -+ .type .L_2il0floatpacket.44,@object -+ .size .L_2il0floatpacket.44,64 -+ .align 64 -+.L_2il0floatpacket.45: -+ .long 0x00000006,0x00000000,0x00000007,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000 -+ .type 
.L_2il0floatpacket.45,@object -+ .size .L_2il0floatpacket.45,64 -+ .align 64 -+.L_2il0floatpacket.46: -+ .long 0x00000002,0x00000000,0x00000003,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000 -+ .type .L_2il0floatpacket.46,@object -+ .size .L_2il0floatpacket.46,64 -+ .align 64 -+.L_2il0floatpacket.47: -+ .long 0x00000005,0x00000000,0x00000004,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000 -+ .type .L_2il0floatpacket.47,@object -+ .size .L_2il0floatpacket.47,64 -+ .align 64 -+.L_2il0floatpacket.48: -+ .long 0x00000001,0x00000000,0x00000000,0x00000000,0x0000000b,0x00000000,0x0000000a,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000,0x0000000f,0x00000000,0x0000000e,0x00000000 -+ .type .L_2il0floatpacket.48,@object -+ .size .L_2il0floatpacket.48,64 -+ .data -+ .section .note.GNU-stack, "" -+// -- Begin DWARF2 SEGMENT .eh_frame -+ .section .eh_frame,"a",@progbits -+.eh_frame_seg: -+ .align 8 -+# End -diff --git a/modules/linalg/avx512/salg_avx512.c b/modules/linalg/avx512/salg_avx512.c -new file mode 100644 -index 0000000..656711a ---- /dev/null -+++ b/modules/linalg/avx512/salg_avx512.c -@@ -0,0 +1,138 @@ -+/******************************************************************************* -+* -+* File salg_avx512.c -+* -+* This software is distributed under the terms of the GNU General Public -+* License (GPL) -+* -+* AVX512 implementations of single precision linear algebra -+* functions for spinors. -+* -+* See ../salg.c for more information and alternative -+* implementations. -+* -+*******************************************************************************/ -+ -+#include "global.h" -+#include "linalg.h" -+#include "mpi.h" -+#include "sflds.h" -+ -+#include "avx512.h" -+ -+void mulc_spinor_add_avx512(int vol, spinor *s, spinor const *r, complex z) -+{ -+ spinor *sm; -+ __m128 tr, ti; -+ __m512 zr, zi, t1, t2; -+ __m512 sign; -+ -+ sign = _mm512_set_ps( -1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1 ); -+ sm = s + vol; -+ -+ tr = _mm_load_ps1( &z.re ); -+ ti = _mm_load_ps1( &z.im ); -+ zr = _mm512_broadcast_f32x4( tr ); -+ zi = _mm512_broadcast_f32x4( ti ); -+ -+ zi = _mm512_mul_ps( zi, sign ); -+ -+ for (; s < sm; s+=2) { -+ t1 = _mm512_loadu_ps( &(*r).c1.c1.re ); -+ t2 = _mm512_mul_ps( zi, t1 ); -+ t2 = _mm512_permute_ps( t2, 0b10110001 ); -+ t2 = _mm512_fmadd_ps( zr, t1, t2 ); -+ t1 = _mm512_loadu_ps( &(*s).c1.c1.re ); -+ t1 = _mm512_add_ps( t1, t2 ); -+ _mm512_storeu_ps( &(*s).c1.c1.re, t1 ); -+ -+ t1 = _mm512_loadu_ps( &(*r).c1.c1.re + 16 ); -+ t2 = _mm512_mul_ps( zi, t1 ); -+ t2 = _mm512_permute_ps( t2, 0b10110001 ); -+ t2 = _mm512_fmadd_ps( zr, t1, t2 ); -+ t1 = _mm512_loadu_ps( &(*s).c1.c1.re + 16 ); -+ t1 = _mm512_add_ps( t1, t2 ); -+ _mm512_storeu_ps( &(*s).c1.c1.re + 16, t1 ); -+ -+ t1 = _mm512_loadu_ps( &(*r).c1.c1.re + 32 ); -+ t2 = _mm512_mul_ps( zi, t1 ); -+ t2 = _mm512_permute_ps( t2, 0b10110001 ); -+ t2 = _mm512_fmadd_ps( zr, t1, t2 ); -+ t1 = _mm512_loadu_ps( &(*s).c1.c1.re + 32 ); -+ t1 = _mm512_add_ps( t1, t2 ); -+ _mm512_storeu_ps( &(*s).c1.c1.re + 32, t1 ); -+ -+ r += 2; -+ } -+} -+ -+#if __GNUC__ < 7 -+/* This function was implemented to gcc 7 */ -+extern __inline double _mm512_reduce_add_ps( __m512 a ) { -+ float * d = (float *) &a; -+ return d[0]+d[1]+d[2]+d[3]+d[4]+d[5]+d[6]+d[7] -+ +d[8]+d[9]+d[10]+d[11]+d[12]+d[13]+d[14]+d[15] ; -+} -+#endif -+ -+complex_dble 
spinor_prod_avx512(int vol, spinor const *s, -+ spinor const *r ) -+{ -+ spinor const *sm, *smb; -+ __m512 tr, ti, s1, s2, s3, r1, r2, r3, sign; -+ -+ double x, y; -+ complex_dble z, v, w; -+ -+ x = 0.0; -+ y = 0.0; -+ sm = s + vol; -+ -+ -+ while (s < sm) { -+ smb = s + 8; -+ if (smb > sm) { -+ smb = sm; -+ } -+ -+ tr = _mm512_setzero_ps(); -+ ti = _mm512_setzero_ps(); -+ -+ for (; s < smb; s+=2) { -+ s1 = _mm512_loadu_ps( &(*s).c1.c1.re ); -+ s2 = _mm512_loadu_ps( &(*s).c1.c1.re+16 ); -+ s3 = _mm512_loadu_ps( &(*s).c1.c1.re+32 ); -+ r1 = _mm512_loadu_ps( &(*r).c1.c1.re ); -+ r2 = _mm512_loadu_ps( &(*r).c1.c1.re+16 ); -+ r3 = _mm512_loadu_ps( &(*r).c1.c1.re+32 ); -+ -+ tr = _mm512_fmadd_ps( s1, r1, tr ); -+ tr = _mm512_fmadd_ps( s2, r2, tr ); -+ tr = _mm512_fmadd_ps( s3, r3, tr ); -+ -+ r1 = _mm512_permute_ps( r1, 0b10110001 ); -+ r2 = _mm512_permute_ps( r2, 0b10110001 ); -+ r3 = _mm512_permute_ps( r3, 0b10110001 ); -+ -+ sign = _mm512_set_ps( -1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1,-1,1 ); -+ r1 = _mm512_mul_ps( r1, sign ); -+ r2 = _mm512_mul_ps( r2, sign ); -+ r3 = _mm512_mul_ps( r3, sign ); -+ -+ ti = _mm512_fmadd_ps( s1, r1, ti ); -+ ti = _mm512_fmadd_ps( s2, r2, ti ); -+ ti = _mm512_fmadd_ps( s3, r3, ti ); -+ -+ r += 2; -+ } -+ -+ x += (double) _mm512_reduce_add_ps( tr ); -+ y += (double) _mm512_reduce_add_ps( ti ); -+ -+ } -+ -+ z.re = x; -+ z.im = y; -+ -+ return z; -+} -diff --git a/modules/linalg/avx512/salg_avx512_asm.s b/modules/linalg/avx512/salg_avx512_asm.s -new file mode 100644 -index 0000000..8e845ae ---- /dev/null -+++ b/modules/linalg/avx512/salg_avx512_asm.s -@@ -0,0 +1,230 @@ -+# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -+# mark_description "-I../../../include -I.. 
-I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -+# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -+# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -+# mark_description "es -S"; -+ .file "salg_avx512.c" -+ .text -+..TXTST0: -+# -- Begin mulc_spinor_add_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl mulc_spinor_add_avx512 -+# --- mulc_spinor_add_avx512(int, spinor *, const spinor *, complex) -+mulc_spinor_add_avx512: -+# parameter 1: %edi -+# parameter 2: %rsi -+# parameter 3: %rdx -+# parameter 4: %xmm0 -+..B1.1: # Preds ..B1.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_mulc_spinor_add_avx512.1: -+..L2: -+ #9.1 -+ vshufps $1, %xmm0, %xmm0, %xmm1 #9.1 -+ vbroadcastss %xmm1, %xmm3 #19.8 -+ vbroadcastss %xmm0, %xmm2 #18.8 -+ movslq %edi, %rdi #9.1 -+ vshuff32x4 $0, %zmm3, %zmm3, %zmm4 #21.8 -+ vmulps .L_2il0floatpacket.3(%rip), %zmm4, %zmm0 #23.8 -+ lea (%rdi,%rdi,2), %rax #16.8 -+ shlq $5, %rax #16.8 -+ addq %rsi, %rax #16.8 -+ vshuff32x4 $0, %zmm2, %zmm2, %zmm1 #20.8 -+ cmpq %rax, %rsi #25.14 -+ jae ..B1.5 # Prob 10% #25.14 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -+..B1.3: # Preds ..B1.1 ..B1.3 -+ # Execution count [5.00e+00] -+ vmovups (%rdx), %zmm3 #26.30 -+ vmulps %zmm3, %zmm0, %zmm2 #27.10 -+ vpermilps $177, %zmm2, %zmm4 #28.10 -+ vfmadd231ps %zmm1, %zmm3, %zmm4 #29.10 -+ vaddps (%rsi), %zmm4, %zmm5 #31.10 -+ vmovups %zmm5, (%rsi) #32.26 -+ vmovups 64(%rdx), %zmm7 #34.30 -+ vmulps %zmm7, %zmm0, %zmm6 #35.10 -+ vpermilps $177, %zmm6, %zmm8 #36.10 -+ vfmadd231ps %zmm1, %zmm7, %zmm8 #37.10 -+ vaddps 64(%rsi), %zmm8, %zmm9 #39.10 -+ vmovups %zmm9, 64(%rsi) #40.26 -+ vmovups 128(%rdx), %zmm11 #42.30 -+ addq $192, %rdx #50.5 -+ vmulps %zmm11, %zmm0, %zmm10 #43.10 -+ vpermilps $177, %zmm10, %zmm12 #44.10 -+ vfmadd231ps %zmm1, %zmm11, %zmm12 #45.10 -+ vaddps 128(%rsi), %zmm12, %zmm13 #47.10 -+ vmovups %zmm13, 128(%rsi) #48.26 -+ addq $192, %rsi #25.18 -+ cmpq %rax, %rsi #25.14 -+ jb ..B1.3 # Prob 82% #25.14 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -+..B1.5: # Preds ..B1.3 ..B1.1 -+ # Execution count [1.00e+00] -+ vzeroupper #52.1 -+ ret #52.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type mulc_spinor_add_avx512,@function -+ .size mulc_spinor_add_avx512,.-mulc_spinor_add_avx512 -+ .data -+# -- End mulc_spinor_add_avx512 -+ .text -+# -- Begin spinor_prod_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl spinor_prod_avx512 -+# --- spinor_prod_avx512(int, const spinor *, const spinor *) -+spinor_prod_avx512: -+# parameter 1: %edi -+# parameter 2: %rsi -+# parameter 3: %rdx -+..B2.1: # Preds ..B2.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_spinor_prod_avx512.4: -+..L5: -+ #65.1 -+ subq $136, %rsp #65.1 -+ .cfi_def_cfa_offset 144 -+ movslq %edi, %rdi #65.1 -+ vxorpd %xmm0, %xmm0, %xmm0 #72.3 -+ vmovapd %xmm0, %xmm1 #73.3 -+ lea (%rdi,%rdi,2), %rax #74.8 -+ shlq $5, %rax #74.8 -+ addq %rsi, %rax #74.8 -+ cmpq %rax, %rsi #77.14 -+ jae ..B2.9 # Prob 10% #77.14 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 xmm0 xmm1 -+..B2.2: # Preds ..B2.1 -+ # Execution count [9.00e-01] -+ vmovups .L_2il0floatpacket.3(%rip), %zmm5 #102.29 -+ vpxord %zmm4, %zmm4, %zmm4 #83.10 -+ vmovaps %zmm4, %zmm2 #83.10 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 xmm0 xmm1 zmm2 zmm4 zmm5 -+..B2.3: # Preds ..B2.7 
..B2.2 -+ # Execution count [5.00e+00] -+ vmovaps %zmm4, %zmm3 #84.10 -+ lea 768(%rsi), %rcx #78.11 -+ cmpq %rax, %rcx #79.5 -+ vmovaps %zmm2, %zmm6 #83.10 -+ cmovae %rax, %rcx #79.5 -+ vmovaps %zmm3, %zmm2 #84.10 -+ cmpq %rcx, %rsi #86.16 -+ jae ..B2.7 # Prob 10% #86.16 -+ # LOE rax rdx rcx rbx rbp rsi r12 r13 r14 r15 xmm0 xmm1 zmm2 zmm3 zmm4 zmm5 zmm6 -+..B2.5: # Preds ..B2.3 ..B2.5 -+ # Execution count [2.50e+01] -+ vmovups (%rdx), %zmm7 #90.32 -+ vmovups 64(%rdx), %zmm8 #91.32 -+ vmovups 128(%rdx), %zmm9 #92.32 -+ vmovups (%rsi), %zmm14 #87.32 -+ vmovups 64(%rsi), %zmm16 #88.32 -+ vmovups 128(%rsi), %zmm18 #89.32 -+ vfmadd231ps %zmm14, %zmm7, %zmm6 #94.12 -+ vpermilps $177, %zmm7, %zmm10 #98.12 -+ addq $192, %rsi #86.21 -+ vmulps %zmm5, %zmm10, %zmm13 #103.12 -+ addq $192, %rdx #111.7 -+ vfmadd231ps %zmm16, %zmm8, %zmm6 #95.12 -+ vfmadd231ps %zmm14, %zmm13, %zmm3 #107.12 -+ vfmadd231ps %zmm18, %zmm9, %zmm6 #96.12 -+ vpermilps $177, %zmm8, %zmm11 #99.12 -+ vmulps %zmm11, %zmm5, %zmm15 #104.12 -+ vpermilps $177, %zmm9, %zmm12 #100.12 -+ vmulps %zmm12, %zmm5, %zmm17 #105.12 -+ vfmadd231ps %zmm16, %zmm15, %zmm3 #108.12 -+ vfmadd231ps %zmm18, %zmm17, %zmm3 #109.12 -+ cmpq %rcx, %rsi #86.16 -+ jb ..B2.5 # Prob 82% #86.16 -+ # LOE rax rdx rcx rbx rbp rsi r12 r13 r14 r15 xmm0 xmm1 zmm2 zmm3 zmm4 zmm5 zmm6 -+..B2.7: # Preds ..B2.5 ..B2.3 -+ # Execution count [5.00e+00] -+ vmovups %zmm6, (%rsp) #114.19 -+ vmovups %zmm3, 64(%rsp) #115.19 -+ vmovss (%rsp), %xmm6 #114.19 -+ vmovss 8(%rsp), %xmm7 #114.19 -+ vmovss 16(%rsp), %xmm10 #114.19 -+ vmovss 24(%rsp), %xmm11 #114.19 -+ vmovss 32(%rsp), %xmm16 #114.19 -+ vmovss 40(%rsp), %xmm17 #114.19 -+ vmovss 64(%rsp), %xmm3 #115.19 -+ vmovss 48(%rsp), %xmm20 #114.19 -+ vmovss 56(%rsp), %xmm21 #114.19 -+ vmovss 72(%rsp), %xmm29 #115.19 -+ vaddss 4(%rsp), %xmm6, %xmm8 #114.5 -+ vaddss 12(%rsp), %xmm7, %xmm9 #114.5 -+ vaddss 20(%rsp), %xmm10, %xmm12 #114.5 -+ vaddss 28(%rsp), %xmm11, %xmm13 #114.5 -+ vaddss 36(%rsp), %xmm16, %xmm18 #114.5 -+ vaddss 44(%rsp), %xmm17, %xmm19 #114.5 -+ vaddss %xmm9, %xmm8, %xmm14 #114.5 -+ vaddss 68(%rsp), %xmm3, %xmm30 #115.5 -+ vaddss %xmm13, %xmm12, %xmm15 #114.5 -+ vaddss 52(%rsp), %xmm20, %xmm22 #114.5 -+ vaddss 60(%rsp), %xmm21, %xmm23 #114.5 -+ vaddss 76(%rsp), %xmm29, %xmm31 #115.5 -+ vaddss %xmm19, %xmm18, %xmm24 #114.5 -+ vaddss %xmm15, %xmm14, %xmm26 #114.5 -+ vaddss %xmm23, %xmm22, %xmm25 #114.5 -+ vaddss %xmm31, %xmm30, %xmm9 #115.5 -+ vaddss %xmm25, %xmm24, %xmm27 #114.5 -+ vmovss 80(%rsp), %xmm3 #115.19 -+ vaddss %xmm27, %xmm26, %xmm28 #114.5 -+ vaddss 84(%rsp), %xmm3, %xmm7 #115.5 -+ vcvtss2sd %xmm28, %xmm28, %xmm28 #114.19 -+ vmovss 88(%rsp), %xmm6 #115.19 -+ vaddsd %xmm28, %xmm0, %xmm0 #114.5 -+ vaddss 92(%rsp), %xmm6, %xmm8 #115.5 -+ vmovss 96(%rsp), %xmm11 #115.19 -+ vaddss %xmm8, %xmm7, %xmm10 #115.5 -+ vaddss 100(%rsp), %xmm11, %xmm13 #115.5 -+ vaddss %xmm10, %xmm9, %xmm21 #115.5 -+ vmovss 104(%rsp), %xmm12 #115.19 -+ vmovss 112(%rsp), %xmm15 #115.19 -+ vmovss 120(%rsp), %xmm16 #115.19 -+ vaddss 108(%rsp), %xmm12, %xmm14 #115.5 -+ vaddss 116(%rsp), %xmm15, %xmm17 #115.5 -+ vaddss 124(%rsp), %xmm16, %xmm18 #115.5 -+ vaddss %xmm14, %xmm13, %xmm19 #115.5 -+ vaddss %xmm18, %xmm17, %xmm20 #115.5 -+ vaddss %xmm20, %xmm19, %xmm22 #115.5 -+ vaddss %xmm22, %xmm21, %xmm23 #115.5 -+ vcvtss2sd %xmm23, %xmm23, %xmm23 #115.19 -+ vaddsd %xmm23, %xmm1, %xmm1 #115.5 -+ cmpq %rax, %rsi #77.14 -+ jb ..B2.3 # Prob 82% #77.14 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 xmm0 xmm1 zmm2 zmm4 zmm5 -+..B2.9: # Preds ..B2.7 ..B2.1 
-+ # Execution count [1.00e+00] -+ vzeroupper #122.10 -+ addq $136, %rsp #122.10 -+ .cfi_def_cfa_offset 8 -+ ret #122.10 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type spinor_prod_avx512,@function -+ .size spinor_prod_avx512,.-spinor_prod_avx512 -+ .data -+# -- End spinor_prod_avx512 -+ .section .rodata, "a" -+ .align 64 -+ .align 64 -+.L_2il0floatpacket.3: -+ .long 0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000 -+ .type .L_2il0floatpacket.3,@object -+ .size .L_2il0floatpacket.3,64 -+ .data -+ .section .note.GNU-stack, "" -+// -- Begin DWARF2 SEGMENT .eh_frame -+ .section .eh_frame,"a",@progbits -+.eh_frame_seg: -+ .align 8 -+# End -diff --git a/modules/linalg/avx512/salg_dble_avx512.c b/modules/linalg/avx512/salg_dble_avx512.c -new file mode 100644 -index 0000000..331a492 ---- /dev/null -+++ b/modules/linalg/avx512/salg_dble_avx512.c -@@ -0,0 +1,391 @@ -+/******************************************************************************* -+* -+* File salg_dble_avx512.c -+* -+* This software is distributed under the terms of the GNU General Public -+* License (GPL) -+* -+* AVX512 implementations of single precision linear algebra -+* functions for spinors. -+* -+* See ../salg_dble.c for more information and alternative -+* implementations. -+* -+*******************************************************************************/ -+#include "global.h" -+#include "linalg.h" -+#include "mpi.h" -+#include "sflds.h" -+ -+#include "avx512.h" -+#if __GNUC__ < 7 -+/* This function was implemented to gcc 7 */ -+extern __inline double _mm512_reduce_add_pd( __m512d a ) { -+ double * d = (double *) &a; -+ return d[0]+d[1]+d[2]+d[3]+d[4]+d[5]+d[6]+d[7] ; -+} -+#endif -+ -+complex_dble spinor_prod_dble_avx512( spinor_dble const *s, spinor_dble const *smb, spinor_dble const *r) -+{ -+ __m512d tr, ti, s1, s2, s3, r1, r2, r3, sign; -+ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); -+ complex_dble z; -+ -+ tr = _mm512_setzero_pd(); -+ ti = _mm512_setzero_pd(); -+ for (; s < smb; s++) { -+ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); -+ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); -+ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); -+ r1 = _mm512_loadu_pd( &(*r).c1.c1.re ); -+ r2 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); -+ r3 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); -+ tr = _mm512_fmadd_pd( s1, r1, tr ); -+ tr = _mm512_fmadd_pd( s2, r2, tr ); -+ tr = _mm512_fmadd_pd( s3, r3, tr ); -+ r1 = _mm512_permute_pd( r1, 0b01010101 ); -+ r2 = _mm512_permute_pd( r2, 0b01010101 ); -+ r3 = _mm512_permute_pd( r3, 0b01010101 ); -+ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); -+ r1 = _mm512_mul_pd( r1, sign ); -+ r2 = _mm512_mul_pd( r2, sign ); -+ r3 = _mm512_mul_pd( r3, sign ); -+ ti = _mm512_fmadd_pd( s1, r1, ti ); -+ ti = _mm512_fmadd_pd( s2, r2, ti ); -+ ti = _mm512_fmadd_pd( s3, r3, ti ); -+ r += 1; -+ } -+ z.re = _mm512_reduce_add_pd( tr ); -+ z.im = _mm512_reduce_add_pd( ti ); -+ return z; -+} -+ -+double spinor_prod_re_dble_avx512( spinor_dble const *s, spinor_dble const *smb, spinor_dble const *r) -+{ -+ __m512d tr, ti, s1, s2, s3, r1, r2, r3; -+ float c; -+ tr = _mm512_setzero_pd(); -+ ti = _mm512_setzero_pd(); -+ for (; s < smb; s++) { -+ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); -+ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); -+ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); -+ r1 = _mm512_loadu_pd( &(*r).c1.c1.re ); -+ r2 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); -+ r3 = _mm512_loadu_pd( 
&(*r).c1.c1.re+16 ); -+ tr = _mm512_fmadd_pd( s1, r1, tr ); -+ tr = _mm512_fmadd_pd( s2, r2, tr ); -+ tr = _mm512_fmadd_pd( s3, r3, tr ); -+ r1 = _mm512_permute_pd( r1, 0b01010101 ); -+ r2 = _mm512_permute_pd( r2, 0b01010101 ); -+ r3 = _mm512_permute_pd( r3, 0b01010101 ); -+ r += 1; -+ } -+ c = _mm512_reduce_add_pd( tr ); -+ return c; -+} -+ -+complex_dble spinor_prod5_dble_avx512(spinor_dble const *s, spinor_dble const *smb, spinor_dble const *r) -+{ -+ __m512d tr, ti, s1, s2, s3, r1, r2, r3, sign; -+ complex_dble z; -+ -+ tr = _mm512_setzero_pd(); -+ ti = _mm512_setzero_pd(); -+ for (; s < smb; s++) { -+ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); -+ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); -+ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); -+ r1 = _mm512_loadu_pd( &(*r).c1.c1.re ); -+ r2 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); -+ r3 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); -+ sign = _mm512_set_pd( -1,-1,-1,-1,1,1,1,1 ); -+ s2 = _mm512_mul_pd( s2, sign ); -+ tr = _mm512_fmadd_pd( s1, r1, tr ); -+ tr = _mm512_fmadd_pd( s2, r2, tr ); -+ tr = _mm512_fnmadd_pd( s3, r3, tr ); -+ r1 = _mm512_permute_pd( r1, 0b01010101 ); -+ r2 = _mm512_permute_pd( r2, 0b01010101 ); -+ r3 = _mm512_permute_pd( r3, 0b01010101 ); -+ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); -+ r1 = _mm512_mul_pd( r1, sign ); -+ r2 = _mm512_mul_pd( r2, sign ); -+ r3 = _mm512_mul_pd( r3, sign ); -+ ti = _mm512_fmadd_pd( s1, r1, ti ); -+ ti = _mm512_fmadd_pd( s2, r2, ti ); -+ ti = _mm512_fnmadd_pd( s3, r3, ti ); -+ r += 1; -+ } -+ z.re = _mm512_reduce_add_pd( tr ); -+ z.im = _mm512_reduce_add_pd( ti ); -+ return z; -+} -+ -+double norm_square_dble_avx512(spinor_dble const *s, spinor_dble const *smb) -+{ -+ __m512d tmp, s1, s2, s3; -+ tmp = _mm512_setzero_pd(); -+ for (; s < smb; s++) { -+ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); -+ s2 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); -+ s3 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); -+ tmp = _mm512_fmadd_pd( s1, s1, tmp ); -+ tmp = _mm512_fmadd_pd( s2, s2, tmp ); -+ tmp = _mm512_fmadd_pd( s3, s3, tmp ); -+ } -+ return _mm512_reduce_add_pd( tmp ); -+} -+ -+void mulc_spinor_add_dble_avx512(int vol, spinor_dble *s, spinor_dble const *r, -+ complex_dble z) -+{ -+ spinor_dble *sm; -+ __m128d tr, ti; -+ __m512d zr, zi, t1, t2; -+ -+ tr = _mm_load_sd( &z.re ); -+ ti = _mm_load_sd( &z.im ); -+ zr = _mm512_broadcastsd_pd( tr ); -+ zi = _mm512_broadcastsd_pd( ti ); -+ -+ sm = s + vol; -+ -+ for (; s < sm; s++) { -+ t1 = _mm512_loadu_pd( &(*r).c1.c1.re ); -+ t2 = _mm512_mul_pd( zi, t1 ); -+ t2 = _mm512_permute_pd( t2, 0b01010101 ); -+ t2 = _mm512_fmaddsub_pd( zr, t1, t2 ); -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); -+ t1 = _mm512_add_pd( t1, t2 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); -+ -+ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); -+ t2 = _mm512_mul_pd( zi, t1 ); -+ t2 = _mm512_permute_pd( t2, 0b01010101 ); -+ t2 = _mm512_fmaddsub_pd( zr, t1, t2 ); -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); -+ t1 = _mm512_add_pd( t1, t2 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); -+ -+ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); -+ t2 = _mm512_mul_pd( zi, t1 ); -+ t2 = _mm512_permute_pd( t2, 0b01010101 ); -+ t2 = _mm512_fmaddsub_pd( zr, t1, t2 ); -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); -+ t1 = _mm512_add_pd( t1, t2 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); -+ -+ r += 1; -+ } -+} -+ -+void mulr_spinor_add_dble_avx512(int vol, spinor_dble *s, spinor_dble const *r, -+ double c) -+{ -+ spinor_dble *sm; -+ __m128d t128; -+ __m512d tc, t1, t2; -+ -+ t128 = _mm_load_sd( &c ); -+ tc = _mm512_broadcastsd_pd( t128 ); -+ 
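-+  /* Each spinor_dble holds 24 doubles (12 complex components), i.e. three
-+     8-double AVX-512 vectors; the loop below computes s <- s + c*r one
-+     such vector at a time, with c broadcast to all lanes above. */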
-+ sm = s + vol; -+ -+ for (; s < sm; s++) { -+ t1 = _mm512_loadu_pd( &(*r).c1.c1.re ); -+ t2 = _mm512_mul_pd( tc, t1 ); -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); -+ t1 = _mm512_add_pd( t1, t2 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); -+ -+ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); -+ t2 = _mm512_mul_pd( tc, t1 ); -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); -+ t1 = _mm512_add_pd( t1, t2 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); -+ -+ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); -+ t2 = _mm512_mul_pd( tc, t1 ); -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); -+ t1 = _mm512_add_pd( t1, t2 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); -+ -+ r += 1; -+ } -+} -+ -+void combine_spinor_dble_avx512(int vol, spinor_dble *s, spinor_dble const *r, -+ double cs, double cr) -+{ -+ spinor_dble *sm; -+ __m128d ts128, tr128; -+ __m512d tcs, tcr, t1, t2; -+ -+ ts128 = _mm_load_sd( &cs ); -+ tr128 = _mm_load_sd( &cr ); -+ tcs = _mm512_broadcastsd_pd( ts128 ); -+ tcr = _mm512_broadcastsd_pd( tr128 ); -+ -+ sm = s + vol; -+ -+ for (; s < sm; s++) { -+ t1 = _mm512_loadu_pd( &(*r).c1.c1.re ); -+ t2 = _mm512_mul_pd( tcr, t1 ); -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); -+ t1 = _mm512_fmadd_pd( tcs, t1, t2 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); -+ -+ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+8 ); -+ t2 = _mm512_mul_pd( tcr, t1 ); -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); -+ t1 = _mm512_fmadd_pd( tcs, t1, t2 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); -+ -+ t1 = _mm512_loadu_pd( &(*r).c1.c1.re+16 ); -+ t2 = _mm512_mul_pd( tcr, t1 ); -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); -+ t1 = _mm512_fmadd_pd( tcs, t1, t2 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); -+ -+ r += 1; -+ } -+} -+ -+void scale_dble_avx512(int vol, double c, spinor_dble *s) -+{ -+ spinor_dble *sm; -+ __m128d t128; -+ __m512d tc, t1; -+ -+ t128 = _mm_load_sd( &c ); -+ tc = _mm512_broadcastsd_pd( t128 ); -+ -+ sm = s + vol; -+ -+ for (; s < sm; s++) { -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); -+ t1 = _mm512_mul_pd( tc, t1 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re, t1 ); -+ -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+8 ); -+ t1 = _mm512_mul_pd( tc, t1 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re+8, t1 ); -+ -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re+16 ); -+ t1 = _mm512_mul_pd( tc, t1 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re+16, t1 ); -+ } -+} -+ -+void rotate_dble_avx512(int n, int ix, spinor_dble **ppk, spinor_dble *psi, complex_dble const *v) -+{ -+ spinor_dble *pk, *pj; -+ complex_dble const *z; -+ int k,j; -+ -+ for (k = 0; k < n; k++) { -+ __m128d tr, ti; -+ __m512d zr,zi, t1,t2, p1,p2,p3, sign; -+ -+ pk = psi + k; -+ pj = ppk[0] + ix; -+ z = v + k; -+ -+ tr = _mm_load_pd1( &z->re ); -+ ti = _mm_load_pd1( &z->im ); -+ zr = _mm512_broadcastsd_pd( tr ); -+ zi = _mm512_broadcastsd_pd( ti ); -+ -+ sign = _mm512_set_pd( -1,1,-1,1,-1,1,-1,1 ); -+ zi = _mm512_mul_pd( zi, sign ); -+ -+ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re ); -+ t2 = _mm512_mul_pd( zi, t1 ); -+ t2 = _mm512_permute_pd( t2, 0b01010101 ); -+ p1 = _mm512_fmadd_pd( zr, t1, t2 ); -+ -+ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+8 ); -+ t2 = _mm512_mul_pd( zi, t1 ); -+ t2 = _mm512_permute_pd( t2, 0b01010101 ); -+ p2 = _mm512_fmadd_pd( zr, t1, t2 ); -+ -+ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+16 ); -+ t2 = _mm512_mul_pd( zi, t1 ); -+ t2 = _mm512_permute_pd( t2, 0b01010101 ); -+ p3 = _mm512_fmadd_pd( zr, t1, t2 ); -+ -+ for (j = 1; j < n; j++) { -+ pj = ppk[j] + ix; -+ z += n; -+ -+ tr = _mm_load_pd1( &z->re ); -+ ti = _mm_load_pd1( &z->im ); -+ zr = _mm512_broadcastsd_pd( tr ); -+ zi = 
_mm512_broadcastsd_pd( ti ); -+ zi = _mm512_mul_pd( zi, sign ); -+ -+ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re ); -+ t2 = _mm512_mul_pd( zi, t1 ); -+ t2 = _mm512_permute_pd( t2, 0b01010101 ); -+ t1 = _mm512_fmadd_pd( zr, t1, t2 ); -+ p1 = _mm512_add_pd( p1, t1 ); -+ -+ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+8 ); -+ t2 = _mm512_mul_pd( zi, t1 ); -+ t2 = _mm512_permute_pd( t2, 0b01010101 ); -+ t1 = _mm512_fmadd_pd( zr, t1, t2 ); -+ p2 = _mm512_add_pd( p2, t1 ); -+ -+ t1 = _mm512_loadu_pd( &(*pj).c1.c1.re+16 ); -+ t2 = _mm512_mul_pd( zi, t1 ); -+ t2 = _mm512_permute_pd( t2, 0b01010101 ); -+ t1 = _mm512_fmadd_pd( zr, t1, t2 ); -+ p3 = _mm512_add_pd( p3, t1 ); -+ } -+ -+ _mm512_storeu_pd( &(*pk).c1.c1.re, p1 ); -+ _mm512_storeu_pd( &(*pk).c1.c1.re+8, p2 ); -+ _mm512_storeu_pd( &(*pk).c1.c1.re+16, p3 ); -+ } -+} -+ -+void mulg5_dble_avx512(int vol, spinor_dble *s) -+{ -+ spinor_dble *sm; -+ -+ sm = s + vol; -+ -+ for (; s < sm; s++) { -+ __m512d s1; -+ __m256d s2; -+ -+ s1 = _mm512_loadu_pd( &(*s).c1.c1.re+12 ); -+ s1 = _mm512_sub_pd( _mm512_setzero_pd(), s1 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re+12, s1 ); -+ -+ s2 = _mm256_loadu_pd( &(*s).c1.c1.re+20 ); -+ s2 = _mm256_sub_pd( _mm256_setzero_pd(), s2 ); -+ _mm256_storeu_pd( &(*s).c1.c1.re+20, s2 ); -+ } -+} -+ -+void mulmg5_dble_avx512(int vol, spinor_dble *s) -+{ -+ spinor_dble *sm; -+ -+ sm = s + vol; -+ -+ for (; s < sm; s++) { -+ __m512d s1; -+ __m256d s2; -+ -+ s1 = _mm512_loadu_pd( &(*s).c1.c1.re ); -+ s1 = _mm512_sub_pd( _mm512_setzero_pd(), s1 ); -+ _mm512_storeu_pd( &(*s).c1.c1.re, s1 ); -+ -+ s2 = _mm256_loadu_pd( &(*s).c1.c1.re+8 ); -+ s2 = _mm256_sub_pd( _mm256_setzero_pd(), s2 ); -+ _mm256_storeu_pd( &(*s).c1.c1.re+8, s2 ); -+ } -+} -diff --git a/modules/linalg/avx512/salg_dble_avx512_asm.s b/modules/linalg/avx512/salg_dble_avx512_asm.s -new file mode 100644 -index 0000000..d85271e ---- /dev/null -+++ b/modules/linalg/avx512/salg_dble_avx512_asm.s -@@ -0,0 +1,768 @@ -+# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -+# mark_description "-I../../../include -I.. 
-I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -+# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -+# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -+# mark_description "es -S"; -+ .file "salg_dble_avx512.c" -+ .text -+..TXTST0: -+# -- Begin spinor_prod_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl spinor_prod_dble_avx512 -+# --- spinor_prod_dble_avx512(const spinor_dble *, const spinor_dble *, const spinor_dble *) -+spinor_prod_dble_avx512: -+# parameter 1: %rdi -+# parameter 2: %rsi -+# parameter 3: %rdx -+..B1.1: # Preds ..B1.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_spinor_prod_dble_avx512.1: -+..L2: -+ #16.1 -+ subq $136, %rsp #16.1 -+ .cfi_def_cfa_offset 144 -+ vpxord %zmm2, %zmm2, %zmm2 #21.8 -+ vmovaps %zmm2, %zmm1 #22.8 -+ cmpq %rsi, %rdi #23.14 -+ jae ..B1.5 # Prob 10% #23.14 -+ # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm1 zmm2 -+..B1.2: # Preds ..B1.1 -+ # Execution count [9.00e-01] -+ vmovups .L_2il0floatpacket.3(%rip), %zmm0 #36.27 -+ # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 zmm1 zmm2 -+..B1.3: # Preds ..B1.3 ..B1.2 -+ # Execution count [5.00e+00] -+ vmovups (%rdx), %zmm3 #27.30 -+ vmovups 64(%rdx), %zmm4 #28.30 -+ vmovups 128(%rdx), %zmm5 #29.30 -+ vmovups (%rdi), %zmm10 #24.30 -+ vmovups 64(%rdi), %zmm12 #25.30 -+ vmovups 128(%rdi), %zmm14 #26.30 -+ vfmadd231pd %zmm10, %zmm3, %zmm2 #30.10 -+ vpermilpd $85, %zmm3, %zmm6 #33.10 -+ addq $192, %rdi #23.19 -+ vmulpd %zmm0, %zmm6, %zmm9 #37.10 -+ addq $192, %rdx #43.5 -+ vfmadd231pd %zmm12, %zmm4, %zmm2 #31.10 -+ vfmadd231pd %zmm10, %zmm9, %zmm1 #40.10 -+ vfmadd231pd %zmm14, %zmm5, %zmm2 #32.10 -+ vpermilpd $85, %zmm4, %zmm7 #34.10 -+ vmulpd %zmm7, %zmm0, %zmm11 #38.10 -+ vpermilpd $85, %zmm5, %zmm8 #35.10 -+ vmulpd %zmm8, %zmm0, %zmm13 #39.10 -+ vfmadd231pd %zmm12, %zmm11, %zmm1 #41.10 -+ vfmadd231pd %zmm14, %zmm13, %zmm1 #42.10 -+ cmpq %rsi, %rdi #23.14 -+ jb ..B1.3 # Prob 82% #23.14 -+ # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 zmm1 zmm2 -+..B1.5: # Preds ..B1.3 ..B1.1 -+ # Execution count [1.00e+00] -+ vmovups %zmm2, (%rsp) #45.10 -+ vmovups %zmm1, 64(%rsp) #46.10 -+ vmovsd (%rsp), %xmm2 #45.10 -+ vmovsd 16(%rsp), %xmm3 #45.10 -+ vmovsd 32(%rsp), %xmm6 #45.10 -+ vmovsd 48(%rsp), %xmm7 #45.10 -+ vmovsd 64(%rsp), %xmm1 #46.10 -+ vmovsd 80(%rsp), %xmm12 #46.10 -+ vmovsd 96(%rsp), %xmm15 #46.10 -+ vmovsd 112(%rsp), %xmm16 #46.10 -+ vaddsd 8(%rsp), %xmm2, %xmm4 #45.3 -+ vaddsd 24(%rsp), %xmm3, %xmm5 #45.3 -+ vaddsd 40(%rsp), %xmm6, %xmm8 #45.3 -+ vaddsd 56(%rsp), %xmm7, %xmm9 #45.3 -+ vaddsd 72(%rsp), %xmm1, %xmm13 #46.3 -+ vaddsd 88(%rsp), %xmm12, %xmm14 #46.3 -+ vaddsd 104(%rsp), %xmm15, %xmm17 #46.3 -+ vaddsd 120(%rsp), %xmm16, %xmm18 #46.3 -+ vaddsd %xmm5, %xmm4, %xmm10 #45.3 -+ vaddsd %xmm9, %xmm8, %xmm11 #45.3 -+ vaddsd %xmm14, %xmm13, %xmm19 #46.3 -+ vaddsd %xmm18, %xmm17, %xmm20 #46.3 -+ vaddsd %xmm11, %xmm10, %xmm0 #45.3 -+ vaddsd %xmm20, %xmm19, %xmm1 #46.3 -+ vzeroupper #47.10 -+ addq $136, %rsp #47.10 -+ .cfi_def_cfa_offset 8 -+ ret #47.10 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type spinor_prod_dble_avx512,@function -+ .size spinor_prod_dble_avx512,.-spinor_prod_dble_avx512 -+ .data -+# -- End spinor_prod_dble_avx512 -+ .text -+# -- Begin spinor_prod_re_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl 
spinor_prod_re_dble_avx512 -+# --- spinor_prod_re_dble_avx512(const spinor_dble *, const spinor_dble *, const spinor_dble *) -+spinor_prod_re_dble_avx512: -+# parameter 1: %rdi -+# parameter 2: %rsi -+# parameter 3: %rdx -+..B2.1: # Preds ..B2.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_spinor_prod_re_dble_avx512.6: -+..L7: -+ #51.1 -+ vpxord %zmm0, %zmm0, %zmm0 #54.8 -+ cmpq %rsi, %rdi #56.14 -+ jae ..B2.5 # Prob 10% #56.14 -+ # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 -+..B2.3: # Preds ..B2.1 ..B2.3 -+ # Execution count [5.00e+00] -+ vmovups (%rdx), %zmm1 #60.30 -+ vmovups 64(%rdx), %zmm2 #61.30 -+ vmovups 128(%rdx), %zmm3 #62.30 -+ vfmadd231pd (%rdi), %zmm1, %zmm0 #63.10 -+ vfmadd231pd 64(%rdi), %zmm2, %zmm0 #64.10 -+ addq $192, %rdx #69.5 -+ vfmadd231pd 128(%rdi), %zmm3, %zmm0 #65.10 -+ addq $192, %rdi #56.19 -+ cmpq %rsi, %rdi #56.14 -+ jb ..B2.3 # Prob 82% #56.14 -+ # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 -+..B2.5: # Preds ..B2.3 ..B2.1 -+ # Execution count [1.00e+00] -+ vmovups %zmm0, -72(%rsp) #71.7 -+ vmovsd -72(%rsp), %xmm0 #71.7 -+ vmovsd -56(%rsp), %xmm1 #71.7 -+ vmovsd -40(%rsp), %xmm4 #71.7 -+ vmovsd -24(%rsp), %xmm5 #71.7 -+ vaddsd -64(%rsp), %xmm0, %xmm2 #72.10 -+ vaddsd -48(%rsp), %xmm1, %xmm3 #72.10 -+ vaddsd -32(%rsp), %xmm4, %xmm6 #72.10 -+ vaddsd -16(%rsp), %xmm5, %xmm7 #72.10 -+ vaddsd %xmm3, %xmm2, %xmm8 #72.10 -+ vaddsd %xmm7, %xmm6, %xmm9 #72.10 -+ vaddsd %xmm9, %xmm8, %xmm0 #72.10 -+ vcvtsd2ss %xmm0, %xmm0, %xmm0 #71.7 -+ vcvtss2sd %xmm0, %xmm0, %xmm0 #72.10 -+ vzeroupper #72.10 -+ ret #72.10 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type spinor_prod_re_dble_avx512,@function -+ .size spinor_prod_re_dble_avx512,.-spinor_prod_re_dble_avx512 -+ .data -+# -- End spinor_prod_re_dble_avx512 -+ .text -+# -- Begin spinor_prod5_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl spinor_prod5_dble_avx512 -+# --- spinor_prod5_dble_avx512(const spinor_dble *, const spinor_dble *, const spinor_dble *) -+spinor_prod5_dble_avx512: -+# parameter 1: %rdi -+# parameter 2: %rsi -+# parameter 3: %rdx -+..B3.1: # Preds ..B3.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_spinor_prod5_dble_avx512.9: -+..L10: -+ #76.1 -+ subq $136, %rsp #76.1 -+ .cfi_def_cfa_offset 144 -+ vpxord %zmm3, %zmm3, %zmm3 #80.8 -+ vmovaps %zmm3, %zmm2 #81.8 -+ cmpq %rsi, %rdi #82.14 -+ jae ..B3.5 # Prob 10% #82.14 -+ # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm2 zmm3 -+..B3.2: # Preds ..B3.1 -+ # Execution count [9.00e-01] -+ vmovups .L_2il0floatpacket.4(%rip), %zmm1 #89.27 -+ vmovups .L_2il0floatpacket.3(%rip), %zmm0 #97.27 -+ # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 zmm1 zmm2 zmm3 -+..B3.3: # Preds ..B3.3 ..B3.2 -+ # Execution count [5.00e+00] -+ vmovups (%rdx), %zmm4 #86.30 -+ vmovups 64(%rdx), %zmm5 #87.30 -+ vmovups 128(%rdx), %zmm6 #88.30 -+ vmovups (%rdi), %zmm11 #83.30 -+ vmovups 128(%rdi), %zmm15 #85.30 -+ vmulpd 64(%rdi), %zmm1, %zmm13 #90.10 -+ vfmadd231pd %zmm11, %zmm4, %zmm3 #91.10 -+ vpermilpd $85, %zmm4, %zmm7 #94.10 -+ addq $192, %rdi #82.19 -+ vmulpd %zmm0, %zmm7, %zmm10 #98.10 -+ addq $192, %rdx #104.5 -+ vfmadd231pd %zmm13, %zmm5, %zmm3 #92.10 -+ vfmadd231pd %zmm11, %zmm10, %zmm2 #101.10 -+ vfnmadd231pd %zmm15, %zmm6, %zmm3 #93.10 -+ vpermilpd $85, %zmm5, %zmm8 #95.10 -+ vmulpd %zmm8, %zmm0, %zmm12 #99.10 -+ vpermilpd $85, %zmm6, %zmm9 #96.10 -+ vmulpd %zmm9, %zmm0, %zmm14 #100.10 -+ vfmadd213pd %zmm2, %zmm12, %zmm13 #102.10 -+ vmovaps %zmm13, %zmm2 #103.10 -+ vfnmadd231pd %zmm15, %zmm14, %zmm2 
#103.10 -+ cmpq %rsi, %rdi #82.14 -+ jb ..B3.3 # Prob 82% #82.14 -+ # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 zmm1 zmm2 zmm3 -+..B3.5: # Preds ..B3.3 ..B3.1 -+ # Execution count [1.00e+00] -+ vmovups %zmm3, (%rsp) #106.10 -+ vmovups %zmm2, 64(%rsp) #107.10 -+ vmovsd (%rsp), %xmm1 #106.10 -+ vmovsd 16(%rsp), %xmm3 #106.10 -+ vmovsd 32(%rsp), %xmm6 #106.10 -+ vmovsd 48(%rsp), %xmm7 #106.10 -+ vmovsd 64(%rsp), %xmm2 #107.10 -+ vmovsd 80(%rsp), %xmm12 #107.10 -+ vmovsd 96(%rsp), %xmm15 #107.10 -+ vmovsd 112(%rsp), %xmm16 #107.10 -+ vaddsd 8(%rsp), %xmm1, %xmm4 #106.3 -+ vaddsd 24(%rsp), %xmm3, %xmm5 #106.3 -+ vaddsd 40(%rsp), %xmm6, %xmm8 #106.3 -+ vaddsd 56(%rsp), %xmm7, %xmm9 #106.3 -+ vaddsd 72(%rsp), %xmm2, %xmm13 #107.3 -+ vaddsd 88(%rsp), %xmm12, %xmm14 #107.3 -+ vaddsd 104(%rsp), %xmm15, %xmm17 #107.3 -+ vaddsd 120(%rsp), %xmm16, %xmm18 #107.3 -+ vaddsd %xmm5, %xmm4, %xmm10 #106.3 -+ vaddsd %xmm9, %xmm8, %xmm11 #106.3 -+ vaddsd %xmm14, %xmm13, %xmm19 #107.3 -+ vaddsd %xmm18, %xmm17, %xmm20 #107.3 -+ vaddsd %xmm11, %xmm10, %xmm0 #106.3 -+ vaddsd %xmm20, %xmm19, %xmm1 #107.3 -+ vzeroupper #108.10 -+ addq $136, %rsp #108.10 -+ .cfi_def_cfa_offset 8 -+ ret #108.10 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type spinor_prod5_dble_avx512,@function -+ .size spinor_prod5_dble_avx512,.-spinor_prod5_dble_avx512 -+ .data -+# -- End spinor_prod5_dble_avx512 -+ .text -+# -- Begin norm_square_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl norm_square_dble_avx512 -+# --- norm_square_dble_avx512(const spinor_dble *, const spinor_dble *) -+norm_square_dble_avx512: -+# parameter 1: %rdi -+# parameter 2: %rsi -+..B4.1: # Preds ..B4.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_norm_square_dble_avx512.14: -+..L15: -+ #112.1 -+ vpxord %zmm0, %zmm0, %zmm0 #114.9 -+ cmpq %rsi, %rdi #115.14 -+ jae ..B4.5 # Prob 10% #115.14 -+ # LOE rbx rbp rsi rdi r12 r13 r14 r15 zmm0 -+..B4.3: # Preds ..B4.1 ..B4.3 -+ # Execution count [5.00e+00] -+ vmovups (%rdi), %zmm1 #116.30 -+ vmovups 64(%rdi), %zmm2 #117.30 -+ vmovups 128(%rdi), %zmm3 #118.30 -+ vfmadd231pd %zmm1, %zmm1, %zmm0 #119.11 -+ vfmadd231pd %zmm2, %zmm2, %zmm0 #120.11 -+ addq $192, %rdi #115.19 -+ vfmadd231pd %zmm3, %zmm3, %zmm0 #121.11 -+ cmpq %rsi, %rdi #115.14 -+ jb ..B4.3 # Prob 82% #115.14 -+ # LOE rbx rbp rsi rdi r12 r13 r14 r15 zmm0 -+..B4.5: # Preds ..B4.3 ..B4.1 -+ # Execution count [1.00e+00] -+ vmovups %zmm0, -72(%rsp) #123.10 -+ vmovsd -72(%rsp), %xmm0 #123.10 -+ vmovsd -56(%rsp), %xmm1 #123.10 -+ vmovsd -40(%rsp), %xmm4 #123.10 -+ vmovsd -24(%rsp), %xmm5 #123.10 -+ vaddsd -64(%rsp), %xmm0, %xmm2 #123.10 -+ vaddsd -48(%rsp), %xmm1, %xmm3 #123.10 -+ vaddsd -32(%rsp), %xmm4, %xmm6 #123.10 -+ vaddsd -16(%rsp), %xmm5, %xmm7 #123.10 -+ vaddsd %xmm3, %xmm2, %xmm8 #123.10 -+ vaddsd %xmm7, %xmm6, %xmm9 #123.10 -+ vaddsd %xmm9, %xmm8, %xmm0 #123.10 -+ vzeroupper #123.10 -+ ret #123.10 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type norm_square_dble_avx512,@function -+ .size norm_square_dble_avx512,.-norm_square_dble_avx512 -+ .data -+# -- End norm_square_dble_avx512 -+ .text -+# -- Begin mulc_spinor_add_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl mulc_spinor_add_dble_avx512 -+# --- mulc_spinor_add_dble_avx512(int, spinor_dble *, const spinor_dble *, complex_dble) -+mulc_spinor_add_dble_avx512: -+# parameter 1: %edi -+# parameter 2: %rsi -+# parameter 3: %rdx -+# parameter 4: %xmm0 %xmm1 -+..B5.1: # Preds ..B5.0 -+ # Execution count [1.00e+00] -+ 
.cfi_startproc -+..___tag_value_mulc_spinor_add_dble_avx512.17: -+..L18: -+ #128.1 -+ vmovapd %xmm1, %xmm2 #134.8 -+ movslq %edi, %rdi #128.1 -+ vbroadcastsd %xmm0, %zmm1 #135.8 -+ vbroadcastsd %xmm2, %zmm0 #136.8 -+ lea (%rdi,%rdi,2), %rax #138.8 -+ shlq $6, %rax #138.8 -+ addq %rsi, %rax #138.8 -+ cmpq %rax, %rsi #140.14 -+ jae ..B5.5 # Prob 10% #140.14 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -+..B5.3: # Preds ..B5.1 ..B5.3 -+ # Execution count [5.00e+00] -+ vmovups (%rdx), %zmm3 #141.30 -+ vmulpd %zmm3, %zmm0, %zmm2 #142.10 -+ vpermilpd $85, %zmm2, %zmm4 #143.10 -+ vfmaddsub231pd %zmm1, %zmm3, %zmm4 #144.10 -+ vaddpd (%rsi), %zmm4, %zmm5 #146.10 -+ vmovups %zmm5, (%rsi) #147.26 -+ vmovups 64(%rdx), %zmm7 #149.30 -+ vmulpd %zmm7, %zmm0, %zmm6 #150.10 -+ vpermilpd $85, %zmm6, %zmm8 #151.10 -+ vfmaddsub231pd %zmm1, %zmm7, %zmm8 #152.10 -+ vaddpd 64(%rsi), %zmm8, %zmm9 #154.10 -+ vmovups %zmm9, 64(%rsi) #155.26 -+ vmovups 128(%rdx), %zmm11 #157.30 -+ addq $192, %rdx #165.5 -+ vmulpd %zmm11, %zmm0, %zmm10 #158.10 -+ vpermilpd $85, %zmm10, %zmm12 #159.10 -+ vfmaddsub231pd %zmm1, %zmm11, %zmm12 #160.10 -+ vaddpd 128(%rsi), %zmm12, %zmm13 #162.10 -+ vmovups %zmm13, 128(%rsi) #163.26 -+ addq $192, %rsi #140.18 -+ cmpq %rax, %rsi #140.14 -+ jb ..B5.3 # Prob 82% #140.14 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -+..B5.5: # Preds ..B5.3 ..B5.1 -+ # Execution count [1.00e+00] -+ vzeroupper #167.1 -+ ret #167.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type mulc_spinor_add_dble_avx512,@function -+ .size mulc_spinor_add_dble_avx512,.-mulc_spinor_add_dble_avx512 -+ .data -+# -- End mulc_spinor_add_dble_avx512 -+ .text -+# -- Begin mulr_spinor_add_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl mulr_spinor_add_dble_avx512 -+# --- mulr_spinor_add_dble_avx512(int, spinor_dble *, const spinor_dble *, double) -+mulr_spinor_add_dble_avx512: -+# parameter 1: %edi -+# parameter 2: %rsi -+# parameter 3: %rdx -+# parameter 4: %xmm0 -+..B6.1: # Preds ..B6.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_mulr_spinor_add_dble_avx512.20: -+..L21: -+ #171.1 -+ movslq %edi, %rdi #171.1 -+ vbroadcastsd %xmm0, %zmm0 #177.8 -+ lea (%rdi,%rdi,2), %rax #179.8 -+ shlq $6, %rax #179.8 -+ addq %rsi, %rax #179.8 -+ cmpq %rax, %rsi #181.14 -+ jae ..B6.5 # Prob 10% #181.14 -+ .align 16,0x90 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 -+..B6.3: # Preds ..B6.1 ..B6.3 -+ # Execution count [5.00e+00] -+ vmovups (%rdx), %zmm1 #182.30 -+ vfmadd213pd (%rsi), %zmm0, %zmm1 #185.10 -+ vmovups %zmm1, (%rsi) #186.26 -+ vmovups 64(%rdx), %zmm2 #188.30 -+ vfmadd213pd 64(%rsi), %zmm0, %zmm2 #191.10 -+ vmovups %zmm2, 64(%rsi) #192.26 -+ vmovups 128(%rdx), %zmm3 #194.30 -+ addq $192, %rdx #200.5 -+ vfmadd213pd 128(%rsi), %zmm0, %zmm3 #197.10 -+ vmovups %zmm3, 128(%rsi) #198.26 -+ addq $192, %rsi #181.18 -+ cmpq %rax, %rsi #181.14 -+ jb ..B6.3 # Prob 82% #181.14 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 -+..B6.5: # Preds ..B6.3 ..B6.1 -+ # Execution count [1.00e+00] -+ vzeroupper #202.1 -+ ret #202.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type mulr_spinor_add_dble_avx512,@function -+ .size mulr_spinor_add_dble_avx512,.-mulr_spinor_add_dble_avx512 -+ .data -+# -- End mulr_spinor_add_dble_avx512 -+ .text -+# -- Begin combine_spinor_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl combine_spinor_dble_avx512 -+# --- combine_spinor_dble_avx512(int, spinor_dble *, const spinor_dble *, double, double) 
-+combine_spinor_dble_avx512: -+# parameter 1: %edi -+# parameter 2: %rsi -+# parameter 3: %rdx -+# parameter 4: %xmm0 -+# parameter 5: %xmm1 -+..B7.1: # Preds ..B7.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_combine_spinor_dble_avx512.23: -+..L24: -+ #206.1 -+ vmovapd %xmm1, %xmm2 #212.11 -+ movslq %edi, %rdi #206.1 -+ vbroadcastsd %xmm0, %zmm1 #213.9 -+ vbroadcastsd %xmm2, %zmm0 #214.9 -+ lea (%rdi,%rdi,2), %rax #216.8 -+ shlq $6, %rax #216.8 -+ addq %rsi, %rax #216.8 -+ cmpq %rax, %rsi #218.14 -+ jae ..B7.5 # Prob 10% #218.14 -+ .align 16,0x90 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -+..B7.3: # Preds ..B7.1 ..B7.3 -+ # Execution count [5.00e+00] -+ vmulpd (%rdx), %zmm0, %zmm2 #220.10 -+ vfmadd231pd (%rsi), %zmm1, %zmm2 #222.10 -+ vmovups %zmm2, (%rsi) #223.26 -+ vmulpd 64(%rdx), %zmm0, %zmm3 #226.10 -+ vfmadd231pd 64(%rsi), %zmm1, %zmm3 #228.10 -+ vmovups %zmm3, 64(%rsi) #229.26 -+ vmulpd 128(%rdx), %zmm0, %zmm4 #232.10 -+ addq $192, %rdx #237.5 -+ vfmadd231pd 128(%rsi), %zmm1, %zmm4 #234.10 -+ vmovups %zmm4, 128(%rsi) #235.26 -+ addq $192, %rsi #218.18 -+ cmpq %rax, %rsi #218.14 -+ jb ..B7.3 # Prob 82% #218.14 -+ # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -+..B7.5: # Preds ..B7.3 ..B7.1 -+ # Execution count [1.00e+00] -+ vzeroupper #239.1 -+ ret #239.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type combine_spinor_dble_avx512,@function -+ .size combine_spinor_dble_avx512,.-combine_spinor_dble_avx512 -+ .data -+# -- End combine_spinor_dble_avx512 -+ .text -+# -- Begin scale_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl scale_dble_avx512 -+# --- scale_dble_avx512(int, double, spinor_dble *) -+scale_dble_avx512: -+# parameter 1: %edi -+# parameter 2: %xmm0 -+# parameter 3: %rsi -+..B8.1: # Preds ..B8.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_scale_dble_avx512.26: -+..L27: -+ #242.1 -+ movslq %edi, %rdi #242.1 -+ vbroadcastsd %xmm0, %zmm0 #248.8 -+ lea (%rdi,%rdi,2), %rax #250.8 -+ shlq $6, %rax #250.8 -+ addq %rsi, %rax #250.8 -+ cmpq %rax, %rsi #252.14 -+ jae ..B8.5 # Prob 10% #252.14 -+ # LOE rax rbx rbp rsi r12 r13 r14 r15 zmm0 -+..B8.3: # Preds ..B8.1 ..B8.3 -+ # Execution count [5.00e+00] -+ vmulpd (%rsi), %zmm0, %zmm1 #254.10 -+ vmulpd 64(%rsi), %zmm0, %zmm2 #258.10 -+ vmulpd 128(%rsi), %zmm0, %zmm3 #262.10 -+ vmovups %zmm1, (%rsi) #255.26 -+ vmovups %zmm2, 64(%rsi) #259.26 -+ vmovups %zmm3, 128(%rsi) #263.26 -+ addq $192, %rsi #252.18 -+ cmpq %rax, %rsi #252.14 -+ jb ..B8.3 # Prob 82% #252.14 -+ # LOE rax rbx rbp rsi r12 r13 r14 r15 zmm0 -+..B8.5: # Preds ..B8.3 ..B8.1 -+ # Execution count [1.00e+00] -+ vzeroupper #265.1 -+ ret #265.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type scale_dble_avx512,@function -+ .size scale_dble_avx512,.-scale_dble_avx512 -+ .data -+# -- End scale_dble_avx512 -+ .text -+# -- Begin rotate_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl rotate_dble_avx512 -+# --- rotate_dble_avx512(int, int, spinor_dble **, spinor_dble *, const complex_dble *) -+rotate_dble_avx512: -+# parameter 1: %edi -+# parameter 2: %esi -+# parameter 3: %rdx -+# parameter 4: %rcx -+# parameter 5: %r8 -+..B9.1: # Preds ..B9.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_rotate_dble_avx512.29: -+..L30: -+ #268.1 -+ xorl %r10d, %r10d #273.8 -+ movslq %edi, %r9 #268.1 -+ xorl %edi, %edi #273.8 -+ testq %r9, %r9 #273.19 -+ jle ..B9.9 # Prob 10% #273.19 -+ # LOE rdx rcx rbx rbp rdi r8 r9 r10 r12 r13 r14 r15 esi 
-+..B9.2: # Preds ..B9.1 -+ # Execution count [9.00e-01] -+ movslq %esi, %rsi #268.1 -+ movq %r9, %rax #306.7 -+ vmovups .L_2il0floatpacket.3(%rip), %zmm0 #286.12 -+ shlq $4, %rax #306.7 -+ lea (%rsi,%rsi,2), %rsi #278.10 -+ shlq $6, %rsi #278.10 -+ movq %r15, -24(%rsp) #306.7[spill] -+ movq %rbx, -16(%rsp) #306.7[spill] -+ .cfi_offset 3, -24 -+ .cfi_offset 15, -32 -+ # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r12 r13 r14 zmm0 -+..B9.3: # Preds ..B9.7 ..B9.2 -+ # Execution count [5.00e+00] -+ vmulpd 8(%r8){1to8}, %zmm0, %zmm6 #287.10 -+ movq %r8, %r11 #279.5 -+ vbroadcastsd (%r8), %zmm1 #283.10 -+ movq (%rdx), %rbx #278.10 -+ vmovups (%rsi,%rbx), %zmm2 #289.30 -+ vmovups 64(%rsi,%rbx), %zmm5 #294.30 -+ vmovups 128(%rsi,%rbx), %zmm8 #299.30 -+ vmulpd %zmm2, %zmm6, %zmm3 #290.10 -+ vmulpd %zmm5, %zmm6, %zmm4 #295.10 -+ vmulpd %zmm8, %zmm6, %zmm7 #300.10 -+ vpermilpd $85, %zmm3, %zmm3 #291.10 -+ movl $1, %ebx #304.10 -+ vfmadd231pd %zmm1, %zmm2, %zmm3 #292.10 -+ vpermilpd $85, %zmm4, %zmm2 #296.10 -+ vpermilpd $85, %zmm7, %zmm9 #301.10 -+ vfmadd231pd %zmm1, %zmm5, %zmm2 #297.10 -+ vfmadd213pd %zmm9, %zmm8, %zmm1 #302.10 -+ cmpq $1, %r9 #304.21 -+ jle ..B9.7 # Prob 10% #304.21 -+ # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 zmm0 zmm1 zmm2 zmm3 -+..B9.5: # Preds ..B9.3 ..B9.5 -+ # Execution count [2.50e+01] -+ addq %rax, %r11 #306.7 -+ movq (%rdx,%rbx,8), %r15 #305.12 -+ incq %rbx #304.24 -+ vmulpd 8(%r11){1to8}, %zmm0, %zmm10 #312.12 -+ vmovups (%r15,%rsi), %zmm5 #314.32 -+ vmovups 64(%r15,%rsi), %zmm8 #320.32 -+ vmovups 128(%r15,%rsi), %zmm12 #326.32 -+ vbroadcastsd (%r11), %zmm14 #310.12 -+ vmulpd %zmm5, %zmm10, %zmm4 #315.12 -+ vmulpd %zmm8, %zmm10, %zmm7 #321.12 -+ vmulpd %zmm12, %zmm10, %zmm11 #327.12 -+ vpermilpd $85, %zmm4, %zmm6 #316.12 -+ vpermilpd $85, %zmm7, %zmm9 #322.12 -+ vpermilpd $85, %zmm11, %zmm13 #328.12 -+ vfmadd231pd %zmm14, %zmm5, %zmm6 #317.12 -+ vfmadd231pd %zmm14, %zmm8, %zmm9 #323.12 -+ vfmadd213pd %zmm13, %zmm12, %zmm14 #329.12 -+ vaddpd %zmm3, %zmm6, %zmm3 #318.12 -+ vaddpd %zmm2, %zmm9, %zmm2 #324.12 -+ vaddpd %zmm1, %zmm14, %zmm1 #330.12 -+ cmpq %r9, %rbx #304.21 -+ jl ..B9.5 # Prob 82% #304.21 -+ # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 zmm0 zmm1 zmm2 zmm3 -+..B9.7: # Preds ..B9.5 ..B9.3 -+ # Execution count [5.00e+00] -+ incq %r10 #273.22 -+ addq $16, %r8 #273.22 -+ vmovups %zmm3, (%rdi,%rcx) #333.26 -+ vmovups %zmm2, 64(%rdi,%rcx) #334.26 -+ vmovups %zmm1, 128(%rdi,%rcx) #335.26 -+ addq $192, %rdi #273.22 -+ cmpq %r9, %r10 #273.19 -+ jl ..B9.3 # Prob 82% #273.19 -+ # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r12 r13 r14 zmm0 -+..B9.8: # Preds ..B9.7 -+ # Execution count [9.00e-01] -+ movq -24(%rsp), %r15 #[spill] -+ .cfi_restore 15 -+ movq -16(%rsp), %rbx #[spill] -+ .cfi_restore 3 -+ # LOE rbx rbp r12 r13 r14 r15 -+..B9.9: # Preds ..B9.8 ..B9.1 -+ # Execution count [1.00e+00] -+ vzeroupper #337.1 -+ ret #337.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type rotate_dble_avx512,@function -+ .size rotate_dble_avx512,.-rotate_dble_avx512 -+ .data -+# -- End rotate_dble_avx512 -+ .text -+# -- Begin mulg5_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl mulg5_dble_avx512 -+# --- mulg5_dble_avx512(int, spinor_dble *) -+mulg5_dble_avx512: -+# parameter 1: %edi -+# parameter 2: %rsi -+..B10.1: # Preds ..B10.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_mulg5_dble_avx512.36: -+..L37: -+ #340.1 -+ movslq %edi, %rdi #340.1 -+ lea (%rdi,%rdi,2), %rax #343.8 -+ shlq $6, %rax #343.8 -+ addq 
%rsi, %rax #343.8 -+ cmpq %rax, %rsi #345.14 -+ jae ..B10.5 # Prob 10% #345.14 -+ # LOE rax rbx rbp rsi r12 r13 r14 r15 -+..B10.2: # Preds ..B10.1 -+ # Execution count [9.00e-01] -+ vpxord %zmm1, %zmm1, %zmm1 #350.25 -+ vxorpd %ymm0, %ymm0, %ymm0 #354.25 -+ # LOE rax rbx rbp rsi r12 r13 r14 r15 ymm0 zmm1 -+..B10.3: # Preds ..B10.3 ..B10.2 -+ # Execution count [5.00e+00] -+ vsubpd 96(%rsi), %zmm1, %zmm2 #350.10 -+ vsubpd 160(%rsi), %ymm0, %ymm3 #354.10 -+ vmovups %zmm2, 96(%rsi) #351.26 -+ vmovupd %ymm3, 160(%rsi) #355.26 -+ addq $192, %rsi #345.18 -+ cmpq %rax, %rsi #345.14 -+ jb ..B10.3 # Prob 82% #345.14 -+ # LOE rax rbx rbp rsi r12 r13 r14 r15 ymm0 zmm1 -+..B10.5: # Preds ..B10.3 ..B10.1 -+ # Execution count [1.00e+00] -+ vzeroupper #357.1 -+ ret #357.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type mulg5_dble_avx512,@function -+ .size mulg5_dble_avx512,.-mulg5_dble_avx512 -+ .data -+# -- End mulg5_dble_avx512 -+ .text -+# -- Begin mulmg5_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl mulmg5_dble_avx512 -+# --- mulmg5_dble_avx512(int, spinor_dble *) -+mulmg5_dble_avx512: -+# parameter 1: %edi -+# parameter 2: %rsi -+..B11.1: # Preds ..B11.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_mulmg5_dble_avx512.39: -+..L40: -+ #360.1 -+ movslq %edi, %rdi #360.1 -+ lea (%rdi,%rdi,2), %rax #363.8 -+ shlq $6, %rax #363.8 -+ addq %rsi, %rax #363.8 -+ cmpq %rax, %rsi #365.14 -+ jae ..B11.5 # Prob 10% #365.14 -+ # LOE rax rbx rbp rsi r12 r13 r14 r15 -+..B11.2: # Preds ..B11.1 -+ # Execution count [9.00e-01] -+ vpxord %zmm1, %zmm1, %zmm1 #370.25 -+ vxorpd %ymm0, %ymm0, %ymm0 #374.25 -+ # LOE rax rbx rbp rsi r12 r13 r14 r15 ymm0 zmm1 -+..B11.3: # Preds ..B11.3 ..B11.2 -+ # Execution count [5.00e+00] -+ vsubpd (%rsi), %zmm1, %zmm2 #370.10 -+ vsubpd 64(%rsi), %ymm0, %ymm3 #374.10 -+ vmovups %zmm2, (%rsi) #371.26 -+ vmovupd %ymm3, 64(%rsi) #375.26 -+ addq $192, %rsi #365.18 -+ cmpq %rax, %rsi #365.14 -+ jb ..B11.3 # Prob 82% #365.14 -+ # LOE rax rbx rbp rsi r12 r13 r14 r15 ymm0 zmm1 -+..B11.5: # Preds ..B11.3 ..B11.1 -+ # Execution count [1.00e+00] -+ vzeroupper #377.1 -+ ret #377.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type mulmg5_dble_avx512,@function -+ .size mulmg5_dble_avx512,.-mulmg5_dble_avx512 -+ .data -+# -- End mulmg5_dble_avx512 -+ .section .rodata, "a" -+ .align 64 -+ .align 64 -+.L_2il0floatpacket.3: -+ .long 0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000 -+ .type .L_2il0floatpacket.3,@object -+ .size .L_2il0floatpacket.3,64 -+ .align 64 -+.L_2il0floatpacket.4: -+ .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0xbff00000,0x00000000,0xbff00000,0x00000000,0xbff00000 -+ .type .L_2il0floatpacket.4,@object -+ .size .L_2il0floatpacket.4,64 -+ .data -+ .section .note.GNU-stack, "" -+// -- Begin DWARF2 SEGMENT .eh_frame -+ .section .eh_frame,"a",@progbits -+.eh_frame_seg: -+ .align 8 -+# End -diff --git a/modules/linalg/salg.c b/modules/linalg/salg.c -index 2be05c1..40e8479 100644 ---- a/modules/linalg/salg.c -+++ b/modules/linalg/salg.c -@@ -89,9 +89,46 @@ static void alloc_wrotate(int n) - nrot=n; - } - -+ -+#if (defined AVX512 ) -+ -+void mulc_spinor_add_avx512(int vol, spinor *s, spinor const *r, complex z); -+void mulc_spinor_add(int vol, spinor *s, spinor *r, complex z) -+{ -+ mulc_spinor_add_avx512( vol, s, 
r, z); -+} -+ -+complex_dble spinor_prod_avx512(int vol, spinor *s, spinor *r ); -+complex spinor_prod(int vol, int icom, spinor *s, spinor *r ) -+{ -+ complex z; -+ complex_dble v, w; -+ -+ v = spinor_prod_avx512(vol, s, r); -+ -+ if ((icom==1)&&(NPROC>1)) -+ { -+ MPI_Reduce(&v.re,&w.re,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); -+ MPI_Bcast(&w.re,2,MPI_DOUBLE,0,MPI_COMM_WORLD); -+ z.re=(float)(w.re); -+ z.im=(float)(w.im); -+ } -+ else -+ { -+ z.re=(float)(v.re); -+ z.im=(float)(v.im); -+ } -+ return z; -+} -+ -+#endif -+ -+ -+ - #if (defined AVX) - #include "avx.h" - -+#ifndef AVX512 - #if (defined FMA3) - - complex spinor_prod(int vol,int icom,spinor *s,spinor *r) -@@ -323,9 +360,30 @@ complex spinor_prod(int vol,int icom,spinor *s,spinor *r) - - return z; - } -+#endif -+ -+void mulc_spinor_add(int vol, spinor *s, spinor const *r, complex z) -+{ -+ spinor *sm; -+ -+ _avx_load_cmplx_up(z); -+ sm = s + vol; -+ -+ for (; s < sm; s++) { -+ _avx_spinor_load(*s); -+ _avx_mulc_spinor_add(*r); -+ _avx_spinor_store(*s); -+ -+ r += 1; -+ } -+ -+ _avx_zeroupper(); -+} - - #endif - -+ -+ - float spinor_prod_re(int vol,int icom,spinor *s,spinor *r) - { - double x,y; -@@ -473,25 +531,6 @@ float norm_square(int vol,int icom,spinor *s) - } - - --void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z) --{ -- spinor *sm; -- -- _avx_load_cmplx_up(z); -- sm=s+vol; -- -- for (;s sm) { -+ smb = sm; -+ } -+ smz = spinor_prod_dble_avx512( s, smb, r ); -+ s = smb; r+=8; -+ add_to_hsum(isz, (double *)(&smz)); -+ } -+ -+ if ((icom == 1) && (NPROC > 1)) { -+ global_hsum(isz, (double *)(&smz)); -+ } else { -+ local_hsum(isz, (double *)(&smz)); -+ } -+ -+ return smz; -+} -+ -+double spinor_prod_re_dble_avx512( spinor_dble *s, spinor_dble *smb, spinor_dble *r); -+double spinor_prod_re_dble(int vol, int icom, spinor_dble *s, -+ spinor_dble *r) -+{ -+ spinor_dble *sm, *smb; -+ -+ if (init == 0) { -+ isx = init_hsum(1); -+ isz = init_hsum(2); -+ init = 1; -+ } -+ -+ reset_hsum(isx); -+ sm = s + vol; -+ -+ while (s < sm) { -+ smb = s + 8; -+ if (smb > sm) { -+ smb = sm; -+ } -+ smx = spinor_prod_re_dble_avx512( s, smb, r ); -+ s = smb; r+=8; -+ -+ add_to_hsum(isx, &smx); -+ } -+ -+ if ((icom == 1) && (NPROC > 1)) { -+ global_hsum(isx, &smx); -+ } else { -+ local_hsum(isx, &smx); -+ } -+ -+ return smx; -+} -+ -+complex_dble spinor_prod5_dble_avx512(spinor_dble *s, spinor_dble *smb, spinor_dble *r); -+complex_dble spinor_prod5_dble(int vol, int icom, spinor_dble *s, -+ spinor_dble *r) -+{ -+ spinor_dble *sm, *smb; -+ -+ if (init == 0) { -+ isx = init_hsum(1); -+ isz = init_hsum(2); -+ init = 1; -+ } -+ -+ reset_hsum(isz); -+ sm = s + vol; -+ -+ while (s < sm) { -+ smb = s + 8; -+ if (smb > sm) { -+ smb = sm; -+ } -+ -+ smz = spinor_prod5_dble_avx512( s, smb, r ); -+ s = smb; r+=8; -+ -+ add_to_hsum(isz, (double *)(&smz)); -+ } -+ -+ if ((icom == 1) && (NPROC > 1)) { -+ global_hsum(isz, (double *)(&smz)); -+ } else { -+ local_hsum(isz, (double *)(&smz)); -+ } -+ -+ return smz; -+} -+ -+double norm_square_dble_avx512(spinor_dble *s, spinor_dble *smb); -+double norm_square_dble(int vol, int icom, spinor_dble *s) -+{ -+ spinor_dble *sm, *smb; -+ -+ if (init == 0) { -+ isx = init_hsum(1); -+ isz = init_hsum(2); -+ init = 1; -+ } -+ -+ reset_hsum(isx); -+ sm = s + vol; -+ -+ while (s < sm) { -+ smb = s + 8; -+ if (smb > sm) { -+ smb = sm; -+ } -+ -+ smx = norm_square_dble_avx512( s, smb ); -+ s = smb; -+ -+ add_to_hsum(isx, &smx); -+ } -+ -+ if ((icom == 1) && (NPROC > 1)) { -+ global_hsum(isx, &smx); -+ } else { -+ 
local_hsum(isx, &smx); -+ } -+ -+ return smx; -+} -+ -+void mulc_spinor_add_dble_avx512(int vol, spinor_dble *s, spinor_dble *r, -+ complex_dble z); -+void mulc_spinor_add_dble(int vol, spinor_dble *s, spinor_dble *r, -+ complex_dble z) -+{ -+ mulc_spinor_add_dble_avx512(vol,s,r,z); -+} -+ -+void mulr_spinor_add_dble_avx512(int vol, spinor_dble *s, spinor_dble *r, -+ double c); -+void mulr_spinor_add_dble(int vol, spinor_dble *s, spinor_dble *r, -+ double c) -+{ -+ mulr_spinor_add_dble_avx512(vol,s,r,c); -+} -+ -+void combine_spinor_dble_avx512(int vol, spinor_dble *s, spinor_dble *r, -+ double cs, double cr); -+void combine_spinor_dble(int vol, spinor_dble *s, spinor_dble *r, -+ double cs, double cr) -+{ -+ combine_spinor_dble_avx512(vol,s,r,cs,cr); -+} -+ -+void scale_dble_avx512(int vol, double c, spinor_dble *s); -+void scale_dble(int vol, double c, spinor_dble *s) -+{ -+ scale_dble_avx512(vol,c,s); -+} -+ -+void rotate_dble_avx512(int n, int ix, spinor_dble **ppk, spinor_dble *psi, complex_dble *v); -+void rotate_dble(int vol, int n, spinor_dble **ppk, complex_dble *v) -+{ -+ int ix,k; -+ -+ if (n > nrot) { -+ alloc_wrotate(n); -+ } -+ -+ for (ix = 0; ix < vol; ix++) { -+ rotate_dble_avx512( n, ix, ppk, psi, v ); -+ -+ for (k = 0; k < n; k++) { -+ *(ppk[k] + ix) = psi[k]; -+ } -+ } -+} -+ -+void mulg5_dble_avx512(int vol, spinor_dble *s); -+void mulg5_dble(int vol, spinor_dble *s) -+{ -+ mulg5_dble_avx512( vol, s ); -+} -+ -+void mulmg5_dble_avx512(int vol, spinor_dble *s); -+void mulmg5_dble(int vol, spinor_dble *s) -+{ -+ mulmg5_dble_avx512( vol, s ); -+} -+ -+ -+ -+#elif (defined AVX) - #include "avx.h" - - #if (defined FMA3) -diff --git a/modules/sw_term/avx512/pauli_avx512.c b/modules/sw_term/avx512/pauli_avx512.c -new file mode 100644 -index 0000000..504ff80 ---- /dev/null -+++ b/modules/sw_term/avx512/pauli_avx512.c -@@ -0,0 +1,230 @@ -+/******************************************************************************* -+* -+* File pauli_avx512.c -+* -+* This software is distributed under the terms of the GNU General Public -+* License (GPL) -+* -+* AVX512 implementations of the clover term multiplication in -+* single precision. -+* -+* See ../pauli_avx512.c for more information and alternative -+* implementations. 
-+* -+*******************************************************************************/ -+ -+#include -+#include -+#include -+#include "su3.h" -+#include "sw_term.h" -+typedef union -+{ -+ spinor s; -+ weyl w[2]; -+} spin_t; -+ -+#include "avx512.h" -+ -+void mul_pauli2_avx512(float mu, pauli *m, spinor *source, spinor *res ) -+{ -+ spin_t *ps, *pr; -+ float const *u, *u2; -+ __m512i idx; -+ __m512 tr1,tr2,tr3; -+ __m512 ts1, ts2, ts3, tsi1, tsi2, tsi3, u512; -+ __m512 tu11, tu12, tu13, tu21, tu22, tu23; -+ __m512 tu1, tu2, tu3, tu4; -+ __m512 umu; -+ __m256 t256; -+ __m128 t128a, t128b, tmu; -+ -+ ps = (spin_t *)(source); -+ pr = (spin_t *)(res); -+ -+ u = (*m).u; -+ u2 = (m+1)->u; -+ -+ weyl * s = (*ps).w; -+ weyl * r = (*pr).w; -+ weyl * s2 = (*ps).w+1; -+ weyl * r2 = (*pr).w+1; -+ -+ s += 4; -+ _prefetch_spinor(s); -+ s -= 4; -+ -+ tr1 = _mm512_loadu_ps( &(*s).c1.c1.re ); -+ tr2 = _mm512_castps256_ps512( _mm256_loadu_ps( &(*s).c1.c1.re+16 ) ); -+ idx = _mm512_setr_epi32( 0,1,2,3,6,7,8,9, 12,13,14,15,18,19,20,21 ); -+ ts1 = _mm512_permutex2var_ps( tr1, idx, tr2 ); -+ idx = _mm512_setr_epi32( 2,3,4,5,8,9,10,11, 14,15,16,17,20,21,22,23 ); -+ ts2 = _mm512_permutex2var_ps( tr1, idx, tr2 ); -+ idx = _mm512_setr_epi32( 4,5,0,1,10,11,6,7, 16,17,12,13,22,23,18,19 ); -+ ts3 = _mm512_permutex2var_ps( tr1, idx, tr2 ); -+ -+ tu11 = _mm512_loadu_ps( u ); -+ tu12 = _mm512_loadu_ps( u+16 ); -+ tu13 = _mm512_loadu_ps( u+32 ); -+ tu21 = _mm512_loadu_ps( u2 ); -+ tu22 = _mm512_loadu_ps( u2+16 ); -+ tu23 = _mm512_loadu_ps( u2+32 ); -+ -+ -+ tsi1 = _mm512_permute_ps ( ts1, 0b10110001 ); -+ tsi2 = _mm512_permute_ps ( ts2, 0b10110001 ); -+ tsi3 = _mm512_permute_ps ( ts3, 0b10110001 ); -+ -+ -+ tmu = _mm_load_ps1( &mu ); -+ umu = _mm512_broadcastss_ps( tmu ); -+ -+ -+ idx = _mm512_setr_epi32( 0,1,10,11,4,5,2,3, 16,17,26,27,20,21,18,19 ); -+ tu1 = _mm512_permutex2var_ps( tu11, idx, tu21 ); -+ idx = _mm512_setr_epi32( 4,5,0,1,12,13,6,7, 20,21,16,17,28,29,22,23); -+ tu2 = _mm512_permutex2var_ps( tu12, idx, tu22 ); -+ -+ idx = _mm512_setr_epi32( 0,0,1,1,2,3,16+0,16+1, 8,8,9,9,10,11,16+8,16+9 ); -+ tu4 = _mm512_permutex2var_ps( tu1, idx, tu2 ); -+ u512 = _mm512_permute_ps( tu4, 0b10100000 ); -+ tr1 = _mm512_mul_ps( ts1, u512 ); -+ -+ idx = _mm512_setr_epi32( 0,1,2,3,16+5,16+5,16+7,16+7, -+ 8,9,10,11,16+13,16+13,16+15,16+15 ); -+ u512 = _mm512_permutex2var_ps( umu, idx, tu4 ); -+ u512 = _mm512_mul_ps( u512, tsi1 ); -+ tr1 = _mm512_mask_add_ps( tr1, 0b1010010110101010, tr1, u512 ); -+ tr1 = _mm512_mask_sub_ps( tr1, 0b0101101001010101, tr1, u512 ); -+ -+ -+ idx = _mm512_setr_epi32( 0,0,4,4,16+4,16+4,16+5,16+5, -+ 8,8,12,12,16+12,16+12,16+13,16+13 ); -+ u512 = _mm512_permutex2var_ps( tu2, idx, tu1 ); -+ tr2 = _mm512_mul_ps( ts2, u512 ); -+ -+ idx = _mm512_setr_epi32( 1,1,5,5,16+4,16+5,16+6,16+7, -+ 9,9,13,13,16+12,16+13,16+14,16+15 ); -+ u512 = _mm512_permutex2var_ps( tu2, idx, umu ); -+ u512 = _mm512_mul_ps( u512, tsi2 ); -+ tr2 = _mm512_mask_add_ps( tr2, 0b0101010110100101, tr2, u512 ); -+ tr2 = _mm512_mask_sub_ps( tr2, 0b1010101001011010, tr2, u512 ); -+ -+ -+ idx = _mm512_setr_epi32( 6,6,2,3,16+4,16+5,7,7, -+ 14,14,10,11,16+12,16+13,15,15 ); -+ tu4 = _mm512_permutex2var_ps( tu1, idx, tu2 ); -+ u512 = _mm512_permute_ps( tu4, 0b10100000 ); -+ tr3 = _mm512_mul_ps( ts3, u512 ); -+ -+ idx = _mm512_setr_epi32( 0,1,16+3,16+3,16+5,16+5,6,7, -+ 8,9,16+11,16+11,16+13,16+13,14,15 ); -+ u512 = _mm512_permutex2var_ps( umu, idx, tu4 ); -+ u512 = _mm512_mul_ps( u512, tsi3 ); -+ tr3 = _mm512_mask_add_ps( tr3, 0b0110010110100110, 
tr3, u512 ); -+ tr3 = _mm512_mask_sub_ps( tr3, 0b1001101001011001, tr3, u512 ); -+ -+ -+ -+ idx = _mm512_setr_epi32( 8,9,6,7,14,15,12,13, -+ 24,25,22,23,30,31,28,29 ); -+ tu1 = _mm512_permutex2var_ps( tu11, idx, tu21 ); -+ -+ u512 = _mm512_shuffle_ps( tu1, tu2, 0b10101010 ); -+ tr1 = _mm512_fmadd_ps( ts2, u512, tr1 ); -+ -+ u512 = _mm512_shuffle_ps( tu1, tu2, 0b11111111 ); -+ u512 = _mm512_mul_ps( u512, tsi2 ); -+ tr1 = _mm512_mask_add_ps( tr1, 0b1010101010101010, tr1, u512 ); -+ tr1 = _mm512_mask_sub_ps( tr1, 0b0101010101010101, tr1, u512 ); -+ -+ m += 4; -+ _prefetch_pauli_dble(m); -+ m -= 4; -+ -+ idx = _mm512_setr_epi32( 0,1,2,3,0,1,2,3, 16,17,18,19,16,17,18,19 ); -+ tu13 = _mm512_permutex2var_ps( tu13, idx, tu23 ); -+ idx = _mm512_setr_epi32( 6,7,2,3,8,9,14,15, 22,23,18,19,24,25,30,31 ); -+ tu2 = _mm512_permutex2var_ps( tu12, idx, tu22 ); -+ -+ idx = _mm512_setr_epi32( 6,7,16+0,16+1,16+6,16+7,0,0, -+ 14,15,16+8,16+9,16+14,16+15,0,0 ); -+ tu3 = _mm512_permutex2var_ps( tu1, idx, tu2 ); -+ tu4 = _mm512_permute_ps( tu3, 0b10100000 ); -+ u512 = _mm512_mask_shuffle_ps( tu4, 0b1111000011110000, tu4, tu13, 0b10100100 ); -+ tr2 = _mm512_fmadd_ps( ts1, u512, tr2 ); -+ -+ tu4 = _mm512_permute_ps( tu3, 0b11110101 ); -+ u512 = _mm512_mask_shuffle_ps( tu4, 0b1111000011110000, tu4, tu13, 0b11110100 ); -+ u512 = _mm512_mul_ps( u512, tsi1 ); -+ tr2 = _mm512_mask_add_ps( tr2, 0b0101010101010101, tr2, u512 ); -+ tr2 = _mm512_mask_sub_ps( tr2, 0b1010101010101010, tr2, u512 ); -+ -+ tu3 = _mm512_mask_shuffle_ps( tu2, 0b0000111100001111, tu1, tu2, 0b11100100 ); -+ u512 = _mm512_permute_ps( tu3, 0b10100000 ); -+ tr3 = _mm512_fmadd_ps( ts1, u512, tr3 ); -+ -+ u512 = _mm512_permute_ps( tu3, 0b11110101 ); -+ u512 = _mm512_mul_ps( u512, tsi1 ); -+ tr3 = _mm512_mask_add_ps( tr3, 0b1010010110100101, tr3, u512 ); -+ tr3 = _mm512_mask_sub_ps( tr3, 0b0101101001011010, tr3, u512 ); -+ -+ idx = _mm512_setr_epi32( 0,1,2,3, 4,5,16+2,16+3, 8,9,10,11,12,13,16+10,16+11 ); -+ tu3 = _mm512_permutex2var_ps( tu1, idx, tu2 ); -+ u512 = _mm512_permute_ps( tu3, 0b10100000 ); -+ tr1 = _mm512_fmadd_ps( ts3, u512, tr1 ); -+ -+ u512 = _mm512_permute_ps( tu3, 0b11110101 ); -+ u512 = _mm512_mul_ps( u512, tsi3 ); -+ tr1 = _mm512_mask_add_ps( tr1, 0b1010011010100110, tr1, u512 ); -+ tr1 = _mm512_mask_sub_ps( tr1, 0b0101100101011001, tr1, u512 ); -+ -+ -+ idx = _mm512_setr_epi32( 0,1,8,9,10,11,-1,-1, 16,17,24,25,26,27,-1,-1 ); -+ tu2 = _mm512_permutex2var_ps( tu12, idx, tu22 ); -+ -+ idx = _mm512_setr_epi32( 4,5,16+4,16+5,0,0,0,0, 12,13,16+12,16+13,0,0,0,0 ); -+ tu3 = _mm512_permutex2var_ps( tu2, idx, tu1 ); -+ u512 = _mm512_permute_ps( tu3, 0b10100000 ); -+ u512 = _mm512_mask_permute_ps( u512, 0b1111000011110000, tu13, 0b00001010 ); -+ tr2 = _mm512_fmadd_ps( ts3, u512, tr2 ); -+ -+ u512 = _mm512_permute_ps( tu3, 0b11110101 ); -+ u512 = _mm512_mask_permute_ps( u512, 0b1111000011110000, tu13, 0b01011111 ); -+ u512 = _mm512_mul_ps( u512, tsi3 ); -+ tr2 = _mm512_mask_add_ps( tr2, 0b0110010101100101, tr2, u512 ); -+ tr2 = _mm512_mask_sub_ps( tr2, 0b1001101010011010, tr2, u512 ); -+ -+ tu3 = _mm512_mask_shuffle_ps( tu2, 0b1111000011110000, tu2, tu13, 0b01000100 ); -+ u512 = _mm512_permute_ps( tu3, 0b10100000 ); -+ tr3 = _mm512_fmadd_ps( ts2, u512, tr3 ); -+ -+ u512 = _mm512_permute_ps( tu3, 0b11110101 ); -+ u512 = _mm512_mul_ps( u512, tsi2 ); -+ tr3 = _mm512_mask_add_ps( tr3, 0b1010010110100101, tr3, u512 ); -+ tr3 = _mm512_mask_sub_ps( tr3, 0b0101101001011010, tr3, u512 ); -+ -+ -+ -+ idx = _mm512_setr_epi32( 0,1,2,3, 16,17,18,19, 8,9,10,11, 
24,25,26,27 ); -+ ts1 = _mm512_permutex2var_ps( tr1, idx, tr3 ); -+ idx = _mm512_setr_epi32( 4,5,6,7, 20,21,22,23, 12,13,14,15, 28,29,30,31 ); -+ ts2 = _mm512_permutex2var_ps( tr1, idx, tr3 ); -+ ts3 = _mm512_add_ps( ts1, ts2 ); -+ -+ t256 = _mm512_castps512_ps256( ts3 ); -+ _mm256_storeu_ps( &(*r).c1.c1.re, t256 ); -+ -+ t128a = _mm512_castps512_ps128( tr2 ); -+ t128b = _mm512_extractf32x4_ps( tr2, 1 ); -+ t128b = _mm_add_ps( t128a, t128b ); -+ _mm_storeu_ps( &(*r).c2.c2.re, t128b ); -+ -+ t256 = _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd(ts3), 1 ) ); -+ _mm256_storeu_ps( &(*r2).c1.c1.re, t256 ); -+ -+ t128a = _mm512_extractf32x4_ps( tr2, 2 ); -+ t128b = _mm512_extractf32x4_ps( tr2, 3 ); -+ t128b = _mm_add_ps( t128a, t128b ); -+ _mm_storeu_ps( &(*r2).c2.c2.re, t128b ); -+} -diff --git a/modules/sw_term/avx512/pauli_avx512_asm.s b/modules/sw_term/avx512/pauli_avx512_asm.s -new file mode 100644 -index 0000000..d91d416 ---- /dev/null -+++ b/modules/sw_term/avx512/pauli_avx512_asm.s -@@ -0,0 +1,295 @@ -+# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -+# mark_description "-I../../../include -I.. -I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -+# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -+# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -+# mark_description "es -S"; -+ .file "pauli_avx512.c" -+ .text -+..TXTST0: -+# -- Begin mul_pauli2_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl mul_pauli2_avx512 -+# --- mul_pauli2_avx512(float, pauli *, spinor *, spinor *) -+mul_pauli2_avx512: -+# parameter 1: %xmm0 -+# parameter 2: %rdi -+# parameter 3: %rsi -+# parameter 4: %rdx -+..B1.1: # Preds ..B1.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_mul_pauli2_avx512.1: -+..L2: -+ #15.1 -+ movl $42410, %eax #82.9 -+ vmovups (%rdi), %zmm8 #51.27 -+ vmovups 64(%rdi), %zmm7 #52.27 -+ vmovups 144(%rdi), %zmm29 #54.27 -+ vmovups 208(%rdi), %zmm3 #55.27 -+ vmovups .L_2il0floatpacket.11(%rip), %zmm25 #69.9 -+ vmovups .L_2il0floatpacket.12(%rip), %zmm31 #71.9 -+ vmovups .L_2il0floatpacket.13(%rip), %zmm16 #74.9 -+ vbroadcastss %xmm0, %xmm14 #64.9 -+ vmovups 64(%rsi), %ymm12 #43.9 -+ vmovups (%rsi), %zmm11 #42.29 -+ vmovups .L_2il0floatpacket.8(%rip), %zmm9 #45.9 -+ vmovups .L_2il0floatpacket.14(%rip), %zmm17 #80.10 -+ vmovups .L_2il0floatpacket.15(%rip), %zmm19 #88.10 -+ vmovups .L_2il0floatpacket.17(%rip), %zmm22 #101.9 -+ vmovups .L_2il0floatpacket.9(%rip), %zmm10 #47.9 -+ vmovups .L_2il0floatpacket.10(%rip), %zmm13 #49.9 -+ vmovups .L_2il0floatpacket.16(%rip), %zmm20 #93.10 -+ vmovups .L_2il0floatpacket.18(%rip), %zmm24 #107.10 -+ vmovups .L_2il0floatpacket.19(%rip), %zmm28 #116.9 -+ vmovups 128(%rdi), %zmm2 #53.27 -+ vpermi2ps %zmm29, %zmm8, %zmm25 #69.9 -+ vpermi2ps %zmm3, %zmm7, %zmm31 #71.9 -+ vbroadcastss %xmm14, %zmm26 #65.9 -+ vpermt2ps %zmm29, %zmm28, %zmm8 #116.9 -+ vpermi2ps %zmm31, %zmm25, %zmm16 #74.9 -+ vpermi2ps %zmm25, %zmm31, %zmm19 #88.10 -+ vpermt2ps %zmm31, %zmm22, %zmm25 #101.9 -+ vpermi2ps %zmm16, %zmm26, %zmm17 #80.10 -+ vpermi2ps %zmm26, %zmm31, %zmm20 #93.10 -+ vpermt2ps %zmm25, %zmm24, %zmm26 #107.10 -+ vmovups .L_2il0floatpacket.22(%rip), %zmm28 #137.9 -+ vshufps $170, %zmm31, %zmm8, %zmm30 #118.10 -+ vshufps $255, %zmm31, %zmm8, %zmm24 #121.10 -+ 
kmovw %eax, %k2 #82.9 -+ vpermilps $160, %zmm16, %zmm15 #75.10 -+ movl $23125, %eax #83.9 -+ kmovw %eax, %k3 #83.9 -+ movl $21925, %eax #95.9 -+ kmovw %eax, %k4 #95.9 -+ vpermilps $160, %zmm25, %zmm23 #102.10 -+ movl $43610, %eax #96.9 -+ kmovw %eax, %k5 #96.9 -+ movl $26022, %eax #109.9 -+ kmovw %eax, %k6 #109.9 -+ movl $39513, %eax #110.9 -+ kmovw %eax, %k7 #110.9 -+ movl $43690, %eax #123.9 -+ vpermi2ps %zmm12, %zmm11, %zmm9 #45.9 -+ vpermi2ps %zmm12, %zmm11, %zmm10 #47.9 -+ vpermt2ps %zmm12, %zmm13, %zmm11 #49.9 -+ vmulps %zmm15, %zmm9, %zmm1 #76.9 -+ vmulps %zmm19, %zmm10, %zmm0 #89.9 -+ vmulps %zmm23, %zmm11, %zmm12 #103.9 -+ vmovups .L_2il0floatpacket.21(%rip), %zmm15 #133.9 -+ vpermilps $177, %zmm9, %zmm5 #59.10 -+ vmulps %zmm5, %zmm17, %zmm18 #81.10 -+ vmovups .L_2il0floatpacket.23(%rip), %zmm17 #158.9 -+ vpermi2ps %zmm3, %zmm7, %zmm15 #133.9 -+ vaddps %zmm18, %zmm1, %zmm1{%k2} #82.9 -+ vpermi2ps %zmm15, %zmm8, %zmm28 #137.9 -+ vsubps %zmm18, %zmm1, %zmm1{%k3} #83.9 -+ vpermi2ps %zmm15, %zmm8, %zmm17 #158.9 -+ kmovw %eax, %k3 #123.9 -+ vfmadd231ps %zmm10, %zmm30, %zmm1 #119.9 -+ vpermilps $177, %zmm11, %zmm4 #61.10 -+ movl $21845, %eax #124.9 -+ vmulps %zmm4, %zmm26, %zmm27 #108.10 -+ vmovups .L_2il0floatpacket.20(%rip), %zmm26 #131.10 -+ kmovw %eax, %k2 #124.9 -+ vpermt2ps 272(%rdi), %zmm26, %zmm2 #131.10 -+ vaddps %zmm27, %zmm12, %zmm12{%k6} #109.9 -+ vpermilps $177, %zmm10, %zmm6 #60.10 -+ movl $61680, %eax #139.10 -+ vmulps %zmm6, %zmm20, %zmm21 #94.10 -+ vmulps %zmm24, %zmm6, %zmm25 #122.10 -+ vmovups .L_2il0floatpacket.24(%rip), %zmm20 #169.9 -+ vsubps %zmm27, %zmm12, %zmm12{%k7} #110.9 -+ vaddps %zmm21, %zmm0, %zmm0{%k4} #95.9 -+ vpermt2ps %zmm3, %zmm20, %zmm7 #169.9 -+ vaddps %zmm25, %zmm1, %zmm1{%k3} #123.9 -+ vsubps %zmm21, %zmm0, %zmm0{%k5} #96.9 -+ vmovups .L_2il0floatpacket.25(%rip), %zmm3 #172.9 -+ vmovups .L_2il0floatpacket.26(%rip), %zmm21 #195.9 -+ vsubps %zmm25, %zmm1, %zmm1{%k2} #124.9 -+ vpermi2ps %zmm8, %zmm7, %zmm3 #172.9 -+ kmovw %eax, %k1 #139.10 -+ vpermilps $245, %zmm28, %zmm29 #142.9 -+ movl $3855, %eax #148.9 -+ vshufps $244, %zmm2, %zmm29, %zmm29{%k1} #143.10 -+ vshufps $68, %zmm2, %zmm7, %zmm7{%k1} #183.9 -+ kmovw %eax, %k4 #148.9 -+ vmulps %zmm29, %zmm5, %zmm30 #144.10 -+ vpermilps $160, %zmm28, %zmm27 #138.9 -+ movl $42405, %eax #154.9 -+ vshufps $164, %zmm2, %zmm27, %zmm27{%k1} #139.10 -+ vmovaps %zmm15, %zmm13 #148.9 -+ vshufps $228, %zmm15, %zmm8, %zmm13{%k4} #148.9 -+ vfmadd231ps %zmm9, %zmm27, %zmm0 #140.9 -+ vpermilps $245, %zmm13, %zmm14 #152.10 -+ vmulps %zmm14, %zmm5, %zmm5 #153.10 -+ vaddps %zmm30, %zmm0, %zmm0{%k2} #145.9 -+ kmovw %eax, %k2 #154.9 -+ vsubps %zmm30, %zmm0, %zmm0{%k3} #146.9 -+ vpermilps $160, %zmm13, %zmm31 #149.10 -+ movl $23130, %eax #155.9 -+ vpermilps $160, %zmm3, %zmm8 #173.10 -+ vfmadd213ps %zmm12, %zmm31, %zmm9 #150.9 -+ kmovw %eax, %k3 #155.9 -+ vaddps %zmm5, %zmm9, %zmm9{%k2} #154.9 -+ vpermilps $160, %zmm17, %zmm16 #159.10 -+ movl $42662, %eax #164.9 -+ vpermilps $10, %zmm2, %zmm8{%k1} #174.10 -+ vfmadd231ps %zmm11, %zmm16, %zmm1 #160.9 -+ vfmadd213ps %zmm0, %zmm8, %zmm11 #175.9 -+ vsubps %zmm5, %zmm9, %zmm9{%k3} #155.9 -+ kmovw %eax, %k5 #164.9 -+ vpermilps $245, %zmm3, %zmm0 #177.10 -+ movl $22873, %eax #165.9 -+ vpermilps $245, %zmm17, %zmm18 #162.10 -+ vpermilps $95, %zmm2, %zmm0{%k1} #178.10 -+ vpermilps $160, %zmm7, %zmm2 #184.10 -+ vpermilps $245, %zmm7, %zmm7 #187.10 -+ vmulps %zmm18, %zmm4, %zmm19 #163.10 -+ vmulps %zmm7, %zmm6, %zmm6 #188.10 -+ vmulps %zmm0, %zmm4, %zmm4 #179.10 -+ vfmadd213ps %zmm9, 
%zmm2, %zmm10 #185.9 -+ vmovups .L_2il0floatpacket.27(%rip), %zmm9 #197.9 -+ vaddps %zmm19, %zmm1, %zmm1{%k5} #164.9 -+ vaddps %zmm6, %zmm10, %zmm10{%k2} #189.9 -+ kmovw %eax, %k6 #165.9 -+ vsubps %zmm6, %zmm10, %zmm10{%k3} #190.9 -+ vsubps %zmm19, %zmm1, %zmm1{%k6} #165.9 -+ movl $25957, %eax #180.9 -+ kmovw %eax, %k7 #180.9 -+ movl $39578, %eax #181.9 -+ kmovw %eax, %k5 #181.9 -+ vaddps %zmm4, %zmm11, %zmm11{%k7} #180.9 -+ vpermi2ps %zmm10, %zmm1, %zmm21 #195.9 -+ vpermt2ps %zmm10, %zmm9, %zmm1 #197.9 -+ vsubps %zmm4, %zmm11, %zmm11{%k5} #181.9 -+ vaddps %zmm1, %zmm21, %zmm1 #198.9 -+ vextractf32x4 $1, %zmm11, %xmm10 #204.11 -+ vextractf32x4 $2, %zmm11, %xmm0 #211.11 -+ vextractf32x4 $3, %zmm11, %xmm23 #212.11 -+ vaddps %xmm11, %xmm10, %xmm22 #205.11 -+ vmovups %xmm22, 32(%rdx) #206.21 -+ vmovups %ymm1, (%rdx) #201.24 -+ vextractf64x4 $1, %zmm1, 48(%rdx) #209.24 -+ vaddps %xmm0, %xmm23, %xmm1 #213.11 -+ vmovups %xmm1, 80(%rdx) #214.21 -+ vzeroupper #215.1 -+ ret #215.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type mul_pauli2_avx512,@function -+ .size mul_pauli2_avx512,.-mul_pauli2_avx512 -+ .data -+# -- End mul_pauli2_avx512 -+ .section .rodata, "a" -+ .align 64 -+ .align 64 -+.L_2il0floatpacket.8: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000006,0x00000007,0x00000008,0x00000009,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000012,0x00000013,0x00000014,0x00000015 -+ .type .L_2il0floatpacket.8,@object -+ .size .L_2il0floatpacket.8,64 -+ .align 64 -+.L_2il0floatpacket.9: -+ .long 0x00000002,0x00000003,0x00000004,0x00000005,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000e,0x0000000f,0x00000010,0x00000011,0x00000014,0x00000015,0x00000016,0x00000017 -+ .type .L_2il0floatpacket.9,@object -+ .size .L_2il0floatpacket.9,64 -+ .align 64 -+.L_2il0floatpacket.10: -+ .long 0x00000004,0x00000005,0x00000000,0x00000001,0x0000000a,0x0000000b,0x00000006,0x00000007,0x00000010,0x00000011,0x0000000c,0x0000000d,0x00000016,0x00000017,0x00000012,0x00000013 -+ .type .L_2il0floatpacket.10,@object -+ .size .L_2il0floatpacket.10,64 -+ .align 64 -+.L_2il0floatpacket.11: -+ .long 0x00000000,0x00000001,0x0000000a,0x0000000b,0x00000004,0x00000005,0x00000002,0x00000003,0x00000010,0x00000011,0x0000001a,0x0000001b,0x00000014,0x00000015,0x00000012,0x00000013 -+ .type .L_2il0floatpacket.11,@object -+ .size .L_2il0floatpacket.11,64 -+ .align 64 -+.L_2il0floatpacket.12: -+ .long 0x00000004,0x00000005,0x00000000,0x00000001,0x0000000c,0x0000000d,0x00000006,0x00000007,0x00000014,0x00000015,0x00000010,0x00000011,0x0000001c,0x0000001d,0x00000016,0x00000017 -+ .type .L_2il0floatpacket.12,@object -+ .size .L_2il0floatpacket.12,64 -+ .align 64 -+.L_2il0floatpacket.13: -+ .long 0x00000000,0x00000000,0x00000001,0x00000001,0x00000002,0x00000003,0x00000010,0x00000011,0x00000008,0x00000008,0x00000009,0x00000009,0x0000000a,0x0000000b,0x00000018,0x00000019 -+ .type .L_2il0floatpacket.13,@object -+ .size .L_2il0floatpacket.13,64 -+ .align 64 -+.L_2il0floatpacket.14: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000015,0x00000015,0x00000017,0x00000017,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000001d,0x0000001d,0x0000001f,0x0000001f -+ .type .L_2il0floatpacket.14,@object -+ .size .L_2il0floatpacket.14,64 -+ .align 64 -+.L_2il0floatpacket.15: -+ .long 0x00000000,0x00000000,0x00000004,0x00000004,0x00000014,0x00000014,0x00000015,0x00000015,0x00000008,0x00000008,0x0000000c,0x0000000c,0x0000001c,0x0000001c,0x0000001d,0x0000001d -+ .type .L_2il0floatpacket.15,@object -+ .size 
.L_2il0floatpacket.15,64 -+ .align 64 -+.L_2il0floatpacket.16: -+ .long 0x00000001,0x00000001,0x00000005,0x00000005,0x00000014,0x00000015,0x00000016,0x00000017,0x00000009,0x00000009,0x0000000d,0x0000000d,0x0000001c,0x0000001d,0x0000001e,0x0000001f -+ .type .L_2il0floatpacket.16,@object -+ .size .L_2il0floatpacket.16,64 -+ .align 64 -+.L_2il0floatpacket.17: -+ .long 0x00000006,0x00000006,0x00000002,0x00000003,0x00000014,0x00000015,0x00000007,0x00000007,0x0000000e,0x0000000e,0x0000000a,0x0000000b,0x0000001c,0x0000001d,0x0000000f,0x0000000f -+ .type .L_2il0floatpacket.17,@object -+ .size .L_2il0floatpacket.17,64 -+ .align 64 -+.L_2il0floatpacket.18: -+ .long 0x00000000,0x00000001,0x00000013,0x00000013,0x00000015,0x00000015,0x00000006,0x00000007,0x00000008,0x00000009,0x0000001b,0x0000001b,0x0000001d,0x0000001d,0x0000000e,0x0000000f -+ .type .L_2il0floatpacket.18,@object -+ .size .L_2il0floatpacket.18,64 -+ .align 64 -+.L_2il0floatpacket.19: -+ .long 0x00000008,0x00000009,0x00000006,0x00000007,0x0000000e,0x0000000f,0x0000000c,0x0000000d,0x00000018,0x00000019,0x00000016,0x00000017,0x0000001e,0x0000001f,0x0000001c,0x0000001d -+ .type .L_2il0floatpacket.19,@object -+ .size .L_2il0floatpacket.19,64 -+ .align 64 -+.L_2il0floatpacket.20: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000000,0x00000001,0x00000002,0x00000003,0x00000010,0x00000011,0x00000012,0x00000013,0x00000010,0x00000011,0x00000012,0x00000013 -+ .type .L_2il0floatpacket.20,@object -+ .size .L_2il0floatpacket.20,64 -+ .align 64 -+.L_2il0floatpacket.21: -+ .long 0x00000006,0x00000007,0x00000002,0x00000003,0x00000008,0x00000009,0x0000000e,0x0000000f,0x00000016,0x00000017,0x00000012,0x00000013,0x00000018,0x00000019,0x0000001e,0x0000001f -+ .type .L_2il0floatpacket.21,@object -+ .size .L_2il0floatpacket.21,64 -+ .align 64 -+.L_2il0floatpacket.22: -+ .long 0x00000006,0x00000007,0x00000010,0x00000011,0x00000016,0x00000017,0x00000000,0x00000000,0x0000000e,0x0000000f,0x00000018,0x00000019,0x0000001e,0x0000001f,0x00000000,0x00000000 -+ .type .L_2il0floatpacket.22,@object -+ .size .L_2il0floatpacket.22,64 -+ .align 64 -+.L_2il0floatpacket.23: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000012,0x00000013,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000001a,0x0000001b -+ .type .L_2il0floatpacket.23,@object -+ .size .L_2il0floatpacket.23,64 -+ .align 64 -+.L_2il0floatpacket.24: -+ .long 0x00000000,0x00000001,0x00000008,0x00000009,0x0000000a,0x0000000b,0xffffffff,0xffffffff,0x00000010,0x00000011,0x00000018,0x00000019,0x0000001a,0x0000001b,0xffffffff,0xffffffff -+ .type .L_2il0floatpacket.24,@object -+ .size .L_2il0floatpacket.24,64 -+ .align 64 -+.L_2il0floatpacket.25: -+ .long 0x00000004,0x00000005,0x00000014,0x00000015,0x00000000,0x00000000,0x00000000,0x00000000,0x0000000c,0x0000000d,0x0000001c,0x0000001d,0x00000000,0x00000000,0x00000000,0x00000000 -+ .type .L_2il0floatpacket.25,@object -+ .size .L_2il0floatpacket.25,64 -+ .align 64 -+.L_2il0floatpacket.26: -+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000010,0x00000011,0x00000012,0x00000013,0x00000008,0x00000009,0x0000000a,0x0000000b,0x00000018,0x00000019,0x0000001a,0x0000001b -+ .type .L_2il0floatpacket.26,@object -+ .size .L_2il0floatpacket.26,64 -+ .align 64 -+.L_2il0floatpacket.27: -+ .long 0x00000004,0x00000005,0x00000006,0x00000007,0x00000014,0x00000015,0x00000016,0x00000017,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x0000001c,0x0000001d,0x0000001e,0x0000001f -+ .type .L_2il0floatpacket.27,@object -+ .size 
.L_2il0floatpacket.27,64 -+ .data -+ .section .note.GNU-stack, "" -+// -- Begin DWARF2 SEGMENT .eh_frame -+ .section .eh_frame,"a",@progbits -+.eh_frame_seg: -+ .align 8 -+# End -diff --git a/modules/sw_term/avx512/pauli_dble_avx512.c b/modules/sw_term/avx512/pauli_dble_avx512.c -new file mode 100644 -index 0000000..51d0af2 ---- /dev/null -+++ b/modules/sw_term/avx512/pauli_dble_avx512.c -@@ -0,0 +1,483 @@ -+/******************************************************************************* -+* -+* File pauli_avx512.c -+* -+* This software is distributed under the terms of the GNU General Public -+* License (GPL) -+* -+* AVX512 implementations of the clover term multiplication in -+* double precision. -+* -+* See ../pauli_dble_avx512.c for more information and alternative -+* implementations. -+* -+*******************************************************************************/ -+#include -+#include -+#include -+#include -+#include "su3.h" -+#include "linalg.h" -+#include "sw_term.h" -+#define DELTA 1.0e-04 -+ -+typedef union -+{ -+ spinor_dble s; -+ weyl_dble w[2]; -+ complex_dble c[12]; -+} spin_t; -+ -+#include "avx512.h" -+ -+void mul_pauli2_dble_avx512(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r) -+{ -+ double const *u = m->u, *u2 = (m+1)->u; -+ -+ __m512d r1,r2,r3; -+ __m512d s1,s2,s3,s4,s5,s6, si1,si2,si3,si4,si5,si6, s512, u512; -+ __m512d t1,t2,t3,t4,t5,t6; -+ __m512d tu11, tu12, tu13, tu14, tu15, tu21, tu22, tu23, tu24, tu25; -+ __m512d tu1, tu2, tu3, tu4, tu5, tu6; -+ __m512d umu; -+ __m512i idx; -+ -+ t1 = _mm512_loadu_pd( &(*s).c1.c1.re ); -+ t2 = _mm512_loadu_pd( &(*s).c1.c1.re + 8 ); -+ t3 = _mm512_loadu_pd( &(*s).c1.c1.re + 16 ); -+ -+ tu11 = _mm512_loadu_pd( u ); -+ tu12 = _mm512_loadu_pd( u+8 ); -+ tu13 = _mm512_loadu_pd( u+16 ); -+ tu14 = _mm512_loadu_pd( u+24 ); -+ tu15 = _mm512_loadu_pd( u+32 ); -+ tu21 = _mm512_loadu_pd( u2 ); -+ tu22 = _mm512_loadu_pd( u2+8 ); -+ tu23 = _mm512_loadu_pd( u2+16 ); -+ tu24 = _mm512_loadu_pd( u2+24 ); -+ tu25 = _mm512_loadu_pd( u2+32 ); -+ -+ umu = _mm512_loadu_pd( &mu ); -+ -+ -+ idx = _mm512_setr_epi64( 0,1,12,13,0,1,12,13 ); -+ s1 = _mm512_permutex2var_pd( t1, idx, t2 ); -+ idx = _mm512_setr_epi64( 2,3,14,15,2,3,14,15 ); -+ s2 = _mm512_permutex2var_pd( t1, idx, t2 ); -+ idx = _mm512_setr_epi64( 4,5,8,9,4,5,8,9 ); -+ s3 = _mm512_permutex2var_pd( t1, idx, t3 ); -+ idx = _mm512_setr_epi64( 0,1,12,13,0,1,12,13 ); -+ s4 = _mm512_permutex2var_pd( t2, idx, t3 ); -+ idx = _mm512_setr_epi64( 6,7,10,11,6,7,10,11 ); -+ s5 = _mm512_permutex2var_pd( t1, idx, t3 ); -+ idx = _mm512_setr_epi64( 2,3,14,15,2,3,14,15 ); -+ s6 = _mm512_permutex2var_pd( t2, idx, t3 ); -+ -+ si1 = _mm512_permute_pd ( s1, 0b01010101 ); -+ si2 = _mm512_permute_pd ( s2, 0b01010101 ); -+ si3 = _mm512_permute_pd ( s3, 0b01010101 ); -+ si4 = _mm512_permute_pd ( s4, 0b01010101 ); -+ si5 = _mm512_permute_pd ( s5, 0b01010101 ); -+ si6 = _mm512_permute_pd ( s6, 0b01010101 ); -+ -+ -+ idx = _mm512_setr_epi64( 0,1,8,9,6,7,14,15 ); -+ tu1 = _mm512_permutex2var_pd( tu11, idx, tu21 ); -+ idx = _mm512_setr_epi64( 4,5,12,13,2,3,10,11 ); -+ tu2 = _mm512_permutex2var_pd( tu12, idx, tu22 ); -+ -+ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); -+ u512 = _mm512_permutex2var_pd( tu1, idx, tu2 ); -+ r1 = _mm512_mul_pd( u512, s1 ); -+ idx = _mm512_setr_epi64( 0,0,0,0,8+5,8+5,8+7,8+7 ); -+ u512 = _mm512_permutex2var_pd( umu, idx, tu2 ); -+ u512 = _mm512_mul_pd( u512, si1 ); -+ r1 = _mm512_mask_add_pd( r1, 0b01010110, r1, u512 ); -+ r1 = _mm512_mask_sub_pd( r1, 0b10101001, r1, u512 ); -+ 
-+ idx = _mm512_setr_epi64( 2,3,10,11,4,5,12,13 ); -+ tu6 = _mm512_permutex2var_pd( tu11, idx, tu21 ); -+ -+ idx = _mm512_setr_epi64( 4,4,6,6,8+1,8+1,8+3,8+3 ); -+ u512 = _mm512_permutex2var_pd( tu2, idx, tu6 ); -+ r1 = _mm512_fmadd_pd( u512, s5, r1 ); -+ idx = _mm512_setr_epi64( 5,5,7,7,8,8,8,8 ); -+ u512 = _mm512_permutex2var_pd( tu2, idx, umu ); -+ u512 = _mm512_mul_pd( u512, si5 ); -+ r1 = _mm512_mask_add_pd( r1, 0b01101010, r1, u512 ); -+ r1 = _mm512_mask_sub_pd( r1, 0b10010101, r1, u512 ); -+ -+ -+ idx = _mm512_setr_epi64( 2,3,8+2,8+3,4,5,8+4,8+5 ); -+ tu3 = _mm512_permutex2var_pd( tu13, idx, tu23 ); -+ -+ idx = _mm512_setr_epi64( 1,1,3,3,8+4,8+4,8+6,8+6 ); -+ u512 = _mm512_permutex2var_pd( tu1, idx, tu3 ); -+ r2 = _mm512_mul_pd( u512, s2 ); -+ idx = _mm512_setr_epi64( 0,0,0,0,8+5,8+5,8+7,8+7 ); -+ u512 = _mm512_permutex2var_pd( umu, idx, tu3 ); -+ u512 = _mm512_mul_pd( u512, si2 ); -+ r2 = _mm512_mask_add_pd( r2, 0b01010110, r2, u512 ); -+ r2 = _mm512_mask_sub_pd( r2, 0b10101001, r2, u512 ); -+ -+ idx = _mm512_setr_epi64( 4,4,6,6,8+4,8+4,8+6,8+6 ); -+ u512 = _mm512_permutex2var_pd( tu3, idx, tu6 ); -+ r2 = _mm512_fmadd_pd( u512, s4, r2 ); -+ idx = _mm512_setr_epi64( 5,5,7,7,8,8,8,8 ); -+ u512 = _mm512_permutex2var_pd( tu3, idx, umu ); -+ u512 = _mm512_mul_pd( u512, si4 ); -+ r2 = _mm512_mask_add_pd( r2, 0b01101010, r2, u512 ); -+ r2 = _mm512_mask_sub_pd( r2, 0b10010101, r2, u512 ); -+ -+ -+ idx = _mm512_setr_epi64( 4,5,8+4,8+5,6,7,8+6,8+7 ); -+ tu4 = _mm512_permutex2var_pd( tu14, idx, tu24 ); -+ -+ idx = _mm512_setr_epi64( 0,0,2,2,8+0,8+0,8+2,8+2 ); -+ u512 = _mm512_permutex2var_pd( tu6, idx, tu4 ); -+ r3 = _mm512_mul_pd( u512, s3 ); -+ idx = _mm512_setr_epi64( 0,0,0,0,8+1,8+1,8+3,8+3 ); -+ u512 = _mm512_permutex2var_pd( umu, idx, tu4 ); -+ u512 = _mm512_mul_pd( u512, si3 ); -+ r3 = _mm512_mask_add_pd( r3, 0b01010110, r3, u512 ); -+ r3 = _mm512_mask_sub_pd( r3, 0b10101001, r3, u512 ); -+ -+ idx = _mm512_setr_epi64( 0,0,2,2,8+5,8+5,8+7,8+7 ); -+ u512 = _mm512_permutex2var_pd( tu4, idx, tu6 ); -+ r3 = _mm512_fmadd_pd( u512, s6, r3 ); -+ idx = _mm512_setr_epi64( 1,1,3,3,8,8,8,8 ); -+ u512 = _mm512_permutex2var_pd( tu4, idx, umu ); -+ u512 = _mm512_mul_pd( u512, si6 ); -+ r3 = _mm512_mask_add_pd( r3, 0b01101010, r3, u512 ); -+ r3 = _mm512_mask_sub_pd( r3, 0b10010101, r3, u512 ); -+ -+ -+ idx = _mm512_setr_epi64( 4,4,6,6,8,8,8+2,8+2 ); -+ u512 = _mm512_permutex2var_pd( tu1, idx, tu2 ); -+ r2 = _mm512_fmadd_pd( u512, s1, r2 ); -+ idx = _mm512_setr_epi64( 5,5,7,7,9,9,8+3,8+3 ); -+ u512 = _mm512_permutex2var_pd( tu1, idx, tu2 ); -+ u512 = _mm512_mul_pd( u512, si1 ); -+ r2 = _mm512_mask_add_pd( r2, 0b01010101, r2, u512 ); -+ r2 = _mm512_mask_sub_pd( r2, 0b10101010, r2, u512 ); -+ -+ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); -+ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); -+ r1 = _mm512_fmadd_pd( u512, s4, r1 ); -+ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); -+ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); -+ u512 = _mm512_mul_pd( u512, si4 ); -+ r1 = _mm512_mask_add_pd( r1, 0b10101010, r1, u512 ); -+ r1 = _mm512_mask_sub_pd( r1, 0b01010101, r1, u512 ); -+ -+ -+ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); -+ u512 = _mm512_permutex2var_pd( tu1, idx, tu3 ); -+ r1 = _mm512_fmadd_pd( u512, s2, r1 ); -+ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); -+ u512 = _mm512_permutex2var_pd( tu1, idx, tu3 ); -+ u512 = _mm512_mul_pd( u512, si2 ); -+ r1 = _mm512_mask_add_pd( r1, 0b01011010, r1, u512 ); -+ r1 = _mm512_mask_sub_pd( r1, 0b10100101, r1, u512 ); -+ -+ idx = 
_mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); -+ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); -+ r2 = _mm512_fmadd_pd( u512, s5, r2 ); -+ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); -+ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); -+ u512 = _mm512_mul_pd( u512, si5 ); -+ r2 = _mm512_mask_add_pd( r2, 0b01011010, r2, u512 ); -+ r2 = _mm512_mask_sub_pd( r2, 0b10100101, r2, u512 ); -+ -+ idx = _mm512_setr_epi64( 0,0,8,8,6,6,14,14 ); -+ u512 = _mm512_permutex2var_pd( tu12, idx, tu22 ); -+ r3 = _mm512_fmadd_pd( u512, s1, r3 ); -+ idx = _mm512_setr_epi64( 1,1,9,9,7,7,15,15 ); -+ u512 = _mm512_permutex2var_pd( tu12, idx, tu22 ); -+ u512 = _mm512_mul_pd( u512, si1 ); -+ r3 = _mm512_mask_add_pd( r3, 0b01010101, r3, u512 ); -+ r3 = _mm512_mask_sub_pd( r3, 0b10101010, r3, u512 ); -+ -+ -+ idx = _mm512_setr_epi64( 0,0,8,8,6,6,14,14 ); -+ u512 = _mm512_permutex2var_pd( tu13, idx, tu23 ); -+ r3 = _mm512_fmadd_pd( u512, s2, r3 ); -+ idx = _mm512_setr_epi64( 1,1,9,9,7,7,15,15 ); -+ u512 = _mm512_permutex2var_pd( tu13, idx, tu23 ); -+ u512 = _mm512_mul_pd( u512, si2 ); -+ r3 = _mm512_mask_add_pd( r3, 0b01010101, r3, u512 ); -+ r3 = _mm512_mask_sub_pd( r3, 0b10101010, r3, u512 ); -+ -+ -+ -+ idx = _mm512_setr_epi64( 0,1,8+0,8+1,6,7,8+6,8+7 ); -+ tu2 = _mm512_permutex2var_pd( tu12, idx, tu22 ); -+ idx = _mm512_setr_epi64( 2,3,8+2,8+3,0,1,8+0,8+1 ); -+ tu4 = _mm512_permutex2var_pd( tu14, idx, tu24 ); -+ -+ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); -+ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); -+ r1 = _mm512_fmadd_pd( u512, s3, r1 ); -+ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); -+ u512 = _mm512_permutex2var_pd( tu2, idx, tu4 ); -+ u512 = _mm512_mul_pd( u512, si3 ); -+ r1 = _mm512_mask_add_pd( r1, 0b01011010, r1, u512 ); -+ r1 = _mm512_mask_sub_pd( r1, 0b10100101, r1, u512 ); -+ -+ -+ idx = _mm512_setr_epi64( 0,1,8+0,8+1,2,3,8+2,8+3 ); -+ tu5 = _mm512_permutex2var_pd( tu15, idx, tu25 ); -+ -+ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); -+ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); -+ r3 = _mm512_fmadd_pd( u512, s5, r3 ); -+ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); -+ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); -+ u512 = _mm512_mul_pd( u512, si5 ); -+ r3 = _mm512_mask_add_pd( r3, 0b01011010, r3, u512 ); -+ r3 = _mm512_mask_sub_pd( r3, 0b10100101, r3, u512 ); -+ -+ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); -+ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); -+ r3 = _mm512_fmadd_pd( u512, s4, r3 ); -+ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); -+ u512 = _mm512_permutex2var_pd( tu4, idx, tu5 ); -+ u512 = _mm512_mul_pd( u512, si4 ); -+ r3 = _mm512_mask_add_pd( r3, 0b01011010, r3, u512 ); -+ r3 = _mm512_mask_sub_pd( r3, 0b10100101, r3, u512 ); -+ -+ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); -+ u512 = _mm512_permutex2var_pd( tu2, idx, tu5 ); -+ r1 = _mm512_fmadd_pd( u512, s6, r1 ); -+ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); -+ u512 = _mm512_permutex2var_pd( tu2, idx, tu5 ); -+ u512 = _mm512_mul_pd( u512, si6 ); -+ r1 = _mm512_mask_add_pd( r1, 0b10101010, r1, u512 ); -+ r1 = _mm512_mask_sub_pd( r1, 0b01010101, r1, u512 ); -+ -+ -+ idx = _mm512_setr_epi64( 6,7,8+6,8+7,0,1,8+0,8+1 ); -+ tu3 = _mm512_permutex2var_pd( tu13, idx, tu23 ); -+ -+ idx = _mm512_setr_epi64( 4,4,6,6,8+0,8+0,8+2,8+2 ); -+ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); -+ r2 = _mm512_fmadd_pd( u512, s3, r2 ); -+ idx = _mm512_setr_epi64( 5,5,7,7,8+1,8+1,8+3,8+3 ); -+ u512 = _mm512_permutex2var_pd( tu3, idx, tu4 ); -+ u512 = _mm512_mul_pd( u512, 
si3 ); -+ r2 = _mm512_mask_add_pd( r2, 0b01011010, r2, u512 ); -+ r2 = _mm512_mask_sub_pd( r2, 0b10100101, r2, u512 ); -+ -+ idx = _mm512_setr_epi64( 0,0,2,2,8+4,8+4,8+6,8+6 ); -+ u512 = _mm512_permutex2var_pd( tu3, idx, tu5 ); -+ r2 = _mm512_fmadd_pd( u512, s6, r2 ); -+ idx = _mm512_setr_epi64( 1,1,3,3,8+5,8+5,8+7,8+7 ); -+ u512 = _mm512_permutex2var_pd( tu3, idx, tu5 ); -+ u512 = _mm512_mul_pd( u512, si6 ); -+ r2 = _mm512_mask_add_pd( r2, 0b10101010, r2, u512 ); -+ r2 = _mm512_mask_sub_pd( r2, 0b01010101, r2, u512 ); -+ -+ idx = _mm512_setr_epi64( 0,1,8,9,2,3,10,11 ); -+ t1 = _mm512_permutex2var_pd( r1, idx, r2 ); -+ idx = _mm512_setr_epi64( 2,3,14,15,0,1,12,13 ); -+ t2 = _mm512_permutex2var_pd( r3, idx, r1 ); -+ idx = _mm512_setr_epi64( 4,5,12,13,6,7,14,15 ); -+ t3 = _mm512_permutex2var_pd( r2, idx, r3 ); -+ r1 = _mm512_mask_blend_pd( 0b11110000, t1, t2 ); -+ r2 = _mm512_mask_blend_pd( 0b11110000, t3, t1 ); -+ r3 = _mm512_mask_blend_pd( 0b11110000, t2, t3 ); -+ -+ _mm512_storeu_pd( &r[0].c1.c1.re, r1 ); -+ _mm512_storeu_pd( &r[0].c2.c2.re, r2 ); -+ _mm512_storeu_pd( &r[1].c1.c3.re, r3 ); -+} -+ -+ -+int fwd_house_avx512(double eps, complex_dble *aa, complex_dble *dd, double * rr ) -+{ -+ int i, j, k, ifail; -+ double r1, r2, r3; -+ complex_dble z; -+ -+ ifail = 0; -+ -+ for (k = 0; k < 5; k++) { -+ r1 = aa[6 * k + k].re * aa[6 * k + k].re + -+ aa[6 * k + k].im * aa[6 * k + k].im; -+ r2 = sqrt(r1); -+ -+ for (j = (k + 1); j < 6; j++) -+ r1 += (aa[6 * j + k].re * aa[6 * j + k].re + -+ aa[6 * j + k].im * aa[6 * j + k].im); -+ -+ if (r1 >= eps) -+ r1 = sqrt(r1); -+ else { -+ ifail = 1; -+ r1 = 1.0; -+ } -+ -+ if (r2 >= (DBL_EPSILON * r1)) { -+ r3 = 1.0 / r2; -+ z.re = r3 * aa[6 * k + k].re; -+ z.im = r3 * aa[6 * k + k].im; -+ } else { -+ z.re = 1.0; -+ z.im = 0.0; -+ } -+ -+ aa[6 * k + k].re += r1 * z.re; -+ aa[6 * k + k].im += r1 * z.im; -+ -+ r3 = 1.0 / (r1 * (r1 + r2)); -+ rr[k] = r3; -+ dd[k].re = -(r1 + r2) * r3 * z.re; -+ dd[k].im = (r1 + r2) * r3 * z.im; -+ -+ for (j = (k + 1); j < 6; j++) { -+ complex_dble z, *ak, *aj; -+ __m128d mz, t1, t2, t3; -+ mz = _mm_setzero_pd(); -+ -+ ak = aa + 6 * k + k; -+ aj = aa + 6 * k + j; -+ -+ for (i = k; i < 6; i++) { -+ t1 = _mm_loaddup_pd(&ak->re); -+ t2 = _mm_loaddup_pd(&ak->im); -+ t3 = _mm_load_pd(&aj->re); -+ t2 = _mm_mul_pd( t2, t3 ); -+ t2 = _mm_permute_pd( t2, 0b01 ); -+ t1 = _mm_fmsubadd_pd( t1, t3, t2 ); -+ mz = _mm_add_pd( mz, t1 ); -+ ak += 6; -+ aj += 6; -+ } -+ -+ t1 = _mm_loaddup_pd(&r3); -+ mz = _mm_mul_pd( mz, t1 ); -+ _mm_storeu_pd( &z.re, mz ); -+ -+ ak = aa + 6 * k + k; -+ aj = aa + 6 * k + j; -+ for (i = k; i < 6; i++) { -+ t1 = _mm_loaddup_pd(&ak->re); -+ t2 = _mm_loaddup_pd(&ak->im); -+ t3 = _mm_load_pd(&aj->re); -+ t2 = _mm_mul_pd( mz, t2 ); -+ t2 = _mm_permute_pd( t2, 0b01 ); -+ t1 = _mm_fmaddsub_pd( mz,t1, t2 ); -+ t3 = _mm_sub_pd( t3, t1 ); -+ _mm_storeu_pd( &aj->re, t3 ); -+ ak += 6; -+ aj += 6; -+ } -+ } -+ } -+ -+ r1 = aa[35].re * aa[35].re + aa[35].im * aa[35].im; -+ -+ if (r1 >= eps) -+ r1 = 1.0 / r1; -+ else { -+ ifail = 1; -+ r1 = 1.0; -+ } -+ -+ dd[5].re = r1 * aa[35].re; -+ dd[5].im = -r1 * aa[35].im; -+ -+ return ifail; -+} -+ -+ -+void solv_sys_avx512( complex_dble *aa, complex_dble *dd ) -+{ -+ int i, j, k; -+ complex_dble z; -+ __m128d mz, t1, t2, t3; -+ -+ for (k = 5; k > 0; k--) { -+ for (i = (k - 1); i >= 0; i--) { -+ t1 = _mm_loaddup_pd(&aa[6 * i + k].re); -+ t2 = _mm_loaddup_pd(&aa[6 * i + k].im); -+ t3 = _mm_load_pd(&dd[k].re); -+ t2 = _mm_mul_pd( t2, t3 ); -+ t2 = _mm_permute_pd( t2, 0b01 ); -+ mz = 
_mm_fmaddsub_pd( t1, t3, t2 ); -+ -+ for (j = (k - 1); j > i; j--) { -+ t1 = _mm_loaddup_pd(&aa[6 * i + j].re); -+ t2 = _mm_loaddup_pd(&aa[6 * i + j].im); -+ t3 = _mm_load_pd(&aa[6 * j + k].re); -+ t2 = _mm_mul_pd( t2, t3 ); -+ t2 = _mm_permute_pd( t2, 0b01 ); -+ t1 = _mm_fmaddsub_pd( t1, t3, t2 ); -+ mz = _mm_add_pd( mz, t1 ); -+ } -+ -+ t1 = _mm_loaddup_pd(&dd[i].re); -+ t2 = _mm_loaddup_pd(&dd[i].im); -+ t2 = _mm_mul_pd( t2, mz ); -+ t2 = _mm_permute_pd( t2, 0b01 ); -+ t1 = _mm_fmaddsub_pd( t1, mz, t2 ); -+ t1 = _mm_sub_pd( _mm_setzero_pd(), t1); /* this line flips the sign of t1 */ -+ _mm_storeu_pd( &aa[6 * i + k].re, t1 ); -+ } -+ } -+} -+ -+void bck_house_avx512( complex_dble *aa, complex_dble *dd, double * rr ) -+{ -+ int i, j, k; -+ complex_dble z; -+ -+ aa[35].re = dd[5].re; -+ aa[35].im = dd[5].im; -+ -+ for (k = 4; k >= 0; k--) { -+ z.re = dd[k].re; -+ z.im = dd[k].im; -+ dd[k].re = aa[6 * k + k].re; -+ dd[k].im = aa[6 * k + k].im; -+ aa[6 * k + k].re = z.re; -+ aa[6 * k + k].im = z.im; -+ -+ for (j = (k + 1); j < 6; j++) { -+ dd[j].re = aa[6 * j + k].re; -+ dd[j].im = aa[6 * j + k].im; -+ aa[6 * j + k].re = 0.0; -+ aa[6 * j + k].im = 0.0; -+ } -+ -+ for (i = 0; i < 6; i++) { -+ __m128d mz, t1, t2, t3; -+ mz = _mm_setzero_pd(); -+ -+ for (j = k; j < 6; j++) { -+ t1 = _mm_loaddup_pd(&aa[6 * i + j].re); -+ t2 = _mm_loaddup_pd(&aa[6 * i + j].im); -+ t3 = _mm_load_pd(&dd[j].re); -+ t2 = _mm_mul_pd( t2, t3 ); -+ t2 = _mm_permute_pd( t2, 0b01 ); -+ t1 = _mm_fmaddsub_pd( t1, t3, t2 ); -+ mz = _mm_add_pd( mz, t1 ); -+ } -+ -+ t1 = _mm_loaddup_pd( rr+k ); -+ mz = _mm_mul_pd( mz, t1 ); -+ -+ for (j = k; j < 6; j++) { -+ t1 = _mm_loaddup_pd( &dd[j].re ); -+ t2 = _mm_loaddup_pd( &dd[j].im ); -+ t2 = _mm_mul_pd( t2, mz ); -+ t2 = _mm_permute_pd( t2, 0b01 ); -+ t1 = _mm_fmsubadd_pd( t1, mz, t2 ); -+ -+ t2 = _mm_load_pd( &aa[6 * i + j].re ); -+ t1 = _mm_sub_pd( t2, t1 ); -+ _mm_storeu_pd( &aa[6 * i + j].re, t1 ); -+ } -+ } -+ } -+} -diff --git a/modules/sw_term/avx512/pauli_dble_avx512_asm.s b/modules/sw_term/avx512/pauli_dble_avx512_asm.s -new file mode 100644 -index 0000000..a3fbdf6 ---- /dev/null -+++ b/modules/sw_term/avx512/pauli_dble_avx512_asm.s -@@ -0,0 +1,1235 @@ -+# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -+# mark_description "-I../../../include -I.. 
-I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -+# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -+# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -+# mark_description "es -S"; -+ .file "pauli_dble_avx512.c" -+ .text -+..TXTST0: -+# -- Begin mul_pauli2_dble_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl mul_pauli2_dble_avx512 -+# --- mul_pauli2_dble_avx512(double, pauli_dble *, weyl_dble *, weyl_dble *) -+mul_pauli2_dble_avx512: -+# parameter 1: %xmm0 -+# parameter 2: %rdi -+# parameter 3: %rsi -+# parameter 4: %rdx -+..B1.1: # Preds ..B1.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_mul_pauli2_dble_avx512.1: -+..L2: -+ #20.1 -+ pushq %rbp #20.1 -+ .cfi_def_cfa_offset 16 -+ movq %rsp, %rbp #20.1 -+ .cfi_def_cfa 6, 16 -+ .cfi_offset 6, -16 -+ movl $86, %eax #81.8 -+ vmovups .L_2il0floatpacket.9(%rip), %zmm17 #49.9 -+ vmovups .L_2il0floatpacket.10(%rip), %zmm9 #51.9 -+ vmovups (%rsi), %zmm22 #31.28 -+ vmovups 64(%rsi), %zmm16 #32.28 -+ vmovups 128(%rsi), %zmm14 #33.28 -+ vmovups .L_2il0floatpacket.13(%rip), %zmm5 #70.9 -+ vmovups (%rdi), %zmm19 #35.27 -+ vmovups 64(%rdi), %zmm29 #36.27 -+ vmovups 288(%rdi), %zmm21 #40.27 -+ vmovups 352(%rdi), %zmm6 #41.27 -+ vmovups .L_2il0floatpacket.14(%rip), %zmm23 #73.9 -+ vmovsd %xmm0, -16(%rbp) #20.1 -+ vmovups .L_2il0floatpacket.16(%rip), %zmm8 #78.9 -+ vmovups -16(%rbp), %zmm13 #46.27 -+ vmovups .L_2il0floatpacket.11(%rip), %zmm11 #54.8 -+ vmovups .L_2il0floatpacket.12(%rip), %zmm20 #57.9 -+ vmovups 128(%rdi), %zmm30 #37.27 -+ vmovups 416(%rdi), %zmm10 #42.27 -+ vmovups 480(%rdi), %zmm7 #43.27 -+ vmovups 192(%rdi), %zmm27 #38.27 -+ vmovups 256(%rdi), %zmm28 #39.27 -+ vpermi2pd %zmm6, %zmm29, %zmm23 #73.9 -+ vpermi2pd %zmm14, %zmm22, %zmm11 #54.8 -+ kmovw %eax, %k1 #81.8 -+ vmovaps %zmm17, %zmm2 #50.8 -+ movl $169, %eax #82.8 -+ vmovaps %zmm9, %zmm1 #52.8 -+ vpermi2pd %zmm16, %zmm22, %zmm2 #50.8 -+ vpermi2pd %zmm16, %zmm22, %zmm1 #52.8 -+ vpermi2pd %zmm14, %zmm16, %zmm17 #56.8 -+ vpermt2pd %zmm14, %zmm9, %zmm16 #60.8 -+ vpermt2pd %zmm14, %zmm20, %zmm22 #58.8 -+ vmovups .L_2il0floatpacket.15(%rip), %zmm9 #75.9 -+ kmovw %eax, %k2 #82.8 -+ vmovaps %zmm5, %zmm25 #71.9 -+ movl $106, %eax #93.8 -+ vpermi2pd %zmm21, %zmm19, %zmm25 #71.9 -+ kmovw %eax, %k3 #93.8 -+ vmovaps %zmm9, %zmm31 #76.10 -+ movl $149, %eax #94.8 -+ vmovaps %zmm8, %zmm26 #79.10 -+ vpermi2pd %zmm23, %zmm25, %zmm31 #76.10 -+ vpermi2pd %zmm23, %zmm13, %zmm26 #79.10 -+ kmovw %eax, %k4 #94.8 -+ vmulpd %zmm2, %zmm31, %zmm24 #77.8 -+ vmovups .L_2il0floatpacket.18(%rip), %zmm31 #88.10 -+ vpermilpd $85, %zmm2, %zmm4 #62.9 -+ movl $85, %eax #147.8 -+ vmulpd %zmm4, %zmm26, %zmm12 #80.10 -+ vmovups .L_2il0floatpacket.17(%rip), %zmm26 #84.9 -+ vaddpd %zmm12, %zmm24, %zmm24{%k1} #81.8 -+ vpermt2pd %zmm21, %zmm26, %zmm19 #85.9 -+ vpermi2pd %zmm10, %zmm30, %zmm26 #98.9 -+ vsubpd %zmm12, %zmm24, %zmm24{%k2} #82.8 -+ vpermi2pd %zmm19, %zmm23, %zmm31 #88.10 -+ vpermi2pd %zmm26, %zmm13, %zmm8 #104.10 -+ vmovups .L_2il0floatpacket.19(%rip), %zmm12 #90.9 -+ vfmadd213pd %zmm24, %zmm22, %zmm31 #89.8 -+ vmovaps %zmm12, %zmm21 #91.10 -+ vpermi2pd %zmm13, %zmm23, %zmm21 #91.10 -+ vpermi2pd %zmm13, %zmm26, %zmm12 #113.10 -+ vpermilpd $85, %zmm22, %zmm15 #66.9 -+ vmulpd %zmm15, %zmm21, %zmm0 #92.10 -+ vmovups .L_2il0floatpacket.20(%rip), %zmm21 #101.10 -+ vaddpd %zmm0, %zmm31, 
%zmm31{%k3} #93.8 -+ vpermi2pd %zmm26, %zmm25, %zmm21 #101.10 -+ vsubpd %zmm0, %zmm31, %zmm31{%k4} #94.8 -+ vmulpd %zmm1, %zmm21, %zmm21 #102.8 -+ vpermilpd $85, %zmm1, %zmm3 #63.9 -+ vmulpd %zmm3, %zmm8, %zmm24 #105.10 -+ vmovups .L_2il0floatpacket.21(%rip), %zmm8 #110.10 -+ vaddpd %zmm24, %zmm21, %zmm21{%k1} #106.8 -+ vpermi2pd %zmm19, %zmm26, %zmm8 #110.10 -+ vsubpd %zmm24, %zmm21, %zmm21{%k2} #107.8 -+ vmovups .L_2il0floatpacket.23(%rip), %zmm24 #123.10 -+ vfmadd213pd %zmm21, %zmm17, %zmm8 #111.8 -+ vpermilpd $85, %zmm17, %zmm18 #65.9 -+ vmulpd %zmm18, %zmm12, %zmm21 #114.10 -+ vmovups .L_2il0floatpacket.24(%rip), %zmm12 #126.10 -+ vaddpd %zmm21, %zmm8, %zmm8{%k3} #115.8 -+ vsubpd %zmm21, %zmm8, %zmm8{%k4} #116.8 -+ vmovups .L_2il0floatpacket.22(%rip), %zmm21 #119.9 -+ vmovaps %zmm21, %zmm0 #120.9 -+ vpermi2pd %zmm7, %zmm27, %zmm0 #120.9 -+ vpermi2pd %zmm0, %zmm19, %zmm24 #123.10 -+ vpermi2pd %zmm0, %zmm13, %zmm12 #126.10 -+ vmulpd %zmm11, %zmm24, %zmm24 #124.8 -+ vpermilpd $85, %zmm11, %zmm14 #64.9 -+ vmulpd %zmm14, %zmm12, %zmm12 #127.10 -+ vaddpd %zmm12, %zmm24, %zmm24{%k1} #128.8 -+ kmovw %eax, %k1 #147.8 -+ vsubpd %zmm12, %zmm24, %zmm24{%k2} #129.8 -+ vmovups .L_2il0floatpacket.25(%rip), %zmm12 #132.10 -+ vpermi2pd %zmm19, %zmm0, %zmm12 #132.10 -+ movl $170, %eax #148.8 -+ vmovups .L_2il0floatpacket.26(%rip), %zmm19 #135.10 -+ vfmadd213pd %zmm24, %zmm16, %zmm12 #133.8 -+ vpermi2pd %zmm13, %zmm0, %zmm19 #135.10 -+ kmovw %eax, %k7 #148.8 -+ vpermilpd $85, %zmm16, %zmm20 #67.9 -+ movl $90, %eax #166.8 -+ vmulpd %zmm20, %zmm19, %zmm13 #136.10 -+ vmovups .L_2il0floatpacket.27(%rip), %zmm19 #141.9 -+ kmovw %eax, %k5 #166.8 -+ vaddpd %zmm13, %zmm12, %zmm12{%k3} #137.8 -+ vsubpd %zmm13, %zmm12, %zmm12{%k4} #138.8 -+ movl $165, %eax #167.8 -+ kmovw %eax, %k6 #167.8 -+ vmovaps %zmm19, %zmm13 #142.10 -+ movl $240, %eax #272.8 -+ vpermi2pd %zmm23, %zmm25, %zmm13 #142.10 -+ kmovw %eax, %k2 #272.8 -+ vfmadd213pd %zmm8, %zmm2, %zmm13 #143.8 -+ vmovups .L_2il0floatpacket.28(%rip), %zmm8 #144.9 -+ vmovaps %zmm8, %zmm24 #145.10 -+ vpermi2pd %zmm23, %zmm25, %zmm24 #145.10 -+ vmulpd %zmm24, %zmm4, %zmm24 #146.10 -+ vaddpd %zmm24, %zmm13, %zmm13{%k1} #147.8 -+ vsubpd %zmm24, %zmm13, %zmm13{%k7} #148.8 -+ vmovaps %zmm9, %zmm24 #151.10 -+ vpermi2pd %zmm0, %zmm23, %zmm24 #151.10 -+ vfmadd213pd %zmm31, %zmm17, %zmm24 #152.8 -+ vmovups .L_2il0floatpacket.29(%rip), %zmm31 #153.9 -+ vpermt2pd %zmm0, %zmm31, %zmm23 #154.10 -+ vmulpd %zmm23, %zmm18, %zmm23 #155.10 -+ vaddpd %zmm23, %zmm24, %zmm24{%k7} #156.8 -+ vsubpd %zmm23, %zmm24, %zmm24{%k1} #157.8 -+ vmovaps %zmm19, %zmm23 #161.10 -+ vpermi2pd %zmm26, %zmm25, %zmm23 #161.10 -+ vpermt2pd %zmm26, %zmm8, %zmm25 #164.10 -+ vfmadd213pd %zmm24, %zmm1, %zmm23 #162.8 -+ vmulpd %zmm25, %zmm3, %zmm25 #165.10 -+ vaddpd %zmm25, %zmm23, %zmm23{%k5} #166.8 -+ vsubpd %zmm25, %zmm23, %zmm23{%k6} #167.8 -+ vmovaps %zmm9, %zmm25 #170.10 -+ vpermi2pd %zmm0, %zmm26, %zmm25 #170.10 -+ vpermt2pd %zmm0, %zmm31, %zmm26 #173.10 -+ vfmadd213pd %zmm13, %zmm22, %zmm25 #171.8 -+ vmulpd %zmm26, %zmm15, %zmm26 #174.10 -+ vaddpd %zmm26, %zmm25, %zmm25{%k5} #175.8 -+ vsubpd %zmm26, %zmm25, %zmm25{%k6} #176.8 -+ vmovups .L_2il0floatpacket.30(%rip), %zmm26 #178.9 -+ vmovaps %zmm26, %zmm0 #179.10 -+ vpermi2pd %zmm6, %zmm29, %zmm0 #179.10 -+ vpermi2pd %zmm10, %zmm30, %zmm26 #189.10 -+ vfmadd213pd %zmm12, %zmm2, %zmm0 #180.8 -+ vmovups .L_2il0floatpacket.31(%rip), %zmm12 #181.9 -+ vmovaps %zmm12, %zmm2 #182.10 -+ vpermi2pd %zmm6, %zmm29, %zmm2 #182.10 -+ vpermi2pd %zmm10, %zmm30, %zmm12 
#192.10 -+ vpermt2pd %zmm6, %zmm5, %zmm29 #200.9 -+ vmulpd %zmm2, %zmm4, %zmm4 #183.10 -+ vaddpd %zmm4, %zmm0, %zmm0{%k1} #184.8 -+ vsubpd %zmm4, %zmm0, %zmm0{%k7} #185.8 -+ vfmadd213pd %zmm0, %zmm1, %zmm26 #190.8 -+ vmulpd %zmm12, %zmm3, %zmm1 #193.10 -+ vmovups .L_2il0floatpacket.32(%rip), %zmm3 #201.9 -+ vaddpd %zmm1, %zmm26, %zmm26{%k1} #194.8 -+ vpermt2pd %zmm7, %zmm3, %zmm27 #202.9 -+ vmovups .L_2il0floatpacket.33(%rip), %zmm7 #214.9 -+ vsubpd %zmm1, %zmm26, %zmm26{%k7} #195.8 -+ vpermt2pd 544(%rdi), %zmm7, %zmm28 #215.9 -+ vmovaps %zmm31, %zmm5 #208.10 -+ vpermi2pd %zmm27, %zmm29, %zmm5 #208.10 -+ vmulpd %zmm5, %zmm14, %zmm6 #209.10 -+ vmovaps %zmm9, %zmm0 #205.10 -+ vpermi2pd %zmm27, %zmm29, %zmm0 #205.10 -+ vfmadd213pd %zmm23, %zmm11, %zmm0 #206.8 -+ vaddpd %zmm6, %zmm0, %zmm0{%k5} #210.8 -+ vmovaps %zmm19, %zmm1 #236.10 -+ vsubpd %zmm6, %zmm0, %zmm0{%k6} #211.8 -+ vpermi2pd %zmm28, %zmm29, %zmm1 #236.10 -+ vpermt2pd %zmm28, %zmm8, %zmm29 #239.10 -+ vfmadd213pd %zmm0, %zmm16, %zmm1 #237.8 -+ vmovups .L_2il0floatpacket.34(%rip), %zmm0 #245.9 -+ vmulpd %zmm29, %zmm20, %zmm29 #240.10 -+ vpermt2pd %zmm10, %zmm0, %zmm30 #246.9 -+ vaddpd %zmm29, %zmm1, %zmm1{%k7} #241.8 -+ vmovaps %zmm19, %zmm5 #218.10 -+ vpermi2pd %zmm28, %zmm27, %zmm5 #218.10 -+ vpermi2pd %zmm27, %zmm30, %zmm19 #249.10 -+ vsubpd %zmm29, %zmm1, %zmm1{%k1} #242.8 -+ vfmadd213pd %zmm26, %zmm22, %zmm5 #219.8 -+ vfmadd213pd %zmm25, %zmm11, %zmm19 #250.8 -+ vmovaps %zmm8, %zmm22 #221.10 -+ vpermi2pd %zmm28, %zmm27, %zmm22 #221.10 -+ vpermi2pd %zmm27, %zmm30, %zmm8 #252.10 -+ vmulpd %zmm22, %zmm15, %zmm15 #222.10 -+ vmulpd %zmm8, %zmm14, %zmm11 #253.10 -+ vaddpd %zmm15, %zmm5, %zmm5{%k5} #223.8 -+ vaddpd %zmm11, %zmm19, %zmm19{%k5} #254.8 -+ vsubpd %zmm15, %zmm5, %zmm5{%k6} #224.8 -+ vsubpd %zmm11, %zmm19, %zmm19{%k6} #255.8 -+ vmovaps %zmm31, %zmm6 #230.10 -+ vmovaps %zmm9, %zmm15 #227.10 -+ vpermi2pd %zmm28, %zmm27, %zmm6 #230.10 -+ vpermi2pd %zmm28, %zmm30, %zmm9 #258.10 -+ vpermt2pd %zmm28, %zmm31, %zmm30 #261.10 -+ vpermi2pd %zmm28, %zmm27, %zmm15 #227.10 -+ vmulpd %zmm6, %zmm18, %zmm18 #231.10 -+ vfmadd213pd %zmm19, %zmm16, %zmm9 #259.8 -+ vfmadd213pd %zmm5, %zmm17, %zmm15 #228.8 -+ vmovups .L_2il0floatpacket.35(%rip), %zmm28 #268.9 -+ vmulpd %zmm30, %zmm20, %zmm16 #262.10 -+ vaddpd %zmm18, %zmm15, %zmm15{%k5} #232.8 -+ vaddpd %zmm16, %zmm9, %zmm9{%k7} #263.8 -+ vsubpd %zmm18, %zmm15, %zmm15{%k6} #233.8 -+ vsubpd %zmm16, %zmm9, %zmm9{%k1} #264.8 -+ vpermi2pd %zmm1, %zmm15, %zmm28 #269.8 -+ vpermi2pd %zmm9, %zmm1, %zmm7 #267.8 -+ vpermt2pd %zmm15, %zmm21, %zmm9 #271.8 -+ vblendmpd %zmm28, %zmm7, %zmm10{%k2} #272.8 -+ vblendmpd %zmm7, %zmm9, %zmm27{%k2} #273.8 -+ vblendmpd %zmm9, %zmm28, %zmm30{%k2} #274.8 -+ vmovups %zmm10, (%rdx) #276.22 -+ vmovups %zmm27, 64(%rdx) #277.22 -+ vmovups %zmm30, 128(%rdx) #278.22 -+ vzeroupper #279.1 -+ movq %rbp, %rsp #279.1 -+ popq %rbp #279.1 -+ .cfi_restore 6 -+ ret #279.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type mul_pauli2_dble_avx512,@function -+ .size mul_pauli2_dble_avx512,.-mul_pauli2_dble_avx512 -+ .data -+# -- End mul_pauli2_dble_avx512 -+ .text -+# -- Begin fwd_house_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl fwd_house_avx512 -+# --- fwd_house_avx512(double, complex_dble *, complex_dble *, double *) -+fwd_house_avx512: -+# parameter 1: %xmm0 -+# parameter 2: %rdi -+# parameter 3: %rsi -+# parameter 4: %rdx -+..B2.1: # Preds ..B2.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_fwd_house_avx512.8: -+..L9: -+ #283.1 -+ 
pushq %r12 #283.1 -+ .cfi_def_cfa_offset 16 -+ .cfi_offset 12, -16 -+ pushq %r13 #283.1 -+ .cfi_def_cfa_offset 24 -+ .cfi_offset 13, -24 -+ pushq %r14 #283.1 -+ .cfi_def_cfa_offset 32 -+ .cfi_offset 14, -32 -+ pushq %r15 #283.1 -+ .cfi_def_cfa_offset 40 -+ .cfi_offset 15, -40 -+ pushq %rbx #283.1 -+ .cfi_def_cfa_offset 48 -+ .cfi_offset 3, -48 -+ pushq %rbp #283.1 -+ .cfi_def_cfa_offset 56 -+ .cfi_offset 6, -56 -+ xorl %eax, %eax #288.3 -+ xorl %r8d, %r8d #290.3 -+ movq %rdi, %r9 #283.1 -+ xorl %r11d, %r11d #290.3 -+ vmovapd %xmm0, %xmm14 #283.1 -+ xorl %r10d, %r10d #290.3 -+ vxorpd %xmm1, %xmm1, %xmm1 #326.12 -+ vmovsd .L_2il0floatpacket.38(%rip), %xmm11 #307.12 -+ xorl %edi, %edi #290.3 -+ vmovsd .L_2il0floatpacket.36(%rip), %xmm0 #306.16 -+ # LOE rdx rsi r8 r9 eax edi r10d r11d xmm0 xmm1 xmm11 xmm14 -+..B2.2: # Preds ..B2.35 ..B2.1 -+ # Execution count [5.00e+00] -+ movslq %r10d, %r12 #292.29 -+ lea 1(%r8), %ecx #295.10 -+ shlq $4, %r12 #291.10 -+ vmovsd 8(%r9,%r12), %xmm3 #292.29 -+ vmulsd %xmm3, %xmm3, %xmm12 #292.29 -+ vmovsd (%r9,%r12), %xmm2 #291.29 -+ vfmadd231sd %xmm2, %xmm2, %xmm12 #291.5 -+ vsqrtsd %xmm12, %xmm12, %xmm13 #293.10 -+ # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..B2.3: # Preds ..B2.2 -+ # Execution count [5.00e+00] -+ xorl %r13d, %r13d #295.5 -+ lea 5(%r11), %r14d #295.5 -+ movl %r14d, %ebp #295.5 -+ movl $1, %ebx #295.5 -+ sarl $2, %ebp #295.5 -+ shrl $29, %ebp #295.5 -+ lea 5(%rbp,%r11), %r15d #295.5 -+ xorl %ebp, %ebp #296.7 -+ sarl $3, %r15d #295.5 -+ testl %r15d, %r15d #295.5 -+ jbe ..B2.7 # Prob 10% #295.5 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebx ebp edi r10d r11d r13d r14d r15d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..B2.4: # Preds ..B2.3 -+ # Execution count [1.56e-02] -+ vxorpd %xmm10, %xmm10, %xmm10 #295.5 -+ vxorpd %xmm9, %xmm9, %xmm9 #295.5 -+ vxorpd %xmm8, %xmm8, %xmm8 #295.5 -+ vxorpd %xmm4, %xmm4, %xmm4 #295.5 -+ vxorpd %xmm7, %xmm7, %xmm7 #295.5 -+ vxorpd %xmm6, %xmm6, %xmm6 #295.5 -+ vxorpd %xmm5, %xmm5, %xmm5 #295.5 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebp edi r10d r11d r13d r14d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 -+..B2.5: # Preds ..B2.5 ..B2.4 -+ # Execution count [3.12e+00] -+ incl %r13d #295.5 -+ lea (%r10,%rbp), %ebx #297.33 -+ movslq %ebx, %rbx #296.14 -+ addl $48, %ebp #295.5 -+ shlq $4, %rbx #297.33 -+ vmovsd 104(%r9,%rbx), %xmm15 #297.14 -+ vmovsd 200(%r9,%rbx), %xmm18 #297.14 -+ vmulsd %xmm15, %xmm15, %xmm17 #296.7 -+ vmulsd %xmm18, %xmm18, %xmm20 #296.7 -+ vmovsd 192(%r9,%rbx), %xmm19 #296.14 -+ vmovsd 96(%r9,%rbx), %xmm16 #296.14 -+ vfmadd231sd %xmm16, %xmm16, %xmm17 #296.7 -+ vmovsd 296(%r9,%rbx), %xmm21 #297.14 -+ vmovsd 392(%r9,%rbx), %xmm24 #297.14 -+ vmovsd 488(%r9,%rbx), %xmm27 #297.14 -+ vmovsd 584(%r9,%rbx), %xmm30 #297.14 -+ vfmadd231sd %xmm19, %xmm19, %xmm20 #296.7 -+ vaddsd %xmm12, %xmm17, %xmm12 #296.7 -+ vmulsd %xmm21, %xmm21, %xmm23 #296.7 -+ vmulsd %xmm24, %xmm24, %xmm26 #296.7 -+ vmulsd %xmm27, %xmm27, %xmm29 #296.7 -+ vaddsd %xmm10, %xmm20, %xmm10 #296.7 -+ vmulsd %xmm30, %xmm30, %xmm15 #296.7 -+ vmovsd 680(%r9,%rbx), %xmm16 #297.14 -+ vmovsd 776(%r9,%rbx), %xmm19 #297.14 -+ vmulsd %xmm16, %xmm16, %xmm18 #296.7 -+ vmulsd %xmm19, %xmm19, %xmm21 #296.7 -+ vmovsd 768(%r9,%rbx), %xmm20 #296.14 -+ vmovsd 288(%r9,%rbx), %xmm22 #296.14 -+ vmovsd 384(%r9,%rbx), %xmm25 #296.14 -+ vmovsd 480(%r9,%rbx), %xmm28 #296.14 -+ vmovsd 576(%r9,%rbx), %xmm31 #296.14 -+ vmovsd 672(%r9,%rbx), %xmm17 #296.14 -+ vfmadd231sd %xmm22, %xmm22, 
%xmm23 #296.7 -+ vfmadd231sd %xmm25, %xmm25, %xmm26 #296.7 -+ vfmadd231sd %xmm28, %xmm28, %xmm29 #296.7 -+ vfmadd231sd %xmm31, %xmm31, %xmm15 #296.7 -+ vfmadd231sd %xmm17, %xmm17, %xmm18 #296.7 -+ vfmadd231sd %xmm20, %xmm20, %xmm21 #296.7 -+ vaddsd %xmm9, %xmm23, %xmm9 #296.7 -+ vaddsd %xmm8, %xmm26, %xmm8 #296.7 -+ vaddsd %xmm4, %xmm29, %xmm4 #296.7 -+ vaddsd %xmm7, %xmm15, %xmm7 #296.7 -+ vaddsd %xmm6, %xmm18, %xmm6 #296.7 -+ vaddsd %xmm5, %xmm21, %xmm5 #296.7 -+ cmpl %r15d, %r13d #295.5 -+ jb ..B2.5 # Prob 99% #295.5 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebp edi r10d r11d r13d r14d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 -+..B2.6: # Preds ..B2.5 -+ # Execution count [4.50e+00] -+ vaddsd %xmm10, %xmm12, %xmm10 #295.5 -+ vaddsd %xmm8, %xmm9, %xmm8 #295.5 -+ vaddsd %xmm7, %xmm4, %xmm4 #295.5 -+ vaddsd %xmm5, %xmm6, %xmm5 #295.5 -+ vaddsd %xmm8, %xmm10, %xmm9 #295.5 -+ vaddsd %xmm5, %xmm4, %xmm6 #295.5 -+ vaddsd %xmm6, %xmm9, %xmm12 #295.5 -+ lea 1(,%r13,8), %ebx #296.7 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d r14d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..B2.7: # Preds ..B2.6 ..B2.3 -+ # Execution count [5.00e+00] -+ cmpl %r14d, %ebx #295.5 -+ ja ..B2.23 # Prob 50% #295.5 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..B2.8: # Preds ..B2.7 -+ # Execution count [0.00e+00] -+ lea (%r8,%rbx), %ebp #295.5 -+ negl %ebp #295.5 -+ addl $5, %ebp #295.5 -+ jmp *.2.10_2.switchtab.4(,%rbp,8) #295.5 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..1.10_0.TAG.6: -+..B2.10: # Preds ..B2.8 -+ # Execution count [0.00e+00] -+ lea (%rbx,%rbx,2), %ebp #296.14 -+ lea (%r10,%rbp,2), %r13d #297.33 -+ movslq %r13d, %r13 #296.14 -+ shlq $4, %r13 #297.33 -+ lea 584(%r9,%r13), %r14 #297.14 -+ vmovsd (%r14), %xmm4 #297.14 -+ vmulsd %xmm4, %xmm4, %xmm6 #297.33 -+ vmovsd -8(%r14), %xmm5 #296.14 -+ vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 -+ vaddsd %xmm6, %xmm12, %xmm12 #296.7 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..1.10_0.TAG.5: -+..B2.12: # Preds ..B2.8 ..B2.10 -+ # Execution count [0.00e+00] -+ lea (%rbx,%rbx,2), %ebp #296.14 -+ lea (%r10,%rbp,2), %r13d #297.33 -+ movslq %r13d, %r13 #296.14 -+ shlq $4, %r13 #297.33 -+ lea 488(%r9,%r13), %r14 #297.14 -+ vmovsd (%r14), %xmm4 #297.14 -+ vmulsd %xmm4, %xmm4, %xmm6 #297.33 -+ vmovsd -8(%r14), %xmm5 #296.14 -+ vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 -+ vaddsd %xmm6, %xmm12, %xmm12 #296.7 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..1.10_0.TAG.4: -+..B2.14: # Preds ..B2.8 ..B2.12 -+ # Execution count [0.00e+00] -+ lea (%rbx,%rbx,2), %ebp #296.14 -+ lea (%r10,%rbp,2), %r13d #297.33 -+ movslq %r13d, %r13 #296.14 -+ shlq $4, %r13 #297.33 -+ lea 392(%r9,%r13), %r14 #297.14 -+ vmovsd (%r14), %xmm4 #297.14 -+ vmulsd %xmm4, %xmm4, %xmm6 #297.33 -+ vmovsd -8(%r14), %xmm5 #296.14 -+ vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 -+ vaddsd %xmm6, %xmm12, %xmm12 #296.7 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..1.10_0.TAG.3: -+..B2.16: # Preds ..B2.8 ..B2.14 -+ # Execution count [0.00e+00] -+ lea (%rbx,%rbx,2), %ebp #296.14 -+ lea (%r10,%rbp,2), %r13d #297.33 -+ movslq %r13d, %r13 #296.14 -+ shlq $4, %r13 #297.33 -+ lea 296(%r9,%r13), %r14 #297.14 -+ vmovsd (%r14), %xmm4 #297.14 -+ vmulsd %xmm4, %xmm4, %xmm6 #297.33 -+ vmovsd -8(%r14), %xmm5 #296.14 -+ 
vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 -+ vaddsd %xmm6, %xmm12, %xmm12 #296.7 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..1.10_0.TAG.2: -+..B2.18: # Preds ..B2.8 ..B2.16 -+ # Execution count [0.00e+00] -+ lea (%rbx,%rbx,2), %ebp #296.14 -+ lea (%r10,%rbp,2), %r13d #297.33 -+ movslq %r13d, %r13 #296.14 -+ shlq $4, %r13 #297.33 -+ lea 200(%r9,%r13), %r14 #297.14 -+ vmovsd (%r14), %xmm4 #297.14 -+ vmulsd %xmm4, %xmm4, %xmm6 #297.33 -+ vmovsd -8(%r14), %xmm5 #296.14 -+ vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 -+ vaddsd %xmm6, %xmm12, %xmm12 #296.7 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..1.10_0.TAG.1: -+..B2.20: # Preds ..B2.8 ..B2.18 -+ # Execution count [0.00e+00] -+ lea (%rbx,%rbx,2), %ebp #296.14 -+ lea (%r10,%rbp,2), %r13d #297.33 -+ movslq %r13d, %r13 #296.14 -+ shlq $4, %r13 #297.33 -+ vmovsd 104(%r9,%r13), %xmm4 #297.14 -+ vmulsd %xmm4, %xmm4, %xmm6 #297.33 -+ vmovsd 96(%r9,%r13), %xmm5 #296.14 -+ vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 -+ vaddsd %xmm6, %xmm12, %xmm12 #296.7 -+ # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..1.10_0.TAG.0: -+..B2.22: # Preds ..B2.8 ..B2.20 -+ # Execution count [4.50e+00] -+ lea (%rbx,%rbx,2), %ebx #296.14 -+ lea (%r10,%rbx,2), %ebp #297.33 -+ movslq %ebp, %rbp #296.14 -+ shlq $4, %rbp #297.33 -+ vmovsd 8(%r9,%rbp), %xmm4 #297.14 -+ vmulsd %xmm4, %xmm4, %xmm6 #297.33 -+ vmovsd (%r9,%rbp), %xmm5 #296.14 -+ vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 -+ vaddsd %xmm6, %xmm12, %xmm12 #296.7 -+ # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..B2.23: # Preds ..B2.22 ..B2.7 -+ # Execution count [5.00e+00] -+ vcomisd %xmm14, %xmm12 #299.15 -+ jb ..B2.25 # Prob 50% #299.15 -+ # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..B2.24: # Preds ..B2.23 -+ # Execution count [2.50e+00] -+ vsqrtsd %xmm12, %xmm12, %xmm12 #300.12 -+ jmp ..B2.26 # Prob 100% #300.12 -+ # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..B2.25: # Preds ..B2.23 -+ # Execution count [2.50e+00] -+ vmovapd %xmm11, %xmm12 #303.7 -+ movl $1, %eax #302.7 -+ # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..B2.26: # Preds ..B2.24 ..B2.25 -+ # Execution count [5.00e+00] -+ vmulsd %xmm0, %xmm12, %xmm4 #306.30 -+ vcomisd %xmm4, %xmm13 #306.30 -+ jb ..B2.28 # Prob 50% #306.30 -+ # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -+..B2.27: # Preds ..B2.26 -+ # Execution count [2.50e+00] -+ vdivsd %xmm13, %xmm11, %xmm4 #307.18 -+ vmulsd %xmm4, %xmm2, %xmm5 #308.19 -+ vmulsd %xmm3, %xmm4, %xmm4 #309.19 -+ jmp ..B2.29 # Prob 100% #309.19 -+ # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm11 xmm12 xmm13 xmm14 -+..B2.28: # Preds ..B2.26 -+ # Execution count [2.50e+00] -+ vmovapd %xmm11, %xmm5 #311.7 -+ vxorpd %xmm4, %xmm4, %xmm4 #312.7 -+ # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm11 xmm12 xmm13 xmm14 -+..B2.29: # Preds ..B2.27 ..B2.28 -+ # Execution count [6.63e-01] -+ vfmadd231sd %xmm12, %xmm4, %xmm3 #316.5 -+ xorl %ebp, %ebp #323.5 -+ vfmadd231sd %xmm12, %xmm5, %xmm2 #315.5 -+ vmovsd %xmm3, 8(%r9,%r12) #316.5 -+ vaddsd %xmm13, %xmm12, %xmm3 #318.28 -+ vmulsd %xmm3, %xmm12, %xmm12 #318.28 -+ lea 6(%r11), %r13d #331.23 -+ vmulsd %xmm3, %xmm5, %xmm5 #320.5 -+ vmulsd %xmm3, 
%xmm4, %xmm4 #321.28 -+ vdivsd %xmm12, %xmm11, %xmm6 #318.28 -+ vmovsd %xmm2, (%r9,%r12) #315.5 -+ movq %r8, %rbx #320.5 -+ vmulsd %xmm6, %xmm5, %xmm2 #320.5 -+ vmulsd %xmm6, %xmm4, %xmm8 #321.33 -+ movslq %edi, %r12 #328.12 -+ shlq $4, %rbx #320.5 -+ addq %r8, %r12 #328.12 -+ shlq $4, %r12 #328.12 -+ vxorpd .L_2il0floatpacket.37(%rip), %xmm2, %xmm7 #320.5 -+ vmovsd %xmm6, (%rdx,%r8,8) #319.5 -+ negq %r8 #323.27 -+ vmovsd %xmm6, -24(%rsp) #318.5 -+ addq $5, %r8 #323.27 -+ vmovsd %xmm7, (%rsi,%rbx) #320.5 -+ vmovsd %xmm8, 8(%rsi,%rbx) #321.5 -+ lea (%r9,%r12), %rbx #328.12 -+ vmovddup -24(%rsp), %xmm2 #343.28 -+ lea 16(%r12,%r9), %r12 #329.12 -+ movq %rdx, -16(%rsp) #331.23[spill] -+ # LOE rbx rbp rsi r8 r9 r12 eax ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm11 xmm14 -+..B2.30: # Preds ..B2.34 ..B2.29 -+ # Execution count [2.12e+01] -+ vmovapd %xmm1, %xmm3 #326.12 -+ movq %rbx, %r15 #328.7 -+ movq %r12, %r14 #329.7 -+ xorl %edx, %edx #331.7 -+ # LOE rbx rbp rsi r8 r9 r12 r14 r15 eax edx ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm3 xmm11 xmm14 -+..B2.31: # Preds ..B2.31 ..B2.30 -+ # Execution count [1.18e+02] -+ vmovupd (%r14), %xmm5 #334.27 -+ incl %edx #331.7 -+ vmulpd 8(%r15){1to2}, %xmm5, %xmm4 #335.14 -+ vpermilpd $1, %xmm4, %xmm6 #336.14 -+ addq $96, %r14 #340.9 -+ vfmsubadd231pd (%r15){1to2}, %xmm5, %xmm6 #337.14 -+ addq $96, %r15 #339.9 -+ vaddpd %xmm3, %xmm6, %xmm3 #338.14 -+ cmpl %r13d, %edx #331.7 -+ jb ..B2.31 # Prob 82% #331.7 -+ # LOE rbx rbp rsi r8 r9 r12 r14 r15 eax edx ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm3 xmm11 xmm14 -+..B2.32: # Preds ..B2.31 -+ # Execution count [2.25e+01] -+ vmulpd %xmm2, %xmm3, %xmm3 #344.12 -+ movq %rbx, %r15 #347.7 -+ movq %r12, %r14 #348.7 -+ xorl %edx, %edx #349.7 -+ # LOE rbx rbp rsi r8 r9 r12 r14 r15 eax edx ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm3 xmm11 xmm14 -+..B2.33: # Preds ..B2.33 ..B2.32 -+ # Execution count [1.25e+02] -+ vmulpd 8(%r15){1to2}, %xmm3, %xmm4 #353.14 -+ vpermilpd $1, %xmm4, %xmm6 #354.14 -+ incl %edx #349.7 -+ vfmaddsub231pd (%r15){1to2}, %xmm3, %xmm6 #355.14 -+ addq $96, %r15 #358.9 -+ vmovupd (%r14), %xmm5 #352.27 -+ vsubpd %xmm6, %xmm5, %xmm7 #356.14 -+ vmovupd %xmm7, (%r14) #357.25 -+ addq $96, %r14 #359.9 -+ cmpl %r13d, %edx #349.7 -+ jb ..B2.33 # Prob 82% #349.7 -+ # LOE rbx rbp rsi r8 r9 r12 r14 r15 eax edx ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm3 xmm11 xmm14 -+..B2.34: # Preds ..B2.33 -+ # Execution count [2.50e+01] -+ incq %rbp #323.5 -+ addq $16, %r12 #323.5 -+ cmpq %r8, %rbp #323.5 -+ jb ..B2.30 # Prob 81% #323.5 -+ # LOE rbx rbp rsi r8 r9 r12 eax ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm11 xmm14 -+..B2.35: # Preds ..B2.34 -+ # Execution count [5.00e+00] -+ decl %r11d #295.10 -+ addl $7, %r10d #295.10 -+ addl $6, %edi #295.10 -+ movl %ecx, %r8d #290.3 -+ movq -16(%rsp), %rdx #[spill] -+ cmpl $5, %ecx #290.3 -+ jb ..B2.2 # Prob 79% #290.3 -+ # LOE rdx rsi r8 r9 eax edi r10d r11d xmm0 xmm1 xmm11 xmm14 -+..B2.36: # Preds ..B2.35 -+ # Execution count [1.00e+00] -+ vmovsd 568(%r9), %xmm2 #364.44 -+ vmulsd %xmm2, %xmm2, %xmm0 #364.44 -+ vmovsd 560(%r9), %xmm1 #364.8 -+ vfmadd231sd %xmm1, %xmm1, %xmm0 #364.3 -+ vcomisd %xmm14, %xmm0 #366.13 -+ jb ..B2.38 # Prob 50% #366.13 -+ # LOE rbx rbp rsi r12 r13 r14 r15 eax xmm0 xmm1 xmm2 xmm11 -+..B2.37: # Preds ..B2.36 -+ # Execution count [5.00e-01] -+ vdivsd %xmm0, %xmm11, %xmm11 #367.16 -+ jmp ..B2.39 # Prob 100% #367.16 -+ # LOE rbx rbp rsi r12 r13 r14 r15 eax xmm1 xmm2 xmm11 -+..B2.38: # Preds ..B2.36 -+ # Execution count [5.00e-01] -+ movl $1, %eax #369.5 -+ # LOE rbx rbp 
rsi r12 r13 r14 r15 eax xmm1 xmm2 xmm11 -+..B2.39: # Preds ..B2.37 ..B2.38 -+ # Execution count [1.00e+00] -+ vmulsd %xmm11, %xmm1, %xmm0 #373.19 -+ vmulsd %xmm2, %xmm11, %xmm1 #374.3 -+ vxorpd .L_2il0floatpacket.37(%rip), %xmm1, %xmm2 #374.3 -+ vmovsd %xmm0, 80(%rsi) #373.3 -+ vmovsd %xmm2, 88(%rsi) #374.3 -+ .cfi_restore 6 -+ popq %rbp #376.10 -+ .cfi_def_cfa_offset 48 -+ .cfi_restore 3 -+ popq %rbx #376.10 -+ .cfi_def_cfa_offset 40 -+ .cfi_restore 15 -+ popq %r15 #376.10 -+ .cfi_def_cfa_offset 32 -+ .cfi_restore 14 -+ popq %r14 #376.10 -+ .cfi_def_cfa_offset 24 -+ .cfi_restore 13 -+ popq %r13 #376.10 -+ .cfi_def_cfa_offset 16 -+ .cfi_restore 12 -+ popq %r12 #376.10 -+ .cfi_def_cfa_offset 8 -+ ret #376.10 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type fwd_house_avx512,@function -+ .size fwd_house_avx512,.-fwd_house_avx512 -+ .section .rodata, "a" -+ .align 64 -+ .align 8 -+.2.10_2.switchtab.4: -+ .quad ..1.10_0.TAG.0 -+ .quad ..1.10_0.TAG.1 -+ .quad ..1.10_0.TAG.2 -+ .quad ..1.10_0.TAG.3 -+ .quad ..1.10_0.TAG.4 -+ .quad ..1.10_0.TAG.5 -+ .quad ..1.10_0.TAG.6 -+ .data -+# -- End fwd_house_avx512 -+ .text -+# -- Begin solv_sys_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl solv_sys_avx512 -+# --- solv_sys_avx512(complex_dble *, complex_dble *) -+solv_sys_avx512: -+# parameter 1: %rdi -+# parameter 2: %rsi -+..B3.1: # Preds ..B3.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_solv_sys_avx512.35: -+..L36: -+ #381.1 -+ pushq %r12 #381.1 -+ .cfi_def_cfa_offset 16 -+ .cfi_offset 12, -16 -+ pushq %r13 #381.1 -+ .cfi_def_cfa_offset 24 -+ .cfi_offset 13, -24 -+ pushq %r14 #381.1 -+ .cfi_def_cfa_offset 32 -+ .cfi_offset 14, -32 -+ pushq %r15 #381.1 -+ .cfi_def_cfa_offset 40 -+ .cfi_offset 15, -40 -+ pushq %rbx #381.1 -+ .cfi_def_cfa_offset 48 -+ .cfi_offset 3, -48 -+ pushq %rbp #381.1 -+ .cfi_def_cfa_offset 56 -+ .cfi_offset 6, -56 -+ movl $5, %edx #386.8 -+ vxorpd %xmm0, %xmm0, %xmm0 #410.24 -+ movl $80, %eax #386.8 -+ # LOE rax rdx rsi rdi xmm0 -+..B3.2: # Preds ..B3.10 ..B3.1 -+ # Execution count [5.00e+00] -+ lea -1(%rdx), %r13d #387.19 -+ movslq %r13d, %r14 #387.10 -+ lea -3(%rdx,%rdx,2), %ebp #387.10 -+ movq %r14, %r12 #405.28 -+ addl %ebp, %ebp #387.10 -+ shlq $4, %r12 #405.28 -+ movslq %ebp, %rbp #387.10 -+ addq %rsi, %r12 #381.1 -+ shlq $4, %rbp #388.28 -+ testl %r13d, %r13d #387.28 -+ js ..B3.10 # Prob 2% #387.28 -+ # LOE rax rdx rbp rsi rdi r12 r14 r13d xmm0 -+..B3.3: # Preds ..B3.2 -+ # Execution count [4.90e+00] -+ lea -1(%rdx), %r11 #395.21 -+ movq %r11, %rbx #395.12 -+ lea (%rdi,%rax), %r8 #388.28 -+ shlq $4, %rbx #395.12 -+ lea (%rbp,%r8), %r9 #388.28 -+ # LOE rax rdx rbx rbp rsi rdi r8 r9 r11 r12 r14 r13d xmm0 -+..B3.4: # Preds ..B3.8 ..B3.3 -+ # Execution count [2.72e+01] -+ vmovupd (%rax,%rsi), %xmm2 #390.25 -+ movq %r11, %rcx #395.12 -+ vmulpd 8(%r9){1to2}, %xmm2, %xmm1 #391.12 -+ vpermilpd $1, %xmm1, %xmm1 #392.12 -+ vfmaddsub231pd (%r9){1to2}, %xmm2, %xmm1 #393.12 -+ cmpq %r14, %r11 #395.29 -+ jle ..B3.8 # Prob 10% #395.29 -+ # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r11 r12 r14 r13d xmm0 xmm1 -+..B3.5: # Preds ..B3.4 -+ # Execution count [2.45e+01] -+ lea (%rdi,%rbp), %r10 #396.30 -+ addq %rbx, %r10 #396.30 -+ # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 -+..B3.6: # Preds ..B3.6 ..B3.5 -+ # Execution count [1.36e+02] -+ lea (%rcx,%rcx,2), %r15d #398.34 -+ addl %r15d, %r15d #398.34 -+ decq %rcx #395.32 -+ movslq %r15d, %r15 #398.27 -+ shlq $4, %r15 #398.27 -+ vmovupd (%r8,%r15), %xmm3 #398.27 -+ 
vmulpd 8(%r10){1to2}, %xmm3, %xmm2 #399.14 -+ vpermilpd $1, %xmm2, %xmm4 #400.14 -+ vfmaddsub231pd (%r10){1to2}, %xmm3, %xmm4 #401.14 -+ addq $-16, %r10 #395.32 -+ vaddpd %xmm4, %xmm1, %xmm1 #402.14 -+ cmpq %r14, %rcx #395.29 -+ jg ..B3.6 # Prob 82% #395.29 -+ # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 -+..B3.8: # Preds ..B3.6 ..B3.4 -+ # Execution count [2.72e+01] -+ vmulpd 8(%r12){1to2}, %xmm1, %xmm2 #407.12 -+ vpermilpd $1, %xmm2, %xmm3 #408.12 -+ addq $-96, %rbp #387.31 -+ vfmaddsub231pd (%r12){1to2}, %xmm1, %xmm3 #409.12 -+ decq %r14 #387.31 -+ vsubpd %xmm3, %xmm0, %xmm1 #410.12 -+ vmovupd %xmm1, (%r9) #411.23 -+ addq $-96, %r9 #387.31 -+ addq $-16, %r12 #387.31 -+ decl %r13d #387.31 -+ jns ..B3.4 # Prob 82% #387.28 -+ # LOE rax rdx rbx rbp rsi rdi r8 r9 r11 r12 r14 r13d xmm0 -+..B3.10: # Preds ..B3.8 ..B3.2 -+ # Execution count [5.00e+00] -+ .byte 15 #386.22 -+ .byte 31 #386.22 -+ .byte 128 #386.22 -+ .byte 0 #386.22 -+ .byte 0 #386.22 -+ .byte 0 #386.22 -+ .byte 0 #386.22 -+ addq $-16, %rax #386.22 -+ decq %rdx #386.22 -+ jg ..B3.2 # Prob 80% #386.19 -+ # LOE rax rdx rsi rdi xmm0 -+..B3.11: # Preds ..B3.10 -+ # Execution count [1.00e+00] -+ .cfi_restore 6 -+ popq %rbp #414.1 -+ .cfi_def_cfa_offset 48 -+ .cfi_restore 3 -+ popq %rbx #414.1 -+ .cfi_def_cfa_offset 40 -+ .cfi_restore 15 -+ popq %r15 #414.1 -+ .cfi_def_cfa_offset 32 -+ .cfi_restore 14 -+ popq %r14 #414.1 -+ .cfi_def_cfa_offset 24 -+ .cfi_restore 13 -+ popq %r13 #414.1 -+ .cfi_def_cfa_offset 16 -+ .cfi_restore 12 -+ popq %r12 #414.1 -+ .cfi_def_cfa_offset 8 -+ ret #414.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type solv_sys_avx512,@function -+ .size solv_sys_avx512,.-solv_sys_avx512 -+ .data -+# -- End solv_sys_avx512 -+ .text -+# -- Begin bck_house_avx512 -+ .text -+# mark_begin; -+ .align 16,0x90 -+ .globl bck_house_avx512 -+# --- bck_house_avx512(complex_dble *, complex_dble *, double *) -+bck_house_avx512: -+# parameter 1: %rdi -+# parameter 2: %rsi -+# parameter 3: %rdx -+..B4.1: # Preds ..B4.0 -+ # Execution count [1.00e+00] -+ .cfi_startproc -+..___tag_value_bck_house_avx512.62: -+..L63: -+ #417.1 -+ pushq %r12 #417.1 -+ .cfi_def_cfa_offset 16 -+ .cfi_offset 12, -16 -+ pushq %r13 #417.1 -+ .cfi_def_cfa_offset 24 -+ .cfi_offset 13, -24 -+ pushq %r14 #417.1 -+ .cfi_def_cfa_offset 32 -+ .cfi_offset 14, -32 -+ pushq %r15 #417.1 -+ .cfi_def_cfa_offset 40 -+ .cfi_offset 15, -40 -+ pushq %rbx #417.1 -+ .cfi_def_cfa_offset 48 -+ .cfi_offset 3, -48 -+ pushq %rbp #417.1 -+ .cfi_def_cfa_offset 56 -+ .cfi_offset 6, -56 -+ movq %rsi, %r8 #417.1 -+ movq %rdx, %r9 #417.1 -+ xorl %edx, %edx #424.3 -+ xorl %esi, %esi #424.3 -+ vxorpd %xmm0, %xmm0, %xmm0 #441.12 -+ movq 80(%r8), %rax #421.15 -+ movq 88(%r8), %rcx #422.15 -+ movq %rax, 560(%rdi) #421.3 -+ xorl %eax, %eax #436.26 -+ movq %rcx, 568(%rdi) #422.3 -+ xorl %ecx, %ecx #424.3 -+ # LOE rax rdx rdi r8 r9 ecx esi xmm0 -+..B4.2: # Preds ..B4.15 ..B4.1 -+ # Execution count [5.00e+00] -+ movl %edx, %r12d #425.12 -+ movq %r8, %r11 #425.12 -+ movq %r12, %rbp #425.12 -+ movslq %esi, %r15 #427.16 -+ shlq $4, %rbp #425.12 -+ shlq $4, %r15 #427.16 -+ subq %rbp, %r11 #425.12 -+ movq 448(%rdi,%r15), %r13 #427.16 -+ movq 64(%r11), %rbx #425.12 -+ movq %r13, 64(%r11) #427.5 -+ lea 1(%rdx), %r13d #432.5 -+ movq %rbx, 448(%rdi,%r15) #429.5 -+ lea 5(%rcx), %ebx #432.10 -+ movq 456(%rdi,%r15), %r10 #428.16 -+ movq 72(%r11), %r14 #426.12 -+ movq %r10, 72(%r11) #428.5 -+ movq %r14, 456(%rdi,%r15) #430.5 -+ cmpl $6, %ebx #432.27 -+ jge ..B4.9 # Prob 
50% #432.27 -+ # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx esi r13d xmm0 -+..B4.3: # Preds ..B4.2 -+ # Execution count [5.00e+00] -+ xorl %r14d, %r14d #432.5 -+ lea 1(%rdx), %r15d #432.5 -+ shrl $1, %r15d #432.5 -+ movl $1, %ebx #432.5 -+ xorl %r10d, %r10d #433.7 -+ testl %r15d, %r15d #432.5 -+ jbe ..B4.7 # Prob 9% #432.5 -+ # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx ebx esi r10d r13d r14d r15d xmm0 -+..B4.4: # Preds ..B4.3 -+ # Execution count [4.50e+00] -+ movq %r8, -24(%rsp) #[spill] -+ movq %r9, -16(%rsp) #[spill] -+ .align 16,0x90 -+ # LOE rax rdx rbp rdi r11 r12 ecx esi r10d r13d r14d r15d xmm0 -+..B4.5: # Preds ..B4.5 ..B4.4 -+ # Execution count [1.25e+01] -+ lea (%rsi,%r10), %ebx #434.18 -+ addl $12, %r10d #432.5 -+ movslq %ebx, %rbx #434.18 -+ lea (%r14,%r14), %r8d #434.7 -+ movslq %r8d, %r8 #434.7 -+ incl %r14d #432.5 -+ shlq $4, %rbx #434.18 -+ shlq $4, %r8 #434.7 -+ movq 552(%rdi,%rbx), %r9 #434.18 -+ movq %r9, 88(%r11,%r8) #434.7 -+ movq 544(%rdi,%rbx), %r9 #433.18 -+ movq %r9, 80(%r11,%r8) #433.7 -+ movq 648(%rdi,%rbx), %r9 #434.18 -+ movq %rax, 552(%rdi,%rbx) #436.7 -+ movq %r9, 104(%r11,%r8) #434.7 -+ movq 640(%rdi,%rbx), %r9 #433.18 -+ movq %rax, 544(%rdi,%rbx) #435.7 -+ movq %r9, 96(%r11,%r8) #433.7 -+ movq %rax, 648(%rdi,%rbx) #436.7 -+ movq %rax, 640(%rdi,%rbx) #435.7 -+ cmpl %r15d, %r14d #432.5 -+ jb ..B4.5 # Prob 63% #432.5 -+ # LOE rax rdx rbp rdi r11 r12 ecx esi r10d r13d r14d r15d xmm0 -+..B4.6: # Preds ..B4.5 -+ # Execution count [4.50e+00] -+ movq -24(%rsp), %r8 #[spill] -+ lea 1(%r14,%r14), %ebx #433.7 -+ movq -16(%rsp), %r9 #[spill] -+ # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx ebx esi r13d xmm0 -+..B4.7: # Preds ..B4.6 ..B4.3 -+ # Execution count [5.00e+00] -+ lea -1(%rbx), %r10d #432.5 -+ cmpl %r13d, %r10d #432.5 -+ jae ..B4.9 # Prob 9% #432.5 -+ # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx ebx esi r13d xmm0 -+..B4.8: # Preds ..B4.7 -+ # Execution count [4.50e+00] -+ movslq %ebx, %r10 #434.7 -+ lea (%rbx,%rbx,2), %ebx #434.18 -+ subq %r12, %r10 #434.7 -+ lea (%rsi,%rbx,2), %r14d #434.18 -+ movslq %r14d, %r14 #434.18 -+ shlq $4, %r14 #434.18 -+ shlq $4, %r10 #434.7 -+ movq 456(%rdi,%r14), %r15 #434.18 -+ movq %r15, 72(%r8,%r10) #434.7 -+ movq 448(%rdi,%r14), %r15 #433.18 -+ movq %r15, 64(%r8,%r10) #433.7 -+ movq %rax, 456(%rdi,%r14) #436.7 -+ movq %rax, 448(%rdi,%r14) #435.7 -+ # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx esi r13d xmm0 -+..B4.9: # Preds ..B4.2 ..B4.8 ..B4.7 -+ # Execution count [3.96e-01] -+ shlq $3, %r12 #453.28 -+ negq %rbp #444.30 -+ negq %r12 #453.28 -+ addq %r9, %r12 #453.28 -+ addq %rdi, %rbp #444.30 -+ addq $2, %rdx #443.23 -+ xorb %bl, %bl #439.5 -+ # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx esi r13d bl xmm0 -+..B4.10: # Preds ..B4.14 ..B4.9 -+ # Execution count [2.54e+01] -+ movq %rax, %r14 #443.7 -+ vmovapd %xmm0, %xmm1 #441.12 -+ movq %r14, %r10 #443.7 -+ # LOE rax rdx rbp rdi r8 r9 r10 r11 r12 r14 ecx esi r13d bl xmm0 xmm1 -+..B4.11: # Preds ..B4.11 ..B4.10 -+ # Execution count [1.41e+02] -+ vmovupd 64(%r10,%r11), %xmm3 #446.27 -+ incq %r14 #443.7 -+ vmulpd 72(%r10,%rbp){1to2}, %xmm3, %xmm2 #447.14 -+ vpermilpd $1, %xmm2, %xmm4 #448.14 -+ vfmaddsub231pd 64(%r10,%rbp){1to2}, %xmm3, %xmm4 #449.14 -+ addq $16, %r10 #443.7 -+ vaddpd %xmm1, %xmm4, %xmm1 #450.14 -+ cmpq %rdx, %r14 #443.7 -+ jb ..B4.11 # Prob 82% #443.7 -+ # LOE rax rdx rbp rdi r8 r9 r10 r11 r12 r14 ecx esi r13d bl xmm0 xmm1 -+..B4.12: # Preds ..B4.11 -+ # Execution count [2.70e+01] -+ movq %rax, %r15 #456.7 -+ lea 64(%rbp), %r10 #456.7 -+ vmulpd 32(%r12){1to2}, %xmm1, %xmm1 #454.12 -+ movq 
%r15, %r14 #456.7 -+ # LOE rax rdx rbp rdi r8 r9 r10 r11 r12 r14 r15 ecx esi r13d bl xmm0 xmm1 -+..B4.13: # Preds ..B4.13 ..B4.12 -+ # Execution count [1.50e+02] -+ vmulpd 72(%r14,%r11){1to2}, %xmm1, %xmm2 #459.14 -+ vpermilpd $1, %xmm2, %xmm4 #460.14 -+ incq %r15 #456.7 -+ vfmsubadd231pd 64(%r14,%r11){1to2}, %xmm1, %xmm4 #461.14 -+ addq $16, %r14 #456.7 -+ vmovupd (%r10), %xmm3 #463.28 -+ vsubpd %xmm4, %xmm3, %xmm5 #464.14 -+ vmovupd %xmm5, (%r10) #465.25 -+ addq $16, %r10 #456.7 -+ cmpq %rdx, %r15 #456.7 -+ jb ..B4.13 # Prob 82% #456.7 -+ # LOE rax rdx rbp rdi r8 r9 r10 r11 r12 r14 r15 ecx esi r13d bl xmm0 xmm1 -+..B4.14: # Preds ..B4.13 -+ # Execution count [3.00e+01] -+ incb %bl #439.5 -+ addq $96, %rbp #439.5 -+ cmpb $6, %bl #439.5 -+ jb ..B4.10 # Prob 83% #439.5 -+ # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx esi r13d bl xmm0 -+..B4.15: # Preds ..B4.14 -+ # Execution count [5.00e+00] -+ addl $-7, %esi #432.5 -+ decl %ecx #432.5 -+ movl %r13d, %edx #424.3 -+ cmpl $5, %r13d #424.3 -+ jb ..B4.2 # Prob 79% #424.3 -+ # LOE rax rdx rdi r8 r9 ecx esi xmm0 -+..B4.16: # Preds ..B4.15 -+ # Execution count [1.00e+00] -+ .cfi_restore 6 -+ popq %rbp #469.1 -+ .cfi_def_cfa_offset 48 -+ .cfi_restore 3 -+ popq %rbx #469.1 -+ .cfi_def_cfa_offset 40 -+ .cfi_restore 15 -+ popq %r15 #469.1 -+ .cfi_def_cfa_offset 32 -+ .cfi_restore 14 -+ popq %r14 #469.1 -+ .cfi_def_cfa_offset 24 -+ .cfi_restore 13 -+ popq %r13 #469.1 -+ .cfi_def_cfa_offset 16 -+ .cfi_restore 12 -+ popq %r12 #469.1 -+ .cfi_def_cfa_offset 8 -+ ret #469.1 -+ .align 16,0x90 -+ # LOE -+ .cfi_endproc -+# mark_end; -+ .type bck_house_avx512,@function -+ .size bck_house_avx512,.-bck_house_avx512 -+ .data -+# -- End bck_house_avx512 -+ .section .rodata, "a" -+ .space 8, 0x00 # pad -+ .align 64 -+.L_2il0floatpacket.9: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000 -+ .type .L_2il0floatpacket.9,@object -+ .size .L_2il0floatpacket.9,64 -+ .align 64 -+.L_2il0floatpacket.10: -+ .long 0x00000002,0x00000000,0x00000003,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.10,@object -+ .size .L_2il0floatpacket.10,64 -+ .align 64 -+.L_2il0floatpacket.11: -+ .long 0x00000004,0x00000000,0x00000005,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000 -+ .type .L_2il0floatpacket.11,@object -+ .size .L_2il0floatpacket.11,64 -+ .align 64 -+.L_2il0floatpacket.12: -+ .long 0x00000006,0x00000000,0x00000007,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 -+ .type .L_2il0floatpacket.12,@object -+ .size .L_2il0floatpacket.12,64 -+ .align 64 -+.L_2il0floatpacket.13: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.13,@object -+ .size .L_2il0floatpacket.13,64 -+ .align 64 -+.L_2il0floatpacket.14: -+ .long 0x00000004,0x00000000,0x00000005,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 -+ .type .L_2il0floatpacket.14,@object -+ .size .L_2il0floatpacket.14,64 -+ .align 64 
-+.L_2il0floatpacket.15: -+ .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000002,0x00000000,0x00000002,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000 -+ .type .L_2il0floatpacket.15,@object -+ .size .L_2il0floatpacket.15,64 -+ .align 64 -+.L_2il0floatpacket.16: -+ .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.16,@object -+ .size .L_2il0floatpacket.16,64 -+ .align 64 -+.L_2il0floatpacket.17: -+ .long 0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000 -+ .type .L_2il0floatpacket.17,@object -+ .size .L_2il0floatpacket.17,64 -+ .align 64 -+.L_2il0floatpacket.18: -+ .long 0x00000004,0x00000000,0x00000004,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000 -+ .type .L_2il0floatpacket.18,@object -+ .size .L_2il0floatpacket.18,64 -+ .align 64 -+.L_2il0floatpacket.19: -+ .long 0x00000005,0x00000000,0x00000005,0x00000000,0x00000007,0x00000000,0x00000007,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000 -+ .type .L_2il0floatpacket.19,@object -+ .size .L_2il0floatpacket.19,64 -+ .align 64 -+.L_2il0floatpacket.20: -+ .long 0x00000001,0x00000000,0x00000001,0x00000000,0x00000003,0x00000000,0x00000003,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000 -+ .type .L_2il0floatpacket.20,@object -+ .size .L_2il0floatpacket.20,64 -+ .align 64 -+.L_2il0floatpacket.21: -+ .long 0x00000004,0x00000000,0x00000004,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000 -+ .type .L_2il0floatpacket.21,@object -+ .size .L_2il0floatpacket.21,64 -+ .align 64 -+.L_2il0floatpacket.22: -+ .long 0x00000004,0x00000000,0x00000005,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.22,@object -+ .size .L_2il0floatpacket.22,64 -+ .align 64 -+.L_2il0floatpacket.23: -+ .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000002,0x00000000,0x00000002,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x0000000a,0x00000000,0x0000000a,0x00000000 -+ .type .L_2il0floatpacket.23,@object -+ .size .L_2il0floatpacket.23,64 -+ .align 64 -+.L_2il0floatpacket.24: -+ .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000 -+ .type .L_2il0floatpacket.24,@object -+ .size .L_2il0floatpacket.24,64 -+ .align 64 -+.L_2il0floatpacket.25: -+ .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000002,0x00000000,0x00000002,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.25,@object -+ .size .L_2il0floatpacket.25,64 -+ .align 64 -+.L_2il0floatpacket.26: -+ .long 0x00000001,0x00000000,0x00000001,0x00000000,0x00000003,0x00000000,0x00000003,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000 -+ .type .L_2il0floatpacket.26,@object -+ .size .L_2il0floatpacket.26,64 -+ .align 64 
-+.L_2il0floatpacket.27: -+ .long 0x00000004,0x00000000,0x00000004,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x0000000a,0x00000000,0x0000000a,0x00000000 -+ .type .L_2il0floatpacket.27,@object -+ .size .L_2il0floatpacket.27,64 -+ .align 64 -+.L_2il0floatpacket.28: -+ .long 0x00000005,0x00000000,0x00000005,0x00000000,0x00000007,0x00000000,0x00000007,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000 -+ .type .L_2il0floatpacket.28,@object -+ .size .L_2il0floatpacket.28,64 -+ .align 64 -+.L_2il0floatpacket.29: -+ .long 0x00000001,0x00000000,0x00000001,0x00000000,0x00000003,0x00000000,0x00000003,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.29,@object -+ .size .L_2il0floatpacket.29,64 -+ .align 64 -+.L_2il0floatpacket.30: -+ .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000 -+ .type .L_2il0floatpacket.30,@object -+ .size .L_2il0floatpacket.30,64 -+ .align 64 -+.L_2il0floatpacket.31: -+ .long 0x00000001,0x00000000,0x00000001,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x00000007,0x00000000,0x00000007,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000 -+ .type .L_2il0floatpacket.31,@object -+ .size .L_2il0floatpacket.31,64 -+ .align 64 -+.L_2il0floatpacket.32: -+ .long 0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000 -+ .type .L_2il0floatpacket.32,@object -+ .size .L_2il0floatpacket.32,64 -+ .align 64 -+.L_2il0floatpacket.33: -+ .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 -+ .type .L_2il0floatpacket.33,@object -+ .size .L_2il0floatpacket.33,64 -+ .align 64 -+.L_2il0floatpacket.34: -+ .long 0x00000006,0x00000000,0x00000007,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000 -+ .type .L_2il0floatpacket.34,@object -+ .size .L_2il0floatpacket.34,64 -+ .align 64 -+.L_2il0floatpacket.35: -+ .long 0x00000002,0x00000000,0x00000003,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000 -+ .type .L_2il0floatpacket.35,@object -+ .size .L_2il0floatpacket.35,64 -+ .align 16 -+.L_2il0floatpacket.37: -+ .long 0x00000000,0x80000000,0x00000000,0x00000000 -+ .type .L_2il0floatpacket.37,@object -+ .size .L_2il0floatpacket.37,16 -+ .align 8 -+.L_2il0floatpacket.36: -+ .long 0x00000000,0x3cb00000 -+ .type .L_2il0floatpacket.36,@object -+ .size .L_2il0floatpacket.36,8 -+ .align 8 -+.L_2il0floatpacket.38: -+ .long 0x00000000,0x3ff00000 -+ .type .L_2il0floatpacket.38,@object -+ .size .L_2il0floatpacket.38,8 -+ .data -+ .section .note.GNU-stack, "" -+// -- Begin DWARF2 SEGMENT .eh_frame -+ .section .eh_frame,"a",@progbits -+.eh_frame_seg: -+ .align 8 -+# End -diff --git a/modules/sw_term/pauli.c b/modules/sw_term/pauli.c -index 7c8a7b6..2547f1d 100644 ---- a/modules/sw_term/pauli.c -+++ b/modules/sw_term/pauli.c -@@ -409,7 +409,105 @@ void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) - "xmm6", "xmm7"); - } - --#if (defined AVX) -+ -+#else -+ -+static weyl rs; 
-+ -+void mul_pauli(float mu, pauli *m, weyl *s, weyl *r) -+{ -+ float const *u; -+ -+ u = (*m).u; -+ -+ rs.c1.c1.re = -+ u[0] * (*s).c1.c1.re - mu * (*s).c1.c1.im + u[6] * (*s).c1.c2.re - -+ u[7] * (*s).c1.c2.im + u[8] * (*s).c1.c3.re - u[9] * (*s).c1.c3.im + -+ u[10] * (*s).c2.c1.re - u[11] * (*s).c2.c1.im + u[12] * (*s).c2.c2.re - -+ u[13] * (*s).c2.c2.im + u[14] * (*s).c2.c3.re - u[15] * (*s).c2.c3.im; -+ -+ rs.c1.c1.im = -+ u[0] * (*s).c1.c1.im + mu * (*s).c1.c1.re + u[6] * (*s).c1.c2.im + -+ u[7] * (*s).c1.c2.re + u[8] * (*s).c1.c3.im + u[9] * (*s).c1.c3.re + -+ u[10] * (*s).c2.c1.im + u[11] * (*s).c2.c1.re + u[12] * (*s).c2.c2.im + -+ u[13] * (*s).c2.c2.re + u[14] * (*s).c2.c3.im + u[15] * (*s).c2.c3.re; -+ -+ rs.c1.c2.re = -+ u[6] * (*s).c1.c1.re + u[7] * (*s).c1.c1.im + u[1] * (*s).c1.c2.re - -+ mu * (*s).c1.c2.im + u[16] * (*s).c1.c3.re - u[17] * (*s).c1.c3.im + -+ u[18] * (*s).c2.c1.re - u[19] * (*s).c2.c1.im + u[20] * (*s).c2.c2.re - -+ u[21] * (*s).c2.c2.im + u[22] * (*s).c2.c3.re - u[23] * (*s).c2.c3.im; -+ -+ rs.c1.c2.im = -+ u[6] * (*s).c1.c1.im - u[7] * (*s).c1.c1.re + u[1] * (*s).c1.c2.im + -+ mu * (*s).c1.c2.re + u[16] * (*s).c1.c3.im + u[17] * (*s).c1.c3.re + -+ u[18] * (*s).c2.c1.im + u[19] * (*s).c2.c1.re + u[20] * (*s).c2.c2.im + -+ u[21] * (*s).c2.c2.re + u[22] * (*s).c2.c3.im + u[23] * (*s).c2.c3.re; -+ -+ rs.c1.c3.re = -+ u[8] * (*s).c1.c1.re + u[9] * (*s).c1.c1.im + u[16] * (*s).c1.c2.re + -+ u[17] * (*s).c1.c2.im + u[2] * (*s).c1.c3.re - mu * (*s).c1.c3.im + -+ u[24] * (*s).c2.c1.re - u[25] * (*s).c2.c1.im + u[26] * (*s).c2.c2.re - -+ u[27] * (*s).c2.c2.im + u[28] * (*s).c2.c3.re - u[29] * (*s).c2.c3.im; -+ -+ rs.c1.c3.im = -+ u[8] * (*s).c1.c1.im - u[9] * (*s).c1.c1.re + u[16] * (*s).c1.c2.im - -+ u[17] * (*s).c1.c2.re + u[2] * (*s).c1.c3.im + mu * (*s).c1.c3.re + -+ u[24] * (*s).c2.c1.im + u[25] * (*s).c2.c1.re + u[26] * (*s).c2.c2.im + -+ u[27] * (*s).c2.c2.re + u[28] * (*s).c2.c3.im + u[29] * (*s).c2.c3.re; -+ -+ rs.c2.c1.re = -+ u[10] * (*s).c1.c1.re + u[11] * (*s).c1.c1.im + u[18] * (*s).c1.c2.re + -+ u[19] * (*s).c1.c2.im + u[24] * (*s).c1.c3.re + u[25] * (*s).c1.c3.im + -+ u[3] * (*s).c2.c1.re - mu * (*s).c2.c1.im + u[30] * (*s).c2.c2.re - -+ u[31] * (*s).c2.c2.im + u[32] * (*s).c2.c3.re - u[33] * (*s).c2.c3.im; -+ -+ rs.c2.c1.im = -+ u[10] * (*s).c1.c1.im - u[11] * (*s).c1.c1.re + u[18] * (*s).c1.c2.im - -+ u[19] * (*s).c1.c2.re + u[24] * (*s).c1.c3.im - u[25] * (*s).c1.c3.re + -+ u[3] * (*s).c2.c1.im + mu * (*s).c2.c1.re + u[30] * (*s).c2.c2.im + -+ u[31] * (*s).c2.c2.re + u[32] * (*s).c2.c3.im + u[33] * (*s).c2.c3.re; -+ -+ rs.c2.c2.re = -+ u[12] * (*s).c1.c1.re + u[13] * (*s).c1.c1.im + u[20] * (*s).c1.c2.re + -+ u[21] * (*s).c1.c2.im + u[26] * (*s).c1.c3.re + u[27] * (*s).c1.c3.im + -+ u[30] * (*s).c2.c1.re + u[31] * (*s).c2.c1.im + u[4] * (*s).c2.c2.re - -+ mu * (*s).c2.c2.im + u[34] * (*s).c2.c3.re - u[35] * (*s).c2.c3.im; -+ -+ rs.c2.c2.im = -+ u[12] * (*s).c1.c1.im - u[13] * (*s).c1.c1.re + u[20] * (*s).c1.c2.im - -+ u[21] * (*s).c1.c2.re + u[26] * (*s).c1.c3.im - u[27] * (*s).c1.c3.re + -+ u[30] * (*s).c2.c1.im - u[31] * (*s).c2.c1.re + u[4] * (*s).c2.c2.im + -+ mu * (*s).c2.c2.re + u[34] * (*s).c2.c3.im + u[35] * (*s).c2.c3.re; -+ -+ rs.c2.c3.re = -+ u[14] * (*s).c1.c1.re + u[15] * (*s).c1.c1.im + u[22] * (*s).c1.c2.re + -+ u[23] * (*s).c1.c2.im + u[28] * (*s).c1.c3.re + u[29] * (*s).c1.c3.im + -+ u[32] * (*s).c2.c1.re + u[33] * (*s).c2.c1.im + u[34] * (*s).c2.c2.re + -+ u[35] * (*s).c2.c2.im + u[5] * (*s).c2.c3.re - mu * (*s).c2.c3.im; -+ -+ 
rs.c2.c3.im = -+ u[14] * (*s).c1.c1.im - u[15] * (*s).c1.c1.re + u[22] * (*s).c1.c2.im - -+ u[23] * (*s).c1.c2.re + u[28] * (*s).c1.c3.im - u[29] * (*s).c1.c3.re + -+ u[32] * (*s).c2.c1.im - u[33] * (*s).c2.c1.re + u[34] * (*s).c2.c2.im - -+ u[35] * (*s).c2.c2.re + u[5] * (*s).c2.c3.im + mu * (*s).c2.c3.re; -+ -+ (*r) = rs; -+} -+ -+#endif -+ -+ -+#ifdef AVX512 -+ -+void mul_pauli2_avx512(float mu, pauli *m, spinor *source, spinor *res ); -+void mul_pauli2(float mu, pauli *m, spinor *source, spinor *res ) -+{ -+ mul_pauli2_avx512( mu, m, source, res ); -+} -+ -+ -+#elif (defined AVX) - #include "avx.h" - - void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) -@@ -940,129 +1038,6 @@ void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) - - #else - --void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) --{ -- spin_t *ps,*pr; -- -- ps=(spin_t*)(s); -- pr=(spin_t*)(r); -- -- mul_pauli(mu,m,(*ps).w,(*pr).w); -- mul_pauli(-mu,m+1,(*ps).w+1,(*pr).w+1); --} -- --#endif --#else -- --static weyl rs; -- -- --void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) --{ -- float *u; -- -- u=(*m).u; -- -- rs.c1.c1.re= -- u[ 0]*(*s).c1.c1.re- mu*(*s).c1.c1.im+ -- u[ 6]*(*s).c1.c2.re-u[ 7]*(*s).c1.c2.im+ -- u[ 8]*(*s).c1.c3.re-u[ 9]*(*s).c1.c3.im+ -- u[10]*(*s).c2.c1.re-u[11]*(*s).c2.c1.im+ -- u[12]*(*s).c2.c2.re-u[13]*(*s).c2.c2.im+ -- u[14]*(*s).c2.c3.re-u[15]*(*s).c2.c3.im; -- -- rs.c1.c1.im= -- u[ 0]*(*s).c1.c1.im+ mu*(*s).c1.c1.re+ -- u[ 6]*(*s).c1.c2.im+u[ 7]*(*s).c1.c2.re+ -- u[ 8]*(*s).c1.c3.im+u[ 9]*(*s).c1.c3.re+ -- u[10]*(*s).c2.c1.im+u[11]*(*s).c2.c1.re+ -- u[12]*(*s).c2.c2.im+u[13]*(*s).c2.c2.re+ -- u[14]*(*s).c2.c3.im+u[15]*(*s).c2.c3.re; -- -- rs.c1.c2.re= -- u[ 6]*(*s).c1.c1.re+u[ 7]*(*s).c1.c1.im+ -- u[ 1]*(*s).c1.c2.re- mu*(*s).c1.c2.im+ -- u[16]*(*s).c1.c3.re-u[17]*(*s).c1.c3.im+ -- u[18]*(*s).c2.c1.re-u[19]*(*s).c2.c1.im+ -- u[20]*(*s).c2.c2.re-u[21]*(*s).c2.c2.im+ -- u[22]*(*s).c2.c3.re-u[23]*(*s).c2.c3.im; -- -- rs.c1.c2.im= -- u[ 6]*(*s).c1.c1.im-u[ 7]*(*s).c1.c1.re+ -- u[ 1]*(*s).c1.c2.im+ mu*(*s).c1.c2.re+ -- u[16]*(*s).c1.c3.im+u[17]*(*s).c1.c3.re+ -- u[18]*(*s).c2.c1.im+u[19]*(*s).c2.c1.re+ -- u[20]*(*s).c2.c2.im+u[21]*(*s).c2.c2.re+ -- u[22]*(*s).c2.c3.im+u[23]*(*s).c2.c3.re; -- -- rs.c1.c3.re= -- u[ 8]*(*s).c1.c1.re+u[ 9]*(*s).c1.c1.im+ -- u[16]*(*s).c1.c2.re+u[17]*(*s).c1.c2.im+ -- u[ 2]*(*s).c1.c3.re- mu*(*s).c1.c3.im+ -- u[24]*(*s).c2.c1.re-u[25]*(*s).c2.c1.im+ -- u[26]*(*s).c2.c2.re-u[27]*(*s).c2.c2.im+ -- u[28]*(*s).c2.c3.re-u[29]*(*s).c2.c3.im; -- -- rs.c1.c3.im= -- u[ 8]*(*s).c1.c1.im-u[ 9]*(*s).c1.c1.re+ -- u[16]*(*s).c1.c2.im-u[17]*(*s).c1.c2.re+ -- u[ 2]*(*s).c1.c3.im+ mu*(*s).c1.c3.re+ -- u[24]*(*s).c2.c1.im+u[25]*(*s).c2.c1.re+ -- u[26]*(*s).c2.c2.im+u[27]*(*s).c2.c2.re+ -- u[28]*(*s).c2.c3.im+u[29]*(*s).c2.c3.re; -- -- rs.c2.c1.re= -- u[10]*(*s).c1.c1.re+u[11]*(*s).c1.c1.im+ -- u[18]*(*s).c1.c2.re+u[19]*(*s).c1.c2.im+ -- u[24]*(*s).c1.c3.re+u[25]*(*s).c1.c3.im+ -- u[ 3]*(*s).c2.c1.re- mu*(*s).c2.c1.im+ -- u[30]*(*s).c2.c2.re-u[31]*(*s).c2.c2.im+ -- u[32]*(*s).c2.c3.re-u[33]*(*s).c2.c3.im; -- -- rs.c2.c1.im= -- u[10]*(*s).c1.c1.im-u[11]*(*s).c1.c1.re+ -- u[18]*(*s).c1.c2.im-u[19]*(*s).c1.c2.re+ -- u[24]*(*s).c1.c3.im-u[25]*(*s).c1.c3.re+ -- u[ 3]*(*s).c2.c1.im+ mu*(*s).c2.c1.re+ -- u[30]*(*s).c2.c2.im+u[31]*(*s).c2.c2.re+ -- u[32]*(*s).c2.c3.im+u[33]*(*s).c2.c3.re; -- -- rs.c2.c2.re= -- u[12]*(*s).c1.c1.re+u[13]*(*s).c1.c1.im+ -- u[20]*(*s).c1.c2.re+u[21]*(*s).c1.c2.im+ -- u[26]*(*s).c1.c3.re+u[27]*(*s).c1.c3.im+ -- u[30]*(*s).c2.c1.re+u[31]*(*s).c2.c1.im+ -- u[ 
4]*(*s).c2.c2.re- mu*(*s).c2.c2.im+ -- u[34]*(*s).c2.c3.re-u[35]*(*s).c2.c3.im; -- -- rs.c2.c2.im= -- u[12]*(*s).c1.c1.im-u[13]*(*s).c1.c1.re+ -- u[20]*(*s).c1.c2.im-u[21]*(*s).c1.c2.re+ -- u[26]*(*s).c1.c3.im-u[27]*(*s).c1.c3.re+ -- u[30]*(*s).c2.c1.im-u[31]*(*s).c2.c1.re+ -- u[ 4]*(*s).c2.c2.im+ mu*(*s).c2.c2.re+ -- u[34]*(*s).c2.c3.im+u[35]*(*s).c2.c3.re; -- -- rs.c2.c3.re= -- u[14]*(*s).c1.c1.re+u[15]*(*s).c1.c1.im+ -- u[22]*(*s).c1.c2.re+u[23]*(*s).c1.c2.im+ -- u[28]*(*s).c1.c3.re+u[29]*(*s).c1.c3.im+ -- u[32]*(*s).c2.c1.re+u[33]*(*s).c2.c1.im+ -- u[34]*(*s).c2.c2.re+u[35]*(*s).c2.c2.im+ -- u[ 5]*(*s).c2.c3.re- mu*(*s).c2.c3.im; -- -- rs.c2.c3.im= -- u[14]*(*s).c1.c1.im-u[15]*(*s).c1.c1.re+ -- u[22]*(*s).c1.c2.im-u[23]*(*s).c1.c2.re+ -- u[28]*(*s).c1.c3.im-u[29]*(*s).c1.c3.re+ -- u[32]*(*s).c2.c1.im-u[33]*(*s).c2.c1.re+ -- u[34]*(*s).c2.c2.im-u[35]*(*s).c2.c2.re+ -- u[ 5]*(*s).c2.c3.im+ mu*(*s).c2.c3.re; -- -- (*r)=rs; --} -- -- - void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) - { - spin_t *ps,*pr; -diff --git a/modules/sw_term/pauli_dble.c b/modules/sw_term/pauli_dble.c -index 39be1d5..7b87d16 100644 ---- a/modules/sw_term/pauli_dble.c -+++ b/modules/sw_term/pauli_dble.c -@@ -87,6 +87,45 @@ static complex_dble aa[36] ALIGNED16; - static complex_dble cc[6] ALIGNED16; - static complex_dble dd[6] ALIGNED16; - -+ -+ -+#if (defined AVX512) -+ -+void mul_pauli2_dble_avx512(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r); -+void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r) -+{ -+ mul_pauli2_dble_avx512( mu, m, s, r ); -+} -+ -+int fwd_house_avx512(double eps, complex_dble *aa, complex_dble *dd, double * rr ); -+static int fwd_house(double eps ){ -+ return fwd_house_avx512( eps, aa, dd, rr ); -+} -+ -+void solv_sys_avx512( complex_dble *aa, complex_dble *dd ); -+static void solv_sys(void){ -+ solv_sys_avx512( aa, dd ); -+} -+ -+void bck_house_avx512( complex_dble *aa, complex_dble *dd, double * rr ); -+static void bck_house(void){ -+ bck_house_avx512( aa, dd, rr ); -+} -+ -+#else -+ -+void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r) -+{ -+ mul_pauli_dble( mu, m, s, r ); -+ mul_pauli_dble( -mu, m+1, s+1, r+1 ); -+} -+#endif -+ -+ -+ -+ -+ -+ - #if (defined x64) - #include "sse2.h" - -@@ -997,6 +1036,7 @@ void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) - - #endif - -+#ifndef AVX512 - static int fwd_house(double eps) - { - int i,j,k,ifail; -@@ -1313,10 +1353,12 @@ static void bck_house(void) - } - } - -+#endif -+ - #else - --static weyl_dble rs; - -+static weyl_dble rs; - - void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) - { -@@ -1423,7 +1465,7 @@ void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) - (*r)=rs; - } - -- -+#ifndef AVX512 - static int fwd_house(double eps) - { - int i,j,k,ifail; -@@ -1582,6 +1624,8 @@ static void bck_house(void) - - #endif - -+#endif -+ - static double set_aa(double mu,pauli_dble *m) - { - int i,j; -@@ -1780,10 +1824,8 @@ void apply_sw_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, - - for (;ps #include #include @@ -215,3 +217,5 @@ void deo_avx512(int *piup, int *pidn, su3 *u, spinor *pl, float ceo, spin_t *rs) _avx512_dirac_combine_f_4( a3, b3 ); _avx512_write_4_halfspinor_f_reverse_dn( a1, a2, a3, &(*sm).c3.c1.re, &(*sp).c3.c1.re, &(*sm2).c3.c1.re, &(*sp2).c3.c1.re ); } + +#endif \ No newline at end of file diff --git a/modules/dirac/avx512/Dw_avx512_asm.s b/modules/dirac/avx512/Dw_avx512_asm.s deleted file mode 100644 index 
4ccc5db..0000000 --- a/modules/dirac/avx512/Dw_avx512_asm.s +++ /dev/null @@ -1,1064 +0,0 @@ -# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -# mark_description "-I../../../include -I.. -I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -# mark_description "es -S"; - .file "Dw_avx512.c" - .text -..TXTST0: -# -- Begin doe_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl doe_avx512 -# --- doe_avx512(int *, int *, su3 *, spinor *, float, spin_t *) -doe_avx512: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %xmm0 -# parameter 6: %r8 -..B1.1: # Preds ..B1.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_doe_avx512.1: -..L2: - #26.1 - pushq %rbx #26.1 - .cfi_def_cfa_offset 16 - movq %rsp, %rbx #26.1 - .cfi_def_cfa 3, 16 - .cfi_offset 3, -16 - andq $-64, %rsp #26.1 - pushq %rbp #26.1 - pushq %rbp #26.1 - movq 8(%rbx), %rbp #26.1 - movq %rbp, 8(%rsp) #26.1 - movq %rsp, %rbp #26.1 - .cfi_escape 0x10, 0x06, 0x02, 0x76, 0x00 - movslq (%rdi), %rax #39.16 - movslq (%rsi), %r11 #40.16 - movslq 4(%rdi), %r10 #41.17 - vmovss %xmm0, -16(%rbp) #26.1 - lea (%rax,%rax,2), %r9 #39.8 - shlq $5, %r9 #39.8 - lea (%r11,%r11,2), %r11 #40.8 - shlq $5, %r11 #40.8 - lea (%r10,%r10,2), %r10 #41.9 - movslq 4(%rsi), %rax #42.17 - shlq $5, %r10 #41.9 - vmovups (%rcx,%r9), %xmm29 #44.3 - vmovups 16(%rcx,%r9), %xmm10 #44.3 - vmovups 32(%rcx,%r9), %xmm8 #44.3 - vmovups 48(%rcx,%r9), %xmm14 #47.3 - vmovups 64(%rcx,%r9), %xmm12 #47.3 - vmovups 80(%rcx,%r9), %xmm26 #47.3 - vmovups .L_2il0floatpacket.10(%rip), %zmm11 #47.3 - vmovups .L_2il0floatpacket.11(%rip), %zmm6 #47.3 - movslq 8(%rdi), %r9 #51.16 - lea (%rax,%rax,2), %rax #42.9 - shlq $5, %rax #42.9 - vmovaps %zmm11, %zmm25 #47.3 - lea (%r9,%r9,2), %r9 #51.8 - shlq $5, %r9 #51.8 - vinsertf32x4 $1, (%rcx,%r11), %zmm29, %zmm21 #44.3 - vinsertf32x4 $2, (%rcx,%r10), %zmm21, %zmm22 #44.3 - vinsertf32x4 $3, (%rcx,%rax), %zmm22, %zmm19 #44.3 - vinsertf32x4 $1, 16(%rcx,%r11), %zmm10, %zmm16 #44.3 - vinsertf32x4 $1, 32(%rcx,%r11), %zmm8, %zmm9 #44.3 - vinsertf32x4 $1, 48(%rcx,%r11), %zmm14, %zmm0 #47.3 - vinsertf32x4 $1, 64(%rcx,%r11), %zmm12, %zmm7 #47.3 - vinsertf32x4 $1, 80(%rcx,%r11), %zmm26, %zmm29 #47.3 - vinsertf32x4 $2, 16(%rcx,%r10), %zmm16, %zmm17 #44.3 - vinsertf32x4 $2, 32(%rcx,%r10), %zmm9, %zmm18 #44.3 - vinsertf32x4 $2, 48(%rcx,%r10), %zmm0, %zmm15 #47.3 - vinsertf32x4 $2, 64(%rcx,%r10), %zmm7, %zmm24 #47.3 - vinsertf32x4 $2, 80(%rcx,%r10), %zmm29, %zmm5 #47.3 - vinsertf32x4 $3, 16(%rcx,%rax), %zmm17, %zmm20 #44.3 - vinsertf32x4 $3, 32(%rcx,%rax), %zmm18, %zmm13 #44.3 - vinsertf32x4 $3, 48(%rcx,%rax), %zmm15, %zmm22 #47.3 - vinsertf32x4 $3, 64(%rcx,%rax), %zmm24, %zmm16 #47.3 - vinsertf32x4 $3, 80(%rcx,%rax), %zmm5, %zmm28 #47.3 - vshufps $228, %zmm20, %zmm19, %zmm27 #44.3 - vshufps $78, %zmm13, %zmm19, %zmm3 #44.3 - vshufps $228, %zmm13, %zmm20, %zmm4 #44.3 - vpermi2ps %zmm16, %zmm22, %zmm25 #47.3 - vpermt2ps %zmm28, %zmm6, %zmm22 #47.3 - vpermt2ps %zmm28, %zmm11, %zmm16 #47.3 - prefetcht0 (%rcx,%r9) #52.3 - movslq 8(%rsi), %r10 #53.16 - lea (%r10,%r10,2), %rax #53.8 - movl $23055, %r10d #64.3 - shlq $5, %rax #53.8 - 
kmovw %r10d, %k1 #64.3 - movl $42480, %r10d #64.3 - kmovw %r10d, %k2 #64.3 - movl $38595, %r10d #83.3 - kmovw %r10d, %k3 #83.3 - movl $26940, %r10d #83.3 - kmovw %r10d, %k4 #83.3 - prefetcht0 (%rcx,%rax) #54.3 - movslq 12(%rdi), %rdi #55.16 - lea (%rdi,%rdi,2), %r10 #55.9 - shlq $5, %r10 #55.9 - prefetcht0 (%rcx,%r10) #56.3 - movslq 12(%rsi), %rsi #57.16 - lea (%rsi,%rsi,2), %rdi #57.9 - shlq $5, %rdi #57.9 - prefetcht0 (%rcx,%rdi) #58.3 - vmovups .L_2il0floatpacket.12(%rip), %zmm11 #64.3 - vmovups (%rdx), %zmm31 #68.3 - vmovups 144(%rdx), %zmm0 #68.3 - vmovups .L_2il0floatpacket.15(%rip), %zmm12 #68.3 - vmovups .L_2il0floatpacket.14(%rip), %zmm7 #68.3 - vmovups .L_2il0floatpacket.18(%rip), %zmm15 #68.3 - vmovups .L_2il0floatpacket.17(%rip), %zmm14 #68.3 - vmovups .L_2il0floatpacket.16(%rip), %zmm13 #68.3 - vmovups 64(%rdx), %zmm5 #68.3 - vmovups 208(%rdx), %zmm6 #68.3 - vpermps %zmm22, %zmm11, %zmm10 #65.3 - vpermps %zmm25, %zmm11, %zmm21 #64.3 - vpermps %zmm16, %zmm11, %zmm17 #66.3 - vaddps %zmm10, %zmm3, %zmm3{%k1} #65.3 - vaddps %zmm21, %zmm27, %zmm27{%k1} #64.3 - vaddps %zmm17, %zmm4, %zmm4{%k1} #66.3 - vsubps %zmm10, %zmm3, %zmm3{%k2} #65.3 - vsubps %zmm21, %zmm27, %zmm27{%k2} #64.3 - vsubps %zmm17, %zmm4, %zmm4{%k2} #66.3 - vmovups .L_2il0floatpacket.13(%rip), %zmm10 #68.3 - vmovups .L_2il0floatpacket.20(%rip), %zmm17 #68.3 - vmovups .L_2il0floatpacket.19(%rip), %zmm16 #68.3 - vmovups .L_2il0floatpacket.27(%rip), %zmm22 #68.3 - vmovaps %zmm31, %zmm26 #68.3 - vpermt2ps 72(%rdx), %zmm10, %zmm26 #68.3 - vmovaps %zmm0, %zmm24 #68.3 - vpermt2ps 216(%rdx), %zmm10, %zmm24 #68.3 - vmovaps %zmm26, %zmm9 #68.3 - vpermt2ps %zmm24, %zmm12, %zmm9 #68.3 - vmovaps %zmm26, %zmm8 #68.3 - vpermt2ps %zmm24, %zmm7, %zmm8 #68.3 - vmulps %zmm9, %zmm3, %zmm28 #68.3 - vmulps %zmm8, %zmm27, %zmm25 #68.3 - vmovups .L_2il0floatpacket.23(%rip), %zmm9 #68.3 - vmovups .L_2il0floatpacket.21(%rip), %zmm8 #68.3 - vpermt2ps 72(%rdx), %zmm9, %zmm31 #68.3 - vpermt2ps 216(%rdx), %zmm9, %zmm0 #68.3 - vpermilps $177, %zmm3, %zmm2 #68.3 - vmulps %zmm2, %zmm15, %zmm1 #68.3 - vpermilps $177, %zmm27, %zmm20 #68.3 - vmovaps %zmm26, %zmm19 #68.3 - vmulps %zmm15, %zmm20, %zmm30 #68.3 - vmovups .L_2il0floatpacket.25(%rip), %zmm20 #68.3 - vpermt2ps %zmm24, %zmm14, %zmm19 #68.3 - vmovaps %zmm26, %zmm18 #68.3 - vpermt2ps %zmm24, %zmm13, %zmm18 #68.3 - vfmadd231ps %zmm27, %zmm19, %zmm28 #68.3 - vmovups .L_2il0floatpacket.24(%rip), %zmm19 #68.3 - vfmadd231ps %zmm3, %zmm18, %zmm25 #68.3 - vmovups .L_2il0floatpacket.22(%rip), %zmm18 #68.3 - vmovaps %zmm26, %zmm29 #68.3 - vpermt2ps %zmm24, %zmm17, %zmm29 #68.3 - vmovaps %zmm26, %zmm23 #68.3 - vpermt2ps %zmm24, %zmm16, %zmm23 #68.3 - vfmadd231ps %zmm1, %zmm29, %zmm28 #68.3 - vfmadd231ps %zmm30, %zmm23, %zmm25 #68.3 - vmovaps %zmm26, %zmm21 #68.3 - vpermt2ps %zmm24, %zmm18, %zmm26 #68.3 - vpermt2ps %zmm24, %zmm8, %zmm21 #68.3 - vfmadd231ps %zmm30, %zmm26, %zmm28 #68.3 - vfmadd231ps %zmm1, %zmm21, %zmm25 #68.3 - vmovups .L_2il0floatpacket.26(%rip), %zmm21 #68.3 - vmovaps %zmm31, %zmm26 #68.3 - vpermt2ps %zmm0, %zmm20, %zmm26 #68.3 - vmulps %zmm26, %zmm27, %zmm26 #68.3 - vmovaps %zmm31, %zmm27 #68.3 - vmovaps %zmm31, %zmm24 #68.3 - vpermt2ps %zmm0, %zmm21, %zmm27 #68.3 - vpermt2ps %zmm0, %zmm19, %zmm24 #68.3 - vfmadd231ps %zmm4, %zmm27, %zmm28 #68.3 - vfmadd231ps %zmm4, %zmm24, %zmm25 #68.3 - vpermilps $177, %zmm4, %zmm27 #68.3 - vmovaps %zmm31, %zmm24 #68.3 - vmulps %zmm27, %zmm15, %zmm23 #68.3 - vmovups .L_2il0floatpacket.30(%rip), %zmm27 #68.3 - vpermt2ps %zmm0, %zmm22, %zmm24 #68.3 - 
vfmadd213ps %zmm26, %zmm24, %zmm3 #68.3 - vmovups .L_2il0floatpacket.28(%rip), %zmm24 #68.3 - vmovups .L_2il0floatpacket.29(%rip), %zmm26 #68.3 - vmovaps %zmm31, %zmm29 #68.3 - vmovaps %zmm31, %zmm2 #68.3 - vpermt2ps %zmm0, %zmm24, %zmm29 #68.3 - vpermt2ps %zmm0, %zmm26, %zmm2 #68.3 - vfmadd231ps %zmm23, %zmm29, %zmm25 #68.3 - vmovups .L_2il0floatpacket.31(%rip), %zmm29 #68.3 - vfmadd213ps %zmm3, %zmm2, %zmm30 #68.3 - vmovups .L_2il0floatpacket.32(%rip), %zmm2 #68.3 - vmovaps %zmm31, %zmm3 #68.3 - vpermt2ps %zmm0, %zmm27, %zmm3 #68.3 - vpermt2ps %zmm0, %zmm29, %zmm31 #68.3 - vpermt2ps 136(%rdx), %zmm2, %zmm5 #68.3 - vpermt2ps 280(%rdx), %zmm2, %zmm6 #68.3 - vfmadd231ps %zmm23, %zmm3, %zmm28 #68.3 - vmovups .L_2il0floatpacket.33(%rip), %zmm3 #68.3 - vfmadd213ps %zmm30, %zmm31, %zmm1 #68.3 - vmovups .L_2il0floatpacket.35(%rip), %ymm0 #70.3 - vmovaps %zmm5, %zmm31 #68.3 - vpermt2ps %zmm6, %zmm3, %zmm31 #68.3 - vfmadd213ps %zmm1, %zmm31, %zmm4 #68.3 - vmovups .L_2il0floatpacket.34(%rip), %zmm1 #68.3 - vpermt2ps %zmm6, %zmm1, %zmm5 #68.3 - vfmadd213ps %zmm4, %zmm5, %zmm23 #68.3 - vmovups .L_2il0floatpacket.36(%rip), %ymm4 #70.3 - vextractf64x4 $1, %zmm25, %ymm5 #70.3 - vmovaps %zmm25, %zmm6 #70.3 - vpermps %ymm5, %ymm0, %ymm25 #70.3 - vpermps %ymm6, %ymm0, %ymm30 #70.3 - vfmadd213ps %ymm5, %ymm4, %ymm25 #70.3 - vmovups .L_2il0floatpacket.37(%rip), %ymm5 #70.3 - vfmadd213ps %ymm30, %ymm4, %ymm6 #70.3 - vmovups .L_2il0floatpacket.38(%rip), %ymm30 #70.3 - vpermilps %ymm5, %ymm25, %ymm31 #70.3 - vfmadd213ps %ymm6, %ymm30, %ymm31 #70.3 - vmovups %ymm31, -112(%rbp) #70.3[spill] - vmovaps %zmm28, %zmm31 #71.3 - vextractf64x4 $1, %zmm28, %ymm28 #71.3 - vpermps %ymm31, %ymm0, %ymm6 #71.3 - vfmadd213ps %ymm6, %ymm4, %ymm31 #71.3 - vpermps %ymm28, %ymm0, %ymm6 #71.3 - vfmadd213ps %ymm28, %ymm4, %ymm6 #71.3 - vpermilps %ymm5, %ymm6, %ymm25 #71.3 - vfmadd213ps %ymm31, %ymm30, %ymm25 #71.3 - vmovups %ymm25, -80(%rbp) #71.3[spill] - vmovaps %zmm23, %zmm25 #72.3 - vextractf64x4 $1, %zmm23, %ymm23 #72.3 - vpermps %ymm23, %ymm0, %ymm31 #72.3 - vpermps %ymm25, %ymm0, %ymm28 #72.3 - vfmadd213ps %ymm23, %ymm4, %ymm31 #72.3 - vfmadd213ps %ymm28, %ymm4, %ymm25 #72.3 - vpermilps %ymm5, %ymm31, %ymm4 #72.3 - vfmadd213ps %ymm25, %ymm30, %ymm4 #72.3 - vmovups 16(%rcx,%r9), %xmm25 #76.3 - vmovups (%rcx,%r9), %xmm5 #76.3 - vmovups 32(%rcx,%r9), %xmm30 #76.3 - vmovups %ymm4, -48(%rbp) #72.3[spill] - vinsertf32x4 $1, 16(%rcx,%rax), %zmm25, %zmm23 #76.3 - vinsertf32x4 $2, 16(%rcx,%r10), %zmm23, %zmm31 #76.3 - vinsertf32x4 $1, (%rcx,%rax), %zmm5, %zmm6 #76.3 - vinsertf32x4 $2, (%rcx,%r10), %zmm6, %zmm28 #76.3 - vinsertf32x4 $3, 16(%rcx,%rdi), %zmm31, %zmm6 #76.3 - vmovups 48(%rcx,%r9), %xmm31 #79.3 - vinsertf32x4 $3, (%rcx,%rdi), %zmm28, %zmm5 #76.3 - vshufps $228, %zmm6, %zmm5, %zmm23 #76.3 - vinsertf32x4 $1, 32(%rcx,%rax), %zmm30, %zmm4 #76.3 - vinsertf32x4 $2, 32(%rcx,%r10), %zmm4, %zmm28 #76.3 - vinsertf32x4 $3, 32(%rcx,%rdi), %zmm28, %zmm25 #76.3 - vshufps $78, %zmm25, %zmm5, %zmm5 #76.3 - vshufps $228, %zmm25, %zmm6, %zmm6 #76.3 - vmovups 64(%rcx,%r9), %xmm25 #79.3 - vinsertf32x4 $1, 48(%rcx,%rax), %zmm31, %zmm30 #79.3 - vinsertf32x4 $2, 48(%rcx,%r10), %zmm30, %zmm4 #79.3 - vinsertf32x4 $3, 48(%rcx,%rdi), %zmm4, %zmm28 #79.3 - vmovups 80(%rcx,%r9), %xmm4 #79.3 - vinsertf32x4 $1, 64(%rcx,%rax), %zmm25, %zmm31 #79.3 - vinsertf32x4 $2, 64(%rcx,%r10), %zmm31, %zmm30 #79.3 - vinsertf32x4 $3, 64(%rcx,%rdi), %zmm30, %zmm30 #79.3 - vinsertf32x4 $1, 80(%rcx,%rax), %zmm4, %zmm25 #79.3 - vinsertf32x4 $2, 80(%rcx,%r10), %zmm25, 
%zmm31 #79.3 - vmovups .L_2il0floatpacket.39(%rip), %zmm25 #79.3 - vinsertf32x4 $3, 80(%rcx,%rdi), %zmm31, %zmm31 #79.3 - vmovaps %zmm28, %zmm4 #79.3 - vpermt2ps %zmm30, %zmm25, %zmm4 #79.3 - vpermt2ps %zmm31, %zmm25, %zmm30 #79.3 - vpermps %zmm4, %zmm11, %zmm25 #83.3 - vpermps %zmm30, %zmm11, %zmm30 #85.3 - vmovups 496(%rdx), %zmm4 #91.3 - vaddps %zmm25, %zmm23, %zmm23{%k3} #83.3 - vaddps %zmm30, %zmm6, %zmm6{%k3} #85.3 - vpermt2ps 568(%rdx), %zmm2, %zmm4 #91.3 - vsubps %zmm25, %zmm23, %zmm23{%k4} #83.3 - vsubps %zmm30, %zmm6, %zmm6{%k4} #85.3 - vmovups .L_2il0floatpacket.40(%rip), %zmm25 #79.3 - vpermt2ps %zmm31, %zmm25, %zmm28 #79.3 - vmovups 288(%rdx), %zmm25 #91.3 - vpermps %zmm28, %zmm11, %zmm11 #84.3 - vmovups 352(%rdx), %zmm28 #91.3 - vaddps %zmm11, %zmm5, %zmm5{%k3} #84.3 - vpermt2ps 424(%rdx), %zmm2, %zmm28 #91.3 - vsubps %zmm11, %zmm5, %zmm5{%k4} #84.3 - vmovups 432(%rdx), %zmm11 #91.3 - vpermi2ps %zmm4, %zmm28, %zmm3 #91.3 - vpermt2ps %zmm4, %zmm1, %zmm28 #91.3 - vmovaps %zmm25, %zmm2 #91.3 - vpermt2ps 360(%rdx), %zmm10, %zmm2 #91.3 - vpermi2ps 504(%rdx), %zmm11, %zmm10 #91.3 - vpermt2ps 504(%rdx), %zmm9, %zmm11 #91.3 - vpermt2ps 360(%rdx), %zmm9, %zmm25 #91.3 - vpermi2ps %zmm10, %zmm2, %zmm7 #91.3 - vpermi2ps %zmm10, %zmm2, %zmm12 #91.3 - vpermi2ps %zmm10, %zmm2, %zmm13 #91.3 - vpermi2ps %zmm10, %zmm2, %zmm14 #91.3 - vpermi2ps %zmm10, %zmm2, %zmm17 #91.3 - vpermi2ps %zmm10, %zmm2, %zmm16 #91.3 - vpermi2ps %zmm10, %zmm2, %zmm8 #91.3 - vpermt2ps %zmm10, %zmm18, %zmm2 #91.3 - vpermi2ps %zmm11, %zmm25, %zmm20 #91.3 - vpermi2ps %zmm11, %zmm25, %zmm22 #91.3 - vpermi2ps %zmm11, %zmm25, %zmm26 #91.3 - vpermi2ps %zmm11, %zmm25, %zmm19 #91.3 - vpermi2ps %zmm11, %zmm25, %zmm21 #91.3 - vpermi2ps %zmm11, %zmm25, %zmm24 #91.3 - vpermi2ps %zmm11, %zmm25, %zmm27 #91.3 - vpermt2ps %zmm11, %zmm29, %zmm25 #91.3 - vmulps %zmm7, %zmm23, %zmm7 #91.3 - vmulps %zmm12, %zmm5, %zmm12 #91.3 - vmovups .L_2il0floatpacket.42(%rip), %ymm18 #96.3 - vfmadd231ps %zmm5, %zmm13, %zmm7 #91.3 - vfmadd231ps %zmm23, %zmm14, %zmm12 #91.3 - vpermilps $177, %zmm5, %zmm13 #91.3 - vmulps %zmm13, %zmm15, %zmm4 #91.3 - vmovups .L_2il0floatpacket.41(%rip), %ymm13 #96.3 - vfmadd231ps %zmm4, %zmm17, %zmm12 #91.3 - vpermilps $177, %zmm23, %zmm1 #91.3 - vmulps %zmm15, %zmm1, %zmm1 #91.3 - vfmadd231ps %zmm1, %zmm2, %zmm12 #91.3 - vfmadd231ps %zmm1, %zmm16, %zmm7 #91.3 - vmulps %zmm20, %zmm23, %zmm2 #91.3 - vfmadd231ps %zmm4, %zmm8, %zmm7 #91.3 - vfmadd231ps %zmm6, %zmm21, %zmm12 #91.3 - vmovups .L_2il0floatpacket.43(%rip), %ymm21 #96.3 - vfmadd213ps %zmm2, %zmm22, %zmm5 #91.3 - vfmadd231ps %zmm6, %zmm19, %zmm7 #91.3 - vmovups .L_2il0floatpacket.36(%rip), %ymm19 #96.3 - vmovups .L_2il0floatpacket.44(%rip), %ymm22 #96.3 - vfmadd213ps %zmm5, %zmm26, %zmm1 #91.3 - vbroadcastss -16(%rbp), %ymm26 #94.10 - vfmadd213ps %zmm1, %zmm25, %zmm4 #91.3 - vpermilps $177, %zmm6, %zmm8 #91.3 - vmulps %zmm8, %zmm15, %zmm15 #91.3 - vfmadd213ps %zmm4, %zmm3, %zmm6 #91.3 - vfmadd231ps %zmm15, %zmm24, %zmm7 #91.3 - vfmadd231ps %zmm15, %zmm27, %zmm12 #91.3 - vfmadd213ps %zmm6, %zmm28, %zmm15 #91.3 - vpermps %ymm7, %ymm0, %ymm3 #96.3 - vpermps %ymm12, %ymm0, %ymm10 #97.3 - vpermps %ymm15, %ymm0, %ymm17 #98.3 - vfmadd213ps %ymm7, %ymm19, %ymm3 #96.3 - vfmadd213ps %ymm12, %ymm19, %ymm10 #97.3 - vfmadd213ps %ymm15, %ymm19, %ymm17 #98.3 - vextractf64x4 $1, %zmm7, %ymm5 #96.3 - vextractf64x4 $1, %zmm12, %ymm11 #97.3 - vextractf64x4 $1, %zmm15, %ymm20 #98.3 - vpermps %ymm5, %ymm0, %ymm6 #96.3 - vpermps %ymm11, %ymm0, %ymm14 #97.3 - vpermps %ymm20, %ymm0, 
%ymm0 #98.3 - vpermilps %ymm13, %ymm3, %ymm9 #96.3 - vfmadd213ps %ymm5, %ymm19, %ymm6 #96.3 - vfmadd213ps %ymm11, %ymm19, %ymm14 #97.3 - vfmadd213ps %ymm20, %ymm19, %ymm0 #98.3 - vfmadd213ps -112(%rbp), %ymm18, %ymm9 #96.3[spill] - vpermilps %ymm13, %ymm10, %ymm16 #97.3 - vpermilps %ymm13, %ymm17, %ymm23 #98.3 - vfmadd213ps -80(%rbp), %ymm18, %ymm16 #97.3[spill] - vfmadd213ps -48(%rbp), %ymm18, %ymm23 #98.3[spill] - vpermilps %ymm21, %ymm6, %ymm24 #96.3 - vpermilps %ymm21, %ymm14, %ymm25 #97.3 - vpermilps %ymm21, %ymm0, %ymm27 #98.3 - vfmadd213ps %ymm9, %ymm22, %ymm24 #96.3 - vfmadd213ps %ymm16, %ymm22, %ymm25 #97.3 - vfmadd213ps %ymm23, %ymm22, %ymm27 #98.3 - vmulps %ymm26, %ymm24, %ymm29 #100.8 - vmulps %ymm25, %ymm26, %ymm31 #101.8 - vmulps %ymm27, %ymm26, %ymm0 #102.8 - vshufps $68, %ymm31, %ymm29, %ymm28 #104.3 - vshufps $228, %ymm29, %ymm0, %ymm30 #104.3 - vshufps $238, %ymm0, %ymm31, %ymm1 #104.3 - vmovups %xmm28, (%r8) #104.3 - vmovups %xmm30, 16(%r8) #104.3 - vmovups %xmm1, 32(%r8) #104.3 - vextractf32x4 $1, %ymm28, 48(%r8) #104.3 - vextractf32x4 $1, %ymm30, 64(%r8) #104.3 - vextractf128 $1, %ymm1, 80(%r8) #104.3 - vzeroupper #105.1 - movq %rbp, %rsp #105.1 - popq %rbp #105.1 - .cfi_restore 6 - movq %rbx, %rsp #105.1 - popq %rbx #105.1 - .cfi_def_cfa 7, 8 - .cfi_restore 3 - ret #105.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type doe_avx512,@function - .size doe_avx512,.-doe_avx512 - .data -# -- End doe_avx512 - .text -# -- Begin deo_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl deo_avx512 -# --- deo_avx512(int *, int *, su3 *, spinor *, float, spin_t *) -deo_avx512: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %xmm0 -# parameter 6: %r8 -..B2.1: # Preds ..B2.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_deo_avx512.11: -..L12: - #108.1 - pushq %rbp #108.1 - .cfi_def_cfa_offset 16 - movq %rsp, %rbp #108.1 - .cfi_def_cfa 6, 16 - .cfi_offset 6, -16 - movslq (%rdi), %rax #122.16 - vmovss %xmm0, -16(%rbp) #108.1 - lea (%rax,%rax,2), %r9 #122.8 - shlq $5, %r9 #122.8 - prefetcht0 (%rcx,%r9) #123.3 - movl $42255, %r10d #165.3 - movslq (%rsi), %r11 #124.16 - kmovw %r10d, %k1 #165.3 - movl $23280, %r10d #165.3 - kmovw %r10d, %k2 #165.3 - movl $38595, %r10d #183.3 - lea (%r11,%r11,2), %rax #124.8 - shlq $5, %rax #124.8 - kmovw %r10d, %k3 #183.3 - movl $26940, %r10d #183.3 - kmovw %r10d, %k4 #183.3 - prefetcht0 (%rcx,%rax) #125.3 - movslq 4(%rdi), %r10 #126.17 - lea (%r10,%r10,2), %r11 #126.9 - shlq $5, %r11 #126.9 - prefetcht0 (%rcx,%r11) #127.3 - movslq 4(%rsi), %r10 #128.17 - lea (%r10,%r10,2), %r10 #128.9 - shlq $5, %r10 #128.9 - prefetcht0 (%rcx,%r10) #129.3 - vmovups (%r8), %xmm2 #131.3 - vmovups 16(%r8), %xmm6 #131.3 - vmovups 32(%r8), %xmm4 #131.3 - vbroadcastss -16(%rbp), %ymm27 #133.10 - vmovups .L_2il0floatpacket.35(%rip), %ymm23 #138.3 - vmovups .L_2il0floatpacket.36(%rip), %ymm28 #138.3 - vmovups .L_2il0floatpacket.47(%rip), %ymm18 #142.3 - vmovups 216(%rdx), %zmm9 #150.3 - vmovups .L_2il0floatpacket.15(%rip), %zmm26 #150.3 - vmovups .L_2il0floatpacket.16(%rip), %zmm8 #150.3 - vinsertf128 $1, 48(%r8), %ymm2, %ymm15 #131.3 - vinsertf128 $1, 64(%r8), %ymm6, %ymm13 #131.3 - vshufps $228, %ymm13, %ymm15, %ymm21 #131.3 - vmulps %ymm27, %ymm21, %ymm20 #134.8 - vpermps %ymm20, %ymm23, %ymm24 #138.3 - vfmadd231ps %ymm20, %ymm28, %ymm24 #138.3 - vinsertf128 $1, 80(%r8), %ymm4, %ymm1 #131.3 - vshufps $78, %ymm1, %ymm15, %ymm30 #131.3 - vshufps $228, %ymm1, %ymm13, %ymm31 #131.3 - vmovups 
.L_2il0floatpacket.45(%rip), %ymm13 #142.3 - vmovups .L_2il0floatpacket.46(%rip), %ymm1 #142.3 - vmulps %ymm30, %ymm27, %ymm25 #135.8 - vmulps %ymm31, %ymm27, %ymm14 #136.8 - vpermps %ymm20, %ymm13, %ymm15 #142.3 - vpermps %ymm20, %ymm1, %ymm21 #142.3 - vfmadd213ps %ymm21, %ymm18, %ymm15 #142.3 - vmovups .L_2il0floatpacket.14(%rip), %zmm21 #150.3 - vpermps %ymm25, %ymm1, %ymm16 #143.3 - vpermps %ymm14, %ymm1, %ymm12 #144.3 - vmovaps %zmm9, %zmm1 #150.3 - vpermps %ymm14, %ymm23, %ymm29 #140.3 - vpermps %ymm14, %ymm13, %ymm30 #144.3 - vfmadd231ps %ymm14, %ymm28, %ymm29 #140.3 - vfmadd213ps %ymm12, %ymm18, %ymm30 #144.3 - vpermps %ymm25, %ymm23, %ymm22 #139.3 - vpermps %ymm25, %ymm13, %ymm17 #143.3 - vfmadd231ps %ymm25, %ymm28, %ymm22 #139.3 - vfmadd213ps %ymm16, %ymm18, %ymm17 #143.3 - vmovups .L_2il0floatpacket.25(%rip), %zmm28 #150.3 - vmovups 280(%rdx), %zmm13 #150.3 - movslq 8(%rdi), %r8 #174.16 - vinsertf64x4 $1, %ymm15, %zmm24, %zmm10 #142.3 - lea (%r8,%r8,2), %r8 #174.8 - vmovups 72(%rdx), %zmm15 #150.3 - vmovups .L_2il0floatpacket.13(%rip), %zmm24 #150.3 - vmovaps %zmm15, %zmm7 #150.3 - vpermt2ps (%rdx), %zmm24, %zmm7 #150.3 - vpermt2ps 144(%rdx), %zmm24, %zmm1 #150.3 - vmovaps %zmm7, %zmm5 #150.3 - vpermt2ps %zmm1, %zmm21, %zmm5 #150.3 - vmulps %zmm5, %zmm10, %zmm18 #150.3 - vpermilps $177, %zmm10, %zmm31 #150.3 - vmovaps %zmm7, %zmm0 #150.3 - vmovaps %zmm7, %zmm11 #150.3 - vpermt2ps %zmm1, %zmm26, %zmm0 #150.3 - vpermt2ps %zmm1, %zmm8, %zmm11 #150.3 - vmovaps %zmm7, %zmm6 #150.3 - vmovaps %zmm7, %zmm23 #150.3 - vmovaps %zmm7, %zmm27 #150.3 - vmovaps %zmm7, %zmm4 #150.3 - shlq $5, %r8 #174.8 - vinsertf64x4 $1, %ymm30, %zmm29, %zmm12 #144.3 - vmovups .L_2il0floatpacket.18(%rip), %zmm30 #150.3 - vmovups .L_2il0floatpacket.17(%rip), %zmm29 #150.3 - vmulps %zmm30, %zmm31, %zmm5 #150.3 - vmovups .L_2il0floatpacket.27(%rip), %zmm31 #150.3 - vpermt2ps %zmm1, %zmm29, %zmm27 #150.3 - vinsertf64x4 $1, %ymm17, %zmm22, %zmm3 #143.3 - vmovups .L_2il0floatpacket.21(%rip), %zmm22 #150.3 - vmovups 136(%rdx), %zmm17 #150.3 - vfmadd231ps %zmm3, %zmm11, %zmm18 #150.3 - vmulps %zmm0, %zmm3, %zmm19 #150.3 - vpermt2ps %zmm1, %zmm22, %zmm23 #150.3 - vfmadd231ps %zmm10, %zmm27, %zmm19 #150.3 - vmovups .L_2il0floatpacket.26(%rip), %zmm27 #150.3 - vpermilps $177, %zmm3, %zmm2 #150.3 - vmulps %zmm2, %zmm30, %zmm16 #150.3 - vmovups .L_2il0floatpacket.19(%rip), %zmm2 #150.3 - vpermt2ps %zmm1, %zmm2, %zmm6 #150.3 - vfmadd231ps %zmm5, %zmm6, %zmm18 #150.3 - vmovups .L_2il0floatpacket.20(%rip), %zmm6 #150.3 - vfmadd231ps %zmm16, %zmm23, %zmm18 #150.3 - vmovups .L_2il0floatpacket.23(%rip), %zmm23 #150.3 - vpermt2ps %zmm1, %zmm6, %zmm4 #150.3 - vpermt2ps (%rdx), %zmm23, %zmm15 #150.3 - vpermt2ps 144(%rdx), %zmm23, %zmm9 #150.3 - vfmadd231ps %zmm16, %zmm4, %zmm19 #150.3 - vmovups .L_2il0floatpacket.22(%rip), %zmm4 #150.3 - vmovaps %zmm15, %zmm0 #150.3 - vpermt2ps %zmm9, %zmm28, %zmm0 #150.3 - vpermt2ps %zmm1, %zmm4, %zmm7 #150.3 - vmovups .L_2il0floatpacket.24(%rip), %zmm1 #150.3 - vmulps %zmm0, %zmm10, %zmm0 #150.3 - vfmadd231ps %zmm5, %zmm7, %zmm19 #150.3 - vmovaps %zmm15, %zmm10 #150.3 - vmovaps %zmm15, %zmm7 #150.3 - vpermt2ps %zmm9, %zmm27, %zmm10 #150.3 - vpermt2ps %zmm9, %zmm1, %zmm7 #150.3 - vfmadd231ps %zmm12, %zmm10, %zmm19 #150.3 - vfmadd231ps %zmm12, %zmm7, %zmm18 #150.3 - vpermilps $177, %zmm12, %zmm10 #150.3 - vmovaps %zmm15, %zmm7 #150.3 - vmulps %zmm10, %zmm30, %zmm11 #150.3 - vmovups .L_2il0floatpacket.29(%rip), %zmm10 #150.3 - vpermt2ps %zmm9, %zmm31, %zmm7 #150.3 - vfmadd213ps %zmm0, %zmm7, %zmm3 
#150.3 - vmovups .L_2il0floatpacket.28(%rip), %zmm7 #150.3 - vmovaps %zmm15, %zmm0 #150.3 - vpermt2ps %zmm9, %zmm7, %zmm0 #150.3 - vfmadd231ps %zmm11, %zmm0, %zmm18 #150.3 - vmovaps %zmm15, %zmm0 #150.3 - vpermt2ps %zmm9, %zmm10, %zmm0 #150.3 - vfmadd213ps %zmm3, %zmm0, %zmm5 #150.3 - vmovups .L_2il0floatpacket.30(%rip), %zmm3 #150.3 - vmovaps %zmm15, %zmm0 #150.3 - vpermt2ps %zmm9, %zmm3, %zmm0 #150.3 - vfmadd231ps %zmm11, %zmm0, %zmm19 #150.3 - vmovups .L_2il0floatpacket.31(%rip), %zmm0 #150.3 - vpermt2ps %zmm9, %zmm0, %zmm15 #150.3 - vmovups .L_2il0floatpacket.32(%rip), %zmm9 #150.3 - vfmadd213ps %zmm5, %zmm15, %zmm16 #150.3 - vmovups .L_2il0floatpacket.33(%rip), %zmm5 #150.3 - vpermt2ps 64(%rdx), %zmm9, %zmm17 #150.3 - vpermt2ps 208(%rdx), %zmm9, %zmm13 #150.3 - vmovaps %zmm17, %zmm15 #150.3 - vpermt2ps %zmm13, %zmm5, %zmm15 #150.3 - vfmadd213ps %zmm16, %zmm15, %zmm12 #150.3 - vmovups .L_2il0floatpacket.34(%rip), %zmm16 #150.3 - vmovups 16(%rcx,%rax), %xmm15 #152.3 - vpermt2ps %zmm13, %zmm16, %zmm17 #150.3 - vfmadd213ps %zmm12, %zmm17, %zmm11 #150.3 - vmovups (%rcx,%rax), %xmm17 #152.3 - vinsertf32x4 $1, (%rcx,%r9), %zmm17, %zmm13 #152.3 - vinsertf32x4 $2, (%rcx,%r10), %zmm13, %zmm12 #152.3 - vinsertf32x4 $3, (%rcx,%r11), %zmm12, %zmm13 #152.3 - vmovups 32(%rcx,%rax), %xmm12 #152.3 - vinsertf32x4 $1, 16(%rcx,%r9), %zmm15, %zmm16 #152.3 - vinsertf32x4 $2, 16(%rcx,%r10), %zmm16, %zmm17 #152.3 - vinsertf32x4 $3, 16(%rcx,%r11), %zmm17, %zmm17 #152.3 - vinsertf32x4 $1, 32(%rcx,%r9), %zmm12, %zmm15 #152.3 - vinsertf32x4 $2, 32(%rcx,%r10), %zmm15, %zmm16 #152.3 - vshufps $228, %zmm17, %zmm13, %zmm15 #152.3 - vinsertf32x4 $3, 32(%rcx,%r11), %zmm16, %zmm12 #152.3 - vshufps $78, %zmm12, %zmm13, %zmm16 #152.3 - vshufps $228, %zmm12, %zmm17, %zmm13 #152.3 - vaddps %zmm18, %zmm15, %zmm12 #155.8 - vaddps %zmm19, %zmm16, %zmm17 #156.8 - vaddps %zmm11, %zmm13, %zmm13 #157.8 - vshufps $68, %zmm17, %zmm12, %zmm15 #158.3 - vshufps $228, %zmm12, %zmm13, %zmm16 #158.3 - vshufps $238, %zmm13, %zmm17, %zmm17 #158.3 - vmovups %xmm15, (%rcx,%rax) #158.3 - vextractf32x4 $1, %zmm15, (%rcx,%r9) #158.3 - vextractf32x4 $2, %zmm15, (%rcx,%r10) #158.3 - vextractf32x4 $3, %zmm15, (%rcx,%r11) #158.3 - vmovups %xmm16, 16(%rcx,%rax) #158.3 - vextractf32x4 $1, %zmm16, 16(%rcx,%r9) #158.3 - vextractf32x4 $2, %zmm16, 16(%rcx,%r10) #158.3 - vextractf32x4 $3, %zmm16, 16(%rcx,%r11) #158.3 - vmovups %xmm17, 32(%rcx,%rax) #158.3 - vextractf32x4 $1, %zmm17, 32(%rcx,%r9) #158.3 - vextractf32x4 $2, %zmm17, 32(%rcx,%r10) #158.3 - vextractf32x4 $3, %zmm17, 32(%rcx,%r11) #158.3 - vmovups 48(%rcx,%rax), %xmm12 #162.3 - vmovups 64(%rcx,%rax), %xmm13 #162.3 - vmovups 80(%rcx,%rax), %xmm17 #162.3 - vinsertf32x4 $1, 48(%rcx,%r9), %zmm12, %zmm15 #162.3 - vinsertf32x4 $2, 48(%rcx,%r10), %zmm15, %zmm16 #162.3 - vinsertf32x4 $3, 48(%rcx,%r11), %zmm16, %zmm12 #162.3 - vinsertf32x4 $1, 64(%rcx,%r9), %zmm13, %zmm15 #162.3 - vinsertf32x4 $2, 64(%rcx,%r10), %zmm15, %zmm16 #162.3 - vinsertf32x4 $3, 64(%rcx,%r11), %zmm16, %zmm13 #162.3 - vinsertf32x4 $1, 80(%rcx,%r9), %zmm17, %zmm15 #162.3 - vinsertf32x4 $2, 80(%rcx,%r10), %zmm15, %zmm16 #162.3 - vinsertf32x4 $3, 80(%rcx,%r11), %zmm16, %zmm17 #162.3 - vmovups .L_2il0floatpacket.10(%rip), %zmm16 #162.3 - prefetcht0 (%rcx,%r8) #175.3 - vmovaps %zmm12, %zmm15 #162.3 - vpermt2ps %zmm13, %zmm16, %zmm15 #162.3 - vpermt2ps %zmm17, %zmm16, %zmm13 #162.3 - vmovups .L_2il0floatpacket.11(%rip), %zmm16 #162.3 - vpermt2ps %zmm17, %zmm16, %zmm12 #162.3 - vmovups .L_2il0floatpacket.12(%rip), %zmm16 #165.3 - vpermps 
%zmm18, %zmm16, %zmm18 #165.3 - vpermps %zmm11, %zmm16, %zmm17 #167.3 - vpermps %zmm19, %zmm16, %zmm19 #166.3 - vaddps %zmm18, %zmm15, %zmm15{%k1} #165.3 - vaddps %zmm17, %zmm13, %zmm13{%k1} #167.3 - vaddps %zmm19, %zmm12, %zmm12{%k1} #166.3 - vsubps %zmm18, %zmm15, %zmm15{%k2} #165.3 - vsubps %zmm17, %zmm13, %zmm13{%k2} #167.3 - vsubps %zmm19, %zmm12, %zmm12{%k2} #166.3 - vmovups .L_2il0floatpacket.49(%rip), %zmm11 #168.3 - vmovups .L_2il0floatpacket.48(%rip), %zmm17 #168.3 - vpermi2ps %zmm15, %zmm13, %zmm11 #168.3 - vmovaps %zmm15, %zmm18 #168.3 - vmovups .L_2il0floatpacket.50(%rip), %zmm15 #168.3 - vpermt2ps %zmm12, %zmm17, %zmm18 #168.3 - vpermt2ps %zmm13, %zmm15, %zmm12 #168.3 - vmovups %xmm18, 48(%rcx,%rax) #168.3 - vextractf32x4 $1, %zmm18, 48(%rcx,%r9) #168.3 - vextractf32x4 $2, %zmm18, 48(%rcx,%r10) #168.3 - vextractf32x4 $3, %zmm18, 48(%rcx,%r11) #168.3 - vmovups %xmm11, 64(%rcx,%rax) #168.3 - vextractf32x4 $1, %zmm11, 64(%rcx,%r9) #168.3 - vextractf32x4 $2, %zmm11, 64(%rcx,%r10) #168.3 - vextractf32x4 $3, %zmm11, 64(%rcx,%r11) #168.3 - vmovups %xmm12, 80(%rcx,%rax) #168.3 - vextractf32x4 $1, %zmm12, 80(%rcx,%r9) #168.3 - vextractf32x4 $2, %zmm12, 80(%rcx,%r10) #168.3 - vextractf32x4 $3, %zmm12, 80(%rcx,%r11) #168.3 - movslq 8(%rsi), %r9 #176.16 - lea (%r9,%r9,2), %rax #176.8 - shlq $5, %rax #176.8 - prefetcht0 (%rcx,%rax) #177.3 - movslq 12(%rdi), %rdi #178.17 - lea (%rdi,%rdi,2), %rdi #178.9 - shlq $5, %rdi #178.9 - prefetcht0 (%rcx,%rdi) #179.3 - movslq 12(%rsi), %rsi #180.17 - lea (%rsi,%rsi,2), %rsi #180.9 - shlq $5, %rsi #180.9 - prefetcht0 (%rcx,%rsi) #181.3 - vmovups .L_2il0floatpacket.51(%rip), %zmm13 #183.3 - vmovups .L_2il0floatpacket.52(%rip), %zmm18 #183.3 - vmovups 424(%rdx), %zmm12 #191.3 - vpermps %zmm20, %zmm13, %zmm11 #183.3 - vpermps %zmm20, %zmm18, %zmm20 #183.3 - vpermt2ps 352(%rdx), %zmm9, %zmm12 #191.3 - vaddps %zmm11, %zmm20, %zmm19{%k3}{z} #183.3 - vsubps %zmm11, %zmm20, %zmm19{%k4} #183.3 - vpermps %zmm25, %zmm13, %zmm20 #184.3 - vpermps %zmm25, %zmm18, %zmm25 #184.3 - vpermps %zmm14, %zmm13, %zmm13 #185.3 - vpermps %zmm14, %zmm18, %zmm14 #185.3 - vaddps %zmm20, %zmm25, %zmm11{%k3}{z} #184.3 - vmovups 568(%rdx), %zmm18 #191.3 - vsubps %zmm20, %zmm25, %zmm11{%k4} #184.3 - vaddps %zmm13, %zmm14, %zmm20{%k3}{z} #185.3 - vpermt2ps 496(%rdx), %zmm9, %zmm18 #191.3 - vmovups 360(%rdx), %zmm25 #191.3 - vsubps %zmm13, %zmm14, %zmm20{%k4} #185.3 - vpermi2ps %zmm18, %zmm12, %zmm5 #191.3 - vmovups 504(%rdx), %zmm14 #191.3 - vmovaps %zmm25, %zmm9 #191.3 - vpermt2ps 288(%rdx), %zmm24, %zmm9 #191.3 - vpermi2ps 432(%rdx), %zmm14, %zmm24 #191.3 - vpermt2ps 432(%rdx), %zmm23, %zmm14 #191.3 - vpermt2ps 288(%rdx), %zmm23, %zmm25 #191.3 - vpermi2ps %zmm24, %zmm9, %zmm21 #191.3 - vpermi2ps %zmm24, %zmm9, %zmm26 #191.3 - vpermi2ps %zmm24, %zmm9, %zmm8 #191.3 - vpermi2ps %zmm24, %zmm9, %zmm29 #191.3 - vpermi2ps %zmm24, %zmm9, %zmm6 #191.3 - vpermi2ps %zmm24, %zmm9, %zmm2 #191.3 - vpermi2ps %zmm24, %zmm9, %zmm22 #191.3 - vpermt2ps %zmm24, %zmm4, %zmm9 #191.3 - vpermi2ps %zmm14, %zmm25, %zmm28 #191.3 - vpermi2ps %zmm14, %zmm25, %zmm31 #191.3 - vpermi2ps %zmm14, %zmm25, %zmm1 #191.3 - vpermi2ps %zmm14, %zmm25, %zmm10 #191.3 - vpermi2ps %zmm14, %zmm25, %zmm27 #191.3 - vpermi2ps %zmm14, %zmm25, %zmm7 #191.3 - vpermi2ps %zmm14, %zmm25, %zmm3 #191.3 - vpermt2ps %zmm14, %zmm0, %zmm25 #191.3 - vmovups (%rcx,%rax), %xmm0 #193.3 - vmulps %zmm21, %zmm19, %zmm21 #191.3 - vmulps %zmm26, %zmm11, %zmm13 #191.3 - vmovups 16(%rcx,%rax), %xmm4 #193.3 - vfmadd231ps %zmm11, %zmm8, %zmm21 #191.3 - 
vfmadd231ps %zmm19, %zmm29, %zmm13 #191.3 - vpermilps $177, %zmm11, %zmm26 #191.3 - vmulps %zmm26, %zmm30, %zmm26 #191.3 - vpermilps $177, %zmm19, %zmm8 #191.3 - vmulps %zmm30, %zmm8, %zmm8 #191.3 - vfmadd231ps %zmm26, %zmm6, %zmm13 #191.3 - vfmadd231ps %zmm8, %zmm2, %zmm21 #191.3 - vfmadd231ps %zmm8, %zmm9, %zmm13 #191.3 - vmulps %zmm28, %zmm19, %zmm9 #191.3 - vfmadd231ps %zmm26, %zmm22, %zmm21 #191.3 - vfmadd231ps %zmm20, %zmm27, %zmm13 #191.3 - vfmadd213ps %zmm9, %zmm31, %zmm11 #191.3 - vfmadd231ps %zmm20, %zmm1, %zmm21 #191.3 - vfmadd213ps %zmm11, %zmm10, %zmm8 #191.3 - vpermilps $177, %zmm20, %zmm28 #191.3 - vmulps %zmm28, %zmm30, %zmm1 #191.3 - vfmadd213ps %zmm8, %zmm25, %zmm26 #191.3 - vfmadd231ps %zmm1, %zmm3, %zmm13 #191.3 - vmovups .L_2il0floatpacket.34(%rip), %zmm3 #191.3 - vfmadd231ps %zmm1, %zmm7, %zmm21 #191.3 - vmovups 32(%rcx,%rax), %xmm7 #193.3 - vfmadd213ps %zmm26, %zmm5, %zmm20 #191.3 - vpermt2ps %zmm18, %zmm3, %zmm12 #191.3 - vfmadd213ps %zmm20, %zmm12, %zmm1 #191.3 - vinsertf32x4 $1, (%rcx,%r8), %zmm0, %zmm2 #193.3 - vinsertf32x4 $2, (%rcx,%rsi), %zmm2, %zmm3 #193.3 - vinsertf32x4 $3, (%rcx,%rdi), %zmm3, %zmm2 #193.3 - vinsertf32x4 $1, 16(%rcx,%r8), %zmm4, %zmm5 #193.3 - vinsertf32x4 $2, 16(%rcx,%rsi), %zmm5, %zmm6 #193.3 - vinsertf32x4 $3, 16(%rcx,%rdi), %zmm6, %zmm3 #193.3 - vinsertf32x4 $1, 32(%rcx,%r8), %zmm7, %zmm8 #193.3 - vinsertf32x4 $2, 32(%rcx,%rsi), %zmm8, %zmm0 #193.3 - # LOE rax rcx rbx rsi rdi r8 r12 r13 r14 r15 zmm0 zmm1 zmm2 zmm3 zmm13 zmm15 zmm16 zmm17 zmm21 -..B2.4: # Preds ..B2.1 - # Execution count [1.00e+00] - vshufps $228, %zmm3, %zmm2, %zmm4 #193.3 - movl $27075, %edx #200.3 - vmovups .L_2il0floatpacket.39(%rip), %zmm26 #199.3 - vmovups .L_2il0floatpacket.40(%rip), %zmm25 #199.3 - vinsertf32x4 $3, 32(%rcx,%rdi), %zmm0, %zmm0 #193.3 - vaddps %zmm21, %zmm4, %zmm5 #194.8 - vpermps %zmm21, %zmm16, %zmm21 #200.3 - vshufps $78, %zmm0, %zmm2, %zmm2 #193.3 - vshufps $228, %zmm0, %zmm3, %zmm3 #193.3 - kmovw %edx, %k1 #200.3 - vaddps %zmm13, %zmm2, %zmm6 #195.8 - vaddps %zmm1, %zmm3, %zmm7 #196.8 - vpermps %zmm13, %zmm16, %zmm13 #201.3 - vpermps %zmm1, %zmm16, %zmm1 #202.3 - vshufps $68, %zmm6, %zmm5, %zmm8 #197.3 - vshufps $228, %zmm5, %zmm7, %zmm9 #197.3 - vshufps $238, %zmm7, %zmm6, %zmm10 #197.3 - vmovups .L_2il0floatpacket.53(%rip), %zmm16 #203.3 - vmovaps %zmm26, %zmm28 #199.3 - movl $38460, %edx #200.3 - kmovw %edx, %k2 #200.3 - vmovups %xmm8, (%rcx,%rax) #197.3 - vextractf32x4 $1, %zmm8, (%rcx,%r8) #197.3 - vextractf32x4 $2, %zmm8, (%rcx,%rsi) #197.3 - vextractf32x4 $3, %zmm8, (%rcx,%rdi) #197.3 - vmovups %xmm9, 16(%rcx,%rax) #197.3 - vextractf32x4 $1, %zmm9, 16(%rcx,%r8) #197.3 - vextractf32x4 $2, %zmm9, 16(%rcx,%rsi) #197.3 - vextractf32x4 $3, %zmm9, 16(%rcx,%rdi) #197.3 - vmovups %xmm10, 32(%rcx,%rax) #197.3 - vextractf32x4 $1, %zmm10, 32(%rcx,%r8) #197.3 - vextractf32x4 $2, %zmm10, 32(%rcx,%rsi) #197.3 - vextractf32x4 $3, %zmm10, 32(%rcx,%rdi) #197.3 - vmovups 48(%rcx,%rax), %xmm11 #199.3 - vmovups 64(%rcx,%rax), %xmm18 #199.3 - vmovups 80(%rcx,%rax), %xmm22 #199.3 - vinsertf32x4 $1, 48(%rcx,%r8), %zmm11, %zmm12 #199.3 - vinsertf32x4 $1, 64(%rcx,%r8), %zmm18, %zmm19 #199.3 - vinsertf32x4 $1, 80(%rcx,%r8), %zmm22, %zmm23 #199.3 - vinsertf32x4 $2, 48(%rcx,%rsi), %zmm12, %zmm14 #199.3 - vinsertf32x4 $2, 64(%rcx,%rsi), %zmm19, %zmm20 #199.3 - vinsertf32x4 $2, 80(%rcx,%rsi), %zmm23, %zmm24 #199.3 - vinsertf32x4 $3, 48(%rcx,%rdi), %zmm14, %zmm30 #199.3 - vinsertf32x4 $3, 64(%rcx,%rdi), %zmm20, %zmm29 #199.3 - vinsertf32x4 $3, 80(%rcx,%rdi), %zmm24, 
%zmm27 #199.3 - vpermi2ps %zmm29, %zmm30, %zmm28 #199.3 - vpermt2ps %zmm27, %zmm25, %zmm30 #199.3 - vpermt2ps %zmm27, %zmm26, %zmm29 #199.3 - vaddps %zmm21, %zmm28, %zmm28{%k1} #200.3 - vaddps %zmm13, %zmm30, %zmm30{%k1} #201.3 - vaddps %zmm1, %zmm29, %zmm29{%k1} #202.3 - vsubps %zmm21, %zmm28, %zmm28{%k2} #200.3 - vsubps %zmm13, %zmm30, %zmm30{%k2} #201.3 - vsubps %zmm1, %zmm29, %zmm29{%k2} #202.3 - vpermi2ps %zmm30, %zmm28, %zmm15 #203.3 - vpermi2ps %zmm28, %zmm29, %zmm16 #203.3 - vpermt2ps %zmm29, %zmm17, %zmm30 #203.3 - vmovups %xmm15, 48(%rcx,%rax) #203.3 - vextractf32x4 $1, %zmm15, 48(%rcx,%r8) #203.3 - vextractf32x4 $2, %zmm15, 48(%rcx,%rsi) #203.3 - vextractf32x4 $3, %zmm15, 48(%rcx,%rdi) #203.3 - vmovups %xmm16, 64(%rcx,%rax) #203.3 - vextractf32x4 $1, %zmm16, 64(%rcx,%r8) #203.3 - vextractf32x4 $2, %zmm16, 64(%rcx,%rsi) #203.3 - vextractf32x4 $3, %zmm16, 64(%rcx,%rdi) #203.3 - vmovups %xmm30, 80(%rcx,%rax) #203.3 - vextractf32x4 $1, %zmm30, 80(%rcx,%r8) #203.3 - vextractf32x4 $2, %zmm30, 80(%rcx,%rsi) #203.3 - vextractf32x4 $3, %zmm30, 80(%rcx,%rdi) #203.3 - vzeroupper #204.1 - movq %rbp, %rsp #204.1 - popq %rbp #204.1 - .cfi_restore 6 - ret #204.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type deo_avx512,@function - .size deo_avx512,.-deo_avx512 - .data -# -- End deo_avx512 - .section .rodata, "a" - .align 64 - .align 64 -.L_2il0floatpacket.10: - .long 0x00000000,0x00000001,0x00000012,0x00000013,0x00000004,0x00000005,0x00000016,0x00000017,0x0000001a,0x0000001b,0x00000008,0x00000009,0x0000001e,0x0000001f,0x0000000c,0x0000000d - .type .L_2il0floatpacket.10,@object - .size .L_2il0floatpacket.10,64 - .align 64 -.L_2il0floatpacket.11: - .long 0x00000002,0x00000003,0x00000010,0x00000011,0x00000006,0x00000007,0x00000014,0x00000015,0x00000018,0x00000019,0x0000000a,0x0000000b,0x0000001c,0x0000001d,0x0000000e,0x0000000f - .type .L_2il0floatpacket.11,@object - .size .L_2il0floatpacket.11,64 - .align 64 -.L_2il0floatpacket.12: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007,0x00000009,0x00000008,0x0000000b,0x0000000a,0x0000000d,0x0000000c,0x0000000f,0x0000000e - .type .L_2il0floatpacket.12,@object - .size .L_2il0floatpacket.12,64 - .align 64 -.L_2il0floatpacket.13: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000008,0x00000009,0x00000006,0x00000007,0x00000010,0x00000011,0x00000012,0x00000013,0x00000018,0x00000019,0x00000016,0x00000017 - .type .L_2il0floatpacket.13,@object - .size .L_2il0floatpacket.13,64 - .align 64 -.L_2il0floatpacket.14: - .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000008,0x00000008,0x00000008,0x00000008,0x00000010,0x00000010,0x00000010,0x00000010,0x00000018,0x00000018,0x00000018,0x00000018 - .type .L_2il0floatpacket.14,@object - .size .L_2il0floatpacket.14,64 - .align 64 -.L_2il0floatpacket.15: - .long 0x00000004,0x00000004,0x00000004,0x00000004,0x0000000c,0x0000000c,0x0000000c,0x0000000c,0x00000014,0x00000014,0x00000014,0x00000014,0x0000001c,0x0000001c,0x0000001c,0x0000001c - .type .L_2il0floatpacket.15,@object - .size .L_2il0floatpacket.15,64 - .align 64 -.L_2il0floatpacket.16: - .long 0x00000002,0x00000002,0x00000002,0x00000002,0x0000000e,0x0000000e,0x0000000e,0x0000000e,0x00000012,0x00000012,0x00000012,0x00000012,0x0000001e,0x0000001e,0x0000001e,0x0000001e - .type .L_2il0floatpacket.16,@object - .size .L_2il0floatpacket.16,64 - .align 64 -.L_2il0floatpacket.17: - .long 
0x00000006,0x00000006,0x00000006,0x00000006,0x0000000a,0x0000000a,0x0000000a,0x0000000a,0x00000016,0x00000016,0x00000016,0x00000016,0x0000001a,0x0000001a,0x0000001a,0x0000001a - .type .L_2il0floatpacket.17,@object - .size .L_2il0floatpacket.17,64 - .align 64 -.L_2il0floatpacket.18: - .long 0xbf800000,0x3f800000,0xbf800000,0x3f800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000 - .type .L_2il0floatpacket.18,@object - .size .L_2il0floatpacket.18,64 - .align 64 -.L_2il0floatpacket.19: - .long 0x00000001,0x00000001,0x00000001,0x00000001,0x00000009,0x00000009,0x00000009,0x00000009,0x00000011,0x00000011,0x00000011,0x00000011,0x00000019,0x00000019,0x00000019,0x00000019 - .type .L_2il0floatpacket.19,@object - .size .L_2il0floatpacket.19,64 - .align 64 -.L_2il0floatpacket.20: - .long 0x00000005,0x00000005,0x00000005,0x00000005,0x0000000d,0x0000000d,0x0000000d,0x0000000d,0x00000015,0x00000015,0x00000015,0x00000015,0x0000001d,0x0000001d,0x0000001d,0x0000001d - .type .L_2il0floatpacket.20,@object - .size .L_2il0floatpacket.20,64 - .align 64 -.L_2il0floatpacket.21: - .long 0x00000003,0x00000003,0x00000003,0x00000003,0x0000000f,0x0000000f,0x0000000f,0x0000000f,0x00000013,0x00000013,0x00000013,0x00000013,0x0000001f,0x0000001f,0x0000001f,0x0000001f - .type .L_2il0floatpacket.21,@object - .size .L_2il0floatpacket.21,64 - .align 64 -.L_2il0floatpacket.22: - .long 0x00000007,0x00000007,0x00000007,0x00000007,0x0000000b,0x0000000b,0x0000000b,0x0000000b,0x00000017,0x00000017,0x00000017,0x00000017,0x0000001b,0x0000001b,0x0000001b,0x0000001b - .type .L_2il0floatpacket.22,@object - .size .L_2il0floatpacket.22,64 - .align 64 -.L_2il0floatpacket.23: - .long 0x00000004,0x00000005,0x0000000c,0x0000000d,0x0000000a,0x0000000b,0x0000000e,0x0000000f,0x00000014,0x00000015,0x0000001c,0x0000001d,0x0000001a,0x0000001b,0x0000001e,0x0000001f - .type .L_2il0floatpacket.23,@object - .size .L_2il0floatpacket.23,64 - .align 64 -.L_2il0floatpacket.24: - .long 0x00000000,0x00000000,0x00000000,0x00000000,0x0000000a,0x0000000a,0x0000000a,0x0000000a,0x00000010,0x00000010,0x00000010,0x00000010,0x0000001a,0x0000001a,0x0000001a,0x0000001a - .type .L_2il0floatpacket.24,@object - .size .L_2il0floatpacket.24,64 - .align 64 -.L_2il0floatpacket.25: - .long 0x00000002,0x00000002,0x00000002,0x00000002,0x00000008,0x00000008,0x00000008,0x00000008,0x00000012,0x00000012,0x00000012,0x00000012,0x00000018,0x00000018,0x00000018,0x00000018 - .type .L_2il0floatpacket.25,@object - .size .L_2il0floatpacket.25,64 - .align 64 -.L_2il0floatpacket.26: - .long 0x00000004,0x00000004,0x00000004,0x00000004,0x0000000e,0x0000000e,0x0000000e,0x0000000e,0x00000014,0x00000014,0x00000014,0x00000014,0x0000001e,0x0000001e,0x0000001e,0x0000001e - .type .L_2il0floatpacket.26,@object - .size .L_2il0floatpacket.26,64 - .align 64 -.L_2il0floatpacket.27: - .long 0x00000006,0x00000006,0x00000006,0x00000006,0x0000000c,0x0000000c,0x0000000c,0x0000000c,0x00000016,0x00000016,0x00000016,0x00000016,0x0000001c,0x0000001c,0x0000001c,0x0000001c - .type .L_2il0floatpacket.27,@object - .size .L_2il0floatpacket.27,64 - .align 64 -.L_2il0floatpacket.28: - .long 0x00000001,0x00000001,0x00000001,0x00000001,0x0000000b,0x0000000b,0x0000000b,0x0000000b,0x00000011,0x00000011,0x00000011,0x00000011,0x0000001b,0x0000001b,0x0000001b,0x0000001b - .type .L_2il0floatpacket.28,@object - .size .L_2il0floatpacket.28,64 - .align 64 -.L_2il0floatpacket.29: - .long 
0x00000003,0x00000003,0x00000003,0x00000003,0x00000009,0x00000009,0x00000009,0x00000009,0x00000013,0x00000013,0x00000013,0x00000013,0x00000019,0x00000019,0x00000019,0x00000019 - .type .L_2il0floatpacket.29,@object - .size .L_2il0floatpacket.29,64 - .align 64 -.L_2il0floatpacket.30: - .long 0x00000005,0x00000005,0x00000005,0x00000005,0x0000000f,0x0000000f,0x0000000f,0x0000000f,0x00000015,0x00000015,0x00000015,0x00000015,0x0000001f,0x0000001f,0x0000001f,0x0000001f - .type .L_2il0floatpacket.30,@object - .size .L_2il0floatpacket.30,64 - .align 64 -.L_2il0floatpacket.31: - .long 0x00000007,0x00000007,0x00000007,0x00000007,0x0000000d,0x0000000d,0x0000000d,0x0000000d,0x00000017,0x00000017,0x00000017,0x00000017,0x0000001d,0x0000001d,0x0000001d,0x0000001d - .type .L_2il0floatpacket.31,@object - .size .L_2il0floatpacket.31,64 - .align 64 -.L_2il0floatpacket.32: - .long 0x00000000,0x00000001,0x00000010,0x00000011,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000 - .type .L_2il0floatpacket.32,@object - .size .L_2il0floatpacket.32,64 - .align 64 -.L_2il0floatpacket.33: - .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000002,0x00000002,0x00000002,0x00000002,0x00000010,0x00000010,0x00000010,0x00000010,0x00000012,0x00000012,0x00000012,0x00000012 - .type .L_2il0floatpacket.33,@object - .size .L_2il0floatpacket.33,64 - .align 64 -.L_2il0floatpacket.34: - .long 0x00000001,0x00000001,0x00000001,0x00000001,0x00000003,0x00000003,0x00000003,0x00000003,0x00000011,0x00000011,0x00000011,0x00000011,0x00000013,0x00000013,0x00000013,0x00000013 - .type .L_2il0floatpacket.34,@object - .size .L_2il0floatpacket.34,64 - .align 64 -.L_2il0floatpacket.39: - .long 0x00000012,0x00000013,0x00000000,0x00000001,0x00000016,0x00000017,0x00000004,0x00000005,0x00000008,0x00000009,0x0000001a,0x0000001b,0x0000000c,0x0000000d,0x0000001e,0x0000001f - .type .L_2il0floatpacket.39,@object - .size .L_2il0floatpacket.39,64 - .align 64 -.L_2il0floatpacket.40: - .long 0x00000010,0x00000011,0x00000002,0x00000003,0x00000014,0x00000015,0x00000006,0x00000007,0x0000000a,0x0000000b,0x00000018,0x00000019,0x0000000e,0x0000000f,0x0000001c,0x0000001d - .type .L_2il0floatpacket.40,@object - .size .L_2il0floatpacket.40,64 - .align 64 -.L_2il0floatpacket.48: - .long 0x00000000,0x00000001,0x00000010,0x00000011,0x00000004,0x00000005,0x00000014,0x00000015,0x0000000a,0x0000000b,0x0000001a,0x0000001b,0x0000000e,0x0000000f,0x0000001e,0x0000001f - .type .L_2il0floatpacket.48,@object - .size .L_2il0floatpacket.48,64 - .align 64 -.L_2il0floatpacket.49: - .long 0x00000000,0x00000001,0x00000012,0x00000013,0x00000004,0x00000005,0x00000016,0x00000017,0x0000000a,0x0000000b,0x00000018,0x00000019,0x0000000e,0x0000000f,0x0000001c,0x0000001d - .type .L_2il0floatpacket.49,@object - .size .L_2il0floatpacket.49,64 - .align 64 -.L_2il0floatpacket.50: - .long 0x00000002,0x00000003,0x00000012,0x00000013,0x00000006,0x00000007,0x00000016,0x00000017,0x00000008,0x00000009,0x00000018,0x00000019,0x0000000c,0x0000000d,0x0000001c,0x0000001d - .type .L_2il0floatpacket.50,@object - .size .L_2il0floatpacket.50,64 - .align 64 -.L_2il0floatpacket.51: - .long 0x00000006,0x00000007,0x00000004,0x00000005,0x00000006,0x00000007,0x00000004,0x00000005,0x00000005,0x00000004,0x00000007,0x00000006,0x00000005,0x00000004,0x00000007,0x00000006 - .type .L_2il0floatpacket.51,@object - .size .L_2il0floatpacket.51,64 - .align 64 -.L_2il0floatpacket.52: - .long 
0x00000000,0x00000001,0x00000002,0x00000003,0x00000000,0x00000001,0x00000002,0x00000003,0x00000000,0x00000001,0x00000002,0x00000003,0x00000000,0x00000001,0x00000002,0x00000003 - .type .L_2il0floatpacket.52,@object - .size .L_2il0floatpacket.52,64 - .align 64 -.L_2il0floatpacket.53: - .long 0x00000002,0x00000003,0x00000010,0x00000011,0x00000006,0x00000007,0x00000014,0x00000015,0x00000008,0x00000009,0x0000001a,0x0000001b,0x0000000c,0x0000000d,0x0000001e,0x0000001f - .type .L_2il0floatpacket.53,@object - .size .L_2il0floatpacket.53,64 - .align 32 -.L_2il0floatpacket.35: - .long 0x00000004,0x00000005,0x00000006,0x00000007,0x00000000,0x00000001,0x00000002,0x00000003 - .type .L_2il0floatpacket.35,@object - .size .L_2il0floatpacket.35,32 - .align 32 -.L_2il0floatpacket.36: - .long 0x3f800000,0x3f800000,0x3f800000,0x3f800000,0xbf800000,0xbf800000,0xbf800000,0xbf800000 - .type .L_2il0floatpacket.36,@object - .size .L_2il0floatpacket.36,32 - .align 32 -.L_2il0floatpacket.37: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000003,0x00000002,0x00000001,0x00000000 - .type .L_2il0floatpacket.37,@object - .size .L_2il0floatpacket.37,32 - .align 32 -.L_2il0floatpacket.38: - .long 0x3f800000,0x3f800000,0x3f800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000 - .type .L_2il0floatpacket.38,@object - .size .L_2il0floatpacket.38,32 - .align 32 -.L_2il0floatpacket.41: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000002,0x00000003,0x00000000,0x00000001 - .type .L_2il0floatpacket.41,@object - .size .L_2il0floatpacket.41,32 - .align 32 -.L_2il0floatpacket.42: - .long 0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000,0xbf800000,0xbf800000 - .type .L_2il0floatpacket.42,@object - .size .L_2il0floatpacket.42,32 - .align 32 -.L_2il0floatpacket.43: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000001,0x00000000,0x00000003,0x00000002 - .type .L_2il0floatpacket.43,@object - .size .L_2il0floatpacket.43,32 - .align 32 -.L_2il0floatpacket.44: - .long 0x3f800000,0x3f800000,0x3f800000,0x3f800000,0xbf800000,0x3f800000,0x3f800000,0xbf800000 - .type .L_2il0floatpacket.44,@object - .size .L_2il0floatpacket.44,32 - .align 32 -.L_2il0floatpacket.45: - .long 0x00000007,0x00000006,0x00000005,0x00000004,0x00000007,0x00000006,0x00000005,0x00000004 - .type .L_2il0floatpacket.45,@object - .size .L_2il0floatpacket.45,32 - .align 32 -.L_2il0floatpacket.46: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000000,0x00000001,0x00000002,0x00000003 - .type .L_2il0floatpacket.46,@object - .size .L_2il0floatpacket.46,32 - .align 32 -.L_2il0floatpacket.47: - .long 0xbf800000,0x3f800000,0xbf800000,0x3f800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000 - .type .L_2il0floatpacket.47,@object - .size .L_2il0floatpacket.47,32 - .data - .section .note.GNU-stack, "" -// -- Begin DWARF2 SEGMENT .eh_frame - .section .eh_frame,"a",@progbits -.eh_frame_seg: - .align 8 -# End diff --git a/modules/dirac/avx512/Dw_dble_avx512.c b/modules/dirac/avx512/Dw_dble_avx512.c index 2f490be..cf07bfa 100644 --- a/modules/dirac/avx512/Dw_dble_avx512.c +++ b/modules/dirac/avx512/Dw_dble_avx512.c @@ -11,6 +11,9 @@ * See ../Dw_dble.c for more details and alternative implementations *******************************************************************************/ +#ifdef AVX512 + + #include #include #include @@ -254,3 +257,5 @@ void deo_dble_avx512( const int *piup, const int *pidn, const su3_dble *u, spin _avx512_add_to_spinors( b1, b2, b3, &(*sp).c1.c1.re, &(*sm).c1.c1.re ); _avx512_add_to_spinors_5( b1, b2, b3, 
&(*sp).c3.c1.re, &(*sm).c3.c1.re ); } + +#endif \ No newline at end of file diff --git a/modules/dirac/avx512/Dw_dble_avx512_asm.s b/modules/dirac/avx512/Dw_dble_avx512_asm.s deleted file mode 100644 index f76b428..0000000 --- a/modules/dirac/avx512/Dw_dble_avx512_asm.s +++ /dev/null @@ -1,1306 +0,0 @@ -# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -# mark_description "-I../../../include -I.. -I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -# mark_description "es -S"; - .file "Dw_dble_avx512.c" - .text -..TXTST0: -# -- Begin doe_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl doe_dble_avx512 -# --- doe_dble_avx512(const int *, const int *, const su3_dble *, const spinor_dble *, double, spin_t *) -doe_dble_avx512: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %xmm0 -# parameter 6: %r8 -..B1.1: # Preds ..B1.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_doe_dble_avx512.1: -..L2: - #27.1 - pushq %rbp #27.1 - .cfi_def_cfa_offset 16 - movq %rsp, %rbp #27.1 - .cfi_def_cfa 6, 16 - .cfi_offset 6, -16 - movslq (%rdi), %rax #42.16 - movslq (%rsi), %r9 #43.16 - vmovups .L_2il0floatpacket.14(%rip), %zmm13 #45.3 - vmovups .L_2il0floatpacket.15(%rip), %zmm15 #45.3 - vmovups .L_2il0floatpacket.16(%rip), %zmm16 #45.3 - vmovups .L_2il0floatpacket.17(%rip), %zmm19 #45.3 - vmovups .L_2il0floatpacket.18(%rip), %zmm18 #45.3 - vmovsd %xmm0, -16(%rbp) #27.1 - vmovaps %zmm13, %zmm28 #45.3 - lea (%rax,%rax,2), %r11 #42.8 - shlq $6, %r11 #42.8 - lea (%r9,%r9,2), %r10 #43.8 - shlq $6, %r10 #43.8 - movl $15, %eax #48.8 - vmovaps %zmm13, %zmm27 #46.3 - kmovw %eax, %k4 #48.8 - movl $240, %eax #49.8 - kmovw %eax, %k3 #49.8 - vmovups (%rcx,%r10), %zmm30 #45.3 - vmovups (%rcx,%r11), %zmm24 #45.3 - vmovups 96(%rcx,%r10), %zmm29 #46.3 - vmovups 96(%rcx,%r11), %zmm23 #46.3 - vpermi2pd %zmm24, %zmm30, %zmm28 #45.3 - vpermt2pd 64(%rcx,%r11), %zmm16, %zmm24 #45.3 - vpermt2pd 64(%rcx,%r10), %zmm15, %zmm30 #45.3 - vpermi2pd %zmm23, %zmm29, %zmm27 #46.3 - vpermt2pd 160(%rcx,%r10), %zmm15, %zmm29 #46.3 - vpermt2pd 160(%rcx,%r11), %zmm16, %zmm23 #46.3 - vaddpd %zmm27, %zmm28, %zmm7{%k4}{z} #48.8 - movslq 4(%rdi), %rax #55.16 - vmovaps %zmm19, %zmm26 #45.3 - vmovaps %zmm19, %zmm25 #46.3 - vpermi2pd %zmm30, %zmm24, %zmm26 #45.3 - lea (%rax,%rax,2), %rax #55.8 - vpermt2pd %zmm30, %zmm18, %zmm24 #45.3 - vpermi2pd %zmm29, %zmm23, %zmm25 #46.3 - vpermt2pd %zmm29, %zmm18, %zmm23 #46.3 - vsubpd %zmm27, %zmm28, %zmm7{%k3} #49.8 - vaddpd %zmm25, %zmm26, %zmm4{%k4}{z} #50.8 - vaddpd %zmm23, %zmm24, %zmm14{%k4}{z} #52.8 - vsubpd %zmm25, %zmm26, %zmm4{%k3} #51.8 - vsubpd %zmm23, %zmm24, %zmm14{%k3} #53.8 - shlq $6, %rax #55.8 - prefetcht0 (%rcx,%rax) #56.3 - movslq 4(%rsi), %r9 #57.16 - vpermilpd $85, %zmm7, %zmm22 #62.3 - vpermilpd $85, %zmm4, %zmm10 #62.3 - vpermilpd $85, %zmm14, %zmm29 #62.3 - lea (%r9,%r9,2), %r10 #57.8 - movl $90, %r9d #74.8 - kmovw %r9d, %k1 #74.8 - movl $165, %r9d #75.8 - kmovw %r9d, %k2 #75.8 - shlq $6, %r10 #57.8 - movl $175, %r9d #92.3 - kmovw %r9d, %k5 #92.3 - movl $80, %r9d #92.3 - kmovw %r9d, %k6 #92.3 - movl $60, %r9d #102.8 - kmovw %r9d, %k7 #102.8 - 
prefetcht0 (%rcx,%r10) #58.3 - vmovups .L_2il0floatpacket.19(%rip), %zmm30 #62.3 - vmovups (%rdx), %zmm11 #62.3 - vmovups .L_2il0floatpacket.25(%rip), %zmm24 #62.3 - vmovups 64(%rdx), %zmm17 #62.3 - vmovups 128(%rdx), %zmm8 #62.3 - vmulpd %zmm10, %zmm30, %zmm21 #62.3 - vmulpd %zmm29, %zmm30, %zmm10 #62.3 - vmulpd %zmm30, %zmm22, %zmm6 #62.3 - vmovups .L_2il0floatpacket.20(%rip), %zmm29 #62.3 - vmovups .L_2il0floatpacket.27(%rip), %zmm22 #62.3 - vmovaps %zmm11, %zmm28 #62.3 - vpermt2pd 144(%rdx), %zmm29, %zmm28 #62.3 - vmulpd %zmm7, %zmm28, %zmm27 #62.3 - vmovups .L_2il0floatpacket.21(%rip), %zmm28 #62.3 - vmovaps %zmm11, %zmm26 #62.3 - vpermt2pd 144(%rdx), %zmm28, %zmm26 #62.3 - vfmadd213pd %zmm27, %zmm6, %zmm26 #62.3 - vmovups .L_2il0floatpacket.22(%rip), %zmm27 #62.3 - vmovaps %zmm11, %zmm25 #62.3 - vpermt2pd 144(%rdx), %zmm27, %zmm25 #62.3 - vfmadd213pd %zmm26, %zmm4, %zmm25 #62.3 - vmovups .L_2il0floatpacket.23(%rip), %zmm26 #62.3 - vmovaps %zmm11, %zmm9 #62.3 - vpermt2pd 144(%rdx), %zmm26, %zmm9 #62.3 - vfmadd213pd %zmm25, %zmm21, %zmm9 #62.3 - vmovups .L_2il0floatpacket.24(%rip), %zmm25 #62.3 - vmovaps %zmm11, %zmm23 #62.3 - vpermt2pd 208(%rdx), %zmm25, %zmm23 #62.3 - vfmadd213pd %zmm9, %zmm14, %zmm23 #62.3 - vmovaps %zmm11, %zmm9 #62.3 - vpermt2pd 208(%rdx), %zmm24, %zmm9 #62.3 - vfmadd213pd %zmm23, %zmm10, %zmm9 #62.3 - vmovups .L_2il0floatpacket.26(%rip), %zmm23 #62.3 - vmovaps %zmm17, %zmm1 #62.3 - vmovaps %zmm11, %zmm3 #62.3 - vpermt2pd 144(%rdx), %zmm25, %zmm1 #62.3 - vpermt2pd 144(%rdx), %zmm23, %zmm3 #62.3 - vpermt2pd 144(%rdx), %zmm22, %zmm11 #62.3 - vmulpd %zmm1, %zmm7, %zmm31 #62.3 - vmulpd %zmm3, %zmm7, %zmm12 #62.3 - vmovups 96(%rcx,%r10), %zmm3 #71.3 - vfmadd213pd %zmm12, %zmm6, %zmm11 #62.3 - vmovaps %zmm17, %zmm7 #62.3 - vpermt2pd 144(%rdx), %zmm24, %zmm7 #62.3 - vfmadd213pd %zmm31, %zmm6, %zmm7 #62.3 - vmovaps %zmm17, %zmm6 #62.3 - vmovaps %zmm17, %zmm5 #62.3 - vpermt2pd 208(%rdx), %zmm23, %zmm6 #62.3 - vpermt2pd 208(%rdx), %zmm29, %zmm5 #62.3 - vfmadd213pd %zmm7, %zmm4, %zmm6 #62.3 - vfmadd213pd %zmm11, %zmm4, %zmm5 #62.3 - vmovups 96(%rcx,%rax), %zmm7 #71.3 - vmovaps %zmm17, %zmm2 #62.3 - vmovaps %zmm17, %zmm20 #62.3 - vmovaps %zmm17, %zmm11 #62.3 - vpermt2pd 208(%rdx), %zmm22, %zmm17 #62.3 - vpermt2pd 208(%rdx), %zmm28, %zmm2 #62.3 - vpermt2pd 208(%rdx), %zmm27, %zmm20 #62.3 - vpermt2pd 208(%rdx), %zmm26, %zmm11 #62.3 - vfmadd213pd %zmm6, %zmm21, %zmm17 #62.3 - vfmadd213pd %zmm5, %zmm21, %zmm2 #62.3 - vmovaps %zmm8, %zmm21 #62.3 - vpermt2pd 272(%rdx), %zmm29, %zmm21 #62.3 - vpermt2pd 272(%rdx), %zmm28, %zmm8 #62.3 - vfmadd213pd %zmm2, %zmm14, %zmm20 #62.3 - vfmadd213pd %zmm17, %zmm14, %zmm21 #62.3 - vfmadd213pd %zmm20, %zmm10, %zmm11 #62.3 - vfmadd213pd %zmm21, %zmm10, %zmm8 #62.3 - vmovups .L_2il0floatpacket.28(%rip), %zmm21 #64.3 - vmovups .L_2il0floatpacket.31(%rip), %zmm20 #71.3 - vpermpd %zmm9, %zmm21, %zmm17 #64.3 - vpermpd %zmm11, %zmm21, %zmm14 #65.3 - vpermpd %zmm8, %zmm21, %zmm4 #66.3 - vaddpd %zmm9, %zmm17, %zmm10{%k4}{z} #64.3 - vsubpd %zmm9, %zmm17, %zmm10{%k3} #64.3 - vaddpd %zmm11, %zmm14, %zmm9{%k4}{z} #65.3 - vmovups .L_2il0floatpacket.29(%rip), %zmm17 #71.3 - vsubpd %zmm11, %zmm14, %zmm9{%k3} #65.3 - vaddpd %zmm8, %zmm4, %zmm11{%k4}{z} #66.3 - vmovups .L_2il0floatpacket.30(%rip), %zmm14 #71.3 - vsubpd %zmm8, %zmm4, %zmm11{%k3} #66.3 - vmovups (%rcx,%r10), %zmm8 #70.3 - vmovups (%rcx,%rax), %zmm4 #70.3 - vmovaps %zmm3, %zmm12 #71.3 - vmovaps %zmm13, %zmm5 #70.3 - vpermt2pd %zmm7, %zmm17, %zmm12 #71.3 - vpermt2pd 160(%rcx,%r10), %zmm14, %zmm3 #71.3 
- vpermt2pd 160(%rcx,%rax), %zmm20, %zmm7 #71.3 - vpermi2pd %zmm4, %zmm8, %zmm5 #70.3 - vpermt2pd 64(%rcx,%r10), %zmm15, %zmm8 #70.3 - vpermt2pd 64(%rcx,%rax), %zmm16, %zmm4 #70.3 - vmovaps %zmm19, %zmm0 #71.3 - vmovaps %zmm19, %zmm6 #70.3 - vpermi2pd %zmm3, %zmm7, %zmm0 #71.3 - vpermi2pd %zmm8, %zmm4, %zmm6 #70.3 - vpermt2pd %zmm8, %zmm18, %zmm4 #70.3 - vpermt2pd %zmm3, %zmm18, %zmm7 #71.3 - vpermilpd $85, %zmm12, %zmm1 #73.8 - vaddpd %zmm1, %zmm5, %zmm2{%k1}{z} #74.8 - vpermilpd $85, %zmm0, %zmm8 #76.8 - movslq 8(%rdi), %r11 #83.16 - vsubpd %zmm1, %zmm5, %zmm2{%k2} #75.8 - vaddpd %zmm8, %zmm6, %zmm5{%k1}{z} #77.8 - vsubpd %zmm8, %zmm6, %zmm5{%k2} #78.8 - lea (%r11,%r11,2), %r10 #83.8 - vpermilpd $85, %zmm7, %zmm6 #79.8 - shlq $6, %r10 #83.8 - vaddpd %zmm6, %zmm4, %zmm8{%k1}{z} #80.8 - vsubpd %zmm6, %zmm4, %zmm8{%k2} #81.8 - prefetcht0 (%rcx,%r10) #84.3 - movslq 8(%rsi), %rax #85.16 - vpermilpd $85, %zmm2, %zmm4 #90.3 - vpermilpd $85, %zmm5, %zmm12 #90.3 - vpermilpd $85, %zmm8, %zmm1 #90.3 - lea (%rax,%rax,2), %r9 #85.8 - shlq $6, %r9 #85.8 - movl $63, %eax #117.3 - kmovw %eax, %k1 #117.3 - movl $192, %eax #117.3 - kmovw %eax, %k2 #117.3 - vmulpd %zmm30, %zmm4, %zmm3 #90.3 - vmulpd %zmm12, %zmm30, %zmm4 #90.3 - vmulpd %zmm1, %zmm30, %zmm7 #90.3 - prefetcht0 (%rcx,%r9) #86.3 - movl $195, %eax #101.8 - vmovups 288(%rdx), %zmm1 #90.3 - vmovups 352(%rdx), %zmm12 #90.3 - vmovups 416(%rdx), %zmm6 #90.3 - vmovaps %zmm1, %zmm31 #90.3 - vpermt2pd 432(%rdx), %zmm29, %zmm31 #90.3 - vmulpd %zmm2, %zmm31, %zmm0 #90.3 - vmovaps %zmm1, %zmm31 #90.3 - vpermt2pd 432(%rdx), %zmm28, %zmm31 #90.3 - vfmadd213pd %zmm0, %zmm3, %zmm31 #90.3 - vmovaps %zmm1, %zmm0 #90.3 - vpermt2pd 432(%rdx), %zmm27, %zmm0 #90.3 - vfmadd213pd %zmm31, %zmm5, %zmm0 #90.3 - vmovaps %zmm1, %zmm31 #90.3 - vpermt2pd 432(%rdx), %zmm26, %zmm31 #90.3 - vfmadd213pd %zmm0, %zmm4, %zmm31 #90.3 - vmovaps %zmm1, %zmm0 #90.3 - vpermt2pd 496(%rdx), %zmm25, %zmm0 #90.3 - vfmadd213pd %zmm31, %zmm8, %zmm0 #90.3 - vmovaps %zmm1, %zmm31 #90.3 - vpermt2pd 496(%rdx), %zmm24, %zmm31 #90.3 - vfmadd213pd %zmm0, %zmm7, %zmm31 #90.3 - vmovaps %zmm1, %zmm0 #90.3 - vpermt2pd 432(%rdx), %zmm23, %zmm0 #90.3 - vpermt2pd 432(%rdx), %zmm22, %zmm1 #90.3 - vmulpd %zmm0, %zmm2, %zmm0 #90.3 - vfmadd213pd %zmm0, %zmm3, %zmm1 #90.3 - vmovaps %zmm12, %zmm0 #90.3 - vpermt2pd 496(%rdx), %zmm29, %zmm0 #90.3 - vfmadd213pd %zmm1, %zmm5, %zmm0 #90.3 - vmovaps %zmm12, %zmm1 #90.3 - vpermt2pd 496(%rdx), %zmm28, %zmm1 #90.3 - vfmadd213pd %zmm0, %zmm4, %zmm1 #90.3 - vmovaps %zmm12, %zmm0 #90.3 - vpermt2pd 496(%rdx), %zmm27, %zmm0 #90.3 - vfmadd213pd %zmm1, %zmm8, %zmm0 #90.3 - vmovaps %zmm12, %zmm1 #90.3 - vpermt2pd 496(%rdx), %zmm26, %zmm1 #90.3 - vfmadd213pd %zmm0, %zmm7, %zmm1 #90.3 - vmovaps %zmm12, %zmm0 #90.3 - vpermt2pd 432(%rdx), %zmm25, %zmm0 #90.3 - vmulpd %zmm0, %zmm2, %zmm2 #90.3 - vmovaps %zmm12, %zmm0 #90.3 - vpermt2pd 432(%rdx), %zmm24, %zmm0 #90.3 - vfmadd213pd %zmm2, %zmm3, %zmm0 #90.3 - vmovups .L_2il0floatpacket.32(%rip), %zmm2 #92.3 - vmovaps %zmm12, %zmm3 #90.3 - vpermt2pd 496(%rdx), %zmm23, %zmm3 #90.3 - vpermt2pd 496(%rdx), %zmm22, %zmm12 #90.3 - vfmadd213pd %zmm0, %zmm5, %zmm3 #90.3 - vfmadd213pd %zmm3, %zmm4, %zmm12 #90.3 - vpermpd %zmm31, %zmm21, %zmm4 #92.3 - vmovaps %zmm6, %zmm5 #90.3 - vpermt2pd 560(%rdx), %zmm29, %zmm5 #90.3 - vpermt2pd 560(%rdx), %zmm28, %zmm6 #90.3 - vaddpd %zmm31, %zmm4, %zmm4{%k4} #92.3 - vfmadd213pd %zmm12, %zmm8, %zmm5 #90.3 - vsubpd %zmm4, %zmm31, %zmm4{%k3} #92.3 - vpermpd %zmm1, %zmm21, %zmm8 #93.3 - vfmadd213pd %zmm5, %zmm7, 
%zmm6 #90.3 - vmovups 96(%rcx,%r9), %zmm31 #99.3 - vpermpd %zmm4, %zmm2, %zmm7 #92.3 - vpermpd %zmm6, %zmm21, %zmm0 #94.3 - vaddpd %zmm1, %zmm8, %zmm8{%k4} #93.3 - vaddpd %zmm7, %zmm10, %zmm10{%k5} #92.3 - vaddpd %zmm6, %zmm0, %zmm0{%k4} #94.3 - vsubpd %zmm8, %zmm1, %zmm8{%k3} #93.3 - vsubpd %zmm7, %zmm10, %zmm10{%k6} #92.3 - vsubpd %zmm0, %zmm6, %zmm0{%k3} #94.3 - vpermpd %zmm8, %zmm2, %zmm12 #93.3 - vmovups (%rcx,%r9), %zmm1 #98.3 - vmovups (%rcx,%r10), %zmm7 #98.3 - vmovups 96(%rcx,%r10), %zmm8 #99.3 - vpermpd %zmm0, %zmm2, %zmm6 #94.3 - vaddpd %zmm12, %zmm9, %zmm9{%k5} #93.3 - vpermi2pd %zmm8, %zmm31, %zmm17 #99.3 - vaddpd %zmm6, %zmm11, %zmm11{%k5} #94.3 - vpermt2pd 160(%rcx,%r10), %zmm20, %zmm8 #99.3 - vpermt2pd 160(%rcx,%r9), %zmm14, %zmm31 #99.3 - vsubpd %zmm6, %zmm11, %zmm11{%k6} #94.3 - vsubpd %zmm12, %zmm9, %zmm9{%k6} #93.3 - kmovw %eax, %k5 #101.8 - vmovaps %zmm13, %zmm3 #98.3 - vpermi2pd %zmm7, %zmm1, %zmm3 #98.3 - vpermt2pd 64(%rcx,%r9), %zmm15, %zmm1 #98.3 - vpermt2pd 64(%rcx,%r10), %zmm16, %zmm7 #98.3 - vaddpd %zmm17, %zmm3, %zmm2{%k5}{z} #101.8 - movslq 12(%rdi), %rdi #108.15 - vmovaps %zmm19, %zmm6 #98.3 - vmovaps %zmm19, %zmm20 #99.3 - vpermi2pd %zmm1, %zmm7, %zmm6 #98.3 - lea (%rdi,%rdi,2), %r9 #108.8 - vpermt2pd %zmm1, %zmm18, %zmm7 #98.3 - vpermi2pd %zmm31, %zmm8, %zmm20 #99.3 - vpermt2pd %zmm31, %zmm18, %zmm8 #99.3 - vsubpd %zmm17, %zmm3, %zmm2{%k7} #102.8 - vaddpd %zmm20, %zmm6, %zmm3{%k5}{z} #103.8 - vaddpd %zmm8, %zmm7, %zmm4{%k5}{z} #105.8 - vsubpd %zmm20, %zmm6, %zmm3{%k7} #104.8 - vsubpd %zmm8, %zmm7, %zmm4{%k7} #106.8 - shlq $6, %r9 #108.8 - prefetcht0 (%rcx,%r9) #109.3 - movslq 12(%rsi), %rsi #110.15 - vpermilpd $85, %zmm2, %zmm6 #115.3 - vpermilpd $85, %zmm3, %zmm17 #115.3 - vpermilpd $85, %zmm4, %zmm14 #115.3 - lea (%rsi,%rsi,2), %rax #110.8 - shlq $6, %rax #110.8 - movl $150, %esi #127.8 - kmovw %esi, %k5 #127.8 - movl $105, %esi #128.8 - kmovw %esi, %k6 #128.8 - vmulpd %zmm30, %zmm6, %zmm5 #115.3 - vmulpd %zmm17, %zmm30, %zmm12 #115.3 - vmulpd %zmm14, %zmm30, %zmm8 #115.3 - prefetcht0 (%rcx,%rax) #111.3 - vmovups 576(%rdx), %zmm20 #115.3 - vmovups 640(%rdx), %zmm7 #115.3 - vmovups 704(%rdx), %zmm6 #115.3 - vmovaps %zmm20, %zmm17 #115.3 - vpermt2pd 720(%rdx), %zmm29, %zmm17 #115.3 - vmulpd %zmm2, %zmm17, %zmm14 #115.3 - vmovaps %zmm20, %zmm0 #115.3 - vpermt2pd 720(%rdx), %zmm28, %zmm0 #115.3 - vfmadd213pd %zmm14, %zmm5, %zmm0 #115.3 - vmovaps %zmm20, %zmm1 #115.3 - vpermt2pd 720(%rdx), %zmm27, %zmm1 #115.3 - vfmadd213pd %zmm0, %zmm3, %zmm1 #115.3 - vmovaps %zmm20, %zmm31 #115.3 - vmovaps %zmm20, %zmm0 #115.3 - vpermt2pd 720(%rdx), %zmm26, %zmm31 #115.3 - vpermt2pd 720(%rdx), %zmm23, %zmm0 #115.3 - vfmadd213pd %zmm1, %zmm12, %zmm31 #115.3 - vmulpd %zmm0, %zmm2, %zmm1 #115.3 - vmovaps %zmm20, %zmm17 #115.3 - vpermt2pd 784(%rdx), %zmm25, %zmm17 #115.3 - vfmadd213pd %zmm31, %zmm4, %zmm17 #115.3 - vmovaps %zmm20, %zmm14 #115.3 - vpermt2pd 720(%rdx), %zmm22, %zmm20 #115.3 - vpermt2pd 784(%rdx), %zmm24, %zmm14 #115.3 - vfmadd213pd %zmm1, %zmm5, %zmm20 #115.3 - vfmadd213pd %zmm17, %zmm8, %zmm14 #115.3 - vmovaps %zmm7, %zmm17 #115.3 - vpermt2pd 784(%rdx), %zmm29, %zmm17 #115.3 - vfmadd213pd %zmm20, %zmm3, %zmm17 #115.3 - vmovaps %zmm7, %zmm20 #115.3 - vpermt2pd 784(%rdx), %zmm28, %zmm20 #115.3 - vmovaps %zmm7, %zmm1 #115.3 - vpermt2pd 720(%rdx), %zmm25, %zmm1 #115.3 - vfmadd213pd %zmm17, %zmm12, %zmm20 #115.3 - vmulpd %zmm1, %zmm2, %zmm2 #115.3 - vmovaps %zmm7, %zmm0 #115.3 - vpermt2pd 784(%rdx), %zmm27, %zmm0 #115.3 - vfmadd213pd %zmm20, %zmm4, %zmm0 #115.3 - 
vmovaps %zmm7, %zmm17 #115.3 - vpermt2pd 784(%rdx), %zmm26, %zmm17 #115.3 - vfmadd213pd %zmm0, %zmm8, %zmm17 #115.3 - vmovaps %zmm7, %zmm0 #115.3 - vpermt2pd 720(%rdx), %zmm24, %zmm0 #115.3 - vfmadd213pd %zmm2, %zmm5, %zmm0 #115.3 - vmovaps %zmm7, %zmm5 #115.3 - vpermt2pd 784(%rdx), %zmm23, %zmm5 #115.3 - vpermt2pd 784(%rdx), %zmm22, %zmm7 #115.3 - vfmadd213pd %zmm0, %zmm3, %zmm5 #115.3 - vmovups .L_2il0floatpacket.33(%rip), %zmm0 #117.3 - vfmadd213pd %zmm5, %zmm12, %zmm7 #115.3 - vmovups 96(%rcx,%rax), %zmm5 #124.3 - vmovaps %zmm6, %zmm3 #115.3 - vpermt2pd 848(%rdx), %zmm29, %zmm3 #115.3 - vpermt2pd 848(%rdx), %zmm28, %zmm6 #115.3 - vfmadd213pd %zmm7, %zmm4, %zmm3 #115.3 - vpermpd %zmm14, %zmm21, %zmm4 #117.3 - vfmadd213pd %zmm3, %zmm8, %zmm6 #115.3 - vmovups (%rcx,%rax), %zmm3 #123.3 - vaddpd %zmm14, %zmm4, %zmm4{%k4} #117.3 - vpermpd %zmm6, %zmm21, %zmm1 #119.3 - vsubpd %zmm4, %zmm14, %zmm4{%k3} #117.3 - vaddpd %zmm6, %zmm1, %zmm1{%k4} #119.3 - vpermpd %zmm4, %zmm0, %zmm12 #117.3 - vpermpd %zmm17, %zmm21, %zmm4 #118.3 - vsubpd %zmm1, %zmm6, %zmm1{%k3} #119.3 - vaddpd %zmm12, %zmm10, %zmm10{%k1} #117.3 - vaddpd %zmm17, %zmm4, %zmm4{%k4} #118.3 - vpermpd %zmm1, %zmm0, %zmm2 #119.3 - vsubpd %zmm12, %zmm10, %zmm10{%k2} #117.3 - vsubpd %zmm4, %zmm17, %zmm4{%k3} #118.3 - vaddpd %zmm2, %zmm11, %zmm11{%k1} #119.3 - vpermpd %zmm4, %zmm0, %zmm12 #118.3 - vsubpd %zmm2, %zmm11, %zmm11{%k2} #119.3 - vmovups (%rcx,%r9), %zmm4 #123.3 - vmovups 96(%rcx,%r9), %zmm2 #124.3 - vaddpd %zmm12, %zmm9, %zmm9{%k1} #118.3 - vmovaps %zmm13, %zmm6 #123.3 - vpermi2pd %zmm4, %zmm3, %zmm6 #123.3 - vpermt2pd 64(%rcx,%r9), %zmm16, %zmm4 #123.3 - vpermt2pd 64(%rcx,%rax), %zmm15, %zmm3 #123.3 - vpermi2pd %zmm2, %zmm5, %zmm13 #124.3 - vpermt2pd 160(%rcx,%rax), %zmm15, %zmm5 #124.3 - vpermt2pd 160(%rcx,%r9), %zmm16, %zmm2 #124.3 - vsubpd %zmm12, %zmm9, %zmm9{%k2} #118.3 - vmovaps %zmm19, %zmm0 #123.3 - vpermi2pd %zmm3, %zmm4, %zmm0 #123.3 - vpermt2pd %zmm3, %zmm18, %zmm4 #123.3 - vpermi2pd %zmm5, %zmm2, %zmm19 #124.3 - vpermt2pd %zmm5, %zmm18, %zmm2 #124.3 - vpermilpd $85, %zmm13, %zmm18 #126.8 - vaddpd %zmm18, %zmm6, %zmm13{%k5}{z} #127.8 - vpermilpd $85, %zmm19, %zmm1 #129.8 - vsubpd %zmm18, %zmm6, %zmm13{%k6} #128.8 - vaddpd %zmm1, %zmm0, %zmm12{%k5}{z} #130.8 - # LOE rdx rbx r8 r12 r13 r14 r15 zmm0 zmm1 zmm2 zmm4 zmm9 zmm10 zmm11 zmm12 zmm13 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 k3 k4 k5 k6 -..B1.4: # Preds ..B1.1 - # Execution count [1.00e+00] - vpermilpd $85, %zmm2, %zmm14 #132.8 - movl $111, %eax #140.3 - vmovups 864(%rdx), %zmm3 #138.3 - vmovups 1008(%rdx), %zmm2 #138.3 - vmovups 928(%rdx), %zmm5 #138.3 - vmovups 992(%rdx), %zmm7 #138.3 - vmovups 1136(%rdx), %zmm6 #138.3 - vsubpd %zmm1, %zmm0, %zmm12{%k6} #131.8 - vaddpd %zmm14, %zmm4, %zmm8{%k5}{z} #133.8 - kmovw %eax, %k1 #140.3 - vsubpd %zmm14, %zmm4, %zmm8{%k6} #134.8 - vmovups 1072(%rdx), %zmm4 #138.3 - vmovaps %zmm29, %zmm18 #138.3 - movl $144, %eax #140.3 - vpermi2pd %zmm2, %zmm3, %zmm18 #138.3 - kmovw %eax, %k2 #140.3 - vmulpd %zmm13, %zmm18, %zmm19 #138.3 - vpermilpd $85, %zmm13, %zmm15 #138.3 - vpermilpd $85, %zmm12, %zmm16 #138.3 - vmulpd %zmm30, %zmm15, %zmm1 #138.3 - vmulpd %zmm16, %zmm30, %zmm0 #138.3 - vmovaps %zmm23, %zmm16 #138.3 - vmovaps %zmm25, %zmm15 #138.3 - vpermi2pd %zmm2, %zmm3, %zmm16 #138.3 - vpermi2pd %zmm2, %zmm5, %zmm25 #138.3 - vpermi2pd %zmm4, %zmm3, %zmm15 #138.3 - vpermi2pd %zmm4, %zmm5, %zmm23 #138.3 - vpermilpd $85, %zmm8, %zmm17 #138.3 - vmulpd %zmm17, %zmm30, %zmm30 #138.3 - vmulpd %zmm16, %zmm13, %zmm17 
#138.3 - vmulpd %zmm25, %zmm13, %zmm13 #138.3 - vmovaps %zmm28, %zmm20 #138.3 - vpermi2pd %zmm2, %zmm3, %zmm20 #138.3 - vfmadd213pd %zmm19, %zmm1, %zmm20 #138.3 - vmovaps %zmm27, %zmm31 #138.3 - vmovaps %zmm26, %zmm14 #138.3 - vmovaps %zmm24, %zmm19 #138.3 - vpermi2pd %zmm2, %zmm3, %zmm31 #138.3 - vpermi2pd %zmm2, %zmm3, %zmm14 #138.3 - vpermi2pd %zmm4, %zmm3, %zmm19 #138.3 - vpermt2pd %zmm2, %zmm22, %zmm3 #138.3 - vpermi2pd %zmm2, %zmm5, %zmm24 #138.3 - vpermi2pd %zmm4, %zmm5, %zmm27 #138.3 - vpermi2pd %zmm4, %zmm5, %zmm26 #138.3 - vfmadd213pd %zmm17, %zmm1, %zmm3 #138.3 - vfmadd213pd %zmm13, %zmm1, %zmm24 #138.3 - vfmadd213pd %zmm20, %zmm12, %zmm31 #138.3 - vfmadd213pd %zmm24, %zmm12, %zmm23 #138.3 - vfmadd213pd %zmm31, %zmm0, %zmm14 #138.3 - vmovups .L_2il0floatpacket.34(%rip), %zmm24 #140.3 - vfmadd213pd %zmm14, %zmm8, %zmm15 #138.3 - vmovaps %zmm29, %zmm18 #138.3 - vpermi2pd %zmm4, %zmm5, %zmm18 #138.3 - vpermi2pd %zmm6, %zmm7, %zmm29 #138.3 - vpermt2pd %zmm6, %zmm28, %zmm7 #138.3 - vfmadd213pd %zmm3, %zmm12, %zmm18 #138.3 - vfmadd213pd %zmm15, %zmm30, %zmm19 #138.3 - vmovaps %zmm28, %zmm3 #138.3 - vpermi2pd %zmm4, %zmm5, %zmm3 #138.3 - vpermt2pd %zmm4, %zmm22, %zmm5 #138.3 - vpermpd %zmm19, %zmm21, %zmm12 #140.3 - vfmadd213pd %zmm18, %zmm0, %zmm3 #138.3 - vfmadd213pd %zmm23, %zmm0, %zmm5 #138.3 - vmovups .L_2il0floatpacket.35(%rip), %zmm28 #150.3 - vaddpd %zmm19, %zmm12, %zmm12{%k4} #140.3 - vfmadd213pd %zmm3, %zmm8, %zmm27 #138.3 - vfmadd213pd %zmm5, %zmm8, %zmm29 #138.3 - vsubpd %zmm12, %zmm19, %zmm12{%k3} #140.3 - vfmadd213pd %zmm27, %zmm30, %zmm26 #138.3 - vfmadd213pd %zmm29, %zmm30, %zmm7 #138.3 - vmovups .L_2il0floatpacket.36(%rip), %zmm29 #150.3 - vpermpd %zmm26, %zmm21, %zmm23 #141.3 - vpermpd %zmm7, %zmm21, %zmm21 #142.3 - vpermpd %zmm12, %zmm24, %zmm22 #140.3 - vaddpd %zmm26, %zmm23, %zmm23{%k4} #141.3 - vaddpd %zmm7, %zmm21, %zmm21{%k4} #142.3 - vaddpd %zmm22, %zmm10, %zmm10{%k1} #140.3 - vsubpd %zmm23, %zmm26, %zmm23{%k3} #141.3 - vsubpd %zmm21, %zmm7, %zmm21{%k3} #142.3 - vsubpd %zmm22, %zmm10, %zmm10{%k2} #140.3 - vpermpd %zmm23, %zmm24, %zmm26 #141.3 - vpermpd %zmm21, %zmm24, %zmm25 #142.3 - vbroadcastsd -16(%rbp), %zmm27 #145.10 - vaddpd %zmm26, %zmm9, %zmm9{%k1} #141.3 - vaddpd %zmm25, %zmm11, %zmm11{%k1} #142.3 - vmulpd %zmm10, %zmm27, %zmm10 #146.8 - vsubpd %zmm26, %zmm9, %zmm9{%k2} #141.3 - vsubpd %zmm25, %zmm11, %zmm11{%k2} #142.3 - vmulpd %zmm9, %zmm27, %zmm0 #147.8 - vmulpd %zmm11, %zmm27, %zmm9 #148.8 - vmovups .L_2il0floatpacket.37(%rip), %zmm11 #150.3 - vpermi2pd %zmm0, %zmm10, %zmm28 #150.3 - vpermi2pd %zmm10, %zmm9, %zmm29 #150.3 - vpermt2pd %zmm9, %zmm11, %zmm0 #150.3 - vmovupd %ymm28, (%r8) #150.3 - vmovupd %ymm29, 32(%r8) #150.3 - vmovupd %ymm0, 64(%r8) #150.3 - vextractf64x4 $1, %zmm28, 96(%r8) #150.3 - vextractf64x4 $1, %zmm29, 128(%r8) #150.3 - vextractf64x4 $1, %zmm0, 160(%r8) #150.3 - vzeroupper #151.1 - movq %rbp, %rsp #151.1 - popq %rbp #151.1 - .cfi_restore 6 - ret #151.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type doe_dble_avx512,@function - .size doe_dble_avx512,.-doe_dble_avx512 - .data -# -- End doe_dble_avx512 - .text -# -- Begin deo_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl deo_dble_avx512 -# --- deo_dble_avx512(const int *, const int *, const su3_dble *, spinor_dble *, double, spin_t *) -deo_dble_avx512: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %xmm0 -# parameter 6: %r8 -..B2.1: # Preds ..B2.0 - # Execution count [1.00e+00] - .cfi_startproc 
-..___tag_value_deo_dble_avx512.8: -..L9: - #154.1 - pushq %rbp #154.1 - .cfi_def_cfa_offset 16 - movq %rsp, %rbp #154.1 - .cfi_def_cfa 6, 16 - .cfi_offset 6, -16 - andq $-64, %rsp #154.1 - movslq (%rdi), %rax #168.16 - lea (%rax,%rax,2), %r11 #168.8 - shlq $6, %r11 #168.8 - prefetcht0 (%rcx,%r11) #169.3 - movl $15, %eax #181.3 - movslq (%rsi), %r9 #170.16 - kmovw %eax, %k5 #181.3 - movl $240, %eax #181.3 - kmovw %eax, %k6 #181.3 - vbroadcastsd %xmm0, %zmm24 #176.10 - movl $90, %eax #198.3 - lea (%r9,%r9,2), %r10 #170.8 - shlq $6, %r10 #170.8 - kmovw %eax, %k1 #198.3 - movl $165, %eax #198.3 - kmovw %eax, %k2 #198.3 - movl $195, %eax #215.3 - kmovw %eax, %k4 #215.3 - movl $60, %eax #215.3 - kmovw %eax, %k3 #215.3 - prefetcht0 (%rcx,%r10) #171.3 - vmovups 96(%r8), %zmm27 #173.3 - vmovups (%r8), %zmm23 #173.3 - vmovups .L_2il0floatpacket.14(%rip), %zmm26 #173.3 - vmovups .L_2il0floatpacket.15(%rip), %zmm30 #173.3 - vmovups .L_2il0floatpacket.16(%rip), %zmm29 #173.3 - vmovups .L_2il0floatpacket.17(%rip), %zmm25 #173.3 - vmovups .L_2il0floatpacket.18(%rip), %zmm28 #173.3 - vmovups .L_2il0floatpacket.28(%rip), %zmm20 #181.3 - vmovups 144(%rdx), %zmm18 #187.3 - vmovups 208(%rdx), %zmm13 #187.3 - vmovups 272(%rdx), %zmm1 #187.3 - vmovups (%rcx,%r10), %zmm3 #189.3 - vpermi2pd %zmm23, %zmm27, %zmm26 #173.3 - vpermt2pd 160(%r8), %zmm30, %zmm27 #173.3 - vpermt2pd 64(%r8), %zmm29, %zmm23 #173.3 - vmulpd %zmm26, %zmm24, %zmm30 #177.8 - vpermi2pd %zmm27, %zmm23, %zmm25 #173.3 - vpermt2pd %zmm27, %zmm28, %zmm23 #173.3 - vpermpd %zmm30, %zmm20, %zmm22 #181.3 - vmulpd %zmm25, %zmm24, %zmm29 #178.8 - vmulpd %zmm23, %zmm24, %zmm28 #179.8 - vmovups .L_2il0floatpacket.19(%rip), %zmm27 #187.3 - vpermpd %zmm29, %zmm20, %zmm21 #182.3 - vaddpd %zmm30, %zmm22, %zmm6{%k5}{z} #181.3 - vpermpd %zmm28, %zmm20, %zmm19 #183.3 - vaddpd %zmm29, %zmm21, %zmm10{%k5}{z} #182.3 - vsubpd %zmm30, %zmm22, %zmm6{%k6} #181.3 - vaddpd %zmm28, %zmm19, %zmm12{%k5}{z} #183.3 - vsubpd %zmm29, %zmm21, %zmm10{%k6} #182.3 - vsubpd %zmm28, %zmm19, %zmm12{%k6} #183.3 - vmovups .L_2il0floatpacket.27(%rip), %zmm19 #187.3 - vpermilpd $85, %zmm10, %zmm26 #187.3 - vmulpd %zmm26, %zmm27, %zmm16 #187.3 - vmovups .L_2il0floatpacket.20(%rip), %zmm26 #187.3 - vmovaps %zmm18, %zmm24 #187.3 - vpermt2pd (%rdx), %zmm26, %zmm24 #187.3 - vpermilpd $85, %zmm6, %zmm15 #187.3 - vmulpd %zmm27, %zmm15, %zmm17 #187.3 - vmulpd %zmm6, %zmm24, %zmm23 #187.3 - vmovups .L_2il0floatpacket.22(%rip), %zmm24 #187.3 - vpermilpd $85, %zmm12, %zmm25 #187.3 - vmulpd %zmm25, %zmm27, %zmm15 #187.3 - vmovups .L_2il0floatpacket.21(%rip), %zmm25 #187.3 - vmovaps %zmm18, %zmm22 #187.3 - vpermt2pd (%rdx), %zmm25, %zmm22 #187.3 - vfmadd213pd %zmm23, %zmm17, %zmm22 #187.3 - vmovups .L_2il0floatpacket.23(%rip), %zmm23 #187.3 - vmovaps %zmm18, %zmm14 #187.3 - vpermt2pd (%rdx), %zmm24, %zmm14 #187.3 - vfmadd213pd %zmm22, %zmm10, %zmm14 #187.3 - vmovups .L_2il0floatpacket.24(%rip), %zmm22 #187.3 - vmovaps %zmm18, %zmm21 #187.3 - vpermt2pd (%rdx), %zmm23, %zmm21 #187.3 - vfmadd213pd %zmm14, %zmm16, %zmm21 #187.3 - vmovaps %zmm18, %zmm20 #187.3 - vpermt2pd 64(%rdx), %zmm22, %zmm20 #187.3 - vfmadd213pd %zmm21, %zmm12, %zmm20 #187.3 - vmovups .L_2il0floatpacket.25(%rip), %zmm21 #187.3 - vmovaps %zmm18, %zmm14 #187.3 - vpermt2pd 64(%rdx), %zmm21, %zmm14 #187.3 - vfmadd213pd %zmm20, %zmm15, %zmm14 #187.3 - vmovups .L_2il0floatpacket.26(%rip), %zmm20 #187.3 - vmovaps %zmm18, %zmm11 #187.3 - vpermt2pd (%rdx), %zmm20, %zmm11 #187.3 - vpermt2pd (%rdx), %zmm19, %zmm18 #187.3 - vmulpd %zmm11, %zmm6, 
%zmm31 #187.3 - vfmadd213pd %zmm31, %zmm17, %zmm18 #187.3 - vmovaps %zmm13, %zmm8 #187.3 - vmovaps %zmm13, %zmm2 #187.3 - vpermt2pd 64(%rdx), %zmm26, %zmm8 #187.3 - vpermt2pd (%rdx), %zmm22, %zmm2 #187.3 - vfmadd213pd %zmm18, %zmm10, %zmm8 #187.3 - vmulpd %zmm2, %zmm6, %zmm18 #187.3 - vmovaps %zmm13, %zmm5 #187.3 - vpermt2pd (%rdx), %zmm21, %zmm5 #187.3 - vfmadd213pd %zmm18, %zmm17, %zmm5 #187.3 - vmovups .L_2il0floatpacket.38(%rip), %zmm18 #189.3 - vmovaps %zmm13, %zmm17 #187.3 - vpermt2pd 64(%rdx), %zmm20, %zmm17 #187.3 - vfmadd213pd %zmm5, %zmm10, %zmm17 #187.3 - vmovups 64(%rcx,%r10), %zmm10 #189.3 - vmovaps %zmm13, %zmm9 #187.3 - vmovaps %zmm13, %zmm7 #187.3 - vmovaps %zmm13, %zmm11 #187.3 - vpermt2pd 64(%rdx), %zmm19, %zmm13 #187.3 - vpermt2pd 64(%rdx), %zmm25, %zmm9 #187.3 - vpermt2pd 64(%rdx), %zmm24, %zmm7 #187.3 - vpermt2pd 64(%rdx), %zmm23, %zmm11 #187.3 - vpermt2pd 64(%rcx,%r11), %zmm18, %zmm10 #189.3 - vfmadd213pd %zmm17, %zmm16, %zmm13 #187.3 - vfmadd213pd %zmm8, %zmm16, %zmm9 #187.3 - vmovups 96(%rcx,%r10), %zmm8 #190.3 - vmovups .L_2il0floatpacket.39(%rip), %zmm17 #189.3 - vfmadd213pd %zmm9, %zmm12, %zmm7 #187.3 - vmovups 160(%rcx,%r10), %zmm9 #190.3 - vfmadd213pd %zmm7, %zmm15, %zmm11 #187.3 - vpermt2pd 160(%rcx,%r11), %zmm18, %zmm9 #190.3 - vmovaps %zmm1, %zmm16 #187.3 - vpermt2pd 128(%rdx), %zmm26, %zmm16 #187.3 - vpermt2pd 128(%rdx), %zmm25, %zmm1 #187.3 - vfmadd213pd %zmm13, %zmm12, %zmm16 #187.3 - vfmadd213pd %zmm16, %zmm15, %zmm1 #187.3 - vmovups .L_2il0floatpacket.36(%rip), %zmm15 #189.3 - vmovups .L_2il0floatpacket.35(%rip), %zmm16 #189.3 - vmovaps %zmm1, %zmm12 #189.3 - vmovaps %zmm14, %zmm13 #189.3 - vpermt2pd %zmm14, %zmm15, %zmm12 #189.3 - vpermt2pd %zmm11, %zmm16, %zmm13 #189.3 - vmovups .L_2il0floatpacket.37(%rip), %zmm14 #189.3 - vmovaps %zmm8, %zmm0 #190.3 - vpermt2pd %zmm1, %zmm14, %zmm11 #189.3 - vpermt2pd 96(%rcx,%r11), %zmm18, %zmm0 #190.3 - vpermt2pd 96(%rcx,%r11), %zmm17, %zmm8 #190.3 - vaddpd %zmm11, %zmm9, %zmm9{%k5} #190.3 - vaddpd %zmm13, %zmm0, %zmm0{%k5} #190.3 - vaddpd %zmm12, %zmm8, %zmm8{%k5} #190.3 - vaddpd %zmm11, %zmm10, %zmm31 #189.3 - vsubpd %zmm13, %zmm0, %zmm0{%k6} #190.3 - vsubpd %zmm12, %zmm8, %zmm8{%k6} #190.3 - vsubpd %zmm11, %zmm9, %zmm9{%k6} #190.3 - vmovaps %zmm3, %zmm4 #189.3 - vpermt2pd (%rcx,%r11), %zmm18, %zmm4 #189.3 - vpermt2pd (%rcx,%r11), %zmm17, %zmm3 #189.3 - vaddpd %zmm13, %zmm4, %zmm6 #189.3 - vaddpd %zmm12, %zmm3, %zmm1 #189.3 - movslq 4(%rdi), %rax #193.16 - vmovupd %ymm6, (%rcx,%r10) #189.3 - vmovupd %ymm1, 32(%rcx,%r10) #189.3 - vmovupd %ymm31, 64(%rcx,%r10) #189.3 - vextractf64x4 $1, %zmm6, (%rcx,%r11) #189.3 - vextractf64x4 $1, %zmm1, 32(%rcx,%r11) #189.3 - vextractf64x4 $1, %zmm31, 64(%rcx,%r11) #189.3 - vmovupd %ymm0, 96(%rcx,%r10) #190.3 - vmovupd %ymm8, 128(%rcx,%r10) #190.3 - vmovupd %ymm9, 160(%rcx,%r10) #190.3 - vextractf64x4 $1, %zmm0, 96(%rcx,%r11) #190.3 - vextractf64x4 $1, %zmm8, 128(%rcx,%r11) #190.3 - vextractf64x4 $1, %zmm9, 160(%rcx,%r11) #190.3 - lea (%rax,%rax,2), %r10 #193.8 - shlq $6, %r10 #193.8 - prefetcht0 (%rcx,%r10) #194.3 - movslq 4(%rsi), %r8 #195.16 - lea (%r8,%r8,2), %r9 #195.8 - shlq $6, %r9 #195.8 - prefetcht0 (%rcx,%r9) #196.3 - vmovups .L_2il0floatpacket.40(%rip), %zmm7 #198.3 - vmovups .L_2il0floatpacket.41(%rip), %zmm4 #198.3 - vmovups 432(%rdx), %zmm6 #204.3 - vmovups 496(%rdx), %zmm8 #204.3 - vmovups 560(%rdx), %zmm9 #204.3 - vpermpd %zmm30, %zmm7, %zmm12 #198.3 - vpermpd %zmm30, %zmm4, %zmm13 #198.3 - vpermpd %zmm29, %zmm7, %zmm11 #199.3 - vpermpd %zmm28, %zmm7, %zmm3 #200.3 
- vaddpd %zmm12, %zmm13, %zmm5{%k1}{z} #198.3 - vsubpd %zmm12, %zmm13, %zmm5{%k2} #198.3 - vpermpd %zmm29, %zmm4, %zmm12 #199.3 - vaddpd %zmm11, %zmm12, %zmm2{%k1}{z} #199.3 - vsubpd %zmm11, %zmm12, %zmm2{%k2} #199.3 - vpermpd %zmm28, %zmm4, %zmm11 #200.3 - vaddpd %zmm3, %zmm11, %zmm7{%k1}{z} #200.3 - vsubpd %zmm3, %zmm11, %zmm7{%k2} #200.3 - vmovaps %zmm6, %zmm3 #204.3 - vpermt2pd 288(%rdx), %zmm26, %zmm3 #204.3 - vpermilpd $85, %zmm5, %zmm10 #204.3 - vmulpd %zmm27, %zmm10, %zmm0 #204.3 - vmulpd %zmm5, %zmm3, %zmm10 #204.3 - vpermilpd $85, %zmm2, %zmm1 #204.3 - vpermilpd $85, %zmm7, %zmm4 #204.3 - vmulpd %zmm1, %zmm27, %zmm31 #204.3 - vmulpd %zmm4, %zmm27, %zmm1 #204.3 - vmovaps %zmm6, %zmm4 #204.3 - vpermt2pd 288(%rdx), %zmm25, %zmm4 #204.3 - vfmadd213pd %zmm10, %zmm0, %zmm4 #204.3 - vmovaps %zmm6, %zmm10 #204.3 - vpermt2pd 288(%rdx), %zmm24, %zmm10 #204.3 - vfmadd213pd %zmm4, %zmm2, %zmm10 #204.3 - vmovaps %zmm6, %zmm3 #204.3 - vpermt2pd 288(%rdx), %zmm23, %zmm3 #204.3 - vfmadd213pd %zmm10, %zmm31, %zmm3 #204.3 - vmovaps %zmm6, %zmm4 #204.3 - vpermt2pd 352(%rdx), %zmm22, %zmm4 #204.3 - vfmadd213pd %zmm3, %zmm7, %zmm4 #204.3 - vmovaps %zmm6, %zmm10 #204.3 - vmovaps %zmm6, %zmm3 #204.3 - vpermt2pd 352(%rdx), %zmm21, %zmm10 #204.3 - vpermt2pd 288(%rdx), %zmm20, %zmm3 #204.3 - vpermt2pd 288(%rdx), %zmm19, %zmm6 #204.3 - vfmadd213pd %zmm4, %zmm1, %zmm10 #204.3 - vmulpd %zmm3, %zmm5, %zmm4 #204.3 - vfmadd213pd %zmm4, %zmm0, %zmm6 #204.3 - vmovaps %zmm8, %zmm3 #204.3 - vpermt2pd 352(%rdx), %zmm26, %zmm3 #204.3 - vfmadd213pd %zmm6, %zmm2, %zmm3 #204.3 - vmovaps %zmm8, %zmm6 #204.3 - vpermt2pd 352(%rdx), %zmm25, %zmm6 #204.3 - vfmadd213pd %zmm3, %zmm31, %zmm6 #204.3 - vmovaps %zmm8, %zmm3 #204.3 - vpermt2pd 288(%rdx), %zmm22, %zmm3 #204.3 - vmovaps %zmm8, %zmm4 #204.3 - vpermt2pd 352(%rdx), %zmm24, %zmm4 #204.3 - vmulpd %zmm3, %zmm5, %zmm5 #204.3 - vfmadd213pd %zmm6, %zmm7, %zmm4 #204.3 - vmovaps %zmm8, %zmm6 #204.3 - vpermt2pd 352(%rdx), %zmm23, %zmm6 #204.3 - vfmadd213pd %zmm4, %zmm1, %zmm6 #204.3 - vmovaps %zmm8, %zmm4 #204.3 - vpermt2pd 288(%rdx), %zmm21, %zmm4 #204.3 - vfmadd213pd %zmm5, %zmm0, %zmm4 #204.3 - vmovaps %zmm8, %zmm0 #204.3 - vpermt2pd 352(%rdx), %zmm20, %zmm0 #204.3 - vpermt2pd 352(%rdx), %zmm19, %zmm8 #204.3 - vfmadd213pd %zmm4, %zmm2, %zmm0 #204.3 - vfmadd213pd %zmm0, %zmm31, %zmm8 #204.3 - vmovups (%rcx,%r9), %zmm0 #206.3 - vmovaps %zmm9, %zmm2 #204.3 - vpermt2pd 416(%rdx), %zmm26, %zmm2 #204.3 - vpermt2pd 416(%rdx), %zmm25, %zmm9 #204.3 - vfmadd213pd %zmm8, %zmm7, %zmm2 #204.3 - vmovups 64(%rcx,%r9), %zmm7 #206.3 - vfmadd213pd %zmm2, %zmm1, %zmm9 #204.3 - vpermt2pd 64(%rcx,%r10), %zmm18, %zmm7 #206.3 - vmovaps %zmm9, %zmm8 #206.3 - vmovaps %zmm0, %zmm1 #206.3 - vpermt2pd (%rcx,%r10), %zmm17, %zmm0 #206.3 - vpermt2pd %zmm10, %zmm15, %zmm8 #206.3 - vpermt2pd (%rcx,%r10), %zmm18, %zmm1 #206.3 - vaddpd %zmm8, %zmm0, %zmm4 #206.3 - vmovups .L_2il0floatpacket.43(%rip), %zmm0 #207.3 - vmovaps %zmm10, %zmm31 #206.3 - vmovaps %zmm6, %zmm5 #206.3 - vpermt2pd %zmm6, %zmm16, %zmm31 #206.3 - vpermt2pd %zmm9, %zmm14, %zmm5 #206.3 - vpermi2pd %zmm10, %zmm9, %zmm0 #207.3 - vaddpd %zmm31, %zmm1, %zmm2 #206.3 - vaddpd %zmm5, %zmm7, %zmm3 #206.3 - vmovups 96(%rcx,%r9), %zmm7 #207.3 - vmovups .L_2il0floatpacket.42(%rip), %zmm1 #207.3 - vmovaps %zmm10, %zmm31 #207.3 - vmovups .L_2il0floatpacket.44(%rip), %zmm10 #207.3 - vmovups %zmm1, -64(%rsp) #207.3[spill] - vpermt2pd %zmm6, %zmm1, %zmm31 #207.3 - vpermt2pd %zmm9, %zmm10, %zmm6 #207.3 - vmovaps %zmm7, %zmm8 #207.3 - vpermt2pd 96(%rcx,%r10), 
%zmm18, %zmm8 #207.3 - vpermt2pd 96(%rcx,%r10), %zmm17, %zmm7 #207.3 - vaddpd %zmm31, %zmm8, %zmm8{%k2} #207.3 - vaddpd %zmm0, %zmm7, %zmm7{%k2} #207.3 - vsubpd %zmm31, %zmm8, %zmm8{%k1} #207.3 - vsubpd %zmm0, %zmm7, %zmm7{%k1} #207.3 - movslq 8(%rdi), %r11 #210.16 - lea (%r11,%r11,2), %r8 #210.8 - vmovupd %ymm2, (%rcx,%r9) #206.3 - vmovupd %ymm4, 32(%rcx,%r9) #206.3 - shlq $6, %r8 #210.8 - vmovupd %ymm3, 64(%rcx,%r9) #206.3 - vextractf64x4 $1, %zmm2, (%rcx,%r10) #206.3 - vmovups 160(%rcx,%r9), %zmm2 #207.3 - vextractf64x4 $1, %zmm4, 32(%rcx,%r10) #206.3 - vextractf64x4 $1, %zmm3, 64(%rcx,%r10) #206.3 - vpermt2pd 160(%rcx,%r10), %zmm18, %zmm2 #207.3 - vaddpd %zmm6, %zmm2, %zmm2{%k2} #207.3 - vsubpd %zmm6, %zmm2, %zmm2{%k1} #207.3 - vmovupd %ymm8, 96(%rcx,%r9) #207.3 - vmovupd %ymm7, 128(%rcx,%r9) #207.3 - vmovupd %ymm2, 160(%rcx,%r9) #207.3 - vextractf64x4 $1, %zmm8, 96(%rcx,%r10) #207.3 - vextractf64x4 $1, %zmm7, 128(%rcx,%r10) #207.3 - vextractf64x4 $1, %zmm2, 160(%rcx,%r10) #207.3 - prefetcht0 (%rcx,%r8) #211.3 - movslq 8(%rsi), %rax #212.16 - lea (%rax,%rax,2), %rax #212.8 - shlq $6, %rax #212.8 - prefetcht0 (%rcx,%rax) #213.3 - vmovups .L_2il0floatpacket.45(%rip), %zmm1 #215.3 - vpermpd %zmm30, %zmm1, %zmm6 #215.3 - vpermpd %zmm29, %zmm1, %zmm9 #216.3 - vaddpd %zmm6, %zmm13, %zmm2{%k4}{z} #215.3 - vaddpd %zmm9, %zmm12, %zmm5{%k4}{z} #216.3 - vsubpd %zmm6, %zmm13, %zmm2{%k3} #215.3 - vpermpd %zmm28, %zmm1, %zmm6 #217.3 - vsubpd %zmm9, %zmm12, %zmm5{%k3} #216.3 - vmovups 720(%rdx), %zmm1 #221.3 - vmovups 784(%rdx), %zmm9 #221.3 - vaddpd %zmm6, %zmm11, %zmm8{%k4}{z} #217.3 - vpermilpd $85, %zmm2, %zmm31 #221.3 - vmulpd %zmm27, %zmm31, %zmm3 #221.3 - vsubpd %zmm6, %zmm11, %zmm8{%k3} #217.3 - vmovups 848(%rdx), %zmm6 #221.3 - vmovaps %zmm1, %zmm31 #221.3 - vpermt2pd 576(%rdx), %zmm26, %zmm31 #221.3 - vpermilpd $85, %zmm5, %zmm0 #221.3 - vmulpd %zmm0, %zmm27, %zmm4 #221.3 - vmulpd %zmm2, %zmm31, %zmm0 #221.3 - vmovaps %zmm1, %zmm31 #221.3 - vpermt2pd 576(%rdx), %zmm25, %zmm31 #221.3 - vfmadd213pd %zmm0, %zmm3, %zmm31 #221.3 - vmovaps %zmm1, %zmm0 #221.3 - vpermt2pd 576(%rdx), %zmm24, %zmm0 #221.3 - vfmadd213pd %zmm31, %zmm5, %zmm0 #221.3 - vmovaps %zmm1, %zmm31 #221.3 - vpermt2pd 576(%rdx), %zmm23, %zmm31 #221.3 - vpermilpd $85, %zmm8, %zmm7 #221.3 - vmulpd %zmm7, %zmm27, %zmm7 #221.3 - vfmadd213pd %zmm0, %zmm4, %zmm31 #221.3 - vmovaps %zmm1, %zmm0 #221.3 - vpermt2pd 640(%rdx), %zmm22, %zmm0 #221.3 - vfmadd213pd %zmm31, %zmm8, %zmm0 #221.3 - vmovaps %zmm1, %zmm31 #221.3 - vpermt2pd 640(%rdx), %zmm21, %zmm31 #221.3 - vfmadd213pd %zmm0, %zmm7, %zmm31 #221.3 - vmovaps %zmm1, %zmm0 #221.3 - vpermt2pd 576(%rdx), %zmm20, %zmm0 #221.3 - vpermt2pd 576(%rdx), %zmm19, %zmm1 #221.3 - vmulpd %zmm0, %zmm2, %zmm0 #221.3 - vfmadd213pd %zmm0, %zmm3, %zmm1 #221.3 - vmovaps %zmm9, %zmm0 #221.3 - vpermt2pd 640(%rdx), %zmm26, %zmm0 #221.3 - vfmadd213pd %zmm1, %zmm5, %zmm0 #221.3 - vmovaps %zmm9, %zmm1 #221.3 - vpermt2pd 640(%rdx), %zmm25, %zmm1 #221.3 - vfmadd213pd %zmm0, %zmm4, %zmm1 #221.3 - vmovaps %zmm9, %zmm0 #221.3 - vpermt2pd 640(%rdx), %zmm24, %zmm0 #221.3 - vfmadd213pd %zmm1, %zmm8, %zmm0 #221.3 - vmovaps %zmm9, %zmm1 #221.3 - vpermt2pd 640(%rdx), %zmm23, %zmm1 #221.3 - vfmadd213pd %zmm0, %zmm7, %zmm1 #221.3 - vmovaps %zmm9, %zmm0 #221.3 - vpermt2pd 576(%rdx), %zmm22, %zmm0 #221.3 - vmulpd %zmm0, %zmm2, %zmm0 #221.3 - vmovaps %zmm9, %zmm2 #221.3 - vpermt2pd 576(%rdx), %zmm21, %zmm2 #221.3 - vfmadd213pd %zmm0, %zmm3, %zmm2 #221.3 - vmovaps %zmm9, %zmm3 #221.3 - vpermt2pd 640(%rdx), %zmm20, %zmm3 #221.3 
- vpermt2pd 640(%rdx), %zmm19, %zmm9 #221.3 - vfmadd213pd %zmm2, %zmm5, %zmm3 #221.3 - vfmadd213pd %zmm3, %zmm4, %zmm9 #221.3 - vmovaps %zmm6, %zmm5 #221.3 - vpermt2pd 704(%rdx), %zmm26, %zmm5 #221.3 - vpermt2pd 704(%rdx), %zmm25, %zmm6 #221.3 - vfmadd213pd %zmm9, %zmm8, %zmm5 #221.3 - vmovups (%rcx,%rax), %zmm8 #223.3 - vfmadd213pd %zmm5, %zmm7, %zmm6 #221.3 - vmovups 64(%rcx,%rax), %zmm5 #223.3 - vmovaps %zmm8, %zmm2 #223.3 - vmovaps %zmm31, %zmm4 #223.3 - vmovaps %zmm6, %zmm3 #223.3 - vpermt2pd (%rcx,%r8), %zmm18, %zmm2 #223.3 - vpermt2pd (%rcx,%r8), %zmm17, %zmm8 #223.3 - vpermt2pd %zmm1, %zmm16, %zmm4 #223.3 - vpermt2pd %zmm31, %zmm15, %zmm3 #223.3 - vpermt2pd 64(%rcx,%r8), %zmm18, %zmm5 #223.3 - vaddpd %zmm4, %zmm2, %zmm0 #223.3 - vaddpd %zmm3, %zmm8, %zmm9 #223.3 - vmovaps %zmm1, %zmm7 #223.3 - vpermt2pd %zmm6, %zmm14, %zmm7 #223.3 - vaddpd %zmm7, %zmm5, %zmm5 #223.3 - vmovupd %ymm0, (%rcx,%rax) #223.3 - vmovupd %ymm9, 32(%rcx,%rax) #223.3 - # LOE rax rdx rcx rbx rsi rdi r8 r12 r13 r14 r15 zmm0 zmm1 zmm5 zmm6 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31 k1 k2 k3 k4 k5 k6 -..B2.4: # Preds ..B2.1 - # Execution count [1.00e+00] - vmovups .L_2il0floatpacket.46(%rip), %zmm7 #224.3 - vmovups 96(%rcx,%r8), %zmm8 #224.3 - vpermpd %zmm31, %zmm7, %zmm4 #224.3 - vpermpd %zmm1, %zmm7, %zmm2 #224.3 - vmovups 160(%rcx,%rax), %zmm31 #224.3 - vmovaps %zmm18, %zmm1 #224.3 - vmovaps %zmm15, %zmm3 #224.3 - vpermt2pd 160(%rcx,%r8), %zmm18, %zmm31 #224.3 - vmovupd %ymm5, 64(%rcx,%rax) #223.3 - vextractf64x4 $1, %zmm9, 32(%rcx,%r8) #223.3 - vextractf64x4 $1, %zmm0, (%rcx,%r8) #223.3 - vextractf64x4 $1, %zmm5, 64(%rcx,%r8) #223.3 - vpermpd %zmm6, %zmm7, %zmm9 #224.3 - vmovups 96(%rcx,%rax), %zmm6 #224.3 - vpermi2pd %zmm4, %zmm9, %zmm3 #224.3 - vpermi2pd %zmm8, %zmm6, %zmm1 #224.3 - vpermt2pd %zmm8, %zmm17, %zmm6 #224.3 - vmovaps %zmm16, %zmm0 #224.3 - vpermi2pd %zmm2, %zmm4, %zmm0 #224.3 - vpermt2pd %zmm9, %zmm14, %zmm2 #224.3 - vaddpd %zmm3, %zmm6, %zmm6{%k3} #224.3 - vaddpd %zmm0, %zmm1, %zmm1{%k6} #224.3 - vaddpd %zmm2, %zmm31, %zmm31{%k5} #224.3 - vsubpd %zmm3, %zmm6, %zmm6{%k4} #224.3 - vsubpd %zmm0, %zmm1, %zmm1{%k5} #224.3 - vsubpd %zmm2, %zmm31, %zmm31{%k6} #224.3 - vmovupd %ymm1, 96(%rcx,%rax) #224.3 - vmovupd %ymm6, 128(%rcx,%rax) #224.3 - vmovupd %ymm31, 160(%rcx,%rax) #224.3 - vextractf64x4 $1, %zmm1, 96(%rcx,%r8) #224.3 - vextractf64x4 $1, %zmm6, 128(%rcx,%r8) #224.3 - vextractf64x4 $1, %zmm31, 160(%rcx,%r8) #224.3 - movslq 12(%rdi), %rax #228.16 - lea (%rax,%rax,2), %r8 #228.8 - shlq $6, %r8 #228.8 - prefetcht0 (%rcx,%r8) #229.3 - movl $150, %eax #233.3 - movslq 12(%rsi), %rsi #230.16 - kmovw %eax, %k4 #233.3 - movl $105, %eax #233.3 - kmovw %eax, %k3 #233.3 - lea (%rsi,%rsi,2), %rdi #230.8 - shlq $6, %rdi #230.8 - prefetcht0 (%rcx,%rdi) #231.3 - vmovups .L_2il0floatpacket.47(%rip), %zmm5 #233.3 - vmovups 1072(%rdx), %zmm1 #239.3 - vmovups 928(%rdx), %zmm6 #239.3 - vmovups 992(%rdx), %zmm31 #239.3 - vmovups 1136(%rdx), %zmm0 #239.3 - vpermpd %zmm30, %zmm5, %zmm4 #233.3 - vpermpd %zmm28, %zmm5, %zmm28 #235.3 - vaddpd %zmm4, %zmm13, %zmm30{%k4}{z} #233.3 - vaddpd %zmm28, %zmm11, %zmm2{%k4}{z} #235.3 - vsubpd %zmm4, %zmm13, %zmm30{%k3} #233.3 - vpermpd %zmm29, %zmm5, %zmm13 #234.3 - vsubpd %zmm28, %zmm11, %zmm2{%k3} #235.3 - vmovups 1008(%rdx), %zmm29 #239.3 - vaddpd %zmm13, %zmm12, %zmm3{%k4}{z} #234.3 - vsubpd %zmm13, %zmm12, %zmm3{%k3} #234.3 - vmovups 864(%rdx), %zmm13 #239.3 - vpermilpd $85, %zmm30, 
%zmm12 #239.3 - vpermilpd $85, %zmm3, %zmm11 #239.3 - vpermilpd $85, %zmm2, %zmm7 #239.3 - vmulpd %zmm27, %zmm12, %zmm28 #239.3 - vmulpd %zmm11, %zmm27, %zmm12 #239.3 - vmulpd %zmm7, %zmm27, %zmm11 #239.3 - vmovaps %zmm26, %zmm27 #239.3 - vpermi2pd %zmm13, %zmm29, %zmm27 #239.3 - vmulpd %zmm30, %zmm27, %zmm8 #239.3 - vmovaps %zmm25, %zmm27 #239.3 - vpermi2pd %zmm13, %zmm29, %zmm27 #239.3 - vfmadd213pd %zmm8, %zmm28, %zmm27 #239.3 - vmovaps %zmm24, %zmm4 #239.3 - vpermi2pd %zmm13, %zmm29, %zmm4 #239.3 - vpermi2pd %zmm6, %zmm1, %zmm24 #239.3 - vfmadd213pd %zmm27, %zmm3, %zmm4 #239.3 - vmovaps %zmm23, %zmm5 #239.3 - vmovaps %zmm20, %zmm9 #239.3 - vpermi2pd %zmm13, %zmm29, %zmm5 #239.3 - vpermi2pd %zmm13, %zmm29, %zmm9 #239.3 - vpermi2pd %zmm6, %zmm1, %zmm20 #239.3 - vpermi2pd %zmm6, %zmm1, %zmm23 #239.3 - vfmadd213pd %zmm4, %zmm12, %zmm5 #239.3 - vmulpd %zmm9, %zmm30, %zmm4 #239.3 - vmovaps %zmm22, %zmm7 #239.3 - vpermi2pd %zmm13, %zmm1, %zmm22 #239.3 - vpermi2pd %zmm6, %zmm29, %zmm7 #239.3 - vmulpd %zmm22, %zmm30, %zmm22 #239.3 - vfmadd213pd %zmm5, %zmm2, %zmm7 #239.3 - vmovaps %zmm21, %zmm27 #239.3 - vpermi2pd %zmm6, %zmm29, %zmm27 #239.3 - vpermt2pd %zmm13, %zmm19, %zmm29 #239.3 - vpermi2pd %zmm13, %zmm1, %zmm21 #239.3 - vfmadd213pd %zmm7, %zmm11, %zmm27 #239.3 - vfmadd213pd %zmm4, %zmm28, %zmm29 #239.3 - vfmadd213pd %zmm22, %zmm28, %zmm21 #239.3 - vmovaps %zmm26, %zmm5 #239.3 - vpermi2pd %zmm6, %zmm1, %zmm5 #239.3 - vpermi2pd %zmm31, %zmm0, %zmm26 #239.3 - vpermt2pd %zmm31, %zmm25, %zmm0 #239.3 - vfmadd213pd %zmm29, %zmm3, %zmm5 #239.3 - vfmadd213pd %zmm21, %zmm3, %zmm20 #239.3 - vmovups (%rcx,%rdi), %zmm21 #241.3 - vmovaps %zmm25, %zmm29 #239.3 - vpermi2pd %zmm6, %zmm1, %zmm29 #239.3 - vpermt2pd %zmm6, %zmm19, %zmm1 #239.3 - vmovups (%rcx,%r8), %zmm19 #241.3 - vfmadd213pd %zmm5, %zmm12, %zmm29 #239.3 - vfmadd213pd %zmm20, %zmm12, %zmm1 #239.3 - vmovups 96(%rcx,%r8), %zmm25 #242.3 - vfmadd213pd %zmm29, %zmm2, %zmm24 #239.3 - vfmadd213pd %zmm1, %zmm2, %zmm26 #239.3 - vmovups 160(%rcx,%rdi), %zmm2 #242.3 - vmovups 96(%rcx,%rdi), %zmm1 #242.3 - vfmadd213pd %zmm24, %zmm11, %zmm23 #239.3 - vmovups 64(%rcx,%rdi), %zmm24 #241.3 - vfmadd213pd %zmm26, %zmm11, %zmm0 #239.3 - vpermt2pd 160(%rcx,%r8), %zmm18, %zmm2 #242.3 - vpermt2pd 64(%rcx,%r8), %zmm18, %zmm24 #241.3 - vpermi2pd %zmm23, %zmm27, %zmm16 #241.3 - vpermi2pd %zmm0, %zmm23, %zmm14 #241.3 - vpermi2pd %zmm23, %zmm27, %zmm10 #242.3 - vpermi2pd %zmm27, %zmm0, %zmm15 #241.3 - vaddpd %zmm14, %zmm24, %zmm14 #241.3 - vmovaps %zmm18, %zmm20 #241.3 - vmovaps %zmm18, %zmm26 #242.3 - vpermi2pd %zmm19, %zmm21, %zmm20 #241.3 - vpermt2pd %zmm19, %zmm17, %zmm21 #241.3 - vpermi2pd %zmm25, %zmm1, %zmm26 #242.3 - vpermt2pd %zmm25, %zmm17, %zmm1 #242.3 - vaddpd %zmm16, %zmm20, %zmm16 #241.3 - vaddpd %zmm10, %zmm26, %zmm26{%k2} #242.3 - vaddpd %zmm15, %zmm21, %zmm15 #241.3 - vmovups .L_2il0floatpacket.48(%rip), %zmm18 #242.3 - vmovups -64(%rsp), %zmm17 #242.3[spill] - vsubpd %zmm10, %zmm26, %zmm26{%k1} #242.3 - vpermi2pd %zmm27, %zmm0, %zmm18 #242.3 - vpermt2pd %zmm0, %zmm17, %zmm23 #242.3 - vaddpd %zmm18, %zmm1, %zmm1{%k3} #242.3 - vaddpd %zmm23, %zmm2, %zmm2{%k1} #242.3 - vsubpd %zmm18, %zmm1, %zmm1{%k4} #242.3 - vsubpd %zmm23, %zmm2, %zmm2{%k2} #242.3 - vmovupd %ymm16, (%rcx,%rdi) #241.3 - vmovupd %ymm15, 32(%rcx,%rdi) #241.3 - vmovupd %ymm14, 64(%rcx,%rdi) #241.3 - vextractf64x4 $1, %zmm16, (%rcx,%r8) #241.3 - vextractf64x4 $1, %zmm15, 32(%rcx,%r8) #241.3 - vextractf64x4 $1, %zmm14, 64(%rcx,%r8) #241.3 - vmovupd %ymm26, 96(%rcx,%rdi) #242.3 - vmovupd 
%ymm1, 128(%rcx,%rdi) #242.3 - vmovupd %ymm2, 160(%rcx,%rdi) #242.3 - vextractf64x4 $1, %zmm26, 96(%rcx,%r8) #242.3 - vextractf64x4 $1, %zmm1, 128(%rcx,%r8) #242.3 - vextractf64x4 $1, %zmm2, 160(%rcx,%r8) #242.3 - vzeroupper #243.1 - movq %rbp, %rsp #243.1 - popq %rbp #243.1 - .cfi_def_cfa 7, 8 - .cfi_restore 6 - ret #243.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type deo_dble_avx512,@function - .size deo_dble_avx512,.-deo_dble_avx512 - .data -# -- End deo_dble_avx512 - .section .rodata, "a" - .align 64 - .align 64 -.L_2il0floatpacket.14: - .long 0x00000008,0x00000000,0x00000009,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000 - .type .L_2il0floatpacket.14,@object - .size .L_2il0floatpacket.14,64 - .align 64 -.L_2il0floatpacket.15: - .long 0x00000004,0x00000000,0x00000005,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000 - .type .L_2il0floatpacket.15,@object - .size .L_2il0floatpacket.15,64 - .align 64 -.L_2il0floatpacket.16: - .long 0x00000002,0x00000000,0x00000003,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 - .type .L_2il0floatpacket.16,@object - .size .L_2il0floatpacket.16,64 - .align 64 -.L_2il0floatpacket.17: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.17,@object - .size .L_2il0floatpacket.17,64 - .align 64 -.L_2il0floatpacket.18: - .long 0x00000004,0x00000000,0x00000005,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 - .type .L_2il0floatpacket.18,@object - .size .L_2il0floatpacket.18,64 - .align 64 -.L_2il0floatpacket.19: - .long 0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000 - .type .L_2il0floatpacket.19,@object - .size .L_2il0floatpacket.19,64 - .align 64 -.L_2il0floatpacket.20: - .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000 - .type .L_2il0floatpacket.20,@object - .size .L_2il0floatpacket.20,64 - .align 64 -.L_2il0floatpacket.21: - .long 0x00000001,0x00000000,0x00000001,0x00000000,0x00000001,0x00000000,0x00000001,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000 - .type .L_2il0floatpacket.21,@object - .size .L_2il0floatpacket.21,64 - .align 64 -.L_2il0floatpacket.22: - .long 0x00000002,0x00000000,0x00000002,0x00000000,0x00000002,0x00000000,0x00000002,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000 - .type .L_2il0floatpacket.22,@object - .size .L_2il0floatpacket.22,64 - .align 64 -.L_2il0floatpacket.23: - .long 0x00000003,0x00000000,0x00000003,0x00000000,0x00000003,0x00000000,0x00000003,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.23,@object - .size .L_2il0floatpacket.23,64 - .align 64 -.L_2il0floatpacket.24: - .long 
0x00000004,0x00000000,0x00000004,0x00000000,0x00000004,0x00000000,0x00000004,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000 - .type .L_2il0floatpacket.24,@object - .size .L_2il0floatpacket.24,64 - .align 64 -.L_2il0floatpacket.25: - .long 0x00000005,0x00000000,0x00000005,0x00000000,0x00000005,0x00000000,0x00000005,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000 - .type .L_2il0floatpacket.25,@object - .size .L_2il0floatpacket.25,64 - .align 64 -.L_2il0floatpacket.26: - .long 0x00000006,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000a,0x00000000,0x0000000a,0x00000000,0x0000000a,0x00000000 - .type .L_2il0floatpacket.26,@object - .size .L_2il0floatpacket.26,64 - .align 64 -.L_2il0floatpacket.27: - .long 0x00000007,0x00000000,0x00000007,0x00000000,0x00000007,0x00000000,0x00000007,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000 - .type .L_2il0floatpacket.27,@object - .size .L_2il0floatpacket.27,64 - .align 64 -.L_2il0floatpacket.28: - .long 0x00000004,0x00000000,0x00000005,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000 - .type .L_2il0floatpacket.28,@object - .size .L_2il0floatpacket.28,64 - .align 64 -.L_2il0floatpacket.29: - .long 0x0000000e,0x00000000,0x0000000f,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000 - .type .L_2il0floatpacket.29,@object - .size .L_2il0floatpacket.29,64 - .align 64 -.L_2il0floatpacket.30: - .long 0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000 - .type .L_2il0floatpacket.30,@object - .size .L_2il0floatpacket.30,64 - .align 64 -.L_2il0floatpacket.31: - .long 0x00000008,0x00000000,0x00000009,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000 - .type .L_2il0floatpacket.31,@object - .size .L_2il0floatpacket.31,64 - .align 64 -.L_2il0floatpacket.32: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000 - .type .L_2il0floatpacket.32,@object - .size .L_2il0floatpacket.32,64 - .align 64 -.L_2il0floatpacket.33: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000 - .type .L_2il0floatpacket.33,@object - .size .L_2il0floatpacket.33,64 - .align 64 -.L_2il0floatpacket.34: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000 - .type .L_2il0floatpacket.34,@object - .size .L_2il0floatpacket.34,64 - .align 64 -.L_2il0floatpacket.35: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000 - .type .L_2il0floatpacket.35,@object - .size .L_2il0floatpacket.35,64 - .align 64 -.L_2il0floatpacket.36: - .long 
0x00000000,0x00000000,0x00000001,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.36,@object - .size .L_2il0floatpacket.36,64 - .align 64 -.L_2il0floatpacket.37: - .long 0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.37,@object - .size .L_2il0floatpacket.37,64 - .align 64 -.L_2il0floatpacket.38: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 - .type .L_2il0floatpacket.38,@object - .size .L_2il0floatpacket.38,64 - .align 64 -.L_2il0floatpacket.39: - .long 0x00000004,0x00000000,0x00000005,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.39,@object - .size .L_2il0floatpacket.39,64 - .align 64 -.L_2il0floatpacket.40: - .long 0x00000007,0x00000000,0x00000006,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000 - .type .L_2il0floatpacket.40,@object - .size .L_2il0floatpacket.40,64 - .align 64 -.L_2il0floatpacket.41: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000 - .type .L_2il0floatpacket.41,@object - .size .L_2il0floatpacket.41,64 - .align 64 -.L_2il0floatpacket.42: - .long 0x00000003,0x00000000,0x00000002,0x00000000,0x0000000b,0x00000000,0x0000000a,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000,0x0000000f,0x00000000,0x0000000e,0x00000000 - .type .L_2il0floatpacket.42,@object - .size .L_2il0floatpacket.42,64 - .align 64 -.L_2il0floatpacket.43: - .long 0x00000003,0x00000000,0x00000002,0x00000000,0x00000009,0x00000000,0x00000008,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000,0x0000000d,0x00000000,0x0000000c,0x00000000 - .type .L_2il0floatpacket.43,@object - .size .L_2il0floatpacket.43,64 - .align 64 -.L_2il0floatpacket.44: - .long 0x00000001,0x00000000,0x00000000,0x00000000,0x00000009,0x00000000,0x00000008,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000,0x0000000d,0x00000000,0x0000000c,0x00000000 - .type .L_2il0floatpacket.44,@object - .size .L_2il0floatpacket.44,64 - .align 64 -.L_2il0floatpacket.45: - .long 0x00000006,0x00000000,0x00000007,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000 - .type .L_2il0floatpacket.45,@object - .size .L_2il0floatpacket.45,64 - .align 64 -.L_2il0floatpacket.46: - .long 0x00000002,0x00000000,0x00000003,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000 - .type .L_2il0floatpacket.46,@object - .size .L_2il0floatpacket.46,64 - .align 64 -.L_2il0floatpacket.47: - .long 0x00000005,0x00000000,0x00000004,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000,0x00000007,0x00000000,0x00000006,0x00000000 - .type .L_2il0floatpacket.47,@object - .size .L_2il0floatpacket.47,64 - .align 64 -.L_2il0floatpacket.48: - .long 
0x00000001,0x00000000,0x00000000,0x00000000,0x0000000b,0x00000000,0x0000000a,0x00000000,0x00000005,0x00000000,0x00000004,0x00000000,0x0000000f,0x00000000,0x0000000e,0x00000000 - .type .L_2il0floatpacket.48,@object - .size .L_2il0floatpacket.48,64 - .data - .section .note.GNU-stack, "" -// -- Begin DWARF2 SEGMENT .eh_frame - .section .eh_frame,"a",@progbits -.eh_frame_seg: - .align 8 -# End diff --git a/modules/linalg/avx512/salg_avx512.c b/modules/linalg/avx512/salg_avx512.c index 656711a..509e78f 100644 --- a/modules/linalg/avx512/salg_avx512.c +++ b/modules/linalg/avx512/salg_avx512.c @@ -13,6 +13,8 @@ * *******************************************************************************/ +#ifdef AVX512 + #include "global.h" #include "linalg.h" #include "mpi.h" @@ -20,7 +22,7 @@ #include "avx512.h" -void mulc_spinor_add_avx512(int vol, spinor *s, spinor const *r, complex z) +void mulc_spinor_add(int vol, spinor *s, spinor *r, complex z) { spinor *sm; __m128 tr, ti; @@ -75,14 +77,14 @@ extern __inline double _mm512_reduce_add_ps( __m512 a ) { } #endif -complex_dble spinor_prod_avx512(int vol, spinor const *s, - spinor const *r ) +complex spinor_prod(int vol, int icom, spinor *s, spinor *r ) { - spinor const *sm, *smb; + complex z; + complex_dble v, w; + spinor const *sm, *smb; __m512 tr, ti, s1, s2, s3, r1, r2, r3, sign; double x, y; - complex_dble z, v, w; x = 0.0; y = 0.0; @@ -131,8 +133,22 @@ complex_dble spinor_prod_avx512(int vol, spinor const *s, } - z.re = x; - z.im = y; + v.re = x; + v.im = y; + if ((icom==1)&&(NPROC>1)) + { + MPI_Reduce(&v.re,&w.re,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&w.re,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + z.re=(float)(w.re); + z.im=(float)(w.im); + } + else + { + z.re=(float)(v.re); + z.im=(float)(v.im); + } return z; } + +#endif \ No newline at end of file diff --git a/modules/linalg/avx512/salg_avx512_asm.s b/modules/linalg/avx512/salg_avx512_asm.s deleted file mode 100644 index 8e845ae..0000000 --- a/modules/linalg/avx512/salg_avx512_asm.s +++ /dev/null @@ -1,230 +0,0 @@ -# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -# mark_description "-I../../../include -I.. 
-I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -# mark_description "es -S"; - .file "salg_avx512.c" - .text -..TXTST0: -# -- Begin mulc_spinor_add_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl mulc_spinor_add_avx512 -# --- mulc_spinor_add_avx512(int, spinor *, const spinor *, complex) -mulc_spinor_add_avx512: -# parameter 1: %edi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %xmm0 -..B1.1: # Preds ..B1.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_mulc_spinor_add_avx512.1: -..L2: - #9.1 - vshufps $1, %xmm0, %xmm0, %xmm1 #9.1 - vbroadcastss %xmm1, %xmm3 #19.8 - vbroadcastss %xmm0, %xmm2 #18.8 - movslq %edi, %rdi #9.1 - vshuff32x4 $0, %zmm3, %zmm3, %zmm4 #21.8 - vmulps .L_2il0floatpacket.3(%rip), %zmm4, %zmm0 #23.8 - lea (%rdi,%rdi,2), %rax #16.8 - shlq $5, %rax #16.8 - addq %rsi, %rax #16.8 - vshuff32x4 $0, %zmm2, %zmm2, %zmm1 #20.8 - cmpq %rax, %rsi #25.14 - jae ..B1.5 # Prob 10% #25.14 - # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -..B1.3: # Preds ..B1.1 ..B1.3 - # Execution count [5.00e+00] - vmovups (%rdx), %zmm3 #26.30 - vmulps %zmm3, %zmm0, %zmm2 #27.10 - vpermilps $177, %zmm2, %zmm4 #28.10 - vfmadd231ps %zmm1, %zmm3, %zmm4 #29.10 - vaddps (%rsi), %zmm4, %zmm5 #31.10 - vmovups %zmm5, (%rsi) #32.26 - vmovups 64(%rdx), %zmm7 #34.30 - vmulps %zmm7, %zmm0, %zmm6 #35.10 - vpermilps $177, %zmm6, %zmm8 #36.10 - vfmadd231ps %zmm1, %zmm7, %zmm8 #37.10 - vaddps 64(%rsi), %zmm8, %zmm9 #39.10 - vmovups %zmm9, 64(%rsi) #40.26 - vmovups 128(%rdx), %zmm11 #42.30 - addq $192, %rdx #50.5 - vmulps %zmm11, %zmm0, %zmm10 #43.10 - vpermilps $177, %zmm10, %zmm12 #44.10 - vfmadd231ps %zmm1, %zmm11, %zmm12 #45.10 - vaddps 128(%rsi), %zmm12, %zmm13 #47.10 - vmovups %zmm13, 128(%rsi) #48.26 - addq $192, %rsi #25.18 - cmpq %rax, %rsi #25.14 - jb ..B1.3 # Prob 82% #25.14 - # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -..B1.5: # Preds ..B1.3 ..B1.1 - # Execution count [1.00e+00] - vzeroupper #52.1 - ret #52.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type mulc_spinor_add_avx512,@function - .size mulc_spinor_add_avx512,.-mulc_spinor_add_avx512 - .data -# -- End mulc_spinor_add_avx512 - .text -# -- Begin spinor_prod_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl spinor_prod_avx512 -# --- spinor_prod_avx512(int, const spinor *, const spinor *) -spinor_prod_avx512: -# parameter 1: %edi -# parameter 2: %rsi -# parameter 3: %rdx -..B2.1: # Preds ..B2.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_spinor_prod_avx512.4: -..L5: - #65.1 - subq $136, %rsp #65.1 - .cfi_def_cfa_offset 144 - movslq %edi, %rdi #65.1 - vxorpd %xmm0, %xmm0, %xmm0 #72.3 - vmovapd %xmm0, %xmm1 #73.3 - lea (%rdi,%rdi,2), %rax #74.8 - shlq $5, %rax #74.8 - addq %rsi, %rax #74.8 - cmpq %rax, %rsi #77.14 - jae ..B2.9 # Prob 10% #77.14 - # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 xmm0 xmm1 -..B2.2: # Preds ..B2.1 - # Execution count [9.00e-01] - vmovups .L_2il0floatpacket.3(%rip), %zmm5 #102.29 - vpxord %zmm4, %zmm4, %zmm4 #83.10 - vmovaps %zmm4, %zmm2 #83.10 - # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 xmm0 xmm1 zmm2 zmm4 zmm5 -..B2.3: # Preds ..B2.7 ..B2.2 - # Execution count [5.00e+00] - vmovaps %zmm4, %zmm3 #84.10 - lea 768(%rsi), %rcx #78.11 - cmpq %rax, 
%rcx #79.5 - vmovaps %zmm2, %zmm6 #83.10 - cmovae %rax, %rcx #79.5 - vmovaps %zmm3, %zmm2 #84.10 - cmpq %rcx, %rsi #86.16 - jae ..B2.7 # Prob 10% #86.16 - # LOE rax rdx rcx rbx rbp rsi r12 r13 r14 r15 xmm0 xmm1 zmm2 zmm3 zmm4 zmm5 zmm6 -..B2.5: # Preds ..B2.3 ..B2.5 - # Execution count [2.50e+01] - vmovups (%rdx), %zmm7 #90.32 - vmovups 64(%rdx), %zmm8 #91.32 - vmovups 128(%rdx), %zmm9 #92.32 - vmovups (%rsi), %zmm14 #87.32 - vmovups 64(%rsi), %zmm16 #88.32 - vmovups 128(%rsi), %zmm18 #89.32 - vfmadd231ps %zmm14, %zmm7, %zmm6 #94.12 - vpermilps $177, %zmm7, %zmm10 #98.12 - addq $192, %rsi #86.21 - vmulps %zmm5, %zmm10, %zmm13 #103.12 - addq $192, %rdx #111.7 - vfmadd231ps %zmm16, %zmm8, %zmm6 #95.12 - vfmadd231ps %zmm14, %zmm13, %zmm3 #107.12 - vfmadd231ps %zmm18, %zmm9, %zmm6 #96.12 - vpermilps $177, %zmm8, %zmm11 #99.12 - vmulps %zmm11, %zmm5, %zmm15 #104.12 - vpermilps $177, %zmm9, %zmm12 #100.12 - vmulps %zmm12, %zmm5, %zmm17 #105.12 - vfmadd231ps %zmm16, %zmm15, %zmm3 #108.12 - vfmadd231ps %zmm18, %zmm17, %zmm3 #109.12 - cmpq %rcx, %rsi #86.16 - jb ..B2.5 # Prob 82% #86.16 - # LOE rax rdx rcx rbx rbp rsi r12 r13 r14 r15 xmm0 xmm1 zmm2 zmm3 zmm4 zmm5 zmm6 -..B2.7: # Preds ..B2.5 ..B2.3 - # Execution count [5.00e+00] - vmovups %zmm6, (%rsp) #114.19 - vmovups %zmm3, 64(%rsp) #115.19 - vmovss (%rsp), %xmm6 #114.19 - vmovss 8(%rsp), %xmm7 #114.19 - vmovss 16(%rsp), %xmm10 #114.19 - vmovss 24(%rsp), %xmm11 #114.19 - vmovss 32(%rsp), %xmm16 #114.19 - vmovss 40(%rsp), %xmm17 #114.19 - vmovss 64(%rsp), %xmm3 #115.19 - vmovss 48(%rsp), %xmm20 #114.19 - vmovss 56(%rsp), %xmm21 #114.19 - vmovss 72(%rsp), %xmm29 #115.19 - vaddss 4(%rsp), %xmm6, %xmm8 #114.5 - vaddss 12(%rsp), %xmm7, %xmm9 #114.5 - vaddss 20(%rsp), %xmm10, %xmm12 #114.5 - vaddss 28(%rsp), %xmm11, %xmm13 #114.5 - vaddss 36(%rsp), %xmm16, %xmm18 #114.5 - vaddss 44(%rsp), %xmm17, %xmm19 #114.5 - vaddss %xmm9, %xmm8, %xmm14 #114.5 - vaddss 68(%rsp), %xmm3, %xmm30 #115.5 - vaddss %xmm13, %xmm12, %xmm15 #114.5 - vaddss 52(%rsp), %xmm20, %xmm22 #114.5 - vaddss 60(%rsp), %xmm21, %xmm23 #114.5 - vaddss 76(%rsp), %xmm29, %xmm31 #115.5 - vaddss %xmm19, %xmm18, %xmm24 #114.5 - vaddss %xmm15, %xmm14, %xmm26 #114.5 - vaddss %xmm23, %xmm22, %xmm25 #114.5 - vaddss %xmm31, %xmm30, %xmm9 #115.5 - vaddss %xmm25, %xmm24, %xmm27 #114.5 - vmovss 80(%rsp), %xmm3 #115.19 - vaddss %xmm27, %xmm26, %xmm28 #114.5 - vaddss 84(%rsp), %xmm3, %xmm7 #115.5 - vcvtss2sd %xmm28, %xmm28, %xmm28 #114.19 - vmovss 88(%rsp), %xmm6 #115.19 - vaddsd %xmm28, %xmm0, %xmm0 #114.5 - vaddss 92(%rsp), %xmm6, %xmm8 #115.5 - vmovss 96(%rsp), %xmm11 #115.19 - vaddss %xmm8, %xmm7, %xmm10 #115.5 - vaddss 100(%rsp), %xmm11, %xmm13 #115.5 - vaddss %xmm10, %xmm9, %xmm21 #115.5 - vmovss 104(%rsp), %xmm12 #115.19 - vmovss 112(%rsp), %xmm15 #115.19 - vmovss 120(%rsp), %xmm16 #115.19 - vaddss 108(%rsp), %xmm12, %xmm14 #115.5 - vaddss 116(%rsp), %xmm15, %xmm17 #115.5 - vaddss 124(%rsp), %xmm16, %xmm18 #115.5 - vaddss %xmm14, %xmm13, %xmm19 #115.5 - vaddss %xmm18, %xmm17, %xmm20 #115.5 - vaddss %xmm20, %xmm19, %xmm22 #115.5 - vaddss %xmm22, %xmm21, %xmm23 #115.5 - vcvtss2sd %xmm23, %xmm23, %xmm23 #115.19 - vaddsd %xmm23, %xmm1, %xmm1 #115.5 - cmpq %rax, %rsi #77.14 - jb ..B2.3 # Prob 82% #77.14 - # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 xmm0 xmm1 zmm2 zmm4 zmm5 -..B2.9: # Preds ..B2.7 ..B2.1 - # Execution count [1.00e+00] - vzeroupper #122.10 - addq $136, %rsp #122.10 - .cfi_def_cfa_offset 8 - ret #122.10 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type 
spinor_prod_avx512,@function - .size spinor_prod_avx512,.-spinor_prod_avx512 - .data -# -- End spinor_prod_avx512 - .section .rodata, "a" - .align 64 - .align 64 -.L_2il0floatpacket.3: - .long 0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000,0x3f800000,0xbf800000 - .type .L_2il0floatpacket.3,@object - .size .L_2il0floatpacket.3,64 - .data - .section .note.GNU-stack, "" -// -- Begin DWARF2 SEGMENT .eh_frame - .section .eh_frame,"a",@progbits -.eh_frame_seg: - .align 8 -# End diff --git a/modules/linalg/avx512/salg_dble_avx512.c b/modules/linalg/avx512/salg_dble_avx512.c index 331a492..8fde96c 100644 --- a/modules/linalg/avx512/salg_dble_avx512.c +++ b/modules/linalg/avx512/salg_dble_avx512.c @@ -12,6 +12,9 @@ * implementations. * *******************************************************************************/ + +#ifdef AVX512 + #include "global.h" #include "linalg.h" #include "mpi.h" @@ -137,7 +140,9 @@ double norm_square_dble_avx512(spinor_dble const *s, spinor_dble const *smb) return _mm512_reduce_add_pd( tmp ); } -void mulc_spinor_add_dble_avx512(int vol, spinor_dble *s, spinor_dble const *r, + + +void mulc_spinor_add_dble(int vol, spinor_dble *s, spinor_dble *r, complex_dble z) { spinor_dble *sm; @@ -180,7 +185,8 @@ void mulc_spinor_add_dble_avx512(int vol, spinor_dble *s, spinor_dble const *r, } } -void mulr_spinor_add_dble_avx512(int vol, spinor_dble *s, spinor_dble const *r, + +void mulr_spinor_add_dble(int vol, spinor_dble *s, spinor_dble *r, double c) { spinor_dble *sm; @@ -215,7 +221,8 @@ void mulr_spinor_add_dble_avx512(int vol, spinor_dble *s, spinor_dble const *r, } } -void combine_spinor_dble_avx512(int vol, spinor_dble *s, spinor_dble const *r, + +void combine_spinor_dble(int vol, spinor_dble *s, spinor_dble *r, double cs, double cr) { spinor_dble *sm; @@ -252,7 +259,8 @@ void combine_spinor_dble_avx512(int vol, spinor_dble *s, spinor_dble const *r, } } -void scale_dble_avx512(int vol, double c, spinor_dble *s) + +void scale_dble(int vol, double c, spinor_dble *s) { spinor_dble *sm; __m128d t128; @@ -350,7 +358,8 @@ void rotate_dble_avx512(int n, int ix, spinor_dble **ppk, spinor_dble *psi, comp } } -void mulg5_dble_avx512(int vol, spinor_dble *s) + +void mulg5_dble(int vol, spinor_dble *s) { spinor_dble *sm; @@ -370,7 +379,7 @@ void mulg5_dble_avx512(int vol, spinor_dble *s) } } -void mulmg5_dble_avx512(int vol, spinor_dble *s) +void mulmg5_dble(int vol, spinor_dble *s) { spinor_dble *sm; @@ -389,3 +398,5 @@ void mulmg5_dble_avx512(int vol, spinor_dble *s) _mm256_storeu_pd( &(*s).c1.c1.re+8, s2 ); } } + +#endif \ No newline at end of file diff --git a/modules/linalg/avx512/salg_dble_avx512_asm.s b/modules/linalg/avx512/salg_dble_avx512_asm.s deleted file mode 100644 index d85271e..0000000 --- a/modules/linalg/avx512/salg_dble_avx512_asm.s +++ /dev/null @@ -1,768 +0,0 @@ -# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -# mark_description "-I../../../include -I.. 
-I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -# mark_description "es -S"; - .file "salg_dble_avx512.c" - .text -..TXTST0: -# -- Begin spinor_prod_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl spinor_prod_dble_avx512 -# --- spinor_prod_dble_avx512(const spinor_dble *, const spinor_dble *, const spinor_dble *) -spinor_prod_dble_avx512: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -..B1.1: # Preds ..B1.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_spinor_prod_dble_avx512.1: -..L2: - #16.1 - subq $136, %rsp #16.1 - .cfi_def_cfa_offset 144 - vpxord %zmm2, %zmm2, %zmm2 #21.8 - vmovaps %zmm2, %zmm1 #22.8 - cmpq %rsi, %rdi #23.14 - jae ..B1.5 # Prob 10% #23.14 - # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm1 zmm2 -..B1.2: # Preds ..B1.1 - # Execution count [9.00e-01] - vmovups .L_2il0floatpacket.3(%rip), %zmm0 #36.27 - # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 zmm1 zmm2 -..B1.3: # Preds ..B1.3 ..B1.2 - # Execution count [5.00e+00] - vmovups (%rdx), %zmm3 #27.30 - vmovups 64(%rdx), %zmm4 #28.30 - vmovups 128(%rdx), %zmm5 #29.30 - vmovups (%rdi), %zmm10 #24.30 - vmovups 64(%rdi), %zmm12 #25.30 - vmovups 128(%rdi), %zmm14 #26.30 - vfmadd231pd %zmm10, %zmm3, %zmm2 #30.10 - vpermilpd $85, %zmm3, %zmm6 #33.10 - addq $192, %rdi #23.19 - vmulpd %zmm0, %zmm6, %zmm9 #37.10 - addq $192, %rdx #43.5 - vfmadd231pd %zmm12, %zmm4, %zmm2 #31.10 - vfmadd231pd %zmm10, %zmm9, %zmm1 #40.10 - vfmadd231pd %zmm14, %zmm5, %zmm2 #32.10 - vpermilpd $85, %zmm4, %zmm7 #34.10 - vmulpd %zmm7, %zmm0, %zmm11 #38.10 - vpermilpd $85, %zmm5, %zmm8 #35.10 - vmulpd %zmm8, %zmm0, %zmm13 #39.10 - vfmadd231pd %zmm12, %zmm11, %zmm1 #41.10 - vfmadd231pd %zmm14, %zmm13, %zmm1 #42.10 - cmpq %rsi, %rdi #23.14 - jb ..B1.3 # Prob 82% #23.14 - # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 zmm1 zmm2 -..B1.5: # Preds ..B1.3 ..B1.1 - # Execution count [1.00e+00] - vmovups %zmm2, (%rsp) #45.10 - vmovups %zmm1, 64(%rsp) #46.10 - vmovsd (%rsp), %xmm2 #45.10 - vmovsd 16(%rsp), %xmm3 #45.10 - vmovsd 32(%rsp), %xmm6 #45.10 - vmovsd 48(%rsp), %xmm7 #45.10 - vmovsd 64(%rsp), %xmm1 #46.10 - vmovsd 80(%rsp), %xmm12 #46.10 - vmovsd 96(%rsp), %xmm15 #46.10 - vmovsd 112(%rsp), %xmm16 #46.10 - vaddsd 8(%rsp), %xmm2, %xmm4 #45.3 - vaddsd 24(%rsp), %xmm3, %xmm5 #45.3 - vaddsd 40(%rsp), %xmm6, %xmm8 #45.3 - vaddsd 56(%rsp), %xmm7, %xmm9 #45.3 - vaddsd 72(%rsp), %xmm1, %xmm13 #46.3 - vaddsd 88(%rsp), %xmm12, %xmm14 #46.3 - vaddsd 104(%rsp), %xmm15, %xmm17 #46.3 - vaddsd 120(%rsp), %xmm16, %xmm18 #46.3 - vaddsd %xmm5, %xmm4, %xmm10 #45.3 - vaddsd %xmm9, %xmm8, %xmm11 #45.3 - vaddsd %xmm14, %xmm13, %xmm19 #46.3 - vaddsd %xmm18, %xmm17, %xmm20 #46.3 - vaddsd %xmm11, %xmm10, %xmm0 #45.3 - vaddsd %xmm20, %xmm19, %xmm1 #46.3 - vzeroupper #47.10 - addq $136, %rsp #47.10 - .cfi_def_cfa_offset 8 - ret #47.10 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type spinor_prod_dble_avx512,@function - .size spinor_prod_dble_avx512,.-spinor_prod_dble_avx512 - .data -# -- End spinor_prod_dble_avx512 - .text -# -- Begin spinor_prod_re_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl spinor_prod_re_dble_avx512 -# --- spinor_prod_re_dble_avx512(const spinor_dble *, const spinor_dble *, const spinor_dble *) 
-spinor_prod_re_dble_avx512: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -..B2.1: # Preds ..B2.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_spinor_prod_re_dble_avx512.6: -..L7: - #51.1 - vpxord %zmm0, %zmm0, %zmm0 #54.8 - cmpq %rsi, %rdi #56.14 - jae ..B2.5 # Prob 10% #56.14 - # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 -..B2.3: # Preds ..B2.1 ..B2.3 - # Execution count [5.00e+00] - vmovups (%rdx), %zmm1 #60.30 - vmovups 64(%rdx), %zmm2 #61.30 - vmovups 128(%rdx), %zmm3 #62.30 - vfmadd231pd (%rdi), %zmm1, %zmm0 #63.10 - vfmadd231pd 64(%rdi), %zmm2, %zmm0 #64.10 - addq $192, %rdx #69.5 - vfmadd231pd 128(%rdi), %zmm3, %zmm0 #65.10 - addq $192, %rdi #56.19 - cmpq %rsi, %rdi #56.14 - jb ..B2.3 # Prob 82% #56.14 - # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 -..B2.5: # Preds ..B2.3 ..B2.1 - # Execution count [1.00e+00] - vmovups %zmm0, -72(%rsp) #71.7 - vmovsd -72(%rsp), %xmm0 #71.7 - vmovsd -56(%rsp), %xmm1 #71.7 - vmovsd -40(%rsp), %xmm4 #71.7 - vmovsd -24(%rsp), %xmm5 #71.7 - vaddsd -64(%rsp), %xmm0, %xmm2 #72.10 - vaddsd -48(%rsp), %xmm1, %xmm3 #72.10 - vaddsd -32(%rsp), %xmm4, %xmm6 #72.10 - vaddsd -16(%rsp), %xmm5, %xmm7 #72.10 - vaddsd %xmm3, %xmm2, %xmm8 #72.10 - vaddsd %xmm7, %xmm6, %xmm9 #72.10 - vaddsd %xmm9, %xmm8, %xmm0 #72.10 - vcvtsd2ss %xmm0, %xmm0, %xmm0 #71.7 - vcvtss2sd %xmm0, %xmm0, %xmm0 #72.10 - vzeroupper #72.10 - ret #72.10 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type spinor_prod_re_dble_avx512,@function - .size spinor_prod_re_dble_avx512,.-spinor_prod_re_dble_avx512 - .data -# -- End spinor_prod_re_dble_avx512 - .text -# -- Begin spinor_prod5_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl spinor_prod5_dble_avx512 -# --- spinor_prod5_dble_avx512(const spinor_dble *, const spinor_dble *, const spinor_dble *) -spinor_prod5_dble_avx512: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -..B3.1: # Preds ..B3.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_spinor_prod5_dble_avx512.9: -..L10: - #76.1 - subq $136, %rsp #76.1 - .cfi_def_cfa_offset 144 - vpxord %zmm3, %zmm3, %zmm3 #80.8 - vmovaps %zmm3, %zmm2 #81.8 - cmpq %rsi, %rdi #82.14 - jae ..B3.5 # Prob 10% #82.14 - # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm2 zmm3 -..B3.2: # Preds ..B3.1 - # Execution count [9.00e-01] - vmovups .L_2il0floatpacket.4(%rip), %zmm1 #89.27 - vmovups .L_2il0floatpacket.3(%rip), %zmm0 #97.27 - # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 zmm1 zmm2 zmm3 -..B3.3: # Preds ..B3.3 ..B3.2 - # Execution count [5.00e+00] - vmovups (%rdx), %zmm4 #86.30 - vmovups 64(%rdx), %zmm5 #87.30 - vmovups 128(%rdx), %zmm6 #88.30 - vmovups (%rdi), %zmm11 #83.30 - vmovups 128(%rdi), %zmm15 #85.30 - vmulpd 64(%rdi), %zmm1, %zmm13 #90.10 - vfmadd231pd %zmm11, %zmm4, %zmm3 #91.10 - vpermilpd $85, %zmm4, %zmm7 #94.10 - addq $192, %rdi #82.19 - vmulpd %zmm0, %zmm7, %zmm10 #98.10 - addq $192, %rdx #104.5 - vfmadd231pd %zmm13, %zmm5, %zmm3 #92.10 - vfmadd231pd %zmm11, %zmm10, %zmm2 #101.10 - vfnmadd231pd %zmm15, %zmm6, %zmm3 #93.10 - vpermilpd $85, %zmm5, %zmm8 #95.10 - vmulpd %zmm8, %zmm0, %zmm12 #99.10 - vpermilpd $85, %zmm6, %zmm9 #96.10 - vmulpd %zmm9, %zmm0, %zmm14 #100.10 - vfmadd213pd %zmm2, %zmm12, %zmm13 #102.10 - vmovaps %zmm13, %zmm2 #103.10 - vfnmadd231pd %zmm15, %zmm14, %zmm2 #103.10 - cmpq %rsi, %rdi #82.14 - jb ..B3.3 # Prob 82% #82.14 - # LOE rdx rbx rbp rsi rdi r12 r13 r14 r15 zmm0 zmm1 zmm2 zmm3 -..B3.5: # Preds ..B3.3 ..B3.1 - # Execution count [1.00e+00] - vmovups %zmm3, (%rsp) #106.10 - vmovups 
%zmm2, 64(%rsp) #107.10 - vmovsd (%rsp), %xmm1 #106.10 - vmovsd 16(%rsp), %xmm3 #106.10 - vmovsd 32(%rsp), %xmm6 #106.10 - vmovsd 48(%rsp), %xmm7 #106.10 - vmovsd 64(%rsp), %xmm2 #107.10 - vmovsd 80(%rsp), %xmm12 #107.10 - vmovsd 96(%rsp), %xmm15 #107.10 - vmovsd 112(%rsp), %xmm16 #107.10 - vaddsd 8(%rsp), %xmm1, %xmm4 #106.3 - vaddsd 24(%rsp), %xmm3, %xmm5 #106.3 - vaddsd 40(%rsp), %xmm6, %xmm8 #106.3 - vaddsd 56(%rsp), %xmm7, %xmm9 #106.3 - vaddsd 72(%rsp), %xmm2, %xmm13 #107.3 - vaddsd 88(%rsp), %xmm12, %xmm14 #107.3 - vaddsd 104(%rsp), %xmm15, %xmm17 #107.3 - vaddsd 120(%rsp), %xmm16, %xmm18 #107.3 - vaddsd %xmm5, %xmm4, %xmm10 #106.3 - vaddsd %xmm9, %xmm8, %xmm11 #106.3 - vaddsd %xmm14, %xmm13, %xmm19 #107.3 - vaddsd %xmm18, %xmm17, %xmm20 #107.3 - vaddsd %xmm11, %xmm10, %xmm0 #106.3 - vaddsd %xmm20, %xmm19, %xmm1 #107.3 - vzeroupper #108.10 - addq $136, %rsp #108.10 - .cfi_def_cfa_offset 8 - ret #108.10 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type spinor_prod5_dble_avx512,@function - .size spinor_prod5_dble_avx512,.-spinor_prod5_dble_avx512 - .data -# -- End spinor_prod5_dble_avx512 - .text -# -- Begin norm_square_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl norm_square_dble_avx512 -# --- norm_square_dble_avx512(const spinor_dble *, const spinor_dble *) -norm_square_dble_avx512: -# parameter 1: %rdi -# parameter 2: %rsi -..B4.1: # Preds ..B4.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_norm_square_dble_avx512.14: -..L15: - #112.1 - vpxord %zmm0, %zmm0, %zmm0 #114.9 - cmpq %rsi, %rdi #115.14 - jae ..B4.5 # Prob 10% #115.14 - # LOE rbx rbp rsi rdi r12 r13 r14 r15 zmm0 -..B4.3: # Preds ..B4.1 ..B4.3 - # Execution count [5.00e+00] - vmovups (%rdi), %zmm1 #116.30 - vmovups 64(%rdi), %zmm2 #117.30 - vmovups 128(%rdi), %zmm3 #118.30 - vfmadd231pd %zmm1, %zmm1, %zmm0 #119.11 - vfmadd231pd %zmm2, %zmm2, %zmm0 #120.11 - addq $192, %rdi #115.19 - vfmadd231pd %zmm3, %zmm3, %zmm0 #121.11 - cmpq %rsi, %rdi #115.14 - jb ..B4.3 # Prob 82% #115.14 - # LOE rbx rbp rsi rdi r12 r13 r14 r15 zmm0 -..B4.5: # Preds ..B4.3 ..B4.1 - # Execution count [1.00e+00] - vmovups %zmm0, -72(%rsp) #123.10 - vmovsd -72(%rsp), %xmm0 #123.10 - vmovsd -56(%rsp), %xmm1 #123.10 - vmovsd -40(%rsp), %xmm4 #123.10 - vmovsd -24(%rsp), %xmm5 #123.10 - vaddsd -64(%rsp), %xmm0, %xmm2 #123.10 - vaddsd -48(%rsp), %xmm1, %xmm3 #123.10 - vaddsd -32(%rsp), %xmm4, %xmm6 #123.10 - vaddsd -16(%rsp), %xmm5, %xmm7 #123.10 - vaddsd %xmm3, %xmm2, %xmm8 #123.10 - vaddsd %xmm7, %xmm6, %xmm9 #123.10 - vaddsd %xmm9, %xmm8, %xmm0 #123.10 - vzeroupper #123.10 - ret #123.10 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type norm_square_dble_avx512,@function - .size norm_square_dble_avx512,.-norm_square_dble_avx512 - .data -# -- End norm_square_dble_avx512 - .text -# -- Begin mulc_spinor_add_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl mulc_spinor_add_dble_avx512 -# --- mulc_spinor_add_dble_avx512(int, spinor_dble *, const spinor_dble *, complex_dble) -mulc_spinor_add_dble_avx512: -# parameter 1: %edi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %xmm0 %xmm1 -..B5.1: # Preds ..B5.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_mulc_spinor_add_dble_avx512.17: -..L18: - #128.1 - vmovapd %xmm1, %xmm2 #134.8 - movslq %edi, %rdi #128.1 - vbroadcastsd %xmm0, %zmm1 #135.8 - vbroadcastsd %xmm2, %zmm0 #136.8 - lea (%rdi,%rdi,2), %rax #138.8 - shlq $6, %rax #138.8 - addq %rsi, %rax #138.8 - cmpq %rax, %rsi #140.14 - jae ..B5.5 # Prob 10% #140.14 - # LOE 
rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -..B5.3: # Preds ..B5.1 ..B5.3 - # Execution count [5.00e+00] - vmovups (%rdx), %zmm3 #141.30 - vmulpd %zmm3, %zmm0, %zmm2 #142.10 - vpermilpd $85, %zmm2, %zmm4 #143.10 - vfmaddsub231pd %zmm1, %zmm3, %zmm4 #144.10 - vaddpd (%rsi), %zmm4, %zmm5 #146.10 - vmovups %zmm5, (%rsi) #147.26 - vmovups 64(%rdx), %zmm7 #149.30 - vmulpd %zmm7, %zmm0, %zmm6 #150.10 - vpermilpd $85, %zmm6, %zmm8 #151.10 - vfmaddsub231pd %zmm1, %zmm7, %zmm8 #152.10 - vaddpd 64(%rsi), %zmm8, %zmm9 #154.10 - vmovups %zmm9, 64(%rsi) #155.26 - vmovups 128(%rdx), %zmm11 #157.30 - addq $192, %rdx #165.5 - vmulpd %zmm11, %zmm0, %zmm10 #158.10 - vpermilpd $85, %zmm10, %zmm12 #159.10 - vfmaddsub231pd %zmm1, %zmm11, %zmm12 #160.10 - vaddpd 128(%rsi), %zmm12, %zmm13 #162.10 - vmovups %zmm13, 128(%rsi) #163.26 - addq $192, %rsi #140.18 - cmpq %rax, %rsi #140.14 - jb ..B5.3 # Prob 82% #140.14 - # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -..B5.5: # Preds ..B5.3 ..B5.1 - # Execution count [1.00e+00] - vzeroupper #167.1 - ret #167.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type mulc_spinor_add_dble_avx512,@function - .size mulc_spinor_add_dble_avx512,.-mulc_spinor_add_dble_avx512 - .data -# -- End mulc_spinor_add_dble_avx512 - .text -# -- Begin mulr_spinor_add_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl mulr_spinor_add_dble_avx512 -# --- mulr_spinor_add_dble_avx512(int, spinor_dble *, const spinor_dble *, double) -mulr_spinor_add_dble_avx512: -# parameter 1: %edi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %xmm0 -..B6.1: # Preds ..B6.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_mulr_spinor_add_dble_avx512.20: -..L21: - #171.1 - movslq %edi, %rdi #171.1 - vbroadcastsd %xmm0, %zmm0 #177.8 - lea (%rdi,%rdi,2), %rax #179.8 - shlq $6, %rax #179.8 - addq %rsi, %rax #179.8 - cmpq %rax, %rsi #181.14 - jae ..B6.5 # Prob 10% #181.14 - .align 16,0x90 - # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 -..B6.3: # Preds ..B6.1 ..B6.3 - # Execution count [5.00e+00] - vmovups (%rdx), %zmm1 #182.30 - vfmadd213pd (%rsi), %zmm0, %zmm1 #185.10 - vmovups %zmm1, (%rsi) #186.26 - vmovups 64(%rdx), %zmm2 #188.30 - vfmadd213pd 64(%rsi), %zmm0, %zmm2 #191.10 - vmovups %zmm2, 64(%rsi) #192.26 - vmovups 128(%rdx), %zmm3 #194.30 - addq $192, %rdx #200.5 - vfmadd213pd 128(%rsi), %zmm0, %zmm3 #197.10 - vmovups %zmm3, 128(%rsi) #198.26 - addq $192, %rsi #181.18 - cmpq %rax, %rsi #181.14 - jb ..B6.3 # Prob 82% #181.14 - # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 -..B6.5: # Preds ..B6.3 ..B6.1 - # Execution count [1.00e+00] - vzeroupper #202.1 - ret #202.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type mulr_spinor_add_dble_avx512,@function - .size mulr_spinor_add_dble_avx512,.-mulr_spinor_add_dble_avx512 - .data -# -- End mulr_spinor_add_dble_avx512 - .text -# -- Begin combine_spinor_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl combine_spinor_dble_avx512 -# --- combine_spinor_dble_avx512(int, spinor_dble *, const spinor_dble *, double, double) -combine_spinor_dble_avx512: -# parameter 1: %edi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %xmm0 -# parameter 5: %xmm1 -..B7.1: # Preds ..B7.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_combine_spinor_dble_avx512.23: -..L24: - #206.1 - vmovapd %xmm1, %xmm2 #212.11 - movslq %edi, %rdi #206.1 - vbroadcastsd %xmm0, %zmm1 #213.9 - vbroadcastsd %xmm2, %zmm0 #214.9 - lea (%rdi,%rdi,2), %rax #216.8 - shlq $6, %rax #216.8 - addq %rsi, %rax #216.8 - 
cmpq %rax, %rsi #218.14 - jae ..B7.5 # Prob 10% #218.14 - .align 16,0x90 - # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -..B7.3: # Preds ..B7.1 ..B7.3 - # Execution count [5.00e+00] - vmulpd (%rdx), %zmm0, %zmm2 #220.10 - vfmadd231pd (%rsi), %zmm1, %zmm2 #222.10 - vmovups %zmm2, (%rsi) #223.26 - vmulpd 64(%rdx), %zmm0, %zmm3 #226.10 - vfmadd231pd 64(%rsi), %zmm1, %zmm3 #228.10 - vmovups %zmm3, 64(%rsi) #229.26 - vmulpd 128(%rdx), %zmm0, %zmm4 #232.10 - addq $192, %rdx #237.5 - vfmadd231pd 128(%rsi), %zmm1, %zmm4 #234.10 - vmovups %zmm4, 128(%rsi) #235.26 - addq $192, %rsi #218.18 - cmpq %rax, %rsi #218.14 - jb ..B7.3 # Prob 82% #218.14 - # LOE rax rdx rbx rbp rsi r12 r13 r14 r15 zmm0 zmm1 -..B7.5: # Preds ..B7.3 ..B7.1 - # Execution count [1.00e+00] - vzeroupper #239.1 - ret #239.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type combine_spinor_dble_avx512,@function - .size combine_spinor_dble_avx512,.-combine_spinor_dble_avx512 - .data -# -- End combine_spinor_dble_avx512 - .text -# -- Begin scale_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl scale_dble_avx512 -# --- scale_dble_avx512(int, double, spinor_dble *) -scale_dble_avx512: -# parameter 1: %edi -# parameter 2: %xmm0 -# parameter 3: %rsi -..B8.1: # Preds ..B8.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_scale_dble_avx512.26: -..L27: - #242.1 - movslq %edi, %rdi #242.1 - vbroadcastsd %xmm0, %zmm0 #248.8 - lea (%rdi,%rdi,2), %rax #250.8 - shlq $6, %rax #250.8 - addq %rsi, %rax #250.8 - cmpq %rax, %rsi #252.14 - jae ..B8.5 # Prob 10% #252.14 - # LOE rax rbx rbp rsi r12 r13 r14 r15 zmm0 -..B8.3: # Preds ..B8.1 ..B8.3 - # Execution count [5.00e+00] - vmulpd (%rsi), %zmm0, %zmm1 #254.10 - vmulpd 64(%rsi), %zmm0, %zmm2 #258.10 - vmulpd 128(%rsi), %zmm0, %zmm3 #262.10 - vmovups %zmm1, (%rsi) #255.26 - vmovups %zmm2, 64(%rsi) #259.26 - vmovups %zmm3, 128(%rsi) #263.26 - addq $192, %rsi #252.18 - cmpq %rax, %rsi #252.14 - jb ..B8.3 # Prob 82% #252.14 - # LOE rax rbx rbp rsi r12 r13 r14 r15 zmm0 -..B8.5: # Preds ..B8.3 ..B8.1 - # Execution count [1.00e+00] - vzeroupper #265.1 - ret #265.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type scale_dble_avx512,@function - .size scale_dble_avx512,.-scale_dble_avx512 - .data -# -- End scale_dble_avx512 - .text -# -- Begin rotate_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl rotate_dble_avx512 -# --- rotate_dble_avx512(int, int, spinor_dble **, spinor_dble *, const complex_dble *) -rotate_dble_avx512: -# parameter 1: %edi -# parameter 2: %esi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %r8 -..B9.1: # Preds ..B9.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_rotate_dble_avx512.29: -..L30: - #268.1 - xorl %r10d, %r10d #273.8 - movslq %edi, %r9 #268.1 - xorl %edi, %edi #273.8 - testq %r9, %r9 #273.19 - jle ..B9.9 # Prob 10% #273.19 - # LOE rdx rcx rbx rbp rdi r8 r9 r10 r12 r13 r14 r15 esi -..B9.2: # Preds ..B9.1 - # Execution count [9.00e-01] - movslq %esi, %rsi #268.1 - movq %r9, %rax #306.7 - vmovups .L_2il0floatpacket.3(%rip), %zmm0 #286.12 - shlq $4, %rax #306.7 - lea (%rsi,%rsi,2), %rsi #278.10 - shlq $6, %rsi #278.10 - movq %r15, -24(%rsp) #306.7[spill] - movq %rbx, -16(%rsp) #306.7[spill] - .cfi_offset 3, -24 - .cfi_offset 15, -32 - # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r12 r13 r14 zmm0 -..B9.3: # Preds ..B9.7 ..B9.2 - # Execution count [5.00e+00] - vmulpd 8(%r8){1to8}, %zmm0, %zmm6 #287.10 - movq %r8, %r11 #279.5 - vbroadcastsd (%r8), %zmm1 #283.10 - movq (%rdx), %rbx #278.10 - 
vmovups (%rsi,%rbx), %zmm2 #289.30 - vmovups 64(%rsi,%rbx), %zmm5 #294.30 - vmovups 128(%rsi,%rbx), %zmm8 #299.30 - vmulpd %zmm2, %zmm6, %zmm3 #290.10 - vmulpd %zmm5, %zmm6, %zmm4 #295.10 - vmulpd %zmm8, %zmm6, %zmm7 #300.10 - vpermilpd $85, %zmm3, %zmm3 #291.10 - movl $1, %ebx #304.10 - vfmadd231pd %zmm1, %zmm2, %zmm3 #292.10 - vpermilpd $85, %zmm4, %zmm2 #296.10 - vpermilpd $85, %zmm7, %zmm9 #301.10 - vfmadd231pd %zmm1, %zmm5, %zmm2 #297.10 - vfmadd213pd %zmm9, %zmm8, %zmm1 #302.10 - cmpq $1, %r9 #304.21 - jle ..B9.7 # Prob 10% #304.21 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 zmm0 zmm1 zmm2 zmm3 -..B9.5: # Preds ..B9.3 ..B9.5 - # Execution count [2.50e+01] - addq %rax, %r11 #306.7 - movq (%rdx,%rbx,8), %r15 #305.12 - incq %rbx #304.24 - vmulpd 8(%r11){1to8}, %zmm0, %zmm10 #312.12 - vmovups (%r15,%rsi), %zmm5 #314.32 - vmovups 64(%r15,%rsi), %zmm8 #320.32 - vmovups 128(%r15,%rsi), %zmm12 #326.32 - vbroadcastsd (%r11), %zmm14 #310.12 - vmulpd %zmm5, %zmm10, %zmm4 #315.12 - vmulpd %zmm8, %zmm10, %zmm7 #321.12 - vmulpd %zmm12, %zmm10, %zmm11 #327.12 - vpermilpd $85, %zmm4, %zmm6 #316.12 - vpermilpd $85, %zmm7, %zmm9 #322.12 - vpermilpd $85, %zmm11, %zmm13 #328.12 - vfmadd231pd %zmm14, %zmm5, %zmm6 #317.12 - vfmadd231pd %zmm14, %zmm8, %zmm9 #323.12 - vfmadd213pd %zmm13, %zmm12, %zmm14 #329.12 - vaddpd %zmm3, %zmm6, %zmm3 #318.12 - vaddpd %zmm2, %zmm9, %zmm2 #324.12 - vaddpd %zmm1, %zmm14, %zmm1 #330.12 - cmpq %r9, %rbx #304.21 - jl ..B9.5 # Prob 82% #304.21 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 zmm0 zmm1 zmm2 zmm3 -..B9.7: # Preds ..B9.5 ..B9.3 - # Execution count [5.00e+00] - incq %r10 #273.22 - addq $16, %r8 #273.22 - vmovups %zmm3, (%rdi,%rcx) #333.26 - vmovups %zmm2, 64(%rdi,%rcx) #334.26 - vmovups %zmm1, 128(%rdi,%rcx) #335.26 - addq $192, %rdi #273.22 - cmpq %r9, %r10 #273.19 - jl ..B9.3 # Prob 82% #273.19 - # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r12 r13 r14 zmm0 -..B9.8: # Preds ..B9.7 - # Execution count [9.00e-01] - movq -24(%rsp), %r15 #[spill] - .cfi_restore 15 - movq -16(%rsp), %rbx #[spill] - .cfi_restore 3 - # LOE rbx rbp r12 r13 r14 r15 -..B9.9: # Preds ..B9.8 ..B9.1 - # Execution count [1.00e+00] - vzeroupper #337.1 - ret #337.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type rotate_dble_avx512,@function - .size rotate_dble_avx512,.-rotate_dble_avx512 - .data -# -- End rotate_dble_avx512 - .text -# -- Begin mulg5_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl mulg5_dble_avx512 -# --- mulg5_dble_avx512(int, spinor_dble *) -mulg5_dble_avx512: -# parameter 1: %edi -# parameter 2: %rsi -..B10.1: # Preds ..B10.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_mulg5_dble_avx512.36: -..L37: - #340.1 - movslq %edi, %rdi #340.1 - lea (%rdi,%rdi,2), %rax #343.8 - shlq $6, %rax #343.8 - addq %rsi, %rax #343.8 - cmpq %rax, %rsi #345.14 - jae ..B10.5 # Prob 10% #345.14 - # LOE rax rbx rbp rsi r12 r13 r14 r15 -..B10.2: # Preds ..B10.1 - # Execution count [9.00e-01] - vpxord %zmm1, %zmm1, %zmm1 #350.25 - vxorpd %ymm0, %ymm0, %ymm0 #354.25 - # LOE rax rbx rbp rsi r12 r13 r14 r15 ymm0 zmm1 -..B10.3: # Preds ..B10.3 ..B10.2 - # Execution count [5.00e+00] - vsubpd 96(%rsi), %zmm1, %zmm2 #350.10 - vsubpd 160(%rsi), %ymm0, %ymm3 #354.10 - vmovups %zmm2, 96(%rsi) #351.26 - vmovupd %ymm3, 160(%rsi) #355.26 - addq $192, %rsi #345.18 - cmpq %rax, %rsi #345.14 - jb ..B10.3 # Prob 82% #345.14 - # LOE rax rbx rbp rsi r12 r13 r14 r15 ymm0 zmm1 -..B10.5: # Preds ..B10.3 ..B10.1 - # Execution count [1.00e+00] - 
vzeroupper #357.1 - ret #357.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type mulg5_dble_avx512,@function - .size mulg5_dble_avx512,.-mulg5_dble_avx512 - .data -# -- End mulg5_dble_avx512 - .text -# -- Begin mulmg5_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl mulmg5_dble_avx512 -# --- mulmg5_dble_avx512(int, spinor_dble *) -mulmg5_dble_avx512: -# parameter 1: %edi -# parameter 2: %rsi -..B11.1: # Preds ..B11.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_mulmg5_dble_avx512.39: -..L40: - #360.1 - movslq %edi, %rdi #360.1 - lea (%rdi,%rdi,2), %rax #363.8 - shlq $6, %rax #363.8 - addq %rsi, %rax #363.8 - cmpq %rax, %rsi #365.14 - jae ..B11.5 # Prob 10% #365.14 - # LOE rax rbx rbp rsi r12 r13 r14 r15 -..B11.2: # Preds ..B11.1 - # Execution count [9.00e-01] - vpxord %zmm1, %zmm1, %zmm1 #370.25 - vxorpd %ymm0, %ymm0, %ymm0 #374.25 - # LOE rax rbx rbp rsi r12 r13 r14 r15 ymm0 zmm1 -..B11.3: # Preds ..B11.3 ..B11.2 - # Execution count [5.00e+00] - vsubpd (%rsi), %zmm1, %zmm2 #370.10 - vsubpd 64(%rsi), %ymm0, %ymm3 #374.10 - vmovups %zmm2, (%rsi) #371.26 - vmovupd %ymm3, 64(%rsi) #375.26 - addq $192, %rsi #365.18 - cmpq %rax, %rsi #365.14 - jb ..B11.3 # Prob 82% #365.14 - # LOE rax rbx rbp rsi r12 r13 r14 r15 ymm0 zmm1 -..B11.5: # Preds ..B11.3 ..B11.1 - # Execution count [1.00e+00] - vzeroupper #377.1 - ret #377.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type mulmg5_dble_avx512,@function - .size mulmg5_dble_avx512,.-mulmg5_dble_avx512 - .data -# -- End mulmg5_dble_avx512 - .section .rodata, "a" - .align 64 - .align 64 -.L_2il0floatpacket.3: - .long 0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000 - .type .L_2il0floatpacket.3,@object - .size .L_2il0floatpacket.3,64 - .align 64 -.L_2il0floatpacket.4: - .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0xbff00000,0x00000000,0xbff00000,0x00000000,0xbff00000,0x00000000,0xbff00000 - .type .L_2il0floatpacket.4,@object - .size .L_2il0floatpacket.4,64 - .data - .section .note.GNU-stack, "" -// -- Begin DWARF2 SEGMENT .eh_frame - .section .eh_frame,"a",@progbits -.eh_frame_seg: - .align 8 -# End diff --git a/modules/linalg/salg.c b/modules/linalg/salg.c index 40e8479..f58aea8 100644 --- a/modules/linalg/salg.c +++ b/modules/linalg/salg.c @@ -90,40 +90,6 @@ static void alloc_wrotate(int n) } -#if (defined AVX512 ) - -void mulc_spinor_add_avx512(int vol, spinor *s, spinor const *r, complex z); -void mulc_spinor_add(int vol, spinor *s, spinor *r, complex z) -{ - mulc_spinor_add_avx512( vol, s, r, z); -} - -complex_dble spinor_prod_avx512(int vol, spinor *s, spinor *r ); -complex spinor_prod(int vol, int icom, spinor *s, spinor *r ) -{ - complex z; - complex_dble v, w; - - v = spinor_prod_avx512(vol, s, r); - - if ((icom==1)&&(NPROC>1)) - { - MPI_Reduce(&v.re,&w.re,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); - MPI_Bcast(&w.re,2,MPI_DOUBLE,0,MPI_COMM_WORLD); - z.re=(float)(w.re); - z.im=(float)(w.im); - } - else - { - z.re=(float)(v.re); - z.im=(float)(v.im); - } - return z; -} - -#endif - - #if (defined AVX) #include "avx.h" @@ -362,7 +328,7 @@ complex spinor_prod(int vol,int icom,spinor *s,spinor *r) } #endif -void mulc_spinor_add(int vol, spinor *s, spinor const *r, complex z) +void mulc_spinor_add(int vol, spinor *s, spinor *r, complex z) { spinor *sm; diff --git a/modules/linalg/salg_dble.c b/modules/linalg/salg_dble.c 
index 01e20ca..153ca22 100644 --- a/modules/linalg/salg_dble.c +++ b/modules/linalg/salg_dble.c @@ -108,7 +108,6 @@ static void alloc_wrotate(int n) #if (defined AVX512) #include "avx512.h" - complex_dble spinor_prod_dble_avx512( spinor_dble *s, spinor_dble *smb, spinor_dble *r); complex_dble spinor_prod_dble(int vol, int icom, spinor_dble *s, spinor_dble *r) @@ -248,35 +247,6 @@ double norm_square_dble(int vol, int icom, spinor_dble *s) return smx; } -void mulc_spinor_add_dble_avx512(int vol, spinor_dble *s, spinor_dble *r, - complex_dble z); -void mulc_spinor_add_dble(int vol, spinor_dble *s, spinor_dble *r, - complex_dble z) -{ - mulc_spinor_add_dble_avx512(vol,s,r,z); -} - -void mulr_spinor_add_dble_avx512(int vol, spinor_dble *s, spinor_dble *r, - double c); -void mulr_spinor_add_dble(int vol, spinor_dble *s, spinor_dble *r, - double c) -{ - mulr_spinor_add_dble_avx512(vol,s,r,c); -} - -void combine_spinor_dble_avx512(int vol, spinor_dble *s, spinor_dble *r, - double cs, double cr); -void combine_spinor_dble(int vol, spinor_dble *s, spinor_dble *r, - double cs, double cr) -{ - combine_spinor_dble_avx512(vol,s,r,cs,cr); -} - -void scale_dble_avx512(int vol, double c, spinor_dble *s); -void scale_dble(int vol, double c, spinor_dble *s) -{ - scale_dble_avx512(vol,c,s); -} void rotate_dble_avx512(int n, int ix, spinor_dble **ppk, spinor_dble *psi, complex_dble *v); void rotate_dble(int vol, int n, spinor_dble **ppk, complex_dble *v) @@ -296,19 +266,6 @@ void rotate_dble(int vol, int n, spinor_dble **ppk, complex_dble *v) } } -void mulg5_dble_avx512(int vol, spinor_dble *s); -void mulg5_dble(int vol, spinor_dble *s) -{ - mulg5_dble_avx512( vol, s ); -} - -void mulmg5_dble_avx512(int vol, spinor_dble *s); -void mulmg5_dble(int vol, spinor_dble *s) -{ - mulmg5_dble_avx512( vol, s ); -} - - #elif (defined AVX) #include "avx.h" diff --git a/modules/sw_term/avx512/pauli_avx512.c b/modules/sw_term/avx512/pauli_avx512.c index 504ff80..89e9d50 100644 --- a/modules/sw_term/avx512/pauli_avx512.c +++ b/modules/sw_term/avx512/pauli_avx512.c @@ -13,6 +13,8 @@ * *******************************************************************************/ +#ifdef AVX512 + #include #include #include @@ -26,7 +28,8 @@ typedef union #include "avx512.h" -void mul_pauli2_avx512(float mu, pauli *m, spinor *source, spinor *res ) + +void mul_pauli2(float mu, pauli *m, spinor *source, spinor *res ) { spin_t *ps, *pr; float const *u, *u2; @@ -228,3 +231,5 @@ void mul_pauli2_avx512(float mu, pauli *m, spinor *source, spinor *res ) t128b = _mm_add_ps( t128a, t128b ); _mm_storeu_ps( &(*r2).c2.c2.re, t128b ); } + +#endif \ No newline at end of file diff --git a/modules/sw_term/avx512/pauli_avx512_asm.s b/modules/sw_term/avx512/pauli_avx512_asm.s deleted file mode 100644 index d91d416..0000000 --- a/modules/sw_term/avx512/pauli_avx512_asm.s +++ /dev/null @@ -1,295 +0,0 @@ -# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -# mark_description "-I../../../include -I.. 
-I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -# mark_description "es -S"; - .file "pauli_avx512.c" - .text -..TXTST0: -# -- Begin mul_pauli2_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl mul_pauli2_avx512 -# --- mul_pauli2_avx512(float, pauli *, spinor *, spinor *) -mul_pauli2_avx512: -# parameter 1: %xmm0 -# parameter 2: %rdi -# parameter 3: %rsi -# parameter 4: %rdx -..B1.1: # Preds ..B1.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_mul_pauli2_avx512.1: -..L2: - #15.1 - movl $42410, %eax #82.9 - vmovups (%rdi), %zmm8 #51.27 - vmovups 64(%rdi), %zmm7 #52.27 - vmovups 144(%rdi), %zmm29 #54.27 - vmovups 208(%rdi), %zmm3 #55.27 - vmovups .L_2il0floatpacket.11(%rip), %zmm25 #69.9 - vmovups .L_2il0floatpacket.12(%rip), %zmm31 #71.9 - vmovups .L_2il0floatpacket.13(%rip), %zmm16 #74.9 - vbroadcastss %xmm0, %xmm14 #64.9 - vmovups 64(%rsi), %ymm12 #43.9 - vmovups (%rsi), %zmm11 #42.29 - vmovups .L_2il0floatpacket.8(%rip), %zmm9 #45.9 - vmovups .L_2il0floatpacket.14(%rip), %zmm17 #80.10 - vmovups .L_2il0floatpacket.15(%rip), %zmm19 #88.10 - vmovups .L_2il0floatpacket.17(%rip), %zmm22 #101.9 - vmovups .L_2il0floatpacket.9(%rip), %zmm10 #47.9 - vmovups .L_2il0floatpacket.10(%rip), %zmm13 #49.9 - vmovups .L_2il0floatpacket.16(%rip), %zmm20 #93.10 - vmovups .L_2il0floatpacket.18(%rip), %zmm24 #107.10 - vmovups .L_2il0floatpacket.19(%rip), %zmm28 #116.9 - vmovups 128(%rdi), %zmm2 #53.27 - vpermi2ps %zmm29, %zmm8, %zmm25 #69.9 - vpermi2ps %zmm3, %zmm7, %zmm31 #71.9 - vbroadcastss %xmm14, %zmm26 #65.9 - vpermt2ps %zmm29, %zmm28, %zmm8 #116.9 - vpermi2ps %zmm31, %zmm25, %zmm16 #74.9 - vpermi2ps %zmm25, %zmm31, %zmm19 #88.10 - vpermt2ps %zmm31, %zmm22, %zmm25 #101.9 - vpermi2ps %zmm16, %zmm26, %zmm17 #80.10 - vpermi2ps %zmm26, %zmm31, %zmm20 #93.10 - vpermt2ps %zmm25, %zmm24, %zmm26 #107.10 - vmovups .L_2il0floatpacket.22(%rip), %zmm28 #137.9 - vshufps $170, %zmm31, %zmm8, %zmm30 #118.10 - vshufps $255, %zmm31, %zmm8, %zmm24 #121.10 - kmovw %eax, %k2 #82.9 - vpermilps $160, %zmm16, %zmm15 #75.10 - movl $23125, %eax #83.9 - kmovw %eax, %k3 #83.9 - movl $21925, %eax #95.9 - kmovw %eax, %k4 #95.9 - vpermilps $160, %zmm25, %zmm23 #102.10 - movl $43610, %eax #96.9 - kmovw %eax, %k5 #96.9 - movl $26022, %eax #109.9 - kmovw %eax, %k6 #109.9 - movl $39513, %eax #110.9 - kmovw %eax, %k7 #110.9 - movl $43690, %eax #123.9 - vpermi2ps %zmm12, %zmm11, %zmm9 #45.9 - vpermi2ps %zmm12, %zmm11, %zmm10 #47.9 - vpermt2ps %zmm12, %zmm13, %zmm11 #49.9 - vmulps %zmm15, %zmm9, %zmm1 #76.9 - vmulps %zmm19, %zmm10, %zmm0 #89.9 - vmulps %zmm23, %zmm11, %zmm12 #103.9 - vmovups .L_2il0floatpacket.21(%rip), %zmm15 #133.9 - vpermilps $177, %zmm9, %zmm5 #59.10 - vmulps %zmm5, %zmm17, %zmm18 #81.10 - vmovups .L_2il0floatpacket.23(%rip), %zmm17 #158.9 - vpermi2ps %zmm3, %zmm7, %zmm15 #133.9 - vaddps %zmm18, %zmm1, %zmm1{%k2} #82.9 - vpermi2ps %zmm15, %zmm8, %zmm28 #137.9 - vsubps %zmm18, %zmm1, %zmm1{%k3} #83.9 - vpermi2ps %zmm15, %zmm8, %zmm17 #158.9 - kmovw %eax, %k3 #123.9 - vfmadd231ps %zmm10, %zmm30, %zmm1 #119.9 - vpermilps $177, %zmm11, %zmm4 #61.10 - movl $21845, %eax #124.9 - vmulps %zmm4, %zmm26, %zmm27 #108.10 - vmovups .L_2il0floatpacket.20(%rip), %zmm26 #131.10 - kmovw %eax, %k2 #124.9 - vpermt2ps 
272(%rdi), %zmm26, %zmm2 #131.10 - vaddps %zmm27, %zmm12, %zmm12{%k6} #109.9 - vpermilps $177, %zmm10, %zmm6 #60.10 - movl $61680, %eax #139.10 - vmulps %zmm6, %zmm20, %zmm21 #94.10 - vmulps %zmm24, %zmm6, %zmm25 #122.10 - vmovups .L_2il0floatpacket.24(%rip), %zmm20 #169.9 - vsubps %zmm27, %zmm12, %zmm12{%k7} #110.9 - vaddps %zmm21, %zmm0, %zmm0{%k4} #95.9 - vpermt2ps %zmm3, %zmm20, %zmm7 #169.9 - vaddps %zmm25, %zmm1, %zmm1{%k3} #123.9 - vsubps %zmm21, %zmm0, %zmm0{%k5} #96.9 - vmovups .L_2il0floatpacket.25(%rip), %zmm3 #172.9 - vmovups .L_2il0floatpacket.26(%rip), %zmm21 #195.9 - vsubps %zmm25, %zmm1, %zmm1{%k2} #124.9 - vpermi2ps %zmm8, %zmm7, %zmm3 #172.9 - kmovw %eax, %k1 #139.10 - vpermilps $245, %zmm28, %zmm29 #142.9 - movl $3855, %eax #148.9 - vshufps $244, %zmm2, %zmm29, %zmm29{%k1} #143.10 - vshufps $68, %zmm2, %zmm7, %zmm7{%k1} #183.9 - kmovw %eax, %k4 #148.9 - vmulps %zmm29, %zmm5, %zmm30 #144.10 - vpermilps $160, %zmm28, %zmm27 #138.9 - movl $42405, %eax #154.9 - vshufps $164, %zmm2, %zmm27, %zmm27{%k1} #139.10 - vmovaps %zmm15, %zmm13 #148.9 - vshufps $228, %zmm15, %zmm8, %zmm13{%k4} #148.9 - vfmadd231ps %zmm9, %zmm27, %zmm0 #140.9 - vpermilps $245, %zmm13, %zmm14 #152.10 - vmulps %zmm14, %zmm5, %zmm5 #153.10 - vaddps %zmm30, %zmm0, %zmm0{%k2} #145.9 - kmovw %eax, %k2 #154.9 - vsubps %zmm30, %zmm0, %zmm0{%k3} #146.9 - vpermilps $160, %zmm13, %zmm31 #149.10 - movl $23130, %eax #155.9 - vpermilps $160, %zmm3, %zmm8 #173.10 - vfmadd213ps %zmm12, %zmm31, %zmm9 #150.9 - kmovw %eax, %k3 #155.9 - vaddps %zmm5, %zmm9, %zmm9{%k2} #154.9 - vpermilps $160, %zmm17, %zmm16 #159.10 - movl $42662, %eax #164.9 - vpermilps $10, %zmm2, %zmm8{%k1} #174.10 - vfmadd231ps %zmm11, %zmm16, %zmm1 #160.9 - vfmadd213ps %zmm0, %zmm8, %zmm11 #175.9 - vsubps %zmm5, %zmm9, %zmm9{%k3} #155.9 - kmovw %eax, %k5 #164.9 - vpermilps $245, %zmm3, %zmm0 #177.10 - movl $22873, %eax #165.9 - vpermilps $245, %zmm17, %zmm18 #162.10 - vpermilps $95, %zmm2, %zmm0{%k1} #178.10 - vpermilps $160, %zmm7, %zmm2 #184.10 - vpermilps $245, %zmm7, %zmm7 #187.10 - vmulps %zmm18, %zmm4, %zmm19 #163.10 - vmulps %zmm7, %zmm6, %zmm6 #188.10 - vmulps %zmm0, %zmm4, %zmm4 #179.10 - vfmadd213ps %zmm9, %zmm2, %zmm10 #185.9 - vmovups .L_2il0floatpacket.27(%rip), %zmm9 #197.9 - vaddps %zmm19, %zmm1, %zmm1{%k5} #164.9 - vaddps %zmm6, %zmm10, %zmm10{%k2} #189.9 - kmovw %eax, %k6 #165.9 - vsubps %zmm6, %zmm10, %zmm10{%k3} #190.9 - vsubps %zmm19, %zmm1, %zmm1{%k6} #165.9 - movl $25957, %eax #180.9 - kmovw %eax, %k7 #180.9 - movl $39578, %eax #181.9 - kmovw %eax, %k5 #181.9 - vaddps %zmm4, %zmm11, %zmm11{%k7} #180.9 - vpermi2ps %zmm10, %zmm1, %zmm21 #195.9 - vpermt2ps %zmm10, %zmm9, %zmm1 #197.9 - vsubps %zmm4, %zmm11, %zmm11{%k5} #181.9 - vaddps %zmm1, %zmm21, %zmm1 #198.9 - vextractf32x4 $1, %zmm11, %xmm10 #204.11 - vextractf32x4 $2, %zmm11, %xmm0 #211.11 - vextractf32x4 $3, %zmm11, %xmm23 #212.11 - vaddps %xmm11, %xmm10, %xmm22 #205.11 - vmovups %xmm22, 32(%rdx) #206.21 - vmovups %ymm1, (%rdx) #201.24 - vextractf64x4 $1, %zmm1, 48(%rdx) #209.24 - vaddps %xmm0, %xmm23, %xmm1 #213.11 - vmovups %xmm1, 80(%rdx) #214.21 - vzeroupper #215.1 - ret #215.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type mul_pauli2_avx512,@function - .size mul_pauli2_avx512,.-mul_pauli2_avx512 - .data -# -- End mul_pauli2_avx512 - .section .rodata, "a" - .align 64 - .align 64 -.L_2il0floatpacket.8: - .long 
0x00000000,0x00000001,0x00000002,0x00000003,0x00000006,0x00000007,0x00000008,0x00000009,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000012,0x00000013,0x00000014,0x00000015 - .type .L_2il0floatpacket.8,@object - .size .L_2il0floatpacket.8,64 - .align 64 -.L_2il0floatpacket.9: - .long 0x00000002,0x00000003,0x00000004,0x00000005,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000e,0x0000000f,0x00000010,0x00000011,0x00000014,0x00000015,0x00000016,0x00000017 - .type .L_2il0floatpacket.9,@object - .size .L_2il0floatpacket.9,64 - .align 64 -.L_2il0floatpacket.10: - .long 0x00000004,0x00000005,0x00000000,0x00000001,0x0000000a,0x0000000b,0x00000006,0x00000007,0x00000010,0x00000011,0x0000000c,0x0000000d,0x00000016,0x00000017,0x00000012,0x00000013 - .type .L_2il0floatpacket.10,@object - .size .L_2il0floatpacket.10,64 - .align 64 -.L_2il0floatpacket.11: - .long 0x00000000,0x00000001,0x0000000a,0x0000000b,0x00000004,0x00000005,0x00000002,0x00000003,0x00000010,0x00000011,0x0000001a,0x0000001b,0x00000014,0x00000015,0x00000012,0x00000013 - .type .L_2il0floatpacket.11,@object - .size .L_2il0floatpacket.11,64 - .align 64 -.L_2il0floatpacket.12: - .long 0x00000004,0x00000005,0x00000000,0x00000001,0x0000000c,0x0000000d,0x00000006,0x00000007,0x00000014,0x00000015,0x00000010,0x00000011,0x0000001c,0x0000001d,0x00000016,0x00000017 - .type .L_2il0floatpacket.12,@object - .size .L_2il0floatpacket.12,64 - .align 64 -.L_2il0floatpacket.13: - .long 0x00000000,0x00000000,0x00000001,0x00000001,0x00000002,0x00000003,0x00000010,0x00000011,0x00000008,0x00000008,0x00000009,0x00000009,0x0000000a,0x0000000b,0x00000018,0x00000019 - .type .L_2il0floatpacket.13,@object - .size .L_2il0floatpacket.13,64 - .align 64 -.L_2il0floatpacket.14: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000015,0x00000015,0x00000017,0x00000017,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000001d,0x0000001d,0x0000001f,0x0000001f - .type .L_2il0floatpacket.14,@object - .size .L_2il0floatpacket.14,64 - .align 64 -.L_2il0floatpacket.15: - .long 0x00000000,0x00000000,0x00000004,0x00000004,0x00000014,0x00000014,0x00000015,0x00000015,0x00000008,0x00000008,0x0000000c,0x0000000c,0x0000001c,0x0000001c,0x0000001d,0x0000001d - .type .L_2il0floatpacket.15,@object - .size .L_2il0floatpacket.15,64 - .align 64 -.L_2il0floatpacket.16: - .long 0x00000001,0x00000001,0x00000005,0x00000005,0x00000014,0x00000015,0x00000016,0x00000017,0x00000009,0x00000009,0x0000000d,0x0000000d,0x0000001c,0x0000001d,0x0000001e,0x0000001f - .type .L_2il0floatpacket.16,@object - .size .L_2il0floatpacket.16,64 - .align 64 -.L_2il0floatpacket.17: - .long 0x00000006,0x00000006,0x00000002,0x00000003,0x00000014,0x00000015,0x00000007,0x00000007,0x0000000e,0x0000000e,0x0000000a,0x0000000b,0x0000001c,0x0000001d,0x0000000f,0x0000000f - .type .L_2il0floatpacket.17,@object - .size .L_2il0floatpacket.17,64 - .align 64 -.L_2il0floatpacket.18: - .long 0x00000000,0x00000001,0x00000013,0x00000013,0x00000015,0x00000015,0x00000006,0x00000007,0x00000008,0x00000009,0x0000001b,0x0000001b,0x0000001d,0x0000001d,0x0000000e,0x0000000f - .type .L_2il0floatpacket.18,@object - .size .L_2il0floatpacket.18,64 - .align 64 -.L_2il0floatpacket.19: - .long 0x00000008,0x00000009,0x00000006,0x00000007,0x0000000e,0x0000000f,0x0000000c,0x0000000d,0x00000018,0x00000019,0x00000016,0x00000017,0x0000001e,0x0000001f,0x0000001c,0x0000001d - .type .L_2il0floatpacket.19,@object - .size .L_2il0floatpacket.19,64 - .align 64 -.L_2il0floatpacket.20: - .long 
0x00000000,0x00000001,0x00000002,0x00000003,0x00000000,0x00000001,0x00000002,0x00000003,0x00000010,0x00000011,0x00000012,0x00000013,0x00000010,0x00000011,0x00000012,0x00000013 - .type .L_2il0floatpacket.20,@object - .size .L_2il0floatpacket.20,64 - .align 64 -.L_2il0floatpacket.21: - .long 0x00000006,0x00000007,0x00000002,0x00000003,0x00000008,0x00000009,0x0000000e,0x0000000f,0x00000016,0x00000017,0x00000012,0x00000013,0x00000018,0x00000019,0x0000001e,0x0000001f - .type .L_2il0floatpacket.21,@object - .size .L_2il0floatpacket.21,64 - .align 64 -.L_2il0floatpacket.22: - .long 0x00000006,0x00000007,0x00000010,0x00000011,0x00000016,0x00000017,0x00000000,0x00000000,0x0000000e,0x0000000f,0x00000018,0x00000019,0x0000001e,0x0000001f,0x00000000,0x00000000 - .type .L_2il0floatpacket.22,@object - .size .L_2il0floatpacket.22,64 - .align 64 -.L_2il0floatpacket.23: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000012,0x00000013,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000001a,0x0000001b - .type .L_2il0floatpacket.23,@object - .size .L_2il0floatpacket.23,64 - .align 64 -.L_2il0floatpacket.24: - .long 0x00000000,0x00000001,0x00000008,0x00000009,0x0000000a,0x0000000b,0xffffffff,0xffffffff,0x00000010,0x00000011,0x00000018,0x00000019,0x0000001a,0x0000001b,0xffffffff,0xffffffff - .type .L_2il0floatpacket.24,@object - .size .L_2il0floatpacket.24,64 - .align 64 -.L_2il0floatpacket.25: - .long 0x00000004,0x00000005,0x00000014,0x00000015,0x00000000,0x00000000,0x00000000,0x00000000,0x0000000c,0x0000000d,0x0000001c,0x0000001d,0x00000000,0x00000000,0x00000000,0x00000000 - .type .L_2il0floatpacket.25,@object - .size .L_2il0floatpacket.25,64 - .align 64 -.L_2il0floatpacket.26: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000010,0x00000011,0x00000012,0x00000013,0x00000008,0x00000009,0x0000000a,0x0000000b,0x00000018,0x00000019,0x0000001a,0x0000001b - .type .L_2il0floatpacket.26,@object - .size .L_2il0floatpacket.26,64 - .align 64 -.L_2il0floatpacket.27: - .long 0x00000004,0x00000005,0x00000006,0x00000007,0x00000014,0x00000015,0x00000016,0x00000017,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x0000001c,0x0000001d,0x0000001e,0x0000001f - .type .L_2il0floatpacket.27,@object - .size .L_2il0floatpacket.27,64 - .data - .section .note.GNU-stack, "" -// -- Begin DWARF2 SEGMENT .eh_frame - .section .eh_frame,"a",@progbits -.eh_frame_seg: - .align 8 -# End diff --git a/modules/sw_term/avx512/pauli_dble_avx512.c b/modules/sw_term/avx512/pauli_dble_avx512.c index 51d0af2..5eb11fb 100644 --- a/modules/sw_term/avx512/pauli_dble_avx512.c +++ b/modules/sw_term/avx512/pauli_dble_avx512.c @@ -12,6 +12,9 @@ * implementations. 
* *******************************************************************************/ + +#ifdef AVX512 + #include #include #include @@ -30,7 +33,8 @@ typedef union #include "avx512.h" -void mul_pauli2_dble_avx512(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r) + +void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r) { double const *u = m->u, *u2 = (m+1)->u; @@ -481,3 +485,5 @@ void bck_house_avx512( complex_dble *aa, complex_dble *dd, double * rr ) } } } + +#endif \ No newline at end of file diff --git a/modules/sw_term/avx512/pauli_dble_avx512_asm.s b/modules/sw_term/avx512/pauli_dble_avx512_asm.s deleted file mode 100644 index a3fbdf6..0000000 --- a/modules/sw_term/avx512/pauli_dble_avx512_asm.s +++ /dev/null @@ -1,1235 +0,0 @@ -# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.4.196 Build 20170411"; -# mark_description "-I../../../include -I.. -I/cineca/prod/opt/compilers/intel/pe-xe-2017/binary/impi/2017.3.196/intel64/include"; -# mark_description " -isystem /cineca/prod/opt/compilers/intel/pe-xe-2018/binary/impi/2018.1.163/include64/ -std=c89 -xCORE-AVX5"; -# mark_description "12 -mtune=skylake -DAVX512 -O3 -Ddirac_counters -pedantic -fstrict-aliasing -Wno-long-long -Wstrict-prototyp"; -# mark_description "es -S"; - .file "pauli_dble_avx512.c" - .text -..TXTST0: -# -- Begin mul_pauli2_dble_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl mul_pauli2_dble_avx512 -# --- mul_pauli2_dble_avx512(double, pauli_dble *, weyl_dble *, weyl_dble *) -mul_pauli2_dble_avx512: -# parameter 1: %xmm0 -# parameter 2: %rdi -# parameter 3: %rsi -# parameter 4: %rdx -..B1.1: # Preds ..B1.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_mul_pauli2_dble_avx512.1: -..L2: - #20.1 - pushq %rbp #20.1 - .cfi_def_cfa_offset 16 - movq %rsp, %rbp #20.1 - .cfi_def_cfa 6, 16 - .cfi_offset 6, -16 - movl $86, %eax #81.8 - vmovups .L_2il0floatpacket.9(%rip), %zmm17 #49.9 - vmovups .L_2il0floatpacket.10(%rip), %zmm9 #51.9 - vmovups (%rsi), %zmm22 #31.28 - vmovups 64(%rsi), %zmm16 #32.28 - vmovups 128(%rsi), %zmm14 #33.28 - vmovups .L_2il0floatpacket.13(%rip), %zmm5 #70.9 - vmovups (%rdi), %zmm19 #35.27 - vmovups 64(%rdi), %zmm29 #36.27 - vmovups 288(%rdi), %zmm21 #40.27 - vmovups 352(%rdi), %zmm6 #41.27 - vmovups .L_2il0floatpacket.14(%rip), %zmm23 #73.9 - vmovsd %xmm0, -16(%rbp) #20.1 - vmovups .L_2il0floatpacket.16(%rip), %zmm8 #78.9 - vmovups -16(%rbp), %zmm13 #46.27 - vmovups .L_2il0floatpacket.11(%rip), %zmm11 #54.8 - vmovups .L_2il0floatpacket.12(%rip), %zmm20 #57.9 - vmovups 128(%rdi), %zmm30 #37.27 - vmovups 416(%rdi), %zmm10 #42.27 - vmovups 480(%rdi), %zmm7 #43.27 - vmovups 192(%rdi), %zmm27 #38.27 - vmovups 256(%rdi), %zmm28 #39.27 - vpermi2pd %zmm6, %zmm29, %zmm23 #73.9 - vpermi2pd %zmm14, %zmm22, %zmm11 #54.8 - kmovw %eax, %k1 #81.8 - vmovaps %zmm17, %zmm2 #50.8 - movl $169, %eax #82.8 - vmovaps %zmm9, %zmm1 #52.8 - vpermi2pd %zmm16, %zmm22, %zmm2 #50.8 - vpermi2pd %zmm16, %zmm22, %zmm1 #52.8 - vpermi2pd %zmm14, %zmm16, %zmm17 #56.8 - vpermt2pd %zmm14, %zmm9, %zmm16 #60.8 - vpermt2pd %zmm14, %zmm20, %zmm22 #58.8 - vmovups .L_2il0floatpacket.15(%rip), %zmm9 #75.9 - kmovw %eax, %k2 #82.8 - vmovaps %zmm5, %zmm25 #71.9 - movl $106, %eax #93.8 - vpermi2pd %zmm21, %zmm19, %zmm25 #71.9 - kmovw %eax, %k3 #93.8 - vmovaps %zmm9, %zmm31 #76.10 - movl $149, %eax #94.8 - vmovaps %zmm8, %zmm26 #79.10 - vpermi2pd %zmm23, %zmm25, %zmm31 #76.10 - vpermi2pd %zmm23, %zmm13, %zmm26 #79.10 - kmovw %eax, %k4 #94.8 - vmulpd 
%zmm2, %zmm31, %zmm24 #77.8 - vmovups .L_2il0floatpacket.18(%rip), %zmm31 #88.10 - vpermilpd $85, %zmm2, %zmm4 #62.9 - movl $85, %eax #147.8 - vmulpd %zmm4, %zmm26, %zmm12 #80.10 - vmovups .L_2il0floatpacket.17(%rip), %zmm26 #84.9 - vaddpd %zmm12, %zmm24, %zmm24{%k1} #81.8 - vpermt2pd %zmm21, %zmm26, %zmm19 #85.9 - vpermi2pd %zmm10, %zmm30, %zmm26 #98.9 - vsubpd %zmm12, %zmm24, %zmm24{%k2} #82.8 - vpermi2pd %zmm19, %zmm23, %zmm31 #88.10 - vpermi2pd %zmm26, %zmm13, %zmm8 #104.10 - vmovups .L_2il0floatpacket.19(%rip), %zmm12 #90.9 - vfmadd213pd %zmm24, %zmm22, %zmm31 #89.8 - vmovaps %zmm12, %zmm21 #91.10 - vpermi2pd %zmm13, %zmm23, %zmm21 #91.10 - vpermi2pd %zmm13, %zmm26, %zmm12 #113.10 - vpermilpd $85, %zmm22, %zmm15 #66.9 - vmulpd %zmm15, %zmm21, %zmm0 #92.10 - vmovups .L_2il0floatpacket.20(%rip), %zmm21 #101.10 - vaddpd %zmm0, %zmm31, %zmm31{%k3} #93.8 - vpermi2pd %zmm26, %zmm25, %zmm21 #101.10 - vsubpd %zmm0, %zmm31, %zmm31{%k4} #94.8 - vmulpd %zmm1, %zmm21, %zmm21 #102.8 - vpermilpd $85, %zmm1, %zmm3 #63.9 - vmulpd %zmm3, %zmm8, %zmm24 #105.10 - vmovups .L_2il0floatpacket.21(%rip), %zmm8 #110.10 - vaddpd %zmm24, %zmm21, %zmm21{%k1} #106.8 - vpermi2pd %zmm19, %zmm26, %zmm8 #110.10 - vsubpd %zmm24, %zmm21, %zmm21{%k2} #107.8 - vmovups .L_2il0floatpacket.23(%rip), %zmm24 #123.10 - vfmadd213pd %zmm21, %zmm17, %zmm8 #111.8 - vpermilpd $85, %zmm17, %zmm18 #65.9 - vmulpd %zmm18, %zmm12, %zmm21 #114.10 - vmovups .L_2il0floatpacket.24(%rip), %zmm12 #126.10 - vaddpd %zmm21, %zmm8, %zmm8{%k3} #115.8 - vsubpd %zmm21, %zmm8, %zmm8{%k4} #116.8 - vmovups .L_2il0floatpacket.22(%rip), %zmm21 #119.9 - vmovaps %zmm21, %zmm0 #120.9 - vpermi2pd %zmm7, %zmm27, %zmm0 #120.9 - vpermi2pd %zmm0, %zmm19, %zmm24 #123.10 - vpermi2pd %zmm0, %zmm13, %zmm12 #126.10 - vmulpd %zmm11, %zmm24, %zmm24 #124.8 - vpermilpd $85, %zmm11, %zmm14 #64.9 - vmulpd %zmm14, %zmm12, %zmm12 #127.10 - vaddpd %zmm12, %zmm24, %zmm24{%k1} #128.8 - kmovw %eax, %k1 #147.8 - vsubpd %zmm12, %zmm24, %zmm24{%k2} #129.8 - vmovups .L_2il0floatpacket.25(%rip), %zmm12 #132.10 - vpermi2pd %zmm19, %zmm0, %zmm12 #132.10 - movl $170, %eax #148.8 - vmovups .L_2il0floatpacket.26(%rip), %zmm19 #135.10 - vfmadd213pd %zmm24, %zmm16, %zmm12 #133.8 - vpermi2pd %zmm13, %zmm0, %zmm19 #135.10 - kmovw %eax, %k7 #148.8 - vpermilpd $85, %zmm16, %zmm20 #67.9 - movl $90, %eax #166.8 - vmulpd %zmm20, %zmm19, %zmm13 #136.10 - vmovups .L_2il0floatpacket.27(%rip), %zmm19 #141.9 - kmovw %eax, %k5 #166.8 - vaddpd %zmm13, %zmm12, %zmm12{%k3} #137.8 - vsubpd %zmm13, %zmm12, %zmm12{%k4} #138.8 - movl $165, %eax #167.8 - kmovw %eax, %k6 #167.8 - vmovaps %zmm19, %zmm13 #142.10 - movl $240, %eax #272.8 - vpermi2pd %zmm23, %zmm25, %zmm13 #142.10 - kmovw %eax, %k2 #272.8 - vfmadd213pd %zmm8, %zmm2, %zmm13 #143.8 - vmovups .L_2il0floatpacket.28(%rip), %zmm8 #144.9 - vmovaps %zmm8, %zmm24 #145.10 - vpermi2pd %zmm23, %zmm25, %zmm24 #145.10 - vmulpd %zmm24, %zmm4, %zmm24 #146.10 - vaddpd %zmm24, %zmm13, %zmm13{%k1} #147.8 - vsubpd %zmm24, %zmm13, %zmm13{%k7} #148.8 - vmovaps %zmm9, %zmm24 #151.10 - vpermi2pd %zmm0, %zmm23, %zmm24 #151.10 - vfmadd213pd %zmm31, %zmm17, %zmm24 #152.8 - vmovups .L_2il0floatpacket.29(%rip), %zmm31 #153.9 - vpermt2pd %zmm0, %zmm31, %zmm23 #154.10 - vmulpd %zmm23, %zmm18, %zmm23 #155.10 - vaddpd %zmm23, %zmm24, %zmm24{%k7} #156.8 - vsubpd %zmm23, %zmm24, %zmm24{%k1} #157.8 - vmovaps %zmm19, %zmm23 #161.10 - vpermi2pd %zmm26, %zmm25, %zmm23 #161.10 - vpermt2pd %zmm26, %zmm8, %zmm25 #164.10 - vfmadd213pd %zmm24, %zmm1, %zmm23 #162.8 - vmulpd %zmm25, %zmm3, 
%zmm25 #165.10 - vaddpd %zmm25, %zmm23, %zmm23{%k5} #166.8 - vsubpd %zmm25, %zmm23, %zmm23{%k6} #167.8 - vmovaps %zmm9, %zmm25 #170.10 - vpermi2pd %zmm0, %zmm26, %zmm25 #170.10 - vpermt2pd %zmm0, %zmm31, %zmm26 #173.10 - vfmadd213pd %zmm13, %zmm22, %zmm25 #171.8 - vmulpd %zmm26, %zmm15, %zmm26 #174.10 - vaddpd %zmm26, %zmm25, %zmm25{%k5} #175.8 - vsubpd %zmm26, %zmm25, %zmm25{%k6} #176.8 - vmovups .L_2il0floatpacket.30(%rip), %zmm26 #178.9 - vmovaps %zmm26, %zmm0 #179.10 - vpermi2pd %zmm6, %zmm29, %zmm0 #179.10 - vpermi2pd %zmm10, %zmm30, %zmm26 #189.10 - vfmadd213pd %zmm12, %zmm2, %zmm0 #180.8 - vmovups .L_2il0floatpacket.31(%rip), %zmm12 #181.9 - vmovaps %zmm12, %zmm2 #182.10 - vpermi2pd %zmm6, %zmm29, %zmm2 #182.10 - vpermi2pd %zmm10, %zmm30, %zmm12 #192.10 - vpermt2pd %zmm6, %zmm5, %zmm29 #200.9 - vmulpd %zmm2, %zmm4, %zmm4 #183.10 - vaddpd %zmm4, %zmm0, %zmm0{%k1} #184.8 - vsubpd %zmm4, %zmm0, %zmm0{%k7} #185.8 - vfmadd213pd %zmm0, %zmm1, %zmm26 #190.8 - vmulpd %zmm12, %zmm3, %zmm1 #193.10 - vmovups .L_2il0floatpacket.32(%rip), %zmm3 #201.9 - vaddpd %zmm1, %zmm26, %zmm26{%k1} #194.8 - vpermt2pd %zmm7, %zmm3, %zmm27 #202.9 - vmovups .L_2il0floatpacket.33(%rip), %zmm7 #214.9 - vsubpd %zmm1, %zmm26, %zmm26{%k7} #195.8 - vpermt2pd 544(%rdi), %zmm7, %zmm28 #215.9 - vmovaps %zmm31, %zmm5 #208.10 - vpermi2pd %zmm27, %zmm29, %zmm5 #208.10 - vmulpd %zmm5, %zmm14, %zmm6 #209.10 - vmovaps %zmm9, %zmm0 #205.10 - vpermi2pd %zmm27, %zmm29, %zmm0 #205.10 - vfmadd213pd %zmm23, %zmm11, %zmm0 #206.8 - vaddpd %zmm6, %zmm0, %zmm0{%k5} #210.8 - vmovaps %zmm19, %zmm1 #236.10 - vsubpd %zmm6, %zmm0, %zmm0{%k6} #211.8 - vpermi2pd %zmm28, %zmm29, %zmm1 #236.10 - vpermt2pd %zmm28, %zmm8, %zmm29 #239.10 - vfmadd213pd %zmm0, %zmm16, %zmm1 #237.8 - vmovups .L_2il0floatpacket.34(%rip), %zmm0 #245.9 - vmulpd %zmm29, %zmm20, %zmm29 #240.10 - vpermt2pd %zmm10, %zmm0, %zmm30 #246.9 - vaddpd %zmm29, %zmm1, %zmm1{%k7} #241.8 - vmovaps %zmm19, %zmm5 #218.10 - vpermi2pd %zmm28, %zmm27, %zmm5 #218.10 - vpermi2pd %zmm27, %zmm30, %zmm19 #249.10 - vsubpd %zmm29, %zmm1, %zmm1{%k1} #242.8 - vfmadd213pd %zmm26, %zmm22, %zmm5 #219.8 - vfmadd213pd %zmm25, %zmm11, %zmm19 #250.8 - vmovaps %zmm8, %zmm22 #221.10 - vpermi2pd %zmm28, %zmm27, %zmm22 #221.10 - vpermi2pd %zmm27, %zmm30, %zmm8 #252.10 - vmulpd %zmm22, %zmm15, %zmm15 #222.10 - vmulpd %zmm8, %zmm14, %zmm11 #253.10 - vaddpd %zmm15, %zmm5, %zmm5{%k5} #223.8 - vaddpd %zmm11, %zmm19, %zmm19{%k5} #254.8 - vsubpd %zmm15, %zmm5, %zmm5{%k6} #224.8 - vsubpd %zmm11, %zmm19, %zmm19{%k6} #255.8 - vmovaps %zmm31, %zmm6 #230.10 - vmovaps %zmm9, %zmm15 #227.10 - vpermi2pd %zmm28, %zmm27, %zmm6 #230.10 - vpermi2pd %zmm28, %zmm30, %zmm9 #258.10 - vpermt2pd %zmm28, %zmm31, %zmm30 #261.10 - vpermi2pd %zmm28, %zmm27, %zmm15 #227.10 - vmulpd %zmm6, %zmm18, %zmm18 #231.10 - vfmadd213pd %zmm19, %zmm16, %zmm9 #259.8 - vfmadd213pd %zmm5, %zmm17, %zmm15 #228.8 - vmovups .L_2il0floatpacket.35(%rip), %zmm28 #268.9 - vmulpd %zmm30, %zmm20, %zmm16 #262.10 - vaddpd %zmm18, %zmm15, %zmm15{%k5} #232.8 - vaddpd %zmm16, %zmm9, %zmm9{%k7} #263.8 - vsubpd %zmm18, %zmm15, %zmm15{%k6} #233.8 - vsubpd %zmm16, %zmm9, %zmm9{%k1} #264.8 - vpermi2pd %zmm1, %zmm15, %zmm28 #269.8 - vpermi2pd %zmm9, %zmm1, %zmm7 #267.8 - vpermt2pd %zmm15, %zmm21, %zmm9 #271.8 - vblendmpd %zmm28, %zmm7, %zmm10{%k2} #272.8 - vblendmpd %zmm7, %zmm9, %zmm27{%k2} #273.8 - vblendmpd %zmm9, %zmm28, %zmm30{%k2} #274.8 - vmovups %zmm10, (%rdx) #276.22 - vmovups %zmm27, 64(%rdx) #277.22 - vmovups %zmm30, 128(%rdx) #278.22 - vzeroupper #279.1 - movq 
%rbp, %rsp #279.1 - popq %rbp #279.1 - .cfi_restore 6 - ret #279.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type mul_pauli2_dble_avx512,@function - .size mul_pauli2_dble_avx512,.-mul_pauli2_dble_avx512 - .data -# -- End mul_pauli2_dble_avx512 - .text -# -- Begin fwd_house_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl fwd_house_avx512 -# --- fwd_house_avx512(double, complex_dble *, complex_dble *, double *) -fwd_house_avx512: -# parameter 1: %xmm0 -# parameter 2: %rdi -# parameter 3: %rsi -# parameter 4: %rdx -..B2.1: # Preds ..B2.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_fwd_house_avx512.8: -..L9: - #283.1 - pushq %r12 #283.1 - .cfi_def_cfa_offset 16 - .cfi_offset 12, -16 - pushq %r13 #283.1 - .cfi_def_cfa_offset 24 - .cfi_offset 13, -24 - pushq %r14 #283.1 - .cfi_def_cfa_offset 32 - .cfi_offset 14, -32 - pushq %r15 #283.1 - .cfi_def_cfa_offset 40 - .cfi_offset 15, -40 - pushq %rbx #283.1 - .cfi_def_cfa_offset 48 - .cfi_offset 3, -48 - pushq %rbp #283.1 - .cfi_def_cfa_offset 56 - .cfi_offset 6, -56 - xorl %eax, %eax #288.3 - xorl %r8d, %r8d #290.3 - movq %rdi, %r9 #283.1 - xorl %r11d, %r11d #290.3 - vmovapd %xmm0, %xmm14 #283.1 - xorl %r10d, %r10d #290.3 - vxorpd %xmm1, %xmm1, %xmm1 #326.12 - vmovsd .L_2il0floatpacket.38(%rip), %xmm11 #307.12 - xorl %edi, %edi #290.3 - vmovsd .L_2il0floatpacket.36(%rip), %xmm0 #306.16 - # LOE rdx rsi r8 r9 eax edi r10d r11d xmm0 xmm1 xmm11 xmm14 -..B2.2: # Preds ..B2.35 ..B2.1 - # Execution count [5.00e+00] - movslq %r10d, %r12 #292.29 - lea 1(%r8), %ecx #295.10 - shlq $4, %r12 #291.10 - vmovsd 8(%r9,%r12), %xmm3 #292.29 - vmulsd %xmm3, %xmm3, %xmm12 #292.29 - vmovsd (%r9,%r12), %xmm2 #291.29 - vfmadd231sd %xmm2, %xmm2, %xmm12 #291.5 - vsqrtsd %xmm12, %xmm12, %xmm13 #293.10 - # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..B2.3: # Preds ..B2.2 - # Execution count [5.00e+00] - xorl %r13d, %r13d #295.5 - lea 5(%r11), %r14d #295.5 - movl %r14d, %ebp #295.5 - movl $1, %ebx #295.5 - sarl $2, %ebp #295.5 - shrl $29, %ebp #295.5 - lea 5(%rbp,%r11), %r15d #295.5 - xorl %ebp, %ebp #296.7 - sarl $3, %r15d #295.5 - testl %r15d, %r15d #295.5 - jbe ..B2.7 # Prob 10% #295.5 - # LOE rdx rsi r8 r9 r12 eax ecx ebx ebp edi r10d r11d r13d r14d r15d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..B2.4: # Preds ..B2.3 - # Execution count [1.56e-02] - vxorpd %xmm10, %xmm10, %xmm10 #295.5 - vxorpd %xmm9, %xmm9, %xmm9 #295.5 - vxorpd %xmm8, %xmm8, %xmm8 #295.5 - vxorpd %xmm4, %xmm4, %xmm4 #295.5 - vxorpd %xmm7, %xmm7, %xmm7 #295.5 - vxorpd %xmm6, %xmm6, %xmm6 #295.5 - vxorpd %xmm5, %xmm5, %xmm5 #295.5 - # LOE rdx rsi r8 r9 r12 eax ecx ebp edi r10d r11d r13d r14d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 -..B2.5: # Preds ..B2.5 ..B2.4 - # Execution count [3.12e+00] - incl %r13d #295.5 - lea (%r10,%rbp), %ebx #297.33 - movslq %ebx, %rbx #296.14 - addl $48, %ebp #295.5 - shlq $4, %rbx #297.33 - vmovsd 104(%r9,%rbx), %xmm15 #297.14 - vmovsd 200(%r9,%rbx), %xmm18 #297.14 - vmulsd %xmm15, %xmm15, %xmm17 #296.7 - vmulsd %xmm18, %xmm18, %xmm20 #296.7 - vmovsd 192(%r9,%rbx), %xmm19 #296.14 - vmovsd 96(%r9,%rbx), %xmm16 #296.14 - vfmadd231sd %xmm16, %xmm16, %xmm17 #296.7 - vmovsd 296(%r9,%rbx), %xmm21 #297.14 - vmovsd 392(%r9,%rbx), %xmm24 #297.14 - vmovsd 488(%r9,%rbx), %xmm27 #297.14 - vmovsd 584(%r9,%rbx), %xmm30 #297.14 - vfmadd231sd %xmm19, %xmm19, %xmm20 #296.7 - vaddsd %xmm12, %xmm17, %xmm12 #296.7 - vmulsd %xmm21, %xmm21, %xmm23 #296.7 - vmulsd 
%xmm24, %xmm24, %xmm26 #296.7 - vmulsd %xmm27, %xmm27, %xmm29 #296.7 - vaddsd %xmm10, %xmm20, %xmm10 #296.7 - vmulsd %xmm30, %xmm30, %xmm15 #296.7 - vmovsd 680(%r9,%rbx), %xmm16 #297.14 - vmovsd 776(%r9,%rbx), %xmm19 #297.14 - vmulsd %xmm16, %xmm16, %xmm18 #296.7 - vmulsd %xmm19, %xmm19, %xmm21 #296.7 - vmovsd 768(%r9,%rbx), %xmm20 #296.14 - vmovsd 288(%r9,%rbx), %xmm22 #296.14 - vmovsd 384(%r9,%rbx), %xmm25 #296.14 - vmovsd 480(%r9,%rbx), %xmm28 #296.14 - vmovsd 576(%r9,%rbx), %xmm31 #296.14 - vmovsd 672(%r9,%rbx), %xmm17 #296.14 - vfmadd231sd %xmm22, %xmm22, %xmm23 #296.7 - vfmadd231sd %xmm25, %xmm25, %xmm26 #296.7 - vfmadd231sd %xmm28, %xmm28, %xmm29 #296.7 - vfmadd231sd %xmm31, %xmm31, %xmm15 #296.7 - vfmadd231sd %xmm17, %xmm17, %xmm18 #296.7 - vfmadd231sd %xmm20, %xmm20, %xmm21 #296.7 - vaddsd %xmm9, %xmm23, %xmm9 #296.7 - vaddsd %xmm8, %xmm26, %xmm8 #296.7 - vaddsd %xmm4, %xmm29, %xmm4 #296.7 - vaddsd %xmm7, %xmm15, %xmm7 #296.7 - vaddsd %xmm6, %xmm18, %xmm6 #296.7 - vaddsd %xmm5, %xmm21, %xmm5 #296.7 - cmpl %r15d, %r13d #295.5 - jb ..B2.5 # Prob 99% #295.5 - # LOE rdx rsi r8 r9 r12 eax ecx ebp edi r10d r11d r13d r14d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 -..B2.6: # Preds ..B2.5 - # Execution count [4.50e+00] - vaddsd %xmm10, %xmm12, %xmm10 #295.5 - vaddsd %xmm8, %xmm9, %xmm8 #295.5 - vaddsd %xmm7, %xmm4, %xmm4 #295.5 - vaddsd %xmm5, %xmm6, %xmm5 #295.5 - vaddsd %xmm8, %xmm10, %xmm9 #295.5 - vaddsd %xmm5, %xmm4, %xmm6 #295.5 - vaddsd %xmm6, %xmm9, %xmm12 #295.5 - lea 1(,%r13,8), %ebx #296.7 - # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d r14d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..B2.7: # Preds ..B2.6 ..B2.3 - # Execution count [5.00e+00] - cmpl %r14d, %ebx #295.5 - ja ..B2.23 # Prob 50% #295.5 - # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..B2.8: # Preds ..B2.7 - # Execution count [0.00e+00] - lea (%r8,%rbx), %ebp #295.5 - negl %ebp #295.5 - addl $5, %ebp #295.5 - jmp *.2.10_2.switchtab.4(,%rbp,8) #295.5 - # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..1.10_0.TAG.6: -..B2.10: # Preds ..B2.8 - # Execution count [0.00e+00] - lea (%rbx,%rbx,2), %ebp #296.14 - lea (%r10,%rbp,2), %r13d #297.33 - movslq %r13d, %r13 #296.14 - shlq $4, %r13 #297.33 - lea 584(%r9,%r13), %r14 #297.14 - vmovsd (%r14), %xmm4 #297.14 - vmulsd %xmm4, %xmm4, %xmm6 #297.33 - vmovsd -8(%r14), %xmm5 #296.14 - vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 - vaddsd %xmm6, %xmm12, %xmm12 #296.7 - # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..1.10_0.TAG.5: -..B2.12: # Preds ..B2.8 ..B2.10 - # Execution count [0.00e+00] - lea (%rbx,%rbx,2), %ebp #296.14 - lea (%r10,%rbp,2), %r13d #297.33 - movslq %r13d, %r13 #296.14 - shlq $4, %r13 #297.33 - lea 488(%r9,%r13), %r14 #297.14 - vmovsd (%r14), %xmm4 #297.14 - vmulsd %xmm4, %xmm4, %xmm6 #297.33 - vmovsd -8(%r14), %xmm5 #296.14 - vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 - vaddsd %xmm6, %xmm12, %xmm12 #296.7 - # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..1.10_0.TAG.4: -..B2.14: # Preds ..B2.8 ..B2.12 - # Execution count [0.00e+00] - lea (%rbx,%rbx,2), %ebp #296.14 - lea (%r10,%rbp,2), %r13d #297.33 - movslq %r13d, %r13 #296.14 - shlq $4, %r13 #297.33 - lea 392(%r9,%r13), %r14 #297.14 - vmovsd (%r14), %xmm4 #297.14 - vmulsd %xmm4, %xmm4, %xmm6 #297.33 - vmovsd -8(%r14), %xmm5 #296.14 - vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 - 
vaddsd %xmm6, %xmm12, %xmm12 #296.7 - # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..1.10_0.TAG.3: -..B2.16: # Preds ..B2.8 ..B2.14 - # Execution count [0.00e+00] - lea (%rbx,%rbx,2), %ebp #296.14 - lea (%r10,%rbp,2), %r13d #297.33 - movslq %r13d, %r13 #296.14 - shlq $4, %r13 #297.33 - lea 296(%r9,%r13), %r14 #297.14 - vmovsd (%r14), %xmm4 #297.14 - vmulsd %xmm4, %xmm4, %xmm6 #297.33 - vmovsd -8(%r14), %xmm5 #296.14 - vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 - vaddsd %xmm6, %xmm12, %xmm12 #296.7 - # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..1.10_0.TAG.2: -..B2.18: # Preds ..B2.8 ..B2.16 - # Execution count [0.00e+00] - lea (%rbx,%rbx,2), %ebp #296.14 - lea (%r10,%rbp,2), %r13d #297.33 - movslq %r13d, %r13 #296.14 - shlq $4, %r13 #297.33 - lea 200(%r9,%r13), %r14 #297.14 - vmovsd (%r14), %xmm4 #297.14 - vmulsd %xmm4, %xmm4, %xmm6 #297.33 - vmovsd -8(%r14), %xmm5 #296.14 - vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 - vaddsd %xmm6, %xmm12, %xmm12 #296.7 - # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..1.10_0.TAG.1: -..B2.20: # Preds ..B2.8 ..B2.18 - # Execution count [0.00e+00] - lea (%rbx,%rbx,2), %ebp #296.14 - lea (%r10,%rbp,2), %r13d #297.33 - movslq %r13d, %r13 #296.14 - shlq $4, %r13 #297.33 - vmovsd 104(%r9,%r13), %xmm4 #297.14 - vmulsd %xmm4, %xmm4, %xmm6 #297.33 - vmovsd 96(%r9,%r13), %xmm5 #296.14 - vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 - vaddsd %xmm6, %xmm12, %xmm12 #296.7 - # LOE rdx rsi r8 r9 r12 eax ecx ebx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..1.10_0.TAG.0: -..B2.22: # Preds ..B2.8 ..B2.20 - # Execution count [4.50e+00] - lea (%rbx,%rbx,2), %ebx #296.14 - lea (%r10,%rbx,2), %ebp #297.33 - movslq %ebp, %rbp #296.14 - shlq $4, %rbp #297.33 - vmovsd 8(%r9,%rbp), %xmm4 #297.14 - vmulsd %xmm4, %xmm4, %xmm6 #297.33 - vmovsd (%r9,%rbp), %xmm5 #296.14 - vfmadd231sd %xmm5, %xmm5, %xmm6 #296.7 - vaddsd %xmm6, %xmm12, %xmm12 #296.7 - # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..B2.23: # Preds ..B2.22 ..B2.7 - # Execution count [5.00e+00] - vcomisd %xmm14, %xmm12 #299.15 - jb ..B2.25 # Prob 50% #299.15 - # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..B2.24: # Preds ..B2.23 - # Execution count [2.50e+00] - vsqrtsd %xmm12, %xmm12, %xmm12 #300.12 - jmp ..B2.26 # Prob 100% #300.12 - # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..B2.25: # Preds ..B2.23 - # Execution count [2.50e+00] - vmovapd %xmm11, %xmm12 #303.7 - movl $1, %eax #302.7 - # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..B2.26: # Preds ..B2.24 ..B2.25 - # Execution count [5.00e+00] - vmulsd %xmm0, %xmm12, %xmm4 #306.30 - vcomisd %xmm4, %xmm13 #306.30 - jb ..B2.28 # Prob 50% #306.30 - # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm11 xmm12 xmm13 xmm14 -..B2.27: # Preds ..B2.26 - # Execution count [2.50e+00] - vdivsd %xmm13, %xmm11, %xmm4 #307.18 - vmulsd %xmm4, %xmm2, %xmm5 #308.19 - vmulsd %xmm3, %xmm4, %xmm4 #309.19 - jmp ..B2.29 # Prob 100% #309.19 - # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm11 xmm12 xmm13 xmm14 -..B2.28: # Preds ..B2.26 - # Execution count [2.50e+00] - vmovapd %xmm11, %xmm5 #311.7 - vxorpd %xmm4, %xmm4, %xmm4 #312.7 - # LOE rdx rsi r8 r9 r12 eax ecx edi r10d r11d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm11 
xmm12 xmm13 xmm14 -..B2.29: # Preds ..B2.27 ..B2.28 - # Execution count [6.63e-01] - vfmadd231sd %xmm12, %xmm4, %xmm3 #316.5 - xorl %ebp, %ebp #323.5 - vfmadd231sd %xmm12, %xmm5, %xmm2 #315.5 - vmovsd %xmm3, 8(%r9,%r12) #316.5 - vaddsd %xmm13, %xmm12, %xmm3 #318.28 - vmulsd %xmm3, %xmm12, %xmm12 #318.28 - lea 6(%r11), %r13d #331.23 - vmulsd %xmm3, %xmm5, %xmm5 #320.5 - vmulsd %xmm3, %xmm4, %xmm4 #321.28 - vdivsd %xmm12, %xmm11, %xmm6 #318.28 - vmovsd %xmm2, (%r9,%r12) #315.5 - movq %r8, %rbx #320.5 - vmulsd %xmm6, %xmm5, %xmm2 #320.5 - vmulsd %xmm6, %xmm4, %xmm8 #321.33 - movslq %edi, %r12 #328.12 - shlq $4, %rbx #320.5 - addq %r8, %r12 #328.12 - shlq $4, %r12 #328.12 - vxorpd .L_2il0floatpacket.37(%rip), %xmm2, %xmm7 #320.5 - vmovsd %xmm6, (%rdx,%r8,8) #319.5 - negq %r8 #323.27 - vmovsd %xmm6, -24(%rsp) #318.5 - addq $5, %r8 #323.27 - vmovsd %xmm7, (%rsi,%rbx) #320.5 - vmovsd %xmm8, 8(%rsi,%rbx) #321.5 - lea (%r9,%r12), %rbx #328.12 - vmovddup -24(%rsp), %xmm2 #343.28 - lea 16(%r12,%r9), %r12 #329.12 - movq %rdx, -16(%rsp) #331.23[spill] - # LOE rbx rbp rsi r8 r9 r12 eax ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm11 xmm14 -..B2.30: # Preds ..B2.34 ..B2.29 - # Execution count [2.12e+01] - vmovapd %xmm1, %xmm3 #326.12 - movq %rbx, %r15 #328.7 - movq %r12, %r14 #329.7 - xorl %edx, %edx #331.7 - # LOE rbx rbp rsi r8 r9 r12 r14 r15 eax edx ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm3 xmm11 xmm14 -..B2.31: # Preds ..B2.31 ..B2.30 - # Execution count [1.18e+02] - vmovupd (%r14), %xmm5 #334.27 - incl %edx #331.7 - vmulpd 8(%r15){1to2}, %xmm5, %xmm4 #335.14 - vpermilpd $1, %xmm4, %xmm6 #336.14 - addq $96, %r14 #340.9 - vfmsubadd231pd (%r15){1to2}, %xmm5, %xmm6 #337.14 - addq $96, %r15 #339.9 - vaddpd %xmm3, %xmm6, %xmm3 #338.14 - cmpl %r13d, %edx #331.7 - jb ..B2.31 # Prob 82% #331.7 - # LOE rbx rbp rsi r8 r9 r12 r14 r15 eax edx ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm3 xmm11 xmm14 -..B2.32: # Preds ..B2.31 - # Execution count [2.25e+01] - vmulpd %xmm2, %xmm3, %xmm3 #344.12 - movq %rbx, %r15 #347.7 - movq %r12, %r14 #348.7 - xorl %edx, %edx #349.7 - # LOE rbx rbp rsi r8 r9 r12 r14 r15 eax edx ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm3 xmm11 xmm14 -..B2.33: # Preds ..B2.33 ..B2.32 - # Execution count [1.25e+02] - vmulpd 8(%r15){1to2}, %xmm3, %xmm4 #353.14 - vpermilpd $1, %xmm4, %xmm6 #354.14 - incl %edx #349.7 - vfmaddsub231pd (%r15){1to2}, %xmm3, %xmm6 #355.14 - addq $96, %r15 #358.9 - vmovupd (%r14), %xmm5 #352.27 - vsubpd %xmm6, %xmm5, %xmm7 #356.14 - vmovupd %xmm7, (%r14) #357.25 - addq $96, %r14 #359.9 - cmpl %r13d, %edx #349.7 - jb ..B2.33 # Prob 82% #349.7 - # LOE rbx rbp rsi r8 r9 r12 r14 r15 eax edx ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm3 xmm11 xmm14 -..B2.34: # Preds ..B2.33 - # Execution count [2.50e+01] - incq %rbp #323.5 - addq $16, %r12 #323.5 - cmpq %r8, %rbp #323.5 - jb ..B2.30 # Prob 81% #323.5 - # LOE rbx rbp rsi r8 r9 r12 eax ecx edi r10d r11d r13d xmm0 xmm1 xmm2 xmm11 xmm14 -..B2.35: # Preds ..B2.34 - # Execution count [5.00e+00] - decl %r11d #295.10 - addl $7, %r10d #295.10 - addl $6, %edi #295.10 - movl %ecx, %r8d #290.3 - movq -16(%rsp), %rdx #[spill] - cmpl $5, %ecx #290.3 - jb ..B2.2 # Prob 79% #290.3 - # LOE rdx rsi r8 r9 eax edi r10d r11d xmm0 xmm1 xmm11 xmm14 -..B2.36: # Preds ..B2.35 - # Execution count [1.00e+00] - vmovsd 568(%r9), %xmm2 #364.44 - vmulsd %xmm2, %xmm2, %xmm0 #364.44 - vmovsd 560(%r9), %xmm1 #364.8 - vfmadd231sd %xmm1, %xmm1, %xmm0 #364.3 - vcomisd %xmm14, %xmm0 #366.13 - jb ..B2.38 # Prob 50% #366.13 - # LOE rbx rbp rsi r12 r13 r14 r15 eax xmm0 xmm1 
xmm2 xmm11 -..B2.37: # Preds ..B2.36 - # Execution count [5.00e-01] - vdivsd %xmm0, %xmm11, %xmm11 #367.16 - jmp ..B2.39 # Prob 100% #367.16 - # LOE rbx rbp rsi r12 r13 r14 r15 eax xmm1 xmm2 xmm11 -..B2.38: # Preds ..B2.36 - # Execution count [5.00e-01] - movl $1, %eax #369.5 - # LOE rbx rbp rsi r12 r13 r14 r15 eax xmm1 xmm2 xmm11 -..B2.39: # Preds ..B2.37 ..B2.38 - # Execution count [1.00e+00] - vmulsd %xmm11, %xmm1, %xmm0 #373.19 - vmulsd %xmm2, %xmm11, %xmm1 #374.3 - vxorpd .L_2il0floatpacket.37(%rip), %xmm1, %xmm2 #374.3 - vmovsd %xmm0, 80(%rsi) #373.3 - vmovsd %xmm2, 88(%rsi) #374.3 - .cfi_restore 6 - popq %rbp #376.10 - .cfi_def_cfa_offset 48 - .cfi_restore 3 - popq %rbx #376.10 - .cfi_def_cfa_offset 40 - .cfi_restore 15 - popq %r15 #376.10 - .cfi_def_cfa_offset 32 - .cfi_restore 14 - popq %r14 #376.10 - .cfi_def_cfa_offset 24 - .cfi_restore 13 - popq %r13 #376.10 - .cfi_def_cfa_offset 16 - .cfi_restore 12 - popq %r12 #376.10 - .cfi_def_cfa_offset 8 - ret #376.10 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type fwd_house_avx512,@function - .size fwd_house_avx512,.-fwd_house_avx512 - .section .rodata, "a" - .align 64 - .align 8 -.2.10_2.switchtab.4: - .quad ..1.10_0.TAG.0 - .quad ..1.10_0.TAG.1 - .quad ..1.10_0.TAG.2 - .quad ..1.10_0.TAG.3 - .quad ..1.10_0.TAG.4 - .quad ..1.10_0.TAG.5 - .quad ..1.10_0.TAG.6 - .data -# -- End fwd_house_avx512 - .text -# -- Begin solv_sys_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl solv_sys_avx512 -# --- solv_sys_avx512(complex_dble *, complex_dble *) -solv_sys_avx512: -# parameter 1: %rdi -# parameter 2: %rsi -..B3.1: # Preds ..B3.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_solv_sys_avx512.35: -..L36: - #381.1 - pushq %r12 #381.1 - .cfi_def_cfa_offset 16 - .cfi_offset 12, -16 - pushq %r13 #381.1 - .cfi_def_cfa_offset 24 - .cfi_offset 13, -24 - pushq %r14 #381.1 - .cfi_def_cfa_offset 32 - .cfi_offset 14, -32 - pushq %r15 #381.1 - .cfi_def_cfa_offset 40 - .cfi_offset 15, -40 - pushq %rbx #381.1 - .cfi_def_cfa_offset 48 - .cfi_offset 3, -48 - pushq %rbp #381.1 - .cfi_def_cfa_offset 56 - .cfi_offset 6, -56 - movl $5, %edx #386.8 - vxorpd %xmm0, %xmm0, %xmm0 #410.24 - movl $80, %eax #386.8 - # LOE rax rdx rsi rdi xmm0 -..B3.2: # Preds ..B3.10 ..B3.1 - # Execution count [5.00e+00] - lea -1(%rdx), %r13d #387.19 - movslq %r13d, %r14 #387.10 - lea -3(%rdx,%rdx,2), %ebp #387.10 - movq %r14, %r12 #405.28 - addl %ebp, %ebp #387.10 - shlq $4, %r12 #405.28 - movslq %ebp, %rbp #387.10 - addq %rsi, %r12 #381.1 - shlq $4, %rbp #388.28 - testl %r13d, %r13d #387.28 - js ..B3.10 # Prob 2% #387.28 - # LOE rax rdx rbp rsi rdi r12 r14 r13d xmm0 -..B3.3: # Preds ..B3.2 - # Execution count [4.90e+00] - lea -1(%rdx), %r11 #395.21 - movq %r11, %rbx #395.12 - lea (%rdi,%rax), %r8 #388.28 - shlq $4, %rbx #395.12 - lea (%rbp,%r8), %r9 #388.28 - # LOE rax rdx rbx rbp rsi rdi r8 r9 r11 r12 r14 r13d xmm0 -..B3.4: # Preds ..B3.8 ..B3.3 - # Execution count [2.72e+01] - vmovupd (%rax,%rsi), %xmm2 #390.25 - movq %r11, %rcx #395.12 - vmulpd 8(%r9){1to2}, %xmm2, %xmm1 #391.12 - vpermilpd $1, %xmm1, %xmm1 #392.12 - vfmaddsub231pd (%r9){1to2}, %xmm2, %xmm1 #393.12 - cmpq %r14, %r11 #395.29 - jle ..B3.8 # Prob 10% #395.29 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r11 r12 r14 r13d xmm0 xmm1 -..B3.5: # Preds ..B3.4 - # Execution count [2.45e+01] - lea (%rdi,%rbp), %r10 #396.30 - addq %rbx, %r10 #396.30 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 -..B3.6: # Preds ..B3.6 ..B3.5 - # Execution count [1.36e+02] - lea 
(%rcx,%rcx,2), %r15d #398.34 - addl %r15d, %r15d #398.34 - decq %rcx #395.32 - movslq %r15d, %r15 #398.27 - shlq $4, %r15 #398.27 - vmovupd (%r8,%r15), %xmm3 #398.27 - vmulpd 8(%r10){1to2}, %xmm3, %xmm2 #399.14 - vpermilpd $1, %xmm2, %xmm4 #400.14 - vfmaddsub231pd (%r10){1to2}, %xmm3, %xmm4 #401.14 - addq $-16, %r10 #395.32 - vaddpd %xmm4, %xmm1, %xmm1 #402.14 - cmpq %r14, %rcx #395.29 - jg ..B3.6 # Prob 82% #395.29 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 -..B3.8: # Preds ..B3.6 ..B3.4 - # Execution count [2.72e+01] - vmulpd 8(%r12){1to2}, %xmm1, %xmm2 #407.12 - vpermilpd $1, %xmm2, %xmm3 #408.12 - addq $-96, %rbp #387.31 - vfmaddsub231pd (%r12){1to2}, %xmm1, %xmm3 #409.12 - decq %r14 #387.31 - vsubpd %xmm3, %xmm0, %xmm1 #410.12 - vmovupd %xmm1, (%r9) #411.23 - addq $-96, %r9 #387.31 - addq $-16, %r12 #387.31 - decl %r13d #387.31 - jns ..B3.4 # Prob 82% #387.28 - # LOE rax rdx rbx rbp rsi rdi r8 r9 r11 r12 r14 r13d xmm0 -..B3.10: # Preds ..B3.8 ..B3.2 - # Execution count [5.00e+00] - .byte 15 #386.22 - .byte 31 #386.22 - .byte 128 #386.22 - .byte 0 #386.22 - .byte 0 #386.22 - .byte 0 #386.22 - .byte 0 #386.22 - addq $-16, %rax #386.22 - decq %rdx #386.22 - jg ..B3.2 # Prob 80% #386.19 - # LOE rax rdx rsi rdi xmm0 -..B3.11: # Preds ..B3.10 - # Execution count [1.00e+00] - .cfi_restore 6 - popq %rbp #414.1 - .cfi_def_cfa_offset 48 - .cfi_restore 3 - popq %rbx #414.1 - .cfi_def_cfa_offset 40 - .cfi_restore 15 - popq %r15 #414.1 - .cfi_def_cfa_offset 32 - .cfi_restore 14 - popq %r14 #414.1 - .cfi_def_cfa_offset 24 - .cfi_restore 13 - popq %r13 #414.1 - .cfi_def_cfa_offset 16 - .cfi_restore 12 - popq %r12 #414.1 - .cfi_def_cfa_offset 8 - ret #414.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type solv_sys_avx512,@function - .size solv_sys_avx512,.-solv_sys_avx512 - .data -# -- End solv_sys_avx512 - .text -# -- Begin bck_house_avx512 - .text -# mark_begin; - .align 16,0x90 - .globl bck_house_avx512 -# --- bck_house_avx512(complex_dble *, complex_dble *, double *) -bck_house_avx512: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -..B4.1: # Preds ..B4.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_bck_house_avx512.62: -..L63: - #417.1 - pushq %r12 #417.1 - .cfi_def_cfa_offset 16 - .cfi_offset 12, -16 - pushq %r13 #417.1 - .cfi_def_cfa_offset 24 - .cfi_offset 13, -24 - pushq %r14 #417.1 - .cfi_def_cfa_offset 32 - .cfi_offset 14, -32 - pushq %r15 #417.1 - .cfi_def_cfa_offset 40 - .cfi_offset 15, -40 - pushq %rbx #417.1 - .cfi_def_cfa_offset 48 - .cfi_offset 3, -48 - pushq %rbp #417.1 - .cfi_def_cfa_offset 56 - .cfi_offset 6, -56 - movq %rsi, %r8 #417.1 - movq %rdx, %r9 #417.1 - xorl %edx, %edx #424.3 - xorl %esi, %esi #424.3 - vxorpd %xmm0, %xmm0, %xmm0 #441.12 - movq 80(%r8), %rax #421.15 - movq 88(%r8), %rcx #422.15 - movq %rax, 560(%rdi) #421.3 - xorl %eax, %eax #436.26 - movq %rcx, 568(%rdi) #422.3 - xorl %ecx, %ecx #424.3 - # LOE rax rdx rdi r8 r9 ecx esi xmm0 -..B4.2: # Preds ..B4.15 ..B4.1 - # Execution count [5.00e+00] - movl %edx, %r12d #425.12 - movq %r8, %r11 #425.12 - movq %r12, %rbp #425.12 - movslq %esi, %r15 #427.16 - shlq $4, %rbp #425.12 - shlq $4, %r15 #427.16 - subq %rbp, %r11 #425.12 - movq 448(%rdi,%r15), %r13 #427.16 - movq 64(%r11), %rbx #425.12 - movq %r13, 64(%r11) #427.5 - lea 1(%rdx), %r13d #432.5 - movq %rbx, 448(%rdi,%r15) #429.5 - lea 5(%rcx), %ebx #432.10 - movq 456(%rdi,%r15), %r10 #428.16 - movq 72(%r11), %r14 #426.12 - movq %r10, 72(%r11) #428.5 - movq %r14, 456(%rdi,%r15) #430.5 - cmpl 
$6, %ebx #432.27 - jge ..B4.9 # Prob 50% #432.27 - # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx esi r13d xmm0 -..B4.3: # Preds ..B4.2 - # Execution count [5.00e+00] - xorl %r14d, %r14d #432.5 - lea 1(%rdx), %r15d #432.5 - shrl $1, %r15d #432.5 - movl $1, %ebx #432.5 - xorl %r10d, %r10d #433.7 - testl %r15d, %r15d #432.5 - jbe ..B4.7 # Prob 9% #432.5 - # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx ebx esi r10d r13d r14d r15d xmm0 -..B4.4: # Preds ..B4.3 - # Execution count [4.50e+00] - movq %r8, -24(%rsp) #[spill] - movq %r9, -16(%rsp) #[spill] - .align 16,0x90 - # LOE rax rdx rbp rdi r11 r12 ecx esi r10d r13d r14d r15d xmm0 -..B4.5: # Preds ..B4.5 ..B4.4 - # Execution count [1.25e+01] - lea (%rsi,%r10), %ebx #434.18 - addl $12, %r10d #432.5 - movslq %ebx, %rbx #434.18 - lea (%r14,%r14), %r8d #434.7 - movslq %r8d, %r8 #434.7 - incl %r14d #432.5 - shlq $4, %rbx #434.18 - shlq $4, %r8 #434.7 - movq 552(%rdi,%rbx), %r9 #434.18 - movq %r9, 88(%r11,%r8) #434.7 - movq 544(%rdi,%rbx), %r9 #433.18 - movq %r9, 80(%r11,%r8) #433.7 - movq 648(%rdi,%rbx), %r9 #434.18 - movq %rax, 552(%rdi,%rbx) #436.7 - movq %r9, 104(%r11,%r8) #434.7 - movq 640(%rdi,%rbx), %r9 #433.18 - movq %rax, 544(%rdi,%rbx) #435.7 - movq %r9, 96(%r11,%r8) #433.7 - movq %rax, 648(%rdi,%rbx) #436.7 - movq %rax, 640(%rdi,%rbx) #435.7 - cmpl %r15d, %r14d #432.5 - jb ..B4.5 # Prob 63% #432.5 - # LOE rax rdx rbp rdi r11 r12 ecx esi r10d r13d r14d r15d xmm0 -..B4.6: # Preds ..B4.5 - # Execution count [4.50e+00] - movq -24(%rsp), %r8 #[spill] - lea 1(%r14,%r14), %ebx #433.7 - movq -16(%rsp), %r9 #[spill] - # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx ebx esi r13d xmm0 -..B4.7: # Preds ..B4.6 ..B4.3 - # Execution count [5.00e+00] - lea -1(%rbx), %r10d #432.5 - cmpl %r13d, %r10d #432.5 - jae ..B4.9 # Prob 9% #432.5 - # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx ebx esi r13d xmm0 -..B4.8: # Preds ..B4.7 - # Execution count [4.50e+00] - movslq %ebx, %r10 #434.7 - lea (%rbx,%rbx,2), %ebx #434.18 - subq %r12, %r10 #434.7 - lea (%rsi,%rbx,2), %r14d #434.18 - movslq %r14d, %r14 #434.18 - shlq $4, %r14 #434.18 - shlq $4, %r10 #434.7 - movq 456(%rdi,%r14), %r15 #434.18 - movq %r15, 72(%r8,%r10) #434.7 - movq 448(%rdi,%r14), %r15 #433.18 - movq %r15, 64(%r8,%r10) #433.7 - movq %rax, 456(%rdi,%r14) #436.7 - movq %rax, 448(%rdi,%r14) #435.7 - # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx esi r13d xmm0 -..B4.9: # Preds ..B4.2 ..B4.8 ..B4.7 - # Execution count [3.96e-01] - shlq $3, %r12 #453.28 - negq %rbp #444.30 - negq %r12 #453.28 - addq %r9, %r12 #453.28 - addq %rdi, %rbp #444.30 - addq $2, %rdx #443.23 - xorb %bl, %bl #439.5 - # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx esi r13d bl xmm0 -..B4.10: # Preds ..B4.14 ..B4.9 - # Execution count [2.54e+01] - movq %rax, %r14 #443.7 - vmovapd %xmm0, %xmm1 #441.12 - movq %r14, %r10 #443.7 - # LOE rax rdx rbp rdi r8 r9 r10 r11 r12 r14 ecx esi r13d bl xmm0 xmm1 -..B4.11: # Preds ..B4.11 ..B4.10 - # Execution count [1.41e+02] - vmovupd 64(%r10,%r11), %xmm3 #446.27 - incq %r14 #443.7 - vmulpd 72(%r10,%rbp){1to2}, %xmm3, %xmm2 #447.14 - vpermilpd $1, %xmm2, %xmm4 #448.14 - vfmaddsub231pd 64(%r10,%rbp){1to2}, %xmm3, %xmm4 #449.14 - addq $16, %r10 #443.7 - vaddpd %xmm1, %xmm4, %xmm1 #450.14 - cmpq %rdx, %r14 #443.7 - jb ..B4.11 # Prob 82% #443.7 - # LOE rax rdx rbp rdi r8 r9 r10 r11 r12 r14 ecx esi r13d bl xmm0 xmm1 -..B4.12: # Preds ..B4.11 - # Execution count [2.70e+01] - movq %rax, %r15 #456.7 - lea 64(%rbp), %r10 #456.7 - vmulpd 32(%r12){1to2}, %xmm1, %xmm1 #454.12 - movq %r15, %r14 #456.7 - # LOE rax rdx rbp rdi r8 r9 r10 r11 r12 r14 r15 
ecx esi r13d bl xmm0 xmm1 -..B4.13: # Preds ..B4.13 ..B4.12 - # Execution count [1.50e+02] - vmulpd 72(%r14,%r11){1to2}, %xmm1, %xmm2 #459.14 - vpermilpd $1, %xmm2, %xmm4 #460.14 - incq %r15 #456.7 - vfmsubadd231pd 64(%r14,%r11){1to2}, %xmm1, %xmm4 #461.14 - addq $16, %r14 #456.7 - vmovupd (%r10), %xmm3 #463.28 - vsubpd %xmm4, %xmm3, %xmm5 #464.14 - vmovupd %xmm5, (%r10) #465.25 - addq $16, %r10 #456.7 - cmpq %rdx, %r15 #456.7 - jb ..B4.13 # Prob 82% #456.7 - # LOE rax rdx rbp rdi r8 r9 r10 r11 r12 r14 r15 ecx esi r13d bl xmm0 xmm1 -..B4.14: # Preds ..B4.13 - # Execution count [3.00e+01] - incb %bl #439.5 - addq $96, %rbp #439.5 - cmpb $6, %bl #439.5 - jb ..B4.10 # Prob 83% #439.5 - # LOE rax rdx rbp rdi r8 r9 r11 r12 ecx esi r13d bl xmm0 -..B4.15: # Preds ..B4.14 - # Execution count [5.00e+00] - addl $-7, %esi #432.5 - decl %ecx #432.5 - movl %r13d, %edx #424.3 - cmpl $5, %r13d #424.3 - jb ..B4.2 # Prob 79% #424.3 - # LOE rax rdx rdi r8 r9 ecx esi xmm0 -..B4.16: # Preds ..B4.15 - # Execution count [1.00e+00] - .cfi_restore 6 - popq %rbp #469.1 - .cfi_def_cfa_offset 48 - .cfi_restore 3 - popq %rbx #469.1 - .cfi_def_cfa_offset 40 - .cfi_restore 15 - popq %r15 #469.1 - .cfi_def_cfa_offset 32 - .cfi_restore 14 - popq %r14 #469.1 - .cfi_def_cfa_offset 24 - .cfi_restore 13 - popq %r13 #469.1 - .cfi_def_cfa_offset 16 - .cfi_restore 12 - popq %r12 #469.1 - .cfi_def_cfa_offset 8 - ret #469.1 - .align 16,0x90 - # LOE - .cfi_endproc -# mark_end; - .type bck_house_avx512,@function - .size bck_house_avx512,.-bck_house_avx512 - .data -# -- End bck_house_avx512 - .section .rodata, "a" - .space 8, 0x00 # pad - .align 64 -.L_2il0floatpacket.9: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000 - .type .L_2il0floatpacket.9,@object - .size .L_2il0floatpacket.9,64 - .align 64 -.L_2il0floatpacket.10: - .long 0x00000002,0x00000000,0x00000003,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.10,@object - .size .L_2il0floatpacket.10,64 - .align 64 -.L_2il0floatpacket.11: - .long 0x00000004,0x00000000,0x00000005,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000 - .type .L_2il0floatpacket.11,@object - .size .L_2il0floatpacket.11,64 - .align 64 -.L_2il0floatpacket.12: - .long 0x00000006,0x00000000,0x00000007,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 - .type .L_2il0floatpacket.12,@object - .size .L_2il0floatpacket.12,64 - .align 64 -.L_2il0floatpacket.13: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.13,@object - .size .L_2il0floatpacket.13,64 - .align 64 -.L_2il0floatpacket.14: - .long 0x00000004,0x00000000,0x00000005,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 - .type .L_2il0floatpacket.14,@object - .size .L_2il0floatpacket.14,64 - .align 64 -.L_2il0floatpacket.15: - .long 
0x00000000,0x00000000,0x00000000,0x00000000,0x00000002,0x00000000,0x00000002,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000 - .type .L_2il0floatpacket.15,@object - .size .L_2il0floatpacket.15,64 - .align 64 -.L_2il0floatpacket.16: - .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.16,@object - .size .L_2il0floatpacket.16,64 - .align 64 -.L_2il0floatpacket.17: - .long 0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000 - .type .L_2il0floatpacket.17,@object - .size .L_2il0floatpacket.17,64 - .align 64 -.L_2il0floatpacket.18: - .long 0x00000004,0x00000000,0x00000004,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000 - .type .L_2il0floatpacket.18,@object - .size .L_2il0floatpacket.18,64 - .align 64 -.L_2il0floatpacket.19: - .long 0x00000005,0x00000000,0x00000005,0x00000000,0x00000007,0x00000000,0x00000007,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000 - .type .L_2il0floatpacket.19,@object - .size .L_2il0floatpacket.19,64 - .align 64 -.L_2il0floatpacket.20: - .long 0x00000001,0x00000000,0x00000001,0x00000000,0x00000003,0x00000000,0x00000003,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000 - .type .L_2il0floatpacket.20,@object - .size .L_2il0floatpacket.20,64 - .align 64 -.L_2il0floatpacket.21: - .long 0x00000004,0x00000000,0x00000004,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x0000000c,0x00000000,0x0000000c,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000 - .type .L_2il0floatpacket.21,@object - .size .L_2il0floatpacket.21,64 - .align 64 -.L_2il0floatpacket.22: - .long 0x00000004,0x00000000,0x00000005,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.22,@object - .size .L_2il0floatpacket.22,64 - .align 64 -.L_2il0floatpacket.23: - .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000002,0x00000000,0x00000002,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x0000000a,0x00000000,0x0000000a,0x00000000 - .type .L_2il0floatpacket.23,@object - .size .L_2il0floatpacket.23,64 - .align 64 -.L_2il0floatpacket.24: - .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000 - .type .L_2il0floatpacket.24,@object - .size .L_2il0floatpacket.24,64 - .align 64 -.L_2il0floatpacket.25: - .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000002,0x00000000,0x00000002,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.25,@object - .size .L_2il0floatpacket.25,64 - .align 64 -.L_2il0floatpacket.26: - .long 0x00000001,0x00000000,0x00000001,0x00000000,0x00000003,0x00000000,0x00000003,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000 - .type .L_2il0floatpacket.26,@object - .size .L_2il0floatpacket.26,64 - .align 64 -.L_2il0floatpacket.27: - .long 
0x00000004,0x00000000,0x00000004,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x0000000a,0x00000000,0x0000000a,0x00000000 - .type .L_2il0floatpacket.27,@object - .size .L_2il0floatpacket.27,64 - .align 64 -.L_2il0floatpacket.28: - .long 0x00000005,0x00000000,0x00000005,0x00000000,0x00000007,0x00000000,0x00000007,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x0000000b,0x00000000,0x0000000b,0x00000000 - .type .L_2il0floatpacket.28,@object - .size .L_2il0floatpacket.28,64 - .align 64 -.L_2il0floatpacket.29: - .long 0x00000001,0x00000000,0x00000001,0x00000000,0x00000003,0x00000000,0x00000003,0x00000000,0x0000000d,0x00000000,0x0000000d,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.29,@object - .size .L_2il0floatpacket.29,64 - .align 64 -.L_2il0floatpacket.30: - .long 0x00000000,0x00000000,0x00000000,0x00000000,0x00000008,0x00000000,0x00000008,0x00000000,0x00000006,0x00000000,0x00000006,0x00000000,0x0000000e,0x00000000,0x0000000e,0x00000000 - .type .L_2il0floatpacket.30,@object - .size .L_2il0floatpacket.30,64 - .align 64 -.L_2il0floatpacket.31: - .long 0x00000001,0x00000000,0x00000001,0x00000000,0x00000009,0x00000000,0x00000009,0x00000000,0x00000007,0x00000000,0x00000007,0x00000000,0x0000000f,0x00000000,0x0000000f,0x00000000 - .type .L_2il0floatpacket.31,@object - .size .L_2il0floatpacket.31,64 - .align 64 -.L_2il0floatpacket.32: - .long 0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000 - .type .L_2il0floatpacket.32,@object - .size .L_2il0floatpacket.32,64 - .align 64 -.L_2il0floatpacket.33: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x0000000a,0x00000000,0x0000000b,0x00000000 - .type .L_2il0floatpacket.33,@object - .size .L_2il0floatpacket.33,64 - .align 64 -.L_2il0floatpacket.34: - .long 0x00000006,0x00000000,0x00000007,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x00000008,0x00000000,0x00000009,0x00000000 - .type .L_2il0floatpacket.34,@object - .size .L_2il0floatpacket.34,64 - .align 64 -.L_2il0floatpacket.35: - .long 0x00000002,0x00000000,0x00000003,0x00000000,0x0000000e,0x00000000,0x0000000f,0x00000000,0x00000000,0x00000000,0x00000001,0x00000000,0x0000000c,0x00000000,0x0000000d,0x00000000 - .type .L_2il0floatpacket.35,@object - .size .L_2il0floatpacket.35,64 - .align 16 -.L_2il0floatpacket.37: - .long 0x00000000,0x80000000,0x00000000,0x00000000 - .type .L_2il0floatpacket.37,@object - .size .L_2il0floatpacket.37,16 - .align 8 -.L_2il0floatpacket.36: - .long 0x00000000,0x3cb00000 - .type .L_2il0floatpacket.36,@object - .size .L_2il0floatpacket.36,8 - .align 8 -.L_2il0floatpacket.38: - .long 0x00000000,0x3ff00000 - .type .L_2il0floatpacket.38,@object - .size .L_2il0floatpacket.38,8 - .data - .section .note.GNU-stack, "" -// -- Begin DWARF2 SEGMENT .eh_frame - .section .eh_frame,"a",@progbits -.eh_frame_seg: - .align 8 -# End diff --git a/modules/sw_term/pauli.c b/modules/sw_term/pauli.c index 2547f1d..8e14bee 100644 --- a/modules/sw_term/pauli.c +++ b/modules/sw_term/pauli.c @@ -497,17 +497,8 @@ void mul_pauli(float mu, pauli *m, weyl *s, weyl *r) #endif - -#ifdef AVX512 - -void mul_pauli2_avx512(float mu, pauli *m, spinor *source, spinor *res ); -void mul_pauli2(float mu, pauli *m, spinor *source, spinor *res ) -{ - 
mul_pauli2_avx512( mu, m, source, res );
-}
-
-
-#elif (defined AVX)
+#ifndef AVX512
+#ifdef AVX
 #include "avx.h"
 
 void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r)
@@ -1049,6 +1040,7 @@ void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r)
   mul_pauli(-mu,m+1,(*ps).w+1,(*pr).w+1);
 }
 
+#endif
 #endif
 
 void assign_pauli(int vol,pauli_dble *md,pauli *m)
diff --git a/modules/sw_term/pauli_dble.c b/modules/sw_term/pauli_dble.c
index 7b87d16..65f7376 100644
--- a/modules/sw_term/pauli_dble.c
+++ b/modules/sw_term/pauli_dble.c
@@ -91,12 +91,6 @@ static complex_dble dd[6] ALIGNED16;
 
 #if (defined AVX512)
 
-void mul_pauli2_dble_avx512(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r);
-void mul_pauli2_dble(double mu, pauli_dble *m, weyl_dble *s, weyl_dble *r)
-{
-  mul_pauli2_dble_avx512( mu, m, s, r );
-}
-
 int fwd_house_avx512(double eps, complex_dble *aa, complex_dble *dd, double * rr );
 static int fwd_house(double eps ){ return fwd_house_avx512( eps, aa, dd, rr );