diff --git a/runtime/libpgmath/lib/common/dispatch.c b/runtime/libpgmath/lib/common/dispatch.c index 7ec997c1dfb..51a81aacf71 100644 --- a/runtime/libpgmath/lib/common/dispatch.c +++ b/runtime/libpgmath/lib/common/dispatch.c @@ -86,7 +86,7 @@ #include "mth_tbldefs.h" #if defined(TARGET_LINUX_X8664) || defined(TARGET_OSX_X8664) || defined(TARGET_WIN_X8664) -#include "cpuid8664.h" +#include "x86id.h" #endif /* @@ -1026,25 +1026,25 @@ __math_dispatch() } else { /* Get processor architecture using CPUID information */ #if defined(TARGET_LINUX_X8664) || defined(TARGET_OSX_X8664) || defined(TARGET_WIN_X8664) - if (CPUIDX8664(is_avx512vl)() == 1) { + if (X86IDFN(is_avx512vl)() == 1) { __math_target = arch_avx512; - } else if (CPUIDX8664(is_avx512f)() == 1) { + } else if (X86IDFN(is_avx512f)() == 1) { __math_target = arch_avx512knl; - } else if (CPUIDX8664(is_avx2)() == 1) { + } else if (X86IDFN(is_avx2)() == 1) { __math_target = arch_avx2; - } else if (CPUIDX8664(is_avx)() == 1) { - if (CPUIDX8664(is_intel)() == 1) { + } else if (X86IDFN(is_avx)() == 1) { + if (X86IDFN(is_intel)() == 1) { __math_target = arch_avx; } - if (CPUIDX8664(is_amd)() == 1) { - if (CPUIDX8664(is_fma4)() == 1) { + if (X86IDFN(is_amd)() == 1) { + if (X86IDFN(is_fma4)() == 1) { __math_target = arch_avxfma4; } else { __math_target = arch_sse4; } } } else { - if ((CPUIDX8664(is_sse4a)() == 1) || (CPUIDX8664(is_sse41)() == 1)) { + if ((X86IDFN(is_sse4a)() == 1) || (X86IDFN(is_sse41)() == 1)) { __math_target = arch_sse4; } else { __math_target = arch_em64t; diff --git a/runtime/libpgmath/lib/x86_64/CMakeLists.txt b/runtime/libpgmath/lib/x86_64/CMakeLists.txt index 527b129f0ff..265ad92a69b 100644 --- a/runtime/libpgmath/lib/x86_64/CMakeLists.txt +++ b/runtime/libpgmath/lib/x86_64/CMakeLists.txt @@ -58,9 +58,16 @@ set(SRCS dsqrt.c fabs.c sqrt.c + pgcpuid.c ${ASM_SRCS}) libmath_add_object_library("${SRCS}" "${FLAGS}" "${DEFINITIONS}" "") +# Decorate entry points and global objects in x86id with an internal 
prefix. +set(SRCS + x86id.c) +list(APPEND DEFINITIONS_FOR_LIBPGC ${DEFINITIONS} FOR_LIBPGC) +libmath_add_object_library("${SRCS}" "${FLAGS}" "${DEFINITIONS_FOR_LIBPGC}" "for_libpgc") + # isoc99 set(SRCS alog.c diff --git a/runtime/libpgmath/lib/x86_64/cpuid8664.h b/runtime/libpgmath/lib/x86_64/cpuid8664.h index 13c8911972a..8fdbee60ee1 100644 --- a/runtime/libpgmath/lib/x86_64/cpuid8664.h +++ b/runtime/libpgmath/lib/x86_64/cpuid8664.h @@ -61,6 +61,7 @@ static int CPUIDX8664(is_amd)(); static int CPUIDX8664(is_fma4)(); static int CPUIDX8664(is_sse4a)(); static int CPUIDX8664(is_sse41)(); +static int CPUIDX8664(is_f16c)(); /* * Check that this is a Genuine Intel processor @@ -296,6 +297,30 @@ CPUIDX8664(is_avx512vl)(void) return (ebx & bit_AVX512VL) != 0; }/* is_avx512vl */ +/* + * Check that this is either a Genuine Intel or AMD processor that supports + * f16c instructions. + */ +static int +CPUIDX8664(is_f16c)(void) +{ + uint32_t eax, ebx, ecx, edx; + + if ((CPUIDX8664(is_intel)() == 0) && (CPUIDX8664(is_amd)() == 0)) { + return 0; + } + + if (CPUIDX8664(is_avx)() == 0) { + return 0; + } + + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0) { + return 0; + } + + return (ecx & bit_F16C) != 0; +}/* is_f16c */ + #ifdef UNIT_TEST int main() @@ -309,6 +334,7 @@ main() printf("is_avx2()=%d\n", CPUIDX8664(is_avx2)()); printf("is_avx512f()=%d\n", CPUIDX8664(is_avx512f)()); printf("is_avx512vl()=%d\n", CPUIDX8664(is_avx512vl)()); + printf("is_f16c()=%d\n", CPUIDX8664(is_f16c)()); } #endif #endif // #ifndef CPUIDX8664_H diff --git a/runtime/libpgmath/lib/x86_64/pgcpuid.c b/runtime/libpgmath/lib/x86_64/pgcpuid.c new file mode 100644 index 00000000000..c3e0f4ab4dd --- /dev/null +++ b/runtime/libpgmath/lib/x86_64/pgcpuid.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include <stdint.h>
+#include "pgcpuid.h"
+
+/*
+ * Note:
+ * 1) these functions cannot call any other function
+ * 2) these functions can only use GPR (not floating point)
+ *
+ */
+
+/** @brief reports whether CPUID leaf cpuid_func is within the supported range.
+ * __pgi_cpuid_getmax(uint32_t cpuid_func)
+ * @param cpuid_func (I1) function to execute CPUID with
+ *
+ * Returns true(1) if cpuid_func <= largest leaf of its range, else false(0)
+ *
+ */
+
+int
+__pgi_cpuid_getmax(uint32_t f)
+{
+  uint32_t maxcpueax;
+  uint32_t fin = f & 0x80000000; /* base leaf of f's range: 0 or 0x80000000 */
+  __asm__("\tcpuid"
+          : "=a"(maxcpueax)
+          : "0"(fin)
+          : "ebx", "ecx", "edx"
+          );
+  return f <= maxcpueax;
+}
+
+/** @brief returns results of executing CPUID with function cpuid_func and
+ * sub function ecx.
+ * __pgi_cpuid_ecx(uint32_t cpuid_func, uint32_t *res, uint32_t ecx)
+ * @param cpuid_func (I1) function to execute CPUID with
+ * @param res (I2) pointer to buffer to store eax, ebx, ecx, edx
+ * @param ecx (I3) value of %ecx to execute CPUID with
+ *
+ * Returns false(0): if cpuid_func not supported
+ *         true(1):  CPUID successfully executed with cpuid_func+ecx and:
+ *                   res[0]=%eax, res[1]=%ebx, res[2]=%ecx, res[3]=%edx
+ *
+ */
+
+int
+__pgi_cpuid_ecx(uint32_t f, uint32_t *r, uint32_t c)
+{
+  if (__pgi_cpuid_getmax(f) == 0) return 0; /* r[] is left untouched */
+  __asm__("\tcpuid"
+          : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
+          : "0"(f), "2"(c)
+          :
+          );
+  return 1;
+}
+
+
+/** @brief returns results of executing CPUID with function cpuid_func.
+ * __pgi_cpuid(uint32_t cpuid_func, uint32_t *res)
+ * @param cpuid_func (I1) function to execute CPUID with
+ * @param res (I2) pointer to buffer to store eax, ebx, ecx, edx
+ *
+ * Returns false(0): if cpuid_func not supported
+ *         true(1):  CPUID successfully executed with cpuid_func and:
+ *                   res[0]=%eax, res[1]=%ebx, res[2]=%ecx, res[3]=%edx
+ *
+ */
+
+int
+__pgi_cpuid(uint32_t f, uint32_t *r)
+{
+  return __pgi_cpuid_ecx(f, r, 0);
+}
+
+/** @brief returns results of executing CPUID with function cpuid_func.
+ * __pgcpuid(uint32_t cpuid_func, uint32_t *res)
+ * @param cpuid_func (I1) function to execute CPUID with
+ * @param res (I2) pointer to buffer to store eax, ebx, ecx, edx
+ *
+ * Returns false(0): if cpuid_func not supported
+ *         true(1):  CPUID successfully executed with cpuid_func and:
+ *                   res[0]=%eax, res[1]=%ebx, res[2]=%ecx, res[3]=%edx
+ *
+ */
+
+int
+__pgcpuid(uint32_t f, uint32_t *r)
+{
+  return __pgi_cpuid_ecx(f, r, 0);
+}
+
+/** @brief read extended control register.
+ * __pgi_getbv(uint32_t xcr_num, uint64_t *xcr_res)
+ * @param xcr_num (I1) extended control register number to read
+ * @param xcr_res (I2) pointer to buffer to store xcr[xcr_num]
+ *
+ * Returns true(1) with:
+ *         xcr_res[31: 0]=%eax
+ *         xcr_res[63:32]=%edx
+ *
+ */
+int
+__pgi_getbv(uint32_t f, uint64_t *r)
+{
+  uint32_t lo, hi; /* %eax / %edx halves; avoids writing *r via a uint32_t* */
+  __asm__(
+#if defined(__WIN64)
+"\t.byte\t0x0f, 0x01, 0xd0"
+#else
+"\txgetbv"
+#endif
+          : "=a"(lo), "=d"(hi)
+          : "c"(f)
+          :
+          );
+  *r = ((uint64_t)hi << 32) | lo; /* single well-typed 64-bit store */
+  return 1;
+}
diff --git a/runtime/libpgmath/lib/x86_64/pgcpuid.h b/runtime/libpgmath/lib/x86_64/pgcpuid.h
new file mode 100644
index 00000000000..f99e95003ce
--- /dev/null
+++ b/runtime/libpgmath/lib/x86_64/pgcpuid.h
@@ -0,0 +1,1020 @@
+/*
+ * Copyright (c) 2005-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include <stdint.h>
+
+typedef enum {
+  RES4_EAX = 0, /* Index of eax in res[4] structure */
+  RES4_EBX = 1, /* Index of ebx in res[4] structure */
+  RES4_ECX = 2, /* Index of ecx in res[4] structure */
+  RES4_EDX = 3, /* Index of edx in res[4] structure */
+} RES4_E;
+
+extern int __pgi_cpuid_getmax(uint32_t cpuid_func);
+
+/*
+ * __pgi_cpuid( uint32_t id, uint32_t *res):
+ * __pgcpuid( uint32_t id, uint32_t *res):
+ *
+ *	uint32_t res[4];
+ *	res[0] <-- %eax
+ *	res[1] <-- %ebx
+ *	res[2] <-- %ecx
+ *	res[3] <-- %edx
+ */
+
+extern int __pgi_cpuid(uint32_t id, uint32_t *res);
+extern int __pgcpuid(uint32_t id, uint32_t *res);
+
+/*
+ * __pgi_cpuid_ecx( uint32_t id, uint32_t *res, uint32_t ecx ):
+ *
+ *	uint32_t res[4];
+ *	res[0] <-- %eax
+ *	res[1] <-- %ebx
+ *	res[2] <-- %ecx
+ *	res[3] <-- %edx
+ */
+
+extern int __pgi_cpuid_ecx(uint32_t id, uint32_t *res, uint32_t ecx);
+
+/*
+ * __pgi_getbv( uint32_t xcr_reg, uint64_t *xcr_result):
+ *	xcr_result[31: 0] <-- %eax
+ *	xcr_result[63:32] <-- %edx
+ */
+
+extern int __pgi_getbv(uint32_t xcr_reg, uint64_t *xcr_result);
+
+/*
+ ***********************************************************************
+ * Non-vendor specific information
+ */
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 0000 0000h ) - Largest Standard Function Value and Vendor String
+ *   eax = value
+ *   ebx || edx || ecx gives a 12-character vendor string:
+ *    GenuineIntel
+ *    AuthenticAMD
+ *    UMC UMC UMC
+ *    CyrixInstead
+ *    NexGenDriven
+ *    CentaurHauls
+ *    RiseRiseRise
+ *    SiS SiS SiS
+ *    GenuineTMx86
+ *    Geode by NSC
+ */
+
+typedef union CPU0 {
+  unsigned int i[4];
+  struct {
+    int largest; /* largest standard function value */
+    char vendor[12]; /* overlays i[1..3]; NOTE(review): if filled as res[] this is ebx,ecx,edx order, not the ebx||edx||ecx string order */
+  } b;
+} CPU0;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 8000 0000h ) - Highest Extended Function Available
+ *   eax = highest 8000 00xx function available
+ *   ebx = reserved
+ *   ecx = reserved
+ *   edx = reserved
+ */
+
+typedef union CPU80 {
+  unsigned int i[4];
+  struct {
+    int largest; /* highest extended (8000 00xx) function available */
+    int ebx, ecx, edx;
+  } b;
+} CPU80;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 8000 0002h ) - Processor Name String
+ *   eax = chars 0..3
+ *   ebx = chars 4..7
+ *   ecx = chars 8..11
+ *   edx = chars 12..15
+ *
+ * CPUID( 8000 0003h ) - Processor Name String
+ *   eax = chars 16..19
+ *   ebx = chars 20..23
+ *   ecx = chars 24..27
+ *   edx = chars 28..31
+ *
+ * CPUID( 8000 0004h ) - Processor Name String
+ *   eax = chars 32..35
+ *   ebx = chars 36..39
+ *   ecx = chars 40..43
+ *   edx = chars 44..47
+ */
+
+typedef union Xname {
+  unsigned int i[4];
+  char name[16];
+} Xname;
+
+/*
+ ***********************************************************************
+ * Intel-specific information
+ */
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 0000 0001h ) - Processor Version Information and Feature Flags
+ *   eax = Int_Version
+ *   ebx = Int_Brand
+ *   ecx = Int_Feature2
+ *   edx = Int_Feature1
+ */
+
+typedef union ICPU1 {
+  unsigned int i[4];
+  struct {
+    struct Int_Version {
+      unsigned int stepping : 4; /* processor stepping / revision */
+      unsigned int model : 4; /* processor model */
+      unsigned int family : 4; /* processor family, Pentium 4 is 1111 */
+      unsigned int ptype : 2; /* processor type */
+      unsigned int rs : 2;
+      unsigned int extmodel : 4; /* extended model information, if model=F */
+      unsigned int extfamily : 8; /* extended family information, if family=F */
+      unsigned int rs2 : 4;
+    } eax;
+    /*
Notes: + * ptype is + * 00 - Original OEM processor + * 01 - Overdrive + * 10 - dual processor + * 11 - reserved + */ + struct Int_Brand { + unsigned int brandindex : 8; /* 8-bit brand index */ + unsigned int clflush : 8; /* CLFLUSH size == 8*cache line size */ + unsigned int proccount : 8; /* maximum logical processor count. The + * nearest power-of-2 integer that is + * not smaller is the number of unique + * APIC IDs; this field is valid only + * if CPUID.1.edx.htt is set. + */ + unsigned int apic : 8; /* initial local APIC physical ID */ + } ebx; + /* Notes: + * brandindex is: + * 0 - unsupported + * 1 - Celeron + * 2 - Pentium III + * 3 - Pentium III Xeon + * 4 - Pentium III + * 6 - Mobile Pentium III-M + * 7 - Mobile Celeron + * 8 - Pentium 4 + * 9 - Pentium 4 + * 10 - Celeron + * 11 - Xeon (or Xeon MP) + * 12 - Xeon MP + * 14 - Mobile Pentium 4-M + * 15 - Mobile Celeron + */ + struct Int_Feature2 { + unsigned int sse3 : 1; /* 0:SSE3 */ + unsigned int pclmulqdq : 1; /* 1: PCLMULQDQ instruction */ + unsigned int rs2 : 1; + unsigned int mon : 1; /* 3:monitor/mwait */ + unsigned int cpl : 1; /* 4:CPL qualified debug store */ + unsigned int vmx : 1; /* 5:virtual machine technology */ + unsigned int rs6 : 1; + unsigned int est : 1; /* 7:speedstep technology */ + unsigned int tm2 : 1; /* 8:thermal monitor 2 */ + unsigned int ssse3 : 1; /* 9:Supplemental SSE 4/SSSE3/mni/core2 */ + unsigned int cnxt : 1; /* 10:L1 context ID */ + unsigned int rs11 : 1; + unsigned int fma : 1; /* 12:FMA - FMA extensions in YMM */ + unsigned int cx16 : 1; /* 13:compare/exchange 16-bytes instruction */ + unsigned int xtpr : 1; /* 14:xTPR update control */ + unsigned int pdcm : 1; /* 15:perf/debug capability MSR */ + unsigned int rsx : 2; + unsigned int dca : 1; /* 18:DCA - direct cache access */ + unsigned int sse41 : 1; /* 19:SSE 4.1 */ + unsigned int sse42 : 1; /* 20:SSE 4.2 */ + unsigned int apic : 1; /* 21:x2APIC */ + unsigned int rsy : 1; + unsigned int popcnt : 1; /* 23:POPCNT 
instruction */ + unsigned int rsy2 : 1; + unsigned int aes : 1; /* 25:AES instruction */ + unsigned int xsave : 1; /* 26:XSAVE save extended states */ + unsigned int osxsave : 1; /* 27:OSXSAVE - XSAVE supported by OS */ + unsigned int avx : 1; /* 28:AVX instructions */ + unsigned int f16c: 1; /* 29:16-bit FP conversion instructions */ + unsigned int rdrand : 1; /* 30:RDRAND instruction */ + unsigned int rsz : 1; + } ecx; + + struct Int_Feature1 { + unsigned int fpu : 1; /* 0:floating point unit on chip */ + unsigned int vme : 1; /* 1:virtual mode extension */ + unsigned int de : 1; /* 2:debugging extension */ + unsigned int pse : 1; /* 3:page size extension */ + unsigned int tsc : 1; /* 4:time stamp counter */ + unsigned int msr : 1; /* 5:model specific registers */ + unsigned int pae : 1; /* 6:physical address extension */ + unsigned int mce : 1; /* 7:machine check exception */ + unsigned int cx8 : 1; /* 8:compare/exchange 8-bytes instruction */ + unsigned int apic : 1; /* 9:on chip APIC hardware */ + unsigned int rs10 : 1; + unsigned int sep : 1; /* 11:fast system call */ + unsigned int mtrr : 1; /* 12:memory type range registers */ + unsigned int pge : 1; /* 13:page global enable */ + unsigned int mca : 1; /* 14:machine check architecture */ + unsigned int cmov : 1; /* 15:conditional move */ + unsigned int pat : 1; /* 16:page attribute table */ + unsigned int pseg : 1; /* 17:36-bit page size extensions */ + unsigned int psn : 1; /* 18:processor serial number */ + unsigned int cflsh : 1; /* 19:cflush */ + unsigned int rs20 : 1; + unsigned int dtes : 1; /* 21:debug store */ + unsigned int + acpi : 1; /* 22:thermal monitor and software controlled clock */ + unsigned int mmx : 1; /* 23:mmx extensions */ + unsigned int fxsr : 1; /* 24:fast floating point save/restore */ + unsigned int sse : 1; /* 25:streaming SIMD extensions */ + unsigned int sse2 : 1; /* 26:streaming SIMD extensions 2 */ + unsigned int slfsnp : 1; /* 27:self-snoop */ + unsigned int htt : 1; /* 
28:hyper-threading technology */ + unsigned int tm : 1; /* 29:thermal monitor */ + unsigned int rs30 : 1; + unsigned int ferr : 1; /* 31:FERR signalling change*/ + } edx; + } u; +} ICPU1; + +/* + * --------------------------------------------------------------------- + * CPUID( 0000 0002h ) - Cache and TLB Information + * eax = Int_Cache + * ebx = Int_Cache + * ecx = Int_Cache + * edx = Int_Cache + */ + +typedef union ICPU2 { + unsigned int i[4]; + struct Int_Cache { + unsigned int c1 : 8, c2 : 8, c3 : 8, c4 : 7; /* see below */ + unsigned int invalid : 1; /* if set, no information here */ + } u[4]; +} ICPU2; + +/* Notes: + * c1 for eax is the number of times that CPUID(2) must be called + * to get all cache information; it is usually just 1. + * Otherwise, * if 'invalid' is not set then the four values in c1/c2/c3/c4 + * (c2/c3/c4 for eax) may be zero (no information), or may be one of + * the following in any order: + * 00 - no information + * 01 - 32 entry ITLB 4-way for 4K pages + * 02 - 2 entry ITLB 2-way for 4M pages + * 03 - 64 entry DTLB 4-way for 4K pages + * 04 - 8 entry DTLB 4-way for 4M pages + * 06 - 8KB L1 Icache 4-way 32b line + * 08 - 16KB L1 Icache 4-way 32b line + * 0a - 8KB L1 Dcache 2-way 32b line + * 0c - 16KB L1 Dcache 4-way 32b line + * 22 - 512KB L3 cache 4-way 64b line 128b sector + * 23 - 1MB L3 cache 8-way 64b line 128b sector + * 25 - 2MB L3 cache 8-way 64b line 128b sector + * 29 - 4MB L3 cache 8-way 64b line 128b sector + * 2c - 32KB L1 Dcache 8-way 64b line + * 30 - 32KB L1 Icache 8-way 64b line + * 39 - 128KB L2 cache 4-way 64b line sectored + * 3b - 128KB L2 cache 2-way 64b line sectored + * 3c - 256KB L2 cache 4-way 64b line sectored + * 40 - no L3 cache, or no L2 cache if no L2 cache info + * 41 - 128KB L2 cache 4-way 32b line + * 42 - 256KB L2 cache 4-way 32b line + * 43 - 512KB L2 cache 4-way 32b line + * 44 - 1MB L2 cache 4-way 32b line + * 45 - 2MB L2 cache 4-way 32b line + * 50 - 64 entry ITLB for 4K and 2MB/4MB pages + * 51 
- 128 entry ITLB for 4K and 2MB/4MB pages
+ *    52 - 256 entry ITLB for 4K and 2MB/4MB pages
+ *    5b - 64 entry DTLB for 4K and 2MB/4MB pages
+ *    5c - 128 entry DTLB for 4K and 2MB/4MB pages
+ *    5d - 256 entry DTLB for 4K and 2MB/4MB pages
+ *    60 - 16KB L1 cache 8-way 64b line
+ *    66 - 8KB L1 cache 4-way 64b line
+ *    67 - 16KB L1 cache 4-way 64b line
+ *    68 - 32KB L1 cache 4-way 64b line
+ *    70 - 12K uop trace cache, 8-way
+ *    71 - 16K uop trace cache, 8-way
+ *    72 - 32K uop trace cache, 8-way
+ *    79 - 128KB L2 cache 8-way 64b line 128b sector
+ *    7a - 256KB L2 cache 8-way 64b line 128b sector
+ *    7b - 512KB L2 cache 8-way 64b line 128b sector
+ *    7c - 1MB L2 cache 8-way 64b line 128b sector
+ *    7d - 2MB L2 cache 8-way 64b line sectored
+ *    7f - 512KB L2 cache 2-way 64b line sectored
+ *    82 - 256KB L2 cache 8-way 32b line
+ *    83 - 512KB L2 cache 8-way 32b line
+ *    84 - 1MB L2 cache 8-way 32b line
+ *    85 - 2MB L2 cache 8-way 32b line
+ *    86 - 512KB L2 cache 4-way 64b line
+ *    87 - 1MB L2 cache 8-way 64b line
+ *    b0 - 128 entry ITLB 4-way for 4K pages
+ *    b3 - 128 entry DTLB 4-way for 4K pages
+ *    f0 - 64b prefetching
+ *    f1 - 128b prefetching
+ */
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 0000 0003h ) - Reserved
+ */
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 0000 0004h ) - Deterministic Cache Parameters
+ *   eax = Int_Cache_Parms1
+ *   ebx = Int_Cache_Parms2
+ *   ecx = int - number of sets (-1 which means add one to this value)
+ *   edx = Int_Cache_Params4
+ */
+
+typedef union ICPU4 {
+  unsigned int i[4];
+  struct {
+    struct Int_Cache_Parms1 {
+      unsigned int cachetype : 5; /* 0-none, 1-data, 2-instruction, 3-unified */
+      unsigned int cachelevel : 3; /* 1..n */
+      unsigned int selflevel : 1; /* self-initializing cache level */
+      unsigned int fullyassoc : 1; /* fully associative cache */
+      unsigned int rs : 4;
+      unsigned int nthreads : 12; /* number of threads sharing this cache
+                                   * (-1). The nearest power-of-2 int
+                                   * that is not smaller than 1+nthreads
+                                   * is the max number of unique APIC IDs
+                                   */
+      unsigned int ncores : 6; /* physical cores on the die (-1). The
+                                * nearest power-of-2 int that is not
+                                * smaller than 1+ncores is the max
+                                * number of unique Core_IDs.
+                                */
+    } eax;
+
+    struct Int_Cache_Parms2 {
+      unsigned int linesize : 12; /* system coherency line size (-1) */
+      unsigned int partitions : 10; /* physical line partitions (-1) */
+      unsigned int assoc : 10; /* associativity */
+    } ebx;
+    int nsets; /* number of sets (-1): add one to get the actual count */
+    struct Int_Cache_Params4 {
+      unsigned int
+          llcbehavior : 1; /* wbinvd/invd behavior on lower level caches */
+      unsigned int
+          llcinclusive : 1; /* cache is inclusive to lower cache levels */
+      unsigned int cacheindexing : 1; /* complex cache indexing */
+    } edx;
+  } u;
+} ICPU4;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 0000 0005h ) - Monitor/Mwait
+ *   eax = Int_Monitor - smallest
+ *   ebx = Int_Monitor - largest
+ *   ecx = reserved
+ *   edx = reserved
+ */
+
+typedef union ICPU5 {
+  unsigned int i[4];
+  struct {
+    struct Int_Monitor {
+      unsigned int limit : 16; /* smallest/largest monitor line size in bytes */
+      unsigned int rs : 16;
+    } smallest;
+    struct Int_Monitor largest;
+    int ecx;
+    int edx;
+  } u;
+} ICPU5;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 0000 0006h ) - Thermal and Power Management Leaf
+ *   eax = Int_Power_Mgmt1
+ *   ebx = Int_Power_Mgmt2
+ *   ecx = reserved
+ *   edx = reserved
+ */
+
+typedef union ICPU6 {
+  unsigned int i[4];
+  struct {
+    struct Int_Power_Mgmt1 {
+      unsigned int tempsensor : 1; /* 0 */
+      unsigned int turboboost : 1; /* 1 */
+      unsigned int arat : 1; /* 2 */
+      unsigned int rsvd3 : 1; /* 3 */
+      unsigned int pln : 1; /* 4 */
+      unsigned int ecmd : 1; /* 5 */
+      unsigned int ptm : 1; /* 6 */
+      unsigned int rsvd : 25; /* 7..31 */
+    } eax;
+    struct Int_Power_Mgmt2 {
+      unsigned int numinterrupts : 4; /* 0..3 */
+      unsigned int rsvd : 28; /* 4..31 */
+    } ebx;
+  } u; /* covers eax and ebx only; ecx/edx are reachable through i[2], i[3] */
+} ICPU6;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 0000 0007h ) - Structured Extended Feature Flags
+ *   eax = numsubleaves (plain unsigned int)
+ *   ebx = Int_Feature_7
+ *   ecx = feature flags (unnamed bit-field struct)
+ *   edx = feature flags (unnamed bit-field struct)
+ */
+
+typedef union ICPU7 {
+  unsigned int i[4];
+  struct {
+    unsigned int numsubleaves; /* eax */
+    struct Int_Feature_7 {
+      unsigned int fsgsbase : 1; /* 0 */
+      unsigned int ia32_tsc_adjust : 1; /* 1 */
+      unsigned int sgx : 1; /* 2 */
+      unsigned int bmi1 : 1; /* 3 */
+      unsigned int hle : 1; /* 4 */
+      unsigned int avx2 : 1; /* 5 */
+      unsigned int rsv6 : 1; /* 6 */
+      unsigned int smep : 1; /* 7 */
+      unsigned int bmi2 : 1; /* 8 */
+      unsigned int erms : 1; /* 9 */
+      unsigned int invpcid : 1; /* 10 */
+      unsigned int rtm : 1; /* 11 */
+      unsigned int pqm : 1; /* 12 */
+      unsigned int depcsds : 1; /* 13 */
+      unsigned int memprotect : 1; /* 14 */
+      unsigned int pqe : 1; /* 15 */
+      unsigned int avx512f : 1; /* 16 */
+      unsigned int avx512dq : 1; /* 17 */
+      unsigned int rdseed : 1; /* 18 */
+      unsigned int adx : 1; /* 19 */
+      unsigned int smap : 1; /* 20 */
+      unsigned int avx512fma : 1; /* 21 */
+      unsigned int rsv22 : 1; /* 22 */
+      unsigned int clflushopt : 1; /* 23 */
+      unsigned int clwb : 1; /* 24 */
+      unsigned int trace : 1; /* 25 */
+      unsigned int avx512pf : 1; /* 26 */
+      unsigned int avx512er : 1; /* 27 */
+      unsigned int avx512cd : 1; /* 28 */
+      unsigned int sha : 1; /* 29 */
+      unsigned int avx512bw : 1; /* 30 */
+      unsigned int avx512vl: 1; /* 31 */
+    } ebx;
+    struct {
+      unsigned int prefetchwt1 : 1; /* 0 */
+      unsigned int avx512vbmi : 1; /* 1 */
+      unsigned int rsv2 : 1; /* 2 */
+      unsigned int pku : 1; /* 3 */
+      unsigned int ospke : 1; /* 4 */
+      unsigned int rsvd : 27; /* 5..31 */
+    } ecx;
+    struct {
+      unsigned int rsv0 : 2; /* 0..1: two reserved bits so the flags below land on bits 2 and 3 */
+      unsigned int avx512_4vnniw : 1; /* 2 */
+      unsigned int avx512_4fmapx : 1; /* 3 */
+      unsigned int rsvd : 28; /* 4..31 */
+    } edx;
+  } u;
+} ICPU7;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 8000 0001h ) - Processor Feature Flags
+ *   eax = reserved
+ *   ebx = reserved
+ *   ecx = reserved
+ *   edx = Int_XFeature1
+ */
+
+typedef union ICPU81 {
+  unsigned int i[4];
+  struct {
+    int eax, ebx, ecx;
+    struct Int_XFeature1 {
+      unsigned int rs0 : 11;
+      unsigned int sep : 1; /* 11:syscall/sysret */
+      unsigned int rs12 : 8;
+
unsigned int nx : 1; /* 20: no-execute page protection */ + unsigned int rs21 : 8; + unsigned int lm : 1; /* 29:long mode capable */ + unsigned int rs30 : 2; + } edx; + } u; +} ICPU81; + +/* + * --------------------------------------------------------------------- + * CPUID( 8000 0002h ) - processor name string + * CPUID( 8000 0003h ) - processor name string continued + * CPUID( 8000 0004h ) - processor name string continued + */ + +/* + * --------------------------------------------------------------------- + * CPUID( 8000 0005h ) - reserved + * eax = reserved + * ebx = reserved + * ecx = reserved + * edx = reserved + */ + +/* + * --------------------------------------------------------------------- + * CPUID( 8000 0006h ) - Cache information + * eax = reserved + * ebx = reserved + * ecx = Int_Cache_Info + * edx = reserved + */ + +typedef union ICPU86 { + unsigned int i[4]; + struct { + int eax, ebx; + struct Int_Cache_Info { + unsigned int linesize : 8; /* cache line size */ + unsigned int rs : 4; + unsigned int assoc : 4; /* L2 associativity */ + unsigned int size : 16; /* cache size in K */ + } ecx; + int edx; + } u; +} ICPU86; + +/* + * --------------------------------------------------------------------- + * CPUID( 8000 0007h ) - Reserved + * eax = reserved + * ebx = reserved + * ecx = reserved + * edx = reserved + */ + +/* + * --------------------------------------------------------------------- + * CPUID( 8000 0008h ) - Address Size + * eax = Int_Physical + * ebx = reserved + * ecx = reserved + * edx = reserved + */ + +typedef union ICPU88 { + unsigned int i[4]; + struct { + struct Int_Physical { + unsigned int physical : 8; /* max physical address width in bits */ + unsigned int virtual : 8; /* max virtual address width in bits */ + unsigned int rs : 16; + } eax; + int ebx, ecx, edx; + } u; +} ICPU88; + +/* + *********************************************************************** + * AMD-specific information + */ + + +/* + * 
---------------------------------------------------------------------
+ * CPUID( 0000 0001h ) - Processor Version Information and Feature Flags
+ *   eax = AMD_Version
+ *   ebx = AMD_Brand
+ *   ecx = AMD_Feature2
+ *   edx = AMD_Feature1
+ */
+
+typedef union ACPU1 {
+  unsigned int i[4]; /* raw eax, ebx, ecx, edx as filled in by __pgi_cpuid() */
+  struct {
+
+    struct AMD_Version {
+      unsigned int stepping : 4; /* processor stepping / revision */
+      unsigned int model : 4;    /* processor model */
+      unsigned int family : 4;   /* processor family */
+      unsigned int rs1 : 4;      /* reserved */
+      unsigned int
+          extmodel : 4; /* extended model information (see Notes) */
+      unsigned int
+          extfamily : 8; /* extended family information (see Notes) */
+      unsigned int rs2 : 4; /* reserved */
+    } eax;
+    /* Notes:
+     *  x86id.c identifies family 16 as family==15 && extfamily==1 and
+     *  family 21 as family==15 && extfamily==6, i.e. extfamily is added
+     *  to family when family==15.
+     *  NOTE(review): this comment used to say "if family==0"; the
+     *  matching rule for Model (extmodel<<4 + model) should be confirmed
+     *  against the AMD CPUID specification.
+     */
+
+    struct AMD_Brand {
+      unsigned int
+          brandid : 8; /* 8-bit brand ID; 0 means use 12-bit brand ID */
+      unsigned int clflush : 8;   /* CLFLUSH size */
+      unsigned int proccount : 8; /* logical processor count */
+      unsigned int apic : 8;      /* initial local APIC physical ID */
+    } ebx;
+    /* Notes:
+     *  brandid==0 means use the 12-bit brand ID of CPUID( 8000 0001 )
+     *  clflush is cache line size in quadwords (8 bytes)
+     *    this is only valid if clflush feature bit is set
+     *  proccount is valid if cmp_legacy==1 && htt==1, indicates
+     *    number of physical cores to legacy software
+     *    better to use CPUID( 8000 0008 )
+     */
+
+    struct AMD_Feature2 {
+      unsigned int sse3 : 1;      /* 0:SSE3 */
+      unsigned int pclmulqdq : 1; /* 1:PCLMULQDQ instruction */
+      unsigned int rs1 : 1;       /* 2:reserved */
+      unsigned int mon : 1;       /* 3:monitor/mwait */
+      unsigned int rs2 : 5;       /* 4-8:reserved */
+      unsigned int ssse3 : 1; /* 9:Supplemental SSE 4/SSSE3/mni/core2 */
+      unsigned int rs3 : 2;   /* 10-11:reserved */
+      unsigned int fma : 1;   /* 12:FMA - FMA extensions in YMM */
+      unsigned int cx16 : 1;  /* 13:compare/exchange 16-bytes instruction */
+      unsigned int rs14 : 5;  /* 14-18:reserved */
+      unsigned int sse41 : 1; /* 19:SSE 4.1 */
+      unsigned int sse42 : 1; /* 20:SSE 4.2 */
+      unsigned int rs21 : 2;  /* 21-22:reserved */
+      unsigned int popcnt : 1;  /* 23:POPCNT instruction */
+      unsigned int rs24 : 1;    /* 24:reserved */
+      unsigned int aes : 1;     /* 25:AES instruction */
+      unsigned int xsave : 1;   /* 26:XSAVE instruction */
+      unsigned int osxsave : 1; /* 27:XSAVE OS */
+      unsigned int avx : 1;     /* 28:AVX instructions */
+      unsigned int f16c : 1;    /* 29:half-precision convert instruction */
+      unsigned int rs30 : 1;    /* 30:reserved */
+      unsigned int raz : 1; /* 31:reserved for use by hypervisor to indicate
+                               guest status */
+    } ecx;
+
+    struct AMD_Feature1 {
+      unsigned int fpu : 1;  /* 0:floating point unit on chip */
+      unsigned int vme : 1;  /* 1:virtual mode extension */
+      unsigned int de : 1;   /* 2:debugging extension */
+      unsigned int pse : 1;  /* 3:page size extension */
+      unsigned int tsc : 1;  /* 4:time stamp counter */
+      unsigned int msr : 1;  /* 5:model specific registers (K86 MSR) */
+      unsigned int pae : 1;  /* 6:physical address extension */
+      unsigned int mce : 1;  /* 7:machine check exception */
+      unsigned int cx8 : 1;  /* 8:compare/exchange 8-bytes instruction */
+      unsigned int apic : 1; /* 9:on chip APIC hardware */
+      unsigned int rs10 : 1; /* 10:reserved */
+      unsigned int sep : 1;  /* 11:sysenter/sysexit (>=PIII) */
+      unsigned int mtrr : 1; /* 12:memory type range registers */
+      unsigned int pge : 1;  /* 13:page global enable */
+      unsigned int mca : 1;  /* 14:machine check architecture */
+      unsigned int cmov : 1; /* 15:conditional move */
+      unsigned int pat : 1;  /* 16:page attribute table */
+      unsigned int pseg : 1; /* 17:36-bit page size extensions */
+      unsigned int rs18 : 1; /* 18:reserved */
+      unsigned int cflsh : 1; /* 19:clflush */
+      unsigned int rs20 : 1;  /* 20:reserved */
+      unsigned int rs21 : 1;  /* 21:reserved */
+      unsigned int rs22 : 1;  /* 22:reserved */
+      unsigned int mmx : 1;  /* 23:mmx extensions */
+      unsigned int fxsr : 1; /* 24:fast floating point save/restore */
+      unsigned int sse : 1;  /* 25:streaming SIMD extensions */
+      unsigned int sse2 : 1; /* 26:SSE2 */
+      unsigned int rs27 : 1; /* 27:reserved */
+      unsigned int htt : 1;  /* 28:hyper-threading technology */
+      unsigned int rs29 : 1; /* 29:reserved */
+      unsigned int rs30 : 1; /* 30:reserved */
+      unsigned int rs31 : 1; /* 31:reserved */
+    } edx;
+  } u;
+} ACPU1;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 0000 0007h ) - Structured Extended Feature Identifiers
+ *   eax = Reserved
+ *   ebx = AMD_Extended_Features
+ *   ecx = Reserved
+ *   edx = Reserved
+ */
+
+typedef union ACPU7 {
+  unsigned int i[4]; /* raw eax, ebx, ecx, edx */
+  struct {
+
+    int eax; /* Reserved */
+    struct AMD_Extended_Features {
+      unsigned int fsgbase : 1; /* FS & GS base read/write support */
+      unsigned int rs1 : 1;
+      unsigned int rs2 : 1;
+      unsigned int bmi1 : 1; /* bit manipulation group 1 */
+      unsigned int rs4 : 1;
+      unsigned int avx2 : 1; /* AVX extension support (avx2) */
+      unsigned int rs6 : 1;
+      unsigned int smep : 1; /* Supervisor mode execution protection */
+      unsigned int bmi2 : 1; /* bit manipulation group 2 */
+      unsigned int rs9 : 1;
+      unsigned int rs10 : 1;
+      unsigned int rs11 : 1;
+      unsigned int rs12 : 1;
+      unsigned int rs13 : 1;
+      unsigned int rs14 : 1;
+      unsigned int rs15 : 1;
+      unsigned int rs16 : 1;
+      unsigned int rs17 : 1;
+      unsigned int rdseed : 1; /* RDSEED is present */
+      unsigned int adx : 1;    /* ADCX and ADOX are present */
+      unsigned int smap : 1; /* Secure mode access prevention - supported */
+      unsigned int rs21 : 1;
+      unsigned int pcommit: 1;  /* 22nd bit by position; presumably PCOMMIT - confirm */
+      unsigned int clfshopt: 1; /* 23rd bit by position; presumably CLFLUSHOPT - confirm */
+      unsigned int rs24 : 1;
+      unsigned int rs25 : 1;
+      unsigned int rs26 : 1;
+      unsigned int rs27 : 1;
+      unsigned int rs29 : 1; /* occupies bit 28 by position, despite its name */
+      unsigned int sha : 1;  /* bit 29 by position; presumably SHA extensions - confirm */
+      unsigned int rs30 : 1;
+      unsigned int rs31 : 1;
+    } ebx;
+    int ecx, edx; /* Reserved */
+
+  } u;
+} ACPU7;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 8000 0001h ) - Processor Version Information and Feature Flags
+ *   eax = AMD_Version - same as 0000 0001h
+ *   ebx = AMD_XBrand - brand ID (the struct below declares a 16-bit field)
+ *   ecx = AMD_XFeature2
+ *   edx = AMD_XFeature1
+ *
+ * Notes:
+ *   if the brand ID here is zero, use the 8-bit Brand ID of CPUID( 0000 0001 )
+ */
+
+typedef union ACPU81 {
+  unsigned int i[4]; /* raw eax, ebx, ecx, edx */
+  struct {
+    struct AMD_Version eax;
+
+    struct AMD_XBrand {
+      unsigned int
+          brandid : 16; /* 16-bit brand ID; 0 means use 8-bit brand ID */
+      unsigned int rs : 16; /* reserved */
+    } ebx;
+    /* Notes:
+     *  brandid==0 means use the 8-bit brand ID of CPUID( 0000 0001 )
+     */
+
+    struct AMD_XFeature2 {
+      unsigned int ahf : 1;     /* 0:LAHF/SAHF support in long mode */
+      unsigned int cmp : 1;     /* 1:CMP_LEGACY */
+      unsigned int svm : 1;     /* 2:Secure Virtual Machine */
+      unsigned int extapic : 1; /* 3:Extended APIC register space */
+      unsigned int lockmov : 1; /* 4:LOCK MOV CR0 means MOV CR8 */
+      unsigned int abm : 1;   /* 5:Advanced Bit Manipulation, POPCNT, LZCNT */
+      unsigned int sse4a : 1; /* 6:EXTRQ,INSERTQ,MOVNT[SS|SD] */
+      unsigned int mas : 1;   /* 7:Misaligned SSE mode */
+      unsigned int prefetch : 1; /* 8:3DNow prefetch */
+      unsigned int osvw : 1;     /* 9:OS visible workaround */
+      unsigned int ibs : 1;      /* 10:instruction based sampling */
+      unsigned int xop : 1;      /* 11:extended operation support */
+      unsigned int skinit : 1;   /* 12:SKINIT & STGI are supported */
+      unsigned int wdt : 1;      /* 13:watchdog timer support */
+      unsigned int rs14 : 1;     /* 14:reserved */
+      unsigned int lwp : 1;      /* 15:lightweight profiling support */
+      unsigned int fma4 : 1;     /* 16:4-operand FMA instructions */
+      unsigned int tce : 1;      /* 17:translation cache extension */
+      unsigned int rs18 : 1;     /* 18:reserved */
+      unsigned int
+          nodeid : 1; /* 19:MSRC001_100C[NodeId,NodesPerProcessor] supported */
+      unsigned int rs20 : 1;     /* 20:reserved */
+      unsigned int tbm : 1;      /* 21:trailing bit manipulation support */
+      unsigned int topolext : 1; /* 22:topology extensions support */
+      unsigned int rs23 : 9;     /* 23-31:reserved */
+    } ecx;
+
+    struct AMD_XFeature1 {
+      unsigned int fpu : 1;  /* 0:floating point unit on chip */
+      unsigned int vme : 1;  /* 1:virtual mode extension */
+      unsigned int de : 1;   /* 2:debugging extension */
+      unsigned int pse : 1;  /* 3:page size extension */
+      unsigned int tsc : 1;  /* 4:time stamp counter */
+      unsigned int msr : 1;  /* 5:model specific registers (K86 MSR) */
+      unsigned int pae : 1;  /* 6:physical address extension */
+      unsigned int mce : 1;  /* 7:machine check exception */
+      unsigned int cx8 : 1;  /* 8:compare/exchange 8-bytes instruction */
+      unsigned int apic : 1; /* 9:on chip APIC hardware */
+      unsigned int rs10 : 1; /* 10:reserved */
+      unsigned int sep : 1;  /* 11:sysenter/sysexit (>=PIII) */
+      unsigned int mtrr : 1; /* 12:memory type range registers */
+      unsigned int pge : 1;  /* 13:page global enable */
+      unsigned int mca : 1;  /* 14:machine check architecture */
+      unsigned int cmov : 1; /* 15:conditional move */
+      unsigned int pat : 1;  /* 16:page attribute table */
+      unsigned int pseg : 1; /* 17:36-bit page size extensions */
+      unsigned int rs18 : 1; /* 18:reserved */
+      unsigned int rs19 : 1; /* 19:reserved */
+      unsigned int nx : 1;   /* 20:no-execute page protection */
+      unsigned int rs21 : 1; /* 21:reserved */
+      unsigned int ammx : 1; /* 22:AMD MMX instruction extensions */
+      unsigned int mmx : 1;  /* 23:mmx extensions */
+      unsigned int fxsr : 1; /* 24:fast floating point save/restore */
+      unsigned int fxsro : 1;  /* 25:fxsave/fxrstor optimizations */
+      unsigned int rs26 : 1;   /* 26:reserved */
+      unsigned int rdtscp : 1; /* 27:RDTSCP instruction */
+      unsigned int rs28 : 1;   /* 28:reserved */
+      unsigned int lm : 1;     /* 29:long mode capable */
+      unsigned int now3dx : 1; /* 30:3DNow! instructions extensions */
+      unsigned int now3d : 1;  /* 31:3DNow! instructions */
+    } edx;
+  } u;
+} ACPU81;
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 8000 0002h ) - processor name string
+ * CPUID( 8000 0003h ) - processor name string continued
+ * CPUID( 8000 0004h ) - processor name string continued
+ */
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 8000 0005h ) - L1 Cache and L1 TLB information
+ *   eax = AMD_L1_TLB - L1 TLB Large Page Information
+ *   ebx = AMD_L1_TLB - L1 TLB 4-Kbyte Page Information
+ *   ecx = AMD_L1_Cache - L1 Data Cache Information
+ *   edx = AMD_L1_Cache - L1 Instruction Cache Information
+ */
+
+typedef union ACPU85 {
+  unsigned int i[4]; /* raw eax, ebx, ecx, edx */
+  struct {
+    struct AMD_L1_TLB {
+      unsigned int ientries : 8; /* number of entries in instruction TLB */
+      unsigned int iassoc : 8; /* associativity, FF=full in instruction TLB */
+      unsigned int dentries : 8; /* number of entries in data TLB */
+      unsigned int dassoc : 8;   /* associativity, FF=full in data TLB */
+    } tlb_large, tlb_4k;
+
+    struct AMD_L1_Cache {
+      unsigned int linesize : 8; /* line size in bytes */
+      unsigned int taglines : 8; /* lines per tag */
+      unsigned int assoc : 8;    /* associativity */
+      unsigned int size : 8;     /* cache size in Kbytes */
+    } dcache, icache;
+  } u;
+} ACPU85;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 8000 0006h ) - L2 Cache and L2 TLB information
+ *   eax = AMD_L2_TLB - L2 TLB Large Page Information
+ *   ebx = AMD_L2_TLB - L2 TLB 4-Kbyte Page Information
+ *   ecx = AMD_L2_Cache - L2 (Unified) Cache Information
+ *   edx = AMD_L3_Cache - L3 Cache Information (this header previously
+ *         said "reserved", but the union below maps edx to l3cache)
+ */
+
+typedef union ACPU86 {
+  unsigned int i[4]; /* raw eax, ebx, ecx, edx */
+  struct {
+    struct AMD_L2_TLB {
+      unsigned int itlb_entries : 12; /* number of entries in instruction TLB */
+      unsigned int
+          itlb_assoc : 4; /* associativity of instruction TLB (4-bit encoded field) */
+      unsigned int dtlb_entries : 12; /* number of entries in data TLB */
+      unsigned int dtlb_assoc : 4; /* associativity of data TLB (4-bit encoded field) */
+    } tlb_large, tlb_4k;
+
+    struct AMD_L2_Cache {
+      unsigned int linesize : 8; /* line size in bytes */
+      unsigned int taglines : 4; /* lines per tag */
+      unsigned int assoc : 4;    /* associativity */
+      unsigned int size : 16;    /* cache size in Kbytes */
+    } l2cache;
+    struct AMD_L3_Cache {
+      unsigned int linesize : 8; /* line size in bytes */
+      unsigned int taglines : 4; /* lines per tag */
+      unsigned int assoc : 4;    /* associativity */
+      unsigned int reserved : 2;
+      unsigned int size : 14; /* cache size in half-Mbytes */
+    } l3cache;
+  } u;
+} ACPU86;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 8000 0007h ) - Advanced Power Management
+ *   eax = reserved
+ *   ebx = reserved
+ *   ecx = reserved
+ *   edx = AMD_Power
+ */
+
+typedef union ACPU87 {
+  unsigned int i[4]; /* raw eax, ebx, ecx, edx */
+  struct {
+    int eax, ebx, ecx; /* reserved */
+    struct AMD_Power {
+      unsigned int ts : 1;  /* 0: temperature sensor */
+      unsigned int fid : 1; /* 1: frequency ID control */
+      unsigned int vid : 1; /* 2: voltage ID control */
+      unsigned int ttp : 1; /* 3: thermal trip */
+      unsigned int tm : 1;  /* 4: thermal monitoring */
+      unsigned int stc : 1; /* 5: software thermal control */
+      unsigned int mhz : 1; /* 6: 100 Mhz multiplier control */
+      unsigned int rs : 25; /* 7-31: reserved */
+    } edx;
+  } u;
+} ACPU87;
+
+/*
+ * ---------------------------------------------------------------------
+ * CPUID( 8000 0008h ) - Address Size and Physical Core Count
+ *   eax = AMD_Physical
+ *   ebx = reserved
+ *   ecx = AMD_Core_Count
+ *   edx = reserved
+ */
+
+typedef union ACPU88 {
+  unsigned int i[4]; /* raw eax, ebx, ecx, edx */
+  struct {
+    struct AMD_Physical {
+      unsigned int physical : 8; /* max physical address width in bits */
+      unsigned int virtual : 8;  /* max virtual address width in bits */
+      unsigned int rs : 16;      /* reserved */
+    } eax;
+
+    int ebx; /* reserved */
+    struct AMD_Core_Count {
+      unsigned int cores : 8; /* number of cores minus one (0 means 1 core) */
+      unsigned int rs : 24;   /* reserved */
+    } ecx;
+    int edx; /* reserved */
+  } u;
+} ACPU88;
+
+/*
+ * 
---------------------------------------------------------------------
+ * CPUID( 8000 001eh ) - Extended APIC / CoreId / NodeId
+ *   eax = APIC ID
+ *   ebx = Core ID
+ *   ecx = Node ID
+ *   edx = reserved
+ */
+
+typedef union ACPU81e {
+  unsigned int i[4]; /* raw eax, ebx, ecx, edx */
+  struct {
+    struct AMD_ExtAPICId {
+      unsigned int extendedapicid; /* full 32-bit register value */
+    } eax;
+
+    struct AMD_CoreId {
+      unsigned int coreid : 8;
+      unsigned int threadspercore : 8; /* number of threads minus one (0 means 1 thread) */
+      unsigned int rs : 16;            /* reserved */
+    } ebx;
+
+    struct AMD_NodeId {
+      unsigned int nodeid : 8;
+      unsigned int nodesperproc : 3; /* presumably number of nodes minus one; the old text was copied from threadspercore - confirm */
+      unsigned int rs : 21;          /* reserved */
+    } ecx;
+    int edx; /* reserved */
+  } u;
+} ACPU81e;
+
+#define x80 0x80000000U
diff --git a/runtime/libpgmath/lib/x86_64/x86id.c b/runtime/libpgmath/lib/x86_64/x86id.c
new file mode 100644
index 00000000000..006dc42f2d9
--- /dev/null
+++ b/runtime/libpgmath/lib/x86_64/x86id.c
@@ -0,0 +1,1137 @@
+/*
+ * Copyright (c) 2007-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+#include <stdint.h> /* uint32_t, uint64_t; header name restored from usage */
+#include <string.h> /* strlen() in DEBUG_PRINTF; header name restored from usage */
+
+#if defined(TARGET_WIN_X8664)
+#  if defined(OBJ_WIN_X8664_IS_X86ID)
+#    error object macro OBJ_WIN_X8664_IS_X86ID cannot already be defined
+#  else
+#    define OBJ_WIN_X8664_IS_X86ID
+#  endif
+#endif
+
+#include "pgcpuid.h"
+#include "x86id.h"
+
+/*
+ * Define some interesting fields in the extended control register[0].
+ * xcr[0] only defines the lower 32-bits of the 64-bit register.
+ */
+
+#define xcr0_bit_XMM    0x00000002U
+#define xcr0_bit_YMM    0x00000004U
+#define xcr0_bit_ZMMK   0x00000020U
+#define xcr0_bit_ZMMLO  0x00000040U
+#define xcr0_bit_ZMMHI  0x00000080U
+
+/* All bits of a mask must be set for is_xcr_set() to report success. */
+#define xcr0_mask_YMM   (xcr0_bit_XMM | xcr0_bit_YMM)
+#define xcr0_mask_ZMM   (xcr0_bit_ZMMK | xcr0_bit_ZMMLO | xcr0_bit_ZMMHI)
+
+
+/* CPUID(0) vendor string words compared by is_amd(): "Auth" "cAMD" "enti" */
+#define signature_AMD_ebx   0x68747541
+#define signature_AMD_ecx   0x444d4163
+#define signature_AMD_edx   0x69746e65
+
+/* CPUID(0) vendor string words compared by is_intel(): "Genu" "ntel" "ineI" */
+#define signature_INTEL_ebx 0x756e6547
+#define signature_INTEL_ecx 0x6c65746e
+#define signature_INTEL_edx 0x49656e69
+
+//#define DEBUG
+#if defined(DEBUG)
+#include <stdio.h> /* fputs(), printf(); header name restored from usage */
+/* use DEBUG_PRINTF("format %s with any arguments %d but no endline",
+ *                  string, integer );
+ * The macro prefixes the calling function's name and appends the newline.
+ */
+#define DEBUG_PRINTF(...) \
+    do { fputs(__func__, stdout); \
+         fputs(strlen(__func__) > 7 ? ":\t" : ":\t\t", stdout); \
+         printf(__VA_ARGS__); \
+         fputs("\n", stdout); } while (0)
+#else
+/* Expands to a complete (empty) statement so "if (x) DEBUG_PRINTF(...);"
+ * stays well formed and does not trigger empty-body warnings. */
+#define DEBUG_PRINTF(...) do { } while (0)
+#endif
+
+/*
+ * prototypes for the test functions here
+ */
+static int ia_cachesize(void);
+static int ia_unifiedcache(void);
+static int amd_cachesize(void);
+static int ia_cores(void);
+static int amd_cores(void);
+static int is_xcr_set(uint32_t, uint64_t);
+static int is_amd_family(uint32_t, uint32_t *);
+
+/*
+ * Various routines in the runtime libraries are needing to detect what processor type/model/feature
+ * they are running on.  Instead of using relatively heavy weight routines to return that information,
+ * provide a mechanism to cache the data.
+ *
+ * The "X86IDFN(is_<TYPE/MODEL/FEATURE>)" is the routine that is called, cache that info in global
+ * variable X86IDFN(is_<TYPE/MODEL/FEATURE>_cached).
+ *
+ * Use macro IS_X86ID(<
+ */
+
+int X86IDFN(is_intel_cached)      = X86ID_IS_CACHED_UNDEF; /* every flag starts undefined and is overwritten by the matching X86IDFN(is_*)() test */
+int X86IDFN(is_amd_cached)        = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_ip6_cached)        = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_sse_cached)        = X86ID_IS_CACHED_UNDEF; /* instruction-set feature flags follow */
+int X86IDFN(is_sse2_cached)       = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_sse3_cached)       = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_ssse3_cached)      = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_sse4a_cached)      = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_sse41_cached)      = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_sse42_cached)      = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_aes_cached)        = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_avx_cached)        = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_avx2_cached)       = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_avx512_cached)     = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_avx512f_cached)    = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_avx512vl_cached)   = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_fma_cached)        = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_fma4_cached)       = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_ht_cached)         = X86ID_IS_CACHED_UNDEF; /* is_ht() stores the logical processor count here, not just 0/1 */
+int X86IDFN(is_athlon_cached)     = X86ID_IS_CACHED_UNDEF; /* AMD family/model flags follow */
+int X86IDFN(is_hammer_cached)     = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_gh_cached)         = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_gh_a_cached)       = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_gh_b_cached)       = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_shanghai_cached)   = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_istanbul_cached)   = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_bulldozer_cached)  = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_piledriver_cached) = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_k7_cached)         = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_ia32e_cached)      = X86ID_IS_CACHED_UNDEF; /* Intel family/model flags follow */
+int X86IDFN(is_p4_cached)         = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_knl_cached)        = X86ID_IS_CACHED_UNDEF;
+int X86IDFN(is_x86_64_cached)     = X86ID_IS_CACHED_UNDEF; /* long-mode flag, tested for either vendor */
+int X86IDFN(is_f16c_cached)       = X86ID_IS_CACHED_UNDEF;
+
+/*
+ * Return whether extended control register has requested bits set.
+ * Assumes that the processor has the xgetbv instruction.
+ * Return: 0 == register does not have bit(s) set or __pgi_getbv() failed.
+ *         1 == bits set.
+ */
+
+static
+int is_xcr_set(uint32_t xcr_indx, uint64_t xcr_mask)
+{
+  uint64_t xcr;
+
+  /* The 64-bit values are cast to unsigned long long for printing: "%x"
+   * and "%lx" do not match uint64_t on every supported target. */
+  if( __pgi_getbv( xcr_indx, &xcr ) == 0 ) {
+    DEBUG_PRINTF("__pgi_getbv() failed xcr_indx=%#8.8x, "
+        "xcr_mask=%#16.16llx", xcr_indx, (unsigned long long)xcr_mask);
+    return 0;
+  }
+
+  DEBUG_PRINTF("xcr[%u]=%#16.16llx, xcr_mask=%#16.16llx",
+    xcr_indx, (unsigned long long)xcr, (unsigned long long)xcr_mask);
+  /* every requested bit must be present, not just any of them */
+  return (xcr & xcr_mask) == xcr_mask;
+}
+
+/*
+ * is_amd_family(uint32_t family, uint32_t * model)
+ * Return true if processor is AMD and of specific family.
+ * *model receives the 4-bit base model field of CPUID(1).eax, but only
+ * when the processor is AMD and the CPUID query succeeds; on the early
+ * "return 0" path *model is left untouched, so callers must not read it
+ * unless they initialised it themselves.
+ */
+
+static
+int is_amd_family(uint32_t family, uint32_t *model)
+{
+  ACPU1 c1;
+
+  if ((X86IDFN(is_amd)() == 0) || (__pgi_cpuid( 1, c1.i ) == 0)) {
+    return 0;
+  }
+
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c1.i[0], c1.i[1], c1.i[2], c1.i[3] );
+  *model = c1.u.eax.model;
+  return ( c1.u.eax.family == family);
+}
+
+/*
+ * Check that this is a Genuine Intel processor
+ * Compares the three vendor words of CPUID(0) with the "GenuineIntel"
+ * signature and stores the 0/1 answer in the is_intel cache variable.
+ * A failed CPUID query is reported as "not Intel" instead of comparing
+ * uninitialised registers.
+ * Sample DEBUG output recorded by the original author:
+ *   is_intel: eax 0x00000014 ebx 0x756e6547 ecx 0x6c65746e edx 0x49656e69
+ *   is_intel: eax 0x00000014 ebx 0x756e6547 ecx 0x49656e69 edx 0x6c65746e
+ */
+int
+X86IDFN(is_intel)(void)
+{
+  CPU0 c0;
+
+  if( __pgi_cpuid( 0, c0.i ) == 0 )
+    return X86IDFN(is_intel_cached) = 0;
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c0.i[0], c0.i[1], c0.i[2], c0.i[3] );
+  /* XOR of equal words is 0, so the OR is 0 only if all three match */
+  X86IDFN(is_intel_cached) =
+          ((signature_INTEL_ebx ^ c0.i[1]) |
+          (signature_INTEL_ecx ^ c0.i[2]) |
+          (signature_INTEL_edx ^ c0.i[3])) == 0;
+  return X86IDFN(is_intel_cached);
+}/* is_intel */
+
+/*
+ * Check that this is an Authentic AMD processor
+ * Same scheme as is_intel() with the "AuthenticAMD" signature; a failed
+ * CPUID query is reported as "not AMD".
+ */
+int
+X86IDFN(is_amd)(void)
+{
+  CPU0 c0;
+
+  if( __pgi_cpuid( 0, c0.i ) == 0 )
+    return X86IDFN(is_amd_cached) = 0;
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c0.i[0], c0.i[1], c0.i[2], c0.i[3] );
+  X86IDFN(is_amd_cached) =
+          ((signature_AMD_ebx ^ c0.i[1]) |
+          (signature_AMD_ecx ^ c0.i[2]) |
+          (signature_AMD_edx ^ c0.i[3])) == 0;
+  return X86IDFN(is_amd_cached);
+}/* is_amd */
+
+/*
+ * test(p6)
+ *  either manufacturer
+ *  cpuid(1) returns fpu and cmov flag, then must be at least p6
+ *  Returns (and caches) 1 when both flags are set, else 0.
+ */
+int
+X86IDFN(is_ip6)(void)
+{
+  ICPU1 c1;
+
+  if( __pgi_cpuid( 1, c1.i ) == 0 )
+    return X86IDFN(is_ip6_cached) = 0;
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c1.i[0], c1.i[1], c1.i[2], c1.i[3] );
+
+  return X86IDFN(is_ip6_cached) = ( c1.u.edx.fpu && c1.u.edx.cmov );
+}/* is_ip6 */
+
+/*
+ * test(sse)
+ *  call with either AMD or Intel
+ *  test sse bit, same bit for either manufacturer
+ *  Any other vendor, or a failed CPUID query, yields 0.
+ */
+int
+X86IDFN(is_sse)(void)
+{
+  ICPU1 c1;
+  if( !X86IDFN(is_intel)() && !X86IDFN(is_amd)() )
+    return X86IDFN(is_sse_cached) = 0;
+  if( __pgi_cpuid( 1, c1.i ) == 0 )
+    return X86IDFN(is_sse_cached) = 0;
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c1.i[0], c1.i[1], c1.i[2], c1.i[3] );
+  return X86IDFN(is_sse_cached) = ( c1.u.edx.sse != 0);
+}/* is_sse */
+
+/*
+ * test(sse2)
+ *  call with either AMD or Intel
+ *  test sse2 bit, same bit for either manufacturer
+ *  Any other vendor, or a failed CPUID query, yields 0.
+ */
+int
+X86IDFN(is_sse2)(void)
+{
+  ICPU1 c1;
+  if( !X86IDFN(is_intel)() && !X86IDFN(is_amd)() )
+    return X86IDFN(is_sse2_cached) = 0;
+  if( __pgi_cpuid( 1, c1.i ) == 0 )
+    return X86IDFN(is_sse2_cached) = 0;
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c1.i[0], c1.i[1], c1.i[2], c1.i[3] );
+  return X86IDFN(is_sse2_cached) = ( c1.u.edx.sse2 != 0);
+}/* is_sse2 */
+
+/*
+ * test(sse3)
+ *  call with either AMD or Intel
+ *  test sse3 bit of CPUID(1).ecx
+ *  Any other vendor, or a failed CPUID query, yields 0.
+ */
+int
+X86IDFN(is_sse3)(void)
+{
+  ICPU1 c1;
+  if( !X86IDFN(is_intel)() && !X86IDFN(is_amd)() )
+    return X86IDFN(is_sse3_cached) = 0;
+  if( __pgi_cpuid( 1, c1.i ) == 0 )
+    return X86IDFN(is_sse3_cached) = 0;
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c1.i[0], c1.i[1], c1.i[2], c1.i[3] );
+  return X86IDFN(is_sse3_cached) = ( c1.u.ecx.sse3 != 0);
+}/* is_sse3 */
+
+/*
+ * test(ssse3)
+ *  call with either AMD or Intel
+ *  test ssse3 bit, same bit for either manufacturer
+ */
+int
+X86IDFN(is_ssse3)(void)
+{
+  ICPU1 leaf1;
+  int has_ssse3 = 0;
+
+  /* vendor gate first, then leaf 1; any failure leaves the answer at 0 */
+  if (X86IDFN(is_intel)() || X86IDFN(is_amd)()) {
+    if (__pgi_cpuid(1, leaf1.i) != 0) {
+      DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+                   leaf1.i[0], leaf1.i[1], leaf1.i[2], leaf1.i[3]);
+      has_ssse3 = (leaf1.u.ecx.ssse3 != 0);
+    }
+  }
+
+  X86IDFN(is_ssse3_cached) = has_ssse3;
+  return has_ssse3;
+}/* is_ssse3 */
+
+/*
+ * test(sse4a)
+ *  AMD only: the sse4a bit lives in extended leaf 0x80000001, so the
+ *  largest supported extended leaf is checked before it is queried.
+ */
+int
+X86IDFN(is_sse4a)(void)
+{
+  CPU80 ext_base;
+  ACPU81 ext_feat;
+  int has_sse4a = 0;
+
+  if (X86IDFN(is_amd)() &&
+      __pgi_cpuid(0x80000000, ext_base.i) != 0 &&
+      ext_base.b.largest >= 0x80000001 &&
+      __pgi_cpuid(0x80000001, ext_feat.i) != 0) {
+    has_sse4a = (ext_feat.u.ecx.sse4a != 0);
+  }
+
+  X86IDFN(is_sse4a_cached) = has_sse4a;
+  return has_sse4a;
+}/* is_sse4a */
+
+/*
+ * test(sse41)
+ *  Intel only; reports the sse41 bit of CPUID(1).ecx.
+ */
+int
+X86IDFN(is_sse41)(void)
+{
+  ICPU1 leaf1;
+  int has_sse41 = 0;
+
+  if (X86IDFN(is_intel)() && __pgi_cpuid(1, leaf1.i) != 0) {
+    DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+                 leaf1.i[0], leaf1.i[1], leaf1.i[2], leaf1.i[3]);
+    has_sse41 = (leaf1.u.ecx.sse41 != 0);
+  }
+
+  X86IDFN(is_sse41_cached) = has_sse41;
+  return has_sse41;
+}/* is_sse41 */
+
+/*
+ * test(sse42)
+ *  Intel or AMD; reports the sse42 bit of CPUID(1).ecx.
+ */
+int
+X86IDFN(is_sse42)(void)
+{
+  ICPU1 leaf1;
+  int has_sse42 = 0;
+
+  if (X86IDFN(is_intel)() || X86IDFN(is_amd)()) {
+    if (__pgi_cpuid(1, leaf1.i) != 0) {
+      DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+                   leaf1.i[0], leaf1.i[1], leaf1.i[2], leaf1.i[3]);
+      has_sse42 = (leaf1.u.ecx.sse42 != 0);
+    }
+  }
+
+  X86IDFN(is_sse42_cached) = has_sse42;
+  return has_sse42;
+}/* is_sse42 */
+
+/*
+ * test(aes)
+ *  Intel or AMD; reports the aes bit of CPUID(1).ecx.
+ */
+int
+X86IDFN(is_aes)(void)
+{
+  ICPU1 leaf1;
+  int has_aes = 0;
+
+  if (X86IDFN(is_intel)() || X86IDFN(is_amd)()) {
+    if (__pgi_cpuid(1, leaf1.i) != 0) {
+      DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+                   leaf1.i[0], leaf1.i[1], leaf1.i[2], leaf1.i[3]);
+      has_aes = (leaf1.u.ecx.aes != 0);
+    }
+  }
+
+  X86IDFN(is_aes_cached) = has_aes;
+  return has_aes;
+}/* is_aes */
+
+/*
+ * test(avx)
+ *  Intel or AMD.  The CPU must advertise AVX and OSXSAVE, and XCR0 must
+ *  show that the OS saves both the XMM and YMM state.
+ */
+int
+X86IDFN(is_avx)(void)
+{
+  ICPU1 leaf1;
+  int usable = 0;
+
+  if ((X86IDFN(is_intel)() || X86IDFN(is_amd)()) &&
+      __pgi_cpuid(1, leaf1.i) != 0) {
+    DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+                 leaf1.i[0], leaf1.i[1], leaf1.i[2], leaf1.i[3]);
+    /* XCR0 is only consulted once both CPUID bits are present */
+    if (leaf1.u.ecx.avx && leaf1.u.ecx.osxsave) {
+      usable = is_xcr_set(0, xcr0_mask_YMM);
+    }
+  }
+
+  X86IDFN(is_avx_cached) = usable;
+  return usable;
+}/* is_avx */
+
+
+/*
+ * test(avx2)
+ *  Intel or AMD with working AVX; reports the avx2 bit of
+ *  CPUID(7, subleaf 0).ebx.
+ */
+int
+X86IDFN(is_avx2)(void)
+{
+  ICPU7 leaf7;
+  int has_avx2 = 0;
+
+  if ((X86IDFN(is_intel)() || X86IDFN(is_amd)()) &&
+      X86IDFN(is_avx)() &&
+      __pgi_cpuid_ecx(7, leaf7.i, 0) != 0) {
+    DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+                 leaf7.i[0], leaf7.i[1], leaf7.i[2], leaf7.i[3]);
+    has_avx2 = (leaf7.u.ebx.avx2 != 0);
+  }
+
+  X86IDFN(is_avx2_cached) = has_avx2;
+  return has_avx2;
+}/* is_avx2 */
+
+/*
+ * test(avx512)
+ *  Determine whether processor and O/S support AVX512.
+ *  Only reported for GenuineIntel parts with working AVX; the answer is
+ *  whether XCR0 shows the opmask and both ZMM state components enabled.
+ */
+int
+X86IDFN(is_avx512)(void)
+{
+  if( !X86IDFN(is_intel)() )
+    return X86IDFN(is_avx512_cached) = 0;
+
+  if ( !X86IDFN(is_avx)() )
+    return X86IDFN(is_avx512_cached) = 0;
+
+  return X86IDFN(is_avx512_cached) = is_xcr_set(0, xcr0_mask_ZMM);
+}
+
+/*
+ * test(avx512f)
+ *  Requires is_avx512(); reports the avx512f bit of CPUID(7, subleaf 0).ebx.
+ */
+int
+X86IDFN(is_avx512f)(void)
+{
+  ICPU7 c7;
+
+  if ( !X86IDFN(is_avx512)() )
+    return X86IDFN(is_avx512f_cached) = 0;
+  if( __pgi_cpuid_ecx( 7, c7.i, 0 ) == 0 )
+    return X86IDFN(is_avx512f_cached) = 0;
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c7.i[0], c7.i[1], c7.i[2], c7.i[3] );
+  return X86IDFN(is_avx512f_cached) = ( c7.u.ebx.avx512f != 0);
+}/* is_avx512f */
+
+/*
+ * test(avx512vl)
+ *  Requires is_avx512f(); reports the avx512vl bit of CPUID(7, subleaf 0).ebx.
+ */
+int
+X86IDFN(is_avx512vl)(void)
+{
+  ICPU7 c7;
+
+  if( !X86IDFN(is_avx512f)() )
+    return X86IDFN(is_avx512vl_cached) = 0;
+  if( __pgi_cpuid_ecx( 7, c7.i, 0 ) == 0 )
+    return X86IDFN(is_avx512vl_cached) = 0;
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c7.i[0], c7.i[1], c7.i[2], c7.i[3] );
+  return X86IDFN(is_avx512vl_cached) = ( c7.u.ebx.avx512vl != 0);
+}/* is_avx512vl */
+
+/*
+ * test(f16c)
+ *  Intel or AMD with working AVX; reports the f16c bit of CPUID(1).ecx.
+ */
+int
+X86IDFN(is_f16c)(void)
+{
+  ICPU1 c1;
+
+  if ( !X86IDFN(is_intel)() && !X86IDFN(is_amd)() )
+    return X86IDFN(is_f16c_cached) = 0;
+
+  if ( !X86IDFN(is_avx)() )
+    return X86IDFN(is_f16c_cached) = 0;
+
+  if ( __pgi_cpuid( 1, c1.i ) == 0 )
+    return X86IDFN(is_f16c_cached) = 0;
+
+  /* print the registers that were actually read (c1); the previous text
+   * named a non-existent c7 and broke every build with DEBUG defined */
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c1.i[0], c1.i[1], c1.i[2], c1.i[3] );
+
+  return X86IDFN(is_f16c_cached) = (c1.u.ecx.f16c != 0);
+}/* is_f16c */
+
+/*
+ * test(fma)
+ *  Intel or AMD; reports the fma bit of CPUID(1).ecx.
+ */
+int
+X86IDFN(is_fma)(void)
+{
+  ICPU1 c1;
+  if( !X86IDFN(is_intel)() && !X86IDFN(is_amd)() )
+    return X86IDFN(is_fma_cached) = 0;
+  if( __pgi_cpuid( 1, c1.i ) == 0 )
+    return X86IDFN(is_fma_cached) = 0;
+  DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x",
+    c1.i[0], c1.i[1], c1.i[2], c1.i[3] );
+  return X86IDFN(is_fma_cached) = ( c1.u.ecx.fma != 0);
+}/* is_fma */
+
+/*
+ * 
test(fma4) + */ +int +X86IDFN(is_fma4)(void) +{ + CPU80 c80; + ACPU81 c81; + if( !X86IDFN(is_amd)() ) + return X86IDFN(is_fma4_cached) = 0; + if( __pgi_cpuid( 0x80000000, c80.i ) == 0 ) + return X86IDFN(is_fma4_cached) = 0; + if( c80.b.largest < 0x80000001 ) + return X86IDFN(is_fma4_cached) = 0; + if( __pgi_cpuid( 0x80000001, c81.i ) == 0 ) + return X86IDFN(is_fma4_cached) = 0; + return X86IDFN(is_fma4_cached) = ( c81.u.ecx.fma4 != 0); +}/* is_fma4 */ + +/* + * test(ht) + * call with Intel + * test sse3 bit, same bit for either manufacturer + */ +int +X86IDFN(is_ht)(void) +{ + ICPU1 c1; + if( !X86IDFN(is_intel)() ) + return X86IDFN(is_ht_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_ht_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c1.i[0], c1.i[1], c1.i[2], c1.i[3] ); + if( c1.u.edx.htt ) + return X86IDFN(is_ht_cached) = c1.u.ebx.proccount; + return X86IDFN(is_ht_cached) = 0; +}/* is_ht */ + +/* + * test(athlon) + * test AMD + * test family==15, or model == 1,2,4,6 + */ +int +X86IDFN(is_athlon)(void) +{ + ACPU1 c1; + if( !X86IDFN(is_amd)() ) + return X86IDFN(is_athlon_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_athlon_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c1.i[0], c1.i[1], c1.i[2], c1.i[3] ); + if( c1.u.eax.family == 15 ) + return X86IDFN(is_athlon_cached) = 1; + if( c1.u.eax.family != 6 ) + return X86IDFN(is_athlon_cached) = 0; + switch( c1.u.eax.model ){ + case 1 : + case 2 : + case 4 : + case 6 : + return X86IDFN(is_athlon_cached) = 1; + } + return X86IDFN(is_athlon_cached) = 0; +}/* is_athlon */ + +/* + * test(hammer) + * test for AMD + * test for family == 15 + */ +int +X86IDFN(is_hammer)(void) +{ + ACPU1 c1; + if( !X86IDFN(is_amd)() ) + return X86IDFN(is_hammer_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_hammer_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c1.i[0], c1.i[1], c1.i[2], 
c1.i[3] ); + return X86IDFN(is_hammer_cached) = ( c1.u.eax.family == 15 ); +}/* is_hammer */ + +/* + * test(gh) + * test for AMD + * test for family == 16 + */ +int +X86IDFN(is_gh)(void) +{ + ACPU1 c1; + if( !X86IDFN(is_amd)() ) + return X86IDFN(is_gh_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_gh_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c1.i[0], c1.i[1], c1.i[2], c1.i[3] ); + return X86IDFN(is_gh_cached) = ( c1.u.eax.family == 15 && c1.u.eax.extfamily == 1); +}/* is_gh */ + +/* + * test(gh-a) + * test for gh + * test for model == 0 + */ +int +X86IDFN(is_gh_a)(void) +{ + ACPU1 c1; + if( !X86IDFN(is_gh)() ) + return X86IDFN(is_gh_a_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_gh_a_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c1.i[0], c1.i[1], c1.i[2], c1.i[3] ); + return X86IDFN(is_gh_a_cached) = ( c1.u.eax.model == 0 ); +}/* is_gh_a */ + +/* + * test(gh-b) + * test for gh + * test for model == 1 + */ + +/* + * + * Code from rte/pgc/hammer/src/cpuinfo.c + * + * { + * CPUID c1; + * CPUMODEL m1; + * ACPU81 c81; + * + * if (!__pgi_is_gh()) + * return 0; + * + * if (__pgi_cpuid(1, c1.i) == 0) + * return 0; + * + * m1.i = c1.reg.eax; + * + * if (m1.bits.model >= 2) { + * if (__pgi_cpuid(0x80000001, c81.i) == 0) + * return 0; + * if (c81.u.ecx.mas) { + * return 1; + * } + * } + * + * return 0; + * } + */ + +int +X86IDFN(is_gh_b)(void) +{ + ACPU1 c1; + if( !X86IDFN(is_gh)() ) + return X86IDFN(is_gh_b_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_gh_b_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c1.i[0], c1.i[1], c1.i[2], c1.i[3] ); + return X86IDFN(is_gh_b_cached) = ( c1.u.eax.model >= 2 ); +}/* is_gh_b */ + +/* + * test(shanghai) + * test for shanghai + * test for is a gh, and cache size >= 6MB + */ +int +X86IDFN(is_shanghai)(void) +{ + CPU80 c80; + ACPU86 c86; + if( !X86IDFN(is_gh)() ) + return 
X86IDFN(is_shanghai_cached) = 0; + if( __pgi_cpuid( 0x80000000, c80.i ) == 0 ) + return X86IDFN(is_shanghai_cached) = 0; + if( c80.b.largest < 0x80000006U ) + return X86IDFN(is_shanghai_cached) = 0; + if( __pgi_cpuid( 0x80000006U, c86.i ) == 0 ) + return X86IDFN(is_shanghai_cached) = 0; + return X86IDFN(is_shanghai_cached) = ( c86.u.l3cache.size >= 6 ); +}/* is_shanghai */ + +/* + * test(istanbul) + * test for istanbul + * test for is a shanghai, and model > 4 + */ +int +X86IDFN(is_istanbul)(void) +{ + ACPU1 c1; + if( !X86IDFN(is_shanghai)() ) + return X86IDFN(is_istanbul_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_istanbul_cached) = 0; + return X86IDFN(is_istanbul_cached) = ( c1.u.eax.model > 4 ); +}/* is_istanbul */ + + +/* + * test(bulldozer) + * test for bulldozer + * test for family == 21 + */ +int +X86IDFN(is_bulldozer)(void) +{ + ACPU1 c1; + + if ( (X86IDFN(is_amd)() == 0) || (__pgi_cpuid( 1, c1.i ) == 0)) { + return X86IDFN(is_bulldozer_cached) = 0; + } + DEBUG_PRINTF("eax %8.8x ebx %8.8x ecx %8.8x edx %8.8x", + c1.i[0], c1.i[1], c1.i[2], c1.i[3] ); + return X86IDFN(is_bulldozer_cached) = ( c1.u.eax.family == 15 && c1.u.eax.extfamily == 6); +}/* is_bulldozer */ + +/* + * test(piledriver) + * test for bulldozer & fma + */ +int +X86IDFN(is_piledriver)(void) +{ + return X86IDFN(is_piledriver_cached) = ( X86IDFN(is_bulldozer)() && X86IDFN(is_fma)() ); +}/* is_piledriver */ + +/* + * test(k7) + * test AMD + * test family == 6 + */ +int +X86IDFN(is_k7)(void) +{ + ACPU1 c1; + if( !X86IDFN(is_amd)() ) + return X86IDFN(is_k7_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_k7_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c1.i[0], c1.i[1], c1.i[2], c1.i[3] ); + return X86IDFN(is_k7_cached) = ( c1.u.eax.family == 6 ); +}/* is_k7 */ + +/* + * test(ia32e) + * test Intel + * test family == 15 and lm + */ +int +X86IDFN(is_ia32e)(void) +{ + ICPU1 c1; + CPU80 c80; + ICPU81 c81; + if( 
!X86IDFN(is_intel)() ) + return X86IDFN(is_ia32e_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_ia32e_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c1.i[0], c1.i[1], c1.i[2], c1.i[3] ); + if( c1.u.eax.family != 15 ) + return X86IDFN(is_ia32e_cached) = 0; + if( __pgi_cpuid( 0x80000000, c80.i ) == 0 ) + return X86IDFN(is_ia32e_cached) = 0; + DEBUG_PRINTF("eax %#8.8x", c80.i[0] ); + if( c80.b.largest < 0x80000001 ) + return X86IDFN(is_ia32e_cached) = 0; /* no extended flags */ + if( __pgi_cpuid( 0x80000001, c81.i ) == 0 ) + return X86IDFN(is_ia32e_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c81.i[0], c81.i[1], c81.i[2], c81.i[3] ); + return X86IDFN(is_ia32e_cached) = ( c81.u.edx.lm != 0); +}/* is_ia32e */ + +/* + * test(p4) + * test Intel + * test family == 15 + */ +int +X86IDFN(is_p4)(void) +{ + ICPU1 c1; + if( !X86IDFN(is_intel)() ) + return X86IDFN(is_p4_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_p4_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c1.i[0], c1.i[1], c1.i[2], c1.i[3] ); + return X86IDFN(is_p4_cached) = ( c1.u.eax.family == 15 ); +}/* is_p4 */ + +/* + * test(knl) + * test Intel + * test family == 6 && model == 0x57 + */ +int +X86IDFN(is_knl)(void) +{ + ICPU1 c1; + if( !X86IDFN(is_intel)() ) + return X86IDFN(is_knl_cached) = 0; + if( __pgi_cpuid( 1, c1.i ) == 0 ) + return X86IDFN(is_knl_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c1.i[0], c1.i[1], c1.i[2], c1.i[3] ); + if( c1.u.eax.family == 6 ){ + int model = ((int)c1.u.eax.extmodel << 4) + (int)c1.u.eax.model; + return X86IDFN(is_knl_cached) = ( model == 0x57 ); + } + return X86IDFN(is_knl_cached) = 0; +}/* is_knl */ + +/* + * either manufacturer + * test for lm flag in extended features + */ +int +X86IDFN(is_x86_64)(void) +{ + CPU80 c80; + ICPU81 c81; + + if( __pgi_cpuid( 0x80000000, c80.i ) == 0 ) + return 
X86IDFN(is_x86_64_cached) = 0; + DEBUG_PRINTF("eax %#8.8x", c80.i[0] ); + if( c80.b.largest < 0x80000001 ) + return X86IDFN(is_x86_64_cached) = 0; + if( __pgi_cpuid( 0x80000001, c81.i ) == 0 ) + return X86IDFN(is_x86_64_cached) = 0; + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c81.i[0], c81.i[1], c81.i[2], c81.i[3] ); + return X86IDFN(is_x86_64_cached) = ( c81.u.edx.lm != 0); +}/* is_x86_64 */ + +/* + * for Intel processors, the values returned by cpuid(2) + * are an encoding of the cache size, as below; the size is returned in bytes + * other values encode TLB sizes, etc.; any code not listed below returns 0 + */ +static int +ia_cachecode( int code ) +{ + switch( code ){ + case 0x39: + case 0x3b: + case 0x41: + case 0x79: + case 0x81: + return 128*1024; /*"128KB L2 cache"*/ + case 0x3c: + case 0x42: + case 0x7a: + case 0x82: + return 256*1024; /*"256KB L2 cache"*/ + case 0x43: + case 0x7b: + case 0x7f: + case 0x83: + case 0x86: + return 512*1024; /*"512KB L2 cache"*/ + case 0x44: + case 0x7c: + case 0x84: + case 0x87: + return 1024*1024; /*"1MB L2 cache"*/ + case 0x45: + case 0x7d: + case 0x85: + return 2048*1024; /*"2MB L2 cache"*/ + case 0x4e: + return 6*1024*1024; /*"6MB L2 cache"*/ + case 0xe4: + return 8*1024*1024; /*"8MB L3 cache"*/ + } + return 0; +}/* ia_cachecode */ + +/* + * return cache size in bytes for Intel processors, or 0 if it cannot be determined; tries ia_unifiedcache (cpuid 4) first, then cpuid(0x80000006), then the cpuid(2) descriptor bytes + */ +static int +ia_cachesize(void) +{ + CPU0 c0; + ICPU2 c2; + CPU80 c80; + ICPU86 c86; + ICPU4 c4; /* NOTE(review): c4 is not used in this function */ + int i, n, r; + + if( __pgi_cpuid( 0, c0.i ) == 0 ) + return 0; + if (c0.b.largest >= 4) { + r = ia_unifiedcache(); /* NOTE(review): ia_unifiedcache is defined after this function, so a prototype must be visible earlier in the file - confirm */ + if (r) { + return r; + } + } + if( __pgi_cpuid( 0x80000000, c80.i ) == 0 ) + return 0; + DEBUG_PRINTF("eax %#8.8x", c80.i[0] ); + if( c80.b.largest >= 0x80000006 ){ + if( __pgi_cpuid( 0x80000006, c86.i ) ){ + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c86.i[0], c86.i[1], c86.i[2], c86.i[3] ); + return c86.u.ecx.size * 1024; /* the ecx size field is scaled by 1024 to give bytes */ + } + } + + DEBUG_PRINTF("largest=%d", c0.b.largest ); + + if( c0.b.largest < 2 ) + return 0; /* cpuid(2) descriptors are not available */ + + __pgi_cpuid( 2, c2.i ); + 
DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c2.i[0], c2.i[1], c2.i[2], c2.i[3] ); + n = c2.u[0].c1; /* the first byte of eax is used as the number of cpuid(2) rounds to scan */ + while( n-- ){ + for( i = 0; i < 4; ++i ){ + if( c2.u[i].invalid == 0 ){ + if( i > 0 ){ /* 1st byte in eax is something else */ + r = ia_cachecode( c2.u[i].c1 ); + if( r ) + return r; + } + r = ia_cachecode( c2.u[i].c2 ); + if( r ) + return r; + r = ia_cachecode( c2.u[i].c3 ); + if( r ) + return r; + r = ia_cachecode( c2.u[i].c4 ); + if( r ) + return r; + } + } + __pgi_cpuid( 2, c2.i ); /* fetch the next set of descriptor bytes */ + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c2.i[0], c2.i[1], c2.i[2], c2.i[3] ); + } + return 0; /* no descriptor byte matched a known cache code */ +}/* ia_cachesize */ + +static int +ia_unifiedcache(void) { + ICPU4 c4; + int n; /* NOTE(review): n is not used in this function */ + int i; + int r, r2, r3; + /* cache size information available via cpuid(4): scan subleaves 0..3, stop at the first cache type other than 1, 2 or 3, and return the unified level-3 size if nonzero, else the unified level-2 size (0 if neither was seen) */ + + r2 = r3 = 0; + for (i = 0; i <= 3; i++) { + __pgi_cpuid_ecx( 4, c4.i, i ); + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c4.i[0], c4.i[1], c4.i[2], c4.i[3] ); + switch (c4.u.eax.cachetype) { + default: + goto done; + case 1: + /* + printf("Data Cache\n"); + printf("+++ level %d\n", c4.u.eax.cachelevel); + printf("+++ #bytes %d\n", + ( (c4.u.ebx.assoc+1) * + (c4.u.ebx.partitions+1) * + (c4.u.ebx.linesize+1) * + (c4.u.nsets+1) ) ; + ); + */ + break; + case 2: + /* + printf("Instruction Cache\n"); + printf("+++ level %d\n", c4.u.eax.cachelevel); + */ + break; + case 3: + /* + printf("Unified Cache\n"); + printf("+++ level %d\n", c4.u.eax.cachelevel); + printf("+++ #bytes %d\n", + ( (c4.u.ebx.assoc+1) * + (c4.u.ebx.partitions+1) * + (c4.u.ebx.linesize+1) * + (c4.u.nsets+1) ) + ); + */ + r = (c4.u.ebx.assoc+1) * + (c4.u.ebx.partitions+1) * + (c4.u.ebx.linesize+1) * + (c4.u.nsets+1); + if (c4.u.eax.cachelevel == 2) + r2 = r; + else if (c4.u.eax.cachelevel == 3) { + r3 = r; + } + break; + } + } +done: + if (r3) + return r3; + return r2; +} + +/* + * return cache size in bytes for AMD processors, or 0 if cpuid(0x80000006) is unavailable + */ +static int +amd_cachesize(void) +{ + CPU80 c80; + ACPU86 c86; + + if( __pgi_cpuid( 0x80000000U, 
c80.i ) == 0 ) + return 0; + DEBUG_PRINTF("largest=%#8.8x", c80.b.largest ); + if( c80.b.largest < 0x80000006U ) + return 0; + if( __pgi_cpuid( 0x80000006U, c86.i ) == 0 ) + return 0; + if( c86.u.l3cache.size ) { + return c86.u.l3cache.size * 512 * 1024; /* a nonzero L3 size field is scaled by 512KB */ + } + return c86.u.l2cache.size * 1024; /* otherwise fall back to the L2 size field, scaled by 1KB */ +}/* amd_cachesize */ + +/* + * test(cachesize) + * return intel or amd cache size, or 0 for any other vendor + */ +int +X86IDFN(get_cachesize)(void) +{ + if( X86IDFN(is_intel)() ) + return ia_cachesize(); + if( X86IDFN(is_amd)() ) + return amd_cachesize(); + return 0; +}/* get_cachesize */ + +/* + * return cores for Intel processors (ncores field of cpuid(4) subleaf 0, plus one), or 0 if cpuid(4) is unavailable + */ +static int +ia_cores(void) +{ + CPU0 c0; + ICPU4 c4; + int i, n, r; /* NOTE(review): i, n and r are not used in this function */ + + if( __pgi_cpuid( 0, c0.i ) == 0 ) + return 0; + DEBUG_PRINTF("largest=%d", c0.b.largest ); + + if( c0.b.largest < 4 ) + return 0; + + __pgi_cpuid_ecx( 4, c4.i, 0 ); + DEBUG_PRINTF("eax %#8.8x ebx %#8.8x ecx %#8.8x edx %#8.8x", + c4.i[0], c4.i[1], c4.i[2], c4.i[3] ); + return c4.u.eax.ncores + 1; +}/* ia_cores */ + +/* + * return cores for AMD processors (cores field of cpuid(0x80000008) ecx, plus one), or 0 if that leaf is unavailable + */ +static int +amd_cores(void) +{ + CPU80 c80; + ACPU88 c88; + + if( __pgi_cpuid( 0x80000000U, c80.i ) == 0 ) + return 0; + DEBUG_PRINTF("largest=%#8.8x", c80.b.largest ); /* hex, as in amd_cachesize: the value is compared against 0x8000000x leaf numbers */ + if( c80.b.largest < 0x80000008U ) + return 0; + if( __pgi_cpuid( 0x80000008U, c88.i ) == 0 ) + return 0; + return c88.u.ecx.cores + 1; +}/* amd_cores */ + +/* + * test(cpuname) + * return processor name string: NULL if cpuid(0x80000000) fails, an empty string if leaves 0x80000002..0x80000004 are unavailable, otherwise a pointer into the static buffer past any leading spaces + */ +static char processor_name[50]; /* 48 name bytes from three cpuid leaves, then a terminator */ +char * +X86IDFN(get_processor_name)(void) +{ + CPU80 c80; + int i; + if( __pgi_cpuid( 0x80000000, c80.i ) == 0 ) + return 0; + DEBUG_PRINTF("eax %#8.8x", c80.i[0] ); + if( c80.b.largest < 0x80000004 ){ + processor_name[0] = '\0'; + return processor_name; /* no processor name string */ + } + if( __pgi_cpuid( 0x80000002, (unsigned int*)(processor_name+0) ) == 0 ){ + processor_name[0] = '\0'; + return processor_name; /* no processor name string */ + } + if( __pgi_cpuid( 0x80000003, (unsigned int*)(processor_name+16) ) == 0 ){ + 
processor_name[0] = '\0'; + return processor_name; /* no processor name string */ + } + if( __pgi_cpuid( 0x80000004, (unsigned int*)(processor_name+32) ) == 0 ){ + processor_name[0] = '\0'; + return processor_name; /* no processor name string */ + } + processor_name[48] = '\0'; + for( i = 0; i < 48; ++i ){ + if( processor_name[i] != ' ' ) + return processor_name+i; /* skip leading blanks in the name */ + } + return processor_name; +}/* get_processor_name */ diff --git a/runtime/libpgmath/lib/x86_64/x86id.h b/runtime/libpgmath/lib/x86_64/x86id.h index ce3c9a5b431..ac25b227069 100644 --- a/runtime/libpgmath/lib/x86_64/x86id.h +++ b/runtime/libpgmath/lib/x86_64/x86id.h @@ -19,7 +19,8 @@ #define X86ID_H_ #if defined(FOR_LIBPGC) -#define X86IDFN(n) __Cpuid_ ## n +#define X86IDFN_(l,r) l##r /* helper: pastes after the arguments of X86IDFN have been macro-expanded */ +#define X86IDFN(n) X86IDFN_(__Cpuid_,n) #else #define X86IDFN(n) n #endif @@ -28,86 +29,109 @@ #if ! defined(__ASSEMBLER__) +#define IS_CONCAT3_(l,m,r) l##m##r +#define IS_CONCAT3(l,m,r) IS_CONCAT3_(l,m,r) /* builds is_NAME or is_NAME_cached from the feature name passed to IS_X86ID */ + #define IS_X86ID(f) \ - (X86IDFN(f)##_cached != X86ID_IS_CACHED_UNDEF) ? X86IDFN(f)##_cached : \ - X86IDFN(f) + ((X86IDFN(IS_CONCAT3(is_,f,_cached)) != X86ID_IS_CACHED_UNDEF) ? \ + X86IDFN(IS_CONCAT3(is_,f,_cached)) : X86IDFN(IS_CONCAT3(is_,f,))()) /* cached value when already set, otherwise calls the is_NAME test; fully parenthesized so it is safe inside larger expressions */ /* * All the "_cached" varaibles are one of three values: * 1) IS_X86ID_CACHED_UNDEF: not initialized * 2) false (0): initialized and value is false - * 3) true (1): initiialized and value is true + * 3) true (1): initialized and value is true + */ + +/* + * For Non-Windows based builds (Linux, OSX), the extern keyword + * gives the proper attribute for the global variables is_NAME_cached. + * But for Windows, we need to use MS' __declspec attribute. + * When building x86id.c which defines those global variables, we define the + * CPP object macro OBJ_WIN_X8664_IS_X86ID. 
*/ -extern int X86IDFN(is_intel_cached); -extern int X86IDFN(is_amd_cached); -extern int X86IDFN(is_ip6_cached); -extern int X86IDFN(is_sse_cached); -extern int X86IDFN(is_sse2_cached); -extern int X86IDFN(is_sse3_cached); -extern int X86IDFN(is_ssse3_cached); -extern int X86IDFN(is_sse4a_cached); -extern int X86IDFN(is_sse41_cached); -extern int X86IDFN(is_sse42_cached); -extern int X86IDFN(is_aes_cached); -extern int X86IDFN(is_avx_cached); -extern int X86IDFN(is_avx2_cached); -extern int X86IDFN(is_avx512_cached); -extern int X86IDFN(is_avx512f_cached); -extern int X86IDFN(is_avx512vl_cached); -extern int X86IDFN(is_fma_cached); -extern int X86IDFN(is_fma4_cached); -extern int X86IDFN(is_ht_cached); -extern int X86IDFN(is_athlon_cached); -extern int X86IDFN(is_hammer_cached); -extern int X86IDFN(is_gh_cached); -extern int X86IDFN(is_gh_a_cached); -extern int X86IDFN(is_gh_b_cached); -extern int X86IDFN(is_shanghai_cached); -extern int X86IDFN(is_istanbul_cached); -extern int X86IDFN(is_bulldozer_cached); -extern int X86IDFN(is_piledriver_cached); -extern int X86IDFN(is_k7_cached); -extern int X86IDFN(is_ia32e_cached); -extern int X86IDFN(is_p4_cached); -extern int X86IDFN(is_knl_cached); -extern int X86IDFN(is_x86_64_cached); +#if defined (TARGET_WIN_X8664) && defined(_DLL) +# if defined(OBJ_WIN_X8664_IS_X86ID) +# define DECLEXTERN __declspec(dllexport) +# else +# define DECLEXTERN __declspec(dllimport) +# endif +#else +# define DECLEXTERN extern +#endif + +DECLEXTERN int X86IDFN(is_intel_cached); +DECLEXTERN int X86IDFN(is_amd_cached); +DECLEXTERN int X86IDFN(is_ip6_cached); +DECLEXTERN int X86IDFN(is_sse_cached); +DECLEXTERN int X86IDFN(is_sse2_cached); +DECLEXTERN int X86IDFN(is_sse3_cached); +DECLEXTERN int X86IDFN(is_ssse3_cached); +DECLEXTERN int X86IDFN(is_sse4a_cached); +DECLEXTERN int X86IDFN(is_sse41_cached); +DECLEXTERN int X86IDFN(is_sse42_cached); +DECLEXTERN int X86IDFN(is_aes_cached); +DECLEXTERN int X86IDFN(is_avx_cached); +DECLEXTERN int 
X86IDFN(is_avx2_cached); +DECLEXTERN int X86IDFN(is_avx512_cached); +DECLEXTERN int X86IDFN(is_avx512f_cached); +DECLEXTERN int X86IDFN(is_avx512vl_cached); +DECLEXTERN int X86IDFN(is_fma_cached); +DECLEXTERN int X86IDFN(is_fma4_cached); +DECLEXTERN int X86IDFN(is_ht_cached); +DECLEXTERN int X86IDFN(is_athlon_cached); +DECLEXTERN int X86IDFN(is_hammer_cached); +DECLEXTERN int X86IDFN(is_gh_cached); +DECLEXTERN int X86IDFN(is_gh_a_cached); +DECLEXTERN int X86IDFN(is_gh_b_cached); +DECLEXTERN int X86IDFN(is_shanghai_cached); +DECLEXTERN int X86IDFN(is_istanbul_cached); +DECLEXTERN int X86IDFN(is_bulldozer_cached); +DECLEXTERN int X86IDFN(is_piledriver_cached); +DECLEXTERN int X86IDFN(is_k7_cached); +DECLEXTERN int X86IDFN(is_ia32e_cached); +DECLEXTERN int X86IDFN(is_p4_cached); +DECLEXTERN int X86IDFN(is_knl_cached); +DECLEXTERN int X86IDFN(is_x86_64_cached); +DECLEXTERN int X86IDFN(is_f16c_cached); -extern int X86IDFN(is_intel)(void); /* return 0 or 1 */ -extern int X86IDFN(is_amd)(void); /* return 0 or 1 */ -extern int X86IDFN(is_ip6)(void); /* return 0 or 1 */ -extern int X86IDFN(is_sse)(void); /* return 0 or 1 */ -extern int X86IDFN(is_sse2)(void); /* return 0 or 1 */ -extern int X86IDFN(is_sse3)(void); /* return 0 or 1 */ -extern int X86IDFN(is_ssse3)(void); /* return 0 or 1 */ -extern int X86IDFN(is_sse4a)(void); /* return 0 or 1 */ -extern int X86IDFN(is_sse41)(void); /* return 0 or 1 */ -extern int X86IDFN(is_sse42)(void); /* return 0 or 1 */ -extern int X86IDFN(is_aes)(void); /* return 0 or 1 */ -extern int X86IDFN(is_avx)(void); /* return 0 or 1 */ -extern int X86IDFN(is_avx2)(void); /* return 0 or 1 */ -extern int X86IDFN(is_avx512)(void); /* return 0 or 1 */ -extern int X86IDFN(is_avx512f)(void); /* return 0 or 1 */ -extern int X86IDFN(is_avx512vl)(void); /* return 0 or 1 */ -extern int X86IDFN(is_fma)(void); /* return 0 or 1 */ -extern int X86IDFN(is_fma4)(void); /* return 0 or 1 */ -extern int X86IDFN(is_ht)(void); /* return 0 .. 
logical processor count */ -extern int X86IDFN(is_athlon)(void); /* return 0 or 1 */ -extern int X86IDFN(is_hammer)(void); /* return 0 or 1 */ -extern int X86IDFN(is_gh)(void); /* return 0 or 1 */ -extern int X86IDFN(is_gh_a)(void); /* return 0 or 1 */ -extern int X86IDFN(is_gh_b)(void); /* return 0 or 1 */ -extern int X86IDFN(is_shanghai)(void); /* return 0 or 1 */ -extern int X86IDFN(is_istanbul)(void); /* return 0 or 1 */ -extern int X86IDFN(is_bulldozer)(void); /* return 0 or 1 */ -extern int X86IDFN(is_piledriver)(void);/* return 0 or 1 */ -extern int X86IDFN(is_k7)(void); /* return 0 or 1 */ -extern int X86IDFN(is_ia32e)(void); /* return 0 or 1 */ -extern int X86IDFN(is_p4)(void); /* return 0 or 1 */ -extern int X86IDFN(is_knl)(void); /* return 0 or 1 */ -extern int X86IDFN(is_x86_64)(void); /* return 0 or 1 */ -extern int X86IDFN(get_cachesize)(void); -extern char *X86IDFN(get_processor_name)(void); +DECLEXTERN int X86IDFN(is_intel)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_amd)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_ip6)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_sse)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_sse2)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_sse3)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_ssse3)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_sse4a)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_sse41)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_sse42)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_aes)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_avx)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_avx2)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_avx512)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_avx512f)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_avx512vl)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_fma)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_fma4)(void); /* return 0 
or 1 */ +DECLEXTERN int X86IDFN(is_ht)(void); /* return 0 .. logical processor count */ +DECLEXTERN int X86IDFN(is_athlon)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_hammer)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_gh)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_gh_a)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_gh_b)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_shanghai)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_istanbul)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_bulldozer)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_piledriver)(void);/* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_k7)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_ia32e)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_p4)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_knl)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(is_x86_64)(void); /* return 0 or 1 */ +DECLEXTERN int X86IDFN(get_cachesize)(void); /* return the Intel or AMD cache size; 0 for any other vendor */ +DECLEXTERN int X86IDFN(is_f16c)(void); /* presumably return 0 or 1 like the other is_ tests - definition not visible here, confirm */ +DECLEXTERN char *X86IDFN(get_processor_name)(void); /* return the name in a static buffer (may be empty), or NULL if cpuid(0x80000000) fails */ #if !defined(FOR_LIBPGC) extern int get_cores(void); diff --git a/test/f90_correct/lit/runmake b/test/f90_correct/lit/runmake index fc1ce7b2082..05a6acfe9a1 100644 --- a/test/f90_correct/lit/runmake +++ b/test/f90_correct/lit/runmake @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -33,4 +33,4 @@ export PATH=$PATH:$(pwd) make -f $MAKE_FILE HOMEQA=$MAKE_FILE_DIR TEST=$test_name OPT="$FLAGS" build 2>&1 make -f $MAKE_FILE HOMEQA=$MAKE_FILE_DIR TEST=$test_name OPT="$FLAGS" run 2>&1 make -f $MAKE_FILE HOMEQA=$MAKE_FILE_DIR TEST=$test_name OPT="$FLAGS" verify 2>&1 -# CHECK: {{([1-9][0-9]* tests PASSED\. 
0 tests failed|^[[:space:]]*PASS[[:space:]])}} +# CHECK: {{([1-9][0-9]* tests PASSED\. 0 tests failed|^[[:space:]]*PASS[[:space:]]|^[[:space:]]*PASSED[[:space:]]*$)}} diff --git a/test/mp_correct/lit/runmake b/test/mp_correct/lit/runmake index fc1ce7b2082..05a6acfe9a1 100644 --- a/test/mp_correct/lit/runmake +++ b/test/mp_correct/lit/runmake @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -33,4 +33,4 @@ export PATH=$PATH:$(pwd) make -f $MAKE_FILE HOMEQA=$MAKE_FILE_DIR TEST=$test_name OPT="$FLAGS" build 2>&1 make -f $MAKE_FILE HOMEQA=$MAKE_FILE_DIR TEST=$test_name OPT="$FLAGS" run 2>&1 make -f $MAKE_FILE HOMEQA=$MAKE_FILE_DIR TEST=$test_name OPT="$FLAGS" verify 2>&1 -# CHECK: {{([1-9][0-9]* tests PASSED\. 0 tests failed|^[[:space:]]*PASS[[:space:]])}} +# CHECK: {{([1-9][0-9]* tests PASSED\. 0 tests failed|^[[:space:]]*PASS[[:space:]]|^[[:space:]]*PASSED[[:space:]]*$)}} diff --git a/test/openmp_examples/lit/runmake b/test/openmp_examples/lit/runmake index fc1ce7b2082..05a6acfe9a1 100644 --- a/test/openmp_examples/lit/runmake +++ b/test/openmp_examples/lit/runmake @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -33,4 +33,4 @@ export PATH=$PATH:$(pwd) make -f $MAKE_FILE HOMEQA=$MAKE_FILE_DIR TEST=$test_name OPT="$FLAGS" build 2>&1 make -f $MAKE_FILE HOMEQA=$MAKE_FILE_DIR TEST=$test_name OPT="$FLAGS" run 2>&1 make -f $MAKE_FILE HOMEQA=$MAKE_FILE_DIR TEST=$test_name OPT="$FLAGS" verify 2>&1 -# CHECK: {{([1-9][0-9]* tests PASSED\. 
0 tests failed|^[[:space:]]*PASS[[:space:]])}} +# CHECK: {{([1-9][0-9]* tests PASSED\. 0 tests failed|^[[:space:]]*PASS[[:space:]]|^[[:space:]]*PASSED[[:space:]]*$)}}