From 30dc0845d0423ac23c92fc411f6e15695b514211 Mon Sep 17 00:00:00 2001
From: Kimball Thurston <kdt3rd@gmail.com>
Date: Tue, 31 Dec 2024 22:03:45 +1300
Subject: [PATCH] use better define to identify msvc

also propagate cross platform function fixes to test suite

Signed-off-by: Kimball Thurston <kdt3rd@gmail.com>
---
 src/lib/OpenEXRCore/internal_dwa_encoder.h    |  2 +-
 .../OpenEXRCoreTest/compressionTables.cpp     | 70 +++++++++++++------
 2 files changed, 50 insertions(+), 22 deletions(-)

diff --git a/src/lib/OpenEXRCore/internal_dwa_encoder.h b/src/lib/OpenEXRCore/internal_dwa_encoder.h
index cb05c955a..2ad6c5ed7 100644
--- a/src/lib/OpenEXRCore/internal_dwa_encoder.h
+++ b/src/lib/OpenEXRCore/internal_dwa_encoder.h
@@ -28,7 +28,7 @@
 #endif
 
 #ifndef USE_CLZ
-#    ifdef _WIN32
+#    ifdef _MSC_VER
 static int __inline __builtin_clz(uint32_t v)
 {
 #ifdef __BMI1__
diff --git a/src/test/OpenEXRCoreTest/compressionTables.cpp b/src/test/OpenEXRCoreTest/compressionTables.cpp
index 9237cbb45..5cd5b0ec5 100644
--- a/src/test/OpenEXRCoreTest/compressionTables.cpp
+++ b/src/test/OpenEXRCoreTest/compressionTables.cpp
@@ -41,6 +41,7 @@
 
 #if defined(_WIN32)
 #include <windows.h>
+#include <intrin.h>
 #endif
 
 #if defined(OPENEXR_ENABLE_API_VISIBILITY)
@@ -509,11 +510,32 @@ testDWATable (const std::string&)
 #    if __has_builtin(__builtin_popcount)
 #        define USE_POPCOUNT 1
 #    endif
+#    if __has_builtin(__builtin_clz)
+#        define USE_CLZ 1
+#    endif
 #endif
 #ifndef USE_POPCOUNT
 #    define USE_POPCOUNT 0
 #endif
 
+#ifndef USE_CLZ
+#    ifdef _WIN32
+static int __inline __builtin_clz(uint32_t v)
+{
+#ifdef __BMI1__
+    return __lzcnt(v);
+#else
+    unsigned long r;
+    _BitScanReverse(&r, v);
+    return 31 - r;
+#endif
+}
+#        define USE_CLZ 1
+#    else
+#        define USE_CLZ 0
+#    endif
+#endif
+
 #if USE_POPCOUNT
 static inline int
 countSetBits (uint16_t src)
@@ -521,28 +543,34 @@ countSetBits (uint16_t src)
     return __builtin_popcount (src);
 }
 #else
-static inline uint8_t
-countSetBits8 (uint8_t src)
+// courtesy hacker's delight
+static inline int countSetBits(uint32_t x)
 {
-    static const int8_t numBitsSet[256] = {
-        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
-        2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
-        2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
-        4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
-        3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
-        4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-        4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
-    return numBitsSet[src];
+    uint64_t y;
+    y = x * 0x0002000400080010ULL;
+    y = y & 0x1111111111111111ULL;
+    y = y * 0x1111111111111111ULL;
+    y = y >> 60;
+    return y;
 }
+#endif
 
-static inline uint8_t
-countSetBits (uint16_t src)
+#if USE_CLZ
+static inline int
+countLeadingZeros(uint16_t src)
+{
+    return __builtin_clz (src);
+}
+#else
+// courtesy hacker's delight
+static int inline countLeadingZeros( uint32_t x )
 {
-    return countSetBits8 (src & 0xff) + countSetBits8 (src >> 8);
+    x |= (x >> 1);
+    x |= (x >> 2);
+    x |= (x >> 4);
+    x |= (x >> 8);
+    x |= (x >> 16);
+    return 32 - countSetBits(x);
 }
 #endif
 
@@ -597,7 +625,7 @@ countSetBits (uint16_t src)
 static uint16_t handleQuantizeDenormTol (
     uint16_t abssrc, uint16_t tolSig, float errTol, float srcFloat, int doprint)
 {
-    const uint16_t tsigshift = (32 - __builtin_clz (tolSig));
+    const uint16_t tsigshift = (32 - countLeadingZeros (tolSig));
     const uint16_t npow2 = (1 << tsigshift);
     const uint16_t lowermask = npow2 - 1;
     const uint16_t mask = ~lowermask;
@@ -642,7 +670,7 @@ static uint16_t handleQuantizeGeneric (
     // classic would do clz(significand - 1) but here we are trying to
     // construct a mask, so want to ensure for an power of 2, we
     // actually get the next (i.e. 2 returns 4)
-    const uint16_t tsigshift = (32 - __builtin_clz (tolSig));
+    const uint16_t tsigshift = (32 - countLeadingZeros (tolSig));
     const uint16_t npow2 = (1 << tsigshift);
     const uint16_t lowermask = npow2 - 1;
     const uint16_t mask = ~lowermask;
@@ -1083,7 +1111,7 @@ static uint16_t handleQuantizeDefault (
     // classic would do clz(significand - 1) but here we are trying to
     // construct a mask, so want to ensure for an power of 2, we
     // actually get the next (i.e. 2 returns 4)
-    const uint16_t tsigshift = (32 - __builtin_clz (tolSig));
+    const uint16_t tsigshift = (32 - countLeadingZeros (tolSig));
     const uint16_t npow2 = (1 << tsigshift);
     const uint16_t lowermask = npow2 - 1;
     const uint16_t mask = ~lowermask;