From e6ead08d93b0c376b6c91a8c9692e5c8d9715e11 Mon Sep 17 00:00:00 2001 From: Griffin Bassman Date: Thu, 20 Jun 2024 11:31:27 -0400 Subject: [PATCH] refactor: option to turn on sse2 optimization (#4666) * refactor: option to turn on sse2 optimization * remove sse2 option * add opt for std sqrt * add opt to ci * add to vw_core * more ci defs * add to more CIs * ; * asan preset * add to valgrind + setup.py * macOS * fix wasm * remove std from wheels * comment * macos version * revert --------- Co-authored-by: Alexey Taymanov <41013086+ataymano@users.noreply.github.com> --- .github/workflows/build_macos.yml | 2 +- .github/workflows/valgrind.yml | 2 +- .github/workflows/vendor_build.yml | 3 ++ .scripts/linux/build-static-java.sh | 2 +- .scripts/linux/build-with-coverage.sh | 2 +- .scripts/linux/build.sh | 2 +- CMakeLists.txt | 1 + CMakePresets.json | 8 +++++ setup.py | 1 + vowpalwabbit/core/CMakeLists.txt | 4 +++ vowpalwabbit/core/src/reductions/gd.cc | 49 +++++++++++++++++++++++++- 11 files changed, 70 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_macos.yml b/.github/workflows/build_macos.yml index 2670782379f..8a32cd59496 100644 --- a/.github/workflows/build_macos.yml +++ b/.github/workflows/build_macos.yml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: brew install cmake boost flatbuffers ninja - name: Configure - run: cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DWARNINGS=Off -DVW_BUILD_VW_C_WRAPPER=Off -DBUILD_TESTING=On -DBUILD_EXPERIMENTAL_BINDING=On -DVW_FEAT_CSV=On -DVW_FEAT_CB_GRAPH_FEEDBACK=On -DVW_INSTALL=Off + run: cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DWARNINGS=Off -DVW_BUILD_VW_C_WRAPPER=Off -DBUILD_TESTING=On -DBUILD_EXPERIMENTAL_BINDING=On -DVW_FEAT_CSV=On -DVW_FEAT_CB_GRAPH_FEEDBACK=On -DVW_INSTALL=Off -DSTD_INV_SQRT=ON - name: Build run: cmake --build build --target all - name: Unit tests diff --git a/.github/workflows/valgrind.yml b/.github/workflows/valgrind.yml index c7c35bda6bc..f24630265cb 100644 --- a/.github/workflows/valgrind.yml +++ b/.github/workflows/valgrind.yml @@ -21,7 +21,7 @@ jobs: submodules: recursive - name: Build C++ VW binary run: | - cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DBUILD_EXPERIMENTAL_BINDING=On -DVW_FEAT_FLATBUFFERS=On -DVW_FEAT_CSV=On -DVW_FEAT_CB_GRAPH_FEEDBACK=On + cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DBUILD_EXPERIMENTAL_BINDING=On -DVW_FEAT_FLATBUFFERS=On -DVW_FEAT_CSV=On -DVW_FEAT_CB_GRAPH_FEEDBACK=On -DSTD_INV_SQRT=ON cmake --build build - name: Upload vw binary uses: actions/upload-artifact@v2 diff --git a/.github/workflows/vendor_build.yml b/.github/workflows/vendor_build.yml index e15b5ae89ab..0d17809b8fe 100644 --- a/.github/workflows/vendor_build.yml +++ b/.github/workflows/vendor_build.yml @@ -49,6 +49,7 @@ jobs: -DWARNINGS=On -DWARNING_AS_ERROR=On -DVW_CXX_STANDARD=17 + -DSTD_INV_SQRT=ON - name: Build run: cmake --build build - name: Unit tests @@ -85,6 +86,7 @@ jobs: -DVW_ZLIB_SYS_DEP=Off -DVW_BOOST_MATH_SYS_DEP=Off -DVW_INSTALL=Off + -DSTD_INV_SQRT=ON - name: Build run: cmake --build "${{ env.CMAKE_BUILD_DIR }}" --config ${{ matrix.build_type }} - name: Test run_tests.py @@ -118,6 +120,7 @@ jobs: -DVW_ZLIB_SYS_DEP=Off -DVW_BOOST_MATH_SYS_DEP=Off -DVW_INSTALL=Off + -DSTD_INV_SQRT=ON - name: Build run: cmake --build build - name: Unit tests diff --git a/.scripts/linux/build-static-java.sh b/.scripts/linux/build-static-java.sh index 21d334cb8fe..826685fb8a5 100755 --- a/.scripts/linux/build-static-java.sh +++ b/.scripts/linux/build-static-java.sh @@ -12,6 +12,6 @@ cd build # /usr/local/bin/gcc + g++ is 9.2.0 version cmake -E env LDFLAGS="-Wl,--exclude-libs,ALL -static-libgcc -static-libstdc++" cmake .. -DCMAKE_BUILD_TYPE=Release -DWARNINGS=Off -DBUILD_JAVA=On -DBUILD_DOCS=Off -DVW_FEAT_FLATBUFFERS=On -DVW_FEAT_CSV=On -DVW_FEAT_CB_GRAPH_FEEDBACK=On\ -DBUILD_PYTHON=Off -DSTATIC_LINK_VW_JAVA=On -DCMAKE_C_COMPILER=/usr/local/bin/gcc -DCMAKE_CXX_COMPILER=/usr/local/bin/g++ \ - -DBUILD_TESTING=Off -DVW_ZLIB_SYS_DEP=Off -DBUILD_SHARED_LIBS=Off -DVW_BUILD_LAS_WITH_SIMD=Off + -DBUILD_TESTING=Off -DVW_ZLIB_SYS_DEP=Off -DBUILD_SHARED_LIBS=Off -DVW_BUILD_LAS_WITH_SIMD=Off -DSTD_INV_SQRT=ON NUM_PROCESSORS=$(nproc) make vw_jni -j ${NUM_PROCESSORS} diff --git a/.scripts/linux/build-with-coverage.sh b/.scripts/linux/build-with-coverage.sh index bac8db23bac..83b4acc49d1 100755 --- a/.scripts/linux/build-with-coverage.sh +++ b/.scripts/linux/build-with-coverage.sh @@ -6,5 +6,5 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" REPO_DIR=$SCRIPT_DIR/../../ cd $REPO_DIR -cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Debug -DVW_GCOV=ON -DWARNINGS=OFF -DBUILD_JAVA=Off -DBUILD_PYTHON=Off -DBUILD_TESTING=On -DVW_FEAT_FLATBUFFERS=On -DVW_FEAT_CSV=On -DVW_FEAT_CB_GRAPH_FEEDBACK=On +cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Debug -DVW_GCOV=ON -DWARNINGS=OFF -DBUILD_JAVA=Off -DBUILD_PYTHON=Off -DBUILD_TESTING=On -DVW_FEAT_FLATBUFFERS=On -DVW_FEAT_CSV=On -DVW_FEAT_CB_GRAPH_FEEDBACK=On -DSTD_INV_SQRT=ON cmake --build build diff --git a/.scripts/linux/build.sh b/.scripts/linux/build.sh index ab886b0a349..50a61fe5d7b 100755 --- a/.scripts/linux/build.sh +++ b/.scripts/linux/build.sh @@ -9,5 +9,5 @@ cd $REPO_DIR # If parameter 1 is not supplied, it defaults to Release BUILD_CONFIGURATION=${1:-Release} -cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=${BUILD_CONFIGURATION} -DWARNINGS=Off -DWARNING_AS_ERROR=On -DVW_BUILD_VW_C_WRAPPER=Off -DBUILD_JAVA=On -DBUILD_PYTHON=Off -DBUILD_TESTING=On -DBUILD_EXPERIMENTAL_BINDING=On -DVW_FEAT_FLATBUFFERS=On -DVW_FEAT_CSV=On -DVW_FEAT_CB_GRAPH_FEEDBACK=On +cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=${BUILD_CONFIGURATION} -DWARNINGS=Off -DWARNING_AS_ERROR=On -DVW_BUILD_VW_C_WRAPPER=Off -DBUILD_JAVA=On -DBUILD_PYTHON=Off -DBUILD_TESTING=On -DBUILD_EXPERIMENTAL_BINDING=On -DVW_FEAT_FLATBUFFERS=On -DVW_FEAT_CSV=On -DVW_FEAT_CB_GRAPH_FEEDBACK=On -DSTD_INV_SQRT=ON cmake --build build --target all diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b373508f90..2b367e7eb20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,6 +178,7 @@ option(VW_BUILD_VW_C_WRAPPER "Enable building the c_wrapper project" ON) option(vw_BUILD_NET_CORE "Build .NET Core targets" OFF) option(vw_BUILD_NET_FRAMEWORK "Build .NET Framework targets" OFF) option(VW_BUILD_WASM "Add WASM target" OFF) +option(STD_INV_SQRT "Use standard library inverse square root" OFF) if(VW_INSTALL AND NOT VW_ZLIB_SYS_DEP) message(WARNING "Installing with a vendored version of zlib is not recommended. Use VW_ZLIB_SYS_DEP to use a system dependency or specify VW_INSTALL=OFF to silence this warning.") diff --git a/CMakePresets.json b/CMakePresets.json index 2c9110de54c..693b834a4a1 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -100,6 +100,10 @@ "VW_FEAT_CB_GRAPH_FEEDBACK": { "type": "BOOL", "value": "On" + }, + "STD_INV_SQRT": { + "type": "BOOL", + "value": "On" } } }, @@ -208,6 +212,10 @@ "VCPKG_TARGET_TRIPLET": { "type": "STRING", "value": "wasm32-emscripten" + }, + "STD_INV_SQRT": { + "type": "BOOL", + "value": "On" } } } diff --git a/setup.py b/setup.py index 5cf49116393..d7cd252db36 100644 --- a/setup.py +++ b/setup.py @@ -82,6 +82,7 @@ def build_cmake(self, ext): "-DBUILD_TESTING=Off", "-DWARNINGS=Off", "-DVW_FEAT_CB_GRAPH_FEEDBACK=On", + "-DSTD_INV_SQRT=On", ] # This doesn't work as expected for Python3.6 and 3.7 on Windows. diff --git a/vowpalwabbit/core/CMakeLists.txt b/vowpalwabbit/core/CMakeLists.txt index db03a2ed0b7..e6107a988fd 100644 --- a/vowpalwabbit/core/CMakeLists.txt +++ b/vowpalwabbit/core/CMakeLists.txt @@ -440,6 +440,10 @@ if (MSVC_IDE) target_sources(vw_core PRIVATE $ ) endif() +if(STD_INV_SQRT) + target_compile_definitions(vw_core PUBLIC STD_INV_SQRT) +endif() + # Clang-cl on Windows has issues with our usage of SIMD types. Turn it off explicitly for Windows + clang-cl to mitigate. # See issue # if(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Clang") diff --git a/vowpalwabbit/core/src/reductions/gd.cc b/vowpalwabbit/core/src/reductions/gd.cc index d98b994306a..8e84e061cf0 100644 --- a/vowpalwabbit/core/src/reductions/gd.cc +++ b/vowpalwabbit/core/src/reductions/gd.cc @@ -136,6 +136,53 @@ void sync_weights(VW::workspace& all) all.sd->contraction = 1.; } +VW_WARNING_STATE_PUSH +VW_WARNING_DISABLE_UNUSED_FUNCTION +inline float quake_inv_sqrt(float x) +{ + // Carmack/Quake/SGI fast method: + float xhalf = 0.5f * x; + static_assert(sizeof(int) == sizeof(float), "Floats and ints are converted between, they must be the same size."); + int i = reinterpret_cast(x); // store floating-point bits in integer + i = 0x5f3759d5 - (i >> 1); // initial guess for Newton's method + x = reinterpret_cast(i); // convert new bits into float + x = x * (1.5f - xhalf * x * x); // One round of Newton's method + return x; +} +VW_WARNING_STATE_POP + +static inline float inv_sqrt(float x) +{ +// Standard library used in CI because SSE2 path has floating point differences in github machines +#if defined(STD_INV_SQRT) + return 1.f / std::sqrt(x); +#endif +#if !defined(VW_NO_INLINE_SIMD) +# if defined(__ARM_NEON__) + // Propagate into vector + float32x2_t v1 = vdup_n_f32(x); + // Estimate + float32x2_t e1 = vrsqrte_f32(v1); + // N-R iteration 1 + float32x2_t e2 = vmul_f32(e1, vrsqrts_f32(v1, vmul_f32(e1, e1))); + // N-R iteration 2 + float32x2_t e3 = vmul_f32(e2, vrsqrts_f32(v1, vmul_f32(e2, e2))); + // Extract result + return vget_lane_f32(e3, 0); +# elif defined(__SSE2__) + __m128 eta = _mm_load_ss(&x); + eta = _mm_rsqrt_ss(eta); + _mm_store_ss(&x, eta); +# else + x = quake_inv_sqrt(x); +# endif +#else + x = quake_inv_sqrt(x); +#endif + + return x; +} + VW_WARNING_STATE_PUSH VW_WARNING_DISABLE_COND_CONST_EXPR template @@ -580,7 +627,7 @@ inline float compute_rate_decay(power_data& s, float& fw) float rate_decay = 1.f; if (adaptive) { - if (sqrt_rate) { rate_decay = 1.0f / std::sqrt(w[adaptive]); } + if (sqrt_rate) { rate_decay = inv_sqrt(w[adaptive]); } else { rate_decay = powf(w[adaptive], s.minus_power_t); } } if VW_STD17_CONSTEXPR (normalized != 0)