From 07b18c5e1b7a8ac9347f945da5ffaecc4515f391 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Sun, 17 Mar 2024 20:00:54 +0100 Subject: [PATCH] [libc++] Optimize ranges::fill{,_n} for vector<bool>::iterator (#84642) ``` ------------------------------------------------------ Benchmark old new ------------------------------------------------------ bm_ranges_fill_n/1 1.64 ns 3.06 ns bm_ranges_fill_n/2 3.45 ns 3.06 ns bm_ranges_fill_n/3 4.88 ns 3.06 ns bm_ranges_fill_n/4 6.46 ns 3.06 ns bm_ranges_fill_n/5 8.03 ns 3.06 ns bm_ranges_fill_n/6 9.65 ns 3.07 ns bm_ranges_fill_n/7 11.5 ns 3.06 ns bm_ranges_fill_n/8 13.0 ns 3.06 ns bm_ranges_fill_n/16 25.9 ns 3.06 ns bm_ranges_fill_n/64 103 ns 4.62 ns bm_ranges_fill_n/512 711 ns 4.40 ns bm_ranges_fill_n/4096 5642 ns 9.86 ns bm_ranges_fill_n/32768 45135 ns 33.6 ns bm_ranges_fill_n/262144 360818 ns 243 ns bm_ranges_fill_n/1048576 1442828 ns 982 ns bm_ranges_fill/1 1.63 ns 3.17 ns bm_ranges_fill/2 3.43 ns 3.28 ns bm_ranges_fill/3 4.97 ns 3.31 ns bm_ranges_fill/4 6.53 ns 3.27 ns bm_ranges_fill/5 8.12 ns 3.33 ns bm_ranges_fill/6 9.76 ns 3.32 ns bm_ranges_fill/7 11.6 ns 3.29 ns bm_ranges_fill/8 13.2 ns 3.26 ns bm_ranges_fill/16 26.3 ns 3.26 ns bm_ranges_fill/64 104 ns 4.92 ns bm_ranges_fill/512 716 ns 4.47 ns bm_ranges_fill/4096 5772 ns 8.21 ns bm_ranges_fill/32768 45778 ns 33.1 ns bm_ranges_fill/262144 351422 ns 241 ns bm_ranges_fill/1048576 1404710 ns 965 ns ``` --- libcxx/benchmarks/CMakeLists.txt | 1 + libcxx/benchmarks/algorithms/fill.bench.cpp | 49 +++++++ libcxx/docs/ReleaseNotes/19.rst | 2 + libcxx/include/__algorithm/fill_n.h | 58 +++++++++ libcxx/include/__bit_reference | 59 +-------- .../vector/robust_against_adl.pass.cpp | 3 +- .../alg.fill/fill.pass.cpp | 121 +++++++++++------- .../alg.nonmodifying/alg.count/count.pass.cpp | 2 +- .../alg.count/ranges.count.pass.cpp | 2 +- 9 files changed, 192 insertions(+), 105 deletions(-) create mode 100644 libcxx/benchmarks/algorithms/fill.bench.cpp diff --git 
a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt index b436e96f178b70..3dec6faea13a0c 100644 --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -176,6 +176,7 @@ set(BENCHMARK_TESTS algorithms/count.bench.cpp algorithms/equal.bench.cpp algorithms/find.bench.cpp + algorithms/fill.bench.cpp algorithms/for_each.bench.cpp algorithms/lower_bound.bench.cpp algorithms/make_heap.bench.cpp diff --git a/libcxx/benchmarks/algorithms/fill.bench.cpp b/libcxx/benchmarks/algorithms/fill.bench.cpp new file mode 100644 index 00000000000000..40f37425c394cf --- /dev/null +++ b/libcxx/benchmarks/algorithms/fill.bench.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <algorithm> +#include <benchmark/benchmark.h> +#include <vector> + +static void bm_fill_n(benchmark::State& state) { + std::vector<bool> vec1(state.range()); + for (auto _ : state) { + benchmark::DoNotOptimize(vec1); + benchmark::DoNotOptimize(std::fill_n(vec1.begin(), vec1.size(), false)); + } +} +BENCHMARK(bm_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20); + +static void bm_ranges_fill_n(benchmark::State& state) { + std::vector<bool> vec1(state.range()); + for (auto _ : state) { + benchmark::DoNotOptimize(vec1); + benchmark::DoNotOptimize(std::ranges::fill_n(vec1.begin(), vec1.size(), false)); + } +} +BENCHMARK(bm_ranges_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20); + +static void bm_fill(benchmark::State& state) { + std::vector<bool> vec1(state.range()); + for (auto _ : state) { + benchmark::DoNotOptimize(vec1); + std::fill(vec1.begin(), vec1.end(), false); + } +} +BENCHMARK(bm_fill)->DenseRange(1, 8)->Range(16, 1 << 20); + +static void bm_ranges_fill(benchmark::State& state) { + 
std::vector<bool> vec1(state.range()); + for (auto _ : state) { + benchmark::DoNotOptimize(vec1); + benchmark::DoNotOptimize(std::ranges::fill(vec1, false)); + } +} +BENCHMARK(bm_ranges_fill)->DenseRange(1, 8)->Range(16, 1 << 20); + +BENCHMARK_MAIN(); diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index 2b62a36ca8e5cd..c70ae477fafc1d 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -49,6 +49,8 @@ Improvements and New Features ----------------------------- - The performance of growing ``std::vector`` has been improved for trivially relocatable types. +- The performance of ``ranges::fill`` and ``ranges::fill_n`` has been improved for ``vector<bool>::iterator``\s, + resulting in a performance increase of up to 1400x. Deprecations and Removals ------------------------- diff --git a/libcxx/include/__algorithm/fill_n.h b/libcxx/include/__algorithm/fill_n.h index 36f3349d9e7a37..f29633f88087f0 100644 --- a/libcxx/include/__algorithm/fill_n.h +++ b/libcxx/include/__algorithm/fill_n.h @@ -9,18 +9,74 @@ #ifndef _LIBCPP___ALGORITHM_FILL_N_H #define _LIBCPP___ALGORITHM_FILL_N_H +#include <__algorithm/min.h> #include <__config> +#include <__fwd/bit_reference.h> #include <__iterator/iterator_traits.h> +#include <__memory/pointer_traits.h> #include <__utility/convert_to_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD // fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset. 
+template <class _OutputIterator, class _Size, class _Tp> +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value); + +template <bool _FillVal, class _Cp> +_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void +__fill_n_bool(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { + using _It = __bit_iterator<_Cp, false>; + using __storage_type = typename _It::__storage_type; + + const int __bits_per_word = _It::__bits_per_word; + // do first partial word + if (__first.__ctz_ != 0) { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = std::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + if (_FillVal) + *__first.__seg_ |= __m; + else + *__first.__seg_ &= ~__m; + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + __storage_type __nw = __n / __bits_per_word; + std::__fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? 
static_cast<__storage_type>(-1) : 0); + __n -= __nw * __bits_per_word; + // do last partial word + if (__n > 0) { + __first.__seg_ += __nw; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if (_FillVal) + *__first.__seg_ |= __m; + else + *__first.__seg_ &= ~__m; + } +} + +template <class _Cp, class _Size> +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> +__fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) { + if (__n > 0) { + if (__value) + std::__fill_n_bool<true>(__first, __n); + else + std::__fill_n_bool<false>(__first, __n); + } + return __first + __n; +} + template <class _OutputIterator, class _Size, class _Tp> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) { @@ -37,4 +93,6 @@ fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_FILL_N_H diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference index 3a5339b72ddc31..9579b9eaf70bbd 100644 --- a/libcxx/include/__bit_reference +++ b/libcxx/include/__bit_reference @@ -171,61 +171,6 @@ private: __bit_const_reference& operator=(const __bit_const_reference&) = delete; }; -// fill_n - -template <bool _FillVal, class _Cp> -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void -__fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { - using _It = __bit_iterator<_Cp, false>; - using __storage_type = typename _It::__storage_type; - - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = std::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - if (_FillVal) - *__first.__seg_ |= __m; - else - *__first.__seg_ &= ~__m; - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - 
__storage_type __nw = __n / __bits_per_word; - std::fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0); - __n -= __nw * __bits_per_word; - // do last partial word - if (__n > 0) { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (_FillVal) - *__first.__seg_ |= __m; - else - *__first.__seg_ &= ~__m; - } -} - -template <class _Cp> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value) { - if (__n > 0) { - if (__value) - std::__fill_n<true>(__first, __n); - else - std::__fill_n<false>(__first, __n); - } -} - -// fill - -template <class _Cp> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value) { - std::fill_n(__first, static_cast<typename _Cp::size_type>(__last - __first), __value); -} - // copy template <class _Cp, bool _IsConst> @@ -1007,8 +952,10 @@ private: friend class __bit_iterator<_Cp, true>; template <class _Dp> friend struct __bit_array; + template <bool _FillVal, class _Dp> - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void __fill_n(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); + _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void + __fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); template <class _Dp, bool _IC> _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_aligned( diff --git a/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp index 83f90ac4184bf1..9c780ae98d1e84 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp @@ -31,7 +31,8 @@ struct MyAlloc { int main(int, char**) { std::vector<bool, MyAlloc<bool>> vb; - std::vector<bool, MyAlloc<bool>> wb(100); + // std::fill_n triggers ADL because __bit_iterator has the container type as a template argument + // std::vector<bool, MyAlloc<bool>> wb(100); 
std::vector<int, MyAlloc<int>> v; std::vector<int, MyAlloc<int>> w(100); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp index da56ec30f128b1..481d565961b2b5 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp @@ -14,62 +14,91 @@ // fill(Iter first, Iter last, const T& value); #include <algorithm> +#include <array> #include <cassert> +#include <vector> #include "test_macros.h" #include "test_iterators.h" -#if TEST_STD_VER > 17 -TEST_CONSTEXPR bool test_constexpr() { - int ia[] = {0, 1, 2, 3, 4}; - - std::fill(std::begin(ia), std::end(ia), 5); +template <class Iter, class Container> +TEST_CONSTEXPR_CXX20 void +test(Container in, size_t from, size_t to, typename Container::value_type value, Container expected) { + std::fill(Iter(in.data() + from), Iter(in.data() + to), value); + assert(in == expected); +} - return std::all_of(std::begin(ia), std::end(ia), [](int a) {return a == 5; }) - ; +template <class T> +struct Test { + template <class Iter> + TEST_CONSTEXPR_CXX20 void operator()() { + { + std::array<T, 4> in = {1, 2, 3, 4}; + std::array<T, 4> expected = {5, 5, 5, 5}; + test<Iter>(in, 0, 4, 5, expected); } -#endif - -template <class Iter> -void -test_char() -{ - const unsigned n = 4; - char ca[n] = {0}; - std::fill(Iter(ca), Iter(ca+n), char(1)); - assert(ca[0] == 1); - assert(ca[1] == 1); - assert(ca[2] == 1); - assert(ca[3] == 1); -} + { + std::array<T, 4> in = {1, 2, 3, 4}; + std::array<T, 4> expected = {1, 5, 5, 4}; + test<Iter>(in, 1, 3, 5, expected); + } + } +}; -template <class Iter> -void -test_int() -{ - const unsigned n = 4; - int ia[n] = {0}; - std::fill(Iter(ia), Iter(ia+n), 1); - assert(ia[0] == 1); - assert(ia[1] == 1); - assert(ia[2] == 1); - assert(ia[3] == 1); +TEST_CONSTEXPR_CXX20 bool test() { + types::for_each(types::forward_iterator_list<char*>(), Test<char>()); + types::for_each(types::forward_iterator_list<int*>(), Test<int>()); + { // test vector<bool>::iterator optimization + { // simple case + std::vector<bool> in(4, false); + 
std::vector<bool> expected(4, true); + std::fill(in.begin(), in.end(), true); + assert(in == expected); + } + { // partial byte in the front is not filled + std::vector<bool> in(8, false); + std::vector<bool> expected(8, true); + expected[0] = false; + expected[1] = false; + std::fill(in.begin() + 2, in.end(), true); + assert(in == expected); + } + { // partial byte in the back is not filled + std::vector<bool> in(8, false); + std::vector<bool> expected(8, true); + expected[6] = false; + expected[7] = false; + std::fill(in.begin(), in.end() - 2, true); + assert(in == expected); + } + { // partial byte in the front and back is not filled + std::vector<bool> in(16, false); + std::vector<bool> expected(16, true); + expected[0] = false; + expected[1] = false; + expected[14] = false; + expected[15] = false; + std::fill(in.begin() + 2, in.end() - 2, true); + assert(in == expected); + } + { // only a few bits of a byte are set + std::vector<bool> in(8, false); + std::vector<bool> expected(8, true); + expected[0] = false; + expected[1] = false; + expected[6] = false; + expected[7] = false; + std::fill(in.begin() + 2, in.end() - 2, true); + assert(in == expected); + } + } + return true; } -int main(int, char**) -{ - test_char<forward_iterator<char*> >(); - test_char<bidirectional_iterator<char*> >(); - test_char<random_access_iterator<char*> >(); - test_char<char*>(); - - test_int<forward_iterator<int*> >(); - test_int<bidirectional_iterator<int*> >(); - test_int<random_access_iterator<int*> >(); - test_int<int*>(); - -#if TEST_STD_VER > 17 - static_assert(test_constexpr()); +int main(int, char**) { + test(); +#if TEST_STD_VER >= 20 + static_assert(test()); #endif return 0; diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp index 904100c1cf0bb1..7654a4b0c7f007 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp @@ -14,7 +14,7 @@ // count(Iter first, Iter last, const T& value); // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000 -// 
ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=70000000 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000 #include <algorithm> #include <cassert> diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp index b17272ea90cddd..b6631add7e48a7 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp @@ -11,7 +11,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000 -// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=70000000 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000 // template<input_iterator I, sentinel_for<I> S, class T, class Proj = identity> // requires indirect_binary_predicate<ranges::equal_to, projected<I, Proj>, const T*>