From 07b18c5e1b7a8ac9347f945da5ffaecc4515f391 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Sun, 17 Mar 2024 20:00:54 +0100
Subject: [PATCH] [libc++] Optimize ranges::fill{,_n} for
 vector<bool>::iterator (#84642)

```
------------------------------------------------------
Benchmark                          old             new
------------------------------------------------------
bm_ranges_fill_n/1             1.64 ns         3.06 ns
bm_ranges_fill_n/2             3.45 ns         3.06 ns
bm_ranges_fill_n/3             4.88 ns         3.06 ns
bm_ranges_fill_n/4             6.46 ns         3.06 ns
bm_ranges_fill_n/5             8.03 ns         3.06 ns
bm_ranges_fill_n/6             9.65 ns         3.07 ns
bm_ranges_fill_n/7             11.5 ns         3.06 ns
bm_ranges_fill_n/8             13.0 ns         3.06 ns
bm_ranges_fill_n/16            25.9 ns         3.06 ns
bm_ranges_fill_n/64             103 ns         4.62 ns
bm_ranges_fill_n/512            711 ns         4.40 ns
bm_ranges_fill_n/4096          5642 ns         9.86 ns
bm_ranges_fill_n/32768        45135 ns         33.6 ns
bm_ranges_fill_n/262144      360818 ns          243 ns
bm_ranges_fill_n/1048576    1442828 ns          982 ns
bm_ranges_fill/1               1.63 ns         3.17 ns
bm_ranges_fill/2               3.43 ns         3.28 ns
bm_ranges_fill/3               4.97 ns         3.31 ns
bm_ranges_fill/4               6.53 ns         3.27 ns
bm_ranges_fill/5               8.12 ns         3.33 ns
bm_ranges_fill/6               9.76 ns         3.32 ns
bm_ranges_fill/7               11.6 ns         3.29 ns
bm_ranges_fill/8               13.2 ns         3.26 ns
bm_ranges_fill/16              26.3 ns         3.26 ns
bm_ranges_fill/64               104 ns         4.92 ns
bm_ranges_fill/512              716 ns         4.47 ns
bm_ranges_fill/4096            5772 ns         8.21 ns
bm_ranges_fill/32768          45778 ns         33.1 ns
bm_ranges_fill/262144        351422 ns          241 ns
bm_ranges_fill/1048576      1404710 ns          965 ns
```
---
 libcxx/benchmarks/CMakeLists.txt              |   1 +
 libcxx/benchmarks/algorithms/fill.bench.cpp   |  49 +++++++
 libcxx/docs/ReleaseNotes/19.rst               |   2 +
 libcxx/include/__algorithm/fill_n.h           |  58 +++++++++
 libcxx/include/__bit_reference                |  59 +--------
 .../vector/robust_against_adl.pass.cpp        |   3 +-
 .../alg.fill/fill.pass.cpp                    | 121 +++++++++++-------
 .../alg.nonmodifying/alg.count/count.pass.cpp |   2 +-
 .../alg.count/ranges.count.pass.cpp           |   2 +-
 9 files changed, 192 insertions(+), 105 deletions(-)
 create mode 100644 libcxx/benchmarks/algorithms/fill.bench.cpp
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index b436e96f178b70..3dec6faea13a0c 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -176,6 +176,7 @@ set(BENCHMARK_TESTS
     algorithms/count.bench.cpp
     algorithms/equal.bench.cpp
     algorithms/find.bench.cpp
+    algorithms/fill.bench.cpp
     algorithms/for_each.bench.cpp
     algorithms/lower_bound.bench.cpp
     algorithms/make_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/fill.bench.cpp b/libcxx/benchmarks/algorithms/fill.bench.cpp
new file mode 100644
index 00000000000000..40f37425c394cf
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/fill.bench.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <vector>
+
+static void bm_fill_n(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::fill_n(vec1.begin(), vec1.size(), false));
+  }
+}
+BENCHMARK(bm_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_ranges_fill_n(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::ranges::fill_n(vec1.begin(), vec1.size(), false));
+  }
+}
+BENCHMARK(bm_ranges_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_fill(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    std::fill(vec1.begin(), vec1.end(), false);
+  }
+}
+BENCHMARK(bm_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_ranges_fill(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::ranges::fill(vec1, false));
+  }
+}
+BENCHMARK(bm_ranges_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 2b62a36ca8e5cd..c70ae477fafc1d 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -49,6 +49,8 @@ Improvements and New Features
 -----------------------------
 
 - The performance of growing ``std::vector`` has been improved for trivially relocatable types.
+- The performance of ``ranges::fill`` and ``ranges::fill_n`` has been improved for ``vector<bool>::iterator``\s,
+  resulting in a performance increase of up to 1400x.
 
 Deprecations and Removals
 -------------------------
diff --git a/libcxx/include/__algorithm/fill_n.h b/libcxx/include/__algorithm/fill_n.h
index 36f3349d9e7a37..f29633f88087f0 100644
--- a/libcxx/include/__algorithm/fill_n.h
+++ b/libcxx/include/__algorithm/fill_n.h
@@ -9,18 +9,74 @@
 #ifndef _LIBCPP___ALGORITHM_FILL_N_H
 #define _LIBCPP___ALGORITHM_FILL_N_H
 
+#include <__algorithm/min.h>
 #include <__config>
+#include <__fwd/bit_reference.h>
 #include <__iterator/iterator_traits.h>
+#include <__memory/pointer_traits.h>
 #include <__utility/convert_to_integral.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 // fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
 
+template <class _OutputIterator, class _Size, class _Tp>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value);
+
+template <bool _FillVal, class _Cp>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
+__fill_n_bool(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
+  using _It            = __bit_iterator<_Cp, false>;
+  using __storage_type = typename _It::__storage_type;
+
+  const int __bits_per_word = _It::__bits_per_word;
+  // do first partial word
+  if (__first.__ctz_ != 0) {
+    __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
+    __storage_type __dn    = std::min(__clz_f, __n);
+    __storage_type __m     = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+    if (_FillVal)
+      *__first.__seg_ |= __m;
+    else
+      *__first.__seg_ &= ~__m;
+    __n -= __dn;
+    ++__first.__seg_;
+  }
+  // do middle whole words
+  __storage_type __nw = __n / __bits_per_word;
+  std::__fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
+  __n -= __nw * __bits_per_word;
+  // do last partial word
+  if (__n > 0) {
+    __first.__seg_ += __nw;
+    __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+    if (_FillVal)
+      *__first.__seg_ |= __m;
+    else
+      *__first.__seg_ &= ~__m;
+  }
+}
+
+template <class _Cp, class _Size>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false>
+__fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) {
+  if (__n > 0) {
+    if (__value)
+      std::__fill_n_bool<true>(__first, __n);
+    else
+      std::__fill_n_bool<false>(__first, __n);
+  }
+  return __first + __n;
+}
+
 template <class _OutputIterator, class _Size, class _Tp>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
 __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
@@ -37,4 +93,6 @@ fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
 
 _LIBCPP_END_NAMESPACE_STD
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_FILL_N_H
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 3a5339b72ddc31..9579b9eaf70bbd 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -171,61 +171,6 @@ private:
   __bit_const_reference& operator=(const __bit_const_reference&) = delete;
 };
 
-// fill_n
-
-template <bool _FillVal, class _Cp>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
-__fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
-  using _It            = __bit_iterator<_Cp, false>;
-  using __storage_type = typename _It::__storage_type;
-
-  const int __bits_per_word = _It::__bits_per_word;
-  // do first partial word
-  if (__first.__ctz_ != 0) {
-    __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
-    __storage_type __dn    = std::min(__clz_f, __n);
-    __storage_type __m     = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
-    if (_FillVal)
-      *__first.__seg_ |= __m;
-    else
-      *__first.__seg_ &= ~__m;
-    __n -= __dn;
-    ++__first.__seg_;
-  }
-  // do middle whole words
-  __storage_type __nw = __n / __bits_per_word;
-  std::fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
-  __n -= __nw * __bits_per_word;
-  // do last partial word
-  if (__n > 0) {
-    __first.__seg_ += __nw;
-    __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
-    if (_FillVal)
-      *__first.__seg_ |= __m;
-    else
-      *__first.__seg_ &= ~__m;
-  }
-}
-
-template <class _Cp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value) {
-  if (__n > 0) {
-    if (__value)
-      std::__fill_n<true>(__first, __n);
-    else
-      std::__fill_n<false>(__first, __n);
-  }
-}
-
-// fill
-
-template <class _Cp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value) {
-  std::fill_n(__first, static_cast<typename _Cp::size_type>(__last - __first), __value);
-}
-
 // copy
 
 template <class _Cp, bool _IsConst>
@@ -1007,8 +952,10 @@ private:
   friend class __bit_iterator<_Cp, true>;
   template <class _Dp>
   friend struct __bit_array;
+
   template <bool _FillVal, class _Dp>
-  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void __fill_n(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
+  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void
+  __fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
 
   template <class _Dp, bool _IC>
   _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_aligned(
diff --git a/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp
index 83f90ac4184bf1..9c780ae98d1e84 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp
@@ -31,7 +31,8 @@ struct MyAlloc {
 int main(int, char**)
 {
     std::vector<bool, MyAlloc<bool>> vb;
-    std::vector<bool, MyAlloc<bool>> wb(100);
+    // std::fill_n triggers ADL because __bit_iterator has the container type as a template argument
+    // std::vector<bool, MyAlloc<bool>> wb(100);
 
     std::vector<int, MyAlloc<int>> v;
     std::vector<int, MyAlloc<int>> w(100);
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
index da56ec30f128b1..481d565961b2b5 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
@@ -14,62 +14,91 @@
 //   fill(Iter first, Iter last, const T& value);
 
 #include <algorithm>
+#include <array>
 #include <cassert>
+#include <vector>
 
 #include "test_macros.h"
 #include "test_iterators.h"
 
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
-    int ia[] = {0, 1, 2, 3, 4};
-
-    std::fill(std::begin(ia), std::end(ia), 5);
+template <class Iter, class Container>
+TEST_CONSTEXPR_CXX20 void
+test(Container in, size_t from, size_t to, typename Container::value_type value, Container expected) {
+  std::fill(Iter(in.data() + from), Iter(in.data() + to), value);
+  assert(in == expected);
+}
 
-    return std::all_of(std::begin(ia), std::end(ia), [](int a) {return a == 5; })
-        ;
+template <class T>
+struct Test {
+  template <class Iter>
+  TEST_CONSTEXPR_CXX20 void operator()() {
+    {
+      std::array<T, 4> in       = {1, 2, 3, 4};
+      std::array<T, 4> expected = {5, 5, 5, 5};
+      test<Iter>(in, 0, 4, 5, expected);
     }
-#endif
-
-template <class Iter>
-void
-test_char()
-{
-    const unsigned n = 4;
-    char ca[n] = {0};
-    std::fill(Iter(ca), Iter(ca+n), char(1));
-    assert(ca[0] == 1);
-    assert(ca[1] == 1);
-    assert(ca[2] == 1);
-    assert(ca[3] == 1);
-}
+    {
+      std::array<T, 4> in       = {1, 2, 3, 4};
+      std::array<T, 4> expected = {1, 5, 5, 4};
+      test<Iter>(in, 1, 3, 5, expected);
+    }
+  }
+};
 
-template <class Iter>
-void
-test_int()
-{
-    const unsigned n = 4;
-    int ia[n] = {0};
-    std::fill(Iter(ia), Iter(ia+n), 1);
-    assert(ia[0] == 1);
-    assert(ia[1] == 1);
-    assert(ia[2] == 1);
-    assert(ia[3] == 1);
+TEST_CONSTEXPR_CXX20 bool test() {
+  types::for_each(types::forward_iterator_list<char*>(), Test<char>());
+  types::for_each(types::forward_iterator_list<int*>(), Test<int>());
+  {   // test vector<bool>::iterator optimization
+    { // simple case
+      std::vector<bool> in(4, false);
+      std::vector<bool> expected(4, true);
+      std::fill(in.begin(), in.end(), true);
+      assert(in == expected);
+    }
+    { // partial byte in the front is not filled
+      std::vector<bool> in(8, false);
+      std::vector<bool> expected(8, true);
+      expected[0] = false;
+      expected[1] = false;
+      std::fill(in.begin() + 2, in.end(), true);
+      assert(in == expected);
+    }
+    { // partial byte in the back is not filled
+      std::vector<bool> in(8, false);
+      std::vector<bool> expected(8, true);
+      expected[6] = false;
+      expected[7] = false;
+      std::fill(in.begin(), in.end() - 2, true);
+      assert(in == expected);
+    }
+    { // partial byte in the front and back is not filled
+      std::vector<bool> in(16, false);
+      std::vector<bool> expected(16, true);
+      expected[0]  = false;
+      expected[1]  = false;
+      expected[14] = false;
+      expected[15] = false;
+      std::fill(in.begin() + 2, in.end() - 2, true);
+      assert(in == expected);
+    }
+    { // only a few bits of a byte are set
+      std::vector<bool> in(8, false);
+      std::vector<bool> expected(8, true);
+      expected[0] = false;
+      expected[1] = false;
+      expected[6] = false;
+      expected[7] = false;
+      std::fill(in.begin() + 2, in.end() - 2, true);
+      assert(in == expected);
+    }
+  }
+  return true;
 }
 
-int main(int, char**)
-{
-    test_char<forward_iterator<char*> >();
-    test_char<bidirectional_iterator<char*> >();
-    test_char<random_access_iterator<char*> >();
-    test_char<char*>();
-
-    test_int<forward_iterator<int*> >();
-    test_int<bidirectional_iterator<int*> >();
-    test_int<random_access_iterator<int*> >();
-    test_int<int*>();
-
-#if TEST_STD_VER > 17
-    static_assert(test_constexpr());
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 20
+  static_assert(test());
 #endif
 
   return 0;
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
index 904100c1cf0bb1..7654a4b0c7f007 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
@@ -14,7 +14,7 @@
 //   count(Iter first, Iter last, const T& value);
 
 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
-// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=70000000
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
index b17272ea90cddd..b6631add7e48a7 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
@@ -11,7 +11,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
-// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=70000000
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
 
 // template<input_iterator I, sentinel_for<I> S, class T, class Proj = identity>
 //   requires indirect_binary_predicate<ranges::equal_to, projected<I, Proj>, const T*>