From 255d0beb91a71e85b0e524dd4f4e2cd7accf619b Mon Sep 17 00:00:00 2001
From: Aldrin Montana
Date: Fri, 12 Jan 2024 11:04:41 -0800
Subject: [PATCH] GH-17211: refresh history of scalar hash benchmark

This commit includes additions to the general hashing benchmarks that
cover the use of hashing functions in key_hash.h without carrying the
burden of a long dev history.

Some existing benchmark names were changed to distinguish between the
use of Int32 and Int64 types, new benchmarks were added that use the
functions declared in key_hash.h. The reason the new benchmarks are
added is because it is claimed they prioritize speed over cryptography
as they're primarily used for join algorithms and other processing
tasks, which the hashing benchmark can now provide observability for.

Issue: GH-17211
Issue: ARROW-8991
---
 cpp/src/arrow/util/hashing_benchmark.cc | 108 +++++++++++++++++++++++-
 1 file changed, 106 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/util/hashing_benchmark.cc b/cpp/src/arrow/util/hashing_benchmark.cc
index c7051d1a35155..1797d6db93586 100644
--- a/cpp/src/arrow/util/hashing_benchmark.cc
+++ b/cpp/src/arrow/util/hashing_benchmark.cc
@@ -25,11 +25,22 @@
 #include "benchmark/benchmark.h"
 
 #include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
 #include "arrow/util/hashing.h"
 
+#include "arrow/array/builder_primitive.h"
+#include "arrow/compute/key_hash.h"
+
 namespace arrow {
 namespace internal {
 
+namespace {
+// copied from scalar_string_benchmark
+constexpr auto kSeed = 0x94378165;
+
+static random::RandomArrayGenerator hashing_rng(kSeed);
+}  // namespace
+
 template <class Integer>
 static std::vector<Integer> MakeIntegers(int32_t n_values) {
   std::vector<Integer> values(n_values);
@@ -62,7 +73,22 @@ static std::vector<std::string> MakeStrings(int32_t n_values, int32_t min_length
   return values;
 }
 
-static void HashIntegers(benchmark::State& state) {  // NOLINT non-const reference
+static void HashIntegers32(benchmark::State& state) {  // NOLINT non-const reference
+  const std::vector<int32_t> values = MakeIntegers<int32_t>(10000);
+
+  while (state.KeepRunning()) {
+    hash_t total = 0;
+    for (const int32_t v : values) {
+      total += ScalarHelper<int32_t, 0>::ComputeHash(v);
+      total += ScalarHelper<int32_t, 1>::ComputeHash(v);
+    }
+    benchmark::DoNotOptimize(total);
+  }
+  state.SetBytesProcessed(2 * state.iterations() * values.size() * sizeof(int32_t));
+  state.SetItemsProcessed(2 * state.iterations() * values.size());
+}
+
+static void HashIntegers64(benchmark::State& state) {  // NOLINT non-const reference
   const std::vector<int64_t> values = MakeIntegers<int64_t>(10000);
 
   while (state.KeepRunning()) {
@@ -111,13 +137,91 @@ static void HashLargeStrings(benchmark::State& state) {  // NOLINT non-const ref
   BenchmarkStringHashing(state, values);
 }
 
+static void KeyHashIntegers32(benchmark::State& state) {  // NOLINT non-const reference
+  auto test_vals = hashing_rng.Int32(10000, 0, std::numeric_limits<int32_t>::max());
+
+  // initialize the stack allocator
+  util::TempVectorStack stack_memallocator;
+  ASSERT_OK(
+      stack_memallocator.Init(compute::default_exec_context()->memory_pool(),
+                              3 * sizeof(int32_t) * util::MiniBatch::kMiniBatchLength));
+
+  // prepare the execution context for Hashing32
+  compute::LightContext hash_ctx;
+  hash_ctx.hardware_flags = compute::default_exec_context()->cpu_info()->hardware_flags();
+  hash_ctx.stack = &stack_memallocator;
+
+  // allocate memory for results
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<Buffer> hash_buffer,
+                       AllocateBuffer(test_vals->length() * sizeof(int32_t)));
+
+  // run the benchmark
+  while (state.KeepRunning()) {
+    // Prepare input data structure for propagation to hash function
+    ASSERT_OK_AND_ASSIGN(
+        compute::KeyColumnArray input_keycol,
+        compute::ColumnArrayFromArrayData(test_vals->data(), 0, test_vals->length()));
+
+    compute::Hashing32::HashMultiColumn(
+        {input_keycol}, &hash_ctx,
+        reinterpret_cast<uint32_t*>(hash_buffer->mutable_data()));
+
+    // benchmark::DoNotOptimize(hash_buffer);
+  }
+
+  state.SetBytesProcessed(state.iterations() * test_vals->length() * sizeof(int32_t));
+  state.SetItemsProcessed(state.iterations() * test_vals->length());
+}
+
+static void KeyHashIntegers64(benchmark::State& state) {  // NOLINT non-const reference
+  auto test_vals = hashing_rng.Int64(10000, 0, std::numeric_limits<int64_t>::max());
+
+  // initialize the stack allocator
+  util::TempVectorStack stack_memallocator;
+  ASSERT_OK(
+      stack_memallocator.Init(compute::default_exec_context()->memory_pool(),
+                              3 * sizeof(int32_t) * util::MiniBatch::kMiniBatchLength));
+
+  // prepare the execution context for Hashing64
+  compute::LightContext hash_ctx;
+  hash_ctx.hardware_flags = compute::default_exec_context()->cpu_info()->hardware_flags();
+  hash_ctx.stack = &stack_memallocator;
+
+  // allocate memory for results
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<Buffer> hash_buffer,
+                       AllocateBuffer(test_vals->length() * sizeof(int64_t)));
+
+  // run the benchmark
+  while (state.KeepRunning()) {
+    // Prepare input data structure for propagation to hash function
+    ASSERT_OK_AND_ASSIGN(
+        compute::KeyColumnArray input_keycol,
+        compute::ColumnArrayFromArrayData(test_vals->data(), 0, test_vals->length()));
+
+    compute::Hashing64::HashMultiColumn(
+        {input_keycol}, &hash_ctx,
+        reinterpret_cast<uint64_t*>(hash_buffer->mutable_data()));
+
+    // benchmark::DoNotOptimize(hash_buffer);
+  }
+
+  state.SetBytesProcessed(state.iterations() * test_vals->length() * sizeof(int64_t));
+  state.SetItemsProcessed(state.iterations() * test_vals->length());
+}
+
 // ----------------------------------------------------------------------
 // Benchmark declarations
 
-BENCHMARK(HashIntegers);
+// Directly uses "Hashing" hash functions from hashing.h (xxHash)
+BENCHMARK(HashIntegers32);
+BENCHMARK(HashIntegers64);
 BENCHMARK(HashSmallStrings);
 BENCHMARK(HashMediumStrings);
 BENCHMARK(HashLargeStrings);
 
+// Directly uses "KeyHash" hash functions from key_hash.h (xxHash-like)
+BENCHMARK(KeyHashIntegers32);
+BENCHMARK(KeyHashIntegers64);
+
 }  // namespace internal
 }  // namespace arrow