From 9dbf1725c4b6967f6c046caae13d8346f0f869be Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Thu, 26 Oct 2023 17:17:13 +0200 Subject: [PATCH] [MISC] Use seqan::hibf::build::bin_size_in_bits Co-authored-by: Enrico Seiler --- include/chopper/layout/hibf_statistics.hpp | 12 ---------- src/layout/hibf_statistics.cpp | 23 ++++++++++--------- .../query_cost/bin_influence_benchmark.cpp | 13 +++-------- 3 files changed, 15 insertions(+), 33 deletions(-) diff --git a/include/chopper/layout/hibf_statistics.hpp b/include/chopper/layout/hibf_statistics.hpp index 81aeca40..d9571c33 100644 --- a/include/chopper/layout/hibf_statistics.hpp +++ b/include/chopper/layout/hibf_statistics.hpp @@ -133,18 +133,6 @@ class hibf_statistics //!\brief The gathered summary of statistics for each level of this HIBF. std::map summaries; - /*!\brief Computes the bin size in bits. - * - * -NUM_ELEM*HASHES - * ---------------------- = SIZE - * LN(1-FPR^(1/HASHES)) - * - * -NUM_ELEMS*HASHES - * ----------------------- - * LN(1 - e^(LN(FPR) / HASHES) ) - */ - size_t compute_bin_size(size_t const number_of_kmers_to_be_stored) const; - /*!\brief Compute the Bloom Filter size from `number_of_kmers_to_be_stored` and * return it as a formatted string with the appropriate unit. 
* \param[in] number_of_kmers_to_be_stored diff --git a/src/layout/hibf_statistics.cpp b/src/layout/hibf_statistics.cpp index 8b1655a3..5e3cd104 100644 --- a/src/layout/hibf_statistics.cpp +++ b/src/layout/hibf_statistics.cpp @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -258,7 +259,12 @@ size_t hibf_statistics::total_hibf_size_in_byte() total_size += std::reduce(summary.ibf_mem_size.begin(), summary.ibf_mem_size.end()); } - return compute_bin_size(total_size) / 8; + size_t const size_in_bits = + seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_false_positive_rate, + .hash_count = config.hibf_config.number_of_hash_functions, + .elements = total_size}); + + return size_in_bits / 8; } //!\brief Round bytes to the appropriate unit and convert to string with unit. @@ -340,18 +346,13 @@ size_t hibf_statistics::total_hibf_size_in_byte() return result; } -size_t hibf_statistics::compute_bin_size(size_t const number_of_kmers_to_be_stored) const -{ - return std::ceil(-static_cast(number_of_kmers_to_be_stored * config.hibf_config.number_of_hash_functions) - / std::log(1 - - std::exp(std::log(config.hibf_config.maximum_false_positive_rate) - / config.hibf_config.number_of_hash_functions))); -} - std::string hibf_statistics::to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const { - size_t const size_in_bytes = compute_bin_size(number_of_kmers_to_be_stored) / 8; - return byte_size_to_formatted_str(size_in_bytes); + size_t const size_in_bits = + seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_false_positive_rate, + .hash_count = config.hibf_config.number_of_hash_functions, + .elements = number_of_kmers_to_be_stored}); + return byte_size_to_formatted_str(size_in_bits / 8); } void hibf_statistics::collect_bins() diff --git a/test/benchmark/benchmark_data/query_cost/bin_influence_benchmark.cpp b/test/benchmark/benchmark_data/query_cost/bin_influence_benchmark.cpp index 9bbbbeb4..6d5928c3 
100644 --- a/test/benchmark/benchmark_data/query_cost/bin_influence_benchmark.cpp +++ b/test/benchmark/benchmark_data/query_cost/bin_influence_benchmark.cpp @@ -22,6 +22,8 @@ #include +#include + #define USE_UNIT_TEST_PARAMETERS 0 static constexpr size_t operator""_MiB(unsigned long long int number) @@ -73,14 +75,6 @@ static std::vector> const reads{ using ibf_t = seqan3::interleaved_bloom_filter; -static constexpr size_t compute_bin_size(size_t const max_bin_size, double const fpr) -{ - double const numerator{-static_cast(max_bin_size * hash_num)}; - double const denominator{std::log(1 - std::exp(std::log(fpr) / hash_num))}; - double const result{std::ceil(numerator / denominator)}; - return result; -} - static std::vector cardinality(size_t const bin_count, auto && hash_adaptor) { std::vector cardinalities(bin_count); @@ -105,7 +99,7 @@ static std::vector cardinality(size_t const bin_count, auto && hash_adap static ibf_t construct_ibf(size_t const bin_count, auto && hash_adaptor, double const fpr) { - size_t const bin_size{compute_bin_size(std::ranges::max(cardinality(bin_count, hash_adaptor)), fpr)}; + size_t const bin_size{seqan::hibf::build::bin_size_in_bits({.fpr = fpr, .hash_count = hash_num, .elements = std::ranges::max(cardinality(bin_count, hash_adaptor))})}; if (bin_size * bin_count > max_ibf_size) throw std::runtime_error{"Resulting IBF would be too big. " + std::to_string(bin_size * bin_count)}; @@ -197,4 +191,3 @@ BENCHMARK_CAPTURE(bulk_count, "0.3125", 0.3125)->RangeMultiplier(2)->Range(64, 6 #endif BENCHMARK_MAIN(); -// clang-format on