Skip to content

Commit

Permalink
Merge pull request #225 from smehringer/replace_function_bit_size
Browse files Browse the repository at this point in the history
[MISC] Use seqan::hibf::build::bin_size_in_bits
  • Loading branch information
eseiler authored Oct 26, 2023
2 parents 7767b99 + 9dbf172 commit ec33f79
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 33 deletions.
12 changes: 0 additions & 12 deletions include/chopper/layout/hibf_statistics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,18 +133,6 @@ class hibf_statistics
//!\brief The gathered summary of statistics for each level of this HIBF.
std::map<size_t, level_summary> summaries;

/*!\brief Computes the bin size in bits.
 * \param[in] number_of_kmers_to_be_stored The number of elements (NUM_ELEMS) the bin has to hold.
 * \returns The bin size (SIZE) in bits.
 *
 * The size is derived from the number of elements, the number of hash functions (HASHES) and the
 * false positive rate (FPR):
 *
 *           -NUM_ELEMS * HASHES
 *  SIZE = ------------------------
 *          LN(1 - FPR^(1/HASHES))
 *
 * where FPR^(1/HASHES) is written as e^(LN(FPR) / HASHES), i.e.
 *
 *              -NUM_ELEMS * HASHES
 *  SIZE = ------------------------------
 *          LN(1 - e^(LN(FPR) / HASHES))
 */
size_t compute_bin_size(size_t const number_of_kmers_to_be_stored) const;

/*!\brief Compute the Bloom Filter size from `number_of_kmers_to_be_stored` and
* return it as a formatted string with the appropriate unit.
* \param[in] number_of_kmers_to_be_stored
Expand Down
23 changes: 12 additions & 11 deletions src/layout/hibf_statistics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include <chopper/layout/hibf_statistics.hpp>
#include <chopper/layout/ibf_query_cost.hpp>

#include <hibf/build/bin_size_in_bits.hpp>
#include <hibf/contrib/robin_hood.hpp>
#include <hibf/layout/compute_fpr_correction.hpp>
#include <hibf/layout/layout.hpp>
Expand Down Expand Up @@ -258,7 +259,12 @@ size_t hibf_statistics::total_hibf_size_in_byte()
total_size += std::reduce(summary.ibf_mem_size.begin(), summary.ibf_mem_size.end());
}

return compute_bin_size(total_size) / 8;
size_t const size_in_bits =
seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_false_positive_rate,
.hash_count = config.hibf_config.number_of_hash_functions,
.elements = total_size});

return size_in_bits / 8;
}

//!\brief Round bytes to the appropriate unit and convert to string with unit.
Expand Down Expand Up @@ -340,18 +346,13 @@ size_t hibf_statistics::total_hibf_size_in_byte()
return result;
}

//!\brief Computes the bin size in bits for `number_of_kmers_to_be_stored` elements, using the
//!       false positive rate and number of hash functions stored in `config.hibf_config`.
size_t hibf_statistics::compute_bin_size(size_t const number_of_kmers_to_be_stored) const
{
    auto const & hibf_config = config.hibf_config;

    // FPR^(1/HASHES), evaluated as e^(LN(FPR) / HASHES).
    auto const fpr_root =
        std::exp(std::log(hibf_config.maximum_false_positive_rate) / hibf_config.number_of_hash_functions);

    // -NUM_ELEMS * HASHES; the product is formed first and only then converted to double.
    auto const negated_insertions =
        -static_cast<double>(number_of_kmers_to_be_stored * hibf_config.number_of_hash_functions);

    // SIZE = ceil(-NUM_ELEMS * HASHES / LN(1 - FPR^(1/HASHES)))
    return std::ceil(negated_insertions / std::log(1 - fpr_root));
}

//!\brief Computes the Bloom Filter size for `number_of_kmers_to_be_stored` elements and returns it as a
//!       formatted string (via byte_size_to_formatted_str).
// NOTE(review): this span looks like a rendered diff without +/- markers. The first two body lines
// (compute_bin_size based) appear to be the removed version, the remaining lines (bin_size_in_bits based)
// the added version. As written, everything after the first `return` is unreachable; confirm which
// version is current.
std::string hibf_statistics::to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const
{
// Size in bits divided by 8 gives the size in bytes (integer division).
size_t const size_in_bytes = compute_bin_size(number_of_kmers_to_be_stored) / 8;
return byte_size_to_formatted_str(size_in_bytes);
// Same computation, with the bit size obtained from seqan::hibf::build::bin_size_in_bits; FPR and hash count
// are read from config.hibf_config.
size_t const size_in_bits =
seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_false_positive_rate,
.hash_count = config.hibf_config.number_of_hash_functions,
.elements = number_of_kmers_to_be_stored});
// Bits -> bytes before formatting.
return byte_size_to_formatted_str(size_in_bits / 8);
}

void hibf_statistics::collect_bins()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

#include <raptor/adjust_seed.hpp>

#include <hibf/build/bin_size_in_bits.hpp>

#define USE_UNIT_TEST_PARAMETERS 0

static constexpr size_t operator""_MiB(unsigned long long int number)
Expand Down Expand Up @@ -73,14 +75,6 @@ static std::vector<std::vector<seqan3::dna4>> const reads{

using ibf_t = seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>;

//!\brief Computes the bin size in bits needed to hold `max_bin_size` elements at false positive rate `fpr`,
//!       using `hash_num` hash functions.
static constexpr size_t compute_bin_size(size_t const max_bin_size, double const fpr)
{
    // fpr^(1/hash_num), evaluated as e^(ln(fpr) / hash_num).
    auto const fpr_root = std::exp(std::log(fpr) / hash_num);

    // size = ceil(-(max_bin_size * hash_num) / ln(1 - fpr^(1/hash_num)))
    double const bits = -static_cast<double>(max_bin_size * hash_num) / std::log(1 - fpr_root);
    return std::ceil(bits);
}

static std::vector<size_t> cardinality(size_t const bin_count, auto && hash_adaptor)
{
std::vector<size_t> cardinalities(bin_count);
Expand All @@ -105,7 +99,7 @@ static std::vector<size_t> cardinality(size_t const bin_count, auto && hash_adap

static ibf_t construct_ibf(size_t const bin_count, auto && hash_adaptor, double const fpr)
{
size_t const bin_size{compute_bin_size(std::ranges::max(cardinality(bin_count, hash_adaptor)), fpr)};
size_t const bin_size{seqan::hibf::build::bin_size_in_bits({.fpr = fpr, .hash_count = hash_num, .elements = cardinality(bin_count, hash_adaptor)});

if (bin_size * bin_count > max_ibf_size)
throw std::runtime_error{"Resulting IBF would be too big. " + std::to_string(bin_size * bin_count)};
Expand Down Expand Up @@ -197,4 +191,3 @@ BENCHMARK_CAPTURE(bulk_count, "0.3125", 0.3125)->RangeMultiplier(2)->Range(64, 6
#endif

BENCHMARK_MAIN();
// clang-format on

0 comments on commit ec33f79

Please sign in to comment.