Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] update hibf lib and introduce --relaxed-fpr option. #223

Merged
merged 3 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/hibf
Submodule hibf updated 34 files
+5 −1 .clang-tidy
+1 −1 README.md
+6 −1 include/hibf/build/bin_size_in_bits.hpp
+54 −21 include/hibf/config.hpp
+2 −0 include/hibf/contrib/robin_hood.hpp
+1 −1 include/hibf/hierarchical_interleaved_bloom_filter.hpp
+24 −24 include/hibf/interleaved_bloom_filter.hpp
+5 −0 include/hibf/layout/graph.hpp
+70 −9 include/hibf/layout/hierarchical_binning.hpp
+27 −0 include/hibf/misc/divide_and_ceil.hpp
+2 −2 include/hibf/misc/insert_iterator.hpp
+18 −8 src/build/construct_ibf.cpp
+2 −1 src/build/insert_into_ibf.cpp
+9 −2 src/config.cpp
+10 −6 src/hierarchical_interleaved_bloom_filter.cpp
+2 −1 src/interleaved_bloom_filter.cpp
+2 −2 src/layout/compute_layout.cpp
+10 −10 src/layout/graph.cpp
+25 −32 src/layout/hierarchical_binning.cpp
+9 −4 src/layout/simple_binning.cpp
+5 −4 test/performance/example/example_benchmark.cpp
+3 −2 test/performance/hibf/hierarchical_interleaved_bloom_filter_benchmark.cpp
+2 −2 test/performance/ibf/binning_bitvector_benchmark.cpp
+2 −1 test/performance/ibf/interleaved_bloom_filter_benchmark.cpp
+2 −2 test/snippet/hibf/hibf_construction.cpp
+1 −1 test/snippet/readme.cpp
+5 −0 test/unit/hibf/build/CMakeLists.txt
+28 −0 test/unit/hibf/build/bin_size_in_bits_test.cpp
+34 −13 test/unit/hibf/config_test.cpp
+5 −4 test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp
+12 −1 test/unit/hibf/layout/graph_test.cpp
+6 −6 test/unit/hibf/layout/hierarchical_binning_test.cpp
+2 −1 test/unit/hibf/sketch/hyperloglog_test.cpp
+2 −1 util/fpr_correction_check.cpp
3 changes: 2 additions & 1 deletion src/layout/determine_best_number_of_technical_bins.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ determine_best_number_of_technical_bins(chopper::configuration & config)
file_out << "## ### Parameters ###\n"
<< "## number of user bins = " << config.hibf_config.number_of_user_bins << '\n'
<< "## number of hash functions = " << config.hibf_config.number_of_hash_functions << '\n'
<< "## false positive rate = " << config.hibf_config.maximum_false_positive_rate << '\n';
<< "## maximum false positive rate = " << config.hibf_config.maximum_fpr << '\n'
<< "## relaxed false positive rate = " << config.hibf_config.relaxed_fpr << '\n';
hibf_statistics::print_header_to(file_out, config.output_verbose_statistics);

double best_expected_HIBF_query_cost{std::numeric_limits<double>::infinity()};
Expand Down
14 changes: 6 additions & 8 deletions src/layout/hibf_statistics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ hibf_statistics::hibf_statistics(configuration const & config_,
std::vector<size_t> const & kmer_counts) :
config{config_},
fp_correction{
seqan::hibf::layout::compute_fpr_correction({.fpr = config_.hibf_config.maximum_false_positive_rate,
seqan::hibf::layout::compute_fpr_correction({.fpr = config_.hibf_config.maximum_fpr,
.hash_count = config_.hibf_config.number_of_hash_functions,
.t_max = config_.hibf_config.tmax})},
sketches{sketches_},
Expand Down Expand Up @@ -187,9 +187,8 @@ void hibf_statistics::print_summary_to(size_t & t_max_64_memory, std::ostream &
stream /* tmax */ << config.hibf_config.tmax
<< '\t'
/* c_tmax */
<< chopper::layout::ibf_query_cost::interpolated(
config.hibf_config.tmax,
config.hibf_config.maximum_false_positive_rate)
<< chopper::layout::ibf_query_cost::interpolated(config.hibf_config.tmax,
config.hibf_config.maximum_fpr)
<< '\t'
/* l_tmax */
<< expected_HIBF_query_cost
Expand Down Expand Up @@ -260,7 +259,7 @@ size_t hibf_statistics::total_hibf_size_in_byte()
}

size_t const size_in_bits =
seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_false_positive_rate,
seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_fpr,
.hash_count = config.hibf_config.number_of_hash_functions,
.elements = total_size});

Expand Down Expand Up @@ -349,7 +348,7 @@ size_t hibf_statistics::total_hibf_size_in_byte()
std::string hibf_statistics::to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const
{
size_t const size_in_bits =
seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_false_positive_rate,
seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_fpr,
.hash_count = config.hibf_config.number_of_hash_functions,
.elements = number_of_kmers_to_be_stored});
return byte_size_to_formatted_str(size_in_bits / 8);
Expand Down Expand Up @@ -486,8 +485,7 @@ void hibf_statistics::compute_total_query_cost(level & curr_level)

// Add cost of querying the current IBF
// (how costly is querying number_of_tbs (e.g. 128 tbs) compared to 64 tbs given the current FPR)
curr_level.current_query_cost +=
ibf_query_cost::interpolated(number_of_tbs, config.hibf_config.maximum_false_positive_rate);
curr_level.current_query_cost += ibf_query_cost::interpolated(number_of_tbs, config.hibf_config.maximum_fpr);

// Add costs of querying the HIBF for each kmer in this level.
total_query_cost += curr_level.current_query_cost * level_kmer_count;
Expand Down
12 changes: 11 additions & 1 deletion src/set_up_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,23 @@ void set_up_parser(sharg::parser & parser, configuration & config)
"This parameter is needed to correctly estimate the index size when computing the layout."});

parser.add_option(
config.hibf_config.maximum_false_positive_rate,
config.hibf_config.maximum_fpr,
sharg::config{.short_id = '\0',
.long_id = "fpr",
.description =
"The false positive rate you aim for when building the HIBF from the resulting layout. "
"This parameter is needed to correctly estimate the index size when computing the layout."});

parser.add_option(
config.hibf_config.relaxed_fpr,
sharg::config{.short_id = '\0',
.long_id = "relaxed-fpr",
.description =
"The relaxed false positive rate (fpr) for parts that are not critical for the maximum fpr. "
"Choosing a higher relaxed FPR can lower the memory requirement but increases the runtime. "
"Experiments show that the decrease in memory is significant while the runtime suffers "
"only slightly. We still guarantee that we never exceed the maximum fpr (--fpr)."});

parser.add_option(
config.output_filename,
sharg::config{.short_id = '\0', .long_id = "output", .description = "A file name for the resulting layout."});
Expand Down
23 changes: 17 additions & 6 deletions src/util/display_layout/compute_ibf_size.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,37 @@
#include <hibf/build/build_data.hpp>
#include <hibf/contrib/robin_hood.hpp>
#include <hibf/layout/graph.hpp>
#include <hibf/misc/divide_and_ceil.hpp>

// Adds every k-mer contained in `kmers` to `parent_kmers`.
// `kmers` is only read; entries already present in `parent_kmers` remain unique (set semantics).
void update_parent_kmers(robin_hood::unordered_flat_set<uint64_t> & parent_kmers,
                         robin_hood::unordered_flat_set<uint64_t> const & kmers)
{
    for (uint64_t const kmer : kmers)
        parent_kmers.insert(kmer);
}

// this function is copied from seqan::hibf::build::construct_ibf
// it needs to be held consistent in order to compute the correct sizes
size_t compute_ibf_size(robin_hood::unordered_flat_set<uint64_t> & parent_kmers,
robin_hood::unordered_flat_set<uint64_t> & kmers,
size_t const number_of_bins,
seqan::hibf::layout::graph::node const & ibf_node,
seqan::hibf::build::build_data & data,
size_t const current_hibf_level)
{
size_t const kmers_per_bin = std::ceil(static_cast<double>(kmers.size()) / number_of_bins);
size_t const bin_size =
std::ceil(seqan::hibf::build::bin_size_in_bits({.fpr = data.config.maximum_false_positive_rate,
.hash_count = data.config.number_of_hash_functions,
.elements = kmers_per_bin})
* data.fpr_correction[number_of_bins]);
bool const max_bin_is_merged = ibf_node.max_bin_is_merged();
assert(!max_bin_is_merged || number_of_bins == 1u); // merged max bin implies (=>) number of bins == 1

size_t const kmers_per_bin = seqan::hibf::divide_and_ceil(kmers.size(), number_of_bins);
double const fpr = max_bin_is_merged ? data.config.relaxed_fpr : data.config.maximum_fpr;

size_t const bin_bits{seqan::hibf::build::bin_size_in_bits({.fpr = fpr, //
.hash_count = data.config.number_of_hash_functions,
.elements = kmers_per_bin})};
// data.fpr_correction[1] == 1.0, but we can avoid floating point operations with the ternary.
// Check number_of_bins instead of max_bin_is_merged, because split bins can also occupy only one technical bin.
size_t const bin_size{number_of_bins == 1u
? bin_bits
: static_cast<size_t>(std::ceil(bin_bits * data.fpr_correction[number_of_bins]))};

size_t const ibf_size = ibf_node.number_of_technical_bins * bin_size;

Expand Down
6 changes: 2 additions & 4 deletions src/util/display_layout/sizes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,10 +317,8 @@ void execute_general_stats(config const & cfg)
seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}};
seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root;
size_t const t_max{root_node.number_of_technical_bins};
data.fpr_correction =
seqan::hibf::layout::compute_fpr_correction({.fpr = hibf_config.maximum_false_positive_rate,
.hash_count = hibf_config.number_of_hash_functions,
.t_max = t_max});
data.fpr_correction = seqan::hibf::layout::compute_fpr_correction(
{.fpr = hibf_config.maximum_fpr, .hash_count = hibf_config.number_of_hash_functions, .t_max = t_max});

// Get stats
hierarchical_stats(stats, root_node, data);
Expand Down
43 changes: 23 additions & 20 deletions test/api/config_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ chopper::configuration generate_config()

config.hibf_config.number_of_user_bins = 123456789;
config.hibf_config.number_of_hash_functions = 4;
config.hibf_config.maximum_false_positive_rate = 0.0001;
config.hibf_config.maximum_fpr = 0.0001;
config.hibf_config.relaxed_fpr = 0.2;
config.hibf_config.threads = 31;
config.hibf_config.sketch_bits = 8;
config.hibf_config.tmax = 128;
Expand All @@ -42,24 +43,25 @@ namespace chopper

bool operator==(chopper::configuration const & lhs, chopper::configuration const & rhs)
{
return lhs.data_file == rhs.data_file && //
lhs.debug == rhs.debug && //
lhs.sketch_directory == rhs.sketch_directory && //
lhs.k == rhs.k && //
lhs.disable_sketch_output == rhs.disable_sketch_output && //
lhs.precomputed_files == rhs.precomputed_files && //
lhs.output_filename == rhs.output_filename && //
lhs.determine_best_tmax == rhs.determine_best_tmax && //
lhs.force_all_binnings == rhs.force_all_binnings && //
lhs.hibf_config.number_of_user_bins == rhs.hibf_config.number_of_user_bins && //
lhs.hibf_config.number_of_hash_functions == rhs.hibf_config.number_of_hash_functions && //
lhs.hibf_config.maximum_false_positive_rate == rhs.hibf_config.maximum_false_positive_rate && //
lhs.hibf_config.threads == rhs.hibf_config.threads && //
lhs.hibf_config.sketch_bits == rhs.hibf_config.sketch_bits && //
lhs.hibf_config.tmax == rhs.hibf_config.tmax && //
lhs.hibf_config.alpha == rhs.hibf_config.alpha && //
lhs.hibf_config.max_rearrangement_ratio == rhs.hibf_config.max_rearrangement_ratio && //
lhs.hibf_config.disable_estimate_union == rhs.hibf_config.disable_estimate_union && //
return lhs.data_file == rhs.data_file && //
lhs.debug == rhs.debug && //
lhs.sketch_directory == rhs.sketch_directory && //
lhs.k == rhs.k && //
lhs.disable_sketch_output == rhs.disable_sketch_output && //
lhs.precomputed_files == rhs.precomputed_files && //
lhs.output_filename == rhs.output_filename && //
lhs.determine_best_tmax == rhs.determine_best_tmax && //
lhs.force_all_binnings == rhs.force_all_binnings && //
lhs.hibf_config.number_of_user_bins == rhs.hibf_config.number_of_user_bins && //
lhs.hibf_config.number_of_hash_functions == rhs.hibf_config.number_of_hash_functions && //
lhs.hibf_config.maximum_fpr == rhs.hibf_config.maximum_fpr && //
lhs.hibf_config.relaxed_fpr == rhs.hibf_config.relaxed_fpr && //
lhs.hibf_config.threads == rhs.hibf_config.threads && //
lhs.hibf_config.sketch_bits == rhs.hibf_config.sketch_bits && //
lhs.hibf_config.tmax == rhs.hibf_config.tmax && //
lhs.hibf_config.alpha == rhs.hibf_config.alpha && //
lhs.hibf_config.max_rearrangement_ratio == rhs.hibf_config.max_rearrangement_ratio && //
lhs.hibf_config.disable_estimate_union == rhs.hibf_config.disable_estimate_union && //
lhs.hibf_config.disable_rearrangement == rhs.hibf_config.disable_rearrangement;
}

Expand Down Expand Up @@ -93,7 +95,8 @@ static constexpr std::string_view config_string_view{"@CHOPPER_CONFIG\n"
"@ \"version\": 1,\n"
"@ \"number_of_user_bins\": 123456789,\n"
"@ \"number_of_hash_functions\": 4,\n"
"@ \"maximum_false_positive_rate\": 0.0001,\n"
"@ \"maximum_fpr\": 0.0001,\n"
"@ \"relaxed_fpr\": 0.2,\n"
"@ \"threads\": 31,\n"
"@ \"sketch_bits\": 8,\n"
"@ \"tmax\": 128,\n"
Expand Down
4 changes: 2 additions & 2 deletions test/api/display_layout/compute_ibf_size_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ TEST(compute_ibf_size_test, merged_bin_is_max_bin)
seqan::hibf::build::build_data data{
.config = {.number_of_user_bins = 123,
.number_of_hash_functions = hash,
.maximum_false_positive_rate = fpr,
.maximum_fpr = fpr,
.threads = 1,
.tmax = tmax},
.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = fpr, .hash_count = hash, .t_max = tmax})};
Expand Down Expand Up @@ -79,7 +79,7 @@ TEST(compute_ibf_size_test, split_bin_is_max_bin)
seqan::hibf::build::build_data data{
.config = {.number_of_user_bins = 123,
.number_of_hash_functions = hash,
.maximum_false_positive_rate = fpr,
.maximum_fpr = fpr,
.threads = 1,
.tmax = tmax},
.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = fpr, .hash_count = hash, .t_max = tmax})};
Expand Down
8 changes: 5 additions & 3 deletions test/api/layout/execute_layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ TEST(execute_test, few_ubs)
"@ \"version\": 1,\n"
"@ \"number_of_user_bins\": 8,\n"
"@ \"number_of_hash_functions\": 2,\n"
"@ \"maximum_false_positive_rate\": 0.05,\n"
"@ \"maximum_fpr\": 0.05,\n"
"@ \"relaxed_fpr\": 0.3,\n"
"@ \"threads\": 1,\n"
"@ \"sketch_bits\": 12,\n"
"@ \"tmax\": 64,\n"
Expand Down Expand Up @@ -292,7 +293,8 @@ TEST(execute_test, many_ubs)
"@ \"version\": 1,\n"
"@ \"number_of_user_bins\": 96,\n"
"@ \"number_of_hash_functions\": 2,\n"
"@ \"maximum_false_positive_rate\": 0.05,\n"
"@ \"maximum_fpr\": 0.05,\n"
"@ \"relaxed_fpr\": 0.3,\n"
"@ \"threads\": 1,\n"
"@ \"sketch_bits\": 12,\n"
"@ \"tmax\": 64,\n"
Expand All @@ -303,7 +305,7 @@ TEST(execute_test, many_ubs)
"@ }\n"
"@}\n"
"@HIBF_CONFIG_END\n"
"#TOP_LEVEL_IBF fullest_technical_bin_idx:26\n"
"#TOP_LEVEL_IBF fullest_technical_bin_idx:48\n"
"#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:14\n"
"#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:14\n"
"#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:14\n"
Expand Down
Loading
Loading