diff --git a/lib/hibf b/lib/hibf index f8c834b5..cc744d96 160000 --- a/lib/hibf +++ b/lib/hibf @@ -1 +1 @@ -Subproject commit f8c834b5dd692b758e5b6fc1b3eee2e9fc2a6d0d +Subproject commit cc744d96a764170610851509fca98f2c3d71830a diff --git a/src/layout/determine_best_number_of_technical_bins.cpp b/src/layout/determine_best_number_of_technical_bins.cpp index dbf80a8c..7ccfdbcc 100644 --- a/src/layout/determine_best_number_of_technical_bins.cpp +++ b/src/layout/determine_best_number_of_technical_bins.cpp @@ -57,7 +57,8 @@ determine_best_number_of_technical_bins(chopper::configuration & config) file_out << "## ### Parameters ###\n" << "## number of user bins = " << config.hibf_config.number_of_user_bins << '\n' << "## number of hash functions = " << config.hibf_config.number_of_hash_functions << '\n' - << "## false positive rate = " << config.hibf_config.maximum_false_positive_rate << '\n'; + << "## maximum false positive rate = " << config.hibf_config.maximum_fpr << '\n' + << "## relaxed false positive rate = " << config.hibf_config.relaxed_fpr << '\n'; hibf_statistics::print_header_to(file_out, config.output_verbose_statistics); double best_expected_HIBF_query_cost{std::numeric_limits::infinity()}; diff --git a/src/layout/hibf_statistics.cpp b/src/layout/hibf_statistics.cpp index 5e3cd104..dedc702f 100644 --- a/src/layout/hibf_statistics.cpp +++ b/src/layout/hibf_statistics.cpp @@ -46,7 +46,7 @@ hibf_statistics::hibf_statistics(configuration const & config_, std::vector const & kmer_counts) : config{config_}, fp_correction{ - seqan::hibf::layout::compute_fpr_correction({.fpr = config_.hibf_config.maximum_false_positive_rate, + seqan::hibf::layout::compute_fpr_correction({.fpr = config_.hibf_config.maximum_fpr, .hash_count = config_.hibf_config.number_of_hash_functions, .t_max = config_.hibf_config.tmax})}, sketches{sketches_}, @@ -187,9 +187,8 @@ void hibf_statistics::print_summary_to(size_t & t_max_64_memory, std::ostream & stream /* tmax */ << 
config.hibf_config.tmax << '\t' /* c_tmax */ - << chopper::layout::ibf_query_cost::interpolated( - config.hibf_config.tmax, - config.hibf_config.maximum_false_positive_rate) + << chopper::layout::ibf_query_cost::interpolated(config.hibf_config.tmax, + config.hibf_config.maximum_fpr) << '\t' /* l_tmax */ << expected_HIBF_query_cost @@ -260,7 +259,7 @@ size_t hibf_statistics::total_hibf_size_in_byte() } size_t const size_in_bits = - seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_false_positive_rate, + seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_fpr, .hash_count = config.hibf_config.number_of_hash_functions, .elements = total_size}); @@ -349,7 +348,7 @@ size_t hibf_statistics::total_hibf_size_in_byte() std::string hibf_statistics::to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const { size_t const size_in_bits = - seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_false_positive_rate, + seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_fpr, .hash_count = config.hibf_config.number_of_hash_functions, .elements = number_of_kmers_to_be_stored}); return byte_size_to_formatted_str(size_in_bits / 8); @@ -486,8 +485,7 @@ void hibf_statistics::compute_total_query_cost(level & curr_level) // Add cost of querying the current IBF // (how costly is querying number_of_tbs (e.g. 128 tbs) compared to 64 tbs given the current FPR) - curr_level.current_query_cost += - ibf_query_cost::interpolated(number_of_tbs, config.hibf_config.maximum_false_positive_rate); + curr_level.current_query_cost += ibf_query_cost::interpolated(number_of_tbs, config.hibf_config.maximum_fpr); // Add costs of querying the HIBF for each kmer in this level. 
total_query_cost += curr_level.current_query_cost * level_kmer_count; diff --git a/src/set_up_parser.cpp b/src/set_up_parser.cpp index da9147dd..36109b65 100644 --- a/src/set_up_parser.cpp +++ b/src/set_up_parser.cpp @@ -90,13 +90,23 @@ void set_up_parser(sharg::parser & parser, configuration & config) "This parameter is needed to correctly estimate the index size when computing the layout."}); parser.add_option( - config.hibf_config.maximum_false_positive_rate, + config.hibf_config.maximum_fpr, sharg::config{.short_id = '\0', .long_id = "fpr", .description = "The false positive rate you aim for when building the HIBF from the resulting layout. " "This parameter is needed to correctly estimate the index size when computing the layout."}); + parser.add_option( + config.hibf_config.relaxed_fpr, + sharg::config{.short_id = '\0', + .long_id = "relaxed-fpr", + .description = + "The relaxed false positive rate (fpr) for parts that are not critical for the maximum fpr. " + "Choosing a higher relaxed FPR can lower the memory requirement but increases the runtime. " + "Experiments show that the decrease in memory is significant while the runtime suffers " + "only slightly. 
We still guarantee that we never exceed the maximum fpr (--fpr)."}); + parser.add_option( config.output_filename, sharg::config{.short_id = '\0', .long_id = "output", .description = "A file name for the resulting layout."}); diff --git a/src/util/display_layout/compute_ibf_size.hpp b/src/util/display_layout/compute_ibf_size.hpp index 4463d65e..8aeff2f8 100644 --- a/src/util/display_layout/compute_ibf_size.hpp +++ b/src/util/display_layout/compute_ibf_size.hpp @@ -13,6 +13,7 @@ #include #include #include +#include void update_parent_kmers(robin_hood::unordered_flat_set & parent_kmers, robin_hood::unordered_flat_set const & kmers) @@ -20,6 +21,8 @@ void update_parent_kmers(robin_hood::unordered_flat_set & parent_kmers parent_kmers.insert(kmers.begin(), kmers.end()); } +// this function is copied from seqan::hibf::build::construct_ibf +// it needs to be held consistent in order to compute the correct sizes size_t compute_ibf_size(robin_hood::unordered_flat_set & parent_kmers, robin_hood::unordered_flat_set & kmers, size_t const number_of_bins, @@ -27,12 +30,20 @@ size_t compute_ibf_size(robin_hood::unordered_flat_set & parent_kmers, seqan::hibf::build::build_data & data, size_t const current_hibf_level) { - size_t const kmers_per_bin = std::ceil(static_cast(kmers.size()) / number_of_bins); - size_t const bin_size = - std::ceil(seqan::hibf::build::bin_size_in_bits({.fpr = data.config.maximum_false_positive_rate, - .hash_count = data.config.number_of_hash_functions, - .elements = kmers_per_bin}) - * data.fpr_correction[number_of_bins]); + bool const max_bin_is_merged = ibf_node.max_bin_is_merged(); + assert(!max_bin_is_merged || number_of_bins == 1u); // merged max bin implies (=>) number of bins == 1 + + size_t const kmers_per_bin = seqan::hibf::divide_and_ceil(kmers.size(), number_of_bins); + double const fpr = max_bin_is_merged ? 
data.config.relaxed_fpr : data.config.maximum_fpr; + + size_t const bin_bits{seqan::hibf::build::bin_size_in_bits({.fpr = fpr, // + .hash_count = data.config.number_of_hash_functions, + .elements = kmers_per_bin})}; + // data.fpr_correction[1] == 1.0, but we can avoid floating point operations with the ternary. + // Check number_of_bins instead of max_bin_is_merged, because split bins can also occupy only one technical bin. + size_t const bin_size{number_of_bins == 1u + ? bin_bits + : static_cast(std::ceil(bin_bits * data.fpr_correction[number_of_bins]))}; size_t const ibf_size = ibf_node.number_of_technical_bins * bin_size; diff --git a/src/util/display_layout/sizes.cpp b/src/util/display_layout/sizes.cpp index 1ddd817a..055a93fd 100644 --- a/src/util/display_layout/sizes.cpp +++ b/src/util/display_layout/sizes.cpp @@ -317,10 +317,8 @@ void execute_general_stats(config const & cfg) seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}}; seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root; size_t const t_max{root_node.number_of_technical_bins}; - data.fpr_correction = - seqan::hibf::layout::compute_fpr_correction({.fpr = hibf_config.maximum_false_positive_rate, - .hash_count = hibf_config.number_of_hash_functions, - .t_max = t_max}); + data.fpr_correction = seqan::hibf::layout::compute_fpr_correction( + {.fpr = hibf_config.maximum_fpr, .hash_count = hibf_config.number_of_hash_functions, .t_max = t_max}); // Get stats hierarchical_stats(stats, root_node, data); diff --git a/test/api/config_test.cpp b/test/api/config_test.cpp index 66560c34..2d48d53d 100644 --- a/test/api/config_test.cpp +++ b/test/api/config_test.cpp @@ -25,7 +25,8 @@ chopper::configuration generate_config() config.hibf_config.number_of_user_bins = 123456789; config.hibf_config.number_of_hash_functions = 4; - config.hibf_config.maximum_false_positive_rate = 0.0001; + config.hibf_config.maximum_fpr = 0.0001; + config.hibf_config.relaxed_fpr = 0.2; 
config.hibf_config.threads = 31; config.hibf_config.sketch_bits = 8; config.hibf_config.tmax = 128; @@ -42,24 +43,25 @@ namespace chopper bool operator==(chopper::configuration const & lhs, chopper::configuration const & rhs) { - return lhs.data_file == rhs.data_file && // - lhs.debug == rhs.debug && // - lhs.sketch_directory == rhs.sketch_directory && // - lhs.k == rhs.k && // - lhs.disable_sketch_output == rhs.disable_sketch_output && // - lhs.precomputed_files == rhs.precomputed_files && // - lhs.output_filename == rhs.output_filename && // - lhs.determine_best_tmax == rhs.determine_best_tmax && // - lhs.force_all_binnings == rhs.force_all_binnings && // - lhs.hibf_config.number_of_user_bins == rhs.hibf_config.number_of_user_bins && // - lhs.hibf_config.number_of_hash_functions == rhs.hibf_config.number_of_hash_functions && // - lhs.hibf_config.maximum_false_positive_rate == rhs.hibf_config.maximum_false_positive_rate && // - lhs.hibf_config.threads == rhs.hibf_config.threads && // - lhs.hibf_config.sketch_bits == rhs.hibf_config.sketch_bits && // - lhs.hibf_config.tmax == rhs.hibf_config.tmax && // - lhs.hibf_config.alpha == rhs.hibf_config.alpha && // - lhs.hibf_config.max_rearrangement_ratio == rhs.hibf_config.max_rearrangement_ratio && // - lhs.hibf_config.disable_estimate_union == rhs.hibf_config.disable_estimate_union && // + return lhs.data_file == rhs.data_file && // + lhs.debug == rhs.debug && // + lhs.sketch_directory == rhs.sketch_directory && // + lhs.k == rhs.k && // + lhs.disable_sketch_output == rhs.disable_sketch_output && // + lhs.precomputed_files == rhs.precomputed_files && // + lhs.output_filename == rhs.output_filename && // + lhs.determine_best_tmax == rhs.determine_best_tmax && // + lhs.force_all_binnings == rhs.force_all_binnings && // + lhs.hibf_config.number_of_user_bins == rhs.hibf_config.number_of_user_bins && // + lhs.hibf_config.number_of_hash_functions == rhs.hibf_config.number_of_hash_functions && // + lhs.hibf_config.maximum_fpr 
== rhs.hibf_config.maximum_fpr && // + lhs.hibf_config.relaxed_fpr == rhs.hibf_config.relaxed_fpr && // + lhs.hibf_config.threads == rhs.hibf_config.threads && // + lhs.hibf_config.sketch_bits == rhs.hibf_config.sketch_bits && // + lhs.hibf_config.tmax == rhs.hibf_config.tmax && // + lhs.hibf_config.alpha == rhs.hibf_config.alpha && // + lhs.hibf_config.max_rearrangement_ratio == rhs.hibf_config.max_rearrangement_ratio && // + lhs.hibf_config.disable_estimate_union == rhs.hibf_config.disable_estimate_union && // lhs.hibf_config.disable_rearrangement == rhs.hibf_config.disable_rearrangement; } @@ -93,7 +95,8 @@ static constexpr std::string_view config_string_view{"@CHOPPER_CONFIG\n" "@ \"version\": 1,\n" "@ \"number_of_user_bins\": 123456789,\n" "@ \"number_of_hash_functions\": 4,\n" - "@ \"maximum_false_positive_rate\": 0.0001,\n" + "@ \"maximum_fpr\": 0.0001,\n" + "@ \"relaxed_fpr\": 0.2,\n" "@ \"threads\": 31,\n" "@ \"sketch_bits\": 8,\n" "@ \"tmax\": 128,\n" diff --git a/test/api/display_layout/compute_ibf_size_test.cpp b/test/api/display_layout/compute_ibf_size_test.cpp index 2f91457b..f05638ed 100644 --- a/test/api/display_layout/compute_ibf_size_test.cpp +++ b/test/api/display_layout/compute_ibf_size_test.cpp @@ -39,7 +39,7 @@ TEST(compute_ibf_size_test, merged_bin_is_max_bin) seqan::hibf::build::build_data data{ .config = {.number_of_user_bins = 123, .number_of_hash_functions = hash, - .maximum_false_positive_rate = fpr, + .maximum_fpr = fpr, .threads = 1, .tmax = tmax}, .fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = fpr, .hash_count = hash, .t_max = tmax})}; @@ -79,7 +79,7 @@ TEST(compute_ibf_size_test, split_bin_is_max_bin) seqan::hibf::build::build_data data{ .config = {.number_of_user_bins = 123, .number_of_hash_functions = hash, - .maximum_false_positive_rate = fpr, + .maximum_fpr = fpr, .threads = 1, .tmax = tmax}, .fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = fpr, .hash_count = hash, .t_max = tmax})}; 
diff --git a/test/api/layout/execute_layout_test.cpp b/test/api/layout/execute_layout_test.cpp index d163dc3a..23fb64a1 100644 --- a/test/api/layout/execute_layout_test.cpp +++ b/test/api/layout/execute_layout_test.cpp @@ -84,7 +84,8 @@ TEST(execute_test, few_ubs) "@ \"version\": 1,\n" "@ \"number_of_user_bins\": 8,\n" "@ \"number_of_hash_functions\": 2,\n" - "@ \"maximum_false_positive_rate\": 0.05,\n" + "@ \"maximum_fpr\": 0.05,\n" + "@ \"relaxed_fpr\": 0.3,\n" "@ \"threads\": 1,\n" "@ \"sketch_bits\": 12,\n" "@ \"tmax\": 64,\n" @@ -292,7 +293,8 @@ TEST(execute_test, many_ubs) "@ \"version\": 1,\n" "@ \"number_of_user_bins\": 96,\n" "@ \"number_of_hash_functions\": 2,\n" - "@ \"maximum_false_positive_rate\": 0.05,\n" + "@ \"maximum_fpr\": 0.05,\n" + "@ \"relaxed_fpr\": 0.3,\n" "@ \"threads\": 1,\n" "@ \"sketch_bits\": 12,\n" "@ \"tmax\": 64,\n" @@ -303,7 +305,7 @@ TEST(execute_test, many_ubs) "@ }\n" "@}\n" "@HIBF_CONFIG_END\n" - "#TOP_LEVEL_IBF fullest_technical_bin_idx:26\n" + "#TOP_LEVEL_IBF fullest_technical_bin_idx:48\n" "#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:14\n" "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:14\n" "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:14\n" diff --git a/test/api/layout/execute_with_estimation_test.cpp b/test/api/layout/execute_with_estimation_test.cpp index aedbf281..1c551257 100644 --- a/test/api/layout/execute_with_estimation_test.cpp +++ b/test/api/layout/execute_with_estimation_test.cpp @@ -61,7 +61,8 @@ TEST(execute_estimation_test, few_ubs) R"expected_cout(## ### Parameters ### ## number of user bins = 8 ## number of hash functions = 2 -## false positive rate = 0.05 +## maximum false positive rate = 0.05 +## relaxed false positive rate = 0.3 ## ### Notation ### ## X-IBF = An IBF with X number of bins. ## X-HIBF = An HIBF with tmax = X, e.g a maximum of X technical bins on each level. 
@@ -116,7 +117,8 @@ TEST(execute_estimation_test, many_ubs) R"expected_cout(## ### Parameters ### ## number of user bins = 96 ## number of hash functions = 2 -## false positive rate = 0.05 +## maximum false positive rate = 0.05 +## relaxed false positive rate = 0.3 ## ### Notation ### ## X-IBF = An IBF with X number of bins. ## X-HIBF = An HIBF with tmax = X, e.g a maximum of X technical bins on each level. @@ -128,9 +130,9 @@ TEST(execute_estimation_test, many_ubs) ## (l*m)_tmax : Computed by l_tmax * m_tmax ## size : The expected total size of an tmax-HIBF # tmax c_tmax l_tmax m_tmax (l*m)_tmax size -64 1.00 1.25 1.00 1.25 75.4KiB -128 1.22 1.24 0.69 0.86 51.9KiB -256 1.33 1.33 0.76 1.02 57.5KiB +64 1.00 1.25 1.00 1.25 75.1KiB +128 1.22 1.24 0.68 0.85 51.4KiB +256 1.33 1.33 0.77 1.02 57.5KiB # Best t_max (regarding expected query runtime): 128 )expected_cout"); @@ -262,7 +264,8 @@ TEST(execute_estimation_test, many_ubs) "@ \"version\": 1,\n" "@ \"number_of_user_bins\": 96,\n" "@ \"number_of_hash_functions\": 2,\n" - "@ \"maximum_false_positive_rate\": 0.05,\n" + "@ \"maximum_fpr\": 0.05,\n" + "@ \"relaxed_fpr\": 0.3,\n" "@ \"threads\": 1,\n" "@ \"sketch_bits\": 12,\n" "@ \"tmax\": 128,\n" @@ -274,8 +277,8 @@ TEST(execute_estimation_test, many_ubs) "@}\n" "@HIBF_CONFIG_END\n" "#TOP_LEVEL_IBF fullest_technical_bin_idx:96\n" - "#LOWER_LEVEL_IBF_14 fullest_technical_bin_idx:24\n" - "#LOWER_LEVEL_IBF_15 fullest_technical_bin_idx:24\n" + "#LOWER_LEVEL_IBF_14 fullest_technical_bin_idx:0\n" + "#LOWER_LEVEL_IBF_15 fullest_technical_bin_idx:0\n" "#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS\n" "0\t0\t1\n" "19\t1\t1\n" @@ -291,12 +294,12 @@ TEST(execute_estimation_test, many_ubs) "9\t11\t1\n" "8\t12\t1\n" "7\t13\t1\n" - "4\t14;0\t1;24\n" - "5\t14;24\t1;20\n" - "6\t14;44\t1;20\n" - "1\t15;0\t1;24\n" - "2\t15;24\t1;20\n" - "3\t15;44\t1;20\n" + "4\t14;0\t1;22\n" + "5\t14;22\t1;21\n" + "6\t14;43\t1;21\n" + "1\t15;0\t1;22\n" + "2\t15;22\t1;21\n" + 
"3\t15;43\t1;21\n" "32\t16\t1\n" "33\t17\t1\n" "34\t18\t1\n" @@ -416,7 +419,8 @@ TEST(execute_estimation_test, many_ubs_force_all) R"expected_cout(## ### Parameters ### ## number of user bins = 96 ## number of hash functions = 2 -## false positive rate = 0.05 +## maximum false positive rate = 0.05 +## relaxed false positive rate = 0.3 ## ### Notation ### ## X-IBF = An IBF with X number of bins. ## X-HIBF = An HIBF with tmax = X, e.g a maximum of X technical bins on each level. @@ -428,9 +432,9 @@ TEST(execute_estimation_test, many_ubs_force_all) ## (l*m)_tmax : Computed by l_tmax * m_tmax ## size : The expected total size of an tmax-HIBF # tmax c_tmax l_tmax m_tmax (l*m)_tmax size -64 1.00 1.25 1.00 1.25 75.4KiB -128 1.22 1.24 0.69 0.86 51.9KiB -256 1.33 1.33 0.76 1.02 57.5KiB +64 1.00 1.25 1.00 1.25 75.1KiB +128 1.22 1.24 0.68 0.85 51.4KiB +256 1.33 1.33 0.77 1.02 57.5KiB # Best t_max (regarding expected query runtime): 128 )expected_cout"); @@ -520,7 +524,8 @@ TEST(execute_estimation_test, with_rearrangement) R"expected_cout(## ### Parameters ### ## number of user bins = 196 ## number of hash functions = 2 -## false positive rate = 0.05 +## maximum false positive rate = 0.05 +## relaxed false positive rate = 0.3 ## ### Notation ### ## X-IBF = An IBF with X number of bins. ## X-HIBF = An HIBF with tmax = X, e.g a maximum of X technical bins on each level. diff --git a/test/api/layout/hibf_statistics_test.cpp b/test/api/layout/hibf_statistics_test.cpp index a5e70312..e7f3821d 100644 --- a/test/api/layout/hibf_statistics_test.cpp +++ b/test/api/layout/hibf_statistics_test.cpp @@ -187,7 +187,8 @@ TEST(execute_test, chopper_layout_statistics_determine_best_bins) R"expected_cout(## ### Parameters ### ## number of user bins = 10 ## number of hash functions = 2 -## false positive rate = 0.05 +## maximum false positive rate = 0.05 +## relaxed false positive rate = 0.3 ## ### Notation ### ## X-IBF = An IBF with X number of bins. 
## X-HIBF = An HIBF with tmax = X, e.g a maximum of X technical bins on each level. diff --git a/test/cli/cli_chopper_pipeline_test.cpp b/test/cli/cli_chopper_pipeline_test.cpp index e44f5700..368a2e52 100644 --- a/test/cli/cli_chopper_pipeline_test.cpp +++ b/test/cli/cli_chopper_pipeline_test.cpp @@ -93,7 +93,8 @@ TEST_F(cli_test, chopper_layout) "@ \"version\": 1,\n" "@ \"number_of_user_bins\": 3,\n" "@ \"number_of_hash_functions\": 2,\n" - "@ \"maximum_false_positive_rate\": 0.05,\n" + "@ \"maximum_fpr\": 0.05,\n" + "@ \"relaxed_fpr\": 0.3,\n" "@ \"threads\": 2,\n" "@ \"sketch_bits\": 12,\n" "@ \"tmax\": 64,\n" @@ -193,7 +194,8 @@ TEST_F(cli_test, chopper_layout2) "@ \"version\": 1,\n" "@ \"number_of_user_bins\": 4,\n" "@ \"number_of_hash_functions\": 2,\n" - "@ \"maximum_false_positive_rate\": 0.05,\n" + "@ \"maximum_fpr\": 0.05,\n" + "@ \"relaxed_fpr\": 0.3,\n" "@ \"threads\": 2,\n" "@ \"sketch_bits\": 12,\n" "@ \"tmax\": 64,\n" @@ -204,10 +206,10 @@ TEST_F(cli_test, chopper_layout2) "@ }\n" "@}\n" "@HIBF_CONFIG_END\n" - "#TOP_LEVEL_IBF fullest_technical_bin_idx:54\n" + "#TOP_LEVEL_IBF fullest_technical_bin_idx:16\n" "#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS\n" - "1\t0\t15\n" - "3\t15\t24\n" + "1\t0\t16\n" + "3\t16\t23\n" "2\t39\t15\n" "0\t54\t10\n"};