diff --git a/src/display_layout/sizes.cpp b/src/display_layout/sizes.cpp index 5a0a734e..a0cee4f1 100644 --- a/src/display_layout/sizes.cpp +++ b/src/display_layout/sizes.cpp @@ -5,6 +5,7 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include #include #include @@ -26,11 +27,85 @@ #include "shared.hpp" -struct stats +struct ibf_stats { - std::vector ibf_sizes{}; - std::vector ibf_levels{}; - std::vector ibf_load_factor{}; + size_t size_in_bits{}; + size_t level{}; + size_t max_elements{}; + size_t tbs_too_big{}; + size_t tbs_too_many_elements{}; + double load_factor{}; +}; + +struct per_level_stats +{ + size_t number_of_levels{}; + std::vector size_in_bits{}; + std::vector max_elements{}; + std::vector tbs_too_big{}; + std::vector tbs_too_many_elements{}; + std::vector num_ibfs{}; + std::vector load_factor{}; + + explicit per_level_stats(std::vector const & stats) + { + number_of_levels = std::ranges::max(stats, + [](ibf_stats const & lhs, ibf_stats const & rhs) + { + return lhs.level < rhs.level; + }) + .level + + 1u; + + size_in_bits.resize(number_of_levels); + max_elements.resize(number_of_levels); + tbs_too_big.resize(number_of_levels); + tbs_too_many_elements.resize(number_of_levels); + num_ibfs.resize(number_of_levels); + load_factor.resize(number_of_levels); + + for (size_t i = 0; i < stats.size(); ++i) + { + auto const & stat = stats[i]; + size_in_bits[stat.level] += stat.size_in_bits; + max_elements[stat.level] += stat.max_elements; + tbs_too_big[stat.level] += stat.tbs_too_big; + tbs_too_many_elements[stat.level] += stat.tbs_too_many_elements; + load_factor[stat.level] += stat.load_factor; + ++num_ibfs[stat.level]; + } + + for (size_t i = 0; i < number_of_levels; ++i) + { + load_factor[i] /= num_ibfs[i]; + max_elements[i] /= num_ibfs[i]; + tbs_too_many_elements[i] /= num_ibfs[i]; + } + } + + void print(std::ostream & stream) const + { + stream << std::fixed << std::setprecision(2); + stream << "# Levels: " << number_of_levels << '\n'; + stream << "LEVEL\t" // + << "BIT_SIZE\t" // + << "IBFS\t" // + << "AVG_LOAD_FACTOR\t" // + << "TBS_TOO_BIG\t" // + << "AVG_TBS_TOO_BIG_ELEMENTS\t" // + << "AVG_MAX_ELEMENTS\n"; + + for (size_t idx = 0; idx < number_of_levels; ++idx) + { + stream << idx << '\t' // + << size_in_bits[idx] << '\t' // + << num_ibfs[idx] << '\t' // + << load_factor[idx] * 100 << '\t' // + << tbs_too_big[idx] << '\t' // + << tbs_too_many_elements[idx] << '\t' // + << max_elements[idx] << '\n'; + } + } }; void compute_kmers(robin_hood::unordered_flat_set & kmers, @@ -68,7 +143,7 @@ size_t compute_ibf_size(robin_hood::unordered_flat_set & parent_kmers, return ibf_size; } -size_t hierarchical_stats(stats & stats_data, +size_t hierarchical_stats(std::vector & stats, robin_hood::unordered_flat_set & parent_kmers, seqan::hibf::layout::graph::node const & current_node, seqan::hibf::build::build_data & data, @@ -84,7 +159,7 @@ size_t hierarchical_stats(stats & stats_data, if (current_node.favourite_child_idx.has_value()) // max bin is a merged bin { // recursively initialize favourite child first - hierarchical_stats(stats_data, + hierarchical_stats(stats, kmers, current_node.children[current_node.favourite_child_idx.value()], data, @@ -105,6 +180,8 @@ size_t hierarchical_stats(stats & stats_data, size_t const max_bin_tbs = initialise_max_bin_kmers(); size_t const ibf_size = compute_ibf_size(parent_kmers, kmers, max_bin_tbs, current_node, data, current_hibf_level); size_t const amount_of_max_bin_kmers = kmers.size(); + std::atomic tbs_too_big{}; + std::atomic tbs_too_many_elements{}; kmers.clear(); // reduce memory peak // parse all other children (merged bins) of the current ibf @@ -137,17 +214,24 @@ size_t hierarchical_stats(stats & stats_data, { auto & child = current_node.children[index]; - robin_hood::unordered_flat_set kmers2{}; - hierarchical_stats(stats_data, kmers2, child, data, current_hibf_level + 1u); + robin_hood::unordered_flat_set local_kmers{}; + hierarchical_stats(stats, local_kmers, child, data, current_hibf_level + 1u); if (current_hibf_level > 0 /* not top level */) { std::lock_guard guard{merge_kmer_mutex}; - update_parent_kmers(parent_kmers, kmers2); + update_parent_kmers(parent_kmers, local_kmers); } - size_waste_per_tb[child.parent_bin_index] = - amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers2.size()); + if (amount_of_max_bin_kmers < local_kmers.size()) + { + ++tbs_too_big; + tbs_too_many_elements += local_kmers.size() - amount_of_max_bin_kmers; + } + else + { + size_waste_per_tb[child.parent_bin_index] = amount_of_max_bin_kmers - local_kmers.size(); + } } } @@ -160,11 +244,18 @@ size_t hierarchical_stats(stats & stats_data, compute_kmers(kmers, data, record); size_t const kmers_per_tb = kmers.size() / record.number_of_technical_bins + 1u; - size_t const diff = amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers_per_tb); - assert(size_waste_per_tb.size() >= record.storage_TB_id + record.number_of_technical_bins); - for (size_t i = record.storage_TB_id; i < record.storage_TB_id + record.number_of_technical_bins; ++i) - size_waste_per_tb[i] = diff; + if (amount_of_max_bin_kmers < kmers_per_tb) + { + tbs_too_big += record.number_of_technical_bins; + tbs_too_many_elements += (kmers_per_tb - amount_of_max_bin_kmers) * record.number_of_technical_bins; + } + else + { + assert(size_waste_per_tb.size() >= record.storage_TB_id + record.number_of_technical_bins); + for (size_t i = record.storage_TB_id; i < record.storage_TB_id + record.number_of_technical_bins; ++i) + size_waste_per_tb[i] = amount_of_max_bin_kmers - kmers_per_tb; + } if (current_hibf_level > 0 /* not top level */) update_parent_kmers(parent_kmers, kmers); @@ -172,38 +263,45 @@ size_t hierarchical_stats(stats & stats_data, kmers.clear(); } - stats_data.ibf_sizes[ibf_pos] = ibf_size; - stats_data.ibf_levels[ibf_pos] = current_hibf_level; - - // compute load factor - size_t const waste = std::accumulate(size_waste_per_tb.begin(), size_waste_per_tb.end(), size_t{}); - size_t const total = amount_of_max_bin_kmers * size_waste_per_tb.size(); - - assert(waste <= total); - - stats_data.ibf_load_factor[ibf_pos] = (total - waste) / static_cast(total); + double const load_factor = [&]() + { + size_t const waste = std::accumulate(size_waste_per_tb.begin(), size_waste_per_tb.end(), size_t{}); + size_t const total = amount_of_max_bin_kmers * current_node.number_of_technical_bins; + assert(waste <= total); + assert(total > 0u); + return (total - waste) / static_cast(total); + }(); + + auto & current_stats = stats[ibf_pos]; + current_stats.size_in_bits = ibf_size; + current_stats.level = current_hibf_level; + current_stats.max_elements = amount_of_max_bin_kmers; + current_stats.tbs_too_big = tbs_too_big.load(); + current_stats.tbs_too_many_elements = tbs_too_many_elements.load(); + current_stats.load_factor = load_factor; return ibf_pos; } -size_t hierarchical_stats(stats & stats_data, +size_t hierarchical_stats(std::vector & stats, seqan::hibf::layout::graph::node const & root_node, seqan::hibf::build::build_data & data) { robin_hood::unordered_flat_set root_kmers{}; - return hierarchical_stats(stats_data, root_kmers, root_node, data, 0); + return hierarchical_stats(stats, root_kmers, root_node, data, 0); } void execute_general_stats(config const & cfg) { + // Read config and layout std::ifstream layout_file{cfg.input}; - if (!layout_file.good() || !layout_file.is_open()) throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); - chopper_config.hibf_config.threads = cfg.threads; + // Prepare configs + chopper_config.hibf_config.threads = cfg.threads; auto input_lambda = [&filenames, &chopper_config](size_t const user_bin_id, seqan::hibf::insert_iterator it) { std::vector current_kmers; @@ -218,56 +316,33 @@ void execute_general_stats(config const & cfg) it = kmer; }; chopper_config.hibf_config.input_fn = input_lambda; - auto const & hibf_config = chopper_config.hibf_config; - std::ofstream output_stream{cfg.output}; - - if (!output_stream.good() || !output_stream.is_open()) - throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; - + // Prepare stats size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u; + std::vector stats(number_of_ibfs); - stats stats_data; - stats_data.ibf_sizes.resize(number_of_ibfs); - stats_data.ibf_levels.resize(number_of_ibfs); - stats_data.ibf_load_factor.resize(number_of_ibfs); - + // Prepare data seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}}; - seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root; - size_t const t_max{root_node.number_of_technical_bins}; data.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = hibf_config.maximum_false_positive_rate, .hash_count = hibf_config.number_of_hash_functions, .t_max = t_max}); - hierarchical_stats(stats_data, root_node, data); + // Get stats + hierarchical_stats(stats, root_node, data); - // accumulate statistics per level - size_t const number_of_levels = std::ranges::max(stats_data.ibf_levels) + 1u; - std::vector size_per_level(number_of_levels); - std::vector load_factor_per_level(number_of_levels); - std::vector num_ibfs_per_level(number_of_levels); + // Get stats per level + per_level_stats const level_stats{stats}; - for (size_t i = 0; i < number_of_ibfs; ++i) - { - size_t const ibf_level{stats_data.ibf_levels[i]}; - size_per_level[ibf_level] += stats_data.ibf_sizes[i]; - load_factor_per_level[ibf_level] += stats_data.ibf_load_factor[i]; - ++num_ibfs_per_level[ibf_level]; - } - - for (size_t i = 0; i < number_of_levels; ++i) - load_factor_per_level[i] /= num_ibfs_per_level[i]; + // Output + std::ofstream output_stream{cfg.output}; + if (!output_stream.good() || !output_stream.is_open()) + throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; - output_stream << std::fixed << std::setprecision(2); - output_stream << "# Levels: " << number_of_levels << '\n'; - output_stream << "LEVEL\tBIT_SIZE\tIBFS\tLOAD_FACTOR\n"; - for (size_t i = 0; i < number_of_levels; ++i) - output_stream << i << '\t' << size_per_level[i] << '\t' << num_ibfs_per_level[i] << '\t' - << load_factor_per_level[i] * 100 << '\n'; + level_stats.print(output_stream); } void execute_sizes(config const & cfg)