From 365218d9fef0850e0c1882bd738040d22de516bb Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Tue, 26 Sep 2023 10:06:41 +0200 Subject: [PATCH 1/4] [MISC] Display_layout: Use canonical kmers for size computation. --- src/display_layout.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/display_layout.cpp b/src/display_layout.cpp index 17e57b46..bb6cb6dd 100644 --- a/src/display_layout.cpp +++ b/src/display_layout.cpp @@ -101,11 +101,14 @@ void process_file(std::string const & filename, { sequence_file_type fin{filename}; + auto minimizer_view = seqan3::views::minimiser_hash(seqan3::ungapped{kmer_size}, + seqan3::window_size{kmer_size}, + seqan3::seed{adjust_seed(seqan3::ungapped{kmer_size}.count())}); if (fill_current_kmers) { for (auto && [seq] : fin) { - for (uint64_t hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size})) + for (uint64_t hash_value : seq | minimizer_view) { current_kmers.push_back(hash_value); sketch.add(reinterpret_cast(&hash_value), sizeof(hash_value)); @@ -116,7 +119,7 @@ void process_file(std::string const & filename, { for (auto && [seq] : fin) { - for (uint64_t hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size})) + for (uint64_t hash_value : seq | minimizer_view) { sketch.add(reinterpret_cast(&hash_value), sizeof(hash_value)); } From f50468ffe83c5fff9667b48fce7f48d18429d63f Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Tue, 26 Sep 2023 11:06:32 +0200 Subject: [PATCH 2/4] [FEATURE] Display_layout: new statistics to evaluate layout. --- src/display_layout.cpp | 312 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 302 insertions(+), 10 deletions(-) diff --git a/src/display_layout.cpp b/src/display_layout.cpp index bb6cb6dd..57349014 100644 --- a/src/display_layout.cpp +++ b/src/display_layout.cpp @@ -15,17 +15,30 @@ #include #include -#include +#include +#include #include #include +#include +#include +#include +#include +#include #include struct config { std::filesystem::path input{}; - std::filesystem::path output{}; + std::filesystem::path output_prefix{}; +}; + +struct stats +{ + std::vector ibf_sizes{}; + std::vector ibf_levels{}; + std::vector ibf_load_factor{}; }; static void print_progress(size_t const percentage) @@ -101,9 +114,10 @@ void process_file(std::string const & filename, { sequence_file_type fin{filename}; - auto minimizer_view = seqan3::views::minimiser_hash(seqan3::ungapped{kmer_size}, + seqan3::shape shape{seqan3::ungapped{kmer_size}}; + auto minimizer_view = seqan3::views::minimiser_hash(shape, seqan3::window_size{kmer_size}, - seqan3::seed{adjust_seed(seqan3::ungapped{kmer_size}.count())}); + seqan3::seed{chopper::adjust_seed(shape.count())}); if (fill_current_kmers) { for (auto && [seq] : fin) @@ -138,10 +152,16 @@ int execute(config const & cfg) auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); auto const & hibf_config = chopper_config.hibf_config; - std::ofstream output_stream{cfg.output}; + std::filesystem::path const output_path = [&cfg]() + { + std::filesystem::path output_path{cfg.output_prefix}; + output_path += ".stats"; + return output_path; + }(); + std::ofstream output_stream{output_path}; if (!output_stream.good() || !output_stream.is_open()) - throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; // GCOVR_EXCL_LINE + throw std::logic_error{"Could not open file " + output_path.string() + " for reading"}; // GCOVR_EXCL_LINE // Fetch all file sizes such that sorting by file size doesn't have to access the filesystem too often. // n = filenames.size() @@ -297,6 +317,277 @@ int execute(config const & cfg) return 0; } +void compute_kmers(robin_hood::unordered_flat_set & kmers, + seqan::hibf::build::build_data const & data, + seqan::hibf::layout::layout::user_bin const & record) +{ + data.config.input_fn(record.idx, std::inserter(kmers, kmers.begin())); +} + +void update_parent_kmers(robin_hood::unordered_flat_set & parent_kmers, + robin_hood::unordered_flat_set const & kmers) +{ + parent_kmers.insert(kmers.begin(), kmers.end()); +} + +size_t compute_ibf_size(robin_hood::unordered_flat_set & parent_kmers, + robin_hood::unordered_flat_set & kmers, + size_t const number_of_bins, + seqan::hibf::layout::graph::node const & ibf_node, + seqan::hibf::build::build_data & data, + size_t current_hibf_level) +{ + size_t const kmers_per_bin{static_cast(std::ceil(static_cast(kmers.size()) / number_of_bins))}; + double const bin_bits{ + static_cast(seqan::hibf::build::bin_size_in_bits({.fpr = data.config.maximum_false_positive_rate, + .hash_count = data.config.number_of_hash_functions, + .elements = kmers_per_bin}))}; + seqan::hibf::bin_size const bin_size{ + static_cast(std::ceil(bin_bits * data.fpr_correction[number_of_bins]))}; + seqan::hibf::bin_count const bin_count{ibf_node.number_of_technical_bins}; + + size_t const bin_words = (bin_count.value + 63) >> 6; // = ceil(bins/64) + size_t const technical_bins = bin_words << 6; // = bin_words * 64 + + size_t const ibf_size = technical_bins * bin_size.value; + + if (current_hibf_level > 0 /* not top level */) + update_parent_kmers(parent_kmers, kmers); + + return ibf_size; +} + +size_t hierarchical_stats(stats & stats_data, + robin_hood::unordered_flat_set & parent_kmers, + seqan::hibf::layout::graph::node const & current_node, + seqan::hibf::build::build_data & data, + size_t current_hibf_level) +{ + size_t const ibf_pos{data.request_ibf_idx()}; + std::vector size_waste_per_tb(current_node.number_of_technical_bins); + + robin_hood::unordered_flat_set kmers{}; + + auto initialise_max_bin_kmers = [&]() -> size_t + { + if (current_node.favourite_child_idx.has_value()) // max bin is a merged bin + { + // recursively initialize favourite child first + hierarchical_stats(stats_data, + kmers, + current_node.children[current_node.favourite_child_idx.value()], + data, + current_hibf_level + 1); + return 1; + } + else // max bin is not a merged bin + { + // we assume that the max record is at the beginning of the list of remaining records. + auto const & record = current_node.remaining_records[0]; + compute_kmers(kmers, data, record); + + return record.number_of_technical_bins; + } + }; + + // initialize lower level IBF + size_t const max_bin_tbs = initialise_max_bin_kmers(); + size_t const ibf_size = compute_ibf_size(parent_kmers, kmers, max_bin_tbs, current_node, data, current_hibf_level); + size_t const amount_of_max_bin_kmers = kmers.size(); + kmers.clear(); // reduce memory peak + + // parse all other children (merged bins) of the current ibf + auto loop_over_children = [&]() + { + if (current_node.children.empty()) + return; + + std::vector children = current_node.children; // copy for threads + + size_t const number_of_mutex = (current_node.number_of_technical_bins + 63) / 64; + std::vector local_ibf_mutex(number_of_mutex); + + size_t number_of_threads{}; + std::vector indices(children.size()); + std::iota(indices.begin(), indices.end(), size_t{}); + + // We do not want to process the favourite child. It has already been processed prior. + // https://godbolt.org/z/6Yav7hrG1 + if (current_node.favourite_child_idx.has_value()) + std::erase(indices, current_node.favourite_child_idx.value()); + + if (current_hibf_level == 0) + { + // Shuffle indices: More likely to not block each other. Optimal: Interleave + std::shuffle(indices.begin(), indices.end(), std::mt19937_64{std::random_device{}()}); + number_of_threads = data.config.threads; + } + else + { + number_of_threads = 1u; + } + +#pragma omp parallel for schedule(dynamic, 1) num_threads(number_of_threads) + for (auto && index : indices) + { + auto & child = children[index]; + + robin_hood::unordered_flat_set kmers{}; + hierarchical_stats(stats_data, kmers, child, data, current_hibf_level + 1); + auto parent_bin_index = child.parent_bin_index; + + { + size_t const mutex_id{parent_bin_index / 64}; + std::lock_guard guard{local_ibf_mutex[mutex_id]}; + + if (current_hibf_level > 0 /* not top level */) + update_parent_kmers(parent_kmers, kmers); + + assert(amount_of_max_bin_kmers >= kmers.size()); + size_waste_per_tb[parent_bin_index] = + amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers.size()); + } + } + }; + + loop_over_children(); + + // If max bin was a merged bin, process all remaining records, otherwise the first one has already been processed + size_t const start{(current_node.favourite_child_idx.has_value()) ? 0u : 1u}; + for (size_t i = start; i < current_node.remaining_records.size(); ++i) + { + auto const & record = current_node.remaining_records[i]; + + compute_kmers(kmers, data, record); + + size_t const kmers_per_tb = kmers.size() / record.number_of_technical_bins; + size_t const diff = amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers_per_tb); + + if (diff > amount_of_max_bin_kmers) + throw std::runtime_error{"Wrong? diff:" + std::to_string(diff) + + " amount_of_max_bin_kmers:" + std::to_string(amount_of_max_bin_kmers)}; + + // size_t const kmers_per_tb_corrected = kmers_per_tb - (0.03 * kmers_per_tb); + // if (amount_of_max_bin_kmers < kmers_per_tb_corrected) + // { + // throw std::runtime_error{"amount_of_max_bin_kmers:" + std::to_string(amount_of_max_bin_kmers) + // + " was smaller than current size: " + std::to_string(kmers.size()) + "/" + // + std::to_string(record.number_of_technical_bins) + "=" + // + std::to_string(kmers.size() / record.number_of_technical_bins)}; + // } + + assert(size_waste_per_tb.size() >= record.storage_TB_id + record.number_of_technical_bins); + for (size_t i = record.storage_TB_id; i < record.storage_TB_id + record.number_of_technical_bins; ++i) + size_waste_per_tb[i] = diff; + + if (current_hibf_level > 0 /* not top level */) + update_parent_kmers(parent_kmers, kmers); + + kmers.clear(); + } + + stats_data.ibf_sizes[ibf_pos] = ibf_size; + stats_data.ibf_levels[ibf_pos] = current_hibf_level; + // compute load factor + size_t const waste = std::accumulate(size_waste_per_tb.begin(), size_waste_per_tb.end(), size_t{}); + size_t const total = amount_of_max_bin_kmers * size_waste_per_tb.size(); + + if (waste > total) + throw std::runtime_error{"Wrong? waste:" + std::to_string(waste) + " total:" + std::to_string(total)}; + + stats_data.ibf_load_factor[ibf_pos] = (total - waste) / static_cast(total); + + return ibf_pos; +} + +size_t hierarchical_stats(stats & stats_data, + seqan::hibf::layout::graph::node const & root_node, + seqan::hibf::build::build_data & data) +{ + robin_hood::unordered_flat_set root_kmers{}; + return hierarchical_stats(stats_data, root_kmers, root_node, data, 0); +} + +void execute_general_stats(config const & cfg) +{ + std::ifstream layout_file{cfg.input}; + + if (!layout_file.good() || !layout_file.is_open()) + throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; // GCOVR_EXCL_LINE + + auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); + + auto input_lambda = [&filenames, &chopper_config](size_t const user_bin_id, seqan::hibf::insert_iterator it) + { + std::vector current_kmers; + seqan::hibf::sketch::hyperloglog sketch; + + if (filenames[user_bin_id].size() > 1) + throw std::runtime_error{"No multi files accepted yet."}; + + process_file(filenames[user_bin_id][0], current_kmers, sketch, true, chopper_config.k); + + for (auto const kmer : current_kmers) + it = kmer; + }; + chopper_config.hibf_config.input_fn = input_lambda; + + auto const & hibf_config = chopper_config.hibf_config; + + std::filesystem::path const output_path = [&cfg]() + { + std::filesystem::path output_path{cfg.output_prefix}; + output_path += ".general_stats"; + return output_path; + }(); + std::ofstream output_stream{output_path}; + + if (!output_stream.good() || !output_stream.is_open()) + throw std::logic_error{"Could not open file " + output_path.string() + " for reading"}; // GCOVR_EXCL_LINE + + size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u; + + stats stats_data; + stats_data.ibf_sizes.resize(number_of_ibfs); + stats_data.ibf_levels.resize(number_of_ibfs); + stats_data.ibf_load_factor.resize(number_of_ibfs, 1.0); + + seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}}; + + seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root; + + size_t const t_max{root_node.number_of_technical_bins}; + data.fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = hibf_config.maximum_false_positive_rate, + .hash_count = hibf_config.number_of_hash_functions, + .t_max = t_max}); + + hierarchical_stats(stats_data, root_node, data); + + // accumulate statistics per level + size_t const number_of_levels = std::ranges::max(stats_data.ibf_levels) + 1u; + std::vector size_per_level(number_of_levels, 0u); + std::vector load_factor_per_level(number_of_levels, 0.0); + std::vector num_ibfs_per_level(number_of_levels, 0u); + + for (size_t i = 0; i < stats_data.ibf_sizes.size(); ++i) + { + size_t const ibf_level{stats_data.ibf_levels[i]}; + size_per_level[ibf_level] += stats_data.ibf_sizes[i]; + load_factor_per_level[ibf_level] += stats_data.ibf_load_factor[i]; + ++num_ibfs_per_level[ibf_level]; + } + + for (size_t i = 0; i < number_of_levels; ++i) + load_factor_per_level[i] /= num_ibfs_per_level[i]; + + output_stream << "#Levels: " << number_of_levels << '\n'; + output_stream << "LEVEL-IDX\tSIZE-IN-BITS\tNUMBER-OF-IBFS\tLOAD-FACTOR-IN-%\n"; + for (size_t i = 0; i < number_of_levels; ++i) + output_stream << i << '\t' << size_per_level[i] << '\t' << num_ibfs_per_level[i] << '\t' + << load_factor_per_level[i] << '\n'; +} + inline void set_up_parser(sharg::parser & parser, config & cfg) { parser.info.version = "1.0.0"; @@ -314,10 +605,10 @@ inline void set_up_parser(sharg::parser & parser, config & cfg) .description = "The input must be a layout file computed via chopper layout or raptor layout. ", .required = true, .validator = sharg::input_file_validator{}}); - parser.add_option(cfg.output, + parser.add_option(cfg.output_prefix, sharg::config{.short_id = '\0', - .long_id = "output", - .description = "The output. ", + .long_id = "output-prefix", + .description = "The output-prefix. Create .stats and .general_stats file. ", .required = true, .validator = sharg::output_file_validator{}}); } @@ -340,5 +631,6 @@ int main(int argc, char const * argv[]) return -1; } - execute(cfg); + // execute(cfg); + execute_general_stats(cfg); } From f4ab896a6a5bfc5043011c322bb46bc1e9f3e43b Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Sat, 7 Oct 2023 14:45:42 +0200 Subject: [PATCH 3/4] [FEATURE] Add display_layout submodules. --- .clang-format | 4 +- src/CMakeLists.txt | 3 +- src/display_layout.cpp | 636 -------------------------- src/display_layout/CMakeLists.txt | 11 + src/display_layout/display_layout.cpp | 96 ++++ src/display_layout/general.cpp | 225 +++++++++ src/display_layout/process_file.cpp | 84 ++++ src/display_layout/shared.hpp | 31 ++ src/display_layout/sizes.cpp | 276 +++++++++++ 9 files changed, 727 insertions(+), 639 deletions(-) delete mode 100644 src/display_layout.cpp create mode 100644 src/display_layout/CMakeLists.txt create mode 100644 src/display_layout/display_layout.cpp create mode 100644 src/display_layout/general.cpp create mode 100644 src/display_layout/process_file.cpp create mode 100644 src/display_layout/shared.hpp create mode 100644 src/display_layout/sizes.cpp diff --git a/.clang-format b/.clang-format index 85026442..f82ea95b 100644 --- a/.clang-format +++ b/.clang-format @@ -100,8 +100,10 @@ IncludeCategories: Priority: 9 - Regex: ' -#include - -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -struct config -{ - std::filesystem::path input{}; - std::filesystem::path output_prefix{}; -}; - -struct stats -{ - std::vector ibf_sizes{}; - std::vector ibf_levels{}; - std::vector ibf_load_factor{}; -}; - -static void print_progress(size_t const percentage) -{ - assert(percentage <= 100u); - std::cerr << '['; - for (size_t i{}; i < percentage; ++i) - std::cerr << '='; - if (percentage < 100u) - { - std::cerr << '>'; - for (size_t i{1u}; i < 100u - percentage; ++i) - std::cerr << ' '; - } - std::cerr << "] " << percentage << " %\r" << std::flush; -} - -struct dna4_traits : public seqan3::sequence_file_input_default_traits_dna -{ - using sequence_alphabet = seqan3::dna4; -}; - -using sequence_file_type = seqan3::sequence_file_input, - seqan3::type_list>; - -// Using two sets and erasing from shared is slower -void keep_duplicates(robin_hood::unordered_set & shared, std::vector const & current) -{ - // static + calling clear is slower - robin_hood::unordered_set result{}; - - for (uint64_t value : current) - { - if (shared.contains(value)) - result.emplace(value); - } - - shared = std::move(result); -} - -void process_file(std::string const & filename, - std::vector & current_kmers, - seqan::hibf::sketch::hyperloglog & sketch, - bool const fill_current_kmers, - uint8_t const kmer_size) -{ - if (filename.ends_with(".minimiser")) - { - uint64_t hash{}; - char * const hash_data{reinterpret_cast(&hash)}; - std::streamsize const hash_bytes{sizeof(hash)}; - - std::ifstream infile{filename, std::ios::binary}; - - if (fill_current_kmers) - { - while (infile.read(hash_data, hash_bytes)) - { - current_kmers.push_back(hash); - sketch.add(hash_data, hash_bytes); - } - } - else - { - while (infile.read(hash_data, hash_bytes)) - { - sketch.add(hash_data, hash_bytes); - } - } - } - else - { - sequence_file_type fin{filename}; - - seqan3::shape shape{seqan3::ungapped{kmer_size}}; - auto minimizer_view = seqan3::views::minimiser_hash(shape, - seqan3::window_size{kmer_size}, - seqan3::seed{chopper::adjust_seed(shape.count())}); - if (fill_current_kmers) - { - for (auto && [seq] : fin) - { - for (uint64_t hash_value : seq | minimizer_view) - { - current_kmers.push_back(hash_value); - sketch.add(reinterpret_cast(&hash_value), sizeof(hash_value)); - } - } - } - else - { - for (auto && [seq] : fin) - { - for (uint64_t hash_value : seq | minimizer_view) - { - sketch.add(reinterpret_cast(&hash_value), sizeof(hash_value)); - } - } - } - } -} - -int execute(config const & cfg) -{ - std::ifstream layout_file{cfg.input}; - - if (!layout_file.good() || !layout_file.is_open()) - throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; // GCOVR_EXCL_LINE - - auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); - auto const & hibf_config = chopper_config.hibf_config; - - std::filesystem::path const output_path = [&cfg]() - { - std::filesystem::path output_path{cfg.output_prefix}; - output_path += ".stats"; - return output_path; - }(); - std::ofstream output_stream{output_path}; - - if (!output_stream.good() || !output_stream.is_open()) - throw std::logic_error{"Could not open file " + output_path.string() + " for reading"}; // GCOVR_EXCL_LINE - - // Fetch all file sizes such that sorting by file size doesn't have to access the filesystem too often. - // n = filenames.size() - // Constructing this vector has `n` filesystem accesses. - // Sorting without pre-fetching has `O(n * log(n))` accesses. - std::vector const filesizes{[&filenames]() - { - std::vector result{}; - result.reserve(filenames.size()); - for (auto const & filename : filenames) - result.push_back(std::filesystem::file_size(filename.front())); - return result; - }()}; - - // Sorts by the technical bin indices in the top-level IBF: - // split bins: storage_TB_id, previous_TB_indices is empty - // merged bins: previous_TB_indices[0] - // If the index is the same, sort by file sizes (happens for merged bins). - // Using the smallest file to initialise the shared k-mers later will be less work. - std::ranges::sort( - hibf_layout.user_bins, - [&filesizes](seqan::hibf::layout::layout::user_bin const & lhs, - seqan::hibf::layout::layout::user_bin const & rhs) - { - size_t const first_idx = lhs.previous_TB_indices.empty() ? lhs.storage_TB_id : lhs.previous_TB_indices[0]; - size_t const second_idx = rhs.previous_TB_indices.empty() ? rhs.storage_TB_id : rhs.previous_TB_indices[0]; - return first_idx < second_idx || (first_idx == second_idx && filesizes[lhs.idx] < filesizes[rhs.idx]); - }); - - seqan::hibf::sketch::hyperloglog sketch{hibf_config.sketch_bits}; - robin_hood::unordered_set shared_kmers{}; - // We can't use `shared_kmers.size() == 0` instead of `shared_kmers_initialised`, because keep_duplicates - // will result in a size of 0 when there are no shared k-mers. - bool shared_kmers_initialised{false}; - std::vector current_kmers{}; - size_t ub_count{}; // How many user bins are stored in the current technical bin? Always 1 for split bins. - size_t split_count{}; // Into how many techincal bins is the user bin split? Always 1 for merged bins. - - std::vector bin_kinds( - hibf_config.tmax, - chopper::layout::hibf_statistics::bin_kind::split); - - for (auto const & max_bin : hibf_layout.max_bins) - { - // max_bin.previous_TB_indices.size() == 1: true for merged bins, false for split bins - // max_bin.previous_TB_indices[0]: technical bin index on the top-level - if (max_bin.previous_TB_indices.size() == 1) - { - bin_kinds[max_bin.previous_TB_indices[0]] = chopper::layout::hibf_statistics::bin_kind::merged; - } - } - - size_t const total_ub{hibf_layout.user_bins.size()}; // For progress bar - size_t const ub_percentage{std::max(1u, total_ub / 100u)}; // For progress bar, 1 % of all user bins - - size_t current_idx{}; // The current top-level technical bin index - - // Stats file header - output_stream << "# Layout: " << cfg.input.c_str() << '\n' // - << "tb_index\t" - << "size\t" - << "shared_size\t" - << "ub_count\t" - << "kind\t" - << "splits" << '\n'; - - auto print_result_line = [&]() - { - bool const is_merged{bin_kinds[current_idx] == chopper::layout::hibf_statistics::bin_kind::merged}; - size_t const avg_kmer_count = (sketch.estimate() + split_count - 1u) / split_count; - - for (size_t i{}, total{split_count}; i < total; ++i) - { - output_stream << current_idx + i << '\t' // - << avg_kmer_count << '\t' // - << shared_kmers.size() << '\t' // - << ub_count << '\t' // - << (is_merged ? "merged" : "split") << '\t' // - << split_count << '\n'; - split_count = 0u; // Subsequent split bins display 0, the first split bin displays the actual split count. - } - }; - - // Iterate over all user bins. They are sorted by their technical bin index in the top-level. - // Hence, we can process one top-level technical bin, print the results, clear our stored data and continue with - // the next. - for (size_t ub_index{}; ub_index < hibf_layout.user_bins.size(); ++ub_index) - { - if (ub_index % ub_percentage == 0) - print_progress(ub_index / ub_percentage); - - auto const & user_bin = hibf_layout.user_bins[ub_index]; - current_kmers.clear(); - - // The top-level technical bin index for the current user bin. - // user_bin.previous_TB_indices.size() == 0: true for split bins, false for merged bins - // user_bin.storage_TB_id: technical bin index on the lowest level - // user_bin.previous_TB_indices[0]: technical bin index on the top-level - size_t const idx = - (user_bin.previous_TB_indices.size() == 0) ? user_bin.storage_TB_id : user_bin.previous_TB_indices[0]; - - // We processed all user bins that belong to the `current_idx`th top-level technical bin. - // Print results and update data. - if (idx != current_idx) - { - print_result_line(); - sketch.clear(); - shared_kmers.clear(); - shared_kmers_initialised = false; - ub_count = 0u; - split_count = 0u; - current_idx = idx; - } - - bool const is_merged = bin_kinds[idx] == chopper::layout::hibf_statistics::bin_kind::merged; - // For user bins in a merged bin, `user_bin.number_of_technical_bins` is the number of technical bins on - // the lowest level. A user bin could be part of a merged bin on the top-level, but still be split into - // `user_bin.number_of_technical_bins` many bins on the lowest level. - split_count = is_merged ? 1u : user_bin.number_of_technical_bins; - - // We don't need to keep the current_kmers if there are no shared k-mers to merge them with. - bool const fill_current_kmers = is_merged && !(shared_kmers_initialised && shared_kmers.empty()); - - for (auto const & filename : filenames[user_bin.idx]) - { - ++ub_count; // This assumes that each user bin has exactly one associated file. Currently the case. - - process_file(filename, current_kmers, sketch, fill_current_kmers, chopper_config.k); - } - - // Compute set intersection: shared_kmers = shared_kmers ∩ current_kmers - // This happens for each user bin that belongs to a merged bin. - if (fill_current_kmers) - { - if (!shared_kmers_initialised) - { - shared_kmers_initialised = true; - shared_kmers.insert(current_kmers.begin(), current_kmers.end()); - } - else - { - keep_duplicates(shared_kmers, current_kmers); - } - } - } - - // Print results for the last top-level technical bin. - print_result_line(); - - // The progress bar uses a carriage return '\r' to only use a single line. - std::cerr << '\n'; - - return 0; -} - -void compute_kmers(robin_hood::unordered_flat_set & kmers, - seqan::hibf::build::build_data const & data, - seqan::hibf::layout::layout::user_bin const & record) -{ - data.config.input_fn(record.idx, std::inserter(kmers, kmers.begin())); -} - -void update_parent_kmers(robin_hood::unordered_flat_set & parent_kmers, - robin_hood::unordered_flat_set const & kmers) -{ - parent_kmers.insert(kmers.begin(), kmers.end()); -} - -size_t compute_ibf_size(robin_hood::unordered_flat_set & parent_kmers, - robin_hood::unordered_flat_set & kmers, - size_t const number_of_bins, - seqan::hibf::layout::graph::node const & ibf_node, - seqan::hibf::build::build_data & data, - size_t current_hibf_level) -{ - size_t const kmers_per_bin{static_cast(std::ceil(static_cast(kmers.size()) / number_of_bins))}; - double const bin_bits{ - static_cast(seqan::hibf::build::bin_size_in_bits({.fpr = data.config.maximum_false_positive_rate, - .hash_count = data.config.number_of_hash_functions, - .elements = kmers_per_bin}))}; - seqan::hibf::bin_size const bin_size{ - static_cast(std::ceil(bin_bits * data.fpr_correction[number_of_bins]))}; - seqan::hibf::bin_count const bin_count{ibf_node.number_of_technical_bins}; - - size_t const bin_words = (bin_count.value + 63) >> 6; // = ceil(bins/64) - size_t const technical_bins = bin_words << 6; // = bin_words * 64 - - size_t const ibf_size = technical_bins * bin_size.value; - - if (current_hibf_level > 0 /* not top level */) - update_parent_kmers(parent_kmers, kmers); - - return ibf_size; -} - -size_t hierarchical_stats(stats & stats_data, - robin_hood::unordered_flat_set & parent_kmers, - seqan::hibf::layout::graph::node const & current_node, - seqan::hibf::build::build_data & data, - size_t current_hibf_level) -{ - size_t const ibf_pos{data.request_ibf_idx()}; - std::vector size_waste_per_tb(current_node.number_of_technical_bins); - - robin_hood::unordered_flat_set kmers{}; - - auto initialise_max_bin_kmers = [&]() -> size_t - { - if (current_node.favourite_child_idx.has_value()) // max bin is a merged bin - { - // recursively initialize favourite child first - hierarchical_stats(stats_data, - kmers, - current_node.children[current_node.favourite_child_idx.value()], - data, - current_hibf_level + 1); - return 1; - } - else // max bin is not a merged bin - { - // we assume that the max record is at the beginning of the list of remaining records. - auto const & record = current_node.remaining_records[0]; - compute_kmers(kmers, data, record); - - return record.number_of_technical_bins; - } - }; - - // initialize lower level IBF - size_t const max_bin_tbs = initialise_max_bin_kmers(); - size_t const ibf_size = compute_ibf_size(parent_kmers, kmers, max_bin_tbs, current_node, data, current_hibf_level); - size_t const amount_of_max_bin_kmers = kmers.size(); - kmers.clear(); // reduce memory peak - - // parse all other children (merged bins) of the current ibf - auto loop_over_children = [&]() - { - if (current_node.children.empty()) - return; - - std::vector children = current_node.children; // copy for threads - - size_t const number_of_mutex = (current_node.number_of_technical_bins + 63) / 64; - std::vector local_ibf_mutex(number_of_mutex); - - size_t number_of_threads{}; - std::vector indices(children.size()); - std::iota(indices.begin(), indices.end(), size_t{}); - - // We do not want to process the favourite child. It has already been processed prior. - // https://godbolt.org/z/6Yav7hrG1 - if (current_node.favourite_child_idx.has_value()) - std::erase(indices, current_node.favourite_child_idx.value()); - - if (current_hibf_level == 0) - { - // Shuffle indices: More likely to not block each other. Optimal: Interleave - std::shuffle(indices.begin(), indices.end(), std::mt19937_64{std::random_device{}()}); - number_of_threads = data.config.threads; - } - else - { - number_of_threads = 1u; - } - -#pragma omp parallel for schedule(dynamic, 1) num_threads(number_of_threads) - for (auto && index : indices) - { - auto & child = children[index]; - - robin_hood::unordered_flat_set kmers{}; - hierarchical_stats(stats_data, kmers, child, data, current_hibf_level + 1); - auto parent_bin_index = child.parent_bin_index; - - { - size_t const mutex_id{parent_bin_index / 64}; - std::lock_guard guard{local_ibf_mutex[mutex_id]}; - - if (current_hibf_level > 0 /* not top level */) - update_parent_kmers(parent_kmers, kmers); - - assert(amount_of_max_bin_kmers >= kmers.size()); - size_waste_per_tb[parent_bin_index] = - amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers.size()); - } - } - }; - - loop_over_children(); - - // If max bin was a merged bin, process all remaining records, otherwise the first one has already been processed - size_t const start{(current_node.favourite_child_idx.has_value()) ? 0u : 1u}; - for (size_t i = start; i < current_node.remaining_records.size(); ++i) - { - auto const & record = current_node.remaining_records[i]; - - compute_kmers(kmers, data, record); - - size_t const kmers_per_tb = kmers.size() / record.number_of_technical_bins; - size_t const diff = amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers_per_tb); - - if (diff > amount_of_max_bin_kmers) - throw std::runtime_error{"Wrong? diff:" + std::to_string(diff) - + " amount_of_max_bin_kmers:" + std::to_string(amount_of_max_bin_kmers)}; - - // size_t const kmers_per_tb_corrected = kmers_per_tb - (0.03 * kmers_per_tb); - // if (amount_of_max_bin_kmers < kmers_per_tb_corrected) - // { - // throw std::runtime_error{"amount_of_max_bin_kmers:" + std::to_string(amount_of_max_bin_kmers) - // + " was smaller than current size: " + std::to_string(kmers.size()) + "/" - // + std::to_string(record.number_of_technical_bins) + "=" - // + std::to_string(kmers.size() / record.number_of_technical_bins)}; - // } - - assert(size_waste_per_tb.size() >= record.storage_TB_id + record.number_of_technical_bins); - for (size_t i = record.storage_TB_id; i < record.storage_TB_id + record.number_of_technical_bins; ++i) - size_waste_per_tb[i] = diff; - - if (current_hibf_level > 0 /* not top level */) - update_parent_kmers(parent_kmers, kmers); - - kmers.clear(); - } - - stats_data.ibf_sizes[ibf_pos] = ibf_size; - stats_data.ibf_levels[ibf_pos] = current_hibf_level; - // compute load factor - size_t const waste = std::accumulate(size_waste_per_tb.begin(), size_waste_per_tb.end(), size_t{}); - size_t const total = amount_of_max_bin_kmers * size_waste_per_tb.size(); - - if (waste > total) - throw std::runtime_error{"Wrong? waste:" + std::to_string(waste) + " total:" + std::to_string(total)}; - - stats_data.ibf_load_factor[ibf_pos] = (total - waste) / static_cast(total); - - return ibf_pos; -} - -size_t hierarchical_stats(stats & stats_data, - seqan::hibf::layout::graph::node const & root_node, - seqan::hibf::build::build_data & data) -{ - robin_hood::unordered_flat_set root_kmers{}; - return hierarchical_stats(stats_data, root_kmers, root_node, data, 0); -} - -void execute_general_stats(config const & cfg) -{ - std::ifstream layout_file{cfg.input}; - - if (!layout_file.good() || !layout_file.is_open()) - throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; // GCOVR_EXCL_LINE - - auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); - - auto input_lambda = [&filenames, &chopper_config](size_t const user_bin_id, seqan::hibf::insert_iterator it) - { - std::vector current_kmers; - seqan::hibf::sketch::hyperloglog sketch; - - if (filenames[user_bin_id].size() > 1) - throw std::runtime_error{"No multi files accepted yet."}; - - process_file(filenames[user_bin_id][0], current_kmers, sketch, true, chopper_config.k); - - for (auto const kmer : current_kmers) - it = kmer; - }; - chopper_config.hibf_config.input_fn = input_lambda; - - auto const & hibf_config = chopper_config.hibf_config; - - std::filesystem::path const output_path = [&cfg]() - { - std::filesystem::path output_path{cfg.output_prefix}; - output_path += ".general_stats"; - return output_path; - }(); - std::ofstream output_stream{output_path}; - - if (!output_stream.good() || !output_stream.is_open()) - throw std::logic_error{"Could not open file " + output_path.string() + " for reading"}; // GCOVR_EXCL_LINE - - size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u; - - stats stats_data; - stats_data.ibf_sizes.resize(number_of_ibfs); - stats_data.ibf_levels.resize(number_of_ibfs); - stats_data.ibf_load_factor.resize(number_of_ibfs, 1.0); - - seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}}; - - seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root; - - size_t const t_max{root_node.number_of_technical_bins}; - data.fpr_correction = - seqan::hibf::layout::compute_fpr_correction({.fpr = hibf_config.maximum_false_positive_rate, - .hash_count = hibf_config.number_of_hash_functions, - .t_max = t_max}); - - hierarchical_stats(stats_data, root_node, data); - - // accumulate statistics per level - size_t const number_of_levels = std::ranges::max(stats_data.ibf_levels) + 1u; - std::vector size_per_level(number_of_levels, 0u); - std::vector load_factor_per_level(number_of_levels, 0.0); - std::vector num_ibfs_per_level(number_of_levels, 0u); - - for (size_t i = 0; i < stats_data.ibf_sizes.size(); ++i) - { - size_t const ibf_level{stats_data.ibf_levels[i]}; - size_per_level[ibf_level] += stats_data.ibf_sizes[i]; - load_factor_per_level[ibf_level] += stats_data.ibf_load_factor[i]; - ++num_ibfs_per_level[ibf_level]; - } - - for (size_t i = 0; i < number_of_levels; ++i) - load_factor_per_level[i] /= num_ibfs_per_level[i]; - - output_stream << "#Levels: " << number_of_levels << '\n'; - output_stream << "LEVEL-IDX\tSIZE-IN-BITS\tNUMBER-OF-IBFS\tLOAD-FACTOR-IN-%\n"; - for (size_t i = 0; i < number_of_levels; ++i) - output_stream << i << '\t' << size_per_level[i] << '\t' << num_ibfs_per_level[i] << '\t' - << load_factor_per_level[i] << '\n'; -} - -inline void set_up_parser(sharg::parser & parser, config & cfg) -{ - parser.info.version = "1.0.0"; - parser.info.author = "Svenja Mehringer"; - parser.info.email = "svenja.mehringer@fu-berlin.de"; - parser.info.short_description = "Compute a top-level HIBF layout figure file"; - - parser.info.description.emplace_back("Computes an table to display the top-level layout."); - - parser.add_subsection("Main options:"); - parser.add_option( - cfg.input, - sharg::config{.short_id = '\0', - .long_id = "input", - .description = "The input must be a layout file computed via chopper layout or raptor layout. ", - .required = true, - .validator = sharg::input_file_validator{}}); - parser.add_option(cfg.output_prefix, - sharg::config{.short_id = '\0', - .long_id = "output-prefix", - .description = "The output-prefix. Create .stats and .general_stats file. ", - .required = true, - .validator = sharg::output_file_validator{}}); -} - -int main(int argc, char const * argv[]) -{ - sharg::parser parser{"layout_stats", argc, argv, sharg::update_notifications::off}; - parser.info.version = "1.0.0"; - - config cfg{}; - set_up_parser(parser, cfg); - - try - { - parser.parse(); - } - catch (sharg::parser_error const & ext) - { - std::cerr << "[ERROR] " << ext.what() << '\n'; - return -1; - } - - // execute(cfg); - execute_general_stats(cfg); -} diff --git a/src/display_layout/CMakeLists.txt b/src/display_layout/CMakeLists.txt new file mode 100644 index 00000000..22dbab5f --- /dev/null +++ b/src/display_layout/CMakeLists.txt @@ -0,0 +1,11 @@ +# --------------------------------------------------------------------------------------------------- +# Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +# Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +# This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +# shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +# --------------------------------------------------------------------------------------------------- + +cmake_minimum_required (VERSION 3.18) + +add_executable (display_layout EXCLUDE_FROM_ALL display_layout.cpp general.cpp process_file.cpp sizes.cpp) +target_link_libraries (display_layout "chopper_interface") diff --git a/src/display_layout/display_layout.cpp b/src/display_layout/display_layout.cpp new file mode 100644 index 00000000..dfc210df --- /dev/null +++ b/src/display_layout/display_layout.cpp @@ -0,0 +1,96 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "shared.hpp" + +void init_shared_meta(sharg::parser & parser) +{ + parser.info.version = "1.0.0"; + parser.info.author = "Svenja Mehringer"; + parser.info.email = "svenja.mehringer@fu-berlin.de"; + parser.info.short_description = "Compute a top-level HIBF layout figure file"; + parser.info.description.emplace_back("Computes an table to display the top-level layout."); +} + +void init_options(sharg::parser & parser, config & cfg) +{ + parser.add_subsection("Main options:"); + parser.add_option( + cfg.input, + sharg::config{.short_id = '\0', + .long_id = "input", + .description = "The input must be a layout file computed via chopper layout or raptor layout. ", + .required = true, + .validator = sharg::input_file_validator{}}); + parser.add_option(cfg.output, + sharg::config{.short_id = '\0', + .long_id = "output", + .description = "The output. ", + .required = true, + .validator = sharg::output_file_validator{}}); + parser.add_option( + cfg.threads, + sharg::config{.short_id = '\0', .long_id = "threads", .description = "The number of threads to use."}); +} + +void parse(sharg::parser & parser) +{ + try + { + parser.parse(); + } + catch (sharg::parser_error const & ext) + { + std::cerr << "[ERROR] " << ext.what() << '\n'; + std::exit(EXIT_FAILURE); + } +} + +int main(int argc, char const * argv[]) +{ + config cfg{}; + + sharg::parser main_parser{"layout_stats", argc, argv, sharg::update_notifications::off, {"general", "sizes"}}; + init_shared_meta(main_parser); + + parse(main_parser); + sharg::parser & sub_parser = main_parser.get_sub_parser(); + + init_shared_meta(sub_parser); + init_options(sub_parser, cfg); + parse(sub_parser); + + if (sub_parser.info.app_name == std::string_view{"layout_stats-general"}) + execute_general(cfg); + else if (sub_parser.info.app_name == std::string_view{"layout_stats-sizes"}) + execute_sizes(cfg); + else + std::cerr << "[ERROR] Unknown subcommand\n"; +} diff --git a/src/display_layout/general.cpp b/src/display_layout/general.cpp new file mode 100644 index 00000000..42656cbd --- /dev/null +++ b/src/display_layout/general.cpp @@ -0,0 +1,225 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include + +#include + +#include +#include +#include + +#include +#include + +#include + +#include "shared.hpp" + +static void print_progress(size_t const percentage) +{ + assert(percentage <= 100u); + std::cerr << '['; + for (size_t i{}; i < percentage; ++i) + std::cerr << '='; + if (percentage < 100u) + { + std::cerr << '>'; + for (size_t i{1u}; i < 100u - percentage; ++i) + std::cerr << ' '; + } + std::cerr << "] " << percentage << " %\r" << std::flush; +} + +// Using two sets and erasing from shared is slower +void keep_duplicates(robin_hood::unordered_set & shared, std::vector const & current) +{ + // static + calling clear is slower + robin_hood::unordered_set result{}; + + for (uint64_t value : current) + { + if (shared.contains(value)) + result.emplace(value); + } + + shared = std::move(result); +} + +int execute(config const & cfg) +{ + std::ifstream layout_file{cfg.input}; + + if (!layout_file.good() || !layout_file.is_open()) + throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; + + auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); + auto const & hibf_config = chopper_config.hibf_config; + + std::ofstream output_stream{cfg.output}; + + if (!output_stream.good() || !output_stream.is_open()) + throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; + + // Fetch all file sizes such that sorting by file size doesn't have to access the filesystem too often. + // n = filenames.size() + // Constructing this vector has `n` filesystem accesses. + // Sorting without pre-fetching has `O(n * log(n))` accesses. + std::vector const filesizes{[&filenames]() + { + std::vector result{}; + result.reserve(filenames.size()); + for (auto const & filename : filenames) + result.push_back(std::filesystem::file_size(filename.front())); + return result; + }()}; + + // Sorts by the technical bin indices in the top-level IBF: + // split bins: storage_TB_id, previous_TB_indices is empty + // merged bins: previous_TB_indices[0] + // If the index is the same, sort by file sizes (happens for merged bins). + // Using the smallest file to initialise the shared k-mers later will be less work. + std::ranges::sort( + hibf_layout.user_bins, + [&filesizes](seqan::hibf::layout::layout::user_bin const & lhs, + seqan::hibf::layout::layout::user_bin const & rhs) + { + size_t const first_idx = lhs.previous_TB_indices.empty() ? lhs.storage_TB_id : lhs.previous_TB_indices[0]; + size_t const second_idx = rhs.previous_TB_indices.empty() ? rhs.storage_TB_id : rhs.previous_TB_indices[0]; + return first_idx < second_idx || (first_idx == second_idx && filesizes[lhs.idx] < filesizes[rhs.idx]); + }); + + seqan::hibf::sketch::hyperloglog sketch{hibf_config.sketch_bits}; + robin_hood::unordered_set shared_kmers{}; + // We can't use `shared_kmers.size() == 0` instead of `shared_kmers_initialised`, because keep_duplicates + // will result in a size of 0 when there are no shared k-mers. + bool shared_kmers_initialised{false}; + std::vector current_kmers{}; + size_t ub_count{}; // How many user bins are stored in the current technical bin? Always 1 for split bins. + size_t split_count{}; // Into how many techincal bins is the user bin split? Always 1 for merged bins. + + std::vector bin_kinds( + hibf_config.tmax, + chopper::layout::hibf_statistics::bin_kind::split); + + for (auto const & max_bin : hibf_layout.max_bins) + { + // max_bin.previous_TB_indices.size() == 1: true for merged bins, false for split bins + // max_bin.previous_TB_indices[0]: technical bin index on the top-level + if (max_bin.previous_TB_indices.size() == 1) + { + bin_kinds[max_bin.previous_TB_indices[0]] = chopper::layout::hibf_statistics::bin_kind::merged; + } + } + + size_t const total_ub{hibf_layout.user_bins.size()}; // For progress bar + size_t const ub_percentage{std::max(1u, total_ub / 100u)}; // For progress bar, 1 % of all user bins + + size_t current_idx{}; // The current top-level technical bin index + + // Stats file header + output_stream << "# Layout: " << cfg.input.c_str() << '\n' // + << "tb_index\t" + << "size\t" + << "shared_size\t" + << "ub_count\t" + << "kind\t" + << "splits" << '\n'; + + auto print_result_line = [&]() + { + bool const is_merged{bin_kinds[current_idx] == chopper::layout::hibf_statistics::bin_kind::merged}; + size_t const avg_kmer_count = (sketch.estimate() + split_count - 1u) / split_count; + + for (size_t i{}, total{split_count}; i < total; ++i) + { + output_stream << current_idx + i << '\t' // + << avg_kmer_count << '\t' // + << shared_kmers.size() << '\t' // + << ub_count << '\t' // + << (is_merged ? "merged" : "split") << '\t' // + << split_count << '\n'; + split_count = 0u; // Subsequent split bins display 0, the first split bin displays the actual split count. + } + }; + + // Iterate over all user bins. They are sorted by their technical bin index in the top-level. + // Hence, we can process one top-level technical bin, print the results, clear our stored data and continue with + // the next. + for (size_t ub_index{}; ub_index < hibf_layout.user_bins.size(); ++ub_index) + { + if (ub_index % ub_percentage == 0) + print_progress(ub_index / ub_percentage); + + auto const & user_bin = hibf_layout.user_bins[ub_index]; + current_kmers.clear(); + + // The top-level technical bin index for the current user bin. + // user_bin.previous_TB_indices.size() == 0: true for split bins, false for merged bins + // user_bin.storage_TB_id: technical bin index on the lowest level + // user_bin.previous_TB_indices[0]: technical bin index on the top-level + size_t const idx = + (user_bin.previous_TB_indices.size() == 0) ? user_bin.storage_TB_id : user_bin.previous_TB_indices[0]; + + // We processed all user bins that belong to the `current_idx`th top-level technical bin. + // Print results and update data. + if (idx != current_idx) + { + print_result_line(); + sketch.clear(); + shared_kmers.clear(); + shared_kmers_initialised = false; + ub_count = 0u; + split_count = 0u; + current_idx = idx; + } + + bool const is_merged = bin_kinds[idx] == chopper::layout::hibf_statistics::bin_kind::merged; + // For user bins in a merged bin, `user_bin.number_of_technical_bins` is the number of technical bins on + // the lowest level. A user bin could be part of a merged bin on the top-level, but still be split into + // `user_bin.number_of_technical_bins` many bins on the lowest level. + split_count = is_merged ? 1u : user_bin.number_of_technical_bins; + + // We don't need to keep the current_kmers if there are no shared k-mers to merge them with. + bool const fill_current_kmers = is_merged && !(shared_kmers_initialised && shared_kmers.empty()); + + for (auto const & filename : filenames[user_bin.idx]) + { + ++ub_count; // This assumes that each user bin has exactly one associated file. Currently the case. + + process_file(filename, current_kmers, sketch, fill_current_kmers, chopper_config.k); + } + + // Compute set intersection: shared_kmers = shared_kmers ∩ current_kmers + // This happens for each user bin that belongs to a merged bin. + if (fill_current_kmers) + { + if (!shared_kmers_initialised) + { + shared_kmers_initialised = true; + shared_kmers.insert(current_kmers.begin(), current_kmers.end()); + } + else + { + keep_duplicates(shared_kmers, current_kmers); + } + } + } + + // Print results for the last top-level technical bin. + print_result_line(); + + // The progress bar uses a carriage return '\r' to only use a single line. + std::cerr << '\n'; + + return 0; +} + +void execute_general(config const & cfg) +{ + execute(cfg); +} diff --git a/src/display_layout/process_file.cpp b/src/display_layout/process_file.cpp new file mode 100644 index 00000000..61291bd5 --- /dev/null +++ b/src/display_layout/process_file.cpp @@ -0,0 +1,84 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include + +#include + +#include "shared.hpp" + +struct dna4_traits : public seqan3::sequence_file_input_default_traits_dna +{ + using sequence_alphabet = seqan3::dna4; +}; + +using sequence_file_type = seqan3::sequence_file_input, + seqan3::type_list>; + +void process_file(std::string const & filename, + std::vector & current_kmers, + seqan::hibf::sketch::hyperloglog & sketch, + bool const fill_current_kmers, + uint8_t const kmer_size) +{ + if (filename.ends_with(".minimiser")) + { + uint64_t hash{}; + char * const hash_data{reinterpret_cast(&hash)}; + std::streamsize const hash_bytes{sizeof(hash)}; + + std::ifstream infile{filename, std::ios::binary}; + + if (fill_current_kmers) + { + while (infile.read(hash_data, hash_bytes)) + { + current_kmers.push_back(hash); + sketch.add(hash_data, hash_bytes); + } + } + else + { + while (infile.read(hash_data, hash_bytes)) + { + sketch.add(hash_data, hash_bytes); + } + } + } + else + { + sequence_file_type fin{filename}; + + seqan3::shape shape{seqan3::ungapped{kmer_size}}; + auto minimizer_view = seqan3::views::minimiser_hash(shape, + seqan3::window_size{kmer_size}, + seqan3::seed{chopper::adjust_seed(shape.count())}); + if (fill_current_kmers) + { + for (auto && [seq] : fin) + { + for (uint64_t hash_value : seq | minimizer_view) + { + current_kmers.push_back(hash_value); + sketch.add(reinterpret_cast(&hash_value), sizeof(hash_value)); + } + } + } + else + { + for (auto && [seq] : fin) + { + for (uint64_t hash_value : seq | minimizer_view) + { + sketch.add(reinterpret_cast(&hash_value), sizeof(hash_value)); + } + } + } + } +} diff --git a/src/display_layout/shared.hpp b/src/display_layout/shared.hpp new file mode 100644 index 00000000..55122c1d --- /dev/null +++ b/src/display_layout/shared.hpp @@ -0,0 +1,31 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#pragma once + +#include +#include +#include +#include + +#include + +struct config +{ + std::filesystem::path input{}; + std::filesystem::path output{}; + uint8_t threads{1u}; +}; + +void execute_general(config const & cfg); +void execute_sizes(config const & cfg); + +void process_file(std::string const & filename, + std::vector & current_kmers, + seqan::hibf::sketch::hyperloglog & sketch, + bool const fill_current_kmers, + uint8_t const kmer_size); diff --git a/src/display_layout/sizes.cpp b/src/display_layout/sizes.cpp new file mode 100644 index 00000000..5a0a734e --- /dev/null +++ b/src/display_layout/sizes.cpp @@ -0,0 +1,276 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "shared.hpp" + +struct stats +{ + std::vector ibf_sizes{}; + std::vector ibf_levels{}; + std::vector ibf_load_factor{}; +}; + +void compute_kmers(robin_hood::unordered_flat_set & kmers, + seqan::hibf::build::build_data const & data, + seqan::hibf::layout::layout::user_bin const & record) +{ + data.config.input_fn(record.idx, std::inserter(kmers, kmers.begin())); +} + +void update_parent_kmers(robin_hood::unordered_flat_set & parent_kmers, + robin_hood::unordered_flat_set const & kmers) +{ + parent_kmers.insert(kmers.begin(), kmers.end()); +} + +size_t compute_ibf_size(robin_hood::unordered_flat_set & parent_kmers, + robin_hood::unordered_flat_set & kmers, + size_t const number_of_bins, + seqan::hibf::layout::graph::node const & ibf_node, + seqan::hibf::build::build_data & data, + size_t const current_hibf_level) +{ + size_t const kmers_per_bin = std::ceil(static_cast(kmers.size()) / number_of_bins); + size_t const bin_size = + std::ceil(seqan::hibf::build::bin_size_in_bits({.fpr = data.config.maximum_false_positive_rate, + .hash_count = data.config.number_of_hash_functions, + .elements = kmers_per_bin}) + * data.fpr_correction[number_of_bins]); + + size_t const ibf_size = ibf_node.number_of_technical_bins * bin_size; + + if (current_hibf_level > 0 /* not top level */) + update_parent_kmers(parent_kmers, kmers); + + return ibf_size; +} + +size_t hierarchical_stats(stats & stats_data, + robin_hood::unordered_flat_set & parent_kmers, + seqan::hibf::layout::graph::node const & current_node, + seqan::hibf::build::build_data & data, + size_t const current_hibf_level) +{ + size_t const ibf_pos{data.request_ibf_idx()}; + std::vector size_waste_per_tb(current_node.number_of_technical_bins); + + robin_hood::unordered_flat_set kmers{}; + + auto initialise_max_bin_kmers = [&]() -> size_t + { + if (current_node.favourite_child_idx.has_value()) // max bin is a merged bin + { + // recursively initialize favourite child first + hierarchical_stats(stats_data, + kmers, + current_node.children[current_node.favourite_child_idx.value()], + data, + current_hibf_level + 1); + return 1; + } + else // max bin is not a merged bin + { + // we assume that the max record is at the beginning of the list of remaining records. + auto const & record = current_node.remaining_records[0]; + compute_kmers(kmers, data, record); + + return record.number_of_technical_bins; + } + }; + + // initialize lower level IBF + size_t const max_bin_tbs = initialise_max_bin_kmers(); + size_t const ibf_size = compute_ibf_size(parent_kmers, kmers, max_bin_tbs, current_node, data, current_hibf_level); + size_t const amount_of_max_bin_kmers = kmers.size(); + kmers.clear(); // reduce memory peak + + // parse all other children (merged bins) of the current ibf + if (!current_node.children.empty()) + { + std::mutex merge_kmer_mutex{}; + + size_t number_of_threads{}; + std::vector indices(current_node.children.size()); + std::iota(indices.begin(), indices.end(), size_t{}); + + // We do not want to process the favourite child. It has already been processed prior. + // https://godbolt.org/z/6Yav7hrG1 + if (current_node.favourite_child_idx.has_value()) + std::erase(indices, current_node.favourite_child_idx.value()); + + if (current_hibf_level == 0) + { + // Shuffle indices: More likely to not block each other. Optimal: Interleave + std::shuffle(indices.begin(), indices.end(), std::mt19937_64{std::random_device{}()}); + number_of_threads = data.config.threads; + } + else + { + number_of_threads = 1u; + } + +#pragma omp parallel for schedule(dynamic, 1) num_threads(number_of_threads) + for (auto && index : indices) + { + auto & child = current_node.children[index]; + + robin_hood::unordered_flat_set kmers2{}; + hierarchical_stats(stats_data, kmers2, child, data, current_hibf_level + 1u); + + if (current_hibf_level > 0 /* not top level */) + { + std::lock_guard guard{merge_kmer_mutex}; + update_parent_kmers(parent_kmers, kmers2); + } + + size_waste_per_tb[child.parent_bin_index] = + amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers2.size()); + } + } + + // If max bin was a merged bin, process all remaining records, otherwise the first one has already been processed + size_t const start{(current_node.favourite_child_idx.has_value()) ? 0u : 1u}; + for (size_t i = start; i < current_node.remaining_records.size(); ++i) + { + auto const & record = current_node.remaining_records[i]; + + compute_kmers(kmers, data, record); + + size_t const kmers_per_tb = kmers.size() / record.number_of_technical_bins + 1u; + size_t const diff = amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers_per_tb); + + assert(size_waste_per_tb.size() >= record.storage_TB_id + record.number_of_technical_bins); + for (size_t i = record.storage_TB_id; i < record.storage_TB_id + record.number_of_technical_bins; ++i) + size_waste_per_tb[i] = diff; + + if (current_hibf_level > 0 /* not top level */) + update_parent_kmers(parent_kmers, kmers); + + kmers.clear(); + } + + stats_data.ibf_sizes[ibf_pos] = ibf_size; + stats_data.ibf_levels[ibf_pos] = current_hibf_level; + + // compute load factor + size_t const waste = std::accumulate(size_waste_per_tb.begin(), size_waste_per_tb.end(), size_t{}); + size_t const total = amount_of_max_bin_kmers * size_waste_per_tb.size(); + + assert(waste <= total); + + stats_data.ibf_load_factor[ibf_pos] = (total - waste) / static_cast(total); + + return ibf_pos; +} + +size_t hierarchical_stats(stats & stats_data, + seqan::hibf::layout::graph::node const & root_node, + seqan::hibf::build::build_data & data) +{ + robin_hood::unordered_flat_set root_kmers{}; + return hierarchical_stats(stats_data, root_kmers, root_node, data, 0); +} + +void execute_general_stats(config const & cfg) +{ + std::ifstream layout_file{cfg.input}; + + if (!layout_file.good() || !layout_file.is_open()) + throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; + + auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); + chopper_config.hibf_config.threads = cfg.threads; + + auto input_lambda = [&filenames, &chopper_config](size_t const user_bin_id, seqan::hibf::insert_iterator it) + { + std::vector current_kmers; + seqan::hibf::sketch::hyperloglog sketch; + + if (filenames[user_bin_id].size() > 1) + throw std::runtime_error{"No multi files accepted yet."}; + + process_file(filenames[user_bin_id][0], current_kmers, sketch, true, chopper_config.k); + + for (auto const kmer : current_kmers) + it = kmer; + }; + chopper_config.hibf_config.input_fn = input_lambda; + + auto const & hibf_config = chopper_config.hibf_config; + + std::ofstream output_stream{cfg.output}; + + if (!output_stream.good() || !output_stream.is_open()) + throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; + + size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u; + + stats stats_data; + stats_data.ibf_sizes.resize(number_of_ibfs); + stats_data.ibf_levels.resize(number_of_ibfs); + stats_data.ibf_load_factor.resize(number_of_ibfs); + + seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}}; + + seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root; + + size_t const t_max{root_node.number_of_technical_bins}; + data.fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = hibf_config.maximum_false_positive_rate, + .hash_count = hibf_config.number_of_hash_functions, + .t_max = t_max}); + + hierarchical_stats(stats_data, root_node, data); + + // accumulate statistics per level + size_t const number_of_levels = std::ranges::max(stats_data.ibf_levels) + 1u; + std::vector size_per_level(number_of_levels); + std::vector load_factor_per_level(number_of_levels); + std::vector num_ibfs_per_level(number_of_levels); + + for (size_t i = 0; i < number_of_ibfs; ++i) + { + size_t const ibf_level{stats_data.ibf_levels[i]}; + size_per_level[ibf_level] += stats_data.ibf_sizes[i]; + load_factor_per_level[ibf_level] += stats_data.ibf_load_factor[i]; + ++num_ibfs_per_level[ibf_level]; + } + + for (size_t i = 0; i < number_of_levels; ++i) + load_factor_per_level[i] /= num_ibfs_per_level[i]; + + output_stream << std::fixed << std::setprecision(2); + output_stream << "# Levels: " << number_of_levels << '\n'; + output_stream << "LEVEL\tBIT_SIZE\tIBFS\tLOAD_FACTOR\n"; + for (size_t i = 0; i < number_of_levels; ++i) + output_stream << i << '\t' << size_per_level[i] << '\t' << num_ibfs_per_level[i] << '\t' + << load_factor_per_level[i] * 100 << '\n'; +} + +void execute_sizes(config const & cfg) +{ + execute_general_stats(cfg); +} From cf9bd9b79296d06afa4b08900b719f1a49b4e3ac Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Mon, 9 Oct 2023 14:06:25 +0200 Subject: [PATCH 4/4] [FEATURE] Also count TB that are bigger than max bin. --- src/display_layout/sizes.cpp | 203 ++++++++++++++++++++++++----------- 1 file changed, 139 insertions(+), 64 deletions(-) diff --git a/src/display_layout/sizes.cpp b/src/display_layout/sizes.cpp index 5a0a734e..a0cee4f1 100644 --- a/src/display_layout/sizes.cpp +++ b/src/display_layout/sizes.cpp @@ -5,6 +5,7 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include #include #include @@ -26,11 +27,85 @@ #include "shared.hpp" -struct stats +struct ibf_stats { - std::vector ibf_sizes{}; - std::vector ibf_levels{}; - std::vector ibf_load_factor{}; + size_t size_in_bits{}; + size_t level{}; + size_t max_elements{}; + size_t tbs_too_big{}; + size_t tbs_too_many_elements{}; + double load_factor{}; +}; + +struct per_level_stats +{ + size_t number_of_levels{}; + std::vector size_in_bits{}; + std::vector max_elements{}; + std::vector tbs_too_big{}; + std::vector tbs_too_many_elements{}; + std::vector num_ibfs{}; + std::vector load_factor{}; + + explicit per_level_stats(std::vector const & stats) + { + number_of_levels = std::ranges::max(stats, + [](ibf_stats const & lhs, ibf_stats const & rhs) + { + return lhs.level < rhs.level; + }) + .level + + 1u; + + size_in_bits.resize(number_of_levels); + max_elements.resize(number_of_levels); + tbs_too_big.resize(number_of_levels); + tbs_too_many_elements.resize(number_of_levels); + num_ibfs.resize(number_of_levels); + load_factor.resize(number_of_levels); + + for (size_t i = 0; i < stats.size(); ++i) + { + auto const & stat = stats[i]; + size_in_bits[stat.level] += stat.size_in_bits; + max_elements[stat.level] += stat.max_elements; + tbs_too_big[stat.level] += stat.tbs_too_big; + tbs_too_many_elements[stat.level] += stat.tbs_too_many_elements; + load_factor[stat.level] += stat.load_factor; + ++num_ibfs[stat.level]; + } + + for (size_t i = 0; i < number_of_levels; ++i) + { + load_factor[i] /= num_ibfs[i]; + max_elements[i] /= num_ibfs[i]; + tbs_too_many_elements[i] /= num_ibfs[i]; + } + } + + void print(std::ostream & stream) const + { + stream << std::fixed << std::setprecision(2); + stream << "# Levels: " << number_of_levels << '\n'; + stream << "LEVEL\t" // + << "BIT_SIZE\t" // + << "IBFS\t" // + << "AVG_LOAD_FACTOR\t" // + << "TBS_TOO_BIG\t" // + << "AVG_TBS_TOO_BIG_ELEMENTS\t" // + << "AVG_MAX_ELEMENTS\n"; + + for (size_t idx = 0; idx < number_of_levels; ++idx) + { + stream << idx << '\t' // + << size_in_bits[idx] << '\t' // + << num_ibfs[idx] << '\t' // + << load_factor[idx] * 100 << '\t' // + << tbs_too_big[idx] << '\t' // + << tbs_too_many_elements[idx] << '\t' // + << max_elements[idx] << '\n'; + } + } }; void compute_kmers(robin_hood::unordered_flat_set & kmers, @@ -68,7 +143,7 @@ size_t compute_ibf_size(robin_hood::unordered_flat_set & parent_kmers, return ibf_size; } -size_t hierarchical_stats(stats & stats_data, +size_t hierarchical_stats(std::vector & stats, robin_hood::unordered_flat_set & parent_kmers, seqan::hibf::layout::graph::node const & current_node, seqan::hibf::build::build_data & data, @@ -84,7 +159,7 @@ size_t hierarchical_stats(stats & stats_data, if (current_node.favourite_child_idx.has_value()) // max bin is a merged bin { // recursively initialize favourite child first - hierarchical_stats(stats_data, + hierarchical_stats(stats, kmers, current_node.children[current_node.favourite_child_idx.value()], data, @@ -105,6 +180,8 @@ size_t hierarchical_stats(stats & stats_data, size_t const max_bin_tbs = initialise_max_bin_kmers(); size_t const ibf_size = compute_ibf_size(parent_kmers, kmers, max_bin_tbs, current_node, data, current_hibf_level); size_t const amount_of_max_bin_kmers = kmers.size(); + std::atomic tbs_too_big{}; + std::atomic tbs_too_many_elements{}; kmers.clear(); // reduce memory peak // parse all other children (merged bins) of the current ibf @@ -137,17 +214,24 @@ size_t hierarchical_stats(stats & stats_data, { auto & child = current_node.children[index]; - robin_hood::unordered_flat_set kmers2{}; - hierarchical_stats(stats_data, kmers2, child, data, current_hibf_level + 1u); + robin_hood::unordered_flat_set local_kmers{}; + hierarchical_stats(stats, local_kmers, child, data, current_hibf_level + 1u); if (current_hibf_level > 0 /* not top level */) { std::lock_guard guard{merge_kmer_mutex}; - update_parent_kmers(parent_kmers, kmers2); + update_parent_kmers(parent_kmers, local_kmers); } - size_waste_per_tb[child.parent_bin_index] = - amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers2.size()); + if (amount_of_max_bin_kmers < local_kmers.size()) + { + ++tbs_too_big; + tbs_too_many_elements += local_kmers.size() - amount_of_max_bin_kmers; + } + else + { + size_waste_per_tb[child.parent_bin_index] = amount_of_max_bin_kmers - local_kmers.size(); + } } } @@ -160,11 +244,18 @@ size_t hierarchical_stats(stats & stats_data, compute_kmers(kmers, data, record); size_t const kmers_per_tb = kmers.size() / record.number_of_technical_bins + 1u; - size_t const diff = amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers_per_tb); - assert(size_waste_per_tb.size() >= record.storage_TB_id + record.number_of_technical_bins); - for (size_t i = record.storage_TB_id; i < record.storage_TB_id + record.number_of_technical_bins; ++i) - size_waste_per_tb[i] = diff; + if (amount_of_max_bin_kmers < kmers_per_tb) + { + tbs_too_big += record.number_of_technical_bins; + tbs_too_many_elements += (kmers_per_tb - amount_of_max_bin_kmers) * record.number_of_technical_bins; + } + else + { + assert(size_waste_per_tb.size() >= record.storage_TB_id + record.number_of_technical_bins); + for (size_t i = record.storage_TB_id; i < record.storage_TB_id + record.number_of_technical_bins; ++i) + size_waste_per_tb[i] = amount_of_max_bin_kmers - kmers_per_tb; + } if (current_hibf_level > 0 /* not top level */) update_parent_kmers(parent_kmers, kmers); @@ -172,38 +263,45 @@ size_t hierarchical_stats(stats & stats_data, kmers.clear(); } - stats_data.ibf_sizes[ibf_pos] = ibf_size; - stats_data.ibf_levels[ibf_pos] = current_hibf_level; - - // compute load factor - size_t const waste = std::accumulate(size_waste_per_tb.begin(), size_waste_per_tb.end(), size_t{}); - size_t const total = amount_of_max_bin_kmers * size_waste_per_tb.size(); - - assert(waste <= total); - - stats_data.ibf_load_factor[ibf_pos] = (total - waste) / static_cast(total); + double const load_factor = [&]() + { + size_t const waste = std::accumulate(size_waste_per_tb.begin(), size_waste_per_tb.end(), size_t{}); + size_t const total = amount_of_max_bin_kmers * current_node.number_of_technical_bins; + assert(waste <= total); + assert(total > 0u); + return (total - waste) / static_cast(total); + }(); + + auto & current_stats = stats[ibf_pos]; + current_stats.size_in_bits = ibf_size; + current_stats.level = current_hibf_level; + current_stats.max_elements = amount_of_max_bin_kmers; + current_stats.tbs_too_big = tbs_too_big.load(); + current_stats.tbs_too_many_elements = tbs_too_many_elements.load(); + current_stats.load_factor = load_factor; return ibf_pos; } -size_t hierarchical_stats(stats & stats_data, +size_t hierarchical_stats(std::vector & stats, seqan::hibf::layout::graph::node const & root_node, seqan::hibf::build::build_data & data) { robin_hood::unordered_flat_set root_kmers{}; - return hierarchical_stats(stats_data, root_kmers, root_node, data, 0); + return hierarchical_stats(stats, root_kmers, root_node, data, 0); } void execute_general_stats(config const & cfg) { + // Read config and layout std::ifstream layout_file{cfg.input}; - if (!layout_file.good() || !layout_file.is_open()) throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); - chopper_config.hibf_config.threads = cfg.threads; + // Prepare configs + chopper_config.hibf_config.threads = cfg.threads; auto input_lambda = [&filenames, &chopper_config](size_t const user_bin_id, seqan::hibf::insert_iterator it) { std::vector current_kmers; @@ -218,56 +316,33 @@ void execute_general_stats(config const & cfg) it = kmer; }; chopper_config.hibf_config.input_fn = input_lambda; - auto const & hibf_config = chopper_config.hibf_config; - std::ofstream output_stream{cfg.output}; - - if (!output_stream.good() || !output_stream.is_open()) - throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; - + // Prepare stats size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u; + std::vector stats(number_of_ibfs); - stats stats_data; - stats_data.ibf_sizes.resize(number_of_ibfs); - stats_data.ibf_levels.resize(number_of_ibfs); - stats_data.ibf_load_factor.resize(number_of_ibfs); - + // Prepare data seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}}; - seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root; - size_t const t_max{root_node.number_of_technical_bins}; data.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = hibf_config.maximum_false_positive_rate, .hash_count = hibf_config.number_of_hash_functions, .t_max = t_max}); - hierarchical_stats(stats_data, root_node, data); + // Get stats + hierarchical_stats(stats, root_node, data); - // accumulate statistics per level - size_t const number_of_levels = std::ranges::max(stats_data.ibf_levels) + 1u; - std::vector size_per_level(number_of_levels); - std::vector load_factor_per_level(number_of_levels); - std::vector num_ibfs_per_level(number_of_levels); + // Get stats per level + per_level_stats const level_stats{stats}; - for (size_t i = 0; i < number_of_ibfs; ++i) - { - size_t const ibf_level{stats_data.ibf_levels[i]}; - size_per_level[ibf_level] += stats_data.ibf_sizes[i]; - load_factor_per_level[ibf_level] += stats_data.ibf_load_factor[i]; - ++num_ibfs_per_level[ibf_level]; - } - - for (size_t i = 0; i < number_of_levels; ++i) - load_factor_per_level[i] /= num_ibfs_per_level[i]; + // Output + std::ofstream output_stream{cfg.output}; + if (!output_stream.good() || !output_stream.is_open()) + throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; - output_stream << std::fixed << std::setprecision(2); - output_stream << "# Levels: " << number_of_levels << '\n'; - output_stream << "LEVEL\tBIT_SIZE\tIBFS\tLOAD_FACTOR\n"; - for (size_t i = 0; i < number_of_levels; ++i) - output_stream << i << '\t' << size_per_level[i] << '\t' << num_ibfs_per_level[i] << '\t' - << load_factor_per_level[i] * 100 << '\n'; + level_stats.print(output_stream); } void execute_sizes(config const & cfg)