Also count TBs that are bigger than the max bin
eseiler committed Oct 9, 2023
1 parent 2c3858d commit e5e2adb
Showing 1 changed file with 139 additions and 64 deletions.
203 changes: 139 additions & 64 deletions src/display_layout/sizes.cpp
@@ -5,6 +5,7 @@
// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md
// ---------------------------------------------------------------------------------------------------

#include <atomic>
#include <cassert>
#include <iostream>

@@ -26,11 +27,85 @@

#include "shared.hpp"

struct stats
struct ibf_stats
{
std::vector<size_t> ibf_sizes{};
std::vector<size_t> ibf_levels{};
std::vector<double> ibf_load_factor{};
size_t size_in_bits{};
size_t level{};
size_t max_elements{};
size_t tbs_too_big{};
size_t tbs_too_many_elements{};
double load_factor{};
};

struct per_level_stats
{
size_t number_of_levels{};
std::vector<size_t> size_in_bits{};
std::vector<size_t> max_elements{};
std::vector<size_t> tbs_too_big{};
std::vector<size_t> tbs_too_many_elements{};
std::vector<size_t> num_ibfs{};
std::vector<double> load_factor{};

explicit per_level_stats(std::vector<ibf_stats> const & stats)
{
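// std::ranges::max with a custom comparator yields the ibf_stats entry with
// the deepest level; that level index + 1 is the number of levels.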
number_of_levels = std::ranges::max(stats,
[](ibf_stats const & lhs, ibf_stats const & rhs)
{
return lhs.level < rhs.level;
})
.level
+ 1u;

size_in_bits.resize(number_of_levels);
max_elements.resize(number_of_levels);
tbs_too_big.resize(number_of_levels);
tbs_too_many_elements.resize(number_of_levels);
num_ibfs.resize(number_of_levels);
load_factor.resize(number_of_levels);

for (size_t i = 0; i < stats.size(); ++i)
{
auto const & stat = stats[i];
size_in_bits[stat.level] += stat.size_in_bits;
max_elements[stat.level] += stat.max_elements;
tbs_too_big[stat.level] += stat.tbs_too_big;
tbs_too_many_elements[stat.level] += stat.tbs_too_many_elements;
load_factor[stat.level] += stat.load_factor;
++num_ibfs[stat.level];
}

for (size_t i = 0; i < number_of_levels; ++i)
{
load_factor[i] /= num_ibfs[i];
max_elements[i] /= num_ibfs[i];
tbs_too_many_elements[i] /= num_ibfs[i];
}
}

void print(std::ostream & stream) const
{
stream << std::fixed << std::setprecision(2);
stream << "# Levels: " << number_of_levels << '\n';
stream << "LEVEL\t" //
<< "BIT_SIZE\t" //
<< "IBFS\t" //
<< "AVG_LOAD_FACTOR\t" //
<< "TBS_TOO_BIG\t" //
<< "AVG_TBS_TOO_BIG_ELEMENTS\t" //
<< "AVG_MAX_ELEMENTS\n";

for (size_t idx = 0; idx < number_of_levels; ++idx)
{
stream << idx << '\t' //
<< size_in_bits[idx] << '\t' //
<< num_ibfs[idx] << '\t' //
<< load_factor[idx] * 100 << '\t' //
<< tbs_too_big[idx] << '\t' //
<< tbs_too_many_elements[idx] << '\t' //
<< max_elements[idx] << '\n';
}
}
};
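A minimal usage sketch for the two structs above (not part of the commit; all values invented, and the includes of sizes.cpp are assumed): fill a flat vector of per-IBF records, let per_level_stats group them by level, and print the table.

#include <iostream>
#include <vector>

int main()
{
    std::vector<ibf_stats> stats(3);
    stats[0] = ibf_stats{.size_in_bits = 8192u, .level = 0u, .max_elements = 100u, .tbs_too_big = 0u, .tbs_too_many_elements = 0u, .load_factor = 0.90};
    stats[1] = ibf_stats{.size_in_bits = 4096u, .level = 1u, .max_elements = 40u, .tbs_too_big = 1u, .tbs_too_many_elements = 12u, .load_factor = 0.75};
    stats[2] = ibf_stats{.size_in_bits = 2048u, .level = 1u, .max_elements = 20u, .tbs_too_big = 0u, .tbs_too_many_elements = 0u, .load_factor = 0.65};

    per_level_stats const level_stats{stats}; // sizes and counters are summed per level,
    level_stats.print(std::cout);             // load factors and element counts averaged
}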

void compute_kmers(robin_hood::unordered_flat_set<uint64_t> & kmers,
Expand Down Expand Up @@ -68,7 +143,7 @@ size_t compute_ibf_size(robin_hood::unordered_flat_set<uint64_t> & parent_kmers,
return ibf_size;
}

size_t hierarchical_stats(stats & stats_data,
size_t hierarchical_stats(std::vector<ibf_stats> & stats,
robin_hood::unordered_flat_set<uint64_t> & parent_kmers,
seqan::hibf::layout::graph::node const & current_node,
seqan::hibf::build::build_data & data,
@@ -84,7 +159,7 @@ size_t hierarchical_stats(stats & stats_data,
if (current_node.favourite_child_idx.has_value()) // max bin is a merged bin
{
// recursively initialize favourite child first
hierarchical_stats(stats_data,
hierarchical_stats(stats,
kmers,
current_node.children[current_node.favourite_child_idx.value()],
data,
@@ -105,6 +180,8 @@ size_t hierarchical_stats(stats & stats_data,
size_t const max_bin_tbs = initialise_max_bin_kmers();
size_t const ibf_size = compute_ibf_size(parent_kmers, kmers, max_bin_tbs, current_node, data, current_hibf_level);
size_t const amount_of_max_bin_kmers = kmers.size();
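// The counters are atomic because the loop over this node's children below may
// run multi-threaded (the same reason parent_kmers is guarded by merge_kmer_mutex).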
std::atomic<uint64_t> tbs_too_big{};
std::atomic<uint64_t> tbs_too_many_elements{};
kmers.clear(); // reduce memory peak

// parse all other children (merged bins) of the current ibf
@@ -137,17 +214,24 @@
{
auto & child = current_node.children[index];

robin_hood::unordered_flat_set<uint64_t> kmers2{};
hierarchical_stats(stats_data, kmers2, child, data, current_hibf_level + 1u);
robin_hood::unordered_flat_set<uint64_t> local_kmers{};
hierarchical_stats(stats, local_kmers, child, data, current_hibf_level + 1u);

if (current_hibf_level > 0 /* not top level */)
{
std::lock_guard<std::mutex> guard{merge_kmer_mutex};
update_parent_kmers(parent_kmers, kmers2);
update_parent_kmers(parent_kmers, local_kmers);
}

size_waste_per_tb[child.parent_bin_index] =
amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers2.size());
if (amount_of_max_bin_kmers < local_kmers.size())
{
++tbs_too_big;
tbs_too_many_elements += local_kmers.size() - amount_of_max_bin_kmers;
}
else
{
size_waste_per_tb[child.parent_bin_index] = amount_of_max_bin_kmers - local_kmers.size();
}
}
}

@@ -160,50 +244,64 @@ size_t hierarchical_stats(stats & stats_data,
compute_kmers(kmers, data, record);

size_t const kmers_per_tb = kmers.size() / record.number_of_technical_bins + 1u;
size_t const diff = amount_of_max_bin_kmers - std::min(amount_of_max_bin_kmers, kmers_per_tb);

assert(size_waste_per_tb.size() >= record.storage_TB_id + record.number_of_technical_bins);
for (size_t i = record.storage_TB_id; i < record.storage_TB_id + record.number_of_technical_bins; ++i)
size_waste_per_tb[i] = diff;
if (amount_of_max_bin_kmers < kmers_per_tb)
{
tbs_too_big += record.number_of_technical_bins;
tbs_too_many_elements += (kmers_per_tb - amount_of_max_bin_kmers) * record.number_of_technical_bins;
}
else
{
assert(size_waste_per_tb.size() >= record.storage_TB_id + record.number_of_technical_bins);
for (size_t i = record.storage_TB_id; i < record.storage_TB_id + record.number_of_technical_bins; ++i)
size_waste_per_tb[i] = amount_of_max_bin_kmers - kmers_per_tb;
}
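// Worked example (invented numbers): a user bin of 1000 k-mers split across
// 4 technical bins yields kmers_per_tb == 1000 / 4 + 1 == 251. If the max bin
// holds only 200 k-mers, all 4 TBs count as too big and tbs_too_many_elements
// grows by (251 - 200) * 4 == 204; otherwise each TB's shortfall is booked as waste.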

if (current_hibf_level > 0 /* not top level */)
update_parent_kmers(parent_kmers, kmers);

kmers.clear();
}

stats_data.ibf_sizes[ibf_pos] = ibf_size;
stats_data.ibf_levels[ibf_pos] = current_hibf_level;

// compute load factor
size_t const waste = std::accumulate(size_waste_per_tb.begin(), size_waste_per_tb.end(), size_t{});
size_t const total = amount_of_max_bin_kmers * size_waste_per_tb.size();

assert(waste <= total);

stats_data.ibf_load_factor[ibf_pos] = (total - waste) / static_cast<double>(total);
double const load_factor = [&]()
{
size_t const waste = std::accumulate(size_waste_per_tb.begin(), size_waste_per_tb.end(), size_t{});
size_t const total = amount_of_max_bin_kmers * current_node.number_of_technical_bins;
assert(waste <= total);
assert(total > 0u);
return (total - waste) / static_cast<double>(total);
}();
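// Worked example (invented numbers): with a max bin of 100 k-mers and 4 TBs
// holding 100, 80, 60 and 100 k-mers, waste == 0 + 20 + 40 + 0 == 60 and
// total == 4 * 100 == 400, so the load factor is 340 / 400.0 == 0.85.
// (TBs counted as too big above leave their waste slot at 0.)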

auto & current_stats = stats[ibf_pos];
current_stats.size_in_bits = ibf_size;
current_stats.level = current_hibf_level;
current_stats.max_elements = amount_of_max_bin_kmers;
current_stats.tbs_too_big = tbs_too_big.load();
current_stats.tbs_too_many_elements = tbs_too_many_elements.load();
current_stats.load_factor = load_factor;

return ibf_pos;
}

size_t hierarchical_stats(stats & stats_data,
size_t hierarchical_stats(std::vector<ibf_stats> & stats,
seqan::hibf::layout::graph::node const & root_node,
seqan::hibf::build::build_data & data)
{
robin_hood::unordered_flat_set<uint64_t> root_kmers{};
return hierarchical_stats(stats_data, root_kmers, root_node, data, 0);
return hierarchical_stats(stats, root_kmers, root_node, data, 0);
}

void execute_general_stats(config const & cfg)
{
// Read config and layout
std::ifstream layout_file{cfg.input};

if (!layout_file.good() || !layout_file.is_open())
throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"};

auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file);
chopper_config.hibf_config.threads = cfg.threads;

// Prepare configs
chopper_config.hibf_config.threads = cfg.threads;
auto input_lambda = [&filenames, &chopper_config](size_t const user_bin_id, seqan::hibf::insert_iterator it)
{
std::vector<uint64_t> current_kmers;
@@ -218,56 +316,33 @@ void execute_general_stats(config const & cfg)
it = kmer;
};
chopper_config.hibf_config.input_fn = input_lambda;
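The input_fn callback is the HIBF's pull interface for user-bin content: the library invokes it with a user bin index and an insert iterator, and each value assigned through the iterator is recorded as one element (here, one k-mer hash) of that bin. A stand-in with the same shape (illustrative values only):

auto dummy_input = [](size_t const user_bin_id, seqan::hibf::insert_iterator it)
{
    // Pretend each user bin holds three hash values derived from its id.
    for (uint64_t value = 3u * user_bin_id; value < 3u * user_bin_id + 3u; ++value)
        it = value; // each assignment inserts one element into bin user_bin_id
};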

auto const & hibf_config = chopper_config.hibf_config;

std::ofstream output_stream{cfg.output};

if (!output_stream.good() || !output_stream.is_open())
throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"};

// Prepare stats
size_t const number_of_ibfs = hibf_layout.max_bins.size() + 1u;
std::vector<ibf_stats> stats(number_of_ibfs);

stats stats_data;
stats_data.ibf_sizes.resize(number_of_ibfs);
stats_data.ibf_levels.resize(number_of_ibfs);
stats_data.ibf_load_factor.resize(number_of_ibfs);

// Prepare data
seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}};

seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root;

size_t const t_max{root_node.number_of_technical_bins};
data.fpr_correction =
seqan::hibf::layout::compute_fpr_correction({.fpr = hibf_config.maximum_false_positive_rate,
.hash_count = hibf_config.number_of_hash_functions,
.t_max = t_max});

hierarchical_stats(stats_data, root_node, data);
// Get stats
hierarchical_stats(stats, root_node, data);

// accumulate statistics per level
size_t const number_of_levels = std::ranges::max(stats_data.ibf_levels) + 1u;
std::vector<size_t> size_per_level(number_of_levels);
std::vector<double> load_factor_per_level(number_of_levels);
std::vector<size_t> num_ibfs_per_level(number_of_levels);
// Get stats per level
per_level_stats const level_stats{stats};

for (size_t i = 0; i < number_of_ibfs; ++i)
{
size_t const ibf_level{stats_data.ibf_levels[i]};
size_per_level[ibf_level] += stats_data.ibf_sizes[i];
load_factor_per_level[ibf_level] += stats_data.ibf_load_factor[i];
++num_ibfs_per_level[ibf_level];
}

for (size_t i = 0; i < number_of_levels; ++i)
load_factor_per_level[i] /= num_ibfs_per_level[i];
// Output
std::ofstream output_stream{cfg.output};
if (!output_stream.good() || !output_stream.is_open())
throw std::logic_error{"Could not open file " + cfg.output.string() + " for writing"};

output_stream << std::fixed << std::setprecision(2);
output_stream << "# Levels: " << number_of_levels << '\n';
output_stream << "LEVEL\tBIT_SIZE\tIBFS\tLOAD_FACTOR\n";
for (size_t i = 0; i < number_of_levels; ++i)
output_stream << i << '\t' << size_per_level[i] << '\t' << num_ibfs_per_level[i] << '\t'
<< load_factor_per_level[i] * 100 << '\n';
level_stats.print(output_stream);
}
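For illustration, the emitted table now looks like this (invented values, consistent with the sketch after per_level_stats above; columns are tab-separated):

# Levels: 2
LEVEL   BIT_SIZE   IBFS   AVG_LOAD_FACTOR   TBS_TOO_BIG   AVG_TBS_TOO_BIG_ELEMENTS   AVG_MAX_ELEMENTS
0       8192       1      90.00             0             0                          100
1       6144       2      70.00             1             6                          30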

void execute_sizes(config const & cfg)
