Skip to content

Commit

Permalink
[FEATURE] print out merge() stats.
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Oct 18, 2023
1 parent 6aae9a6 commit 80ef70b
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 4 deletions.
24 changes: 20 additions & 4 deletions src/display_layout/general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,17 @@ void keep_duplicates(robin_hood::unordered_set<uint64_t> & shared, std::vector<u
shared = std::move(result);
}

size_t merge_sketches_and_estimate(std::vector<seqan::hibf::sketch::hyperloglog> const & sketches)
{
assert(sketches.size() > 0);
seqan::hibf::sketch::hyperloglog temp_hll = sketches[0];

for (size_t j = 1; j < sketches.size(); ++j)
temp_hll.merge(sketches[j]);

return temp_hll.estimate();
}

int execute(config const & cfg)
{
std::ifstream layout_file{cfg.input};
Expand Down Expand Up @@ -93,7 +104,9 @@ int execute(config const & cfg)
return first_idx < second_idx || (first_idx == second_idx && filesizes[lhs.idx] < filesizes[rhs.idx]);
});

seqan::hibf::sketch::hyperloglog sketch{hibf_config.sketch_bits};
seqan::hibf::sketch::hyperloglog sketch{hibf_config.sketch_bits}; // store one sketch computeted by iterative add
std::vector<seqan::hibf::sketch::hyperloglog> sketches; // store a sketch for each user bin to merge afterwards

robin_hood::unordered_set<uint64_t> shared_kmers{};
// We can't use `shared_kmers.size() == 0` instead of `shared_kmers_initialised`, because keep_duplicates
// will result in a size of 0 when there are no shared k-mers.
Expand Down Expand Up @@ -128,12 +141,14 @@ int execute(config const & cfg)
<< "shared_size\t"
<< "ub_count\t"
<< "kind\t"
<< "splits" << '\n';
<< "splits\t"
<< "estimated_size\n";

auto print_result_line = [&]()
{
bool const is_merged{bin_kinds[current_idx] == chopper::layout::hibf_statistics::bin_kind::merged};
size_t const avg_kmer_count = (sketch.estimate() + split_count - 1u) / split_count;
size_t const merge_estimate = merge_sketches_and_estimate(sketches);

for (size_t i{}, total{split_count}; i < total; ++i)
{
Expand All @@ -142,7 +157,7 @@ int execute(config const & cfg)
<< shared_kmers.size() << '\t' //
<< ub_count << '\t' //
<< (is_merged ? "merged" : "split") << '\t' //
<< split_count << '\n';
<< split_count << '\t' << (is_merged ? merge_estimate : avg_kmer_count) << '\n';
split_count = 0u; // Subsequent split bins display 0, the first split bin displays the actual split count.
}
};
Expand Down Expand Up @@ -171,6 +186,7 @@ int execute(config const & cfg)
{
print_result_line();
sketch.reset();
sketches.clear();
shared_kmers.clear();
shared_kmers_initialised = false;
ub_count = 0u;
Expand All @@ -191,7 +207,7 @@ int execute(config const & cfg)
{
++ub_count; // This assumes that each user bin has exactly one associated file. Currently the case.

process_file(filename, current_kmers, sketch, fill_current_kmers, chopper_config.k);
process_file(filename, current_kmers, sketch, sketches, fill_current_kmers, chopper_config.k);
}

// Compute set intersection: shared_kmers = shared_kmers ∩ current_kmers
Expand Down
21 changes: 21 additions & 0 deletions src/display_layout/process_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,13 @@ using sequence_file_type = seqan3::sequence_file_input<dna4_traits,
void process_file(std::string const & filename,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
std::vector<seqan::hibf::sketch::hyperloglog> & sketches,
bool const fill_current_kmers,
uint8_t const kmer_size)
{
seqan::hibf::sketch::hyperloglog local_ub_sketch = sketch; // copy sketch configuration
local_ub_sketch.reset();

if (filename.ends_with(".minimiser"))
{
uint64_t hash{};
Expand All @@ -41,13 +45,15 @@ void process_file(std::string const & filename,
{
current_kmers.push_back(hash);
sketch.add(hash);
local_ub_sketch.add(hash);
}
}
else
{
while (infile.read(hash_data, hash_bytes))
{
sketch.add(hash);
local_ub_sketch.add(hash);
}
}
}
Expand All @@ -67,6 +73,7 @@ void process_file(std::string const & filename,
{
current_kmers.push_back(hash_value);
sketch.add(hash_value);
local_ub_sketch.add(hash_value);
}
}
}
Expand All @@ -77,8 +84,22 @@ void process_file(std::string const & filename,
for (uint64_t hash_value : seq | minimizer_view)
{
sketch.add(hash_value);
local_ub_sketch.add(hash_value);
}
}
}
}

sketches.push_back(local_ub_sketch);
}

void process_file(std::string const & filename,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
uint8_t const kmer_size)
{
std::vector<seqan::hibf::sketch::hyperloglog> sketches;

process_file(filename, current_kmers, sketch, sketches, fill_current_kmers, kmer_size);
}
7 changes: 7 additions & 0 deletions src/display_layout/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,10 @@ void process_file(std::string const & filename,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
uint8_t const kmer_size);

void process_file(std::string const & filename,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
std::vector<seqan::hibf::sketch::hyperloglog> & sketches,
bool const fill_current_kmers,
uint8_t const kmer_size);

0 comments on commit 80ef70b

Please sign in to comment.