From 80ef70b4fa56aad831915b4626129240eb27a612 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Mon, 9 Oct 2023 17:11:19 +0200 Subject: [PATCH] [FEATURE] print out merge() stats. --- src/display_layout/general.cpp | 24 ++++++++++++++++++++---- src/display_layout/process_file.cpp | 21 +++++++++++++++++++++ src/display_layout/shared.hpp | 7 +++++++ 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/src/display_layout/general.cpp b/src/display_layout/general.cpp index f5b042aa..7c9e31a2 100644 --- a/src/display_layout/general.cpp +++ b/src/display_layout/general.cpp @@ -50,6 +50,17 @@ void keep_duplicates(robin_hood::unordered_set & shared, std::vector const & sketches) +{ + assert(sketches.size() > 0); + seqan::hibf::sketch::hyperloglog temp_hll = sketches[0]; + + for (size_t j = 1; j < sketches.size(); ++j) + temp_hll.merge(sketches[j]); + + return temp_hll.estimate(); +} + int execute(config const & cfg) { std::ifstream layout_file{cfg.input}; @@ -93,7 +104,9 @@ int execute(config const & cfg) return first_idx < second_idx || (first_idx == second_idx && filesizes[lhs.idx] < filesizes[rhs.idx]); }); - seqan::hibf::sketch::hyperloglog sketch{hibf_config.sketch_bits}; + seqan::hibf::sketch::hyperloglog sketch{hibf_config.sketch_bits}; // store one sketch computeted by iterative add + std::vector sketches; // store a sketch for each user bin to merge afterwards + robin_hood::unordered_set shared_kmers{}; // We can't use `shared_kmers.size() == 0` instead of `shared_kmers_initialised`, because keep_duplicates // will result in a size of 0 when there are no shared k-mers. @@ -128,12 +141,14 @@ int execute(config const & cfg) << "shared_size\t" << "ub_count\t" << "kind\t" - << "splits" << '\n'; + << "splits\t" + << "estimated_size\n"; auto print_result_line = [&]() { bool const is_merged{bin_kinds[current_idx] == chopper::layout::hibf_statistics::bin_kind::merged}; size_t const avg_kmer_count = (sketch.estimate() + split_count - 1u) / split_count; + size_t const merge_estimate = merge_sketches_and_estimate(sketches); for (size_t i{}, total{split_count}; i < total; ++i) { @@ -142,7 +157,7 @@ int execute(config const & cfg) << shared_kmers.size() << '\t' // << ub_count << '\t' // << (is_merged ? "merged" : "split") << '\t' // - << split_count << '\n'; + << split_count << '\t' << (is_merged ? merge_estimate : avg_kmer_count) << '\n'; split_count = 0u; // Subsequent split bins display 0, the first split bin displays the actual split count. } }; @@ -171,6 +186,7 @@ int execute(config const & cfg) { print_result_line(); sketch.reset(); + sketches.clear(); shared_kmers.clear(); shared_kmers_initialised = false; ub_count = 0u; @@ -191,7 +207,7 @@ int execute(config const & cfg) { ++ub_count; // This assumes that each user bin has exactly one associated file. Currently the case. - process_file(filename, current_kmers, sketch, fill_current_kmers, chopper_config.k); + process_file(filename, current_kmers, sketch, sketches, fill_current_kmers, chopper_config.k); } // Compute set intersection: shared_kmers = shared_kmers ∩ current_kmers diff --git a/src/display_layout/process_file.cpp b/src/display_layout/process_file.cpp index 3d08361a..1fedc268 100644 --- a/src/display_layout/process_file.cpp +++ b/src/display_layout/process_file.cpp @@ -24,9 +24,13 @@ using sequence_file_type = seqan3::sequence_file_input & current_kmers, seqan::hibf::sketch::hyperloglog & sketch, + std::vector & sketches, bool const fill_current_kmers, uint8_t const kmer_size) { + seqan::hibf::sketch::hyperloglog local_ub_sketch = sketch; // copy sketch configuration + local_ub_sketch.reset(); + if (filename.ends_with(".minimiser")) { uint64_t hash{}; @@ -41,6 +45,7 @@ void process_file(std::string const & filename, { current_kmers.push_back(hash); sketch.add(hash); + local_ub_sketch.add(hash); } } else @@ -48,6 +53,7 @@ void process_file(std::string const & filename, while (infile.read(hash_data, hash_bytes)) { sketch.add(hash); + local_ub_sketch.add(hash); } } } @@ -67,6 +73,7 @@ void process_file(std::string const & filename, { current_kmers.push_back(hash_value); sketch.add(hash_value); + local_ub_sketch.add(hash_value); } } } @@ -77,8 +84,22 @@ void process_file(std::string const & filename, for (uint64_t hash_value : seq | minimizer_view) { sketch.add(hash_value); + local_ub_sketch.add(hash_value); } } } } + + sketches.push_back(local_ub_sketch); +} + +void process_file(std::string const & filename, + std::vector & current_kmers, + seqan::hibf::sketch::hyperloglog & sketch, + bool const fill_current_kmers, + uint8_t const kmer_size) +{ + std::vector sketches; + + process_file(filename, current_kmers, sketch, sketches, fill_current_kmers, kmer_size); } diff --git a/src/display_layout/shared.hpp b/src/display_layout/shared.hpp index 55122c1d..1c53838c 100644 --- a/src/display_layout/shared.hpp +++ b/src/display_layout/shared.hpp @@ -29,3 +29,10 @@ void process_file(std::string const & filename, seqan::hibf::sketch::hyperloglog & sketch, bool const fill_current_kmers, uint8_t const kmer_size); + +void process_file(std::string const & filename, + std::vector & current_kmers, + seqan::hibf::sketch::hyperloglog & sketch, + std::vector & sketches, + bool const fill_current_kmers, + uint8_t const kmer_size);