Skip to content

Commit

Permalink
[MISC] Remove merge sketches again.
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Oct 18, 2023
1 parent 71e23c0 commit c02fece
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 33 deletions.
28 changes: 6 additions & 22 deletions src/display_layout/general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,6 @@ void keep_duplicates(robin_hood::unordered_set<uint64_t> & shared, std::vector<u
shared = std::move(result);
}

// Merges all given HyperLogLog sketches into a single sketch and returns the
// value of `estimate()` called on that merged sketch.
//
// `sketches` must contain at least one element; this is only checked via
// `assert`. The input sketches themselves are not modified.
size_t merge_sketches_and_estimate(std::vector<seqan::hibf::sketch::hyperloglog> const & sketches)
{
    assert(sketches.size() > 0);

    // Work on a copy of the first sketch so the caller's data stays untouched.
    seqan::hibf::sketch::hyperloglog merged_sketch = *sketches.begin();

    // Fold every remaining sketch into the copy, in order.
    for (auto it = sketches.begin() + 1; it != sketches.end(); ++it)
    {
        merged_sketch.merge(*it);
    }

    return merged_sketch.estimate();
}

int execute(config const & cfg)
{
std::ifstream layout_file{cfg.input};
Expand Down Expand Up @@ -103,7 +92,6 @@ int execute(config const & cfg)
});

seqan::hibf::sketch::hyperloglog sketch{hibf_config.sketch_bits}; // store one sketch computed by iterative add
std::vector<seqan::hibf::sketch::hyperloglog> sketches; // store a sketch for each user bin to merge afterwards

robin_hood::unordered_set<uint64_t> current_kmer_set{};
robin_hood::unordered_set<uint64_t> shared_kmers{};
Expand Down Expand Up @@ -137,30 +125,27 @@ int execute(config const & cfg)
output_stream << "# Layout: " << cfg.input.c_str() << '\n' //
<< "tb_index\t"
<< "size\t"
<< "estimated_size\t"
<< "shared_size\t"
<< "ub_count\t"
<< "kind\t"
<< "splits\t"
<< "estimated_size_single_sketch\t"
<< "estimated_size_merged_sketch\n";
<< "splits\n";

auto print_result_line = [&]()
{
bool const is_merged{bin_kinds[current_idx] == chopper::layout::hibf_statistics::bin_kind::merged};
size_t const avg_kmer_count = (current_kmer_set.size() + split_count - 1u) / split_count;
size_t const one_sketch_estimate = (sketch.estimate() + split_count - 1u) / split_count;
size_t const merge_estimate = merge_sketches_and_estimate(sketches);
size_t const sketch_estimate = (sketch.estimate() + split_count - 1u) / split_count;

for (size_t i{}, total{split_count}; i < total; ++i)
{
output_stream << current_idx + i << '\t' //
<< avg_kmer_count << '\t' //
<< sketch_estimate << '\t' //
<< shared_kmers.size() << '\t' //
<< ub_count << '\t' //
<< (is_merged ? "merged" : "split") << '\t' //
<< split_count << '\t' //
<< one_sketch_estimate << '\t' //
<< (is_merged ? merge_estimate : one_sketch_estimate) << '\n';
<< split_count << '\n';
split_count = 0u; // Subsequent split bins display 0, the first split bin displays the actual split count.
}
};
Expand Down Expand Up @@ -191,7 +176,6 @@ int execute(config const & cfg)

// reset all current data
sketch.reset();
sketches.clear();
current_kmers.clear();
shared_kmers.clear();
shared_kmers_initialised = false;
Expand All @@ -213,7 +197,7 @@ int execute(config const & cfg)
{
++ub_count; // This assumes that each user bin has exactly one associated file. Currently the case.

process_file(filename, current_kmer_set, current_kmers, sketch, sketches, fill_current_kmers, chopper_config.k);
process_file(filename, current_kmer_set, current_kmers, sketch, fill_current_kmers, chopper_config.k);
}

// Compute set intersection: shared_kmers = shared_kmers ∩ current_kmers
Expand Down
10 changes: 0 additions & 10 deletions src/display_layout/process_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,9 @@ void process_file(std::string const & filename,
robin_hood::unordered_set<uint64_t> & current_kmer_set,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
std::vector<seqan::hibf::sketch::hyperloglog> & sketches,
bool const fill_current_kmers,
uint8_t const kmer_size)
{
seqan::hibf::sketch::hyperloglog local_ub_sketch = sketch; // copy sketch configuration
local_ub_sketch.reset();

if (filename.ends_with(".minimiser"))
{
uint64_t hash{};
Expand All @@ -47,7 +43,6 @@ void process_file(std::string const & filename,
current_kmers.push_back(hash);
current_kmer_set.insert(hash);
sketch.add(hash);
local_ub_sketch.add(hash);
}
}
else
Expand All @@ -56,7 +51,6 @@ void process_file(std::string const & filename,
{
sketch.add(hash);
current_kmer_set.insert(hash);
local_ub_sketch.add(hash);
}
}
}
Expand All @@ -77,7 +71,6 @@ void process_file(std::string const & filename,
current_kmers.push_back(hash_value);
current_kmer_set.insert(hash_value);
sketch.add(hash_value);
local_ub_sketch.add(hash_value);
}
}
}
Expand All @@ -89,13 +82,10 @@ void process_file(std::string const & filename,
{
current_kmer_set.insert(hash_value);
sketch.add(hash_value);
local_ub_sketch.add(hash_value);
}
}
}
}

sketches.push_back(local_ub_sketch);
}

void process_file(std::string const & filename,
Expand Down
1 change: 0 additions & 1 deletion src/display_layout/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,5 @@ void process_file(std::string const & filename,
robin_hood::unordered_set<uint64_t> & current_kmer_set,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
std::vector<seqan::hibf::sketch::hyperloglog> & sketches,
bool const fill_current_kmers,
uint8_t const kmer_size);

0 comments on commit c02fece

Please sign in to comment.