Skip to content

Commit

Permalink
[MISC] Add exact size computation.
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Oct 18, 2023
1 parent 80ef70b commit 71e23c0
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 14 deletions.
18 changes: 12 additions & 6 deletions src/display_layout/general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@

#include <iostream>

#include <robin_hood.h>

#include <sharg/detail/to_string.hpp>
#include <sharg/exceptions.hpp>
#include <sharg/parser.hpp>
Expand Down Expand Up @@ -107,6 +105,7 @@ int execute(config const & cfg)
seqan::hibf::sketch::hyperloglog sketch{hibf_config.sketch_bits}; // store one sketch computeted by iterative add
std::vector<seqan::hibf::sketch::hyperloglog> sketches; // store a sketch for each user bin to merge afterwards

robin_hood::unordered_set<uint64_t> current_kmer_set{};
robin_hood::unordered_set<uint64_t> shared_kmers{};
// We can't use `shared_kmers.size() == 0` instead of `shared_kmers_initialised`, because keep_duplicates
// will result in a size of 0 when there are no shared k-mers.
Expand Down Expand Up @@ -142,12 +141,14 @@ int execute(config const & cfg)
<< "ub_count\t"
<< "kind\t"
<< "splits\t"
<< "estimated_size\n";
<< "estimated_size_single_sketch\t"
<< "estimated_size_merged_sketch\n";

auto print_result_line = [&]()
{
bool const is_merged{bin_kinds[current_idx] == chopper::layout::hibf_statistics::bin_kind::merged};
size_t const avg_kmer_count = (sketch.estimate() + split_count - 1u) / split_count;
size_t const avg_kmer_count = (current_kmer_set.size() + split_count - 1u) / split_count;
size_t const one_sketch_estimate = (sketch.estimate() + split_count - 1u) / split_count;
size_t const merge_estimate = merge_sketches_and_estimate(sketches);

for (size_t i{}, total{split_count}; i < total; ++i)
Expand All @@ -157,7 +158,9 @@ int execute(config const & cfg)
<< shared_kmers.size() << '\t' //
<< ub_count << '\t' //
<< (is_merged ? "merged" : "split") << '\t' //
<< split_count << '\t' << (is_merged ? merge_estimate : avg_kmer_count) << '\n';
<< split_count << '\t' //
<< one_sketch_estimate << '\t' //
<< (is_merged ? merge_estimate : one_sketch_estimate) << '\n';
split_count = 0u; // Subsequent split bins display 0, the first split bin displays the actual split count.
}
};
Expand Down Expand Up @@ -185,8 +188,11 @@ int execute(config const & cfg)
if (idx != current_idx)
{
print_result_line();

// reset all current data
sketch.reset();
sketches.clear();
current_kmers.clear();
shared_kmers.clear();
shared_kmers_initialised = false;
ub_count = 0u;
Expand All @@ -207,7 +213,7 @@ int execute(config const & cfg)
{
++ub_count; // This assumes that each user bin has exactly one associated file. Currently the case.

process_file(filename, current_kmers, sketch, sketches, fill_current_kmers, chopper_config.k);
process_file(filename, current_kmer_set, current_kmers, sketch, sketches, fill_current_kmers, chopper_config.k);
}

// Compute set intersection: shared_kmers = shared_kmers ∩ current_kmers
Expand Down
32 changes: 28 additions & 4 deletions src/display_layout/process_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ using sequence_file_type = seqan3::sequence_file_input<dna4_traits,
seqan3::type_list<seqan3::format_fasta, seqan3::format_fastq>>;

void process_file(std::string const & filename,
robin_hood::unordered_set<uint64_t> & current_kmer_set,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
std::vector<seqan::hibf::sketch::hyperloglog> & sketches,
Expand All @@ -44,6 +45,7 @@ void process_file(std::string const & filename,
while (infile.read(hash_data, hash_bytes))
{
current_kmers.push_back(hash);
current_kmer_set.insert(hash);
sketch.add(hash);
local_ub_sketch.add(hash);
}
Expand All @@ -53,6 +55,7 @@ void process_file(std::string const & filename,
while (infile.read(hash_data, hash_bytes))
{
sketch.add(hash);
current_kmer_set.insert(hash);
local_ub_sketch.add(hash);
}
}
Expand All @@ -72,6 +75,7 @@ void process_file(std::string const & filename,
for (uint64_t hash_value : seq | minimizer_view)
{
current_kmers.push_back(hash_value);
current_kmer_set.insert(hash_value);
sketch.add(hash_value);
local_ub_sketch.add(hash_value);
}
Expand All @@ -83,6 +87,7 @@ void process_file(std::string const & filename,
{
for (uint64_t hash_value : seq | minimizer_view)
{
current_kmer_set.insert(hash_value);
sketch.add(hash_value);
local_ub_sketch.add(hash_value);
}
Expand All @@ -95,11 +100,30 @@ void process_file(std::string const & filename,

void process_file(std::string const & filename,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
uint8_t const kmer_size)
{
std::vector<seqan::hibf::sketch::hyperloglog> sketches;
if (filename.ends_with(".minimiser"))
{
uint64_t hash{};
char * const hash_data{reinterpret_cast<char *>(&hash)};
std::streamsize const hash_bytes{sizeof(hash)};

std::ifstream infile{filename, std::ios::binary};

while (infile.read(hash_data, hash_bytes))
current_kmers.push_back(hash);
}
else
{
sequence_file_type fin{filename};

seqan3::shape shape{seqan3::ungapped{kmer_size}};
auto minimizer_view = seqan3::views::minimiser_hash(shape,
seqan3::window_size{kmer_size},
seqan3::seed{chopper::adjust_seed(shape.count())});

process_file(filename, current_kmers, sketch, sketches, fill_current_kmers, kmer_size);
for (auto && [seq] : fin)
for (uint64_t hash_value : seq | minimizer_view)
current_kmers.push_back(hash_value);
}
}
5 changes: 3 additions & 2 deletions src/display_layout/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include <string>
#include <vector>

#include <robin_hood.h>

#include <hibf/sketch/hyperloglog.hpp>

struct config
Expand All @@ -26,11 +28,10 @@ void execute_sizes(config const & cfg);

void process_file(std::string const & filename,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
uint8_t const kmer_size);

void process_file(std::string const & filename,
robin_hood::unordered_set<uint64_t> & current_kmer_set,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
std::vector<seqan::hibf::sketch::hyperloglog> & sketches,
Expand Down
3 changes: 1 addition & 2 deletions src/display_layout/sizes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,11 @@ void execute_general_stats(config const & cfg)
auto input_lambda = [&filenames, &chopper_config](size_t const user_bin_id, seqan::hibf::insert_iterator it)
{
std::vector<uint64_t> current_kmers;
seqan::hibf::sketch::hyperloglog sketch;

if (filenames[user_bin_id].size() > 1)
throw std::runtime_error{"No multi files accepted yet."};

process_file(filenames[user_bin_id][0], current_kmers, sketch, true, chopper_config.k);
process_file(filenames[user_bin_id][0], current_kmers, chopper_config.k);

for (auto const kmer : current_kmers)
it = kmer;
Expand Down

0 comments on commit 71e23c0

Please sign in to comment.