Skip to content

Commit

Permalink
[MISC] Update chopper to use newest hibf
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Oct 12, 2023
1 parent 3e95e14 commit 3791716
Show file tree
Hide file tree
Showing 13 changed files with 46 additions and 48 deletions.
2 changes: 1 addition & 1 deletion include/chopper/layout/hibf_statistics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ class hibf_statistics
{
seqan::hibf::sketch::hyperloglog tmp =
merged_bin_sketches[i]; // copy needed, s.t. current is not modified
double union_estimate = tmp.merge_and_estimate_SIMD(merged_bin_sketches[j]);
double union_estimate = tmp.merge_and_estimate(merged_bin_sketches[j]);
// Jaccard distance estimate
double distance = 2.0 - (current_estimate + merged_bin_sketches[j].estimate()) / union_estimate;
// Since the sizes are estimates, the distance might be slightly above 1.0 or below 0.0
Expand Down
2 changes: 1 addition & 1 deletion include/chopper/sketch/output.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ inline void write_sketch_file(std::string const & filename,
std::filesystem::path path = config.sketch_directory / std::filesystem::path(filename).stem();
path += ".hll";
std::ofstream hll_fout(path, std::ios::binary);
sketch.dump(hll_fout);
sketch.store(hll_fout);
}

} // namespace chopper::sketch
2 changes: 1 addition & 1 deletion include/chopper/sketch/read_hll_files_into.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ inline void read_hll_files_into(std::filesystem::path const & hll_dir,
throw std::runtime_error{"Could not open file " + path.string()};

// the sketch bits will be automatically read from the files
target.emplace_back().restore(hll_fin);
target.emplace_back().load(hll_fin);
}
}
catch (std::runtime_error const & err)
Expand Down
2 changes: 1 addition & 1 deletion src/display_layout/general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ int execute(config const & cfg)
if (idx != current_idx)
{
print_result_line();
sketch.clear();
sketch.reset();
shared_kmers.clear();
shared_kmers_initialised = false;
ub_count = 0u;
Expand Down
6 changes: 3 additions & 3 deletions src/measure_hyperloglog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,18 +103,18 @@ int main(int argc, char const * argv[])

for (auto & sketch : sketches)
{
double const expected_error = 1.04 / std::sqrt(sketch.registerSize());
double const expected_error = 1.04 / std::sqrt(sketch.data_size());
double const actual_error =
std::abs(1.0 - std::round(sketch.estimate()) / static_cast<double>(control.size()));

fout << id << '\t' << seq.size() << '\t' << sketch.registerSize() << '\t'
fout << id << '\t' << seq.size() << '\t' << sketch.data_size() << '\t'
<< static_cast<uint64_t>(sketch.estimate()) << '\t' << control.size() << '\t' << expected_error << '\t'
<< actual_error << '\n';
}

// clear for the next sequence
for (auto & sketch : sketches)
sketch.clear();
sketch.reset();

control.clear();
}
Expand Down
8 changes: 4 additions & 4 deletions test/api/layout/execute_layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ TEST(execute_test, few_ubs)

auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
{
size_t const desired_kmer_count = (num == 1) ? 1000 : 500;
size_t const desired_kmer_count = (num == 1) ? 880 : 475; // Estimates are 990.71 and 504.88
for (auto hash : std::views::iota(0u, desired_kmer_count))
it = hash;
};
Expand Down Expand Up @@ -299,7 +299,7 @@ TEST(execute_test, many_ubs)
"@ }\n"
"@}\n"
"@HIBF_CONFIG_END\n"
"#TOP_LEVEL_IBF fullest_technical_bin_idx:0\n"
"#TOP_LEVEL_IBF fullest_technical_bin_idx:26\n"
"#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:14\n"
"#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:14\n"
"#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:14\n"
Expand Down Expand Up @@ -331,8 +331,8 @@ TEST(execute_test, many_ubs)
"6\t2;34\t1;10\n"
"7\t2;44\t1;10\n"
"8\t2;54\t1;10\n"
"33\t3;0\t1;27\n"
"32\t3;27\t1;25\n"
"33\t3;0\t1;26\n"
"32\t3;26\t1;26\n"
"1\t3;52\t1;6\n"
"2\t3;58\t1;6\n"
"30\t4;0\t1;22\n"
Expand Down
31 changes: 16 additions & 15 deletions test/api/layout/execute_with_estimation_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
#include <chopper/layout/execute.hpp>
#include <chopper/sketch/read_hll_files_into.hpp>

#include "../api_test.hpp"
#include <hibf/sketch/hyperloglog.hpp>

#include "../api_test.hpp"

TEST(execute_estimation_test, few_ubs)
{
seqan3::test::tmp_directory tmp_dir{};
Expand Down Expand Up @@ -66,7 +67,7 @@ TEST(execute_estimation_test, few_ubs)
## (l*m)_tmax : Computed by l_tmax * m_tmax
## size : The expected total size of an tmax-HIBF
# tmax c_tmax l_tmax m_tmax (l*m)_tmax size
64 1.00 1.00 1.00 1.00 14.6KiB
64 1.00 1.00 1.00 1.00 15.7KiB
# Best t_max (regarding expected query runtime): 64
)expected_cout");
}
Expand Down Expand Up @@ -121,9 +122,9 @@ TEST(execute_estimation_test, many_ubs)
## (l*m)_tmax : Computed by l_tmax * m_tmax
## size : The expected total size of an tmax-HIBF
# tmax c_tmax l_tmax m_tmax (l*m)_tmax size
64 1.00 1.26 1.00 1.26 74.5KiB
128 1.22 1.25 0.66 0.83 49.4KiB
256 1.33 1.33 0.74 0.99 55.1KiB
64 1.00 1.25 1.00 1.25 75.4KiB
128 1.22 1.24 0.69 0.86 51.9KiB
256 1.33 1.33 0.76 1.02 57.5KiB
# Best t_max (regarding expected query runtime): 128
)expected_cout");

Expand Down Expand Up @@ -266,7 +267,7 @@ TEST(execute_estimation_test, many_ubs)
"@ }\n"
"@}\n"
"@HIBF_CONFIG_END\n"
"#TOP_LEVEL_IBF fullest_technical_bin_idx:14\n"
"#TOP_LEVEL_IBF fullest_technical_bin_idx:36\n"
"#LOWER_LEVEL_IBF_14 fullest_technical_bin_idx:24\n"
"#LOWER_LEVEL_IBF_15 fullest_technical_bin_idx:24\n"
"#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS\n"
Expand Down Expand Up @@ -421,9 +422,9 @@ TEST(execute_estimation_test, many_ubs_force_all)
## (l*m)_tmax : Computed by l_tmax * m_tmax
## size : The expected total size of an tmax-HIBF
# tmax c_tmax l_tmax m_tmax (l*m)_tmax size
64 1.00 1.26 1.00 1.26 74.5KiB
128 1.22 1.25 0.66 0.83 49.4KiB
256 1.33 1.33 0.74 0.99 55.1KiB
64 1.00 1.25 1.00 1.25 75.4KiB
128 1.22 1.24 0.69 0.86 51.9KiB
256 1.33 1.33 0.76 1.02 57.5KiB
# Best t_max (regarding expected query runtime): 128
)expected_cout");

Expand Down Expand Up @@ -465,9 +466,9 @@ TEST(execute_estimation_test, with_rearrangement)
hll_filenames.push_back("small.hll");

expected_kmer_counts.push_back(387);
expected_kmer_counts.push_back(470);
expected_kmer_counts.push_back(465);
expected_kmer_counts.push_back(465);
expected_kmer_counts.push_back(571);
expected_kmer_counts.push_back(578);
}

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
Expand Down Expand Up @@ -503,7 +504,7 @@ TEST(execute_estimation_test, with_rearrangement)
chopper::sketch::read_hll_files_into(sketches_dir, hll_filenames, sketches);

for (size_t i = 0; i < expected_kmer_counts.size(); ++i)
EXPECT_EQ(std::lround(sketches[i].estimate()), expected_kmer_counts[i]) << "failed at " << i;
ASSERT_EQ(std::lround(sketches[i].estimate()), expected_kmer_counts[i]) << "failed at " << i;

ASSERT_TRUE(std::filesystem::exists(stats_file));

Expand All @@ -525,9 +526,9 @@ TEST(execute_estimation_test, with_rearrangement)
## (l*m)_tmax : Computed by l_tmax * m_tmax
## size : The expected total size of an tmax-HIBF
# tmax c_tmax l_tmax m_tmax (l*m)_tmax size
64 1.00 2.23 1.00 2.23 116.5KiB
128 1.22 1.95 1.15 2.24 133.7KiB
256 1.33 1.53 1.19 1.82 138.7KiB
64 1.00 2.22 1.00 2.22 117.1KiB
128 1.22 1.95 1.15 2.23 134.3KiB
256 1.33 1.52 1.18 1.81 138.7KiB
# Best t_max (regarding expected query runtime): 256
)expected_cout");

Expand Down
6 changes: 3 additions & 3 deletions test/api/layout/hibf_statistics_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ TEST(execute_test, chopper_layout_statistics)
## size : The expected total size of an tmax-HIBF
## uncorr_size : The expected size of an tmax-HIBF without FPR correction
# tmax c_tmax l_tmax m_tmax (l*m)_tmax size uncorr_size level num_ibfs level_size level_size_no_corr total_num_tbs avg_num_tbs split_tb_percentage max_split_tb avg_split_tb max_factor avg_factor
64 1.00 1.26 1.00 1.26 74.8KiB 45.8KiB :0:1 :1:12 :37.8KiB:37.0KiB :37.8KiB:8.0KiB :64:768 :64:64 :81.25:100.00 :1:32 :1.00:17.45 :1.00:6.20 :1.00:4.84
64 1.00 1.25 1.00 1.25 75.5KiB 46.8KiB :0:1 :1:12 :38.8KiB:36.7KiB :38.8KiB:8.0KiB :64:768 :64:64 :81.25:100.00 :1:32 :1.00:17.45 :1.00:6.20 :1.00:4.84
)expected_cout";

EXPECT_EQ(layout_result_stdout, expected_cout) << layout_result_stdout;
Expand Down Expand Up @@ -194,8 +194,8 @@ TEST(execute_test, chopper_layout_statistics_determine_best_bins)
## size : The expected total size of an tmax-HIBF
## uncorr_size : The expected size of an tmax-HIBF without FPR correction
# tmax c_tmax l_tmax m_tmax (l*m)_tmax size uncorr_size level num_ibfs level_size level_size_no_corr total_num_tbs avg_num_tbs split_tb_percentage max_split_tb avg_split_tb max_factor avg_factor
64 1.00 1.00 1.00 1.00 1.6MiB 1.2MiB :0 :1 :1.6MiB :1.2MiB :64 :64 :100.00 :16 :6.40 :4.35 :3.05
128 1.22 1.22 1.41 1.72 2.3MiB 1.2MiB :0 :1 :2.3MiB :1.2MiB :128 :128 :100.00 :32 :12.80 :6.20 :4.37
64 1.00 1.00 1.00 1.00 3.1MiB 2.2MiB :0 :1 :3.1MiB :2.2MiB :64 :64 :100.00 :15 :6.40 :4.20 :3.06
128 1.22 1.22 1.40 1.72 4.3MiB 2.2MiB :0 :1 :4.3MiB :2.2MiB :128 :128 :100.00 :31 :12.80 :6.10 :4.39
# Best t_max (regarding expected query runtime): 64
)expected_cout";

Expand Down
19 changes: 10 additions & 9 deletions test/api/sketch/read_hll_files_into_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@
#include <gtest/gtest.h>

#include <seqan3/io/sequence_file/input.hpp>
#include <seqan3/search/views/minimiser_hash.hpp>

#include <chopper/adjust_seed.hpp>
#include <chopper/sketch/read_hll_files_into.hpp>

#include "../api_test.hpp"

struct input_traits : public seqan3::sequence_file_input_default_traits_dna
{
using sequence_alphabet = char;
using sequence_legal_alphabet = char;
using sequence_alphabet = seqan3::dna4;
};
using sequence_file_type = seqan3::sequence_file_input<input_traits, seqan3::fields<seqan3::field::seq>>;

Expand Down Expand Up @@ -45,15 +46,15 @@ TEST_F(read_hll_files_into_test, basic)
std::string const input_file{data("small.fa")};
sequence_file_type seq_file{input_file};

auto minimizer_view = seqan3::views::minimiser_hash(seqan3::ungapped{kmer_size},
seqan3::window_size{kmer_size},
seqan3::seed{chopper::adjust_seed(kmer_size)});

// put every sequence in this file into the sketch
for (auto && [seq_vec] : seq_file)
for (auto && [seq] : seq_file)
{
std::string_view const seq{seq_vec.begin(), seq_vec.end()};

for (size_t pos = 0; pos + kmer_size <= seq.size(); ++pos) // substr is [pos, pos + len)
{
expected.add(seq.substr(pos, kmer_size));
}
for (auto hash : seq | minimizer_view)
expected.add(hash);
}

chopper::sketch::read_hll_files_into(data(""), test_filenames, target);
Expand Down
4 changes: 2 additions & 2 deletions test/cli/cli_chopper_pipeline_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,9 @@ TEST_F(cli_test, chopper_layout2)
"@HIBF_CONFIG_END\n"
"#TOP_LEVEL_IBF fullest_technical_bin_idx:54\n"
"#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS\n"
"2\t0\t15\n"
"1\t0\t15\n"
"3\t15\t24\n"
"1\t39\t15\n"
"2\t39\t15\n"
"0\t54\t10\n"};

std::string const actual_file{string_from_file(binning_filename)};
Expand Down
4 changes: 2 additions & 2 deletions test/data/datasources.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@ declare_datasource (FILE small.split URL ${CMAKE_SOURCE_DIR}/test/data/small.spl
)

declare_datasource (FILE small.hll URL ${CMAKE_SOURCE_DIR}/test/data/small.hll URL_HASH
SHA256=0f0791fb26fb87b854108e43e5350ff49b483082aca2b6afe5b7b63ff727c15a
SHA256=039d7bdbb70e4d98285c09bbbcb2cd1fafb8038689c270cf231debb248c59206
)

declare_datasource (FILE small2.hll URL ${CMAKE_SOURCE_DIR}/test/data/small2.hll URL_HASH
SHA256=0f0791fb26fb87b854108e43e5350ff49b483082aca2b6afe5b7b63ff727c15a
SHA256=039d7bdbb70e4d98285c09bbbcb2cd1fafb8038689c270cf231debb248c59206
)

declare_datasource (FILE small.minimiser URL ${CMAKE_SOURCE_DIR}/test/data/small.minimiser URL_HASH
Expand Down
4 changes: 1 addition & 3 deletions test/data/small.hll
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@



 
4 changes: 1 addition & 3 deletions test/data/small2.hll
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@



 

0 comments on commit 3791716

Please sign in to comment.