From 3791716cc3a28a470d7a0f6979bc2fd1c7130fab Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Thu, 12 Oct 2023 14:32:34 +0200 Subject: [PATCH] [MISC] Update chopper to use newest hibf --- include/chopper/layout/hibf_statistics.hpp | 2 +- include/chopper/sketch/output.hpp | 2 +- .../chopper/sketch/read_hll_files_into.hpp | 2 +- src/display_layout/general.cpp | 2 +- src/measure_hyperloglog.cpp | 6 ++-- test/api/layout/execute_layout_test.cpp | 8 ++--- .../layout/execute_with_estimation_test.cpp | 31 ++++++++++--------- test/api/layout/hibf_statistics_test.cpp | 6 ++-- test/api/sketch/read_hll_files_into_test.cpp | 19 ++++++------ test/cli/cli_chopper_pipeline_test.cpp | 4 +-- test/data/datasources.cmake | 4 +-- test/data/small.hll | 4 +-- test/data/small2.hll | 4 +-- 13 files changed, 46 insertions(+), 48 deletions(-) diff --git a/include/chopper/layout/hibf_statistics.hpp b/include/chopper/layout/hibf_statistics.hpp index 0ddcae06..5d32eea2 100644 --- a/include/chopper/layout/hibf_statistics.hpp +++ b/include/chopper/layout/hibf_statistics.hpp @@ -655,7 +655,7 @@ class hibf_statistics { seqan::hibf::sketch::hyperloglog tmp = merged_bin_sketches[i]; // copy needed, s.t. current is not modified - double union_estimate = tmp.merge_and_estimate_SIMD(merged_bin_sketches[j]); + double union_estimate = tmp.merge_and_estimate(merged_bin_sketches[j]); // Jaccard distance estimate double distance = 2.0 - (current_estimate + merged_bin_sketches[j].estimate()) / union_estimate; // Since the sizes are estimates, the distance might be slighlty above 1.0 or below 0.0 diff --git a/include/chopper/sketch/output.hpp b/include/chopper/sketch/output.hpp index 6f8c4093..b403778d 100644 --- a/include/chopper/sketch/output.hpp +++ b/include/chopper/sketch/output.hpp @@ -40,7 +40,7 @@ inline void write_sketch_file(std::string const & filename, std::filesystem::path path = config.sketch_directory / std::filesystem::path(filename).stem(); path += ".hll"; std::ofstream hll_fout(path, std::ios::binary); - sketch.dump(hll_fout); + sketch.store(hll_fout); } } // namespace chopper::sketch diff --git a/include/chopper/sketch/read_hll_files_into.hpp b/include/chopper/sketch/read_hll_files_into.hpp index 07a35d4e..f05ffca2 100644 --- a/include/chopper/sketch/read_hll_files_into.hpp +++ b/include/chopper/sketch/read_hll_files_into.hpp @@ -38,7 +38,7 @@ inline void read_hll_files_into(std::filesystem::path const & hll_dir, throw std::runtime_error{"Could not open file " + path.string()}; // the sketch bits will be automatically read from the files - target.emplace_back().restore(hll_fin); + target.emplace_back().load(hll_fin); } } catch (std::runtime_error const & err) diff --git a/src/display_layout/general.cpp b/src/display_layout/general.cpp index 42656cbd..f5b042aa 100644 --- a/src/display_layout/general.cpp +++ b/src/display_layout/general.cpp @@ -170,7 +170,7 @@ int execute(config const & cfg) if (idx != current_idx) { print_result_line(); - sketch.clear(); + sketch.reset(); shared_kmers.clear(); shared_kmers_initialised = false; ub_count = 0u; diff --git a/src/measure_hyperloglog.cpp b/src/measure_hyperloglog.cpp index 2bb22797..7da147ab 100644 --- a/src/measure_hyperloglog.cpp +++ b/src/measure_hyperloglog.cpp @@ -103,18 +103,18 @@ int main(int argc, char const * argv[]) for (auto & sketch : sketches) { - double const expected_error = 1.04 / std::sqrt(sketch.registerSize()); + double const expected_error = 1.04 / std::sqrt(sketch.data_size()); double const actual_error = std::abs(1.0 - std::round(sketch.estimate()) / static_cast(control.size())); - fout << id << '\t' << seq.size() << '\t' << sketch.registerSize() << '\t' + fout << id << '\t' << seq.size() << '\t' << sketch.data_size() << '\t' << static_cast(sketch.estimate()) << '\t' << control.size() << '\t' << expected_error << '\t' << actual_error << '\n'; } // clear for the next sequence for (auto & sketch : sketches) - sketch.clear(); + sketch.reset(); control.clear(); } diff --git a/test/api/layout/execute_layout_test.cpp b/test/api/layout/execute_layout_test.cpp index 59f4764b..cd100ceb 100755 --- a/test/api/layout/execute_layout_test.cpp +++ b/test/api/layout/execute_layout_test.cpp @@ -23,7 +23,7 @@ TEST(execute_test, few_ubs) auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it) { - size_t const desired_kmer_count = (num == 1) ? 1000 : 500; + size_t const desired_kmer_count = (num == 1) ? 880 : 475; // Estimate are 990.71 and 504.88 for (auto hash : std::views::iota(0u, desired_kmer_count)) it = hash; }; @@ -299,7 +299,7 @@ TEST(execute_test, many_ubs) "@ }\n" "@}\n" "@HIBF_CONFIG_END\n" - "#TOP_LEVEL_IBF fullest_technical_bin_idx:0\n" + "#TOP_LEVEL_IBF fullest_technical_bin_idx:26\n" "#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:14\n" "#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:14\n" "#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:14\n" @@ -331,8 +331,8 @@ TEST(execute_test, many_ubs) "6\t2;34\t1;10\n" "7\t2;44\t1;10\n" "8\t2;54\t1;10\n" - "33\t3;0\t1;27\n" - "32\t3;27\t1;25\n" + "33\t3;0\t1;26\n" + "32\t3;26\t1;26\n" "1\t3;52\t1;6\n" "2\t3;58\t1;6\n" "30\t4;0\t1;22\n" diff --git a/test/api/layout/execute_with_estimation_test.cpp b/test/api/layout/execute_with_estimation_test.cpp index 6b05eb94..4e28ee2f 100644 --- a/test/api/layout/execute_with_estimation_test.cpp +++ b/test/api/layout/execute_with_estimation_test.cpp @@ -17,9 +17,10 @@ #include #include -#include "../api_test.hpp" #include +#include "../api_test.hpp" + TEST(execute_estimation_test, few_ubs) { seqan3::test::tmp_directory tmp_dir{}; @@ -66,7 +67,7 @@ TEST(execute_estimation_test, few_ubs) ## (l*m)_tmax : Computed by l_tmax * m_tmax ## size : The expected total size of an tmax-HIBF # tmax c_tmax l_tmax m_tmax (l*m)_tmax size -64 1.00 1.00 1.00 1.00 14.6KiB +64 1.00 1.00 1.00 1.00 15.7KiB # Best t_max (regarding expected query runtime): 64 )expected_cout"); } @@ -121,9 +122,9 @@ TEST(execute_estimation_test, many_ubs) ## (l*m)_tmax : Computed by l_tmax * m_tmax ## size : The expected total size of an tmax-HIBF # tmax c_tmax l_tmax m_tmax (l*m)_tmax size -64 1.00 1.26 1.00 1.26 74.5KiB -128 1.22 1.25 0.66 0.83 49.4KiB -256 1.33 1.33 0.74 0.99 55.1KiB +64 1.00 1.25 1.00 1.25 75.4KiB +128 1.22 1.24 0.69 0.86 51.9KiB +256 1.33 1.33 0.76 1.02 57.5KiB # Best t_max (regarding expected query runtime): 128 )expected_cout"); @@ -266,7 +267,7 @@ TEST(execute_estimation_test, many_ubs) "@ }\n" "@}\n" "@HIBF_CONFIG_END\n" - "#TOP_LEVEL_IBF fullest_technical_bin_idx:14\n" + "#TOP_LEVEL_IBF fullest_technical_bin_idx:36\n" "#LOWER_LEVEL_IBF_14 fullest_technical_bin_idx:24\n" "#LOWER_LEVEL_IBF_15 fullest_technical_bin_idx:24\n" "#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS\n" @@ -421,9 +422,9 @@ TEST(execute_estimation_test, many_ubs_force_all) ## (l*m)_tmax : Computed by l_tmax * m_tmax ## size : The expected total size of an tmax-HIBF # tmax c_tmax l_tmax m_tmax (l*m)_tmax size -64 1.00 1.26 1.00 1.26 74.5KiB -128 1.22 1.25 0.66 0.83 49.4KiB -256 1.33 1.33 0.74 0.99 55.1KiB +64 1.00 1.25 1.00 1.25 75.4KiB +128 1.22 1.24 0.69 0.86 51.9KiB +256 1.33 1.33 0.76 1.02 57.5KiB # Best t_max (regarding expected query runtime): 128 )expected_cout"); @@ -465,9 +466,9 @@ TEST(execute_estimation_test, with_rearrangement) hll_filenames.push_back("small.hll"); expected_kmer_counts.push_back(387); + expected_kmer_counts.push_back(470); expected_kmer_counts.push_back(465); - expected_kmer_counts.push_back(465); - expected_kmer_counts.push_back(571); + expected_kmer_counts.push_back(578); } // There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500. @@ -503,7 +504,7 @@ TEST(execute_estimation_test, with_rearrangement) chopper::sketch::read_hll_files_into(sketches_dir, hll_filenames, sketches); for (size_t i = 0; i < expected_kmer_counts.size(); ++i) - EXPECT_EQ(std::lround(sketches[i].estimate()), expected_kmer_counts[i]) << "failed at " << i; + ASSERT_EQ(std::lround(sketches[i].estimate()), expected_kmer_counts[i]) << "failed at " << i; ASSERT_TRUE(std::filesystem::exists(stats_file)); @@ -525,9 +526,9 @@ TEST(execute_estimation_test, with_rearrangement) ## (l*m)_tmax : Computed by l_tmax * m_tmax ## size : The expected total size of an tmax-HIBF # tmax c_tmax l_tmax m_tmax (l*m)_tmax size -64 1.00 2.23 1.00 2.23 116.5KiB -128 1.22 1.95 1.15 2.24 133.7KiB -256 1.33 1.53 1.19 1.82 138.7KiB +64 1.00 2.22 1.00 2.22 117.1KiB +128 1.22 1.95 1.15 2.23 134.3KiB +256 1.33 1.52 1.18 1.81 138.7KiB # Best t_max (regarding expected query runtime): 256 )expected_cout"); diff --git a/test/api/layout/hibf_statistics_test.cpp b/test/api/layout/hibf_statistics_test.cpp index eff2ff73..d4e84ee6 100644 --- a/test/api/layout/hibf_statistics_test.cpp +++ b/test/api/layout/hibf_statistics_test.cpp @@ -141,7 +141,7 @@ TEST(execute_test, chopper_layout_statistics) ## size : The expected total size of an tmax-HIBF ## uncorr_size : The expected size of an tmax-HIBF without FPR correction # tmax c_tmax l_tmax m_tmax (l*m)_tmax size uncorr_size level num_ibfs level_size level_size_no_corr total_num_tbs avg_num_tbs split_tb_percentage max_split_tb avg_split_tb max_factor avg_factor -64 1.00 1.26 1.00 1.26 74.8KiB 45.8KiB :0:1 :1:12 :37.8KiB:37.0KiB :37.8KiB:8.0KiB :64:768 :64:64 :81.25:100.00 :1:32 :1.00:17.45 :1.00:6.20 :1.00:4.84 +64 1.00 1.25 1.00 1.25 75.5KiB 46.8KiB :0:1 :1:12 :38.8KiB:36.7KiB :38.8KiB:8.0KiB :64:768 :64:64 :81.25:100.00 :1:32 :1.00:17.45 :1.00:6.20 :1.00:4.84 )expected_cout"; EXPECT_EQ(layout_result_stdout, expected_cout) << layout_result_stdout; @@ -194,8 +194,8 @@ TEST(execute_test, chopper_layout_statistics_determine_best_bins) ## size : The expected total size of an tmax-HIBF ## uncorr_size : The expected size of an tmax-HIBF without FPR correction # tmax c_tmax l_tmax m_tmax (l*m)_tmax size uncorr_size level num_ibfs level_size level_size_no_corr total_num_tbs avg_num_tbs split_tb_percentage max_split_tb avg_split_tb max_factor avg_factor -64 1.00 1.00 1.00 1.00 1.6MiB 1.2MiB :0 :1 :1.6MiB :1.2MiB :64 :64 :100.00 :16 :6.40 :4.35 :3.05 -128 1.22 1.22 1.41 1.72 2.3MiB 1.2MiB :0 :1 :2.3MiB :1.2MiB :128 :128 :100.00 :32 :12.80 :6.20 :4.37 +64 1.00 1.00 1.00 1.00 3.1MiB 2.2MiB :0 :1 :3.1MiB :2.2MiB :64 :64 :100.00 :15 :6.40 :4.20 :3.06 +128 1.22 1.22 1.40 1.72 4.3MiB 2.2MiB :0 :1 :4.3MiB :2.2MiB :128 :128 :100.00 :31 :12.80 :6.10 :4.39 # Best t_max (regarding expected query runtime): 64 )expected_cout"; diff --git a/test/api/sketch/read_hll_files_into_test.cpp b/test/api/sketch/read_hll_files_into_test.cpp index c1d42f63..1147bb5f 100644 --- a/test/api/sketch/read_hll_files_into_test.cpp +++ b/test/api/sketch/read_hll_files_into_test.cpp @@ -8,15 +8,16 @@ #include #include +#include +#include #include #include "../api_test.hpp" struct input_traits : public seqan3::sequence_file_input_default_traits_dna { - using sequence_alphabet = char; - using sequence_legal_alphabet = char; + using sequence_alphabet = seqan3::dna4; }; using sequence_file_type = seqan3::sequence_file_input>; @@ -45,15 +46,15 @@ TEST_F(read_hll_files_into_test, basic) std::string const input_file{data("small.fa")}; sequence_file_type seq_file{input_file}; + auto minimizer_view = seqan3::views::minimiser_hash(seqan3::ungapped{kmer_size}, + seqan3::window_size{kmer_size}, + seqan3::seed{chopper::adjust_seed(kmer_size)}); + // put every sequence in this file into the sketch - for (auto && [seq_vec] : seq_file) + for (auto && [seq] : seq_file) { - std::string_view const seq{seq_vec.begin(), seq_vec.end()}; - - for (size_t pos = 0; pos + kmer_size <= seq.size(); ++pos) // substr is [pos, pos + len) - { - expected.add(seq.substr(pos, kmer_size)); - } + for (auto hash : seq | minimizer_view) + expected.add(hash); } chopper::sketch::read_hll_files_into(data(""), test_filenames, target); diff --git a/test/cli/cli_chopper_pipeline_test.cpp b/test/cli/cli_chopper_pipeline_test.cpp index e197e079..34ef1c92 100644 --- a/test/cli/cli_chopper_pipeline_test.cpp +++ b/test/cli/cli_chopper_pipeline_test.cpp @@ -210,9 +210,9 @@ TEST_F(cli_test, chopper_layout2) "@HIBF_CONFIG_END\n" "#TOP_LEVEL_IBF fullest_technical_bin_idx:54\n" "#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS\n" - "2\t0\t15\n" + "1\t0\t15\n" "3\t15\t24\n" - "1\t39\t15\n" + "2\t39\t15\n" "0\t54\t10\n"}; std::string const actual_file{string_from_file(binning_filename)}; diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake index dd934900..a014932b 100644 --- a/test/data/datasources.cmake +++ b/test/data/datasources.cmake @@ -42,11 +42,11 @@ declare_datasource (FILE small.split URL ${CMAKE_SOURCE_DIR}/test/data/small.spl ) declare_datasource (FILE small.hll URL ${CMAKE_SOURCE_DIR}/test/data/small.hll URL_HASH - SHA256=0f0791fb26fb87b854108e43e5350ff49b483082aca2b6afe5b7b63ff727c15a + SHA256=039d7bdbb70e4d98285c09bbbcb2cd1fafb8038689c270cf231debb248c59206 ) declare_datasource (FILE small2.hll URL ${CMAKE_SOURCE_DIR}/test/data/small2.hll URL_HASH - SHA256=0f0791fb26fb87b854108e43e5350ff49b483082aca2b6afe5b7b63ff727c15a + SHA256=039d7bdbb70e4d98285c09bbbcb2cd1fafb8038689c270cf231debb248c59206 ) declare_datasource (FILE small.minimiser URL ${CMAKE_SOURCE_DIR}/test/data/small.minimiser URL_HASH diff --git a/test/data/small.hll b/test/data/small.hll index 151836c1..8d376347 100644 --- a/test/data/small.hll +++ b/test/data/small.hll @@ -1,3 +1 @@ - - - \ No newline at end of file +  \ No newline at end of file diff --git a/test/data/small2.hll b/test/data/small2.hll index 151836c1..8d376347 100644 --- a/test/data/small2.hll +++ b/test/data/small2.hll @@ -1,3 +1 @@ - - - \ No newline at end of file +  \ No newline at end of file