Skip to content

Commit

Permalink
[FEATURE] Layout can be computed on minimizers.
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer authored and eseiler committed Nov 2, 2023
1 parent f080ec8 commit a50253e
Show file tree
Hide file tree
Showing 13 changed files with 51 additions and 10 deletions.
4 changes: 4 additions & 0 deletions include/chopper/configuration.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ struct configuration
//!\brief The kmer size to hash the input sequences before computing a HyperLogLog sketch from them.
uint8_t k{19};

//!\brief The window size to compute minimizers before computing a HyperLogLog sketch from them.
uint8_t window_size{19};

//!\brief Do not write the sketches into a dedicated directory.
bool disable_sketch_output{false};

Expand Down Expand Up @@ -83,6 +86,7 @@ struct configuration
archive(CEREAL_NVP(debug));
archive(CEREAL_NVP(sketch_directory));
archive(CEREAL_NVP(k));
archive(CEREAL_NVP(window_size));
archive(CEREAL_NVP(disable_sketch_output));
archive(CEREAL_NVP(precomputed_files));

Expand Down
2 changes: 2 additions & 0 deletions include/chopper/input_functor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ struct input_functor

uint8_t kmer_size{21};

uint8_t window_size{21};

void operator()(size_t const num, seqan::hibf::insert_iterator it);
};

Expand Down
3 changes: 2 additions & 1 deletion src/chopper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ int main(int argc, char const * argv[])

chopper::sketch::check_filenames(filenames, config);

config.hibf_config.input_fn = chopper::input_functor{filenames, config.precomputed_files, config.k};
config.hibf_config.input_fn =
chopper::input_functor{filenames, config.precomputed_files, config.k, config.window_size};
config.hibf_config.number_of_user_bins = filenames.size();

exit_code |= chopper::layout::execute(config, filenames);
Expand Down
2 changes: 1 addition & 1 deletion src/input_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ void input_functor::operator()(size_t const num, seqan::hibf::insert_iterator it

seqan3::shape shape = seqan3::ungapped{kmer_size};
auto minimizer_view = seqan3::views::minimiser_hash(shape,
seqan3::window_size{kmer_size},
seqan3::window_size{window_size},
seqan3::seed{adjust_seed(shape.count())});

for (auto && [seq] : fin)
Expand Down
13 changes: 13 additions & 0 deletions src/set_up_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,19 @@ void set_up_parser(sharg::parser & parser, configuration & config)
"k-mer size might miss out on certain similarities. For DNA sequences, a k-mer size between "
"[16,32] has proven to work well."});

// Registers the long-only `--window` option; the parsed value is written to `config.window_size`.
// The help text below is user-facing: typos were fixed and the missing space between the
// concatenated literals ("false negatives." / "For DNA ...") was added.
// NOTE(review): the example "(42,20)-minimizers" uses a window 22 positions larger than k, which does
// not match the stated "2-4 positions more than the k-mer size" -- confirm the intended example.
parser.add_option(
    config.window_size,
    sharg::config{
        .short_id = '\0',
        .long_id = "window",
        .description =
            "Setting this option will trigger the computation of (w,k)-minimizers instead of canonical kmers. "
            "Minimizers can thin out the data, reduce the memory footprint of the resulting index and increase "
            "runtime performance. On the other hand, it also decreases accuracy and might cause false negatives. "
            "For DNA sequences, a window size of 2-4 positions more than the k-mer size, e.g. (42,20)-minimizers, "
            "has proven to reduce the computational effort significantly while only slightly decreasing the "
            "accuracy."});

parser.add_option(
config.hibf_config.tmax,
sharg::config{
Expand Down
8 changes: 7 additions & 1 deletion src/util/display_layout/general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,13 @@ int execute(config const & cfg)

for (auto const & filename : filenames[user_bin.idx])
{
process_file(filename, current_kmer_set, current_kmers, sketch, fill_current_kmers, chopper_config.k);
process_file(filename,
current_kmer_set,
current_kmers,
sketch,
fill_current_kmers,
chopper_config.k,
chopper_config.window_size);
}

// Compute set intersection: shared_kmers = shared_kmers ∩ current_kmers
Expand Down
12 changes: 8 additions & 4 deletions src/util/display_layout/process_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ void process_file(std::string const & filename,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
uint8_t const kmer_size)
uint8_t const kmer_size,
uint8_t const window_size)
{
if (filename.ends_with(".minimiser"))
{
Expand Down Expand Up @@ -68,7 +69,7 @@ void process_file(std::string const & filename,

seqan3::shape shape{seqan3::ungapped{kmer_size}};
auto minimizer_view = seqan3::views::minimiser_hash(shape,
seqan3::window_size{kmer_size},
seqan3::window_size{window_size},
seqan3::seed{chopper::adjust_seed(shape.count())});
if (fill_current_kmers)
{
Expand Down Expand Up @@ -96,7 +97,10 @@ void process_file(std::string const & filename,
}
}

void process_file(std::string const & filename, std::vector<uint64_t> & current_kmers, uint8_t const kmer_size)
void process_file(std::string const & filename,
std::vector<uint64_t> & current_kmers,
uint8_t const kmer_size,
uint8_t const window_size)
{
if (filename.ends_with(".minimiser"))
{
Expand All @@ -115,7 +119,7 @@ void process_file(std::string const & filename, std::vector<uint64_t> & current_

seqan3::shape shape{seqan3::ungapped{kmer_size}};
auto minimizer_view = seqan3::views::minimiser_hash(shape,
seqan3::window_size{kmer_size},
seqan3::window_size{window_size},
seqan3::seed{chopper::adjust_seed(shape.count())});

for (auto && [seq] : fin)
Expand Down
8 changes: 6 additions & 2 deletions src/util/display_layout/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,15 @@ struct config
void execute_general(config const & cfg);
void execute_sizes(config const & cfg);

void process_file(std::string const & filename, std::vector<uint64_t> & current_kmers, uint8_t const kmer_size);
//!\brief Processes the input file `filename` using `current_kmers` as the hash container.
//!\param filename Path of the input file; names ending in ".minimiser" are handled by a separate branch.
//!\param current_kmers Container for the 64-bit hash values (presumably filled by this call -- TODO confirm).
//!\param kmer_size Size of the ungapped shape used for hashing.
//!\param window_size Window size handed to `seqan3::views::minimiser_hash`.
void process_file(std::string const & filename,
std::vector<uint64_t> & current_kmers,
uint8_t const kmer_size,
uint8_t const window_size);

//!\brief Processes the input file `filename`, with access to a hash set, a hash vector and a HyperLogLog sketch.
//!\param filename Path of the input file; names ending in ".minimiser" are handled by a separate branch.
//!\param current_kmer_set Set of 64-bit hash values (presumably updated by this call -- TODO confirm).
//!\param current_kmers Vector of 64-bit hash values (presumably only filled if `fill_current_kmers` is set
//!                     -- TODO confirm).
//!\param sketch HyperLogLog sketch (presumably updated with the computed hashes -- TODO confirm).
//!\param fill_current_kmers Selects between two processing branches in the definition.
//!\param kmer_size Size of the ungapped shape used for hashing.
//!\param window_size Window size handed to `seqan3::views::minimiser_hash`.
void process_file(std::string const & filename,
robin_hood::unordered_set<uint64_t> & current_kmer_set,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
uint8_t const kmer_size,
uint8_t const window_size);
2 changes: 1 addition & 1 deletion src/util/display_layout/sizes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ void execute_general_stats(config const & cfg)
if (filenames[user_bin_id].size() > 1)
throw std::runtime_error{"No multi files accepted yet."};

process_file(filenames[user_bin_id][0], current_kmers, chopper_config.k);
process_file(filenames[user_bin_id][0], current_kmers, chopper_config.k, chopper_config.window_size);

for (auto const kmer : current_kmers)
it = kmer;
Expand Down
3 changes: 3 additions & 0 deletions test/api/config_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ chopper::configuration generate_config()
config.data_file = "/path/to/data.file";
config.sketch_directory = "/path/to/sketch/directory";
config.k = 20;
config.window_size = 24;
config.disable_sketch_output = true;
config.precomputed_files = true;
config.output_filename = "file.layout";
Expand Down Expand Up @@ -47,6 +48,7 @@ bool operator==(chopper::configuration const & lhs, chopper::configuration const
lhs.debug == rhs.debug && //
lhs.sketch_directory == rhs.sketch_directory && //
lhs.k == rhs.k && //
lhs.window_size == rhs.window_size && //
lhs.disable_sketch_output == rhs.disable_sketch_output && //
lhs.precomputed_files == rhs.precomputed_files && //
lhs.output_filename == rhs.output_filename && //
Expand Down Expand Up @@ -79,6 +81,7 @@ static constexpr std::string_view config_string_view{"@CHOPPER_CONFIG\n"
"@ \"value0\": \"/path/to/sketch/directory\"\n"
"@ },\n"
"@ \"k\": 20,\n"
"@ \"window_size\": 24,\n"
"@ \"disable_sketch_output\": true,\n"
"@ \"precomputed_files\": true,\n"
"@ \"output_filename\": {\n"
Expand Down
2 changes: 2 additions & 0 deletions test/api/layout/execute_layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ TEST(execute_test, few_ubs)
"@ \"value0\": \"\"\n"
"@ },\n"
"@ \"k\": 19,\n"
"@ \"window_size\": 19,\n"
"@ \"disable_sketch_output\": true,\n"
"@ \"precomputed_files\": false,\n"
"@ \"output_filename\": {\n"
Expand Down Expand Up @@ -275,6 +276,7 @@ TEST(execute_test, many_ubs)
"@ \"value0\": \"\"\n"
"@ },\n"
"@ \"k\": 19,\n"
"@ \"window_size\": 19,\n"
"@ \"disable_sketch_output\": true,\n"
"@ \"precomputed_files\": false,\n"
"@ \"output_filename\": {\n"
Expand Down
1 change: 1 addition & 0 deletions test/api/layout/execute_with_estimation_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ TEST(execute_estimation_test, many_ubs)
"@ \"value0\": \"\"\n"
"@ },\n"
"@ \"k\": 19,\n"
"@ \"window_size\": 19,\n"
"@ \"disable_sketch_output\": true,\n"
"@ \"precomputed_files\": false,\n"
"@ \"output_filename\": {\n"
Expand Down
1 change: 1 addition & 0 deletions test/cli/cli_chopper_pipeline_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ TEST_F(cli_test, chopper_layout2)
"@ \"value0\": \"\"\n"
"@ },\n"
"@ \"k\": 19,\n"
"@ \"window_size\": 19,\n"
"@ \"disable_sketch_output\": true,\n"
"@ \"precomputed_files\": false,\n"
"@ \"output_filename\": {\n"
Expand Down

0 comments on commit a50253e

Please sign in to comment.