-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #205 from smehringer/more_stats
[FEATURE] More Layout stats
- Loading branch information
Showing
8 changed files
with
581 additions
and
123 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# --------------------------------------------------------------------------------------------------- | ||
# Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin | ||
# Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik | ||
# This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License | ||
# shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md | ||
# --------------------------------------------------------------------------------------------------- | ||
|
||
cmake_minimum_required (VERSION 3.18) | ||
|
||
add_executable (display_layout EXCLUDE_FROM_ALL display_layout.cpp general.cpp process_file.cpp sizes.cpp) | ||
target_link_libraries (display_layout "chopper_interface") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
// --------------------------------------------------------------------------------------------------- | ||
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin | ||
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik | ||
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License | ||
// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md | ||
// --------------------------------------------------------------------------------------------------- | ||
|
||
#include <iostream> | ||
#include <set> | ||
|
||
#include <robin_hood.h> | ||
|
||
#include <sharg/detail/to_string.hpp> | ||
#include <sharg/exceptions.hpp> | ||
#include <sharg/parser.hpp> | ||
|
||
#include <seqan3/io/sequence_file/all.hpp> | ||
#include <seqan3/search/views/minimiser_hash.hpp> | ||
|
||
#include <chopper/adjust_seed.hpp> | ||
#include <chopper/layout/hibf_statistics.hpp> | ||
#include <chopper/layout/input.hpp> | ||
|
||
#include <hibf/build/bin_size_in_bits.hpp> | ||
#include <hibf/build/build_data.hpp> | ||
#include <hibf/interleaved_bloom_filter.hpp> | ||
#include <hibf/layout/compute_fpr_correction.hpp> | ||
#include <hibf/layout/graph.hpp> | ||
#include <hibf/sketch/hyperloglog.hpp> | ||
|
||
#include "shared.hpp" | ||
|
||
void init_shared_meta(sharg::parser & parser) | ||
{ | ||
parser.info.version = "1.0.0"; | ||
parser.info.author = "Svenja Mehringer"; | ||
parser.info.email = "[email protected]"; | ||
parser.info.short_description = "Compute a top-level HIBF layout figure file"; | ||
parser.info.description.emplace_back("Computes an table to display the top-level layout."); | ||
} | ||
|
||
void init_options(sharg::parser & parser, config & cfg) | ||
{ | ||
parser.add_subsection("Main options:"); | ||
parser.add_option( | ||
cfg.input, | ||
sharg::config{.short_id = '\0', | ||
.long_id = "input", | ||
.description = "The input must be a layout file computed via chopper layout or raptor layout. ", | ||
.required = true, | ||
.validator = sharg::input_file_validator{}}); | ||
parser.add_option(cfg.output, | ||
sharg::config{.short_id = '\0', | ||
.long_id = "output", | ||
.description = "The output. ", | ||
.required = true, | ||
.validator = sharg::output_file_validator{}}); | ||
parser.add_option( | ||
cfg.threads, | ||
sharg::config{.short_id = '\0', .long_id = "threads", .description = "The number of threads to use."}); | ||
} | ||
|
||
void parse(sharg::parser & parser) | ||
{ | ||
try | ||
{ | ||
parser.parse(); | ||
} | ||
catch (sharg::parser_error const & ext) | ||
{ | ||
std::cerr << "[ERROR] " << ext.what() << '\n'; | ||
std::exit(EXIT_FAILURE); | ||
} | ||
} | ||
|
||
int main(int argc, char const * argv[]) | ||
{ | ||
config cfg{}; | ||
|
||
sharg::parser main_parser{"layout_stats", argc, argv, sharg::update_notifications::off, {"general", "sizes"}}; | ||
init_shared_meta(main_parser); | ||
|
||
parse(main_parser); | ||
sharg::parser & sub_parser = main_parser.get_sub_parser(); | ||
|
||
init_shared_meta(sub_parser); | ||
init_options(sub_parser, cfg); | ||
parse(sub_parser); | ||
|
||
if (sub_parser.info.app_name == std::string_view{"layout_stats-general"}) | ||
execute_general(cfg); | ||
else if (sub_parser.info.app_name == std::string_view{"layout_stats-sizes"}) | ||
execute_sizes(cfg); | ||
else | ||
std::cerr << "[ERROR] Unknown subcommand\n"; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,27 +6,19 @@ | |
// --------------------------------------------------------------------------------------------------- | ||
|
||
#include <iostream> | ||
#include <set> | ||
|
||
#include <robin_hood.h> | ||
|
||
#include <sharg/detail/to_string.hpp> | ||
#include <sharg/exceptions.hpp> | ||
#include <sharg/parser.hpp> | ||
|
||
#include <seqan3/io/sequence_file/all.hpp> | ||
#include <seqan3/search/views/kmer_hash.hpp> | ||
|
||
#include <chopper/layout/hibf_statistics.hpp> | ||
#include <chopper/layout/input.hpp> | ||
|
||
#include <hibf/sketch/hyperloglog.hpp> | ||
|
||
struct config | ||
{ | ||
std::filesystem::path input{}; | ||
std::filesystem::path output{}; | ||
}; | ||
#include "shared.hpp" | ||
|
||
static void print_progress(size_t const percentage) | ||
{ | ||
|
@@ -43,15 +35,6 @@ static void print_progress(size_t const percentage) | |
std::cerr << "] " << percentage << " %\r" << std::flush; | ||
} | ||
|
||
struct dna4_traits : public seqan3::sequence_file_input_default_traits_dna | ||
{ | ||
using sequence_alphabet = seqan3::dna4; | ||
}; | ||
|
||
using sequence_file_type = seqan3::sequence_file_input<dna4_traits, | ||
seqan3::fields<seqan3::field::seq>, | ||
seqan3::type_list<seqan3::format_fasta, seqan3::format_fastq>>; | ||
|
||
// Using two sets and erasing from shared is slower | ||
void keep_duplicates(robin_hood::unordered_set<uint64_t> & shared, std::vector<uint64_t> const & current) | ||
{ | ||
|
@@ -67,78 +50,20 @@ void keep_duplicates(robin_hood::unordered_set<uint64_t> & shared, std::vector<u | |
shared = std::move(result); | ||
} | ||
|
||
void process_file(std::string const & filename, | ||
std::vector<uint64_t> & current_kmers, | ||
seqan::hibf::sketch::hyperloglog & sketch, | ||
bool const fill_current_kmers, | ||
uint8_t const kmer_size) | ||
{ | ||
if (filename.ends_with(".minimiser")) | ||
{ | ||
uint64_t hash{}; | ||
char * const hash_data{reinterpret_cast<char *>(&hash)}; | ||
std::streamsize const hash_bytes{sizeof(hash)}; | ||
|
||
std::ifstream infile{filename, std::ios::binary}; | ||
|
||
if (fill_current_kmers) | ||
{ | ||
while (infile.read(hash_data, hash_bytes)) | ||
{ | ||
current_kmers.push_back(hash); | ||
sketch.add(hash_data, hash_bytes); | ||
} | ||
} | ||
else | ||
{ | ||
while (infile.read(hash_data, hash_bytes)) | ||
{ | ||
sketch.add(hash_data, hash_bytes); | ||
} | ||
} | ||
} | ||
else | ||
{ | ||
sequence_file_type fin{filename}; | ||
|
||
if (fill_current_kmers) | ||
{ | ||
for (auto && [seq] : fin) | ||
{ | ||
for (uint64_t hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size})) | ||
{ | ||
current_kmers.push_back(hash_value); | ||
sketch.add(reinterpret_cast<char *>(&hash_value), sizeof(hash_value)); | ||
} | ||
} | ||
} | ||
else | ||
{ | ||
for (auto && [seq] : fin) | ||
{ | ||
for (uint64_t hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size})) | ||
{ | ||
sketch.add(reinterpret_cast<char *>(&hash_value), sizeof(hash_value)); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
int execute(config const & cfg) | ||
{ | ||
std::ifstream layout_file{cfg.input}; | ||
|
||
if (!layout_file.good() || !layout_file.is_open()) | ||
throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; // GCOVR_EXCL_LINE | ||
throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; | ||
|
||
auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); | ||
auto const & hibf_config = chopper_config.hibf_config; | ||
|
||
std::ofstream output_stream{cfg.output}; | ||
|
||
if (!output_stream.good() || !output_stream.is_open()) | ||
throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; // GCOVR_EXCL_LINE | ||
throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; | ||
|
||
// Fetch all file sizes such that sorting by file size doesn't have to access the filesystem too often. | ||
// n = filenames.size() | ||
|
@@ -294,48 +219,7 @@ int execute(config const & cfg) | |
return 0; | ||
} | ||
|
||
inline void set_up_parser(sharg::parser & parser, config & cfg) | ||
void execute_general(config const & cfg) | ||
{ | ||
parser.info.version = "1.0.0"; | ||
parser.info.author = "Svenja Mehringer"; | ||
parser.info.email = "[email protected]"; | ||
parser.info.short_description = "Compute a top-level HIBF layout figure file"; | ||
|
||
parser.info.description.emplace_back("Computes an table to display the top-level layout."); | ||
|
||
parser.add_subsection("Main options:"); | ||
parser.add_option( | ||
cfg.input, | ||
sharg::config{.short_id = '\0', | ||
.long_id = "input", | ||
.description = "The input must be a layout file computed via chopper layout or raptor layout. ", | ||
.required = true, | ||
.validator = sharg::input_file_validator{}}); | ||
parser.add_option(cfg.output, | ||
sharg::config{.short_id = '\0', | ||
.long_id = "output", | ||
.description = "The output. ", | ||
.required = true, | ||
.validator = sharg::output_file_validator{}}); | ||
} | ||
|
||
int main(int argc, char const * argv[]) | ||
{ | ||
sharg::parser parser{"layout_stats", argc, argv, sharg::update_notifications::off}; | ||
parser.info.version = "1.0.0"; | ||
|
||
config cfg{}; | ||
set_up_parser(parser, cfg); | ||
|
||
try | ||
{ | ||
parser.parse(); | ||
} | ||
catch (sharg::parser_error const & ext) | ||
{ | ||
std::cerr << "[ERROR] " << ext.what() << '\n'; | ||
return -1; | ||
} | ||
|
||
execute(cfg); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
// --------------------------------------------------------------------------------------------------- | ||
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin | ||
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik | ||
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License | ||
// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md | ||
// --------------------------------------------------------------------------------------------------- | ||
|
||
#include <seqan3/io/sequence_file/all.hpp> | ||
#include <seqan3/search/views/minimiser_hash.hpp> | ||
|
||
#include <chopper/adjust_seed.hpp> | ||
|
||
#include "shared.hpp" | ||
|
||
struct dna4_traits : public seqan3::sequence_file_input_default_traits_dna | ||
{ | ||
using sequence_alphabet = seqan3::dna4; | ||
}; | ||
|
||
using sequence_file_type = seqan3::sequence_file_input<dna4_traits, | ||
seqan3::fields<seqan3::field::seq>, | ||
seqan3::type_list<seqan3::format_fasta, seqan3::format_fastq>>; | ||
|
||
void process_file(std::string const & filename, | ||
std::vector<uint64_t> & current_kmers, | ||
seqan::hibf::sketch::hyperloglog & sketch, | ||
bool const fill_current_kmers, | ||
uint8_t const kmer_size) | ||
{ | ||
if (filename.ends_with(".minimiser")) | ||
{ | ||
uint64_t hash{}; | ||
char * const hash_data{reinterpret_cast<char *>(&hash)}; | ||
std::streamsize const hash_bytes{sizeof(hash)}; | ||
|
||
std::ifstream infile{filename, std::ios::binary}; | ||
|
||
if (fill_current_kmers) | ||
{ | ||
while (infile.read(hash_data, hash_bytes)) | ||
{ | ||
current_kmers.push_back(hash); | ||
sketch.add(hash_data, hash_bytes); | ||
} | ||
} | ||
else | ||
{ | ||
while (infile.read(hash_data, hash_bytes)) | ||
{ | ||
sketch.add(hash_data, hash_bytes); | ||
} | ||
} | ||
} | ||
else | ||
{ | ||
sequence_file_type fin{filename}; | ||
|
||
seqan3::shape shape{seqan3::ungapped{kmer_size}}; | ||
auto minimizer_view = seqan3::views::minimiser_hash(shape, | ||
seqan3::window_size{kmer_size}, | ||
seqan3::seed{chopper::adjust_seed(shape.count())}); | ||
if (fill_current_kmers) | ||
{ | ||
for (auto && [seq] : fin) | ||
{ | ||
for (uint64_t hash_value : seq | minimizer_view) | ||
{ | ||
current_kmers.push_back(hash_value); | ||
sketch.add(reinterpret_cast<char *>(&hash_value), sizeof(hash_value)); | ||
} | ||
} | ||
} | ||
else | ||
{ | ||
for (auto && [seq] : fin) | ||
{ | ||
for (uint64_t hash_value : seq | minimizer_view) | ||
{ | ||
sketch.add(reinterpret_cast<char *>(&hash_value), sizeof(hash_value)); | ||
} | ||
} | ||
} | ||
} | ||
} |
Oops, something went wrong.