Skip to content

Commit

Permalink
Merge pull request #205 from smehringer/more_stats
Browse files Browse the repository at this point in the history
[FEATURE] More Layout stats
  • Loading branch information
eseiler authored Oct 9, 2023
2 parents 3d54e1f + cf9bd9b commit 44b8b1e
Show file tree
Hide file tree
Showing 8 changed files with 581 additions and 123 deletions.
4 changes: 3 additions & 1 deletion .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,10 @@ IncludeCategories:
Priority: 9
- Regex: '<raptor/'
Priority: 10
- Regex: '.*'
- Regex: '<hibf/'
Priority: 11
- Regex: '.*'
Priority: 12
IncludeIsMainRegex: '(Test)?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
Expand Down
3 changes: 1 addition & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ add_executable (measure_hyperloglog EXCLUDE_FROM_ALL measure_hyperloglog.cpp)
target_link_libraries (measure_hyperloglog "chopper_interface")
target_compile_options (measure_hyperloglog PRIVATE "-Werror")

add_executable (display_layout EXCLUDE_FROM_ALL display_layout.cpp)
target_link_libraries (display_layout "chopper_interface")
add_subdirectory (display_layout)

if (CHOPPER_INSTALL)
install (TARGETS chopper RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
Expand Down
11 changes: 11 additions & 0 deletions src/display_layout/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# ---------------------------------------------------------------------------------------------------
# Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
# Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
# This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
# shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md
# ---------------------------------------------------------------------------------------------------

cmake_minimum_required (VERSION 3.18)

add_executable (display_layout EXCLUDE_FROM_ALL display_layout.cpp general.cpp process_file.cpp sizes.cpp)
target_link_libraries (display_layout "chopper_interface")
96 changes: 96 additions & 0 deletions src/display_layout/display_layout.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// ---------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md
// ---------------------------------------------------------------------------------------------------

#include <iostream>
#include <set>

#include <robin_hood.h>

#include <sharg/detail/to_string.hpp>
#include <sharg/exceptions.hpp>
#include <sharg/parser.hpp>

#include <seqan3/io/sequence_file/all.hpp>
#include <seqan3/search/views/minimiser_hash.hpp>

#include <chopper/adjust_seed.hpp>
#include <chopper/layout/hibf_statistics.hpp>
#include <chopper/layout/input.hpp>

#include <hibf/build/bin_size_in_bits.hpp>
#include <hibf/build/build_data.hpp>
#include <hibf/interleaved_bloom_filter.hpp>
#include <hibf/layout/compute_fpr_correction.hpp>
#include <hibf/layout/graph.hpp>
#include <hibf/sketch/hyperloglog.hpp>

#include "shared.hpp"

void init_shared_meta(sharg::parser & parser)
{
parser.info.version = "1.0.0";
parser.info.author = "Svenja Mehringer";
parser.info.email = "[email protected]";
parser.info.short_description = "Compute a top-level HIBF layout figure file";
parser.info.description.emplace_back("Computes an table to display the top-level layout.");
}

void init_options(sharg::parser & parser, config & cfg)
{
parser.add_subsection("Main options:");
parser.add_option(
cfg.input,
sharg::config{.short_id = '\0',
.long_id = "input",
.description = "The input must be a layout file computed via chopper layout or raptor layout. ",
.required = true,
.validator = sharg::input_file_validator{}});
parser.add_option(cfg.output,
sharg::config{.short_id = '\0',
.long_id = "output",
.description = "The output. ",
.required = true,
.validator = sharg::output_file_validator{}});
parser.add_option(
cfg.threads,
sharg::config{.short_id = '\0', .long_id = "threads", .description = "The number of threads to use."});
}

void parse(sharg::parser & parser)
{
try
{
parser.parse();
}
catch (sharg::parser_error const & ext)
{
std::cerr << "[ERROR] " << ext.what() << '\n';
std::exit(EXIT_FAILURE);
}
}

int main(int argc, char const * argv[])
{
config cfg{};

sharg::parser main_parser{"layout_stats", argc, argv, sharg::update_notifications::off, {"general", "sizes"}};
init_shared_meta(main_parser);

parse(main_parser);
sharg::parser & sub_parser = main_parser.get_sub_parser();

init_shared_meta(sub_parser);
init_options(sub_parser, cfg);
parse(sub_parser);

if (sub_parser.info.app_name == std::string_view{"layout_stats-general"})
execute_general(cfg);
else if (sub_parser.info.app_name == std::string_view{"layout_stats-sizes"})
execute_sizes(cfg);
else
std::cerr << "[ERROR] Unknown subcommand\n";
}
124 changes: 4 additions & 120 deletions src/display_layout.cpp → src/display_layout/general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,19 @@
// ---------------------------------------------------------------------------------------------------

#include <iostream>
#include <set>

#include <robin_hood.h>

#include <sharg/detail/to_string.hpp>
#include <sharg/exceptions.hpp>
#include <sharg/parser.hpp>

#include <seqan3/io/sequence_file/all.hpp>
#include <seqan3/search/views/kmer_hash.hpp>

#include <chopper/layout/hibf_statistics.hpp>
#include <chopper/layout/input.hpp>

#include <hibf/sketch/hyperloglog.hpp>

struct config
{
std::filesystem::path input{};
std::filesystem::path output{};
};
#include "shared.hpp"

static void print_progress(size_t const percentage)
{
Expand All @@ -43,15 +35,6 @@ static void print_progress(size_t const percentage)
std::cerr << "] " << percentage << " %\r" << std::flush;
}

struct dna4_traits : public seqan3::sequence_file_input_default_traits_dna
{
using sequence_alphabet = seqan3::dna4;
};

using sequence_file_type = seqan3::sequence_file_input<dna4_traits,
seqan3::fields<seqan3::field::seq>,
seqan3::type_list<seqan3::format_fasta, seqan3::format_fastq>>;

// Using two sets and erasing from shared is slower
void keep_duplicates(robin_hood::unordered_set<uint64_t> & shared, std::vector<uint64_t> const & current)
{
Expand All @@ -67,78 +50,20 @@ void keep_duplicates(robin_hood::unordered_set<uint64_t> & shared, std::vector<u
shared = std::move(result);
}

void process_file(std::string const & filename,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
uint8_t const kmer_size)
{
if (filename.ends_with(".minimiser"))
{
uint64_t hash{};
char * const hash_data{reinterpret_cast<char *>(&hash)};
std::streamsize const hash_bytes{sizeof(hash)};

std::ifstream infile{filename, std::ios::binary};

if (fill_current_kmers)
{
while (infile.read(hash_data, hash_bytes))
{
current_kmers.push_back(hash);
sketch.add(hash_data, hash_bytes);
}
}
else
{
while (infile.read(hash_data, hash_bytes))
{
sketch.add(hash_data, hash_bytes);
}
}
}
else
{
sequence_file_type fin{filename};

if (fill_current_kmers)
{
for (auto && [seq] : fin)
{
for (uint64_t hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size}))
{
current_kmers.push_back(hash_value);
sketch.add(reinterpret_cast<char *>(&hash_value), sizeof(hash_value));
}
}
}
else
{
for (auto && [seq] : fin)
{
for (uint64_t hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size}))
{
sketch.add(reinterpret_cast<char *>(&hash_value), sizeof(hash_value));
}
}
}
}
}

int execute(config const & cfg)
{
std::ifstream layout_file{cfg.input};

if (!layout_file.good() || !layout_file.is_open())
throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; // GCOVR_EXCL_LINE
throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"};

auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file);
auto const & hibf_config = chopper_config.hibf_config;

std::ofstream output_stream{cfg.output};

if (!output_stream.good() || !output_stream.is_open())
throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"}; // GCOVR_EXCL_LINE
throw std::logic_error{"Could not open file " + cfg.output.string() + " for reading"};

// Fetch all file sizes such that sorting by file size doesn't have to access the filesystem too often.
// n = filenames.size()
Expand Down Expand Up @@ -294,48 +219,7 @@ int execute(config const & cfg)
return 0;
}

inline void set_up_parser(sharg::parser & parser, config & cfg)
void execute_general(config const & cfg)
{
parser.info.version = "1.0.0";
parser.info.author = "Svenja Mehringer";
parser.info.email = "[email protected]";
parser.info.short_description = "Compute a top-level HIBF layout figure file";

parser.info.description.emplace_back("Computes an table to display the top-level layout.");

parser.add_subsection("Main options:");
parser.add_option(
cfg.input,
sharg::config{.short_id = '\0',
.long_id = "input",
.description = "The input must be a layout file computed via chopper layout or raptor layout. ",
.required = true,
.validator = sharg::input_file_validator{}});
parser.add_option(cfg.output,
sharg::config{.short_id = '\0',
.long_id = "output",
.description = "The output. ",
.required = true,
.validator = sharg::output_file_validator{}});
}

int main(int argc, char const * argv[])
{
sharg::parser parser{"layout_stats", argc, argv, sharg::update_notifications::off};
parser.info.version = "1.0.0";

config cfg{};
set_up_parser(parser, cfg);

try
{
parser.parse();
}
catch (sharg::parser_error const & ext)
{
std::cerr << "[ERROR] " << ext.what() << '\n';
return -1;
}

execute(cfg);
}
84 changes: 84 additions & 0 deletions src/display_layout/process_file.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// ---------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md
// ---------------------------------------------------------------------------------------------------

#include <seqan3/io/sequence_file/all.hpp>
#include <seqan3/search/views/minimiser_hash.hpp>

#include <chopper/adjust_seed.hpp>

#include "shared.hpp"

struct dna4_traits : public seqan3::sequence_file_input_default_traits_dna
{
using sequence_alphabet = seqan3::dna4;
};

using sequence_file_type = seqan3::sequence_file_input<dna4_traits,
seqan3::fields<seqan3::field::seq>,
seqan3::type_list<seqan3::format_fasta, seqan3::format_fastq>>;

void process_file(std::string const & filename,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
uint8_t const kmer_size)
{
if (filename.ends_with(".minimiser"))
{
uint64_t hash{};
char * const hash_data{reinterpret_cast<char *>(&hash)};
std::streamsize const hash_bytes{sizeof(hash)};

std::ifstream infile{filename, std::ios::binary};

if (fill_current_kmers)
{
while (infile.read(hash_data, hash_bytes))
{
current_kmers.push_back(hash);
sketch.add(hash_data, hash_bytes);
}
}
else
{
while (infile.read(hash_data, hash_bytes))
{
sketch.add(hash_data, hash_bytes);
}
}
}
else
{
sequence_file_type fin{filename};

seqan3::shape shape{seqan3::ungapped{kmer_size}};
auto minimizer_view = seqan3::views::minimiser_hash(shape,
seqan3::window_size{kmer_size},
seqan3::seed{chopper::adjust_seed(shape.count())});
if (fill_current_kmers)
{
for (auto && [seq] : fin)
{
for (uint64_t hash_value : seq | minimizer_view)
{
current_kmers.push_back(hash_value);
sketch.add(reinterpret_cast<char *>(&hash_value), sizeof(hash_value));
}
}
}
else
{
for (auto && [seq] : fin)
{
for (uint64_t hash_value : seq | minimizer_view)
{
sketch.add(reinterpret_cast<char *>(&hash_value), sizeof(hash_value));
}
}
}
}
}
Loading

0 comments on commit 44b8b1e

Please sign in to comment.