From 98bfd9b3e113b94abf4de486b4386a116dfbcda4 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 5 Oct 2023 15:58:23 -0700 Subject: [PATCH] Added tests for single-end diagnostics. --- .../CombinatorialBarcodesSingleEnd.hpp | 10 +- .../kaori/handlers/DualBarcodesSingleEnd.hpp | 18 ++- .../DualBarcodesSingleEndWithDiagnostics.hpp | 145 ++++++++++++++++++ include/kaori/kaori.hpp | 1 + tests/CMakeLists.txt | 1 + .../DualBarcodesSingleEndWithDiagnostics.cpp | 94 ++++++++++++ 6 files changed, 260 insertions(+), 9 deletions(-) create mode 100644 include/kaori/handlers/DualBarcodesSingleEndWithDiagnostics.hpp create mode 100644 tests/src/handlers/DualBarcodesSingleEndWithDiagnostics.cpp diff --git a/include/kaori/handlers/CombinatorialBarcodesSingleEnd.hpp b/include/kaori/handlers/CombinatorialBarcodesSingleEnd.hpp index 97e0eac..f00d4d0 100644 --- a/include/kaori/handlers/CombinatorialBarcodesSingleEnd.hpp +++ b/include/kaori/handlers/CombinatorialBarcodesSingleEnd.hpp @@ -63,8 +63,12 @@ class CombinatorialBarcodesSingleEnd { * This should be less than or equal to `max_size`. * @param barcode_pools Array containing the known barcode sequences for each of the variable regions, in the order of their appearance in the template sequence. * @param options Optional parameters. + * + * @tparam BarcodePoolContainer Some iterable container of `BarcodePool` instances, + * usually either a `std::vector` or a `std::array`. */ - CombinatorialBarcodesSingleEnd(const char* template_seq, size_t template_length, const std::array& barcode_pools, const Options& options) : + template + CombinatorialBarcodesSingleEnd(const char* template_seq, size_t template_length, const BarcodePoolContainer& barcode_pools, const Options& options) : forward(search_forward(options.strand)), reverse(search_reverse(options.strand)), max_mm(options.max_mismatches), @@ -75,6 +79,10 @@ class CombinatorialBarcodesSingleEnd { if (regions.size() != num_variable) { throw std::runtime_error("expected " + std::to_string(num_variable) + " variable regions in the constant template"); } + if (barcode_pools.size() != num_variable) { + throw std::runtime_error("length of 'barcode_pools' should be equal to the number of variable regions"); + } + for (size_t i = 0; i < num_variable; ++i) { size_t rlen = regions[i].second - regions[i].first; size_t vlen = barcode_pools[i].length; diff --git a/include/kaori/handlers/DualBarcodesSingleEnd.hpp b/include/kaori/handlers/DualBarcodesSingleEnd.hpp index 32d1ef0..95f13f4 100644 --- a/include/kaori/handlers/DualBarcodesSingleEnd.hpp +++ b/include/kaori/handlers/DualBarcodesSingleEnd.hpp @@ -171,7 +171,7 @@ class DualBarcodesSingleEnd { } private: - void process_first(State& state, const std::pair& x) const { + bool process_first(State& state, const std::pair& x) const { auto deets = constant_matcher.initialize(x.first, x.second - x.first); while (!deets.finished) { @@ -181,7 +181,7 @@ class DualBarcodesSingleEnd { auto id = forward_match(x.first, deets, state).first; if (id >= 0) { ++state.counts[id]; - return; + return true; } } @@ -189,13 +189,14 @@ class DualBarcodesSingleEnd { auto id = reverse_match(x.first, deets, state).first; if (id >= 0) { ++state.counts[id]; - return; + return true; } } } + return false; } - void process_best(State& state, const std::pair& x) const { + bool process_best(State& state, const std::pair& x) const { auto deets = constant_matcher.initialize(x.first, x.second - x.first); bool found = false; int best_mismatches = max_mm + 1; @@ -235,6 +236,7 @@ class DualBarcodesSingleEnd { if (found) { ++state.counts[best_id]; } + return found; } public: @@ -262,13 +264,13 @@ class DualBarcodesSingleEnd { return; } - void process(State& state, const std::pair& x) const { + bool process(State& state, const std::pair& x) const { + ++state.total; if (use_first) { - process_first(state, x); + return process_first(state, x); } else { - process_best(state, x); + return process_best(state, x); } - ++state.total; } static constexpr bool use_names = false; diff --git a/include/kaori/handlers/DualBarcodesSingleEndWithDiagnostics.hpp b/include/kaori/handlers/DualBarcodesSingleEndWithDiagnostics.hpp new file mode 100644 index 0000000..276a7e5 --- /dev/null +++ b/include/kaori/handlers/DualBarcodesSingleEndWithDiagnostics.hpp @@ -0,0 +1,145 @@ +#ifndef KAORI_DUAL_BARCODES_SINGLE_END_WITH_DIAGNOSTICS_HPP +#define KAORI_DUAL_BARCODES_SINGLE_END_WITH_DIAGNOSTICS_HPP + +#include "DualBarcodesSingleEnd.hpp" +#include "CombinatorialBarcodesSingleEnd.hpp" +#include "../utils.hpp" + +/** + * @file DualBarcodesSingleEndWithDiagnostics.hpp + * + * @brief Process dual barcodes with extra diagnostics. + */ + +namespace kaori { + +/** + * @brief Handler for dual barcodes with extra diagnostics. + * + * This provides the same information as `DualBarcodesSingleEnd` but also captures the frequency of the invalid combinations. + * These frequences can be helpful for diagnosing problems with library construction. + * + * @tparam max_size Maximum length of the template sequences on both reads. + * @tparam num_variable Number of the template sequences on both reads. + */ +template +class DualBarcodesSingleEndWithDiagnostics { +public: + /** + * @param[in] template_seq Pointer to a character array containing the template sequence. + * @param template_length Length of the template. + * This should be less than or equal to `max_size`. + * @param barcode_pools Pools of known barcode sequences for each variable region in the template. + * Each pool should have the same length, and corresponding values across pools define a specific combination of barcodes. + * @param options Optional parameters. + */ + DualBarcodesSingleEndWithDiagnostics( + const char* template_seq, + size_t template_length, + const std::vector& barcode_pools, + const typename DualBarcodesSingleEnd::Options& options + ) : + dual_handler(template_seq, template_length, barcode_pools, options), + + combo_handler(template_seq, template_length, barcode_pools, + [&]{ + typename CombinatorialBarcodesSingleEnd::Options combopt; + combopt.use_first = options.use_first; + + combopt.max_mismatches = options.max_mismatches; + combopt.strand = options.strand; + + // we allow duplicates in the trie for each individual barcode, as only the combinations are unique in the dual barcode setup. + combopt.duplicates = DuplicateAction::FIRST; + return combopt; + }() + ) + {} + +private: + DualBarcodesSingleEnd dual_handler; + CombinatorialBarcodesSingleEnd combo_handler; + +public: + /** + *@cond + */ + struct State { + State() {} + State(typename DualBarcodesSingleEnd::State ds, typename CombinatorialBarcodesSingleEnd::State cs) : dual_state(std::move(ds)), combo_state(std::move(cs)) {} + + /** + * @cond + */ + typename DualBarcodesSingleEnd::State dual_state; + typename CombinatorialBarcodesSingleEnd::State combo_state; + /** + * @endcond + */ + }; + + State initialize() const { + return State(dual_handler.initialize(), combo_handler.initialize()); + } + + void reduce(State& s) { + dual_handler.reduce(s.dual_state); + combo_handler.reduce(s.combo_state); + } + + constexpr static bool use_names = false; + /** + *@endcond + */ + +public: + /** + *@cond + */ + void process(State& state, const std::pair& x) const { + // Only searching for combinations if we couldn't find a proper dual barcode match. + if (!dual_handler.process(state.dual_state, x)) { + combo_handler.process(state.combo_state, x); + } + } + /** + *@endcond + */ + +public: + /** + * Sort the invalid combinations for easier frequency counting. + * Combinations are sorted by the first index, and then the second index. + */ + void sort() { + combo_handler.sort(); + } + + /** + * @return Vector containing the frequency of each valid combination. + * This has length equal to the number of valid dual barcode combinations (i.e., the length of `barcode_pool1` and `barcode_pool2` in the constructor). + * Each entry contains the count for the corresponding dual barcode combination. + */ + const std::vector& get_counts() const { + return dual_handler.get_counts(); + } + + /** + * @return All invalid combinations encountered by the handler. + * In each array, the first and second element contains the indices of known barcodes in the first and second pools, respectively. + */ + const std::vector >& get_combinations() const { + return combo_handler.get_combinations(); + } + + /** + * @return Total number of reads processed by the handler. + */ + int get_total() const { + return dual_handler.get_total(); + } +}; + +} + +#endif diff --git a/include/kaori/kaori.hpp b/include/kaori/kaori.hpp index 184615b..5fbf108 100644 --- a/include/kaori/kaori.hpp +++ b/include/kaori/kaori.hpp @@ -6,6 +6,7 @@ #include "handlers/DualBarcodesPairedEnd.hpp" #include "handlers/DualBarcodesPairedEndWithDiagnostics.hpp" #include "handlers/DualBarcodesSingleEnd.hpp" +#include "handlers/DualBarcodesSingleEndWithDiagnostics.hpp" #include "handlers/RandomBarcodeSingleEnd.hpp" #include "handlers/SingleBarcodePairedEnd.hpp" #include "handlers/SingleBarcodeSingleEnd.hpp" diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6f8aa60..99a1db9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -29,6 +29,7 @@ add_executable( src/handlers/DualBarcodesPairedEnd.cpp src/handlers/DualBarcodesPairedEndWithDiagnostics.cpp src/handlers/DualBarcodesSingleEnd.cpp + src/handlers/DualBarcodesSingleEndWithDiagnostics.cpp src/handlers/RandomBarcodeSingleEnd.cpp ) diff --git a/tests/src/handlers/DualBarcodesSingleEndWithDiagnostics.cpp b/tests/src/handlers/DualBarcodesSingleEndWithDiagnostics.cpp new file mode 100644 index 0000000..967418c --- /dev/null +++ b/tests/src/handlers/DualBarcodesSingleEndWithDiagnostics.cpp @@ -0,0 +1,94 @@ +#include +#include "kaori/handlers/DualBarcodesSingleEndWithDiagnostics.hpp" +#include "kaori/process_data.hpp" +#include "byteme/RawBufferReader.hpp" +#include "../utils.h" +#include + +class DualBarcodesSingleEndWithDiagnosticsTest : public testing::Test { +protected: + DualBarcodesSingleEndWithDiagnosticsTest() : + constant("AAAA----CGGCAGCT------TTTT"), + variables1(std::vector{ "AAAA", "CCCC", "GGGG", "TTTT" }), + variables2(std::vector{ "ACACAC", "TGTGTG", "AGAGAG", "CTCTCT" }) + {} + + std::string constant1, constant; + std::vector variables1; + std::vector variables2; + + template + using Options = typename kaori::DualBarcodesSingleEnd::Options; +}; + +TEST_F(DualBarcodesSingleEndWithDiagnosticsTest, BasicFirst) { + std::vector seq { + "cagcatcgatcgtgaAAAACCCCCGGCAGCTTGTGTGTTTTacggaggaga", // index 1 + "AAAAGGGGCGGCAGCTAGAGAGTTTTaaaaccccggg", // index 2 + "AAAAGGGGCGGCAGCTTGTGTGTTTTaaaaccccggg", // invalid: (2, 1) + "aaccaccaAAAATTTTCGGCAGCTACACACTTTTaaaaccccggg", // invalid (3, 0) + "cagacgagcagcgagcagcatcagca" // matches nothing. + }; + std::string fq = convert_to_fastq(seq); + + byteme::RawBufferReader reader(reinterpret_cast(fq.c_str()), fq.size()); + + kaori::DualBarcodesSingleEndWithDiagnostics<32, 2> stuff( + constant.c_str(), constant.size(), + { kaori::BarcodePool(variables1), kaori::BarcodePool(variables2) }, + Options<32>() + ); + kaori::process_single_end_data(&reader, stuff); + + EXPECT_EQ(stuff.get_total(), 5); + EXPECT_EQ(stuff.get_counts()[0], 0); + EXPECT_EQ(stuff.get_counts()[1], 1); + EXPECT_EQ(stuff.get_counts()[2], 1); + EXPECT_EQ(stuff.get_counts()[3], 0); + + stuff.sort(); + const auto& combos = stuff.get_combinations(); + ASSERT_EQ(combos.size(), 2); + EXPECT_EQ(combos[0][0], 2); + EXPECT_EQ(combos[0][1], 1); + + EXPECT_EQ(combos[1][0], 3); + EXPECT_EQ(combos[1][1], 0); +} + +TEST_F(DualBarcodesSingleEndWithDiagnosticsTest, WithDuplicates) { + // Inserting duplicate entries, even though the combinations are unique. + variables1.push_back("AAAA"); + EXPECT_EQ(variables1.front(), variables1.back()); + variables2.push_back("CTCTCT"); + EXPECT_EQ(variables2[3], variables2[4]); + + std::vector seq{ + "cagcatcgatcgtgaAAAAAAAACGGCAGCTACACACTTTTcagcatcgatcgtga", // ok, index 1 + "cagcatcgatcgtgaAAAAAAAACGGCAGCTCTCTCTTTTTcagcatcgatcgtga", // ok, index 4 + "cagcatcgatcgtgaAAAAAAAACGGCAGCTTGTGTGTTTTcagcatcgatcgtga" // invalid (0, 1), as the first hit is reported. + }; + std::string fq = convert_to_fastq(seq); + + byteme::RawBufferReader reader(reinterpret_cast(fq.c_str()), fq.size()); + + kaori::DualBarcodesSingleEndWithDiagnostics<32, 2> stuff( + constant.c_str(), constant.size(), + { kaori::BarcodePool(variables1), kaori::BarcodePool(variables2) }, + Options<32>() + ); + kaori::process_single_end_data(&reader, stuff); + + EXPECT_EQ(stuff.get_total(), 3); + EXPECT_EQ(stuff.get_counts()[0], 1); + EXPECT_EQ(stuff.get_counts()[1], 0); + EXPECT_EQ(stuff.get_counts()[2], 0); + EXPECT_EQ(stuff.get_counts()[3], 0); + EXPECT_EQ(stuff.get_counts()[4], 1); + + stuff.sort(); + const auto& combos = stuff.get_combinations(); + ASSERT_EQ(combos.size(), 1); + EXPECT_EQ(combos.front()[0], 0); + EXPECT_EQ(combos.front()[1], 1); +}