Skip to content

Commit

Permalink
Merge pull request #67 from bcgsc/cbf-update
Browse files Browse the repository at this point in the history
Updates for Counting Bloom Filters
  • Loading branch information
parham-k authored Feb 21, 2023
2 parents 84accb7 + 1da9b7e commit 5662476
Show file tree
Hide file tree
Showing 4 changed files with 3,244 additions and 1,269 deletions.
168 changes: 167 additions & 1 deletion include/btllib/counting_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,36 @@ class CountingBloomFilter
*/
void insert(const std::vector<uint64_t>& hashes) { insert(hashes.data()); }

/**
* Delete one occurrence of an element by decreasing its count by one. Only the
* counters whose value equals the element's current count (as returned by
* contains) are decremented.
*
* @param hashes Integer array of the element's hash values. Array size should
* equal the hash_num argument used when the Bloom filter was constructed.
*/
void remove(const uint64_t* hashes);

/**
* Delete an element.
*
* @param hashes Integer vector of the element's hash values.
*/
void remove(const std::vector<uint64_t>& hashes) { remove(hashes.data()); }

/**
* Set the count of an element to zero. The counters whose value equals the
* element's current count (as returned by contains) are reset to zero.
*
* @param hashes Integer array of the element's hash values. Array size should
* equal the hash_num argument used when the Bloom filter was constructed.
*/
void clear(const uint64_t* hashes);

/**
* Set the count of an element to zero.
*
* @param hashes Integer vector of the element's hash values.
*/
void clear(const std::vector<uint64_t>& hashes) { clear(hashes.data()); }

/**
* Get the count of an element.
*
Expand Down Expand Up @@ -325,6 +355,72 @@ class KmerCountingBloomFilter
counting_bloom_filter.insert(hashes);
}

/**
* Decrease the counts of a sequence's k-mers in the filter. The sequence is
* hashed with NtHash and every k-mer it yields is removed once.
*
* @param seq Sequence to k-merize.
* @param seq_len Length of seq.
*/
void remove(const char* seq, size_t seq_len);

/**
* Decrease the counts of a sequence's k-mers from the filter.
*
* @param seq Sequence to k-merize.
*/
void remove(const std::string& seq) { remove(seq.c_str(), seq.size()); }

/**
* Decrease the counts of a sequence's k-mers from the filter.
*
* @param hashes Integer array of the k-mer's hash values. Array size should
* equal the hash_num argument used when the Bloom filter was constructed.
*/
void remove(const uint64_t* hashes)
{
  // The k-mer is already hashed, so defer directly to the wrapped filter.
  counting_bloom_filter.remove(hashes);
}

/**
* Decrease the counts of a sequence's k-mers from the filter.
*
* @param hashes Integer vector of the k-mer's hash values.
*/
void remove(const std::vector<uint64_t>& hashes)
{
  // The k-mer is already hashed; pass its hash values to the wrapped filter.
  counting_bloom_filter.remove(hashes.data());
}

/**
* Set the counts of a sequence's k-mers to zero in the filter. The sequence is
* hashed with NtHash and every k-mer it yields is cleared.
*
* @param seq Sequence to k-merize.
* @param seq_len Length of seq.
*/
void clear(const char* seq, size_t seq_len);

/**
* Set the counts of a sequence's k-mers to zero in the filter.
*
* @param seq Sequence to k-merize.
*/
void clear(const std::string& seq)
{
  // Unpack the string into the (pointer, length) form of the main overload.
  const char* const bases = seq.c_str();
  const size_t base_count = seq.size();
  clear(bases, base_count);
}

/**
* Set the counts of a sequence's k-mers to zero in the filter.
*
* @param hashes Integer array of the k-mer's hash values. Array size should
* equal the hash_num argument used when the Bloom filter was constructed.
*/
void clear(const uint64_t* hashes)
{
  // The k-mer is already hashed, so defer directly to the wrapped filter.
  counting_bloom_filter.clear(hashes);
}

/**
* Set the counts of a sequence's k-mers to zero in the filter.
*
* @param hashes Integer vector of the k-mer's hash values.
*/
void clear(const std::vector<uint64_t>& hashes)
{
  // The k-mer is already hashed; pass its hash values to the wrapped filter.
  counting_bloom_filter.clear(hashes.data());
}

/**
* Query the counts of k-mers of a sequence.
*
Expand Down Expand Up @@ -692,7 +788,7 @@ CountingBloomFilter<T>::insert(const uint64_t* hashes, T min_val)
new_val = min_val + 1;
for (size_t i = 0; i < hash_num; ++i) {
tmp_min_val = min_val;
update_done = array[hashes[i] % array_size].compare_exchange_strong(
update_done |= array[hashes[i] % array_size].compare_exchange_strong(
tmp_min_val, new_val);
}
if (update_done) {
Expand All @@ -712,6 +808,56 @@ CountingBloomFilter<T>::insert(const uint64_t* hashes)
contains_insert(hashes);
}

template<typename T>
inline void
CountingBloomFilter<T>::remove(const uint64_t* hashes)
{
  // Decrease the element's count by one: every counter that still holds the
  // element's current count (as reported by contains) is decremented with a
  // compare-and-swap. If no swap succeeds (the counters changed between the
  // read and the swap), the count is re-read and the attempt is repeated.
  //
  // An element whose count is zero is left untouched. Decrementing it would
  // wrap an unsigned counter around to its maximum value.

  // Tracks whether the decrement succeeded on at least one counter.
  bool update_done = false;
  T min_val = contains(hashes);
  T tmp_min_val;
  while (min_val != 0 && !update_done) {
    const T new_val = static_cast<T>(min_val - 1);
    for (size_t i = 0; i < hash_num; ++i) {
      // compare_exchange_strong overwrites its first argument on failure, so
      // the expected value is refreshed for every counter.
      tmp_min_val = min_val;
      update_done |= array[hashes[i] % array_size].compare_exchange_strong(
        tmp_min_val, new_val);
    }
    if (!update_done) {
      min_val = contains(hashes);
    }
  }
}

template<typename T>
inline void
CountingBloomFilter<T>::clear(const uint64_t* hashes)
{
  // Reset the element's count to zero: every counter that still holds the
  // element's current count (as reported by contains) is set to zero with a
  // compare-and-swap. If no swap succeeds (the counters changed between the
  // read and the swap), the count is re-read and the attempt is repeated.
  //
  // The loop stops once the count is already zero. Unlike insertion, a
  // saturated counter is not a terminal state here and must still be cleared.

  // Tracks whether the reset succeeded on at least one counter.
  bool update_done = false;
  const T new_val = 0;
  T min_val = contains(hashes);
  T tmp_min_val;
  while (min_val != 0 && !update_done) {
    for (size_t i = 0; i < hash_num; ++i) {
      // compare_exchange_strong overwrites its first argument on failure, so
      // the expected value is refreshed for every counter.
      tmp_min_val = min_val;
      update_done |= array[hashes[i] % array_size].compare_exchange_strong(
        tmp_min_val, new_val);
    }
    if (!update_done) {
      min_val = contains(hashes);
    }
  }
}

template<typename T>
inline T
CountingBloomFilter<T>::contains(const uint64_t* hashes) const
Expand Down Expand Up @@ -886,6 +1032,26 @@ KmerCountingBloomFilter<T>::insert(const char* seq, size_t seq_len)
}
}

template<typename T>
inline void
KmerCountingBloomFilter<T>::remove(const char* seq, size_t seq_len)
{
  // Roll a hasher across seq and remove each k-mer it produces from the
  // underlying counting Bloom filter.
  for (NtHash kmer_hasher(seq, seq_len, get_hash_num(), get_k());
       kmer_hasher.roll();) {
    counting_bloom_filter.remove(kmer_hasher.hashes());
  }
}

template<typename T>
inline void
KmerCountingBloomFilter<T>::clear(const char* seq, size_t seq_len)
{
  // Roll a hasher across seq and clear each k-mer it produces in the
  // underlying counting Bloom filter.
  for (NtHash kmer_hasher(seq, seq_len, get_hash_num(), get_k());
       kmer_hasher.roll();) {
    counting_bloom_filter.clear(kmer_hasher.hashes());
  }
}

template<typename T>
inline uint64_t
KmerCountingBloomFilter<T>::contains(const char* seq, size_t seq_len) const
Expand Down
2 changes: 1 addition & 1 deletion meson.build
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
project('btllib', 'cpp',
version : '1.4.9',
version : '1.5.0',
license : 'GPL3',
default_options : [ 'cpp_std=c++11', 'warning_level=3', 'werror=true' ],
meson_version : '>= 0.60.0')
Expand Down
21 changes: 21 additions & 0 deletions tests/counting_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,5 +194,26 @@ main()
std::cerr << "Seqs with more than 1 presence = " << more_than_1 << std::endl;
TEST_ASSERT_GT(more_than_1, 5);

{
// Checks that remove() lowers an element's count by one per call and that
// clear() resets the count to zero, using a single element's hash values.
std::cerr << "Testing CBF element deletion" << std::endl;
std::vector<uint64_t> hashes = { 0x47c80ef7eab,
0x8b4a469ef6,
0x32e7ab5203 };
// The filter is built with one hash function per entry in `hashes`.
btllib::CountingBloomFilter8 cbf(64, hashes.size());
// Two insertions followed by two removals: the count goes 2 -> 1 -> 0.
cbf.insert(hashes);
cbf.insert(hashes);
TEST_ASSERT_EQ(cbf.contains(hashes), 2);
cbf.remove(hashes);
TEST_ASSERT_EQ(cbf.contains(hashes), 1);
cbf.remove(hashes);
TEST_ASSERT_EQ(cbf.contains(hashes), 0);
// Three insertions followed by a single clear(): the count goes 3 -> 0.
cbf.insert(hashes);
cbf.insert(hashes);
cbf.insert(hashes);
TEST_ASSERT_EQ(cbf.contains(hashes), 3);
cbf.clear(hashes);
TEST_ASSERT_EQ(cbf.contains(hashes), 0);
}

return 0;
}
Loading

0 comments on commit 5662476

Please sign in to comment.