Skip to content

Commit

Permalink
feat: skip invisible row for chunk::get_hash_values (#15696)
Browse files Browse the repository at this point in the history
  • Loading branch information
st1page authored Mar 18, 2024
1 parent 7cc9b57 commit a9bdaa8
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 31 deletions.
10 changes: 6 additions & 4 deletions src/common/src/array/data_chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,18 +344,20 @@ impl DataChunk {
Ok(outputs)
}

/// Compute hash values for each row.
/// Compute hash values for each row. The number of returned `HashCode`s is `self.capacity()`.
/// Invisible rows are skipped while hashing, so the `HashCode` of an invisible row is arbitrary.
pub fn get_hash_values<H: BuildHasher>(
&self,
column_idxes: &[usize],
hasher_builder: H,
) -> Vec<HashCode<H>> {
let mut states = Vec::with_capacity(self.capacity());
states.resize_with(self.capacity(), || hasher_builder.build_hasher());
let len = self.capacity();
let mut states = Vec::with_capacity(len);
states.resize_with(len, || hasher_builder.build_hasher());
// Compute hash for the specified columns.
for column_idx in column_idxes {
let array = self.column_at(*column_idx);
array.hash_vec(&mut states[..]);
array.hash_vec(&mut states[..], self.visibility());
}
finalize_hashers(&states[..])
.into_iter()
Expand Down
14 changes: 8 additions & 6 deletions src/common/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,10 +281,10 @@ pub trait Array:
}
}

fn hash_vec<H: Hasher>(&self, hashers: &mut [H]) {
fn hash_vec<H: Hasher>(&self, hashers: &mut [H], vis: &Bitmap) {
assert_eq!(hashers.len(), self.len());
for (idx, state) in hashers.iter_mut().enumerate() {
self.hash_at(idx, state);
for idx in vis.iter_ones() {
self.hash_at(idx, &mut hashers[idx]);
}
}

Expand Down Expand Up @@ -554,8 +554,8 @@ impl ArrayImpl {
dispatch_array_variants!(self, inner, { inner.hash_at(idx, state) })
}

pub fn hash_vec<H: Hasher>(&self, hashers: &mut [H]) {
dispatch_array_variants!(self, inner, { inner.hash_vec(hashers) })
pub fn hash_vec<H: Hasher>(&self, hashers: &mut [H], vis: &Bitmap) {
dispatch_array_variants!(self, inner, { inner.hash_vec(hashers, vis) })
}

/// Select some elements from `Array` based on `visibility` bitmap.
Expand Down Expand Up @@ -711,6 +711,7 @@ mod test_util {
use std::hash::{BuildHasher, Hasher};

use super::Array;
use crate::buffer::Bitmap;
use crate::util::iter_util::ZipEqFast;

pub fn hash_finish<H: Hasher>(hashers: &[H]) -> Vec<u64> {
Expand All @@ -732,8 +733,9 @@ mod test_util {
arr.hash_at(i, state)
}
});
let vis = Bitmap::ones(len);
arrs.iter()
.for_each(|arr| arr.hash_vec(&mut states_vec[..]));
.for_each(|arr| arr.hash_vec(&mut states_vec[..], &vis));
itertools::cons_tuples(
expects
.iter()
Expand Down
31 changes: 10 additions & 21 deletions src/common/src/hash/key_v2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -318,14 +318,9 @@ impl<S: KeyStorage, N: NullBitmap> HashKey for HashKeyImpl<S, N> {

// Dispatch types once to accelerate the inner call.
dispatch_array_variants!(array, array, {
for ((scalar, visible), serializer) in array
.iter()
.zip_eq_fast(data_chunk.visibility().iter())
.zip_eq_fast(&mut serializers)
{
if visible {
serializer.serialize(scalar);
}
for i in data_chunk.visibility().iter_ones() {
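// SAFETY(value_at_unchecked): `i` is yielded by the chunk's visibility bitmap, so it is in
// bounds as long as the bitmap and `array` have the same length (TODO confirm invariant).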
unsafe { serializers[i].serialize(array.value_at_unchecked(i)) }
}
});
}
Expand Down Expand Up @@ -382,22 +377,16 @@ impl DataChunk {
}
})
}

let mut sizes = self
.visibility()
.iter()
.map(|visible| if visible { exact_size } else { 0 })
.collect_vec();
let mut sizes = vec![exact_size; self.capacity()];

for i in estimated_column_indices {
dispatch_array_variants!(&*self.columns()[i], col, {
for ((datum, visible), size) in col
.iter()
.zip_eq_fast(self.visibility().iter())
.zip_eq_fast(&mut sizes)
{
if visible && let Some(scalar) = datum {
*size += HashKeySer::estimated_size(scalar);
for i in self.visibility().iter_ones() {
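// SAFETY(value_at_unchecked): `i` is yielded by the chunk's visibility bitmap, so it is in
// bounds as long as the bitmap and `col` have the same length (TODO confirm invariant).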
unsafe {
if let Some(scalar) = col.value_at_unchecked(i) {
sizes[i] += HashKeySer::estimated_size(scalar);
}
}
}
})
Expand Down

0 comments on commit a9bdaa8

Please sign in to comment.