From 505d569a42ad74ddaaadeff04765125c36ff0ae2 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Thu, 2 Nov 2023 16:30:43 +0800 Subject: [PATCH 01/28] feat(query): agg-hashtable-p2 --- .../src/aggregate/aggregate_hashtable.rs | 82 ++++++++++--------- src/query/expression/src/aggregate/payload.rs | 45 +++++----- .../expression/src/aggregate/payload_flush.rs | 10 ++- .../expression/src/aggregate/probe_state.rs | 4 + .../expression/src/utils/select_vector.rs | 13 ++- .../service/src/pipelines/pipeline_builder.rs | 10 ++- .../aggregator/aggregate_exchange_injector.rs | 3 + .../transforms/aggregator/aggregate_meta.rs | 9 ++ .../serde/transform_aggregate_serializer.rs | 1 + ...transform_exchange_aggregate_serializer.rs | 2 + .../transform_exchange_group_by_serializer.rs | 1 + .../serde/transform_group_by_serializer.rs | 1 + .../serde/transform_spill_reader.rs | 2 + .../aggregator/transform_aggregate_final.rs | 34 +++++++- .../aggregator/transform_aggregate_partial.rs | 55 ++++++++++--- .../aggregator/transform_group_by_final.rs | 31 +++++++ .../aggregator/transform_group_by_partial.rs | 39 ++++++++- .../aggregator/transform_partition_bucket.rs | 11 +++ src/query/settings/src/settings_default.rs | 7 ++ .../settings/src/settings_getter_setter.rs | 4 + 20 files changed, 283 insertions(+), 81 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index ffb674bf17bc..c2d05eb43f86 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -52,6 +52,9 @@ pub struct AggregateHashTable { capacity: usize, } +unsafe impl Send for AggregateHashTable {} +unsafe impl Sync for AggregateHashTable {} + impl AggregateHashTable { pub fn new( arena: Arc, @@ -75,7 +78,7 @@ impl AggregateHashTable { unsafe { Vec::from_raw_parts(ptr as *mut Entry, len, cap) } } - fn len(&self) -> usize { + pub fn len(&self) -> usize { self.payload.len() } @@ -90,28 +93,31 @@ impl AggregateHashTable { let group_hashes = group_hash_columns(group_columns); let new_group_count = self.probe_and_create(state, group_columns, row_count, &group_hashes); - for i in 0..row_count { - state.state_places[i] = unsafe { - StateAddr::new( - load::(state.addresses[i].add(self.payload.state_offset)) as usize, - ) - }; - } + if !self.payload.aggrs.is_empty() { + for i in 0..row_count { + state.state_places[i] = unsafe { + StateAddr::new( + load::(state.addresses[i].add(self.payload.state_offset)) as usize, + ) + }; + } - for ((aggr, params), addr_offset) in self - .payload - .aggrs - .iter() - .zip(params.iter()) - .zip(self.payload.state_addr_offsets.iter()) - { - aggr.accumulate_keys( - &state.state_places.as_slice()[0..row_count], - *addr_offset, - params, - row_count, - )?; + for ((aggr, params), addr_offset) in self + .payload + .aggrs + .iter() + .zip(params.iter()) + .zip(self.payload.state_addr_offsets.iter()) + { + aggr.accumulate_keys( + &state.state_places.as_slice()[0..row_count], + *addr_offset, + params, + row_count, + )?; + } } + Ok(new_group_count) } @@ -164,8 +170,9 @@ impl AggregateHashTable { } state.empty_vector.set_index(new_entry_count, index); - state.new_groups.set_index(new_group_count, index); new_entry_count += 1; + + state.new_groups.set_index(new_group_count, index); new_group_count += 1; } else if entry.salt == state.hash_salts[index] { state @@ -180,13 +187,8 @@ impl AggregateHashTable { // 2. 
append new_group_count to payload if new_entry_count != 0 { - self.payload.append_rows( - state, - hashes, - &select_vector, - new_entry_count, - group_columns, - ); + self.payload + .append_rows(state, hashes, new_entry_count, group_columns); } // 3. handle need_compare_count @@ -224,19 +226,23 @@ impl AggregateHashTable { } } - std::mem::swap(&mut select_vector, &mut state.no_match_vector); - state.no_match_vector.resize(no_match_count); - + if select_vector.is_auto_increment() { + select_vector = state.no_match_vector.clone(); + } else { + std::mem::swap(&mut select_vector, &mut state.no_match_vector); + } remaining_entries = no_match_count; } // set state places - for i in 0..row_count { - state.state_places[i] = unsafe { - StateAddr::new( - load::(state.addresses[i].add(self.payload.state_offset)) as usize, - ) - }; + if !self.payload.aggrs.is_empty() { + for i in 0..row_count { + state.state_places[i] = unsafe { + StateAddr::new( + load::(state.addresses[i].add(self.payload.state_offset)) as usize, + ) + }; + } } new_group_count diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 80153e11aed1..327c81d89eb8 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -22,7 +22,6 @@ use super::payload_row::serialize_column_to_rowformat; use super::probe_state::ProbeState; use crate::get_layout_offsets; use crate::load; -use crate::select_vector::SelectVector; use crate::store; use crate::types::DataType; use crate::AggregateFunctionRef; @@ -52,7 +51,7 @@ pub struct Payload { pub hash_offset: usize, pub state_offset: usize, pub state_addr_offsets: Vec, - pub state_layout: Layout, + pub state_layout: Option, } // TODO FIXME @@ -63,7 +62,11 @@ impl Payload { aggrs: Vec, ) -> Self { let mut state_addr_offsets = Vec::new(); - let state_layout = get_layout_offsets(&aggrs, &mut state_addr_offsets).unwrap(); + let state_layout = if !aggrs.is_empty() { + Some(get_layout_offsets(&aggrs, &mut state_addr_offsets).unwrap()) + } else { + None + }; let mut tuple_size = 0; let mut validity_offsets = Vec::with_capacity(group_types.len()); @@ -92,7 +95,9 @@ impl Payload { tuple_size += hash_size; let state_offset = tuple_size; - tuple_size += 8; + if !aggrs.is_empty() { + tuple_size += 8; + } Self { arena, @@ -142,12 +147,11 @@ impl Payload { &mut self, state: &mut ProbeState, group_hashes: &[u64], - select_vector: &SelectVector, new_group_rows: usize, group_columns: &[Column], ) { self.try_reverse(new_group_rows); - + let select_vector = &state.empty_vector; for i in 0..new_group_rows { let idx = select_vector.get_index(i); @@ -166,7 +170,7 @@ impl Payload { let idx = select_vector.get_index(i); if bitmap.get_bit(idx) { unsafe { - let dst = address[i].add(write_offset); + let dst = address[idx].add(write_offset); store(&1, dst as *mut u8); } } @@ -195,25 +199,26 @@ impl Payload { for i in 0..new_group_rows { let idx = select_vector.get_index(i); unsafe { - let dst = address[i].add(write_offset); + let dst = address[idx].add(write_offset); store(&group_hashes[idx], dst as *mut u8); } } write_offset += 8; + if let Some(layout) = self.state_layout { + // write states + for i in 0..new_group_rows { + let place = self.arena.alloc_layout(layout); + let idx = select_vector.get_index(i); + unsafe { + let dst = address[idx].add(write_offset); + store(&(place.as_ptr() as u64), dst as *mut u8); + } - // write states - for i in 0..new_group_rows { - let place = 
self.arena.alloc_layout(self.state_layout); - let idx = select_vector.get_index(i); - unsafe { - let dst = address[idx].add(write_offset); - store(&(place.as_ptr() as u64), dst as *mut u8); - } - - let place = StateAddr::from(place); - for (aggr, offset) in self.aggrs.iter().zip(self.state_addr_offsets.iter()) { - aggr.init_state(place.next(*offset)); + let place = StateAddr::from(place); + for (aggr, offset) in self.aggrs.iter().zip(self.state_addr_offsets.iter()) { + aggr.init_state(place.next(*offset)); + } } } } diff --git a/src/query/expression/src/aggregate/payload_flush.rs b/src/query/expression/src/aggregate/payload_flush.rs index c77e47c66814..a82d7df0846f 100644 --- a/src/query/expression/src/aggregate/payload_flush.rs +++ b/src/query/expression/src/aggregate/payload_flush.rs @@ -76,10 +76,12 @@ impl Payload { state.group_columns.push(col); } - for i in 0..rows { - state.state_places[i] = unsafe { - StateAddr::new(load::(state.addresses[i].add(self.state_offset)) as usize) - }; + if !self.aggrs.is_empty() { + for i in 0..rows { + state.state_places[i] = unsafe { + StateAddr::new(load::(state.addresses[i].add(self.state_offset)) as usize) + }; + } } state.flush_offset = flush_end; diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index 16a1b83c10da..e683a0df2cdf 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -33,6 +33,9 @@ pub struct ProbeState { pub row_count: usize, } +unsafe impl Send for ProbeState {} +unsafe impl Sync for ProbeState {} + impl ProbeState { pub fn adjust_group_columns( &mut self, @@ -66,6 +69,7 @@ impl ProbeState { self.empty_vector.resize(row_count); self.new_groups.resize(row_count); } + self.row_count = row_count; } } diff --git a/src/query/expression/src/utils/select_vector.rs b/src/query/expression/src/utils/select_vector.rs index e63ff00b75f7..959d501d5d2d 100644 --- a/src/query/expression/src/utils/select_vector.rs +++ b/src/query/expression/src/utils/select_vector.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct SelectVector { increment: bool, sel_vector: Vec, @@ -32,6 +32,10 @@ impl SelectVector { Self::default() } + pub fn is_auto_increment(&self) -> bool { + self.increment + } + pub fn new(size: usize) -> Self { Self { increment: false, @@ -51,6 +55,13 @@ impl SelectVector { // these function did not check index boundes // keep in mind when using them pub fn set_index(&mut self, idx: usize, loc: usize) { + #[cfg(debug_assertions)] + { + if self.sel_vector.len() <= idx { + panic!("index out of bound {}, {}", self.sel_vector.len(), idx); + } + } + self.sel_vector[idx] = loc; } diff --git a/src/query/service/src/pipelines/pipeline_builder.rs b/src/query/service/src/pipelines/pipeline_builder.rs index d478e9c041ef..837b66cc762d 100644 --- a/src/query/service/src/pipelines/pipeline_builder.rs +++ b/src/query/service/src/pipelines/pipeline_builder.rs @@ -1985,6 +1985,10 @@ impl PipelineBuilder { } let efficiently_memory = self.settings.get_efficiently_memory_group_by()?; + let enable_experimental_aggregate_hashtable = self + .settings + .get_enable_experimental_aggregate_hashtable()? 
+ && self.ctx.get_cluster().is_empty(); let group_cols = ¶ms.group_columns; let schema_before_group_by = params.input_schema.clone(); @@ -1999,7 +2003,8 @@ impl PipelineBuilder { method, input, output, - params.clone() + params.clone(), + enable_experimental_aggregate_hashtable, ), }), false => with_mappedhash_method!(|T| match method.clone() { @@ -2008,7 +2013,8 @@ impl PipelineBuilder { method, input, output, - params.clone() + params.clone(), + enable_experimental_aggregate_hashtable, ), }), }?; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs index 9afe4c1aa5d8..0716ea90e094 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs @@ -76,6 +76,7 @@ impl ExchangeSorting AggregateMeta::Partitioned { .. } => unreachable!(), AggregateMeta::Serialized(v) => Ok(v.bucket), AggregateMeta::HashTable(v) => Ok(v.bucket), + AggregateMeta::AggregateHashTable((bucket, _)) => Ok(*bucket), AggregateMeta::Spilled(_) | AggregateMeta::Spilling(_) | AggregateMeta::BucketSpilled(_) => Ok(-1), @@ -174,6 +175,8 @@ impl FlightScatter }); } } + + AggregateMeta::AggregateHashTable(_) => todo!("AGG_HASHTABLE"), }; return Ok(blocks); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs index e391f730b202..d83c0f396bf0 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs @@ -16,6 +16,7 @@ use std::fmt::Debug; use std::fmt::Formatter; use std::ops::Range; +use common_expression::AggregateHashTable; use common_expression::BlockMetaInfo; use common_expression::BlockMetaInfoPtr; use common_expression::Column; @@ -52,6 +53,7 @@ pub struct BucketSpilledPayload { pub enum AggregateMeta { Serialized(SerializedPayload), HashTable(HashTablePayload), + AggregateHashTable((isize, AggregateHashTable)), BucketSpilled(BucketSpilledPayload), Spilled(Vec), Spilling(HashTablePayload, V>), @@ -67,6 +69,10 @@ impl AggregateMeta BlockMetaInfoPtr { + Box::new(AggregateMeta::::AggregateHashTable((bucket, ht))) + } + pub fn create_serialized(bucket: isize, block: DataBlock) -> BlockMetaInfoPtr { Box::new(AggregateMeta::::Serialized(SerializedPayload { bucket, @@ -127,6 +133,9 @@ impl Debug for AggregateMeta AggregateMeta::Spilling(_) => f.debug_struct("Aggregate::Spilling").finish(), AggregateMeta::Spilled(_) => f.debug_struct("Aggregate::Spilling").finish(), AggregateMeta::BucketSpilled(_) => f.debug_struct("Aggregate::BucketSpilled").finish(), + AggregateMeta::AggregateHashTable(_) => { + f.debug_struct("AggregateMeta:AggHashTable").finish() + } } } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_serializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_serializer.rs index d982d81e02bb..1c2d393fd2d6 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_serializer.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_serializer.rs @@ -145,6 +145,7 @@ impl 
TransformAggregateSerializer { )); return Ok(Event::Sync); } + AggregateMeta::AggregateHashTable(_) => todo!("AGG_HASHTABLE"), } } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs index 2d7b08ea1bbb..26d0e6c5c5c4 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs @@ -147,6 +147,8 @@ impl BlockMetaTransform } })); } + + Some(AggregateMeta::AggregateHashTable(_)) => todo!("AGG_HASHTABLE"), }; } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_group_by_serializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_group_by_serializer.rs index eb0c45f21039..dd5df80d0b30 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_group_by_serializer.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_group_by_serializer.rs @@ -197,6 +197,7 @@ impl BlockMetaTransform }, )); } + Some(AggregateMeta::AggregateHashTable(_)) => todo!("AGG_HASHTABLE"), Some(AggregateMeta::HashTable(payload)) => { if index == self.local_pos { serialized_blocks.push(FlightSerialized::DataBlock(block.add_meta( diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_group_by_serializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_group_by_serializer.rs index 223617c0f6f6..80e342abe6af 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_group_by_serializer.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_group_by_serializer.rs @@ -126,6 +126,7 @@ impl TransformGroupBySerializer { AggregateMeta::Serialized(_) => unreachable!(), AggregateMeta::BucketSpilled(_) => unreachable!(), AggregateMeta::Partitioned { .. 
} => unreachable!(), + AggregateMeta::AggregateHashTable(_) => todo!("AGG_HASHTABLE"), AggregateMeta::HashTable(payload) => { self.input_data = Some(SerializeGroupByStream::create(&self.method, payload)); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs index 950b0db2120c..c1bbaf9109a8 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs @@ -137,6 +137,7 @@ impl Processor match meta { AggregateMeta::Spilled(_) => unreachable!(), AggregateMeta::Spilling(_) => unreachable!(), + AggregateMeta::AggregateHashTable(_) => unreachable!(), AggregateMeta::HashTable(_) => unreachable!(), AggregateMeta::Serialized(_) => unreachable!(), AggregateMeta::BucketSpilled(payload) => { @@ -178,6 +179,7 @@ impl Processor AggregateMeta::Spilled(_) => unreachable!(), AggregateMeta::Spilling(_) => unreachable!(), AggregateMeta::HashTable(_) => unreachable!(), + AggregateMeta::AggregateHashTable(_) => unreachable!(), AggregateMeta::Serialized(_) => unreachable!(), AggregateMeta::BucketSpilled(payload) => { let instant = Instant::now(); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs index 8991bce6ce33..077d4b540ef9 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs @@ -18,8 +18,10 @@ use std::sync::Arc; use bumpalo::Bump; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::AggregateHashTable; use common_expression::ColumnBuilder; use common_expression::DataBlock; +use common_expression::PayloadFlushState; use common_functions::aggregates::StateAddr; use common_hashtable::HashtableEntryMutRefLike; use common_hashtable::HashtableEntryRefLike; @@ -68,10 +70,12 @@ where Method: HashMethodBounds if let AggregateMeta::Partitioned { bucket, data } = meta { let mut reach_limit = false; let arena = Arc::new(Bump::new()); - let hashtable = self.method.create_hash_table::(arena)?; + let hashtable = self.method.create_hash_table::(arena.clone())?; let _dropper = AggregateHashTableDropper::create(self.params.clone()); let mut hash_cell = HashTableCell::::create(hashtable, _dropper); + let mut agg_hashtable: Option = None; + for bucket_data in data { match bucket_data { AggregateMeta::Spilled(_) => unreachable!(), @@ -176,9 +180,37 @@ where Method: HashMethodBounds } } }, + AggregateMeta::AggregateHashTable((_, hashtable)) => { + match agg_hashtable.as_mut() { + Some(ht) => { + let mut flush_state = PayloadFlushState::default(); + ht.combine(hashtable, &mut flush_state)?; + } + None => agg_hashtable = Some(hashtable), + } + } } } + if let Some(mut ht) = agg_hashtable { + let mut flush_state = PayloadFlushState::default(); + + let mut blocks = vec![]; + loop { + if ht.merge_result(&mut flush_state)? 
{ + let mut cols = flush_state.aggregate_results.clone(); + cols.extend_from_slice(&flush_state.group_columns); + + blocks.push(DataBlock::new_from_columns(cols)); + } else { + break; + } + } + + // todo pipeline + return DataBlock::concat(&blocks); + } + let keys_len = hash_cell.hashtable.len(); let value_size = estimated_key_size(&hash_cell.hashtable); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs index 7a804c4a06d0..b4223f72f93b 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs @@ -23,9 +23,11 @@ use common_catalog::plan::AggIndexMeta; use common_catalog::table_context::TableContext; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::AggregateHashTable; use common_expression::BlockMetaInfoDowncast; use common_expression::Column; use common_expression::DataBlock; +use common_expression::ProbeState; use common_functions::aggregates::StateAddr; use common_functions::aggregates::StateAddrs; use common_hashtable::HashtableEntryMutRefLike; @@ -54,6 +56,7 @@ use crate::sessions::QueryContext; enum HashTable { MovedOut, HashTable(HashTableCell), + AggregateHashTable(AggregateHashTable), PartitionedHashTable(HashTableCell, usize>), } @@ -108,7 +111,7 @@ pub struct TransformPartialAggregate { method: Method, settings: AggregateSettings, hash_table: HashTable, - + probe_state: ProbeState, params: Arc, } @@ -119,17 +122,27 @@ impl TransformPartialAggregate { input: Arc, output: Arc, params: Arc, + enable_experimental_aggregate_hashtable: bool, ) -> Result> { let arena = Arc::new(Bump::new()); - let hashtable = method.create_hash_table(arena)?; - let _dropper = AggregateHashTableDropper::create(params.clone()); - let hashtable = HashTableCell::create(hashtable, _dropper); - - let hash_table = match !Method::SUPPORT_PARTITIONED || !params.has_distinct_combinator() { - true => HashTable::HashTable(hashtable), - false => HashTable::PartitionedHashTable(PartitionedHashMethod::convert_hashtable( - &method, hashtable, - )?), + + let hash_table = if !enable_experimental_aggregate_hashtable { + let hashtable = method.create_hash_table(arena.clone())?; + let _dropper = AggregateHashTableDropper::create(params.clone()); + let hashtable = HashTableCell::create(hashtable, _dropper); + + match !Method::SUPPORT_PARTITIONED || !params.has_distinct_combinator() { + true => HashTable::HashTable(hashtable), + false => HashTable::PartitionedHashTable(PartitionedHashMethod::convert_hashtable( + &method, hashtable, + )?), + } + } else { + HashTable::AggregateHashTable(AggregateHashTable::new( + arena, + params.group_data_types.clone(), + params.aggregate_functions.clone(), + )) }; Ok(AccumulatingTransformer::create( @@ -139,6 +152,7 @@ impl TransformPartialAggregate { method, params, hash_table, + probe_state: ProbeState::default(), settings: AggregateSettings::try_from(ctx)?, }, )) @@ -236,11 +250,11 @@ impl TransformPartialAggregate { unsafe { let rows_num = block.num_rows(); - let state = self.method.build_keys_state(&group_columns, rows_num)?; match &mut self.hash_table { HashTable::MovedOut => unreachable!(), HashTable::HashTable(hashtable) => { + let state = self.method.build_keys_state(&group_columns, rows_num)?; let mut places = Vec::with_capacity(rows_num); 
for key in self.method.build_keys_iter(&state)? { @@ -261,6 +275,7 @@ impl TransformPartialAggregate { } } HashTable::PartitionedHashTable(hashtable) => { + let state = self.method.build_keys_state(&group_columns, rows_num)?; let mut places = Vec::with_capacity(rows_num); for key in self.method.build_keys_iter(&state)? { @@ -280,6 +295,19 @@ impl TransformPartialAggregate { Self::execute(&self.params, &block, &places) } } + HashTable::AggregateHashTable(hashtable) => { + let group_columns: Vec = + group_columns.into_iter().map(|c| c.0).collect(); + + let params_columns = Self::aggregate_arguments(&block, &self.params)?; + let _ = hashtable.add_groups( + &mut self.probe_state, + &group_columns, + ¶ms_columns, + rows_num, + )?; + Ok(()) + } } } } @@ -368,6 +396,11 @@ impl AccumulatingTransform for TransformPartialAggrega blocks } + HashTable::AggregateHashTable(hashtable) => { + vec![DataBlock::empty_with_meta( + AggregateMeta::::create_agg_hashtable(-1, hashtable), + )] + } }) } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs index 1f95565cf58e..d3867113bd02 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs @@ -17,7 +17,9 @@ use std::sync::Arc; use bumpalo::Bump; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::AggregateHashTable; use common_expression::DataBlock; +use common_expression::PayloadFlushState; use common_hashtable::HashtableEntryRefLike; use common_hashtable::HashtableLike; use common_pipeline_core::processors::port::InputPort; @@ -62,6 +64,9 @@ where Method: HashMethodBounds if let AggregateMeta::Partitioned { bucket, data } = meta { let arena = Arc::new(Bump::new()); let mut hashtable = self.method.create_hash_table::<()>(arena)?; + + let mut agg_hashtable: Option = None; + 'merge_hashtable: for bucket_data in data { match bucket_data { AggregateMeta::Spilled(_) => unreachable!(), @@ -98,9 +103,35 @@ where Method: HashMethodBounds } } }, + AggregateMeta::AggregateHashTable((_, hashtable)) => { + match agg_hashtable.as_mut() { + Some(ht) => { + let mut flush_state = PayloadFlushState::default(); + ht.combine(hashtable, &mut flush_state)?; + } + None => agg_hashtable = Some(hashtable), + } + } } } + if let Some(mut ht) = agg_hashtable { + let mut flush_state = PayloadFlushState::default(); + + let mut blocks = vec![]; + loop { + if ht.merge_result(&mut flush_state)? 
{ + blocks.push(DataBlock::new_from_columns( + flush_state.group_columns.clone(), + )); + } else { + break; + } + } + + return DataBlock::concat(&blocks); + } + let value_size = estimated_key_size(&hashtable); let keys_len = hashtable.len(); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs index 9484c2ca9abb..665462b5eba3 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs @@ -22,7 +22,10 @@ use common_base::runtime::GLOBAL_MEM_STAT; use common_catalog::table_context::TableContext; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::AggregateHashTable; +use common_expression::Column; use common_expression::DataBlock; +use common_expression::ProbeState; use common_hashtable::HashtableLike; use common_pipeline_core::processors::port::InputPort; use common_pipeline_core::processors::port::OutputPort; @@ -46,6 +49,7 @@ use crate::sessions::QueryContext; enum HashTable { MovedOut, HashTable(HashTableCell), + AggregateHashTable(AggregateHashTable), PartitionedHashTable(HashTableCell, ()>), } @@ -100,6 +104,7 @@ pub struct TransformPartialGroupBy { method: Method, hash_table: HashTable, group_columns: Vec, + probe_state: ProbeState, settings: GroupBySettings, } @@ -110,11 +115,20 @@ impl TransformPartialGroupBy { input: Arc, output: Arc, params: Arc, + enable_experimental_aggregate_hashtable: bool, ) -> Result> { let arena = Arc::new(Bump::new()); - let hashtable = method.create_hash_table(arena)?; - let _dropper = GroupByHashTableDropper::::create(); - let hash_table = HashTable::HashTable(HashTableCell::create(hashtable, _dropper)); + let hash_table = if !enable_experimental_aggregate_hashtable { + let hashtable = method.create_hash_table(arena.clone())?; + let _dropper = GroupByHashTableDropper::::create(); + HashTable::HashTable(HashTableCell::create(hashtable, _dropper)) + } else { + HashTable::AggregateHashTable(AggregateHashTable::new( + arena, + params.group_data_types.clone(), + params.aggregate_functions.clone(), + )) + }; Ok(AccumulatingTransformer::create( input, @@ -122,6 +136,7 @@ impl TransformPartialGroupBy { TransformPartialGroupBy:: { method, hash_table, + probe_state: ProbeState::default(), group_columns: params.group_columns.clone(), settings: GroupBySettings::try_from(ctx)?, }, @@ -147,20 +162,31 @@ impl AccumulatingTransform for TransformPartialGroupBy unsafe { let rows_num = block.num_rows(); - let state = self.method.build_keys_state(&group_columns, rows_num)?; match &mut self.hash_table { HashTable::MovedOut => unreachable!(), HashTable::HashTable(cell) => { + let state = self.method.build_keys_state(&group_columns, rows_num)?; for key in self.method.build_keys_iter(&state)? { let _ = cell.hashtable.insert_and_entry(key); } } HashTable::PartitionedHashTable(cell) => { + let state = self.method.build_keys_state(&group_columns, rows_num)?; for key in self.method.build_keys_iter(&state)? 
{ let _ = cell.hashtable.insert_and_entry(key); } } + HashTable::AggregateHashTable(hashtable) => { + let group_columns: Vec = + group_columns.into_iter().map(|c| c.0).collect(); + let _ = hashtable.add_groups( + &mut self.probe_state, + &group_columns, + &[vec![]], + rows_num, + )?; + } }; #[allow(clippy::collapsible_if)] @@ -233,6 +259,11 @@ impl AccumulatingTransform for TransformPartialGroupBy blocks } + HashTable::AggregateHashTable(hashtable) => { + vec![DataBlock::empty_with_meta( + AggregateMeta::::create_agg_hashtable(-1, hashtable), + )] + } }) } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs index 87d8473d2948..47906da26eea 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs @@ -21,6 +21,7 @@ use std::sync::Arc; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::AggregateHashTable; use common_expression::BlockMetaInfoDowncast; use common_expression::DataBlock; use common_hashtable::hash2bucket; @@ -185,6 +186,7 @@ impl unreachable!() } + AggregateMeta::AggregateHashTable((v, _)) => (*v, *v), }; if bucket > SINGLE_LEVEL_BUCKET_NUM { @@ -299,6 +301,12 @@ impl Ok(data_blocks) } + + fn partition_agg_hashtable(&self, ht: AggregateHashTable) -> Result>> { + let block = + DataBlock::empty_with_meta(AggregateMeta::::create_agg_hashtable(0, ht)); + Ok(vec![Some(block)]) + } } #[async_trait::async_trait] @@ -418,6 +426,9 @@ impl Processor AggregateMeta::Partitioned { .. } => unreachable!(), AggregateMeta::Serialized(payload) => self.partition_block(payload)?, AggregateMeta::HashTable(payload) => self.partition_hashtable(payload)?, + AggregateMeta::AggregateHashTable((_, payload)) => { + self.partition_agg_hashtable(payload)? + } }; for (bucket, block) in data_blocks.into_iter().enumerate() { diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 498c0a0f204a..faed266fd581 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -458,6 +458,13 @@ impl DefaultSettings { possible_values: None, display_in_show_settings: true, }), + ("enable_experimental_aggregate_hashtable", DefaultSettingValue { + value: UserSettingValue::UInt64(0), + desc: "Enables experimental aggregate hashtable", + possible_values: None, + display_in_show_settings: true, + }), + ("numeric_cast_option", DefaultSettingValue { value: UserSettingValue::String("rounding".to_string()), desc: "Set numeric cast mode as \"rounding\" or \"truncating\".", diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 7d8cb12c887e..eea5c127eba0 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -271,6 +271,10 @@ impl Settings { Ok(self.try_get_u64("efficiently_memory_group_by")? == 1) } + pub fn get_enable_experimental_aggregate_hashtable(&self) -> Result { + Ok(self.try_get_u64("enable_experimental_aggregate_hashtable")? 
== 1) + } + pub fn get_lazy_read_threshold(&self) -> Result { self.try_get_u64("lazy_read_threshold") } From 82e7b579be5a9fc57260fa31c4fa36cf0287c6c8 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Thu, 2 Nov 2023 19:25:42 +0800 Subject: [PATCH 02/28] feat(query): agg-hashtable-p2 --- .../src/aggregate/aggregate_hashtable.rs | 32 ++-- src/query/expression/src/aggregate/payload.rs | 22 +-- .../expression/src/aggregate/payload_flush.rs | 53 +++++-- .../expression/src/aggregate/payload_row.rs | 138 ++++++++---------- .../expression/src/aggregate/probe_state.rs | 33 +++-- src/query/expression/src/kernels/utils.rs | 13 +- .../expression/src/utils/select_vector.rs | 21 ++- .../tests/it/aggregates/agg_hashtable.rs | 9 +- .../service/src/pipelines/pipeline_builder.rs | 4 + .../aggregator/aggregator_params.rs | 3 + .../aggregator/transform_aggregate_final.rs | 20 ++- .../aggregator/transform_aggregate_partial.rs | 4 +- .../aggregator/transform_group_by_final.rs | 18 ++- .../aggregator/transform_group_by_partial.rs | 3 +- 14 files changed, 200 insertions(+), 173 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index c2d05eb43f86..1c521c57cc33 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -23,7 +23,6 @@ use super::payload_flush::PayloadFlushState; use super::probe_state::ProbeState; use crate::aggregate::payload_row::row_match_columns; use crate::group_hash_columns; -use crate::load; use crate::select_vector::SelectVector; use crate::types::DataType; use crate::AggregateFunctionRef; @@ -96,9 +95,9 @@ impl AggregateHashTable { if !self.payload.aggrs.is_empty() { for i in 0..row_count { state.state_places[i] = unsafe { - StateAddr::new( - load::(state.addresses[i].add(self.payload.state_offset)) as usize, - ) + StateAddr::new(core::ptr::read::( + state.addresses[i].add(self.payload.state_offset) as _, + ) as usize) }; } @@ -137,13 +136,12 @@ impl AggregateHashTable { self.resize(new_capacity); } - state.adjust_group_columns(group_columns, hashes, row_count, self.capacity); + state.adjust_group_columns(hashes, row_count, self.capacity); let mut new_group_count = 0; let mut remaining_entries = row_count; let mut select_vector = SelectVector::auto_increment(); - let mut payload_page_offset = self.len() % self.payload.row_per_page; let mut payload_page_nr = (self.len() / self.payload.row_per_page) + 1; @@ -153,8 +151,7 @@ impl AggregateHashTable { let mut no_match_count = 0; // 1. inject new_group_count, new_entry_count, need_compare_count, no_match_count - for i in 0..remaining_entries { - let index = select_vector.get_index(i); + for index in select_vector.iterator(remaining_entries) { let entry = &mut self.entries[state.ht_offsets[index]]; // cell is empty, could be occupied @@ -171,9 +168,6 @@ impl AggregateHashTable { state.empty_vector.set_index(new_entry_count, index); new_entry_count += 1; - - state.new_groups.set_index(new_group_count, index); - new_group_count += 1; } else if entry.salt == state.hash_salts[index] { state .group_compare_vector @@ -187,13 +181,13 @@ impl AggregateHashTable { // 2. append new_group_count to payload if new_entry_count != 0 { + new_group_count += new_entry_count; self.payload .append_rows(state, hashes, new_entry_count, group_columns); } // 3. 
handle need_compare_count - for need_compare_idx in 0..need_compare_count { - let index = state.group_compare_vector.get_index(need_compare_idx); + for index in state.group_compare_vector.iterator(need_compare_count) { let entry = &mut self.entries[state.ht_offsets[index]]; let page_ptr = self.payload.get_page_ptr((entry.page_nr - 1) as usize); @@ -217,8 +211,7 @@ impl AggregateHashTable { } // 5. Linear probing - for i in 0..no_match_count { - let index = state.no_match_vector.get_index(i); + for index in state.no_match_vector.iterator(no_match_count) { state.ht_offsets[index] += 1; if state.ht_offsets[index] >= self.capacity { @@ -238,9 +231,9 @@ impl AggregateHashTable { if !self.payload.aggrs.is_empty() { for i in 0..row_count { state.state_places[i] = unsafe { - StateAddr::new( - load::(state.addresses[i].add(self.payload.state_offset)) as usize, - ) + StateAddr::new(core::ptr::read::( + state.addresses[i].add(self.payload.state_offset) as _, + ) as usize) }; } } @@ -249,6 +242,7 @@ impl AggregateHashTable { } pub fn combine(&mut self, other: Self, flush_state: &mut PayloadFlushState) -> Result<()> { + flush_state.reset(); while other.payload.flush(flush_state) { let row_count = flush_state.row_count; @@ -313,7 +307,7 @@ impl AggregateHashTable { // iterate over payloads and copy to new entries for row in 0..self.len() { let row_ptr = self.payload.get_row_ptr(row); - let hash: u64 = unsafe { load(row_ptr.add(self.payload.hash_offset)) }; + let hash: u64 = unsafe { core::ptr::read(row_ptr.add(self.payload.hash_offset) as _) }; let mut hash_slot = hash & mask; while entries[hash_slot as usize].page_nr != 0 { diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 327c81d89eb8..e2d8cd0e35c5 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -21,7 +21,6 @@ use super::payload_row::rowformat_size; use super::payload_row::serialize_column_to_rowformat; use super::probe_state::ProbeState; use crate::get_layout_offsets; -use crate::load; use crate::store; use crate::types::DataType; use crate::AggregateFunctionRef; @@ -152,9 +151,7 @@ impl Payload { ) { self.try_reverse(new_group_rows); let select_vector = &state.empty_vector; - for i in 0..new_group_rows { - let idx = select_vector.get_index(i); - + for idx in select_vector.iterator(new_group_rows) { state.addresses[idx] = self.get_row_ptr(self.current_row); self.current_row += 1; } @@ -166,12 +163,11 @@ impl Payload { for col in group_columns { if let Column::Nullable(c) = col { let bitmap = &c.validity; - for i in 0..new_group_rows { - let idx = select_vector.get_index(i); + for idx in select_vector.iterator(new_group_rows) { if bitmap.get_bit(idx) { unsafe { let dst = address[idx].add(write_offset); - store(&1, dst as *mut u8); + store(1, dst as *mut u8); } } } @@ -196,23 +192,21 @@ impl Payload { } // write group hashes - for i in 0..new_group_rows { - let idx = select_vector.get_index(i); + for idx in select_vector.iterator(new_group_rows) { unsafe { let dst = address[idx].add(write_offset); - store(&group_hashes[idx], dst as *mut u8); + store(group_hashes[idx], dst as *mut u8); } } write_offset += 8; if let Some(layout) = self.state_layout { // write states - for i in 0..new_group_rows { + for idx in select_vector.iterator(new_group_rows) { let place = self.arena.alloc_layout(layout); - let idx = select_vector.get_index(i); unsafe { let dst = address[idx].add(write_offset); - store(&(place.as_ptr() as u64), dst as *mut 
u8); + store(place.as_ptr() as u64, dst as *mut u8); } let place = StateAddr::from(place); @@ -233,7 +227,7 @@ impl Drop for Payload { let row_ptr = self.get_row_ptr(row); unsafe { - let state_addr: u64 = load(row_ptr.add(self.state_offset)); + let state_addr: u64 = core::ptr::read(row_ptr.add(self.state_offset) as _); aggr.drop_state(StateAddr::new(state_addr as usize + *addr_offset)) }; } diff --git a/src/query/expression/src/aggregate/payload_flush.rs b/src/query/expression/src/aggregate/payload_flush.rs index a82d7df0846f..bbf0c6a28ad6 100644 --- a/src/query/expression/src/aggregate/payload_flush.rs +++ b/src/query/expression/src/aggregate/payload_flush.rs @@ -16,7 +16,6 @@ use ethnum::i256; use super::payload::Payload; use super::probe_state::ProbeState; -use crate::load; use crate::types::decimal::DecimalType; use crate::types::nullable::NullableColumn; use crate::types::string::StringColumn; @@ -34,7 +33,6 @@ use crate::StateAddr; const FLUSH_BATCH_SIZE: usize = 8192; -#[derive(Default)] pub struct PayloadFlushState { pub probe_state: ProbeState, pub group_hashes: Vec, @@ -47,6 +45,36 @@ pub struct PayloadFlushState { pub state_places: Vec, } +unsafe impl Send for PayloadFlushState {} +unsafe impl Sync for PayloadFlushState {} + +impl PayloadFlushState { + pub fn with_capacity(len: usize) -> PayloadFlushState { + PayloadFlushState { + probe_state: ProbeState::with_capacity(len), + group_hashes: vec![0; len], + group_columns: Vec::new(), + aggregate_results: Vec::new(), + row_count: 0, + flush_offset: 0, + addresses: vec![std::ptr::null::(); len], + state_places: vec![StateAddr::new(0); len], + } + } + + pub fn reset(&mut self) { + self.row_count = 0; + self.flush_offset = 0; + } + + pub fn take_group_columns(&mut self) -> Vec { + std::mem::take(&mut self.group_columns) + } + pub fn take_aggregate_results(&mut self) -> Vec { + std::mem::take(&mut self.aggregate_results) + } +} + impl Payload { pub fn flush(&self, state: &mut PayloadFlushState) -> bool { let flush_end = (state.flush_offset + FLUSH_BATCH_SIZE).min(self.len()); @@ -56,7 +84,7 @@ impl Payload { return false; } - if state.row_count < rows { + if state.group_hashes.len() < rows { state.group_hashes.resize(rows, 0); state.addresses.resize(rows, std::ptr::null::()); state.state_places.resize(rows, StateAddr::new(0)); @@ -64,7 +92,7 @@ impl Payload { state.group_columns.clear(); state.row_count = rows; - state.probe_state.adjust_row_count(rows); + state.probe_state.adjust_vector(rows); for row in state.flush_offset..flush_end { state.addresses[row - state.flush_offset] = self.get_row_ptr(row); @@ -79,7 +107,9 @@ impl Payload { if !self.aggrs.is_empty() { for i in 0..rows { state.state_places[i] = unsafe { - StateAddr::new(load::(state.addresses[i].add(self.state_offset)) as usize) + StateAddr::new(core::ptr::read::( + state.addresses[i].add(self.state_offset) as _ + ) as usize) }; } } @@ -93,7 +123,7 @@ impl Payload { for i in 0..len { state.group_hashes[i] = - unsafe { load::(state.addresses[i].add(self.hash_offset)) }; + unsafe { core::ptr::read::(state.addresses[i].add(self.hash_offset) as _) }; } } @@ -150,8 +180,9 @@ impl Payload { state: &mut PayloadFlushState, ) -> Column { let len = state.probe_state.row_count; - let iter = - (0..len).map(|idx| unsafe { load::(state.addresses[idx].add(col_offset)) }); + let iter = (0..len).map(|idx| unsafe { + core::ptr::read::(state.addresses[idx].add(col_offset) as _) + }); let col = T::column_from_iter(iter, &[]); T::upcast_column(col) } @@ -166,9 +197,11 @@ impl Payload { unsafe { 
for idx in 0..len { - let str_len = load::(state.addresses[idx].add(col_offset)) as usize; + let str_len = + core::ptr::read::(state.addresses[idx].add(col_offset) as _) as usize; let data_address = - load::(state.addresses[idx].add(col_offset + 4)) as usize as *const u8; + core::ptr::read::(state.addresses[idx].add(col_offset + 4) as _) as usize + as *const u8; let scalar = std::slice::from_raw_parts(data_address, str_len); diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index ffe92e85eaf0..53b23e9da83f 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -16,7 +16,6 @@ use bumpalo::Bump; use common_arrow::arrow::bitmap::Bitmap; use ethnum::i256; -use crate::load; use crate::select_vector::SelectVector; use crate::store; use crate::types::decimal::DecimalColumn; @@ -60,7 +59,7 @@ pub fn rowformat_size(data_type: &DataType) -> usize { pub unsafe fn serialize_column_to_rowformat( arena: &Bump, column: &Column, - select_index: &SelectVector, + select_vector: &SelectVector, rows: usize, address: &[*const u8], offset: usize, @@ -70,57 +69,51 @@ pub unsafe fn serialize_column_to_rowformat( Column::Null { .. } | Column::EmptyArray { .. } | Column::EmptyMap { .. } => {} Column::Number(v) => with_number_mapped_type!(|NUM_TYPE| match v { NumberColumn::NUM_TYPE(buffer) => { - for i in 0..rows { - let index = select_index.get_index(i); - store(&buffer[index], address[index].add(offset) as *mut u8); + for index in select_vector.iterator(rows) { + store(buffer[index], address[index].add(offset) as *mut u8); } } }), Column::Decimal(v) => { with_decimal_mapped_type!(|DECIMAL_TYPE| match v { DecimalColumn::DECIMAL_TYPE(buffer, _) => { - for i in 0..rows { - let index = select_index.get_index(i); - store(&buffer[index], address[index].add(offset) as *mut u8); + for index in select_vector.iterator(rows) { + store(buffer[index], address[index].add(offset) as *mut u8); } } }) } Column::Boolean(v) => { - for i in 0..rows { - let index = select_index.get_index(i); - store(&v.get_bit(index), address[index].add(offset) as *mut u8); + for index in select_vector.iterator(rows) { + store(v.get_bit(index), address[index].add(offset) as *mut u8); } } Column::String(v) | Column::Bitmap(v) | Column::Variant(v) => { - for i in 0..rows { - let index = select_index.get_index(i); + for index in select_vector.iterator(rows) { let data = arena.alloc_slice_copy(v.index_unchecked(index)); - store(&(data.len() as u32), address[index].add(offset) as *mut u8); + store(data.len() as u32, address[index].add(offset) as *mut u8); store( - &(data.as_ptr() as u64), + data.as_ptr() as u64, address[index].add(offset + 4) as *mut u8, ); } } Column::Timestamp(buffer) => { - for i in 0..rows { - let index = select_index.get_index(i); - store(&buffer[index], address[index].add(offset) as *mut u8); + for index in select_vector.iterator(rows) { + store(buffer[index], address[index].add(offset) as *mut u8); } } Column::Date(buffer) => { - for i in 0..rows { - let index = select_index.get_index(i); - store(&buffer[index], address[index].add(offset) as *mut u8); + for index in select_vector.iterator(rows) { + store(buffer[index], address[index].add(offset) as *mut u8); } } Column::Nullable(c) => serialize_column_to_rowformat( arena, &c.column, - select_index, + select_vector, rows, address, offset, @@ -139,7 +132,7 @@ pub unsafe fn serialize_column_to_rowformat( pub unsafe fn row_match_columns( cols: &[Column], 
address: &[*const u8], - select_index: &mut SelectVector, + select_vector: &mut SelectVector, count: usize, validity_offset: &[usize], col_offsets: &[usize], @@ -155,7 +148,7 @@ pub unsafe fn row_match_columns( row_match_column( col, address, - select_index, + select_vector, &mut count, *validity_offset, *col_offset, @@ -168,7 +161,7 @@ pub unsafe fn row_match_columns( pub unsafe fn row_match_column( col: &Column, address: &[*const u8], - select_index: &mut SelectVector, + select_vector: &mut SelectVector, count: &mut usize, validity_offset: usize, col_offset: usize, @@ -192,7 +185,7 @@ pub unsafe fn row_match_column( col, validity, address, - select_index, + select_vector, count, validity_offset, col_offset, @@ -206,7 +199,7 @@ pub unsafe fn row_match_column( col, validity, address, - select_index, + select_vector, count, validity_offset, col_offset, @@ -217,7 +210,7 @@ pub unsafe fn row_match_column( col, validity, address, - select_index, + select_vector, count, validity_offset, col_offset, @@ -229,7 +222,7 @@ pub unsafe fn row_match_column( col, validity, address, - select_index, + select_vector, count, validity_offset, col_offset, @@ -240,7 +233,7 @@ pub unsafe fn row_match_column( col, validity, address, - select_index, + select_vector, count, validity_offset, col_offset, @@ -251,7 +244,7 @@ pub unsafe fn row_match_column( col, validity, address, - select_index, + select_vector, count, validity_offset, col_offset, @@ -262,7 +255,7 @@ pub unsafe fn row_match_column( v, validity, address, - select_index, + select_vector, count, validity_offset, col_offset, @@ -274,7 +267,7 @@ pub unsafe fn row_match_column( v, validity, address, - select_index, + select_vector, count, validity_offset, col_offset, @@ -285,7 +278,7 @@ pub unsafe fn row_match_column( v, validity, address, - select_index, + select_vector, count, validity_offset, col_offset, @@ -303,7 +296,7 @@ unsafe fn row_match_string_column( col: &StringColumn, validity: Option<&Bitmap>, address: &[*const u8], - select_index: &mut SelectVector, + select_vector: &mut SelectVector, count: &mut usize, validity_offset: usize, col_offset: usize, @@ -314,60 +307,57 @@ unsafe fn row_match_string_column( let mut equal: bool; if let Some(validity) = validity { - for i in 0..*count { - let idx = select_index.get_index(i); - let isnull = !validity.get_bit(idx); + for idx in select_vector.sel_vec_mut(*count).iter_mut() { + let isnull = !validity.get_bit(*idx); - let validity_address = address[idx].add(validity_offset); - let isnull2 = load::(validity_address) != 0; + let validity_address = address[*idx].add(validity_offset); + let isnull2 = core::ptr::read::(validity_address as _) != 0; equal = isnull == isnull2; if !isnull && !isnull2 { - let len_address = address[idx].add(col_offset); - let address = address[idx].add(col_offset + 4); - let len = load::(len_address) as usize; + let len_address = address[*idx].add(col_offset); + let address = address[*idx].add(col_offset + 4); + let len = core::ptr::read::(len_address as _) as usize; - let value = StringType::index_column_unchecked(col, idx); + let value = StringType::index_column_unchecked(col, *idx); if len != value.len() { equal = false; } else { - let data_address = load::(address) as usize as *const u8; + let data_address = core::ptr::read::(address as _) as usize as *const u8; let scalar = std::slice::from_raw_parts(data_address, len); equal = scalar.eq(value); } } if equal { - select_index.set_index(match_count, idx); + *idx = match_count; match_count += 1; } else { - 
no_match.set_index(*no_match_count, idx); + no_match.set_index(*no_match_count, *idx); *no_match_count += 1; } } } else { - for i in 0..*count { - let idx = select_index.get_index(i); + for idx in select_vector.sel_vec_mut(*count).iter_mut() { + let len_address = address[*idx].add(col_offset); + let address = address[*idx].add(col_offset + 4); - let len_address = address[idx].add(col_offset); - let address = address[idx].add(col_offset + 4); + let len = core::ptr::read::(len_address as _) as usize; - let len = load::(len_address) as usize; - - let value = StringType::index_column_unchecked(col, idx); + let value = StringType::index_column_unchecked(col, *idx); if len != value.len() { equal = false; } else { - let data_address = load::(address) as usize as *const u8; + let data_address = core::ptr::read::(address as _) as usize as *const u8; let scalar = std::slice::from_raw_parts(data_address, len); equal = scalar.eq(value); } if equal { - select_index.set_index(match_count, idx); + *idx = match_count; match_count += 1; } else { - no_match.set_index(*no_match_count, idx); + no_match.set_index(*no_match_count, *idx); *no_match_count += 1; } } @@ -380,7 +370,7 @@ unsafe fn row_match_column_type( col: &Column, validity: Option<&Bitmap>, address: &[*const u8], - select_index: &mut SelectVector, + select_vector: &mut SelectVector, count: &mut usize, validity_offset: usize, col_offset: usize, @@ -393,43 +383,41 @@ unsafe fn row_match_column_type( let mut equal: bool; if let Some(validity) = validity { - for i in 0..*count { - let idx = select_index.get_index(i); - let isnull = !validity.get_bit(idx); + for idx in select_vector.sel_vec_mut(*count).iter_mut() { + let isnull = !validity.get_bit(*idx); - let validity_address = address[idx].add(validity_offset); - let isnull2 = load::(validity_address) != 0; + let validity_address = address[*idx].add(validity_offset); + let isnull2 = core::ptr::read::(validity_address as _) != 0; equal = isnull == isnull2; if !isnull && !isnull2 { - let address = address[idx].add(col_offset); - let scalar = load::<::Scalar>(address); - let value = T::index_column_unchecked(&col, idx); + let address = address[*idx].add(col_offset); + let scalar = core::ptr::read::<::Scalar>(address as _); + let value = T::index_column_unchecked(&col, *idx); let value = T::to_owned_scalar(value); equal = scalar.eq(&value); } if equal { - select_index.set_index(match_count, idx); + *idx = match_count; match_count += 1; } else { - no_match.set_index(*no_match_count, idx); + no_match.set_index(*no_match_count, *idx); *no_match_count += 1; } } } else { - for i in 0..*count { - let idx = select_index.get_index(i); - let value = T::index_column_unchecked(&col, idx); - let address = address[idx].add(col_offset); - let scalar = load::<::Scalar>(address); + for idx in select_vector.sel_vec_mut(*count).iter_mut() { + let value = T::index_column_unchecked(&col, *idx); + let address = address[*idx].add(col_offset); + let scalar = core::ptr::read::<::Scalar>(address as _); let value = T::to_owned_scalar(value); if scalar.eq(&value) { - select_index.set_index(match_count, idx); + *idx = match_count; match_count += 1; } else { - no_match.set_index(*no_match_count, idx); + no_match.set_index(*no_match_count, *idx); *no_match_count += 1; } } diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index e683a0df2cdf..809e9ea1b7b4 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ 
b/src/query/expression/src/aggregate/probe_state.rs @@ -13,12 +13,11 @@ // limitations under the License. use crate::select_vector::SelectVector; -use crate::Column; use crate::StateAddr; /// ProbeState is the state to probe HT /// It could be reuse during multiple probe process -#[derive(Default, Debug)] +#[derive(Debug)] pub struct ProbeState { pub ht_offsets: Vec, pub hash_salts: Vec, @@ -27,9 +26,7 @@ pub struct ProbeState { pub group_compare_vector: SelectVector, pub no_match_vector: SelectVector, pub empty_vector: SelectVector, - pub new_groups: SelectVector, - pub group_columns: Vec, pub row_count: usize, } @@ -37,15 +34,20 @@ unsafe impl Send for ProbeState {} unsafe impl Sync for ProbeState {} impl ProbeState { - pub fn adjust_group_columns( - &mut self, - group_columns: &[Column], - hashes: &[u64], - row_count: usize, - ht_size: usize, - ) { - self.group_columns = group_columns.to_owned(); - self.adjust_row_count(row_count); + pub fn with_capacity(len: usize) -> Self { + Self { + ht_offsets: vec![0; len], + hash_salts: vec![0; len], + addresses: vec![std::ptr::null::(); len], + state_places: vec![StateAddr::new(0); len], + group_compare_vector: SelectVector::new(len), + no_match_vector: SelectVector::new(len), + empty_vector: SelectVector::new(len), + row_count: 0, + } + } + pub fn adjust_group_columns(&mut self, hashes: &[u64], row_count: usize, ht_size: usize) { + self.adjust_vector(row_count); for ((hash, salt), ht_offset) in hashes .iter() @@ -57,8 +59,8 @@ impl ProbeState { } } - pub fn adjust_row_count(&mut self, row_count: usize) { - if self.row_count < row_count { + pub fn adjust_vector(&mut self, row_count: usize) { + if self.ht_offsets.len() < row_count { self.ht_offsets.resize(row_count, 0); self.hash_salts.resize(row_count, 0); self.addresses.resize(row_count, std::ptr::null::()); @@ -67,7 +69,6 @@ impl ProbeState { self.group_compare_vector.resize(row_count); self.no_match_vector.resize(row_count); self.empty_vector.resize(row_count); - self.new_groups.resize(row_count); } self.row_count = row_count; diff --git a/src/query/expression/src/kernels/utils.rs b/src/query/expression/src/kernels/utils.rs index 5ba51d81000a..205f52d364e2 100644 --- a/src/query/expression/src/kernels/utils.rs +++ b/src/query/expression/src/kernels/utils.rs @@ -68,15 +68,6 @@ pub unsafe fn set_vec_len_by_ptr(vec: &mut Vec, ptr: *const T) { /// # Safety /// # As: copy_nonoverlapping #[inline] -pub unsafe fn store(val: &T, ptr: *mut u8) { - std::ptr::copy_nonoverlapping(val as *const T as *const u8, ptr, std::mem::size_of::()); -} - -/// # Safety -/// # As: copy_nonoverlapping -#[inline] -pub unsafe fn load(ptr: *const u8) -> T { - let mut ret: T = std::mem::zeroed(); - std::ptr::copy_nonoverlapping(ptr as *const T, &mut ret, 1); - ret +pub unsafe fn store(val: T, ptr: *mut u8) { + core::ptr::write(ptr as _, val) } diff --git a/src/query/expression/src/utils/select_vector.rs b/src/query/expression/src/utils/select_vector.rs index 959d501d5d2d..136d04f9be47 100644 --- a/src/query/expression/src/utils/select_vector.rs +++ b/src/query/expression/src/utils/select_vector.rs @@ -54,17 +54,12 @@ impl SelectVector { // these function did not check index boundes // keep in mind when using them + #[inline] pub fn set_index(&mut self, idx: usize, loc: usize) { - #[cfg(debug_assertions)] - { - if self.sel_vector.len() <= idx { - panic!("index out of bound {}, {}", self.sel_vector.len(), idx); - } - } - self.sel_vector[idx] = loc; } + #[inline] pub fn get_index(&self, idx: usize) -> usize { if self.increment 
{ idx @@ -73,6 +68,18 @@ impl SelectVector { } } + pub fn iterator(&self, count: usize) -> Box + '_> { + if self.increment { + Box::new(0..count) + } else { + Box::new(self.sel_vector.iter().take(count).copied()) + } + } + + pub fn sel_vec_mut(&mut self, count: usize) -> &mut [usize] { + &mut self.sel_vector.as_mut_slice()[0..count] + } + pub fn swap(&mut self, i: usize, j: usize) { self.sel_vector.swap(i, j); } diff --git a/src/query/functions/tests/it/aggregates/agg_hashtable.rs b/src/query/functions/tests/it/aggregates/agg_hashtable.rs index 56b307ffc10f..9138e1e32a9d 100644 --- a/src/query/functions/tests/it/aggregates/agg_hashtable.rs +++ b/src/query/functions/tests/it/aggregates/agg_hashtable.rs @@ -54,6 +54,7 @@ use common_functions::aggregates::AggregateFunctionFactory; fn test_agg_hashtable() { let factory = AggregateFunctionFactory::instance(); let m: usize = 4; + const BATCH_SIZE: usize = 8192; for n in [100, 1000, 10_000, 100_000] { let columns = vec![ StringType::from_data((0..n).map(|x| format!("{}", x % m).as_bytes().to_vec())), @@ -89,7 +90,7 @@ fn test_agg_hashtable() { let arena1 = Arc::new(Bump::new()); let mut hashtable = AggregateHashTable::new(arena1, group_types.clone(), aggrs.clone()); - let mut state = ProbeState::default(); + let mut state = ProbeState::with_capacity(BATCH_SIZE); let _ = hashtable .add_groups(&mut state, &group_columns, ¶ms, n) .unwrap(); @@ -97,15 +98,15 @@ fn test_agg_hashtable() { let arena2 = Arc::new(Bump::new()); let mut hashtable2 = AggregateHashTable::new(arena2, group_types.clone(), aggrs.clone()); - let mut state2 = ProbeState::default(); + let mut state2 = ProbeState::with_capacity(BATCH_SIZE); let _ = hashtable2 .add_groups(&mut state2, &group_columns, ¶ms, n) .unwrap(); - let mut flush_state = PayloadFlushState::default(); + let mut flush_state = PayloadFlushState::with_capacity(BATCH_SIZE); let _ = hashtable.combine(hashtable2, &mut flush_state); - let mut merge_state = PayloadFlushState::default(); + let mut merge_state = PayloadFlushState::with_capacity(BATCH_SIZE); let mut blocks = Vec::new(); loop { diff --git a/src/query/service/src/pipelines/pipeline_builder.rs b/src/query/service/src/pipelines/pipeline_builder.rs index 837b66cc762d..a268958e948b 100644 --- a/src/query/service/src/pipelines/pipeline_builder.rs +++ b/src/query/service/src/pipelines/pipeline_builder.rs @@ -1965,6 +1965,7 @@ impl PipelineBuilder { aggregate.input.output_schema()?, &aggregate.group_by, &aggregate.agg_funcs, + self.settings.get_max_block_size()?, None, )?; @@ -2090,6 +2091,7 @@ impl PipelineBuilder { aggregate.before_group_by_schema.clone(), &aggregate.group_by, &aggregate.agg_funcs, + self.settings.get_max_block_size()?, aggregate.limit, )?; @@ -2188,6 +2190,7 @@ impl PipelineBuilder { input_schema: DataSchemaRef, group_by: &[IndexType], agg_funcs: &[AggregateFunctionDesc], + max_block_size: u64, limit: Option, ) -> Result> { let mut agg_args = Vec::with_capacity(agg_funcs.len()); @@ -2227,6 +2230,7 @@ impl PipelineBuilder { &group_by, &aggs, &agg_args, + max_block_size as usize, limit, )?; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs index ec4b70546a8c..81c6c9be8c4a 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs @@ -38,6 +38,7 @@ pub struct AggregatorParams { pub 
layout: Option, pub offsets_aggregate_states: Vec, + pub max_block_size: usize, // Limit is push down to AggregatorTransform pub limit: Option, } @@ -49,6 +50,7 @@ impl AggregatorParams { group_columns: &[usize], agg_funcs: &[AggregateFunctionRef], agg_args: &[Vec], + max_block_size: usize, limit: Option, ) -> Result> { let mut states_offsets: Vec = Vec::with_capacity(agg_funcs.len()); @@ -66,6 +68,7 @@ impl AggregatorParams { aggregate_functions_arguments: agg_args.to_vec(), layout: states_layout, offsets_aggregate_states: states_offsets, + max_block_size, limit, })) } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs index 077d4b540ef9..7f9ed604dd85 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs @@ -44,6 +44,7 @@ use crate::pipelines::processors::AggregatorParams; pub struct TransformFinalAggregate { method: Method, params: Arc, + flush_state: PayloadFlushState, } impl TransformFinalAggregate { @@ -53,10 +54,15 @@ impl TransformFinalAggregate { method: Method, params: Arc, ) -> Result> { + let max_block_size = params.max_block_size; Ok(Box::new(BlockMetaTransformer::create( input, output, - TransformFinalAggregate:: { method, params }, + TransformFinalAggregate:: { + method, + params, + flush_state: PayloadFlushState::with_capacity(max_block_size), + }, ))) } } @@ -183,8 +189,7 @@ where Method: HashMethodBounds AggregateMeta::AggregateHashTable((_, hashtable)) => { match agg_hashtable.as_mut() { Some(ht) => { - let mut flush_state = PayloadFlushState::default(); - ht.combine(hashtable, &mut flush_state)?; + ht.combine(hashtable, &mut self.flush_state)?; } None => agg_hashtable = Some(hashtable), } @@ -193,13 +198,12 @@ where Method: HashMethodBounds } if let Some(mut ht) = agg_hashtable { - let mut flush_state = PayloadFlushState::default(); - let mut blocks = vec![]; + self.flush_state.reset(); loop { - if ht.merge_result(&mut flush_state)? { - let mut cols = flush_state.aggregate_results.clone(); - cols.extend_from_slice(&flush_state.group_columns); + if ht.merge_result(&mut self.flush_state)? { + let mut cols = self.flush_state.take_aggregate_results(); + cols.extend_from_slice(&self.flush_state.group_columns); blocks.push(DataBlock::new_from_columns(cols)); } else { diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs index b4223f72f93b..bbeea38672c4 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs @@ -145,6 +145,8 @@ impl TransformPartialAggregate { )) }; + let max_block_size = ctx.get_settings().get_max_block_size()? 
as usize; + Ok(AccumulatingTransformer::create( input, output, @@ -152,7 +154,7 @@ impl TransformPartialAggregate { method, params, hash_table, - probe_state: ProbeState::default(), + probe_state: ProbeState::with_capacity(max_block_size), settings: AggregateSettings::try_from(ctx)?, }, )) diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs index d3867113bd02..7631910ebdce 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs @@ -38,6 +38,7 @@ use crate::pipelines::processors::AggregatorParams; pub struct TransformFinalGroupBy { method: Method, params: Arc, + flush_state: PayloadFlushState, } impl TransformFinalGroupBy { @@ -47,10 +48,15 @@ impl TransformFinalGroupBy { method: Method, params: Arc, ) -> Result> { + let max_block_size = params.max_block_size; Ok(Box::new(BlockMetaTransformer::create( input, output, - TransformFinalGroupBy:: { method, params }, + TransformFinalGroupBy:: { + method, + params, + flush_state: PayloadFlushState::with_capacity(max_block_size), + }, ))) } } @@ -106,8 +112,7 @@ where Method: HashMethodBounds AggregateMeta::AggregateHashTable((_, hashtable)) => { match agg_hashtable.as_mut() { Some(ht) => { - let mut flush_state = PayloadFlushState::default(); - ht.combine(hashtable, &mut flush_state)?; + ht.combine(hashtable, &mut self.flush_state)?; } None => agg_hashtable = Some(hashtable), } @@ -116,13 +121,12 @@ where Method: HashMethodBounds } if let Some(mut ht) = agg_hashtable { - let mut flush_state = PayloadFlushState::default(); - let mut blocks = vec![]; + self.flush_state.reset(); loop { - if ht.merge_result(&mut flush_state)? { + if ht.merge_result(&mut self.flush_state)? { blocks.push(DataBlock::new_from_columns( - flush_state.group_columns.clone(), + self.flush_state.take_group_columns(), )); } else { break; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs index 665462b5eba3..5edeba053b05 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs @@ -130,13 +130,14 @@ impl TransformPartialGroupBy { )) }; + let max_block_size = ctx.get_settings().get_max_block_size()? 
as usize; Ok(AccumulatingTransformer::create( input, output, TransformPartialGroupBy:: { method, hash_table, - probe_state: ProbeState::default(), + probe_state: ProbeState::with_capacity(max_block_size), group_columns: params.group_columns.clone(), settings: GroupBySettings::try_from(ctx)?, }, From 7592691fda0704b92557e46f4351869d68ad9b69 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Thu, 2 Nov 2023 19:59:27 +0800 Subject: [PATCH 03/28] remove select vector --- .../src/aggregate/aggregate_hashtable.rs | 34 ++++---- src/query/expression/src/aggregate/mod.rs | 2 + src/query/expression/src/aggregate/payload.rs | 8 +- .../expression/src/aggregate/payload_row.rs | 70 ++++++++------- .../expression/src/aggregate/probe_state.rs | 14 +-- src/query/expression/src/utils/mod.rs | 1 - .../expression/src/utils/select_vector.rs | 86 ------------------- 7 files changed, 69 insertions(+), 146 deletions(-) delete mode 100644 src/query/expression/src/utils/select_vector.rs diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 1c521c57cc33..1c3bfab06c40 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -23,7 +23,6 @@ use super::payload_flush::PayloadFlushState; use super::probe_state::ProbeState; use crate::aggregate::payload_row::row_match_columns; use crate::group_hash_columns; -use crate::select_vector::SelectVector; use crate::types::DataType; use crate::AggregateFunctionRef; use crate::Column; @@ -141,17 +140,23 @@ impl AggregateHashTable { let mut new_group_count = 0; let mut remaining_entries = row_count; - let mut select_vector = SelectVector::auto_increment(); let mut payload_page_offset = self.len() % self.payload.row_per_page; let mut payload_page_nr = (self.len() / self.payload.row_per_page) + 1; + let mut is_increment = true; while remaining_entries > 0 { let mut new_entry_count = 0; let mut need_compare_count = 0; let mut no_match_count = 0; // 1. inject new_group_count, new_entry_count, need_compare_count, no_match_count - for index in select_vector.iterator(remaining_entries) { + for i in 0..remaining_entries { + let index = if is_increment { + i + } else { + state.no_match_vector[i] + }; + let entry = &mut self.entries[state.ht_offsets[index]]; // cell is empty, could be occupied @@ -166,15 +171,13 @@ impl AggregateHashTable { payload_page_nr += 1; } - state.empty_vector.set_index(new_entry_count, index); + state.empty_vector[new_entry_count] = index; new_entry_count += 1; } else if entry.salt == state.hash_salts[index] { - state - .group_compare_vector - .set_index(need_compare_count, index); + state.group_compare_vector[need_compare_count] = index; need_compare_count += 1; } else { - state.no_match_vector.set_index(no_match_count, index); + state.no_match_vector[no_match_count] = index; no_match_count += 1; } } @@ -187,7 +190,12 @@ impl AggregateHashTable { } // 3. handle need_compare_count - for index in state.group_compare_vector.iterator(need_compare_count) { + for index in state + .group_compare_vector + .iter() + .take(need_compare_count) + .copied() + { let entry = &mut self.entries[state.ht_offsets[index]]; let page_ptr = self.payload.get_page_ptr((entry.page_nr - 1) as usize); @@ -211,7 +219,7 @@ impl AggregateHashTable { } // 5. 
Linear probing - for index in state.no_match_vector.iterator(no_match_count) { + for index in state.no_match_vector.iter().take(no_match_count).copied() { state.ht_offsets[index] += 1; if state.ht_offsets[index] >= self.capacity { @@ -219,11 +227,7 @@ impl AggregateHashTable { } } - if select_vector.is_auto_increment() { - select_vector = state.no_match_vector.clone(); - } else { - std::mem::swap(&mut select_vector, &mut state.no_match_vector); - } + is_increment = false; remaining_entries = no_match_count; } diff --git a/src/query/expression/src/aggregate/mod.rs b/src/query/expression/src/aggregate/mod.rs index d3adb408663c..ac8c81e7063d 100644 --- a/src/query/expression/src/aggregate/mod.rs +++ b/src/query/expression/src/aggregate/mod.rs @@ -30,3 +30,5 @@ pub use aggregate_hashtable::*; pub use group_hash::*; pub use payload_flush::*; pub use probe_state::*; + +pub type SelectVector = Vec; diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index e2d8cd0e35c5..43e2d9206e22 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -151,7 +151,7 @@ impl Payload { ) { self.try_reverse(new_group_rows); let select_vector = &state.empty_vector; - for idx in select_vector.iterator(new_group_rows) { + for idx in select_vector.iter().take(new_group_rows).copied() { state.addresses[idx] = self.get_row_ptr(self.current_row); self.current_row += 1; } @@ -163,7 +163,7 @@ impl Payload { for col in group_columns { if let Column::Nullable(c) = col { let bitmap = &c.validity; - for idx in select_vector.iterator(new_group_rows) { + for idx in select_vector.iter().take(new_group_rows).copied() { if bitmap.get_bit(idx) { unsafe { let dst = address[idx].add(write_offset); @@ -192,7 +192,7 @@ impl Payload { } // write group hashes - for idx in select_vector.iterator(new_group_rows) { + for idx in select_vector.iter().take(new_group_rows).copied() { unsafe { let dst = address[idx].add(write_offset); store(group_hashes[idx], dst as *mut u8); @@ -202,7 +202,7 @@ impl Payload { write_offset += 8; if let Some(layout) = self.state_layout { // write states - for idx in select_vector.iterator(new_group_rows) { + for idx in select_vector.iter().take(new_group_rows).copied() { let place = self.arena.alloc_layout(layout); unsafe { let dst = address[idx].add(write_offset); diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index 53b23e9da83f..1f3efd0b97b1 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -16,7 +16,6 @@ use bumpalo::Bump; use common_arrow::arrow::bitmap::Bitmap; use ethnum::i256; -use crate::select_vector::SelectVector; use crate::store; use crate::types::decimal::DecimalColumn; use crate::types::decimal::DecimalType; @@ -33,6 +32,7 @@ use crate::types::ValueType; use crate::with_decimal_mapped_type; use crate::with_number_mapped_type; use crate::Column; +use crate::SelectVector; pub fn rowformat_size(data_type: &DataType) -> usize { match data_type { @@ -69,7 +69,7 @@ pub unsafe fn serialize_column_to_rowformat( Column::Null { .. } | Column::EmptyArray { .. } | Column::EmptyMap { .. 
} => {} Column::Number(v) => with_number_mapped_type!(|NUM_TYPE| match v { NumberColumn::NUM_TYPE(buffer) => { - for index in select_vector.iterator(rows) { + for index in select_vector.iter().take(rows).copied() { store(buffer[index], address[index].add(offset) as *mut u8); } } @@ -77,19 +77,19 @@ pub unsafe fn serialize_column_to_rowformat( Column::Decimal(v) => { with_decimal_mapped_type!(|DECIMAL_TYPE| match v { DecimalColumn::DECIMAL_TYPE(buffer, _) => { - for index in select_vector.iterator(rows) { + for index in select_vector.iter().take(rows).copied() { store(buffer[index], address[index].add(offset) as *mut u8); } } }) } Column::Boolean(v) => { - for index in select_vector.iterator(rows) { + for index in select_vector.iter().take(rows).copied() { store(v.get_bit(index), address[index].add(offset) as *mut u8); } } Column::String(v) | Column::Bitmap(v) | Column::Variant(v) => { - for index in select_vector.iterator(rows) { + for index in select_vector.iter().take(rows).copied() { let data = arena.alloc_slice_copy(v.index_unchecked(index)); store(data.len() as u32, address[index].add(offset) as *mut u8); @@ -101,12 +101,12 @@ pub unsafe fn serialize_column_to_rowformat( } } Column::Timestamp(buffer) => { - for index in select_vector.iterator(rows) { + for index in select_vector.iter().take(rows).copied() { store(buffer[index], address[index].add(offset) as *mut u8); } } Column::Date(buffer) => { - for index in select_vector.iterator(rows) { + for index in select_vector.iter().take(rows).copied() { store(buffer[index], address[index].add(offset) as *mut u8); } } @@ -307,19 +307,20 @@ unsafe fn row_match_string_column( let mut equal: bool; if let Some(validity) = validity { - for idx in select_vector.sel_vec_mut(*count).iter_mut() { - let isnull = !validity.get_bit(*idx); + for i in 0..*count { + let idx = select_vector[i]; + let isnull = !validity.get_bit(idx); - let validity_address = address[*idx].add(validity_offset); + let validity_address = address[idx].add(validity_offset); let isnull2 = core::ptr::read::(validity_address as _) != 0; equal = isnull == isnull2; if !isnull && !isnull2 { - let len_address = address[*idx].add(col_offset); - let address = address[*idx].add(col_offset + 4); + let len_address = address[idx].add(col_offset); + let address = address[idx].add(col_offset + 4); let len = core::ptr::read::(len_address as _) as usize; - let value = StringType::index_column_unchecked(col, *idx); + let value = StringType::index_column_unchecked(col, idx); if len != value.len() { equal = false; } else { @@ -330,21 +331,22 @@ unsafe fn row_match_string_column( } if equal { - *idx = match_count; + select_vector[match_count] = idx; match_count += 1; } else { - no_match.set_index(*no_match_count, *idx); + no_match[*no_match_count] = idx; *no_match_count += 1; } } } else { - for idx in select_vector.sel_vec_mut(*count).iter_mut() { - let len_address = address[*idx].add(col_offset); - let address = address[*idx].add(col_offset + 4); + for i in 0..*count { + let idx = select_vector[i]; + let len_address = address[idx].add(col_offset); + let address = address[idx].add(col_offset + 4); let len = core::ptr::read::(len_address as _) as usize; - let value = StringType::index_column_unchecked(col, *idx); + let value = StringType::index_column_unchecked(col, idx); if len != value.len() { equal = false; } else { @@ -354,10 +356,10 @@ unsafe fn row_match_string_column( } if equal { - *idx = match_count; + select_vector[match_count] = idx; match_count += 1; } else { - 
no_match.set_index(*no_match_count, *idx); + no_match[*no_match_count] = idx; *no_match_count += 1; } } @@ -383,41 +385,43 @@ unsafe fn row_match_column_type( let mut equal: bool; if let Some(validity) = validity { - for idx in select_vector.sel_vec_mut(*count).iter_mut() { - let isnull = !validity.get_bit(*idx); + for i in 0..*count { + let idx = select_vector[i]; + let isnull = !validity.get_bit(idx); - let validity_address = address[*idx].add(validity_offset); + let validity_address = address[idx].add(validity_offset); let isnull2 = core::ptr::read::(validity_address as _) != 0; equal = isnull == isnull2; if !isnull && !isnull2 { - let address = address[*idx].add(col_offset); + let address = address[idx].add(col_offset); let scalar = core::ptr::read::<::Scalar>(address as _); - let value = T::index_column_unchecked(&col, *idx); + let value = T::index_column_unchecked(&col, idx); let value = T::to_owned_scalar(value); equal = scalar.eq(&value); } if equal { - *idx = match_count; + select_vector[match_count] = idx; match_count += 1; } else { - no_match.set_index(*no_match_count, *idx); + no_match[*no_match_count] = idx; *no_match_count += 1; } } } else { - for idx in select_vector.sel_vec_mut(*count).iter_mut() { - let value = T::index_column_unchecked(&col, *idx); - let address = address[*idx].add(col_offset); + for i in 0..*count { + let idx = select_vector[i]; + let value = T::index_column_unchecked(&col, idx); + let address = address[idx].add(col_offset); let scalar = core::ptr::read::<::Scalar>(address as _); let value = T::to_owned_scalar(value); if scalar.eq(&value) { - *idx = match_count; + select_vector[match_count] = idx; match_count += 1; } else { - no_match.set_index(*no_match_count, *idx); + no_match[*no_match_count] = idx; *no_match_count += 1; } } diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index 809e9ea1b7b4..4f70490c3ff8 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
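// Illustrative sketch, not part of the patch: with SelectVector reduced to a plain
// Vec<usize>, the row-match routines above compact the surviving indexes in place:
// rows that still compare equal are written back to the front of the selection
// vector, the rest are appended to `no_match`, and `count` shrinks to the number of
// survivors. A minimal standalone version of that pattern, with an arbitrary
// predicate standing in for the raw-pointer column comparison (both slices are
// assumed to be pre-allocated with at least `count` slots):
fn partition_matches(
    select_vector: &mut [usize],
    no_match: &mut [usize],
    no_match_count: &mut usize,
    count: &mut usize,
    is_match: impl Fn(usize) -> bool,
) {
    let mut match_count = 0;
    for i in 0..*count {
        let idx = select_vector[i];
        if is_match(idx) {
            // survivor: keep it at the front of the selection vector
            select_vector[match_count] = idx;
            match_count += 1;
        } else {
            // mismatch: queue it for the next probe round
            no_match[*no_match_count] = idx;
            *no_match_count += 1;
        }
    }
    // only the first `match_count` entries of the selection vector stay valid
    *count = match_count;
}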
-use crate::select_vector::SelectVector; +use crate::SelectVector; use crate::StateAddr; /// ProbeState is the state to probe HT @@ -40,9 +40,9 @@ impl ProbeState { hash_salts: vec![0; len], addresses: vec![std::ptr::null::(); len], state_places: vec![StateAddr::new(0); len], - group_compare_vector: SelectVector::new(len), - no_match_vector: SelectVector::new(len), - empty_vector: SelectVector::new(len), + group_compare_vector: vec![0; len], + no_match_vector: vec![0; len], + empty_vector: vec![0; len], row_count: 0, } } @@ -66,9 +66,9 @@ impl ProbeState { self.addresses.resize(row_count, std::ptr::null::()); self.state_places.resize(row_count, StateAddr::new(0)); - self.group_compare_vector.resize(row_count); - self.no_match_vector.resize(row_count); - self.empty_vector.resize(row_count); + self.group_compare_vector.resize(row_count, 0); + self.no_match_vector.resize(row_count, 0); + self.empty_vector.resize(row_count, 0); } self.row_count = row_count; diff --git a/src/query/expression/src/utils/mod.rs b/src/query/expression/src/utils/mod.rs index 2bc5ffc6be12..86866a8a3f61 100644 --- a/src/query/expression/src/utils/mod.rs +++ b/src/query/expression/src/utils/mod.rs @@ -20,7 +20,6 @@ mod column_from; pub mod date_helper; pub mod display; pub mod filter_helper; -pub mod select_vector; pub mod serialize; pub mod udf_client; pub mod variant_transform; diff --git a/src/query/expression/src/utils/select_vector.rs b/src/query/expression/src/utils/select_vector.rs deleted file mode 100644 index 136d04f9be47..000000000000 --- a/src/query/expression/src/utils/select_vector.rs +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
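// Illustrative sketch, not part of the patch: this whole file goes away because a
// selection vector is now just the `SelectVector = Vec<usize>` alias added in
// aggregate/mod.rs above, so the old wrapper methods map onto plain Vec/slice
// operations. A rough correspondence (names used here are only for illustration):
fn select_vector_equivalents() {
    let mut sel: Vec<usize> = vec![0; 8];                     // SelectVector::new(8)
    sel[0] = 42;                                              // sel.set_index(0, 42)
    let _loc = sel[0];                                        // sel.get_index(0)
    sel.swap(0, 1);                                           // sel.swap(0, 1)
    let count = 4;
    let _sum: usize = sel.iter().take(count).copied().sum();  // sel.iterator(count)
    let _head: &mut [usize] = &mut sel[..count];              // sel.sel_vec_mut(count)
}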
- -#[derive(Debug, Clone)] -pub struct SelectVector { - increment: bool, - sel_vector: Vec, -} - -impl Default for SelectVector { - fn default() -> Self { - Self { - increment: true, - sel_vector: vec![], - } - } -} - -impl SelectVector { - pub fn auto_increment() -> Self { - Self::default() - } - - pub fn is_auto_increment(&self) -> bool { - self.increment - } - - pub fn new(size: usize) -> Self { - Self { - increment: false, - sel_vector: vec![0; size], - } - } - - pub fn resize(&mut self, new_len: usize) { - self.increment = false; - self.sel_vector.resize(new_len, 0); - } - - pub fn with_sel(&mut self, sel_vection: Vec) { - self.sel_vector = sel_vection; - } - - // these function did not check index boundes - // keep in mind when using them - #[inline] - pub fn set_index(&mut self, idx: usize, loc: usize) { - self.sel_vector[idx] = loc; - } - - #[inline] - pub fn get_index(&self, idx: usize) -> usize { - if self.increment { - idx - } else { - self.sel_vector[idx] - } - } - - pub fn iterator(&self, count: usize) -> Box + '_> { - if self.increment { - Box::new(0..count) - } else { - Box::new(self.sel_vector.iter().take(count).copied()) - } - } - - pub fn sel_vec_mut(&mut self, count: usize) -> &mut [usize] { - &mut self.sel_vector.as_mut_slice()[0..count] - } - - pub fn swap(&mut self, i: usize, j: usize) { - self.sel_vector.swap(i, j); - } -} From fadc04fbe319c900742c8398d915b7474123886e Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Thu, 2 Nov 2023 23:57:36 +0800 Subject: [PATCH 04/28] update --- .../src/aggregate/aggregate_hashtable.rs | 47 +++++------ .../expression/src/aggregate/group_hash.rs | 77 ++++++++++++++++++- src/query/expression/src/aggregate/payload.rs | 13 +++- .../expression/src/aggregate/probe_state.rs | 20 +---- 4 files changed, 103 insertions(+), 54 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 1c3bfab06c40..476b6d6e44c4 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -135,7 +135,7 @@ impl AggregateHashTable { self.resize(new_capacity); } - state.adjust_group_columns(hashes, row_count, self.capacity); + state.adjust_vector(row_count); let mut new_group_count = 0; let mut remaining_entries = row_count; @@ -143,7 +143,7 @@ impl AggregateHashTable { let mut payload_page_offset = self.len() % self.payload.row_per_page; let mut payload_page_nr = (self.len() / self.payload.row_per_page) + 1; - let mut is_increment = true; + let mut iter_times = 0; while remaining_entries > 0 { let mut new_entry_count = 0; let mut need_compare_count = 0; @@ -151,17 +151,20 @@ impl AggregateHashTable { // 1. 
inject new_group_count, new_entry_count, need_compare_count, no_match_count for i in 0..remaining_entries { - let index = if is_increment { + let index = if iter_times == 0 { i } else { state.no_match_vector[i] }; - let entry = &mut self.entries[state.ht_offsets[index]]; + let ht_offset = (hashes[index] as usize + iter_times) & (self.capacity - 1); + let salt = (hashes[index] >> (64 - 16)) as u16; + + let entry = &mut self.entries[ht_offset]; // cell is empty, could be occupied if entry.page_nr == 0 { - entry.salt = state.hash_salts[index]; + entry.salt = salt; entry.page_nr = payload_page_nr as u32; entry.page_offset = payload_page_offset as u16; @@ -169,11 +172,17 @@ impl AggregateHashTable { if payload_page_offset == self.payload.row_per_page { payload_page_offset = 0; payload_page_nr += 1; + + self.payload.try_extend_page(payload_page_nr - 1); } state.empty_vector[new_entry_count] = index; new_entry_count += 1; - } else if entry.salt == state.hash_salts[index] { + } else if entry.salt == salt { + let page_ptr = self.payload.get_page_ptr((entry.page_nr - 1) as usize); + let page_offset = entry.page_offset as usize * self.payload.tuple_size; + state.addresses[index] = unsafe { page_ptr.add(page_offset) }; + state.group_compare_vector[need_compare_count] = index; need_compare_count += 1; } else { @@ -190,19 +199,7 @@ impl AggregateHashTable { } // 3. handle need_compare_count - for index in state - .group_compare_vector - .iter() - .take(need_compare_count) - .copied() - { - let entry = &mut self.entries[state.ht_offsets[index]]; - - let page_ptr = self.payload.get_page_ptr((entry.page_nr - 1) as usize); - let page_offset = entry.page_offset as usize * self.payload.tuple_size; - - state.addresses[index] = unsafe { page_ptr.add(page_offset) }; - } + // already inject addresses to state.addresses // 4. compare unsafe { @@ -218,16 +215,8 @@ impl AggregateHashTable { ); } - // 5. Linear probing - for index in state.no_match_vector.iter().take(no_match_count).copied() { - state.ht_offsets[index] += 1; - - if state.ht_offsets[index] >= self.capacity { - state.ht_offsets[index] = 0; - } - } - - is_increment = false; + // 5. Linear probing, just increase iter_times + iter_times += 1; remaining_entries = no_match_count; } diff --git a/src/query/expression/src/aggregate/group_hash.rs b/src/query/expression/src/aggregate/group_hash.rs index 129f79d4545d..6971cca8c5dc 100644 --- a/src/query/expression/src/aggregate/group_hash.rs +++ b/src/query/expression/src/aggregate/group_hash.rs @@ -40,10 +40,7 @@ pub fn group_hash_columns(cols: &[Column]) -> Vec { if cols.len() > 1 { for col in &cols[1..] 
{ - let col_values = group_hash_column(col); - for (val, v) in values.iter_mut().zip(col_values) { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ v; - } + combine_roup_hash_column(&col, &mut values); } } values @@ -100,8 +97,80 @@ pub fn group_hash_column(c: &Column) -> Vec { } } +pub fn combine_roup_hash_column(c: &Column, values: &mut Vec) { + match c.data_type() { + DataType::Null => {} + DataType::EmptyArray => {} + DataType::EmptyMap => {} + DataType::Number(v) => with_number_mapped_type!(|NUM_TYPE| match v { + NumberDataType::NUM_TYPE => { + combine_group_hash_type_column::>(c, values) + } + }), + DataType::Decimal(v) => match v { + DecimalDataType::Decimal128(_) => { + combine_group_hash_type_column::>(c, values) + } + DecimalDataType::Decimal256(_) => { + combine_group_hash_type_column::>(c, values) + } + }, + DataType::Boolean => combine_group_hash_type_column::(c, values), + + DataType::String => { + let c = StringType::try_downcast_column(c).unwrap(); + for (x, val) in StringType::iter_column(&c).zip(values.iter_mut()) { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + } + } + DataType::Bitmap => { + let c = BitmapType::try_downcast_column(c).unwrap(); + for (x, val) in StringType::iter_column(&c).zip(values.iter_mut()) { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + } + } + DataType::Variant => { + let c = VariantType::try_downcast_column(c).unwrap(); + for (x, val) in StringType::iter_column(&c).zip(values.iter_mut()) { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + } + } + + DataType::Timestamp => combine_group_hash_type_column::(c, values), + DataType::Date => combine_group_hash_type_column::(c, values), + DataType::Nullable(_) => { + let col = c.as_nullable().unwrap(); + let values2 = group_hash_column(&col.column); + + for ((x, val), ok) in values2 + .iter() + .zip(values.iter_mut()) + .zip(col.validity.iter()) + { + if ok { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x; + } else { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ NULL_HASH_VAL; + } + } + } + DataType::Tuple(_) => todo!(), + DataType::Array(_) => todo!(), + DataType::Map(_) => todo!(), + DataType::Generic(_) => unreachable!(), + } +} + fn group_hash_type_column(col: &Column) -> Vec where for<'a> T::ScalarRef<'a>: FastHash { let c = T::try_downcast_column(col).unwrap(); T::iter_column(&c).map(|x| x.fast_hash()).collect() } + +fn combine_group_hash_type_column(col: &Column, values: &mut Vec) +where for<'a> T::ScalarRef<'a>: FastHash { + let c = T::try_downcast_column(col).unwrap(); + for (x, val) in T::iter_column(&c).zip(values.iter_mut()) { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + } +} diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 43e2d9206e22..c6cac466b9fc 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -98,13 +98,15 @@ impl Payload { tuple_size += 8; } + let row_per_page = (u16::MAX as usize).min(MAX_PAGE_SIZE / tuple_size).max(1); + Self { arena, - pages: vec![], + pages: vec![vec![0; row_per_page * tuple_size]], group_types, aggrs, tuple_size, - row_per_page: (u16::MAX as usize).min(MAX_PAGE_SIZE / tuple_size).max(1), + row_per_page, current_row: 0, group_offsets, group_sizes, @@ -134,6 +136,13 @@ impl Payload { } } + pub fn try_extend_page(&mut self, page_nr: usize) { + while page_nr >= self.pages.len() { + self.pages + .push(vec![0; self.row_per_page * self.tuple_size]); + } + } + pub fn 
get_row_ptr(&self, row: usize) -> *const u8 { let page = row / self.row_per_page; let page_ptr = self.get_page_ptr(page); diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index 4f70490c3ff8..8596af57d85c 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -19,8 +19,6 @@ use crate::StateAddr; /// It could be reuse during multiple probe process #[derive(Debug)] pub struct ProbeState { - pub ht_offsets: Vec, - pub hash_salts: Vec, pub addresses: Vec<*const u8>, pub state_places: Vec, pub group_compare_vector: SelectVector, @@ -36,8 +34,6 @@ unsafe impl Sync for ProbeState {} impl ProbeState { pub fn with_capacity(len: usize) -> Self { Self { - ht_offsets: vec![0; len], - hash_salts: vec![0; len], addresses: vec![std::ptr::null::(); len], state_places: vec![StateAddr::new(0); len], group_compare_vector: vec![0; len], @@ -46,23 +42,9 @@ impl ProbeState { row_count: 0, } } - pub fn adjust_group_columns(&mut self, hashes: &[u64], row_count: usize, ht_size: usize) { - self.adjust_vector(row_count); - - for ((hash, salt), ht_offset) in hashes - .iter() - .zip(self.hash_salts.iter_mut()) - .zip(self.ht_offsets.iter_mut()) - { - *salt = (*hash >> (64 - 16)) as u16; - *ht_offset = (*hash & (ht_size as u64 - 1)) as usize; - } - } pub fn adjust_vector(&mut self, row_count: usize) { - if self.ht_offsets.len() < row_count { - self.ht_offsets.resize(row_count, 0); - self.hash_salts.resize(row_count, 0); + if self.no_match_vector.len() < row_count { self.addresses.resize(row_count, std::ptr::null::()); self.state_places.resize(row_count, StateAddr::new(0)); From 25dd0e8d492ddcec59ceb12866860db4c560b15e Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 3 Nov 2023 17:14:55 +0800 Subject: [PATCH 05/28] improve nullable --- .../src/aggregate/aggregate_hashtable.rs | 5 +- src/query/expression/src/aggregate/payload.rs | 14 +++- .../expression/src/aggregate/payload_row.rs | 78 +++++++++++++------ .../expression/src/aggregate/probe_state.rs | 3 + 4 files changed, 73 insertions(+), 27 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 476b6d6e44c4..2db7641dc2cf 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -144,6 +144,8 @@ impl AggregateHashTable { let mut payload_page_nr = (self.len() / self.payload.row_per_page) + 1; let mut iter_times = 0; + let entries = &mut self.entries; + while remaining_entries > 0 { let mut new_entry_count = 0; let mut need_compare_count = 0; @@ -160,7 +162,7 @@ impl AggregateHashTable { let ht_offset = (hashes[index] as usize + iter_times) & (self.capacity - 1); let salt = (hashes[index] >> (64 - 16)) as u16; - let entry = &mut self.entries[ht_offset]; + let entry = &mut entries[ht_offset]; // cell is empty, could be occupied if entry.page_nr == 0 { @@ -207,6 +209,7 @@ impl AggregateHashTable { group_columns, &state.addresses, &mut state.group_compare_vector, + &mut state.temp_vector, need_compare_count, &self.payload.validity_offsets, &self.payload.group_offsets, diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index c6cac466b9fc..0107bc7a3746 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -172,13 +172,23 @@ impl Payload { 
for col in group_columns { if let Column::Nullable(c) = col { let bitmap = &c.validity; - for idx in select_vector.iter().take(new_group_rows).copied() { - if bitmap.get_bit(idx) { + if bitmap.unset_bits() == 0 { + // faster path + for idx in select_vector.iter().take(new_group_rows).copied() { unsafe { let dst = address[idx].add(write_offset); store(1, dst as *mut u8); } } + } else if bitmap.unset_bits() != bitmap.len() { + for idx in select_vector.iter().take(new_group_rows).copied() { + if bitmap.get_bit(idx) { + unsafe { + let dst = address[idx].add(write_offset); + store(1, dst as *mut u8); + } + } + } } write_offset += 1; } diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index 1f3efd0b97b1..2b34392bf83d 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -84,8 +84,17 @@ pub unsafe fn serialize_column_to_rowformat( }) } Column::Boolean(v) => { - for index in select_vector.iter().take(rows).copied() { - store(v.get_bit(index), address[index].add(offset) as *mut u8); + if v.unset_bits() == 0 { + // faster path + for index in select_vector.iter().take(rows).copied() { + store(1, address[index].add(offset) as *mut u8); + } + } else if v.unset_bits() != v.len() { + for index in select_vector.iter().take(rows).copied() { + if v.get_bit(index) { + store(1, address[index].add(offset) as *mut u8); + } + } } } Column::String(v) | Column::Bitmap(v) | Column::Variant(v) => { @@ -133,6 +142,7 @@ pub unsafe fn row_match_columns( cols: &[Column], address: &[*const u8], select_vector: &mut SelectVector, + temp_vector: &mut SelectVector, count: usize, validity_offset: &[usize], col_offsets: &[usize], @@ -149,6 +159,7 @@ pub unsafe fn row_match_columns( col, address, select_vector, + temp_vector, &mut count, *validity_offset, *col_offset, @@ -162,6 +173,7 @@ pub unsafe fn row_match_column( col: &Column, address: &[*const u8], select_vector: &mut SelectVector, + temp_vector: &mut SelectVector, count: &mut usize, validity_offset: usize, col_offset: usize, @@ -186,6 +198,7 @@ pub unsafe fn row_match_column( validity, address, select_vector, + temp_vector, count, validity_offset, col_offset, @@ -200,6 +213,7 @@ pub unsafe fn row_match_column( validity, address, select_vector, + temp_vector, count, validity_offset, col_offset, @@ -211,6 +225,7 @@ pub unsafe fn row_match_column( validity, address, select_vector, + temp_vector, count, validity_offset, col_offset, @@ -223,6 +238,7 @@ pub unsafe fn row_match_column( validity, address, select_vector, + temp_vector, count, validity_offset, col_offset, @@ -234,6 +250,7 @@ pub unsafe fn row_match_column( validity, address, select_vector, + temp_vector, count, validity_offset, col_offset, @@ -245,6 +262,7 @@ pub unsafe fn row_match_column( validity, address, select_vector, + temp_vector, count, validity_offset, col_offset, @@ -256,6 +274,7 @@ pub unsafe fn row_match_column( validity, address, select_vector, + temp_vector, count, validity_offset, col_offset, @@ -268,6 +287,7 @@ pub unsafe fn row_match_column( validity, address, select_vector, + temp_vector, count, validity_offset, col_offset, @@ -279,6 +299,7 @@ pub unsafe fn row_match_column( validity, address, select_vector, + temp_vector, count, validity_offset, col_offset, @@ -297,6 +318,7 @@ unsafe fn row_match_string_column( validity: Option<&Bitmap>, address: &[*const u8], select_vector: &mut SelectVector, + temp_vector: &mut SelectVector, count: &mut usize, validity_offset: 
usize, col_offset: usize, @@ -307,15 +329,16 @@ unsafe fn row_match_string_column( let mut equal: bool; if let Some(validity) = validity { - for i in 0..*count { - let idx = select_vector[i]; - let isnull = !validity.get_bit(idx); - + for (idx, is_set) in select_vector + .iter() + .take(*count) + .copied() + .zip(validity.iter()) + { let validity_address = address[idx].add(validity_offset); - let isnull2 = core::ptr::read::(validity_address as _) != 0; + let is_set2 = core::ptr::read::(validity_address as _) != 0; - equal = isnull == isnull2; - if !isnull && !isnull2 { + if is_set && is_set2 { let len_address = address[idx].add(col_offset); let address = address[idx].add(col_offset + 4); let len = core::ptr::read::(len_address as _) as usize; @@ -328,10 +351,12 @@ unsafe fn row_match_string_column( let scalar = std::slice::from_raw_parts(data_address, len); equal = scalar.eq(value); } + } else { + equal = is_set == is_set2; } if equal { - select_vector[match_count] = idx; + temp_vector[match_count] = idx; match_count += 1; } else { no_match[*no_match_count] = idx; @@ -339,8 +364,7 @@ unsafe fn row_match_string_column( } } } else { - for i in 0..*count { - let idx = select_vector[i]; + for idx in select_vector.iter().take(*count).copied() { let len_address = address[idx].add(col_offset); let address = address[idx].add(col_offset + 4); @@ -356,7 +380,7 @@ unsafe fn row_match_string_column( } if equal { - select_vector[match_count] = idx; + temp_vector[match_count] = idx; match_count += 1; } else { no_match[*no_match_count] = idx; @@ -365,6 +389,8 @@ unsafe fn row_match_string_column( } } + std::mem::swap(select_vector, temp_vector); + *count = match_count; } @@ -373,6 +399,7 @@ unsafe fn row_match_column_type( validity: Option<&Bitmap>, address: &[*const u8], select_vector: &mut SelectVector, + temp_vector: &mut SelectVector, count: &mut usize, validity_offset: usize, col_offset: usize, @@ -385,24 +412,27 @@ unsafe fn row_match_column_type( let mut equal: bool; if let Some(validity) = validity { - for i in 0..*count { - let idx = select_vector[i]; - let isnull = !validity.get_bit(idx); - + for (idx, is_set) in select_vector + .iter() + .take(*count) + .copied() + .zip(validity.iter()) + { let validity_address = address[idx].add(validity_offset); - let isnull2 = core::ptr::read::(validity_address as _) != 0; + let is_set2 = core::ptr::read::(validity_address as _) != 0; - equal = isnull == isnull2; - if !isnull && !isnull2 { + if is_set && is_set { let address = address[idx].add(col_offset); let scalar = core::ptr::read::<::Scalar>(address as _); let value = T::index_column_unchecked(&col, idx); let value = T::to_owned_scalar(value); equal = scalar.eq(&value); + } else { + equal = is_set == is_set2; } if equal { - select_vector[match_count] = idx; + temp_vector[match_count] = idx; match_count += 1; } else { no_match[*no_match_count] = idx; @@ -410,15 +440,14 @@ unsafe fn row_match_column_type( } } } else { - for i in 0..*count { - let idx = select_vector[i]; + for idx in select_vector.iter().take(*count).copied() { let value = T::index_column_unchecked(&col, idx); let address = address[idx].add(col_offset); let scalar = core::ptr::read::<::Scalar>(address as _); let value = T::to_owned_scalar(value); if scalar.eq(&value) { - select_vector[match_count] = idx; + temp_vector[match_count] = idx; match_count += 1; } else { no_match[*no_match_count] = idx; @@ -427,5 +456,6 @@ unsafe fn row_match_column_type( } } + std::mem::swap(select_vector, temp_vector); *count = match_count; } diff --git 
a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index 8596af57d85c..238702b89b92 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -24,6 +24,7 @@ pub struct ProbeState { pub group_compare_vector: SelectVector, pub no_match_vector: SelectVector, pub empty_vector: SelectVector, + pub temp_vector: SelectVector, pub row_count: usize, } @@ -39,6 +40,7 @@ impl ProbeState { group_compare_vector: vec![0; len], no_match_vector: vec![0; len], empty_vector: vec![0; len], + temp_vector: vec![0; len], row_count: 0, } } @@ -51,6 +53,7 @@ impl ProbeState { self.group_compare_vector.resize(row_count, 0); self.no_match_vector.resize(row_count, 0); self.empty_vector.resize(row_count, 0); + self.temp_vector.resize(row_count, 0); } self.row_count = row_count; From 59a95ba625b201ea98c3cf09f6458d2be272be07 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 3 Nov 2023 18:16:36 +0800 Subject: [PATCH 06/28] improve nullable --- .../src/aggregate/aggregate_hashtable.rs | 13 +- .../expression/src/aggregate/group_hash.rs | 158 ++++++++---------- src/query/expression/src/aggregate/payload.rs | 3 +- .../expression/src/aggregate/payload_flush.rs | 7 +- .../expression/src/aggregate/probe_state.rs | 5 +- 5 files changed, 82 insertions(+), 104 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 2db7641dc2cf..ca097c2b18fa 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -88,8 +88,8 @@ impl AggregateHashTable { params: &[Vec], row_count: usize, ) -> Result { - let group_hashes = group_hash_columns(group_columns); - let new_group_count = self.probe_and_create(state, group_columns, row_count, &group_hashes); + group_hash_columns(group_columns, &mut state.group_hashes); + let new_group_count = self.probe_and_create(state, group_columns, row_count); if !self.payload.aggrs.is_empty() { for i in 0..row_count { @@ -124,7 +124,6 @@ impl AggregateHashTable { state: &mut ProbeState, group_columns: &[Column], row_count: usize, - hashes: &[u64], ) -> usize { if self.capacity - self.len() <= row_count || self.len() > self.resize_threshold() { let mut new_capacity = self.capacity * 2; @@ -159,8 +158,9 @@ impl AggregateHashTable { state.no_match_vector[i] }; - let ht_offset = (hashes[index] as usize + iter_times) & (self.capacity - 1); - let salt = (hashes[index] >> (64 - 16)) as u16; + let ht_offset = + (state.group_hashes[index] as usize + iter_times) & (self.capacity - 1); + let salt = (state.group_hashes[index] >> (64 - 16)) as u16; let entry = &mut entries[ht_offset]; @@ -197,7 +197,7 @@ impl AggregateHashTable { if new_entry_count != 0 { new_group_count += new_entry_count; self.payload - .append_rows(state, hashes, new_entry_count, group_columns); + .append_rows(state, new_entry_count, group_columns); } // 3. 
handle need_compare_count @@ -246,7 +246,6 @@ impl AggregateHashTable { &mut flush_state.probe_state, &flush_state.group_columns, row_count, - &flush_state.group_hashes, ); let state = &mut flush_state.probe_state; diff --git a/src/query/expression/src/aggregate/group_hash.rs b/src/query/expression/src/aggregate/group_hash.rs index 6971cca8c5dc..255532145122 100644 --- a/src/query/expression/src/aggregate/group_hash.rs +++ b/src/query/expression/src/aggregate/group_hash.rs @@ -33,124 +33,100 @@ use crate::Column; const NULL_HASH_VAL: u64 = 0xd1cefa08eb382d69; -pub fn group_hash_columns(cols: &[Column]) -> Vec { +pub fn group_hash_columns(cols: &[Column], values: &mut [u64]) { debug_assert!(!cols.is_empty()); - let mut values = group_hash_column(&cols[0]); - + combine_group_hash_column::(&cols[0], values); if cols.len() > 1 { for col in &cols[1..] { - combine_roup_hash_column(&col, &mut values); - } - } - values -} - -pub fn group_hash_column(c: &Column) -> Vec { - let len = c.len(); - match c.data_type() { - DataType::Null => vec![NULL_HASH_VAL; len], - DataType::EmptyArray => vec![NULL_HASH_VAL; len], - DataType::EmptyMap => vec![NULL_HASH_VAL; len], - DataType::Number(v) => with_number_mapped_type!(|NUM_TYPE| match v { - NumberDataType::NUM_TYPE => { - group_hash_type_column::>(c) - } - }), - DataType::Decimal(v) => match v { - DecimalDataType::Decimal128(_) => group_hash_type_column::>(c), - DecimalDataType::Decimal256(_) => group_hash_type_column::>(c), - }, - DataType::Boolean => group_hash_type_column::(c), - - DataType::String => { - let c = StringType::try_downcast_column(c).unwrap(); - StringType::iter_column(&c).map(|x| x.fast_hash()).collect() - } - DataType::Bitmap => { - let c = BitmapType::try_downcast_column(c).unwrap(); - BitmapType::iter_column(&c).map(|x| x.fast_hash()).collect() - } - DataType::Variant => { - let c = VariantType::try_downcast_column(c).unwrap(); - VariantType::iter_column(&c) - .map(|x| x.fast_hash()) - .collect() - } - - DataType::Timestamp => group_hash_type_column::(c), - DataType::Date => group_hash_type_column::(c), - DataType::Nullable(_) => { - let col = c.as_nullable().unwrap(); - let mut values = group_hash_column(&col.column); - for (index, val) in col.validity.iter().enumerate() { - if !val { - values[index] = NULL_HASH_VAL; - } - } - values + combine_group_hash_column::(&col, values); } - DataType::Tuple(_) => todo!(), - DataType::Array(_) => todo!(), - DataType::Map(_) => todo!(), - DataType::Generic(_) => unreachable!(), } } -pub fn combine_roup_hash_column(c: &Column, values: &mut Vec) { +pub fn combine_group_hash_column(c: &Column, values: &mut [u64]) { match c.data_type() { DataType::Null => {} DataType::EmptyArray => {} DataType::EmptyMap => {} DataType::Number(v) => with_number_mapped_type!(|NUM_TYPE| match v { NumberDataType::NUM_TYPE => { - combine_group_hash_type_column::>(c, values) + combine_group_hash_type_column::>(c, values) } }), DataType::Decimal(v) => match v { DecimalDataType::Decimal128(_) => { - combine_group_hash_type_column::>(c, values) + combine_group_hash_type_column::>(c, values) } DecimalDataType::Decimal256(_) => { - combine_group_hash_type_column::>(c, values) + combine_group_hash_type_column::>(c, values) } }, - DataType::Boolean => combine_group_hash_type_column::(c, values), + DataType::Boolean => combine_group_hash_type_column::(c, values), DataType::String => { let c = StringType::try_downcast_column(c).unwrap(); - for (x, val) in StringType::iter_column(&c).zip(values.iter_mut()) { - *val = 
(*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + + if IS_FIRST { + for (x, val) in StringType::iter_column(&c).zip(values.iter_mut()) { + *val = x.fast_hash(); + } + } else { + for (x, val) in StringType::iter_column(&c).zip(values.iter_mut()) { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + } } } DataType::Bitmap => { let c = BitmapType::try_downcast_column(c).unwrap(); - for (x, val) in StringType::iter_column(&c).zip(values.iter_mut()) { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + if IS_FIRST { + for (x, val) in BitmapType::iter_column(&c).zip(values.iter_mut()) { + *val = x.fast_hash(); + } + } else { + for (x, val) in BitmapType::iter_column(&c).zip(values.iter_mut()) { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + } } } DataType::Variant => { let c = VariantType::try_downcast_column(c).unwrap(); - for (x, val) in StringType::iter_column(&c).zip(values.iter_mut()) { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + if IS_FIRST { + for (x, val) in VariantType::iter_column(&c).zip(values.iter_mut()) { + *val = x.fast_hash(); + } + } else { + for (x, val) in VariantType::iter_column(&c).zip(values.iter_mut()) { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + } } } - DataType::Timestamp => combine_group_hash_type_column::(c, values), - DataType::Date => combine_group_hash_type_column::(c, values), + DataType::Timestamp => combine_group_hash_type_column::(c, values), + DataType::Date => combine_group_hash_type_column::(c, values), DataType::Nullable(_) => { let col = c.as_nullable().unwrap(); - let values2 = group_hash_column(&col.column); + if IS_FIRST { + combine_group_hash_column::(&col.column, values); + for (val, ok) in values.iter_mut().zip(col.validity.iter()) { + if !ok { + *val = NULL_HASH_VAL; + } + } + } else { + let mut values2 = vec![0; c.len()]; + combine_group_hash_column::(&col.column, &mut values2); - for ((x, val), ok) in values2 - .iter() - .zip(values.iter_mut()) - .zip(col.validity.iter()) - { - if ok { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x; - } else { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ NULL_HASH_VAL; + for ((x, val), ok) in values2 + .iter() + .zip(values.iter_mut()) + .zip(col.validity.iter()) + { + if ok { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ *x; + } else { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ NULL_HASH_VAL; + } } } } @@ -161,16 +137,20 @@ pub fn combine_roup_hash_column(c: &Column, values: &mut Vec) { } } -fn group_hash_type_column(col: &Column) -> Vec -where for<'a> T::ScalarRef<'a>: FastHash { +fn combine_group_hash_type_column( + col: &Column, + values: &mut [u64], +) where + for<'a> T::ScalarRef<'a>: FastHash, +{ let c = T::try_downcast_column(col).unwrap(); - T::iter_column(&c).map(|x| x.fast_hash()).collect() -} - -fn combine_group_hash_type_column(col: &Column, values: &mut Vec) -where for<'a> T::ScalarRef<'a>: FastHash { - let c = T::try_downcast_column(col).unwrap(); - for (x, val) in T::iter_column(&c).zip(values.iter_mut()) { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + if IS_FIRST { + for (x, val) in T::iter_column(&c).zip(values.iter_mut()) { + *val = x.fast_hash(); + } + } else { + for (x, val) in T::iter_column(&c).zip(values.iter_mut()) { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + } } } diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 0107bc7a3746..a70012c0039d 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ 
b/src/query/expression/src/aggregate/payload.rs @@ -154,7 +154,6 @@ impl Payload { pub fn append_rows( &mut self, state: &mut ProbeState, - group_hashes: &[u64], new_group_rows: usize, group_columns: &[Column], ) { @@ -214,7 +213,7 @@ impl Payload { for idx in select_vector.iter().take(new_group_rows).copied() { unsafe { let dst = address[idx].add(write_offset); - store(group_hashes[idx], dst as *mut u8); + store(state.group_hashes[idx], dst as *mut u8); } } diff --git a/src/query/expression/src/aggregate/payload_flush.rs b/src/query/expression/src/aggregate/payload_flush.rs index bbf0c6a28ad6..b1dd4001fc2d 100644 --- a/src/query/expression/src/aggregate/payload_flush.rs +++ b/src/query/expression/src/aggregate/payload_flush.rs @@ -35,7 +35,6 @@ const FLUSH_BATCH_SIZE: usize = 8192; pub struct PayloadFlushState { pub probe_state: ProbeState, - pub group_hashes: Vec, pub group_columns: Vec, pub aggregate_results: Vec, pub row_count: usize, @@ -52,7 +51,6 @@ impl PayloadFlushState { pub fn with_capacity(len: usize) -> PayloadFlushState { PayloadFlushState { probe_state: ProbeState::with_capacity(len), - group_hashes: vec![0; len], group_columns: Vec::new(), aggregate_results: Vec::new(), row_count: 0, @@ -84,8 +82,7 @@ impl Payload { return false; } - if state.group_hashes.len() < rows { - state.group_hashes.resize(rows, 0); + if state.addresses.len() < rows { state.addresses.resize(rows, std::ptr::null::()); state.state_places.resize(rows, StateAddr::new(0)); } @@ -122,7 +119,7 @@ impl Payload { let len = state.probe_state.row_count; for i in 0..len { - state.group_hashes[i] = + state.probe_state.group_hashes[i] = unsafe { core::ptr::read::(state.addresses[i].add(self.hash_offset) as _) }; } } diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index 238702b89b92..b84d4a050263 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -19,6 +19,7 @@ use crate::StateAddr; /// It could be reuse during multiple probe process #[derive(Debug)] pub struct ProbeState { + pub group_hashes: Vec, pub addresses: Vec<*const u8>, pub state_places: Vec, pub group_compare_vector: SelectVector, @@ -35,6 +36,7 @@ unsafe impl Sync for ProbeState {} impl ProbeState { pub fn with_capacity(len: usize) -> Self { Self { + group_hashes: vec![0; len], addresses: vec![std::ptr::null::(); len], state_places: vec![StateAddr::new(0); len], group_compare_vector: vec![0; len], @@ -46,7 +48,8 @@ impl ProbeState { } pub fn adjust_vector(&mut self, row_count: usize) { - if self.no_match_vector.len() < row_count { + if self.group_hashes.len() < row_count { + self.group_hashes.resize(row_count, 0); self.addresses.resize(row_count, std::ptr::null::()); self.state_places.resize(row_count, StateAddr::new(0)); From d9909f8e844a121519cf9718292bbe9ff0063d0c Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 3 Nov 2023 20:38:28 +0800 Subject: [PATCH 07/28] improve nullable --- .../src/aggregate/aggregate_hashtable.rs | 7 ++- .../expression/src/aggregate/group_hash.rs | 58 +++++++------------ .../expression/src/aggregate/probe_state.rs | 1 - 3 files changed, 25 insertions(+), 41 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index ca097c2b18fa..a0774daefdc4 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ 
-88,7 +88,9 @@ impl AggregateHashTable { params: &[Vec], row_count: usize, ) -> Result { + state.adjust_vector(row_count); group_hash_columns(group_columns, &mut state.group_hashes); + let new_group_count = self.probe_and_create(state, group_columns, row_count); if !self.payload.aggrs.is_empty() { @@ -134,8 +136,6 @@ impl AggregateHashTable { self.resize(new_capacity); } - state.adjust_vector(row_count); - let mut new_group_count = 0; let mut remaining_entries = row_count; @@ -144,7 +144,6 @@ impl AggregateHashTable { let mut iter_times = 0; let entries = &mut self.entries; - while remaining_entries > 0 { let mut new_entry_count = 0; let mut need_compare_count = 0; @@ -171,6 +170,7 @@ impl AggregateHashTable { entry.page_offset = payload_page_offset as u16; payload_page_offset += 1; + if payload_page_offset == self.payload.row_per_page { payload_page_offset = 0; payload_page_nr += 1; @@ -239,6 +239,7 @@ impl AggregateHashTable { pub fn combine(&mut self, other: Self, flush_state: &mut PayloadFlushState) -> Result<()> { flush_state.reset(); + while other.payload.flush(flush_state) { let row_count = flush_state.row_count; diff --git a/src/query/expression/src/aggregate/group_hash.rs b/src/query/expression/src/aggregate/group_hash.rs index 255532145122..c4fa104a4059 100644 --- a/src/query/expression/src/aggregate/group_hash.rs +++ b/src/query/expression/src/aggregate/group_hash.rs @@ -26,7 +26,6 @@ use crate::types::NumberDataType; use crate::types::NumberType; use crate::types::StringType; use crate::types::TimestampType; -use crate::types::ValueType; use crate::types::VariantType; use crate::with_number_mapped_type; use crate::Column; @@ -64,43 +63,10 @@ pub fn combine_group_hash_column(c: &Column, values: &mut }, DataType::Boolean => combine_group_hash_type_column::(c, values), - DataType::String => { - let c = StringType::try_downcast_column(c).unwrap(); + DataType::String => combine_group_hash_string_column::(c, values), - if IS_FIRST { - for (x, val) in StringType::iter_column(&c).zip(values.iter_mut()) { - *val = x.fast_hash(); - } - } else { - for (x, val) in StringType::iter_column(&c).zip(values.iter_mut()) { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); - } - } - } - DataType::Bitmap => { - let c = BitmapType::try_downcast_column(c).unwrap(); - if IS_FIRST { - for (x, val) in BitmapType::iter_column(&c).zip(values.iter_mut()) { - *val = x.fast_hash(); - } - } else { - for (x, val) in BitmapType::iter_column(&c).zip(values.iter_mut()) { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); - } - } - } - DataType::Variant => { - let c = VariantType::try_downcast_column(c).unwrap(); - if IS_FIRST { - for (x, val) in VariantType::iter_column(&c).zip(values.iter_mut()) { - *val = x.fast_hash(); - } - } else { - for (x, val) in VariantType::iter_column(&c).zip(values.iter_mut()) { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); - } - } - } + DataType::Bitmap => combine_group_hash_string_column::(c, values), + DataType::Variant => combine_group_hash_string_column::(c, values), DataType::Timestamp => combine_group_hash_type_column::(c, values), DataType::Date => combine_group_hash_type_column::(c, values), @@ -154,3 +120,21 @@ fn combine_group_hash_type_column( } } } + +fn combine_group_hash_string_column( + col: &Column, + values: &mut [u64], +) where + for<'a> T::ScalarRef<'a>: AsRef<[u8]>, +{ + let c = T::try_downcast_column(col).unwrap(); + if IS_FIRST { + for (x, val) in T::iter_column(&c).zip(values.iter_mut()) { + *val = x.as_ref().fast_hash(); + } 
+ } else { + for (x, val) in T::iter_column(&c).zip(values.iter_mut()) { + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.as_ref().fast_hash(); + } + } +} diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index b84d4a050263..8cc42659ad92 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -58,7 +58,6 @@ impl ProbeState { self.empty_vector.resize(row_count, 0); self.temp_vector.resize(row_count, 0); } - self.row_count = row_count; } } From 367f41ac48b6ad40e7c20c8335c6ba6e233ee8ff Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Sat, 4 Nov 2023 06:26:54 +0800 Subject: [PATCH 08/28] fix bug --- src/query/expression/src/aggregate/payload.rs | 3 +++ src/query/expression/src/aggregate/payload_row.rs | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index a70012c0039d..d7efbafd3a7c 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -195,6 +195,8 @@ impl Payload { let mut scratch = vec![]; for (idx, col) in group_columns.iter().enumerate() { + debug_assert!(write_offset == self.group_offsets[idx]); + unsafe { serialize_column_to_rowformat( &self.arena, @@ -210,6 +212,7 @@ impl Payload { } // write group hashes + debug_assert!(write_offset == self.hash_offset); for idx in select_vector.iter().take(new_group_rows).copied() { unsafe { let dst = address[idx].add(write_offset); diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index 2b34392bf83d..a4131174ac58 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -47,7 +47,7 @@ pub fn rowformat_size(data_type: &DataType) -> usize { }, DataType::Timestamp => 8, DataType::Date => 4, - DataType::Nullable(_) => 4, + DataType::Nullable(x) => rowformat_size(&x), DataType::Array(_) => todo!(), DataType::Map(_) => todo!(), DataType::Tuple(_) => todo!(), @@ -102,7 +102,6 @@ pub unsafe fn serialize_column_to_rowformat( let data = arena.alloc_slice_copy(v.index_unchecked(index)); store(data.len() as u32, address[index].add(offset) as *mut u8); - store( data.as_ptr() as u64, address[index].add(offset + 4) as *mut u8, From 4be1df56db63668d1ff4d0c9ac0bd1a6b8eae3db Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Mon, 6 Nov 2023 15:26:25 +0800 Subject: [PATCH 09/28] update --- src/common/hashtable/src/lib.rs | 1 + src/query/expression/src/aggregate/payload_row.rs | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/common/hashtable/src/lib.rs b/src/common/hashtable/src/lib.rs index e8ff8804d8b5..910fa287dd75 100644 --- a/src/common/hashtable/src/lib.rs +++ b/src/common/hashtable/src/lib.rs @@ -112,3 +112,4 @@ pub use partitioned_hashtable::hash2bucket; pub type HashJoinHashMap = hashjoin_hashtable::HashJoinHashTable; pub type StringHashJoinHashMap = hashjoin_string_hashtable::HashJoinStringHashTable; pub use traits::HashJoinHashtableLike; +pub use utils::sse::memcmp_sse; diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index a4131174ac58..73784e1e7218 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -14,6 +14,7 @@ use bumpalo::Bump; use 
common_arrow::arrow::bitmap::Bitmap; +use common_hashtable::memcmp_sse; use ethnum::i256; use crate::store; @@ -348,7 +349,7 @@ unsafe fn row_match_string_column( } else { let data_address = core::ptr::read::(address as _) as usize as *const u8; let scalar = std::slice::from_raw_parts(data_address, len); - equal = scalar.eq(value); + equal = memcmp_sse(scalar, value); } } else { equal = is_set == is_set2; @@ -375,7 +376,7 @@ unsafe fn row_match_string_column( } else { let data_address = core::ptr::read::(address as _) as usize as *const u8; let scalar = std::slice::from_raw_parts(data_address, len); - equal = scalar.eq(value); + equal = memcmp_sse(scalar, value); } if equal { From 2002dcc60e164c74f5737cd3abdad1015d254528 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Tue, 7 Nov 2023 07:48:11 +0800 Subject: [PATCH 10/28] chore(query): update --- .../expression/src/aggregate/payload_row.rs | 29 +++++++++---------- src/query/expression/src/block.rs | 1 - src/query/expression/src/types/string.rs | 1 + .../pipelines/builders/builder_aggregate.rs | 17 +++++++++-- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index 73784e1e7218..fde820b31232 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -329,14 +329,12 @@ unsafe fn row_match_string_column( let mut equal: bool; if let Some(validity) = validity { - for (idx, is_set) in select_vector - .iter() - .take(*count) - .copied() - .zip(validity.iter()) - { + let is_all_set = validity.unset_bits() == 0; + for idx in select_vector[..*count].iter() { + let idx = *idx; let validity_address = address[idx].add(validity_offset); let is_set2 = core::ptr::read::(validity_address as _) != 0; + let is_set = is_all_set || validity.get_bit_unchecked(idx); if is_set && is_set2 { let len_address = address[idx].add(col_offset); @@ -364,7 +362,8 @@ unsafe fn row_match_string_column( } } } else { - for idx in select_vector.iter().take(*count).copied() { + for idx in select_vector[..*count].iter() { + let idx = *idx; let len_address = address[idx].add(col_offset); let address = address[idx].add(col_offset + 4); @@ -410,22 +409,19 @@ unsafe fn row_match_column_type( let mut match_count = 0; let mut equal: bool; - if let Some(validity) = validity { - for (idx, is_set) in select_vector - .iter() - .take(*count) - .copied() - .zip(validity.iter()) - { + let is_all_set = validity.unset_bits() == 0; + for idx in select_vector[..*count].iter() { + let idx = *idx; let validity_address = address[idx].add(validity_offset); let is_set2 = core::ptr::read::(validity_address as _) != 0; - + let is_set = is_all_set || validity.get_bit_unchecked(idx); if is_set && is_set { let address = address[idx].add(col_offset); let scalar = core::ptr::read::<::Scalar>(address as _); let value = T::index_column_unchecked(&col, idx); let value = T::to_owned_scalar(value); + equal = scalar.eq(&value); } else { equal = is_set == is_set2; @@ -440,7 +436,8 @@ unsafe fn row_match_column_type( } } } else { - for idx in select_vector.iter().take(*count).copied() { + for idx in select_vector[..*count].iter() { + let idx = *idx; let value = T::index_column_unchecked(&col, idx); let address = address[idx].add(col_offset); let scalar = core::ptr::read::<::Scalar>(address as _); diff --git a/src/query/expression/src/block.rs b/src/query/expression/src/block.rs index 300ed85f7aae..bdb812945557 100644 --- 
a/src/query/expression/src/block.rs +++ b/src/query/expression/src/block.rs @@ -33,7 +33,6 @@ use crate::Domain; use crate::Scalar; use crate::TableSchemaRef; use crate::Value; -use crate::ValueRef; pub type SendableDataBlockStream = std::pin::Pin> + Send>>; diff --git a/src/query/expression/src/types/string.rs b/src/query/expression/src/types/string.rs index 040fdd8bad5c..17e64bbe09ce 100644 --- a/src/query/expression/src/types/string.rs +++ b/src/query/expression/src/types/string.rs @@ -99,6 +99,7 @@ impl ValueType for StringType { col.index(index) } + #[inline] unsafe fn index_column_unchecked<'a>( col: &'a Self::Column, index: usize, diff --git a/src/query/service/src/pipelines/builders/builder_aggregate.rs b/src/query/service/src/pipelines/builders/builder_aggregate.rs index 6089710c429c..9cd482e3bc47 100644 --- a/src/query/service/src/pipelines/builders/builder_aggregate.rs +++ b/src/query/service/src/pipelines/builders/builder_aggregate.rs @@ -101,10 +101,12 @@ impl PipelineBuilder { pub(crate) fn build_aggregate_partial(&mut self, aggregate: &AggregatePartial) -> Result<()> { self.build_pipeline(&aggregate.input)?; + let max_block_size = self.settings.get_max_block_size()?; let params = Self::build_aggregator_params( aggregate.input.output_schema()?, &aggregate.group_by, &aggregate.agg_funcs, + max_block_size as usize, None, )?; @@ -125,6 +127,10 @@ impl PipelineBuilder { } let efficiently_memory = self.settings.get_efficiently_memory_group_by()?; + let enable_experimental_aggregate_hashtable = self + .settings + .get_enable_experimental_aggregate_hashtable()? + && self.ctx.get_cluster().is_empty(); let group_cols = ¶ms.group_columns; let schema_before_group_by = params.input_schema.clone(); @@ -139,7 +145,8 @@ impl PipelineBuilder { method, input, output, - params.clone() + params.clone(), + enable_experimental_aggregate_hashtable ), }), false => with_mappedhash_method!(|T| match method.clone() { @@ -148,7 +155,8 @@ impl PipelineBuilder { method, input, output, - params.clone() + params.clone(), + enable_experimental_aggregate_hashtable ), }), }?; @@ -220,10 +228,13 @@ impl PipelineBuilder { } pub(crate) fn build_aggregate_final(&mut self, aggregate: &AggregateFinal) -> Result<()> { + let max_block_size = self.settings.get_max_block_size()?; + let params = Self::build_aggregator_params( aggregate.before_group_by_schema.clone(), &aggregate.group_by, &aggregate.agg_funcs, + max_block_size as usize, aggregate.limit, )?; @@ -322,6 +333,7 @@ impl PipelineBuilder { input_schema: DataSchemaRef, group_by: &[IndexType], agg_funcs: &[AggregateFunctionDesc], + max_block_size: usize, limit: Option, ) -> Result> { let mut agg_args = Vec::with_capacity(agg_funcs.len()); @@ -361,6 +373,7 @@ impl PipelineBuilder { &group_by, &aggs, &agg_args, + max_block_size, limit, )?; From b67ee69dfd1f7984f4a1ad288215080462162ca8 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 8 Nov 2023 06:31:11 +0800 Subject: [PATCH 11/28] chore(query): update --- src/common/hashtable/src/traits.rs | 8 +++ .../src/aggregate/aggregate_hashtable.rs | 56 ++++++++++++++++++- src/query/expression/src/block.rs | 2 +- .../aggregator/transform_aggregate_final.rs | 10 +++- 4 files changed, 70 insertions(+), 6 deletions(-) diff --git a/src/common/hashtable/src/traits.rs b/src/common/hashtable/src/traits.rs index 2d5e747c7ba5..574015464d5f 100644 --- a/src/common/hashtable/src/traits.rs +++ b/src/common/hashtable/src/traits.rs @@ -182,6 +182,14 @@ macro_rules! 
impl_fast_hash_for_primitive_types { impl FastHash for $t { #[inline(always)] fn fast_hash(&self) -> u64 { + // let mut hasher = *self as u64; + // hasher ^= hasher >> 32; + // hasher = hasher.wrapping_mul(0xd6e8feb86659fd93_u64); + // hasher ^= hasher >> 32; + // hasher = hasher.wrapping_mul(0xd6e8feb86659fd93_u64); + // hasher ^= hasher >> 32; + // hasher + cfg_if::cfg_if! { if #[cfg(target_feature = "sse4.2")] { unsafe { std::arch::x86_64::_mm_crc32_u64(u64::MAX, *self as u64) } diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index a0774daefdc4..96ab91661b9c 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -59,7 +59,7 @@ impl AggregateHashTable { group_types: Vec, aggrs: Vec, ) -> Self { - let capacity = 128; + let capacity = Self::initial_capacity(); Self { entries: Self::new_entries(capacity), payload: Payload::new(arena, group_types, aggrs), @@ -80,13 +80,49 @@ impl AggregateHashTable { self.payload.len() } - // Add new groups and combine the states pub fn add_groups( &mut self, state: &mut ProbeState, group_columns: &[Column], params: &[Vec], row_count: usize, + ) -> Result { + const BATCH_ADD_SIZE: usize = 2048; + + if row_count <= BATCH_ADD_SIZE { + self.add_groups_inner(state, group_columns, params, row_count) + } else { + let mut new_count = 0; + for start in (0..row_count).step_by(BATCH_ADD_SIZE) { + let end = if start + BATCH_ADD_SIZE > row_count { + row_count + } else { + start + BATCH_ADD_SIZE + }; + let step_group_columns = group_columns + .iter() + .map(|c| c.slice(start..end)) + .collect::>(); + + let step_params: Vec> = params + .iter() + .map(|c| c.iter().map(|x| x.slice(start..end)).collect()) + .collect::>(); + + new_count += + self.add_groups_inner(state, &step_group_columns, &step_params, end - start)?; + } + Ok(new_count) + } + } + + // Add new groups and combine the states + fn add_groups_inner( + &mut self, + state: &mut ProbeState, + group_columns: &[Column], + params: &[Vec], + row_count: usize, ) -> Result { state.adjust_vector(row_count); group_hash_columns(group_columns, &mut state.group_hashes); @@ -127,12 +163,18 @@ impl AggregateHashTable { group_columns: &[Column], row_count: usize, ) -> usize { - if self.capacity - self.len() <= row_count || self.len() > self.resize_threshold() { + if row_count + self.len() > self.capacity + || row_count + self.len() > self.resize_threshold() + { let mut new_capacity = self.capacity * 2; while new_capacity - self.len() <= row_count { new_capacity *= 2; } + println!( + "resize from {} {} by {}", + self.capacity, new_capacity, row_count + ); self.resize(new_capacity); } @@ -322,4 +364,12 @@ impl AggregateHashTable { self.entries = entries; self.capacity = new_capacity; } + + pub fn initial_capacity() -> usize { + 4096 + } + + pub fn get_capacity_for_count(count: usize) -> usize { + ((count.max(Self::initial_capacity()) as f64 * LOAD_FACTOR) as usize).next_power_of_two() + } } diff --git a/src/query/expression/src/block.rs b/src/query/expression/src/block.rs index bdb812945557..b36ebece129f 100644 --- a/src/query/expression/src/block.rs +++ b/src/query/expression/src/block.rs @@ -56,7 +56,7 @@ impl BlockEntry { pub fn new(data_type: DataType, value: Value) -> Self { #[cfg(debug_assertions)] { - if let ValueRef::Column(c) = value.as_ref() { + if let crate::ValueRef::Column(c) = value.as_ref() { c.check_valid().unwrap(); } check_type(&data_type, &value); 
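A minimal standalone sketch (not part of the patch) of the sizing rule behind initial_capacity() and get_capacity_for_count() above: scale the expected group count by the 1.5 load factor and round up to the next power of two, so the table keeps headroom for linear probing. capacity_for_count is an illustrative name; the constants mirror the values in the patch.

fn capacity_for_count(count: usize) -> usize {
    const LOAD_FACTOR: f64 = 1.5; // same value as LOAD_FACTOR in aggregate_hashtable.rs
    const INITIAL_CAPACITY: usize = 4096; // same value as initial_capacity()
    ((count.max(INITIAL_CAPACITY) as f64 * LOAD_FACTOR) as usize).next_power_of_two()
}

#[test]
fn capacity_examples() {
    assert_eq!(capacity_for_count(1_000), 8_192); // 4096 * 1.5 = 6144, rounded up to 8192
    assert_eq!(capacity_for_count(100_000), 262_144); // 150_000 rounded up to 2^18
}
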
diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs index 7f9ed604dd85..b4d5b82dcbac 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs @@ -186,12 +186,18 @@ where Method: HashMethodBounds } } }, - AggregateMeta::AggregateHashTable((_, hashtable)) => { + AggregateMeta::AggregateHashTable((_, mut hashtable)) => { match agg_hashtable.as_mut() { Some(ht) => { ht.combine(hashtable, &mut self.flush_state)?; } - None => agg_hashtable = Some(hashtable), + None => { + let new_capacity = + AggregateHashTable::get_capacity_for_count(hashtable.len()); + hashtable.resize(new_capacity); + + agg_hashtable = Some(hashtable); + } } } } From cfe1e2232de3cef8d8c4c6b2b1885ee2b1816b3a Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 8 Nov 2023 23:20:48 +0800 Subject: [PATCH 12/28] feat(query): add setting parquet_max_block_size --- src/common/hashtable/src/traits.rs | 8 -------- src/query/expression/src/aggregate/aggregate_hashtable.rs | 4 ++++ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/common/hashtable/src/traits.rs b/src/common/hashtable/src/traits.rs index 574015464d5f..2d5e747c7ba5 100644 --- a/src/common/hashtable/src/traits.rs +++ b/src/common/hashtable/src/traits.rs @@ -182,14 +182,6 @@ macro_rules! impl_fast_hash_for_primitive_types { impl FastHash for $t { #[inline(always)] fn fast_hash(&self) -> u64 { - // let mut hasher = *self as u64; - // hasher ^= hasher >> 32; - // hasher = hasher.wrapping_mul(0xd6e8feb86659fd93_u64); - // hasher ^= hasher >> 32; - // hasher = hasher.wrapping_mul(0xd6e8feb86659fd93_u64); - // hasher ^= hasher >> 32; - // hasher - cfg_if::cfg_if! 
{ if #[cfg(target_feature = "sse4.2")] { unsafe { std::arch::x86_64::_mm_crc32_u64(u64::MAX, *self as u64) } diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 96ab91661b9c..10b6af07171c 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -339,6 +339,10 @@ impl AggregateHashTable { } pub fn resize(&mut self, new_capacity: usize) { + if new_capacity == self.capacity { + return; + } + let mask = (new_capacity - 1) as u64; let mut entries = Self::new_entries(new_capacity); From e2e882457faa5578e5f3c9956f06c18e3454aa2e Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 15 Nov 2023 06:12:20 +0800 Subject: [PATCH 13/28] feat(query): update --- .../src/aggregate/aggregate_hashtable.rs | 350 ++++++++++++------ .../expression/src/aggregate/group_hash.rs | 120 +++++- src/query/expression/src/aggregate/mod.rs | 109 ++++++ .../src/aggregate/partitioned_payload.rs | 266 +++++++++++++ src/query/expression/src/aggregate/payload.rs | 183 ++++++--- .../expression/src/aggregate/payload_flush.rs | 34 +- .../expression/src/aggregate/probe_state.rs | 19 +- .../tests/it/aggregates/agg_hashtable.rs | 14 +- .../pipelines/builders/builder_aggregate.rs | 10 +- .../aggregator/aggregate_exchange_injector.rs | 2 +- .../transforms/aggregator/aggregate_meta.rs | 8 +- .../aggregator/transform_aggregate_final.rs | 37 +- .../aggregator/transform_aggregate_partial.rs | 15 +- .../aggregator/transform_group_by_final.rs | 30 +- .../aggregator/transform_group_by_partial.rs | 14 +- .../aggregator/transform_partition_bucket.rs | 79 +++- 16 files changed, 1066 insertions(+), 224 deletions(-) create mode 100644 src/query/expression/src/aggregate/partitioned_payload.rs diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 10b6af07171c..2c82e820aba0 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -14,11 +14,11 @@ // A new AggregateHashtable which inspired by duckdb's https://duckdb.org/2022/03/07/aggregate-hashtable.html -use std::sync::Arc; +use std::sync::atomic::Ordering; use common_exception::Result; -use super::payload::Payload; +use super::partitioned_payload::PartitionedPayload; use super::payload_flush::PayloadFlushState; use super::probe_state::ProbeState; use crate::aggregate::payload_row::row_match_columns; @@ -27,25 +27,20 @@ use crate::types::DataType; use crate::AggregateFunctionRef; use crate::Column; use crate::ColumnBuilder; +use crate::HashTableConfig; +use crate::Payload; use crate::StateAddr; +use crate::FLUSH_BATCH_SIZE; const LOAD_FACTOR: f64 = 1.5; -// hashes layout: -// [SALT][PAGE_NR][PAGE_OFFSET] -// [SALT] are the high bits of the hash value, e.g. 
16 for 64 bit hashes -// [PAGE_NR] is the buffer managed payload page index -// [PAGE_OFFSET] is the logical entry offset into said payload page - -#[repr(packed)] -#[derive(Default, Debug, Clone, Copy)] -pub struct Entry { - pub salt: u16, - pub page_offset: u16, - pub page_nr: u32, -} +const MAX_ROWS_IN_HT: usize = 256 * 1024; + +pub type Entry = u64; pub struct AggregateHashTable { - payload: Payload, + pub payload: PartitionedPayload, + config: HashTableConfig, + current_radix_bits: u64, entries: Vec, capacity: usize, } @@ -55,27 +50,29 @@ unsafe impl Sync for AggregateHashTable {} impl AggregateHashTable { pub fn new( - arena: Arc, group_types: Vec, aggrs: Vec, + config: HashTableConfig, ) -> Self { let capacity = Self::initial_capacity(); + Self::new_with_capacity(group_types, aggrs, config, capacity) + } + + pub fn new_with_capacity( + group_types: Vec, + aggrs: Vec, + config: HashTableConfig, + capacity: usize, + ) -> Self { Self { - entries: Self::new_entries(capacity), - payload: Payload::new(arena, group_types, aggrs), + entries: vec![0u64; capacity], + current_radix_bits: config.initial_radix_bits, + payload: PartitionedPayload::new(group_types, aggrs, 1 << config.initial_radix_bits), capacity, + config, } } - // Faster way to create entries - // We don't need to extend N zero elements using u64 after we allocate zero spaces - // due to IsZero Trait(https://stdrs.dev/nightly/x86_64-unknown-linux-gnu/src/alloc/vec/spec_from_elem.rs.html#24) - fn new_entries(capacity: usize) -> Vec { - let entries = vec![0u64; capacity]; - let (ptr, len, cap) = entries.into_raw_parts(); - unsafe { Vec::from_raw_parts(ptr as *mut Entry, len, cap) } - } - pub fn len(&self) -> usize { self.payload.len() } @@ -163,6 +160,18 @@ impl AggregateHashTable { group_columns: &[Column], row_count: usize, ) -> usize { + self.maybe_repartition(); + + if self.config.partial_agg + && self.current_radix_bits == self.config.max_radix_bits + && self.capacity >= MAX_ROWS_IN_HT + { + // directly append rows + state.set_incr_empty_vector(row_count); + self.payload.append_rows(state, row_count, group_columns); + return row_count; + } + if row_count + self.len() > self.capacity || row_count + self.len() > self.resize_threshold() { @@ -171,21 +180,20 @@ impl AggregateHashTable { while new_capacity - self.len() <= row_count { new_capacity *= 2; } - println!( - "resize from {} {} by {}", - self.capacity, new_capacity, row_count - ); self.resize(new_capacity); } let mut new_group_count = 0; let mut remaining_entries = row_count; - let mut payload_page_offset = self.len() % self.payload.row_per_page; - let mut payload_page_nr = (self.len() / self.payload.row_per_page) + 1; - let mut iter_times = 0; + + if self.len() == 0 { + debug_assert_eq!(self.entries.iter().sum::(), 0); + } + let entries = &mut self.entries; + while remaining_entries > 0 { let mut new_entry_count = 0; let mut need_compare_count = 0; @@ -201,37 +209,22 @@ impl AggregateHashTable { let ht_offset = (state.group_hashes[index] as usize + iter_times) & (self.capacity - 1); - let salt = (state.group_hashes[index] >> (64 - 16)) as u16; + let salt = state.group_hashes[index].get_salt(); let entry = &mut entries[ht_offset]; - // cell is empty, could be occupied - if entry.page_nr == 0 { - entry.salt = salt; - entry.page_nr = payload_page_nr as u32; - entry.page_offset = payload_page_offset as u16; - - payload_page_offset += 1; - - if payload_page_offset == self.payload.row_per_page { - payload_page_offset = 0; - payload_page_nr += 1; - - 
self.payload.try_extend_page(payload_page_nr - 1); + if entry.is_occupied() { + if entry.get_salt() == salt { + state.group_compare_vector[need_compare_count] = index; + need_compare_count += 1; + } else { + state.no_match_vector[no_match_count] = index; + no_match_count += 1; } - + } else { + entry.set_salt(salt); state.empty_vector[new_entry_count] = index; new_entry_count += 1; - } else if entry.salt == salt { - let page_ptr = self.payload.get_page_ptr((entry.page_nr - 1) as usize); - let page_offset = entry.page_offset as usize * self.payload.tuple_size; - state.addresses[index] = unsafe { page_ptr.add(page_offset) }; - - state.group_compare_vector[need_compare_count] = index; - need_compare_count += 1; - } else { - state.no_match_vector[no_match_count] = index; - no_match_count += 1; } } @@ -240,24 +233,48 @@ impl AggregateHashTable { new_group_count += new_entry_count; self.payload .append_rows(state, new_entry_count, group_columns); + + for i in 0..new_entry_count { + let index = state.empty_vector[i]; + let ht_offset = + (state.group_hashes[index] as usize + iter_times) & (self.capacity - 1); + let entry = &mut entries[ht_offset]; + + entry.set_pointer(state.addresses[index]); + + debug_assert_eq!(entry.get_pointer(), state.addresses[index]); + } } - // 3. handle need_compare_count - // already inject addresses to state.addresses - - // 4. compare - unsafe { - row_match_columns( - group_columns, - &state.addresses, - &mut state.group_compare_vector, - &mut state.temp_vector, - need_compare_count, - &self.payload.validity_offsets, - &self.payload.group_offsets, - &mut state.no_match_vector, - &mut no_match_count, - ); + // set address of compare vector + + if need_compare_count > 0 { + for i in 0..need_compare_count { + let index = state.group_compare_vector[i]; + let ht_offset = + (state.group_hashes[index] as usize + iter_times) & (self.capacity - 1); + let entry = &mut entries[ht_offset]; + + debug_assert!(entry.is_occupied()); + debug_assert_eq!(entry.get_salt(), state.group_hashes[index].get_salt()); + + state.addresses[index] = entry.get_pointer(); + } + + // 4. compare + unsafe { + row_match_columns( + group_columns, + &state.addresses, + &mut state.group_compare_vector, + &mut state.temp_vector, + need_compare_count, + &self.payload.validity_offsets, + &self.payload.group_offsets, + &mut state.no_match_vector, + &mut no_match_count, + ); + } } // 5. 
Linear probing, just increase iter_times @@ -265,24 +282,32 @@ impl AggregateHashTable { remaining_entries = no_match_count; } - // set state places - if !self.payload.aggrs.is_empty() { - for i in 0..row_count { - state.state_places[i] = unsafe { - StateAddr::new(core::ptr::read::( - state.addresses[i].add(self.payload.state_offset) as _, - ) as usize) - }; - } - } - new_group_count } pub fn combine(&mut self, other: Self, flush_state: &mut PayloadFlushState) -> Result<()> { - flush_state.reset(); + self.combine_payloads(&other.payload, flush_state) + } - while other.payload.flush(flush_state) { + pub fn combine_payloads( + &mut self, + payloads: &PartitionedPayload, + flush_state: &mut PayloadFlushState, + ) -> Result<()> { + for payload in payloads.payloads.iter() { + self.combine_payload(payload, flush_state)?; + } + Ok(()) + } + + pub fn combine_payload( + &mut self, + payload: &Payload, + flush_state: &mut PayloadFlushState, + ) -> Result<()> { + flush_state.clear(); + + while payload.flush(flush_state) { let row_count = flush_state.row_count; let _ = self.probe_and_create( @@ -291,6 +316,18 @@ impl AggregateHashTable { row_count, ); + // set state places + if !self.payload.aggrs.is_empty() { + for i in 0..row_count { + flush_state.probe_state.state_places[i] = unsafe { + StateAddr::new(core::ptr::read::( + flush_state.probe_state.addresses[i].add(self.payload.state_offset) + as _, + ) as usize) + }; + } + } + let state = &mut flush_state.probe_state; for (aggr, addr_offset) in self .payload @@ -305,6 +342,7 @@ impl AggregateHashTable { )?; } } + Ok(()) } @@ -334,6 +372,57 @@ impl AggregateHashTable { Ok(false) } + fn maybe_repartition(&mut self) { + // already final stage or the max radix bits + if !self.config.partial_agg || (self.current_radix_bits == self.config.max_radix_bits) { + return; + } + + let bytes_per_partition = self.payload.memory_size() / self.payload.partition_count(); + + let mut new_radix_bits = self.current_radix_bits; + // 256k + if bytes_per_partition > 256 * 1024 { + new_radix_bits += self.config.repartition_radix_bits_incr; + } + + loop { + let current_max_radix_bits = self.config.current_max_radix_bits.load(Ordering::SeqCst); + if current_max_radix_bits < new_radix_bits { + if self + .config + .current_max_radix_bits + .compare_exchange( + current_max_radix_bits, + new_radix_bits, + Ordering::SeqCst, + Ordering::SeqCst, + ) + .is_err() + { + continue; + } + } + break; + } + + let current_max_radix_bits = self.config.current_max_radix_bits.load(Ordering::SeqCst); + + if current_max_radix_bits > self.current_radix_bits { + let temp_payload = PartitionedPayload::new( + self.payload.group_types.clone(), + self.payload.aggrs.clone(), + 1, + ); + let payload = std::mem::replace(&mut self.payload, temp_payload); + let mut state = PayloadFlushState::with_capacity(FLUSH_BATCH_SIZE); + + self.current_radix_bits = current_max_radix_bits; + self.payload = payload.repartition(1 << current_max_radix_bits, &mut state); + } + } + + #[inline] fn resize_threshold(&self) -> usize { (self.capacity as f64 / LOAD_FACTOR) as usize } @@ -345,24 +434,29 @@ impl AggregateHashTable { let mask = (new_capacity - 1) as u64; - let mut entries = Self::new_entries(new_capacity); + let mut entries = vec![0; new_capacity]; + // iterate over payloads and copy to new entries - for row in 0..self.len() { - let row_ptr = self.payload.get_row_ptr(row); - let hash: u64 = unsafe { core::ptr::read(row_ptr.add(self.payload.hash_offset) as _) }; - let mut hash_slot = hash & mask; - - while 
entries[hash_slot as usize].page_nr != 0 { - hash_slot += 1; - if hash_slot >= self.capacity as u64 { - hash_slot = 0; + for payload in self.payload.payloads.iter() { + for row in 0..payload.len() { + let row_ptr = payload.get_read_ptr(row); + let hash: u64 = unsafe { core::ptr::read(row_ptr.add(payload.hash_offset) as _) }; + let mut hash_slot = hash & mask; + + while entries[hash_slot as usize].is_occupied() { + hash_slot += 1; + if hash_slot >= new_capacity as u64 { + hash_slot = 0; + } } + debug_assert!(!entries[hash_slot as usize].is_occupied()); + // set value + entries[hash_slot as usize].set_salt(hash.get_salt()); + entries[hash_slot as usize].set_pointer(row_ptr); + debug_assert!(entries[hash_slot as usize].is_occupied()); + debug_assert_eq!(entries[hash_slot as usize].get_pointer(), row_ptr); + debug_assert_eq!(entries[hash_slot as usize].get_salt(), hash.get_salt()); } - let entry = &mut entries[hash_slot as usize]; - - entry.page_nr = (row / self.payload.row_per_page) as u32 + 1; - entry.page_offset = (row % self.payload.row_per_page) as u16; - entry.salt = (hash >> (64 - 16)) as u16; } self.entries = entries; @@ -377,3 +471,51 @@ impl AggregateHashTable { ((count.max(Self::initial_capacity()) as f64 * LOAD_FACTOR) as usize).next_power_of_two() } } + +/// Upper 16 bits are salt +const SALT_MASK: u64 = 0xFFFF000000000000; +/// Lower 48 bits are the pointer +const POINTER_MASK: u64 = 0x0000FFFFFFFFFFFF; + +pub const INITIAL_RADIX_BITS: u64 = 4; + +pub(crate) trait EntryLike { + fn get_salt(&self) -> u64; + fn set_salt(&mut self, _salt: u64); + fn is_occupied(&self) -> bool; + + fn get_pointer(&self) -> *const u8; + fn set_pointer(&mut self, ptr: *const u8); +} + +impl EntryLike for u64 { + #[inline] + fn get_salt(&self) -> u64 { + *self | POINTER_MASK + } + + #[inline] + fn set_salt(&mut self, salt: u64) { + *self = salt; + } + + #[inline] + fn is_occupied(&self) -> bool { + *self != 0 + } + + #[inline] + fn get_pointer(&self) -> *const u8 { + (*self & POINTER_MASK) as *const u8 + } + + #[inline] + fn set_pointer(&mut self, ptr: *const u8) { + // Pointer shouldn't use upper bits + debug_assert!(ptr as u64 & SALT_MASK == 0); + // Value should have all 1's in the pointer area + debug_assert!(*self as u64 & POINTER_MASK == POINTER_MASK); + + *self &= (ptr as u64) | SALT_MASK; + } +} diff --git a/src/query/expression/src/aggregate/group_hash.rs b/src/query/expression/src/aggregate/group_hash.rs index c4fa104a4059..8d7857736167 100644 --- a/src/query/expression/src/aggregate/group_hash.rs +++ b/src/query/expression/src/aggregate/group_hash.rs @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
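A standalone sketch (not part of the patch) of the entry encoding defined above via SALT_MASK and POINTER_MASK: the upper 16 bits of the hash act as a salt, the lower 48 bits hold the payload row pointer, and writing the salt first fills the low bits with ones so the pointer can later be ANDed in without disturbing the salt. pack is an illustrative name.

const SALT_MASK: u64 = 0xFFFF_0000_0000_0000;
const POINTER_MASK: u64 = 0x0000_FFFF_FFFF_FFFF;

fn pack(hash: u64, ptr: u64) -> u64 {
    debug_assert!(ptr & SALT_MASK == 0); // a payload pointer must fit in 48 bits
    let mut entry = hash | POINTER_MASK; // keep the top 16 hash bits, set the low 48 bits to ones
    entry &= ptr | SALT_MASK;            // write the pointer into the low 48 bits
    entry
}

fn main() {
    let hash = 0xABCD_1234_5678_9ABC_u64;
    let ptr = 0x0000_7F00_1122_3344_u64;
    let entry = pack(hash, ptr);
    assert_eq!(entry & POINTER_MASK, ptr); // pointer round-trips
    assert_eq!(entry | POINTER_MASK, hash | POINTER_MASK); // salts of equal hashes compare equal
    assert_ne!(entry, 0); // a zero entry still means "empty slot"
}
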
-use common_hashtable::FastHash; use ethnum::i256; +use ordered_float::OrderedFloat; use crate::types::decimal::DecimalType; use crate::types::ArgType; @@ -107,16 +107,16 @@ fn combine_group_hash_type_column( col: &Column, values: &mut [u64], ) where - for<'a> T::ScalarRef<'a>: FastHash, + for<'a> T::ScalarRef<'a>: AggHash, { let c = T::try_downcast_column(col).unwrap(); if IS_FIRST { for (x, val) in T::iter_column(&c).zip(values.iter_mut()) { - *val = x.fast_hash(); + *val = x.agg_hash(); } } else { for (x, val) in T::iter_column(&c).zip(values.iter_mut()) { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.fast_hash(); + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.agg_hash(); } } } @@ -130,11 +130,119 @@ fn combine_group_hash_string_column( let c = T::try_downcast_column(col).unwrap(); if IS_FIRST { for (x, val) in T::iter_column(&c).zip(values.iter_mut()) { - *val = x.as_ref().fast_hash(); + *val = x.as_ref().agg_hash(); } } else { for (x, val) in T::iter_column(&c).zip(values.iter_mut()) { - *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.as_ref().fast_hash(); + *val = (*val).wrapping_mul(NULL_HASH_VAL) ^ x.as_ref().agg_hash(); + } + } +} + +trait AggHash { + fn agg_hash(&self) -> u64; +} + +// MIT License +// Copyright (c) 2018-2021 Martin Ankerl +// https://github.com/martinus/robin-hood-hashing/blob/3.11.5/LICENSE +// Rewrite using chatgpt + +impl AggHash for [u8] { + fn agg_hash(&self) -> u64 { + const M: u64 = 0xc6a4a7935bd1e995; + const SEED: u64 = 0xe17a1465; + const R: u64 = 47; + + let mut h = SEED ^ (self.len() as u64).wrapping_mul(M); + let n_blocks = self.len() / 8; + + for i in 0..n_blocks { + let mut k = unsafe { (&self[i * 8] as *const u8 as *const u64).read_unaligned() }; + + k = k.wrapping_mul(M); + k ^= k >> R; + k = k.wrapping_mul(M); + + h ^= k; + h = h.wrapping_mul(M); + } + + let data8 = &self[n_blocks * 8..]; + for (i, &value) in data8.iter().enumerate() { + h ^= (value as u64) << (8 * (data8.len() - i - 1)); + } + + h ^= h >> R; + h = h.wrapping_mul(M); + h ^= h >> R; + + h + } +} + +macro_rules! 
impl_agg_hash_for_primitive_types { + ($t: ty) => { + impl AggHash for $t { + #[inline(always)] + fn agg_hash(&self) -> u64 { + let mut x = *self as u64; + x ^= x >> 32; + x *= 0xd6e8feb86659fd93; + x ^= x >> 32; + x *= 0xd6e8feb86659fd93; + x ^= x >> 32; + x + } + } + }; +} + +impl_agg_hash_for_primitive_types!(u8); +impl_agg_hash_for_primitive_types!(i8); +impl_agg_hash_for_primitive_types!(u16); +impl_agg_hash_for_primitive_types!(i16); +impl_agg_hash_for_primitive_types!(u32); +impl_agg_hash_for_primitive_types!(i32); +impl_agg_hash_for_primitive_types!(u64); +impl_agg_hash_for_primitive_types!(i64); + +impl AggHash for bool { + fn agg_hash(&self) -> u64 { + *self as u64 + } +} + +impl AggHash for i128 { + fn agg_hash(&self) -> u64 { + self.to_le_bytes().agg_hash() + } +} + +impl AggHash for i256 { + fn agg_hash(&self) -> u64 { + self.to_le_bytes().agg_hash() + } +} + +impl AggHash for OrderedFloat { + #[inline(always)] + fn agg_hash(&self) -> u64 { + if self.is_nan() { + f32::NAN.to_bits().agg_hash() + } else { + self.to_bits().agg_hash() + } + } +} + +impl AggHash for OrderedFloat { + #[inline(always)] + fn agg_hash(&self) -> u64 { + if self.is_nan() { + f64::NAN.to_bits().agg_hash() + } else { + self.to_bits().agg_hash() } } } diff --git a/src/query/expression/src/aggregate/mod.rs b/src/query/expression/src/aggregate/mod.rs index ac8c81e7063d..4f05cb019c81 100644 --- a/src/query/expression/src/aggregate/mod.rs +++ b/src/query/expression/src/aggregate/mod.rs @@ -19,16 +19,125 @@ mod aggregate_function; mod aggregate_function_state; mod aggregate_hashtable; mod group_hash; +mod partitioned_payload; mod payload; mod payload_flush; mod payload_row; mod probe_state; +use std::hash::Hasher; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; + pub use aggregate_function::*; pub use aggregate_function_state::*; pub use aggregate_hashtable::*; pub use group_hash::*; +pub use partitioned_payload::*; +pub use payload::*; pub use payload_flush::*; pub use probe_state::*; pub type SelectVector = Vec; + +#[derive(Clone, Debug)] +pub struct HashTableConfig { + // Max radix bits across all threads, this is a hint to repartition + pub current_max_radix_bits: Arc, + pub initial_radix_bits: u64, + pub max_radix_bits: u64, + pub repartition_radix_bits_incr: u64, + pub block_fill_factor: f64, + pub partial_agg: bool, +} + +impl Default for HashTableConfig { + fn default() -> Self { + Self { + current_max_radix_bits: Arc::new(AtomicU64::new(INITIAL_RADIX_BITS)), + initial_radix_bits: INITIAL_RADIX_BITS, + max_radix_bits: 8, + repartition_radix_bits_incr: 2, + block_fill_factor: 1.8, + partial_agg: false, + } + } +} + +impl HashTableConfig { + pub fn with_initial_radix_bits(mut self, initial_radix_bits: u64) -> Self { + self.initial_radix_bits = initial_radix_bits; + self.current_max_radix_bits = Arc::new(AtomicU64::new(initial_radix_bits)); + self + } + + pub fn with_partial(mut self, partial_agg: bool) -> Self { + self.partial_agg = partial_agg; + self + } +} + +pub struct PerfectHashBuilder; + +// NOTE: This is a dummy hasher that just returns the value passed to it. +// This is only used for i8-i64, u8-u64, isize and usize keys. 
+pub struct PerfectHash { + val: u64, +} + +impl std::hash::BuildHasher for PerfectHashBuilder { + type Hasher = PerfectHash; + fn build_hasher(&self) -> PerfectHash { + PerfectHash { val: 0 } + } +} + +impl Hasher for PerfectHash { + fn finish(&self) -> u64 { + self.val + } + + fn write(&mut self, _bytes: &[u8]) { + unreachable!() + } + + fn write_u8(&mut self, i: u8) { + self.val = i as u64; + } + + fn write_u16(&mut self, i: u16) { + self.val = i as u64; + } + + fn write_u32(&mut self, i: u32) { + self.val = i as u64; + } + + fn write_u64(&mut self, i: u64) { + self.val = i as u64; + } + + fn write_usize(&mut self, i: usize) { + self.val = i as u64; + } + + fn write_i8(&mut self, i: i8) { + self.val = i as u64; + } + + fn write_i16(&mut self, i: i16) { + self.val = i as u64; + } + + fn write_i32(&mut self, i: i32) { + self.val = i as u64; + } + + fn write_i64(&mut self, i: i64) { + self.val = i as u64; + } + + fn write_isize(&mut self, i: isize) { + self.val = i as u64; + } +} diff --git a/src/query/expression/src/aggregate/partitioned_payload.rs b/src/query/expression/src/aggregate/partitioned_payload.rs new file mode 100644 index 000000000000..05e8fe74f5a0 --- /dev/null +++ b/src/query/expression/src/aggregate/partitioned_payload.rs @@ -0,0 +1,266 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
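Because partition indices are small integers that are already well distributed, the PerfectHash hasher above just passes the key through as the hash value. A sketch of the same idea using only std types (IdentityHasher and IdentityBuildHasher are illustrative names, not part of the patch):

use std::collections::HashMap;
use std::hash::{BuildHasher, Hasher};

// Identity hasher for small integer keys.
#[derive(Default)]
struct IdentityHasher(u64);

impl Hasher for IdentityHasher {
    fn finish(&self) -> u64 { self.0 }
    fn write(&mut self, _bytes: &[u8]) { unreachable!("only integer keys are supported") }
    fn write_usize(&mut self, i: usize) { self.0 = i as u64; }
}

#[derive(Default)]
struct IdentityBuildHasher;

impl BuildHasher for IdentityBuildHasher {
    type Hasher = IdentityHasher;
    fn build_hasher(&self) -> IdentityHasher { IdentityHasher::default() }
}

fn main() {
    // Partition index -> number of rows routed to that partition.
    let mut counts: HashMap<usize, usize, IdentityBuildHasher> = HashMap::default();
    for partition in [3usize, 3, 7, 1] {
        *counts.entry(partition).or_insert(0) += 1;
    }
    assert_eq!(counts[&3], 2);
}
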
+ +use std::alloc::Layout; +use std::sync::Arc; + +use bumpalo::Bump; +use itertools::Itertools; + +use super::payload::Payload; +use super::probe_state::ProbeState; +use crate::types::DataType; +use crate::AggregateFunctionRef; +use crate::Column; +use crate::PayloadFlushState; +use crate::FLUSH_BATCH_SIZE; + +pub struct PartitionedPayload { + pub payloads: Vec, + pub group_types: Vec, + pub aggrs: Vec, + + pub group_sizes: Vec, + pub group_offsets: Vec, + pub validity_offsets: Vec, + pub hash_offset: usize, + pub state_offset: usize, + pub state_addr_offsets: Vec, + pub state_layout: Option, + + partition_count: u64, + mask_v: u64, + shift_v: u64, +} + +unsafe impl Send for PartitionedPayload {} +unsafe impl Sync for PartitionedPayload {} + +impl PartitionedPayload { + pub fn new( + group_types: Vec, + aggrs: Vec, + partition_count: u64, + ) -> Self { + let radix_bits = partition_count.trailing_zeros() as u64; + debug_assert_eq!(1 << radix_bits, partition_count); + + let payloads = (0..partition_count) + .map(|_| Payload::new(Arc::new(Bump::new()), group_types.clone(), aggrs.clone())) + .collect_vec(); + + let group_sizes = payloads[0].group_sizes.clone(); + let group_offsets = payloads[0].group_offsets.clone(); + let validity_offsets = payloads[0].validity_offsets.clone(); + let hash_offset = payloads[0].hash_offset; + let state_offset = payloads[0].state_offset; + let state_addr_offsets = payloads[0].state_addr_offsets.clone(); + let state_layout = payloads[0].state_layout.clone(); + + PartitionedPayload { + payloads, + group_types, + aggrs, + group_sizes, + group_offsets, + validity_offsets, + hash_offset, + state_offset, + state_addr_offsets, + state_layout, + partition_count, + mask_v: mask(radix_bits), + shift_v: shift(radix_bits), + } + } + + pub fn append_rows( + &mut self, + state: &mut ProbeState, + new_group_rows: usize, + group_columns: &[Column], + ) { + if self.payloads.len() == 1 { + self.payloads[0].reserve_append_rows( + &state.empty_vector, + &state.group_hashes, + &mut state.addresses, + new_group_rows, + group_columns, + ); + } else { + // generate partition selection indices + state.reset_partitions(); + let select_vector = &state.empty_vector; + + for idx in select_vector.iter().take(new_group_rows).copied() { + let hash = state.group_hashes[idx]; + let partition_idx = ((hash & self.mask_v) >> self.shift_v) as usize; + match state.partition_entries.get_mut(&partition_idx) { + Some((v, count)) => { + v[*count] = idx; + *count += 1; + } + None => { + let mut v = vec![0; state.group_hashes.len()]; + v[0] = idx; + state.partition_entries.insert(partition_idx, (v, 1)); + } + } + } + + for partition_index in 0..self.payloads.len() { + if let Some((select_vector, count)) = + state.partition_entries.get_mut(&partition_index) + { + self.payloads[partition_index].reserve_append_rows( + select_vector, + &state.group_hashes, + &mut state.addresses, + *count, + group_columns, + ); + } + } + } + } + + pub fn repartition(self, new_partition_count: usize, state: &mut PayloadFlushState) -> Self { + if self.partition_count() == new_partition_count { + return self; + } + + let mut new_partition_payload = PartitionedPayload::new( + self.group_types.clone(), + self.aggrs.clone(), + new_partition_count as u64, + ); + + new_partition_payload.combine(self, state); + new_partition_payload + } + + pub fn combine(&mut self, other: PartitionedPayload, state: &mut PayloadFlushState) { + if other.partition_count == self.partition_count { + for (l, r) in 
self.payloads.iter_mut().zip(other.payloads.into_iter()) { + l.combine(r); + } + } else { + state.clear(); + + for payload in other.payloads.into_iter() { + self.combine_single(payload, state) + } + } + } + + pub fn combine_single(&mut self, other: Payload, state: &mut PayloadFlushState) { + if other.len() == 0 { + return; + } + + if self.partition_count == 1 { + self.payloads[0].combine(other); + } else { + state.clear(); + + while self.gather_flush(&other, state) { + // copy rows + for partition in 0..self.partition_count as usize { + let payload = &mut self.payloads[partition]; + if let Some(sel) = &state.probe_state.partition_entries.get_mut(&partition) { + payload.copy_rows(&sel.0, sel.1, &state.addresses); + + payload.external_arena.push(other.arena.clone()); + payload + .external_arena + .extend_from_slice(&other.external_arena); + } + } + } + other.forget(); + } + } + + pub fn gather_flush(&self, other: &Payload, state: &mut PayloadFlushState) -> bool { + let flush_end = (state.flush_offset + FLUSH_BATCH_SIZE).min(other.len()); + + if flush_end <= state.flush_offset { + return false; + } + + let rows = flush_end - state.flush_offset; + if state.addresses.len() < rows { + state.addresses.resize(rows, std::ptr::null::()); + } + + state.row_count = rows; + for row in state.flush_offset..flush_end { + state.addresses[row - state.flush_offset] = other.get_read_ptr(row); + } + + state.probe_state.reset_partitions(); + for i in 0..rows { + let hash = + unsafe { core::ptr::read::(state.addresses[i].add(self.hash_offset) as _) }; + + let partition_idx = ((hash & self.mask_v) >> self.shift_v) as usize; + match state.probe_state.partition_entries.get_mut(&partition_idx) { + Some((v, count)) => { + v[*count] = i; + *count += 1; + } + None => { + let mut v = vec![0; FLUSH_BATCH_SIZE]; + v[0] = i; + state + .probe_state + .partition_entries + .insert(partition_idx, (v, 1)); + } + } + } + + state.flush_offset = flush_end; + true + } + + pub fn len(&self) -> usize { + self.payloads.iter().map(|x| x.len()).sum() + } + + pub fn partition_count(&self) -> usize { + self.partition_count as usize + } + + #[allow(dead_code)] + pub fn page_count(&self) -> usize { + self.payloads.iter().map(|x| x.pages.len()).sum() + } + + #[allow(dead_code)] + pub fn memory_size(&self) -> usize { + self.payloads.iter().map(|x| x.memory_size()).sum() + } +} + +#[inline] +fn shift(radix_bits: u64) -> u64 { + 48 - radix_bits +} + +#[inline] +fn mask(radix_bits: u64) -> u64 { + ((1 << radix_bits) - 1) << shift(radix_bits) +} diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index d7efbafd3a7c..02348c787613 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -13,18 +13,19 @@ // limitations under the License. 
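The shift/mask helpers that close partitioned_payload.rs above select the radix_bits hash bits sitting just below the 16-bit salt, i.e. bits [48 - radix_bits, 48), as the partition index. A small standalone sketch (partition_of is an illustrative name, not part of the patch):

fn shift(radix_bits: u64) -> u64 {
    48 - radix_bits
}

fn mask(radix_bits: u64) -> u64 {
    ((1 << radix_bits) - 1) << shift(radix_bits)
}

fn partition_of(hash: u64, radix_bits: u64) -> usize {
    ((hash & mask(radix_bits)) >> shift(radix_bits)) as usize
}

fn main() {
    let radix_bits = 4; // 1 << 4 = 16 partitions
    let hash = 0xABCD_1234_5678_9ABC_u64;
    // With radix_bits = 4, the partition is bits 44..48 of the hash: the fifth hex digit, 0x1.
    assert_eq!(partition_of(hash, radix_bits), 0x1);
    assert!(partition_of(hash, radix_bits) < (1 << radix_bits));
}
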
use std::alloc::Layout; +use std::mem::MaybeUninit; use std::sync::Arc; use bumpalo::Bump; use super::payload_row::rowformat_size; use super::payload_row::serialize_column_to_rowformat; -use super::probe_state::ProbeState; use crate::get_layout_offsets; use crate::store; use crate::types::DataType; use crate::AggregateFunctionRef; use crate::Column; +use crate::SelectVector; use crate::StateAddr; const MAX_PAGE_SIZE: usize = 256 * 1024; @@ -36,13 +37,20 @@ const MAX_PAGE_SIZE: usize = 256 * 1024; // [STATE_ADDRS] is the state_addrs of the aggregate functions, 8 bytes each pub struct Payload { pub arena: Arc, + pub external_arena: Vec>, + // if true, the states are moved out of the payload into other payload, and will not be dropped + pub state_move_out: bool, pub group_types: Vec, pub aggrs: Vec, - pub pages: Vec>, + pub pages: Pages, pub tuple_size: usize, pub row_per_page: usize, - pub current_row: usize, + + pub total_rows: usize, + + // Starts from 1, zero means no page allocated + pub current_write_page: usize, pub group_offsets: Vec, pub group_sizes: Vec, @@ -53,6 +61,17 @@ pub struct Payload { pub state_layout: Option, } +unsafe impl Send for Payload {} +unsafe impl Sync for Payload {} + +pub struct Page { + data: Vec>, + rows: usize, + capacity: usize, +} + +pub type Pages = Vec; + // TODO FIXME impl Payload { pub fn new( @@ -102,12 +121,15 @@ impl Payload { Self { arena, - pages: vec![vec![0; row_per_page * tuple_size]], + external_arena: vec![], + state_move_out: false, + pages: vec![], + current_write_page: 0, group_types, aggrs, tuple_size, row_per_page, - current_row: 0, + total_rows: 0, group_offsets, group_sizes, validity_offsets, @@ -119,53 +141,85 @@ impl Payload { } pub fn len(&self) -> usize { - self.current_row + self.total_rows } - pub fn get_page_ptr(&self, page_nr: usize) -> *const u8 { - self.pages[page_nr].as_ptr() + pub fn clear(&mut self) { + self.total_rows = 0; + self.pages.clear(); } - pub fn try_reverse(&mut self, additional_rows: usize) { - let mut row_capacity = self.pages.len() * self.row_per_page - self.current_row; - - while row_capacity < additional_rows { - self.pages - .push(vec![0; self.row_per_page * self.tuple_size]); - row_capacity += self.row_per_page; - } + pub fn memory_size(&self) -> usize { + self.pages.iter().map(|x| x.data.capacity()).sum() } - pub fn try_extend_page(&mut self, page_nr: usize) { - while page_nr >= self.pages.len() { - self.pages - .push(vec![0; self.row_per_page * self.tuple_size]); + #[inline] + pub fn writable_page(&mut self) -> &mut Page { + if self.current_write_page == 0 + || self.pages[self.current_write_page - 1].rows + == self.pages[self.current_write_page - 1].capacity + { + self.pages.push(Page { + data: Vec::with_capacity(self.row_per_page * self.tuple_size), + rows: 0, + capacity: self.row_per_page, + }); + self.current_write_page = self.pages.len(); } + &mut self.pages[self.current_write_page - 1] } - pub fn get_row_ptr(&self, row: usize) -> *const u8 { - let page = row / self.row_per_page; - let page_ptr = self.get_page_ptr(page); - let row_offset = (row % self.row_per_page) * self.tuple_size; - - unsafe { page_ptr.add(row_offset) } + pub fn get_read_ptr(&self, row: usize) -> *const u8 { + let mut c = row; + for page in self.pages.iter() { + if page.rows > c { + return unsafe { page.data.as_ptr().add(c * self.tuple_size) as *const u8 }; + } else { + c -= page.rows; + } + } + unreachable!() } - pub fn append_rows( + pub fn reserve_append_rows( &mut self, - state: &mut ProbeState, + select_vector: &SelectVector, 
+ group_hashes: &[u64], + address: &mut [*const u8], new_group_rows: usize, group_columns: &[Column], ) { - self.try_reverse(new_group_rows); - let select_vector = &state.empty_vector; + let tuple_size = self.tuple_size; for idx in select_vector.iter().take(new_group_rows).copied() { - state.addresses[idx] = self.get_row_ptr(self.current_row); - self.current_row += 1; + let page = self.writable_page(); + address[idx] = unsafe { page.data.as_ptr().add(page.rows * tuple_size) as *const u8 }; + page.rows += 1; } - let address = state.addresses.as_slice(); + self.total_rows += new_group_rows; + debug_assert_eq!( + self.total_rows, + self.pages.iter().map(|x| x.rows).sum::() + ); + + self.append_rows( + select_vector, + group_hashes, + address, + new_group_rows, + group_columns, + ) + } + + pub fn append_rows( + &mut self, + select_vector: &SelectVector, + group_hashes: &[u64], + address: &mut [*const u8], + new_group_rows: usize, + group_columns: &[Column], + ) { let mut write_offset = 0; // write validity for col in group_columns { @@ -216,7 +270,7 @@ impl Payload { for idx in select_vector.iter().take(new_group_rows).copied() { unsafe { let dst = address[idx].add(write_offset); - store(state.group_hashes[idx], dst as *mut u8); + store(group_hashes[idx], dst as *mut u8); } } @@ -237,20 +291,63 @@ impl Payload { } } } + + pub fn combine(&mut self, mut other: Payload) { + other.state_move_out = true; + + self.total_rows += other.pages.iter().map(|x| x.rows).sum::(); + self.external_arena.push(other.arena.clone()); + self.pages.append(other.pages.as_mut()); + } + + pub fn copy_rows( + &mut self, + select_vector: &SelectVector, + row_count: usize, + address: &[*const u8], + ) { + let tuple_size = self.tuple_size; + for i in 0..row_count { + let index = select_vector[i]; + let page = self.writable_page(); + unsafe { + std::ptr::copy_nonoverlapping( + address[index], + page.data.as_mut_ptr().add(page.rows * tuple_size) as _, + tuple_size, + ) + } + page.rows += 1; + } + + self.total_rows += row_count; + + debug_assert_eq!( + self.total_rows, + self.pages.iter().map(|x| x.rows).sum::() + ); + } + + pub fn forget(mut self) { + self.state_move_out = true; + } } impl Drop for Payload { fn drop(&mut self) { // drop states - for (aggr, addr_offset) in self.aggrs.iter().zip(self.state_addr_offsets.iter()) { - if aggr.need_manual_drop_state() { - for row in 0..self.len() { - let row_ptr = self.get_row_ptr(row); - - unsafe { - let state_addr: u64 = core::ptr::read(row_ptr.add(self.state_offset) as _); - aggr.drop_state(StateAddr::new(state_addr as usize + *addr_offset)) - }; + if !self.state_move_out { + for (aggr, addr_offset) in self.aggrs.iter().zip(self.state_addr_offsets.iter()) { + if aggr.need_manual_drop_state() { + for row in 0..self.len() { + let row_ptr = self.get_read_ptr(row); + + unsafe { + let state_addr: u64 = + core::ptr::read(row_ptr.add(self.state_offset) as _); + aggr.drop_state(StateAddr::new(state_addr as usize + *addr_offset)) + }; + } } } } diff --git a/src/query/expression/src/aggregate/payload_flush.rs b/src/query/expression/src/aggregate/payload_flush.rs index b1dd4001fc2d..972daa3ec40e 100644 --- a/src/query/expression/src/aggregate/payload_flush.rs +++ b/src/query/expression/src/aggregate/payload_flush.rs @@ -14,6 +14,7 @@ use ethnum::i256; +use super::partitioned_payload::PartitionedPayload; use super::payload::Payload; use super::probe_state::ProbeState; use crate::types::decimal::DecimalType; @@ -31,13 +32,15 @@ use crate::with_number_mapped_type; use crate::Column; use 
crate::StateAddr; -const FLUSH_BATCH_SIZE: usize = 8192; +pub(crate) const FLUSH_BATCH_SIZE: usize = 8192; pub struct PayloadFlushState { pub probe_state: ProbeState, pub group_columns: Vec, pub aggregate_results: Vec, pub row_count: usize, + + pub flush_partition: usize, pub flush_offset: usize, pub addresses: Vec<*const u8>, @@ -54,14 +57,16 @@ impl PayloadFlushState { group_columns: Vec::new(), aggregate_results: Vec::new(), row_count: 0, + flush_partition: 0, flush_offset: 0, addresses: vec![std::ptr::null::(); len], state_places: vec![StateAddr::new(0); len], } } - pub fn reset(&mut self) { + pub fn clear(&mut self) { self.row_count = 0; + self.flush_partition = 0; self.flush_offset = 0; } @@ -73,15 +78,32 @@ impl PayloadFlushState { } } +impl PartitionedPayload { + pub fn flush(&mut self, state: &mut PayloadFlushState) -> bool { + if state.flush_partition >= self.payloads.len() { + return false; + } + + let p = &self.payloads[state.flush_partition]; + if p.flush(state) { + true + } else { + state.flush_partition += 1; + state.flush_offset = 0; + self.flush(state) + } + } +} + impl Payload { pub fn flush(&self, state: &mut PayloadFlushState) -> bool { let flush_end = (state.flush_offset + FLUSH_BATCH_SIZE).min(self.len()); - - let rows = flush_end - state.flush_offset; - if rows == 0 { + if flush_end <= state.flush_offset { return false; } + let rows = flush_end - state.flush_offset; + if state.addresses.len() < rows { state.addresses.resize(rows, std::ptr::null::()); state.state_places.resize(rows, StateAddr::new(0)); @@ -92,7 +114,7 @@ impl Payload { state.probe_state.adjust_vector(rows); for row in state.flush_offset..flush_end { - state.addresses[row - state.flush_offset] = self.get_row_ptr(row); + state.addresses[row - state.flush_offset] = self.get_read_ptr(row); } self.flush_hashes(state); diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index 8cc42659ad92..f5673f66cc89 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; + +use crate::PerfectHashBuilder; use crate::SelectVector; use crate::StateAddr; @@ -26,8 +29,9 @@ pub struct ProbeState { pub no_match_vector: SelectVector, pub empty_vector: SelectVector, pub temp_vector: SelectVector, - pub row_count: usize, + + pub partition_entries: HashMap, } unsafe impl Send for ProbeState {} @@ -43,6 +47,7 @@ impl ProbeState { no_match_vector: vec![0; len], empty_vector: vec![0; len], temp_vector: vec![0; len], + partition_entries: HashMap::with_hasher(PerfectHashBuilder), row_count: 0, } } @@ -60,4 +65,16 @@ impl ProbeState { } self.row_count = row_count; } + + pub fn set_incr_empty_vector(&mut self, row_count: usize) { + for i in 0..row_count { + self.empty_vector[i] = i; + } + } + + pub fn reset_partitions(&mut self) { + for (_, (_, p)) in self.partition_entries.iter_mut() { + *p = 0; + } + } } diff --git a/src/query/functions/tests/it/aggregates/agg_hashtable.rs b/src/query/functions/tests/it/aggregates/agg_hashtable.rs index fcf199702063..83ac22cae760 100644 --- a/src/query/functions/tests/it/aggregates/agg_hashtable.rs +++ b/src/query/functions/tests/it/aggregates/agg_hashtable.rs @@ -26,9 +26,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
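The flush path above walks a two-level cursor: a partition index plus a row offset inside that partition, handing out at most FLUSH_BATCH_SIZE rows per call and moving to the next partition once one is exhausted. A simplified standalone sketch of just that iteration order (FlushCursor and next_batch are illustrative names; the real PayloadFlushState also carries row addresses and column buffers):

const FLUSH_BATCH_SIZE: usize = 8192;

struct FlushCursor {
    partition: usize,
    offset: usize,
}

/// Returns the next (partition, row_range) batch, or None when everything is flushed.
/// partition_sizes[i] is the number of rows stored in partition i.
fn next_batch(
    cursor: &mut FlushCursor,
    partition_sizes: &[usize],
) -> Option<(usize, std::ops::Range<usize>)> {
    while cursor.partition < partition_sizes.len() {
        let rows = partition_sizes[cursor.partition];
        if cursor.offset < rows {
            let end = (cursor.offset + FLUSH_BATCH_SIZE).min(rows);
            let range = cursor.offset..end;
            cursor.offset = end;
            return Some((cursor.partition, range));
        }
        cursor.partition += 1;
        cursor.offset = 0;
    }
    None
}

fn main() {
    let sizes = [10_000usize, 0, 300];
    let mut cursor = FlushCursor { partition: 0, offset: 0 };
    let mut batches = vec![];
    while let Some(b) = next_batch(&mut cursor, &sizes) {
        batches.push(b);
    }
    assert_eq!(batches, vec![(0, 0..8192), (0, 8192..10_000), (2, 0..300)]);
}
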
-use std::sync::Arc; - -use bumpalo::Bump; use common_expression::types::ArgType; use common_expression::types::BooleanType; use common_expression::types::Float32Type; @@ -45,6 +42,7 @@ use common_expression::AggregateHashTable; use common_expression::Column; use common_expression::DataBlock; use common_expression::FromData; +use common_expression::HashTableConfig; use common_expression::PayloadFlushState; use common_expression::ProbeState; use common_functions::aggregates::AggregateFunctionFactory; @@ -91,17 +89,19 @@ fn test_agg_hashtable() { ]; let params: Vec> = aggrs.iter().map(|_| vec![columns[1].clone()]).collect(); + let radix_bits = 3; - let arena1 = Arc::new(Bump::new()); - let mut hashtable = AggregateHashTable::new(arena1, group_types.clone(), aggrs.clone()); + let config = HashTableConfig::default(); + let mut hashtable = + AggregateHashTable::new(group_types.clone(), aggrs.clone(), config.clone()); let mut state = ProbeState::with_capacity(BATCH_SIZE); let _ = hashtable .add_groups(&mut state, &group_columns, ¶ms, n) .unwrap(); - let arena2 = Arc::new(Bump::new()); - let mut hashtable2 = AggregateHashTable::new(arena2, group_types.clone(), aggrs.clone()); + let mut hashtable2 = + AggregateHashTable::new(group_types.clone(), aggrs.clone(), config.clone()); let mut state2 = ProbeState::with_capacity(BATCH_SIZE); let _ = hashtable2 diff --git a/src/query/service/src/pipelines/builders/builder_aggregate.rs b/src/query/service/src/pipelines/builders/builder_aggregate.rs index 9cd482e3bc47..46156b9962a9 100644 --- a/src/query/service/src/pipelines/builders/builder_aggregate.rs +++ b/src/query/service/src/pipelines/builders/builder_aggregate.rs @@ -22,6 +22,7 @@ use common_expression::AggregateFunctionRef; use common_expression::DataBlock; use common_expression::DataSchemaRef; use common_expression::HashMethodKind; +use common_expression::HashTableConfig; use common_functions::aggregates::AggregateFunctionFactory; use common_pipeline_core::processors::processor::ProcessorPtr; use common_pipeline_core::query_spill_prefix; @@ -137,6 +138,9 @@ impl PipelineBuilder { let sample_block = DataBlock::empty_with_schema(schema_before_group_by); let method = DataBlock::choose_hash_method(&sample_block, group_cols, efficiently_memory)?; + // Need a global atomic to read the max current radix bits hint + let partial_agg_config = HashTableConfig::default().with_partial(true); + self.main_pipeline.add_transform(|input, output| { let transform = match params.aggregate_functions.is_empty() { true => with_mappedhash_method!(|T| match method.clone() { @@ -146,7 +150,8 @@ impl PipelineBuilder { input, output, params.clone(), - enable_experimental_aggregate_hashtable + partial_agg_config.clone(), + enable_experimental_aggregate_hashtable, ), }), false => with_mappedhash_method!(|T| match method.clone() { @@ -156,7 +161,8 @@ impl PipelineBuilder { input, output, params.clone(), - enable_experimental_aggregate_hashtable + partial_agg_config.clone(), + enable_experimental_aggregate_hashtable, ), }), }?; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs index 0716ea90e094..cb95d799821b 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs @@ -76,7 +76,7 @@ impl ExchangeSorting 
AggregateMeta::Partitioned { .. } => unreachable!(), AggregateMeta::Serialized(v) => Ok(v.bucket), AggregateMeta::HashTable(v) => Ok(v.bucket), - AggregateMeta::AggregateHashTable((bucket, _)) => Ok(*bucket), + AggregateMeta::AggregateHashTable(_) => unreachable!(), AggregateMeta::Spilled(_) | AggregateMeta::Spilling(_) | AggregateMeta::BucketSpilled(_) => Ok(-1), diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs index d83c0f396bf0..327f816205e6 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_meta.rs @@ -16,11 +16,11 @@ use std::fmt::Debug; use std::fmt::Formatter; use std::ops::Range; -use common_expression::AggregateHashTable; use common_expression::BlockMetaInfo; use common_expression::BlockMetaInfoPtr; use common_expression::Column; use common_expression::DataBlock; +use common_expression::PartitionedPayload; use crate::pipelines::processors::transforms::group_by::HashMethodBounds; use crate::pipelines::processors::transforms::group_by::PartitionedHashMethod; @@ -53,7 +53,7 @@ pub struct BucketSpilledPayload { pub enum AggregateMeta { Serialized(SerializedPayload), HashTable(HashTablePayload), - AggregateHashTable((isize, AggregateHashTable)), + AggregateHashTable(PartitionedPayload), BucketSpilled(BucketSpilledPayload), Spilled(Vec), Spilling(HashTablePayload, V>), @@ -69,8 +69,8 @@ impl AggregateMeta BlockMetaInfoPtr { - Box::new(AggregateMeta::::AggregateHashTable((bucket, ht))) + pub fn create_agg_hashtable(payload: PartitionedPayload) -> BlockMetaInfoPtr { + Box::new(AggregateMeta::::AggregateHashTable(payload)) } pub fn create_serialized(bucket: isize, block: DataBlock) -> BlockMetaInfoPtr { diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs index b4d5b82dcbac..ba2fb3635176 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs @@ -21,6 +21,7 @@ use common_exception::Result; use common_expression::AggregateHashTable; use common_expression::ColumnBuilder; use common_expression::DataBlock; +use common_expression::HashTableConfig; use common_expression::PayloadFlushState; use common_functions::aggregates::StateAddr; use common_hashtable::HashtableEntryMutRefLike; @@ -186,26 +187,30 @@ where Method: HashMethodBounds } } }, - AggregateMeta::AggregateHashTable((_, mut hashtable)) => { - match agg_hashtable.as_mut() { - Some(ht) => { - ht.combine(hashtable, &mut self.flush_state)?; - } - None => { - let new_capacity = - AggregateHashTable::get_capacity_for_count(hashtable.len()); - hashtable.resize(new_capacity); - - agg_hashtable = Some(hashtable); - } + AggregateMeta::AggregateHashTable(payload) => match agg_hashtable.as_mut() { + Some(ht) => { + ht.combine_payloads(&payload, &mut self.flush_state)?; } - } + None => { + let capacity = + AggregateHashTable::get_capacity_for_count(payload.len()); + + let mut hashtable = AggregateHashTable::new_with_capacity( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + HashTableConfig::default().with_initial_radix_bits(0), + capacity, + ); + 
hashtable.combine_payloads(&payload, &mut self.flush_state)?; + agg_hashtable = Some(hashtable); + } + }, } } if let Some(mut ht) = agg_hashtable { let mut blocks = vec![]; - self.flush_state.reset(); + self.flush_state.clear(); loop { if ht.merge_result(&mut self.flush_state)? { let mut cols = self.flush_state.take_aggregate_results(); @@ -217,7 +222,9 @@ where Method: HashMethodBounds } } - // todo pipeline + if blocks.is_empty() { + return Ok(DataBlock::empty()); + } return DataBlock::concat(&blocks); } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs index 84d3c619406e..a0a239b4dea3 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs @@ -27,6 +27,7 @@ use common_expression::AggregateHashTable; use common_expression::BlockMetaInfoDowncast; use common_expression::Column; use common_expression::DataBlock; +use common_expression::HashTableConfig; use common_expression::ProbeState; use common_functions::aggregates::StateAddr; use common_functions::aggregates::StateAddrs; @@ -120,11 +121,11 @@ impl TransformPartialAggregate { input: Arc, output: Arc, params: Arc, + config: HashTableConfig, enable_experimental_aggregate_hashtable: bool, ) -> Result> { - let arena = Arc::new(Bump::new()); - let hash_table = if !enable_experimental_aggregate_hashtable { + let arena = Arc::new(Bump::new()); let hashtable = method.create_hash_table(arena.clone())?; let _dropper = AggregateHashTableDropper::create(params.clone()); let hashtable = HashTableCell::create(hashtable, _dropper); @@ -137,9 +138,9 @@ impl TransformPartialAggregate { } } else { HashTable::AggregateHashTable(AggregateHashTable::new( - arena, params.group_data_types.clone(), params.aggregate_functions.clone(), + config, )) }; @@ -396,11 +397,9 @@ impl AccumulatingTransform for TransformPartialAggrega blocks } - HashTable::AggregateHashTable(hashtable) => { - vec![DataBlock::empty_with_meta( - AggregateMeta::::create_agg_hashtable(-1, hashtable), - )] - } + HashTable::AggregateHashTable(hashtable) => vec![DataBlock::empty_with_meta( + AggregateMeta::::create_agg_hashtable(hashtable.payload), + )], }) } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs index 7631910ebdce..334a1f12c96c 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs @@ -19,6 +19,7 @@ use common_exception::ErrorCode; use common_exception::Result; use common_expression::AggregateHashTable; use common_expression::DataBlock; +use common_expression::HashTableConfig; use common_expression::PayloadFlushState; use common_hashtable::HashtableEntryRefLike; use common_hashtable::HashtableLike; @@ -109,20 +110,29 @@ where Method: HashMethodBounds } } }, - AggregateMeta::AggregateHashTable((_, hashtable)) => { - match agg_hashtable.as_mut() { - Some(ht) => { - ht.combine(hashtable, &mut self.flush_state)?; - } - None => agg_hashtable = Some(hashtable), + AggregateMeta::AggregateHashTable(payload) => match agg_hashtable.as_mut() { + Some(ht) => { + 
ht.combine_payloads(&payload, &mut self.flush_state)?; } - } + None => { + let capacity = + AggregateHashTable::get_capacity_for_count(payload.len()); + let mut hashtable = AggregateHashTable::new_with_capacity( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + HashTableConfig::default().with_initial_radix_bits(0), + capacity, + ); + hashtable.combine_payloads(&payload, &mut self.flush_state)?; + agg_hashtable = Some(hashtable); + } + }, } } if let Some(mut ht) = agg_hashtable { let mut blocks = vec![]; - self.flush_state.reset(); + self.flush_state.clear(); loop { if ht.merge_result(&mut self.flush_state)? { blocks.push(DataBlock::new_from_columns( @@ -133,6 +143,10 @@ where Method: HashMethodBounds } } + if blocks.is_empty() { + return Ok(DataBlock::empty()); + } + return DataBlock::concat(&blocks); } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs index 5edeba053b05..ae4af0abc2f8 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs @@ -25,6 +25,7 @@ use common_exception::Result; use common_expression::AggregateHashTable; use common_expression::Column; use common_expression::DataBlock; +use common_expression::HashTableConfig; use common_expression::ProbeState; use common_hashtable::HashtableLike; use common_pipeline_core::processors::port::InputPort; @@ -115,18 +116,19 @@ impl TransformPartialGroupBy { input: Arc, output: Arc, params: Arc, + config: HashTableConfig, enable_experimental_aggregate_hashtable: bool, ) -> Result> { - let arena = Arc::new(Bump::new()); let hash_table = if !enable_experimental_aggregate_hashtable { + let arena = Arc::new(Bump::new()); let hashtable = method.create_hash_table(arena.clone())?; let _dropper = GroupByHashTableDropper::::create(); HashTable::HashTable(HashTableCell::create(hashtable, _dropper)) } else { HashTable::AggregateHashTable(AggregateHashTable::new( - arena, params.group_data_types.clone(), params.aggregate_functions.clone(), + config, )) }; @@ -260,11 +262,9 @@ impl AccumulatingTransform for TransformPartialGroupBy blocks } - HashTable::AggregateHashTable(hashtable) => { - vec![DataBlock::empty_with_meta( - AggregateMeta::::create_agg_hashtable(-1, hashtable), - )] - } + HashTable::AggregateHashTable(hashtable) => vec![DataBlock::empty_with_meta( + AggregateMeta::::create_agg_hashtable(hashtable.payload), + )], }) } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs index 47906da26eea..b1a6778c680a 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs @@ -15,15 +15,17 @@ use std::any::Any; use std::collections::btree_map::Entry; use std::collections::BTreeMap; +use std::collections::HashMap; use std::marker::PhantomData; use std::mem::take; use std::sync::Arc; use common_exception::ErrorCode; use common_exception::Result; -use common_expression::AggregateHashTable; use common_expression::BlockMetaInfoDowncast; use common_expression::DataBlock; +use common_expression::PartitionedPayload; +use 
common_expression::PayloadFlushState; use common_hashtable::hash2bucket; use common_hashtable::HashtableLike; use common_pipeline_core::pipe::Pipe; @@ -69,7 +71,10 @@ pub struct TransformPartitionBucket>, + flush_state: PayloadFlushState, + partition_payloads: Vec, unsplitted_blocks: Vec, + max_partition_count: usize, _phantom: PhantomData, } @@ -95,7 +100,10 @@ impl output: OutputPort::create(), buckets_blocks: BTreeMap::new(), unsplitted_blocks: vec![], + flush_state: PayloadFlushState::with_capacity(8192), + partition_payloads: vec![], initialized_all_inputs: false, + max_partition_count: 0, _phantom: Default::default(), }) } @@ -186,7 +194,12 @@ impl unreachable!() } - AggregateMeta::AggregateHashTable((v, _)) => (*v, *v), + AggregateMeta::AggregateHashTable(p) => { + self.max_partition_count = + self.max_partition_count.max(p.partition_count()); + + (SINGLE_LEVEL_BUCKET_NUM, SINGLE_LEVEL_BUCKET_NUM) + } }; if bucket > SINGLE_LEVEL_BUCKET_NUM { @@ -204,6 +217,16 @@ impl } } + if self.max_partition_count > 0 { + let meta = data_block.take_meta().unwrap(); + if let Some(AggregateMeta::AggregateHashTable(p)) = + AggregateMeta::::downcast_from(meta) + { + self.partition_payloads.push(p); + } + return SINGLE_LEVEL_BUCKET_NUM; + } + self.unsplitted_blocks.push(data_block); SINGLE_LEVEL_BUCKET_NUM } @@ -301,12 +324,6 @@ impl Ok(data_blocks) } - - fn partition_agg_hashtable(&self, ht: AggregateHashTable) -> Result>> { - let block = - DataBlock::empty_with_meta(AggregateMeta::::create_agg_hashtable(0, ht)); - Ok(vec![Some(block)]) - } } #[async_trait::async_trait] @@ -336,7 +353,9 @@ impl Processor return Ok(Event::NeedData); } - if !self.buckets_blocks.is_empty() && !self.unsplitted_blocks.is_empty() { + if self.partition_payloads.len() == self.inputs.len() + || (!self.buckets_blocks.is_empty() && !self.unsplitted_blocks.is_empty()) + { // Split data blocks if it's unsplitted. return Ok(Event::Sync); } @@ -408,6 +427,44 @@ impl Processor } fn process(&mut self) -> Result<()> { + if !self.partition_payloads.is_empty() { + let mut payloads = Vec::with_capacity(self.partition_payloads.len()); + + for p in self.partition_payloads.drain(0..) { + if p.partition_count() != self.max_partition_count { + let p = p.repartition(self.max_partition_count, &mut self.flush_state); + payloads.push(p); + } else { + payloads.push(p); + }; + } + + let group_types = payloads[0].group_types.clone(); + let aggrs = payloads[0].aggrs.clone(); + + let mut payload_map = HashMap::with_capacity(self.max_partition_count); + for payload in payloads.into_iter() { + for (bucket, p) in payload.payloads.into_iter().enumerate() { + payload_map + .entry(bucket as isize) + .or_insert_with(|| vec![]) + .push(p); + } + } + for bucket in 0..self.max_partition_count as isize { + let mut payloads = payload_map.remove(&bucket).unwrap_or(vec![]); + let mut partition_payload = + PartitionedPayload::new(group_types.clone(), aggrs.clone(), 1); + + partition_payload.payloads.append(payloads.as_mut()); + self.buckets_blocks + .insert(bucket as isize, vec![DataBlock::empty_with_meta( + AggregateMeta::::create_agg_hashtable(partition_payload), + )]); + } + return Ok(()); + } + let block_meta = self .unsplitted_blocks .pop() @@ -426,9 +483,7 @@ impl Processor AggregateMeta::Partitioned { .. } => unreachable!(), AggregateMeta::Serialized(payload) => self.partition_block(payload)?, AggregateMeta::HashTable(payload) => self.partition_hashtable(payload)?, - AggregateMeta::AggregateHashTable((_, payload)) => { - self.partition_agg_hashtable(payload)? 
- } + AggregateMeta::AggregateHashTable(_) => unreachable!(), }; for (bucket, block) in data_blocks.into_iter().enumerate() { From 103b6327d8993a83b246677263be8decd122068f Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 15 Nov 2023 07:39:47 +0800 Subject: [PATCH 14/28] feat(query): update --- src/common/hashtable/src/utils.rs | 2 + .../src/aggregate/aggregate_hashtable.rs | 60 +++++++------- .../expression/src/aggregate/group_hash.rs | 2 +- src/query/expression/src/aggregate/mod.rs | 18 +++- .../src/aggregate/partitioned_payload.rs | 47 ++++++----- src/query/expression/src/aggregate/payload.rs | 35 +++----- .../expression/src/aggregate/payload_flush.rs | 83 +++++++++---------- .../expression/src/aggregate/payload_row.rs | 2 +- .../expression/src/aggregate/probe_state.rs | 39 +++------ .../tests/it/aggregates/agg_hashtable.rs | 8 +- .../aggregator/transform_aggregate_final.rs | 3 +- .../aggregator/transform_aggregate_partial.rs | 4 +- .../aggregator/transform_group_by_final.rs | 3 +- .../aggregator/transform_group_by_partial.rs | 3 +- .../aggregator/transform_partition_bucket.rs | 15 ++-- 15 files changed, 155 insertions(+), 169 deletions(-) diff --git a/src/common/hashtable/src/utils.rs b/src/common/hashtable/src/utils.rs index 083255f31129..bf97b9941abb 100644 --- a/src/common/hashtable/src/utils.rs +++ b/src/common/hashtable/src/utils.rs @@ -137,6 +137,8 @@ pub mod sse { )) } + /// # Safety + /// This is safe that we compare bytes via addr #[inline(always)] pub unsafe fn memcmp_sse(a: &[u8], b: &[u8]) -> bool { let mut size = a.len(); diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 2c82e820aba0..86552b396687 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -30,10 +30,8 @@ use crate::ColumnBuilder; use crate::HashTableConfig; use crate::Payload; use crate::StateAddr; -use crate::FLUSH_BATCH_SIZE; - -const LOAD_FACTOR: f64 = 1.5; -const MAX_ROWS_IN_HT: usize = 256 * 1024; +use crate::LOAD_FACTOR; +use crate::MAX_ROWS_IN_HT; pub type Entry = u64; @@ -121,7 +119,7 @@ impl AggregateHashTable { params: &[Vec], row_count: usize, ) -> Result { - state.adjust_vector(row_count); + state.row_count = row_count; group_hash_columns(group_columns, &mut state.group_hashes); let new_group_count = self.probe_and_create(state, group_columns, row_count); @@ -388,8 +386,8 @@ impl AggregateHashTable { loop { let current_max_radix_bits = self.config.current_max_radix_bits.load(Ordering::SeqCst); - if current_max_radix_bits < new_radix_bits { - if self + if current_max_radix_bits < new_radix_bits + && self .config .current_max_radix_bits .compare_exchange( @@ -399,9 +397,8 @@ impl AggregateHashTable { Ordering::SeqCst, ) .is_err() - { - continue; - } + { + continue; } break; } @@ -415,7 +412,7 @@ impl AggregateHashTable { 1, ); let payload = std::mem::replace(&mut self.payload, temp_payload); - let mut state = PayloadFlushState::with_capacity(FLUSH_BATCH_SIZE); + let mut state = PayloadFlushState::new(); self.current_radix_bits = current_max_radix_bits; self.payload = payload.repartition(1 << current_max_radix_bits, &mut state); @@ -438,24 +435,29 @@ impl AggregateHashTable { // iterate over payloads and copy to new entries for payload in self.payload.payloads.iter() { - for row in 0..payload.len() { - let row_ptr = payload.get_read_ptr(row); - let hash: u64 = unsafe { 
core::ptr::read(row_ptr.add(payload.hash_offset) as _) }; - let mut hash_slot = hash & mask; - - while entries[hash_slot as usize].is_occupied() { - hash_slot += 1; - if hash_slot >= new_capacity as u64 { - hash_slot = 0; + for page in payload.pages.iter() { + for idx in 0..page.rows { + let row_ptr: *const u8 = + unsafe { page.data.as_ptr().add(idx * payload.tuple_size) as _ }; + + let hash: u64 = + unsafe { core::ptr::read(row_ptr.add(payload.hash_offset) as _) }; + + let mut hash_slot = hash & mask; + while entries[hash_slot as usize].is_occupied() { + hash_slot += 1; + if hash_slot >= new_capacity as u64 { + hash_slot = 0; + } } + debug_assert!(!entries[hash_slot as usize].is_occupied()); + // set value + entries[hash_slot as usize].set_salt(hash.get_salt()); + entries[hash_slot as usize].set_pointer(row_ptr); + debug_assert!(entries[hash_slot as usize].is_occupied()); + debug_assert_eq!(entries[hash_slot as usize].get_pointer(), row_ptr); + debug_assert_eq!(entries[hash_slot as usize].get_salt(), hash.get_salt()); } - debug_assert!(!entries[hash_slot as usize].is_occupied()); - // set value - entries[hash_slot as usize].set_salt(hash.get_salt()); - entries[hash_slot as usize].set_pointer(row_ptr); - debug_assert!(entries[hash_slot as usize].is_occupied()); - debug_assert_eq!(entries[hash_slot as usize].get_pointer(), row_ptr); - debug_assert_eq!(entries[hash_slot as usize].get_salt(), hash.get_salt()); } } @@ -477,8 +479,6 @@ const SALT_MASK: u64 = 0xFFFF000000000000; /// Lower 48 bits are the pointer const POINTER_MASK: u64 = 0x0000FFFFFFFFFFFF; -pub const INITIAL_RADIX_BITS: u64 = 4; - pub(crate) trait EntryLike { fn get_salt(&self) -> u64; fn set_salt(&mut self, _salt: u64); @@ -514,7 +514,7 @@ impl EntryLike for u64 { // Pointer shouldn't use upper bits debug_assert!(ptr as u64 & SALT_MASK == 0); // Value should have all 1's in the pointer area - debug_assert!(*self as u64 & POINTER_MASK == POINTER_MASK); + debug_assert!(*self & POINTER_MASK == POINTER_MASK); *self &= (ptr as u64) | SALT_MASK; } diff --git a/src/query/expression/src/aggregate/group_hash.rs b/src/query/expression/src/aggregate/group_hash.rs index 8d7857736167..175014e25164 100644 --- a/src/query/expression/src/aggregate/group_hash.rs +++ b/src/query/expression/src/aggregate/group_hash.rs @@ -38,7 +38,7 @@ pub fn group_hash_columns(cols: &[Column], values: &mut [u64]) { combine_group_hash_column::(&cols[0], values); if cols.len() > 1 { for col in &cols[1..] { - combine_group_hash_column::(&col, values); + combine_group_hash_column::(col, values); } } } diff --git a/src/query/expression/src/aggregate/mod.rs b/src/query/expression/src/aggregate/mod.rs index 4f05cb019c81..ede53f509b33 100644 --- a/src/query/expression/src/aggregate/mod.rs +++ b/src/query/expression/src/aggregate/mod.rs @@ -38,7 +38,17 @@ pub use payload::*; pub use payload_flush::*; pub use probe_state::*; -pub type SelectVector = Vec; +pub type SelectVector = [usize; BATCH_SIZE]; + +pub fn new_sel() -> SelectVector { + [0; BATCH_SIZE] +} + +// A batch size to probe, flush, repartition, etc. 
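
Earlier in this commit the hash-table entry becomes a bare `u64` (`pub type Entry = u64`) that packs a 16-bit salt taken from the group hash next to a 48-bit row pointer, per `SALT_MASK` and `POINTER_MASK`. The sketch below is a self-contained illustration of that encoding and of the linear-probing insert used in `resize`; it is not part of the diff, `Slot` and `insert_row` are hypothetical names, and the `get_salt` body is only one possible choice since the real accessor is outside the lines shown (any definition that depends only on the top 16 bits satisfies the comparisons in the patch).

const SALT_MASK: u64 = 0xFFFF_0000_0000_0000;
const POINTER_MASK: u64 = 0x0000_FFFF_FFFF_FFFF;

#[derive(Clone, Copy, Default)]
struct Slot(u64);

impl Slot {
    fn is_occupied(self) -> bool {
        // A zeroed slot is free; `set_salt` always leaves the value non-zero.
        self.0 != 0
    }

    fn set_salt(&mut self, salt: u64) {
        // Keep the pointer area all 1's so `set_pointer` can merge the pointer
        // in with a single AND, as the patch's `EntryLike` impl does.
        self.0 = salt | POINTER_MASK;
    }

    fn set_pointer(&mut self, ptr: *const u8) {
        let ptr = ptr as u64;
        debug_assert!(ptr & SALT_MASK == 0); // pointers must fit in 48 bits
        debug_assert!(self.0 & POINTER_MASK == POINTER_MASK);
        self.0 &= ptr | SALT_MASK;
    }

    fn get_pointer(self) -> *const u8 {
        (self.0 & POINTER_MASK) as *const u8
    }

    fn get_salt(self) -> u64 {
        // One possible choice: keep only the top 16 bits, lower bits all 1's.
        self.0 | POINTER_MASK
    }
}

/// Linear-probing insert over a power-of-two capacity, mirroring the loop in
/// `resize` above: find a free slot for `hash`, then store salt and pointer.
fn insert_row(entries: &mut [Slot], hash: u64, row_ptr: *const u8) {
    let capacity = entries.len();
    let mask = (capacity - 1) as u64;
    let mut slot = (hash & mask) as usize;
    while entries[slot].is_occupied() {
        slot += 1;
        if slot >= capacity {
            slot = 0;
        }
    }
    entries[slot].set_salt(hash | POINTER_MASK);
    entries[slot].set_pointer(row_ptr);
}
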
+pub(crate) const BATCH_SIZE: usize = 2048; +pub(crate) const LOAD_FACTOR: f64 = 1.5; +pub(crate) const MAX_ROWS_IN_HT: usize = 256 * 1024; +pub(crate) const MAX_PAGE_SIZE: usize = 256 * 1024; #[derive(Clone, Debug)] pub struct HashTableConfig { @@ -54,8 +64,8 @@ pub struct HashTableConfig { impl Default for HashTableConfig { fn default() -> Self { Self { - current_max_radix_bits: Arc::new(AtomicU64::new(INITIAL_RADIX_BITS)), - initial_radix_bits: INITIAL_RADIX_BITS, + current_max_radix_bits: Arc::new(AtomicU64::new(4)), + initial_radix_bits: 4, max_radix_bits: 8, repartition_radix_bits_incr: 2, block_fill_factor: 1.8, @@ -114,7 +124,7 @@ impl Hasher for PerfectHash { } fn write_u64(&mut self, i: u64) { - self.val = i as u64; + self.val = i; } fn write_usize(&mut self, i: usize) { diff --git a/src/query/expression/src/aggregate/partitioned_payload.rs b/src/query/expression/src/aggregate/partitioned_payload.rs index 05e8fe74f5a0..80bce48c02ba 100644 --- a/src/query/expression/src/aggregate/partitioned_payload.rs +++ b/src/query/expression/src/aggregate/partitioned_payload.rs @@ -24,7 +24,7 @@ use crate::types::DataType; use crate::AggregateFunctionRef; use crate::Column; use crate::PayloadFlushState; -use crate::FLUSH_BATCH_SIZE; +use crate::BATCH_SIZE; pub struct PartitionedPayload { pub payloads: Vec, @@ -65,8 +65,8 @@ impl PartitionedPayload { let validity_offsets = payloads[0].validity_offsets.clone(); let hash_offset = payloads[0].hash_offset; let state_offset = payloads[0].state_offset; - let state_addr_offsets = payloads[0].state_addr_offsets.clone(); - let state_layout = payloads[0].state_layout.clone(); + let state_addr_offsets = payloads[0].state_addr_offsets; + let state_layout = payloads[0].state_layout; PartitionedPayload { payloads, @@ -113,7 +113,7 @@ impl PartitionedPayload { *count += 1; } None => { - let mut v = vec![0; state.group_hashes.len()]; + let mut v = [0; BATCH_SIZE]; v[0] = idx; state.partition_entries.insert(partition_idx, (v, 1)); } @@ -176,6 +176,9 @@ impl PartitionedPayload { state.clear(); while self.gather_flush(&other, state) { + if state.row_count == 0 { + continue; + } // copy rows for partition in 0..self.partition_count as usize { let payload = &mut self.payloads[partition]; @@ -194,36 +197,41 @@ impl PartitionedPayload { } pub fn gather_flush(&self, other: &Payload, state: &mut PayloadFlushState) -> bool { - let flush_end = (state.flush_offset + FLUSH_BATCH_SIZE).min(other.len()); - - if flush_end <= state.flush_offset { + if state.flush_page >= other.pages.len() { return false; } - let rows = flush_end - state.flush_offset; - if state.addresses.len() < rows { - state.addresses.resize(rows, std::ptr::null::()); + let page = &other.pages[state.flush_page]; + + // ToNext + if state.flush_page_row >= page.rows { + state.flush_page += 1; + state.flush_page_row = 0; + state.row_count = 0; + return self.gather_flush(other, state); } + let end = (state.flush_page_row + BATCH_SIZE).min(page.rows); + let rows = end - state.flush_page_row; state.row_count = rows; - for row in state.flush_offset..flush_end { - state.addresses[row - state.flush_offset] = other.get_read_ptr(row); - } state.probe_state.reset_partitions(); - for i in 0..rows { + + for idx in 0..rows { + state.addresses[idx] = other.data_ptr(page, idx + state.flush_page_row); + let hash = - unsafe { core::ptr::read::(state.addresses[i].add(self.hash_offset) as _) }; + unsafe { core::ptr::read::(state.addresses[idx].add(self.hash_offset) as _) }; let partition_idx = ((hash & self.mask_v) >> 
self.shift_v) as usize; match state.probe_state.partition_entries.get_mut(&partition_idx) { Some((v, count)) => { - v[*count] = i; + v[*count] = idx; *count += 1; } None => { - let mut v = vec![0; FLUSH_BATCH_SIZE]; - v[0] = i; + let mut v = [0; BATCH_SIZE]; + v[0] = idx; state .probe_state .partition_entries @@ -231,8 +239,7 @@ impl PartitionedPayload { } } } - - state.flush_offset = flush_end; + state.flush_page_row = end; true } diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 02348c787613..5aa61ec7caac 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -27,8 +27,8 @@ use crate::AggregateFunctionRef; use crate::Column; use crate::SelectVector; use crate::StateAddr; +use crate::MAX_PAGE_SIZE; -const MAX_PAGE_SIZE: usize = 256 * 1024; // payload layout // [VALIDITY][GROUPS][HASH][STATE_ADDRS] // [VALIDITY] is the validity bits of the data columns (including the HASH) @@ -65,9 +65,9 @@ unsafe impl Send for Payload {} unsafe impl Sync for Payload {} pub struct Page { - data: Vec>, - rows: usize, - capacity: usize, + pub(crate) data: Vec>, + pub(crate) rows: usize, + pub(crate) capacity: usize, } pub type Pages = Vec; @@ -169,16 +169,8 @@ impl Payload { &mut self.pages[self.current_write_page - 1] } - pub fn get_read_ptr(&self, row: usize) -> *const u8 { - let mut c = row; - for page in self.pages.iter() { - if page.rows > c { - return unsafe { page.data.as_ptr().add(c * self.tuple_size) as *const u8 }; - } else { - c -= page.rows; - } - } - unreachable!() + pub fn data_ptr(&self, page: &Page, row: usize) -> *const u8 { + unsafe { page.data.as_ptr().add(row * self.tuple_size) as _ } } pub fn reserve_append_rows( @@ -339,14 +331,13 @@ impl Drop for Payload { if !self.state_move_out { for (aggr, addr_offset) in self.aggrs.iter().zip(self.state_addr_offsets.iter()) { if aggr.need_manual_drop_state() { - for row in 0..self.len() { - let row_ptr = self.get_read_ptr(row); - - unsafe { - let state_addr: u64 = - core::ptr::read(row_ptr.add(self.state_offset) as _); - aggr.drop_state(StateAddr::new(state_addr as usize + *addr_offset)) - }; + for page in self.pages.iter() { + for row in 0..page.rows { + unsafe { + let state_addr = self.data_ptr(page, row).add(self.state_offset); + aggr.drop_state(StateAddr::new(state_addr as usize + *addr_offset)) + }; + } } } } diff --git a/src/query/expression/src/aggregate/payload_flush.rs b/src/query/expression/src/aggregate/payload_flush.rs index 972daa3ec40e..d497ed1d9160 100644 --- a/src/query/expression/src/aggregate/payload_flush.rs +++ b/src/query/expression/src/aggregate/payload_flush.rs @@ -31,8 +31,7 @@ use crate::types::TimestampType; use crate::with_number_mapped_type; use crate::Column; use crate::StateAddr; - -pub(crate) const FLUSH_BATCH_SIZE: usize = 8192; +use crate::BATCH_SIZE; pub struct PayloadFlushState { pub probe_state: ProbeState, @@ -41,33 +40,36 @@ pub struct PayloadFlushState { pub row_count: usize, pub flush_partition: usize, - pub flush_offset: usize, + pub flush_page: usize, + pub flush_page_row: usize, - pub addresses: Vec<*const u8>, - pub state_places: Vec, + pub addresses: [*const u8; BATCH_SIZE], + pub state_places: [StateAddr; BATCH_SIZE], } unsafe impl Send for PayloadFlushState {} unsafe impl Sync for PayloadFlushState {} impl PayloadFlushState { - pub fn with_capacity(len: usize) -> PayloadFlushState { + pub fn new() -> PayloadFlushState { PayloadFlushState { - probe_state: 
ProbeState::with_capacity(len), + probe_state: ProbeState::new(), group_columns: Vec::new(), aggregate_results: Vec::new(), row_count: 0, flush_partition: 0, - flush_offset: 0, - addresses: vec![std::ptr::null::(); len], - state_places: vec![StateAddr::new(0); len], + flush_page: 0, + flush_page_row: 0, + addresses: [std::ptr::null::(); BATCH_SIZE], + state_places: [StateAddr::new(0); BATCH_SIZE], } } pub fn clear(&mut self) { self.row_count = 0; self.flush_partition = 0; - self.flush_offset = 0; + self.flush_page = 0; + self.flush_page_row = 0; } pub fn take_group_columns(&mut self) -> Vec { @@ -88,8 +90,9 @@ impl PartitionedPayload { if p.flush(state) { true } else { - state.flush_partition += 1; - state.flush_offset = 0; + let p = state.flush_partition + 1; + state.clear(); + state.flush_partition = p; self.flush(state) } } @@ -97,53 +100,47 @@ impl PartitionedPayload { impl Payload { pub fn flush(&self, state: &mut PayloadFlushState) -> bool { - let flush_end = (state.flush_offset + FLUSH_BATCH_SIZE).min(self.len()); - if flush_end <= state.flush_offset { + if state.flush_page >= self.pages.len() { return false; } - let rows = flush_end - state.flush_offset; + let page = &self.pages[state.flush_page]; + + if state.flush_page_row >= page.rows { + state.flush_page += 1; + state.flush_page_row = 0; + state.row_count = 0; - if state.addresses.len() < rows { - state.addresses.resize(rows, std::ptr::null::()); - state.state_places.resize(rows, StateAddr::new(0)); + return self.flush(state); } + let end = (state.flush_page_row + BATCH_SIZE).min(page.rows); + let rows = end - state.flush_page_row; state.group_columns.clear(); state.row_count = rows; - state.probe_state.adjust_vector(rows); + state.probe_state.row_count = rows; - for row in state.flush_offset..flush_end { - state.addresses[row - state.flush_offset] = self.get_read_ptr(row); - } + for idx in 0..rows { + state.addresses[idx] = self.data_ptr(page, idx + state.flush_page_row); + state.probe_state.group_hashes[idx] = + unsafe { core::ptr::read::(state.addresses[idx].add(self.hash_offset) as _) }; - self.flush_hashes(state); - for col_index in 0..self.group_types.len() { - let col = self.flush_column(col_index, state); - state.group_columns.push(col); - } - - if !self.aggrs.is_empty() { - for i in 0..rows { - state.state_places[i] = unsafe { + if !self.aggrs.is_empty() { + state.state_places[idx] = unsafe { StateAddr::new(core::ptr::read::( - state.addresses[i].add(self.state_offset) as _ + state.addresses[idx].add(self.state_offset) as _, ) as usize) }; } } - state.flush_offset = flush_end; - true - } - - fn flush_hashes(&self, state: &mut PayloadFlushState) { - let len = state.probe_state.row_count; - - for i in 0..len { - state.probe_state.group_hashes[i] = - unsafe { core::ptr::read::(state.addresses[i].add(self.hash_offset) as _) }; + for col_index in 0..self.group_types.len() { + let col = self.flush_column(col_index, state); + state.group_columns.push(col); } + + state.flush_page_row = end; + true } fn flush_column(&self, col_index: usize, state: &mut PayloadFlushState) -> Column { diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index fde820b31232..e6a6eaca7668 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -48,7 +48,7 @@ pub fn rowformat_size(data_type: &DataType) -> usize { }, DataType::Timestamp => 8, DataType::Date => 4, - DataType::Nullable(x) => rowformat_size(&x), + 
DataType::Nullable(x) => rowformat_size(x), DataType::Array(_) => todo!(), DataType::Map(_) => todo!(), DataType::Tuple(_) => todo!(), diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index f5673f66cc89..ff65013c5818 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -14,17 +14,19 @@ use std::collections::HashMap; +use crate::new_sel; use crate::PerfectHashBuilder; use crate::SelectVector; use crate::StateAddr; +use crate::BATCH_SIZE; /// ProbeState is the state to probe HT /// It could be reuse during multiple probe process #[derive(Debug)] pub struct ProbeState { - pub group_hashes: Vec, - pub addresses: Vec<*const u8>, - pub state_places: Vec, + pub group_hashes: [u64; BATCH_SIZE], + pub addresses: [*const u8; BATCH_SIZE], + pub state_places: [StateAddr; BATCH_SIZE], pub group_compare_vector: SelectVector, pub no_match_vector: SelectVector, pub empty_vector: SelectVector, @@ -38,34 +40,19 @@ unsafe impl Send for ProbeState {} unsafe impl Sync for ProbeState {} impl ProbeState { - pub fn with_capacity(len: usize) -> Self { + pub fn new() -> Self { Self { - group_hashes: vec![0; len], - addresses: vec![std::ptr::null::(); len], - state_places: vec![StateAddr::new(0); len], - group_compare_vector: vec![0; len], - no_match_vector: vec![0; len], - empty_vector: vec![0; len], - temp_vector: vec![0; len], + group_hashes: [0_u64; BATCH_SIZE], + addresses: [std::ptr::null::(); BATCH_SIZE], + state_places: [StateAddr::new(0); BATCH_SIZE], + group_compare_vector: new_sel(), + no_match_vector: new_sel(), + empty_vector: new_sel(), + temp_vector: new_sel(), partition_entries: HashMap::with_hasher(PerfectHashBuilder), row_count: 0, } } - - pub fn adjust_vector(&mut self, row_count: usize) { - if self.group_hashes.len() < row_count { - self.group_hashes.resize(row_count, 0); - self.addresses.resize(row_count, std::ptr::null::()); - self.state_places.resize(row_count, StateAddr::new(0)); - - self.group_compare_vector.resize(row_count, 0); - self.no_match_vector.resize(row_count, 0); - self.empty_vector.resize(row_count, 0); - self.temp_vector.resize(row_count, 0); - } - self.row_count = row_count; - } - pub fn set_incr_empty_vector(&mut self, row_count: usize) { for i in 0..row_count { self.empty_vector[i] = i; diff --git a/src/query/functions/tests/it/aggregates/agg_hashtable.rs b/src/query/functions/tests/it/aggregates/agg_hashtable.rs index 83ac22cae760..d20f6067b0e0 100644 --- a/src/query/functions/tests/it/aggregates/agg_hashtable.rs +++ b/src/query/functions/tests/it/aggregates/agg_hashtable.rs @@ -95,7 +95,7 @@ fn test_agg_hashtable() { let mut hashtable = AggregateHashTable::new(group_types.clone(), aggrs.clone(), config.clone()); - let mut state = ProbeState::with_capacity(BATCH_SIZE); + let mut state = ProbeState::new(); let _ = hashtable .add_groups(&mut state, &group_columns, ¶ms, n) .unwrap(); @@ -103,15 +103,15 @@ fn test_agg_hashtable() { let mut hashtable2 = AggregateHashTable::new(group_types.clone(), aggrs.clone(), config.clone()); - let mut state2 = ProbeState::with_capacity(BATCH_SIZE); + let mut state2 = ProbeState::new(); let _ = hashtable2 .add_groups(&mut state2, &group_columns, ¶ms, n) .unwrap(); - let mut flush_state = PayloadFlushState::with_capacity(BATCH_SIZE); + let mut flush_state = PayloadFlushState::new(); let _ = hashtable.combine(hashtable2, &mut flush_state); - let mut merge_state = PayloadFlushState::with_capacity(BATCH_SIZE); + 
let mut merge_state = PayloadFlushState::new(); let mut blocks = Vec::new(); loop { diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs index ba2fb3635176..df8226e59e89 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs @@ -55,14 +55,13 @@ impl TransformFinalAggregate { method: Method, params: Arc, ) -> Result> { - let max_block_size = params.max_block_size; Ok(Box::new(BlockMetaTransformer::create( input, output, TransformFinalAggregate:: { method, params, - flush_state: PayloadFlushState::with_capacity(max_block_size), + flush_state: PayloadFlushState::new(), }, ))) } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs index a0a239b4dea3..31a15db3f83a 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs @@ -144,8 +144,6 @@ impl TransformPartialAggregate { )) }; - let max_block_size = ctx.get_settings().get_max_block_size()? as usize; - Ok(AccumulatingTransformer::create( input, output, @@ -153,7 +151,7 @@ impl TransformPartialAggregate { method, params, hash_table, - probe_state: ProbeState::with_capacity(max_block_size), + probe_state: ProbeState::new(), settings: AggregateSettings::try_from(ctx)?, }, )) diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs index 334a1f12c96c..9d1b97d4c06c 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs @@ -49,14 +49,13 @@ impl TransformFinalGroupBy { method: Method, params: Arc, ) -> Result> { - let max_block_size = params.max_block_size; Ok(Box::new(BlockMetaTransformer::create( input, output, TransformFinalGroupBy:: { method, params, - flush_state: PayloadFlushState::with_capacity(max_block_size), + flush_state: PayloadFlushState::new(), }, ))) } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs index ae4af0abc2f8..2114ad8c0db5 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs @@ -132,14 +132,13 @@ impl TransformPartialGroupBy { )) }; - let max_block_size = ctx.get_settings().get_max_block_size()? 
as usize; Ok(AccumulatingTransformer::create( input, output, TransformPartialGroupBy:: { method, hash_table, - probe_state: ProbeState::with_capacity(max_block_size), + probe_state: ProbeState::new(), group_columns: params.group_columns.clone(), settings: GroupBySettings::try_from(ctx)?, }, diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs index b1a6778c680a..24d5544de23e 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs @@ -15,7 +15,6 @@ use std::any::Any; use std::collections::btree_map::Entry; use std::collections::BTreeMap; -use std::collections::HashMap; use std::marker::PhantomData; use std::mem::take; use std::sync::Arc; @@ -41,6 +40,7 @@ use common_pipeline_transforms::processors::profile_wrapper::ProfileStub; use common_pipeline_transforms::processors::transforms::Transformer; use common_profile::SharedProcessorProfiles; use common_storage::DataOperator; +use itertools::Itertools; use crate::pipelines::processors::transforms::aggregator::aggregate_meta::AggregateMeta; use crate::pipelines::processors::transforms::aggregator::aggregate_meta::HashTablePayload; @@ -100,7 +100,7 @@ impl output: OutputPort::create(), buckets_blocks: BTreeMap::new(), unsplitted_blocks: vec![], - flush_state: PayloadFlushState::with_capacity(8192), + flush_state: PayloadFlushState::new(), partition_payloads: vec![], initialized_all_inputs: false, max_partition_count: 0, @@ -442,17 +442,14 @@ impl Processor let group_types = payloads[0].group_types.clone(); let aggrs = payloads[0].aggrs.clone(); - let mut payload_map = HashMap::with_capacity(self.max_partition_count); + let mut payload_map = (0..self.max_partition_count).map(|_| vec![]).collect_vec(); for payload in payloads.into_iter() { for (bucket, p) in payload.payloads.into_iter().enumerate() { - payload_map - .entry(bucket as isize) - .or_insert_with(|| vec![]) - .push(p); + payload_map[bucket].push(p); } } - for bucket in 0..self.max_partition_count as isize { - let mut payloads = payload_map.remove(&bucket).unwrap_or(vec![]); + + for (bucket, mut payloads) in payload_map.into_iter().enumerate() { let mut partition_payload = PartitionedPayload::new(group_types.clone(), aggrs.clone(), 1); From 113c25317573617ef3f389944d454a5aed947d50 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 15 Nov 2023 18:11:36 +0800 Subject: [PATCH 15/28] feat(query): update --- .../src/aggregate/aggregate_hashtable.rs | 36 ++++++++++++++++--- src/query/expression/src/aggregate/mod.rs | 5 ++- .../src/aggregate/partitioned_payload.rs | 5 +-- src/query/expression/src/aggregate/payload.rs | 26 +++++++++----- .../expression/src/aggregate/payload_flush.rs | 14 ++++---- .../expression/src/aggregate/payload_row.rs | 2 +- .../expression/src/aggregate/probe_state.rs | 13 ++++--- .../tests/it/aggregates/agg_hashtable.rs | 10 +++--- .../aggregator/transform_aggregate_final.rs | 4 +-- .../aggregator/transform_aggregate_partial.rs | 4 +-- .../aggregator/transform_group_by_final.rs | 2 +- .../aggregator/transform_group_by_partial.rs | 4 +-- .../aggregator/transform_partition_bucket.rs | 2 +- 13 files changed, 84 insertions(+), 43 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs 
b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 86552b396687..f6f122b4db81 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -33,6 +33,7 @@ use crate::StateAddr; use crate::LOAD_FACTOR; use crate::MAX_ROWS_IN_HT; +// The high 16 bits are the salt, the low 48 bits are the pointer address pub type Entry = u64; pub struct AggregateHashTable { @@ -41,6 +42,10 @@ pub struct AggregateHashTable { current_radix_bits: u64, entries: Vec, capacity: usize, + disable_expand_ht: bool, + + // how many rows probe into this hash table + probe_input_rows: usize, } unsafe impl Send for AggregateHashTable {} @@ -68,6 +73,8 @@ impl AggregateHashTable { payload: PartitionedPayload::new(group_types, aggrs, 1 << config.initial_radix_bits), capacity, config, + disable_expand_ht: false, + probe_input_rows: 0, } } @@ -160,9 +167,8 @@ impl AggregateHashTable { ) -> usize { self.maybe_repartition(); - if self.config.partial_agg - && self.current_radix_bits == self.config.max_radix_bits - && self.capacity >= MAX_ROWS_IN_HT + if self.current_radix_bits == self.config.max_radix_bits + && self.should_disable_expand_hash_table() { // directly append rows state.set_incr_empty_vector(row_count); @@ -181,6 +187,8 @@ impl AggregateHashTable { self.resize(new_capacity); } + self.probe_input_rows += row_count; + let mut new_group_count = 0; let mut remaining_entries = row_count; @@ -379,9 +387,15 @@ impl AggregateHashTable { let bytes_per_partition = self.payload.memory_size() / self.payload.partition_count(); let mut new_radix_bits = self.current_radix_bits; + // 256k if bytes_per_partition > 256 * 1024 { new_radix_bits += self.config.repartition_radix_bits_incr; + + // If reducion is small and input rows will be very large, directly repartition to max radix bits + if self.should_disable_expand_hash_table() { + new_radix_bits = self.config.max_radix_bits; + } } loop { @@ -412,7 +426,7 @@ impl AggregateHashTable { 1, ); let payload = std::mem::replace(&mut self.payload, temp_payload); - let mut state = PayloadFlushState::new(); + let mut state = PayloadFlushState::default(); self.current_radix_bits = current_max_radix_bits; self.payload = payload.repartition(1 << current_max_radix_bits, &mut state); @@ -465,6 +479,20 @@ impl AggregateHashTable { self.capacity = new_capacity; } + pub fn should_disable_expand_hash_table(&mut self) -> bool { + if self.disable_expand_ht { + return true; + } + + if !self.config.partial_agg || self.capacity < MAX_ROWS_IN_HT { + return false; + } + + let ratio = self.probe_input_rows as f64 / self.len() as f64; + self.disable_expand_ht = ratio <= self.config.min_reduction; + self.disable_expand_ht + } + pub fn initial_capacity() -> usize { 4096 } diff --git a/src/query/expression/src/aggregate/mod.rs b/src/query/expression/src/aggregate/mod.rs index ede53f509b33..1ccaf90f7bd1 100644 --- a/src/query/expression/src/aggregate/mod.rs +++ b/src/query/expression/src/aggregate/mod.rs @@ -47,7 +47,7 @@ pub fn new_sel() -> SelectVector { // A batch size to probe, flush, repartition, etc. 
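
Two behaviours added in this commit are easy to miss in the diff above: partial-aggregate workers publish the largest radix-bit count they have repartitioned to through the shared `AtomicU64` in `HashTableConfig`, and once the table is large they stop expanding it when the input is no longer reducing well. The sketch below is illustrative only and not part of the diff; the helper names are hypothetical and the constants are copied from this commit.

use std::sync::atomic::{AtomicU64, Ordering};

/// Raise the shared radix-bits hint to `new_radix_bits` if it is larger, and
/// return the value this worker should repartition to. This mirrors the
/// `compare_exchange` loop in `maybe_repartition` above.
fn propose_radix_bits(shared: &AtomicU64, new_radix_bits: u64) -> u64 {
    loop {
        let current = shared.load(Ordering::SeqCst);
        if current >= new_radix_bits {
            // Another worker already asked for at least as many bits; follow it.
            return current;
        }
        if shared
            .compare_exchange(current, new_radix_bits, Ordering::SeqCst, Ordering::SeqCst)
            .is_ok()
        {
            return new_radix_bits;
        }
    }
}

/// The companion heuristic: once capacity has reached MAX_ROWS_IN_HT, stop
/// growing the table if the input averages at most `min_reduction` rows per
/// distinct group (the patch also requires `config.partial_agg` to be true).
fn should_stop_expanding(
    probe_input_rows: usize,
    group_count: usize,
    capacity: usize,
    min_reduction: f64,
) -> bool {
    const MAX_ROWS_IN_HT: usize = 32 * 1024;
    if capacity < MAX_ROWS_IN_HT || group_count == 0 {
        return false;
    }
    (probe_input_rows as f64 / group_count as f64) <= min_reduction
}
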
pub(crate) const BATCH_SIZE: usize = 2048; pub(crate) const LOAD_FACTOR: f64 = 1.5; -pub(crate) const MAX_ROWS_IN_HT: usize = 256 * 1024; +pub(crate) const MAX_ROWS_IN_HT: usize = 32 * 1024; pub(crate) const MAX_PAGE_SIZE: usize = 256 * 1024; #[derive(Clone, Debug)] @@ -59,6 +59,8 @@ pub struct HashTableConfig { pub repartition_radix_bits_incr: u64, pub block_fill_factor: f64, pub partial_agg: bool, + // min reduction ratio to control whether to expand the ht + pub min_reduction: f64, } impl Default for HashTableConfig { @@ -70,6 +72,7 @@ impl Default for HashTableConfig { repartition_radix_bits_incr: 2, block_fill_factor: 1.8, partial_agg: false, + min_reduction: 2.1, } } } diff --git a/src/query/expression/src/aggregate/partitioned_payload.rs b/src/query/expression/src/aggregate/partitioned_payload.rs index 80bce48c02ba..c6e2eddf82df 100644 --- a/src/query/expression/src/aggregate/partitioned_payload.rs +++ b/src/query/expression/src/aggregate/partitioned_payload.rs @@ -65,7 +65,7 @@ impl PartitionedPayload { let validity_offsets = payloads[0].validity_offsets.clone(); let hash_offset = payloads[0].hash_offset; let state_offset = payloads[0].state_offset; - let state_addr_offsets = payloads[0].state_addr_offsets; + let state_addr_offsets = payloads[0].state_addr_offsets.clone(); let state_layout = payloads[0].state_layout; PartitionedPayload { @@ -176,9 +176,6 @@ impl PartitionedPayload { state.clear(); while self.gather_flush(&other, state) { - if state.row_count == 0 { - continue; - } // copy rows for partition in 0..self.partition_count as usize { let payload = &mut self.payloads[partition]; diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 5aa61ec7caac..9730b6137bbd 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -159,12 +159,14 @@ impl Payload { || self.pages[self.current_write_page - 1].rows == self.pages[self.current_write_page - 1].capacity { - self.pages.push(Page { - data: Vec::with_capacity(self.row_per_page * self.tuple_size), - rows: 0, - capacity: self.row_per_page, - }); - self.current_write_page = self.pages.len(); + self.current_write_page += 1; + if self.current_write_page > self.pages.len() { + self.pages.push(Page { + data: Vec::with_capacity(self.row_per_page * self.tuple_size), + rows: 0, + capacity: self.row_per_page, + }); + } } &mut self.pages[self.current_write_page - 1] } @@ -182,10 +184,14 @@ impl Payload { group_columns: &[Column], ) { let tuple_size = self.tuple_size; + let mut page = self.writable_page(); for idx in select_vector.iter().take(new_group_rows).copied() { - let page = self.writable_page(); address[idx] = unsafe { page.data.as_ptr().add(page.rows * tuple_size) as *const u8 }; page.rows += 1; + + if page.rows == page.capacity { + page = self.writable_page(); + } } self.total_rows += new_group_rows; @@ -299,9 +305,9 @@ impl Payload { address: &[*const u8], ) { let tuple_size = self.tuple_size; + let mut page = self.writable_page(); for i in 0..row_count { let index = select_vector[i]; - let page = self.writable_page(); unsafe { std::ptr::copy_nonoverlapping( address[index], @@ -310,6 +316,10 @@ impl Payload { ) } page.rows += 1; + + if page.rows == page.capacity { + page = self.writable_page(); + } } self.total_rows += row_count; diff --git a/src/query/expression/src/aggregate/payload_flush.rs b/src/query/expression/src/aggregate/payload_flush.rs index d497ed1d9160..97c965652a32 100644 --- 
a/src/query/expression/src/aggregate/payload_flush.rs +++ b/src/query/expression/src/aggregate/payload_flush.rs @@ -47,13 +47,10 @@ pub struct PayloadFlushState { pub state_places: [StateAddr; BATCH_SIZE], } -unsafe impl Send for PayloadFlushState {} -unsafe impl Sync for PayloadFlushState {} - -impl PayloadFlushState { - pub fn new() -> PayloadFlushState { +impl Default for PayloadFlushState { + fn default() -> Self { PayloadFlushState { - probe_state: ProbeState::new(), + probe_state: ProbeState::default(), group_columns: Vec::new(), aggregate_results: Vec::new(), row_count: 0, @@ -64,7 +61,12 @@ impl PayloadFlushState { state_places: [StateAddr::new(0); BATCH_SIZE], } } +} +unsafe impl Send for PayloadFlushState {} +unsafe impl Sync for PayloadFlushState {} + +impl PayloadFlushState { pub fn clear(&mut self) { self.row_count = 0; self.flush_partition = 0; diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index e6a6eaca7668..f05bc1041bb1 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -416,7 +416,7 @@ unsafe fn row_match_column_type( let validity_address = address[idx].add(validity_offset); let is_set2 = core::ptr::read::(validity_address as _) != 0; let is_set = is_all_set || validity.get_bit_unchecked(idx); - if is_set && is_set { + if is_set && is_set2 { let address = address[idx].add(col_offset); let scalar = core::ptr::read::<::Scalar>(address as _); let value = T::index_column_unchecked(&col, idx); diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index ff65013c5818..500ee20e73d0 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -36,11 +36,8 @@ pub struct ProbeState { pub partition_entries: HashMap, } -unsafe impl Send for ProbeState {} -unsafe impl Sync for ProbeState {} - -impl ProbeState { - pub fn new() -> Self { +impl Default for ProbeState { + fn default() -> Self { Self { group_hashes: [0_u64; BATCH_SIZE], addresses: [std::ptr::null::(); BATCH_SIZE], @@ -53,6 +50,12 @@ impl ProbeState { row_count: 0, } } +} + +unsafe impl Send for ProbeState {} +unsafe impl Sync for ProbeState {} + +impl ProbeState { pub fn set_incr_empty_vector(&mut self, row_count: usize) { for i in 0..row_count { self.empty_vector[i] = i; diff --git a/src/query/functions/tests/it/aggregates/agg_hashtable.rs b/src/query/functions/tests/it/aggregates/agg_hashtable.rs index d20f6067b0e0..b47f41b7c2f6 100644 --- a/src/query/functions/tests/it/aggregates/agg_hashtable.rs +++ b/src/query/functions/tests/it/aggregates/agg_hashtable.rs @@ -53,7 +53,6 @@ use itertools::Itertools; fn test_agg_hashtable() { let factory = AggregateFunctionFactory::instance(); let m: usize = 4; - const BATCH_SIZE: usize = 8192; for n in [100, 1000, 10_000, 100_000] { let columns = vec![ StringType::from_data( @@ -89,13 +88,12 @@ fn test_agg_hashtable() { ]; let params: Vec> = aggrs.iter().map(|_| vec![columns[1].clone()]).collect(); - let radix_bits = 3; let config = HashTableConfig::default(); let mut hashtable = AggregateHashTable::new(group_types.clone(), aggrs.clone(), config.clone()); - let mut state = ProbeState::new(); + let mut state = ProbeState::default(); let _ = hashtable .add_groups(&mut state, &group_columns, ¶ms, n) .unwrap(); @@ -103,15 +101,15 @@ fn test_agg_hashtable() { let mut hashtable2 = AggregateHashTable::new(group_types.clone(), 
aggrs.clone(), config.clone()); - let mut state2 = ProbeState::new(); + let mut state2 = ProbeState::default(); let _ = hashtable2 .add_groups(&mut state2, &group_columns, ¶ms, n) .unwrap(); - let mut flush_state = PayloadFlushState::new(); + let mut flush_state = PayloadFlushState::default(); let _ = hashtable.combine(hashtable2, &mut flush_state); - let mut merge_state = PayloadFlushState::new(); + let mut merge_state = PayloadFlushState::default(); let mut blocks = Vec::new(); loop { diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs index df8226e59e89..0c5a892e57e8 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs @@ -61,7 +61,7 @@ impl TransformFinalAggregate { TransformFinalAggregate:: { method, params, - flush_state: PayloadFlushState::new(), + flush_state: PayloadFlushState::default(), }, ))) } @@ -76,7 +76,7 @@ where Method: HashMethodBounds if let AggregateMeta::Partitioned { bucket, data } = meta { let mut reach_limit = false; let arena = Arc::new(Bump::new()); - let hashtable = self.method.create_hash_table::(arena.clone())?; + let hashtable = self.method.create_hash_table::(arena)?; let _dropper = AggregateHashTableDropper::create(self.params.clone()); let mut hash_cell = HashTableCell::::create(hashtable, _dropper); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs index 31a15db3f83a..872a093751f4 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs @@ -126,7 +126,7 @@ impl TransformPartialAggregate { ) -> Result> { let hash_table = if !enable_experimental_aggregate_hashtable { let arena = Arc::new(Bump::new()); - let hashtable = method.create_hash_table(arena.clone())?; + let hashtable = method.create_hash_table(arena)?; let _dropper = AggregateHashTableDropper::create(params.clone()); let hashtable = HashTableCell::create(hashtable, _dropper); @@ -151,7 +151,7 @@ impl TransformPartialAggregate { method, params, hash_table, - probe_state: ProbeState::new(), + probe_state: ProbeState::default(), settings: AggregateSettings::try_from(ctx)?, }, )) diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs index 9d1b97d4c06c..400029bf5ece 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs @@ -55,7 +55,7 @@ impl TransformFinalGroupBy { TransformFinalGroupBy:: { method, params, - flush_state: PayloadFlushState::new(), + flush_state: PayloadFlushState::default(), }, ))) } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs index 2114ad8c0db5..4a95a4aced62 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs @@ -121,7 +121,7 @@ impl TransformPartialGroupBy { ) -> Result> { let hash_table = if !enable_experimental_aggregate_hashtable { let arena = Arc::new(Bump::new()); - let hashtable = method.create_hash_table(arena.clone())?; + let hashtable = method.create_hash_table(arena)?; let _dropper = GroupByHashTableDropper::::create(); HashTable::HashTable(HashTableCell::create(hashtable, _dropper)) } else { @@ -138,7 +138,7 @@ impl TransformPartialGroupBy { TransformPartialGroupBy:: { method, hash_table, - probe_state: ProbeState::new(), + probe_state: ProbeState::default(), group_columns: params.group_columns.clone(), settings: GroupBySettings::try_from(ctx)?, }, diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs index 24d5544de23e..a375513b98e9 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs @@ -100,7 +100,7 @@ impl output: OutputPort::create(), buckets_blocks: BTreeMap::new(), unsplitted_blocks: vec![], - flush_state: PayloadFlushState::new(), + flush_state: PayloadFlushState::default(), partition_payloads: vec![], initialized_all_inputs: false, max_partition_count: 0, From 539c66156e3ed5726a948198567b899ddc2a7b2b Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 15 Nov 2023 18:43:25 +0800 Subject: [PATCH 16/28] feat(query): update --- .../expression/src/aggregate/group_hash.rs | 4 +-- src/query/expression/src/utils/block_debug.rs | 30 +++++++++++++++---- .../tests/it/aggregates/agg_hashtable.rs | 8 ++--- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/query/expression/src/aggregate/group_hash.rs b/src/query/expression/src/aggregate/group_hash.rs index 175014e25164..ab12b2b5c0c8 100644 --- a/src/query/expression/src/aggregate/group_hash.rs +++ b/src/query/expression/src/aggregate/group_hash.rs @@ -188,9 +188,9 @@ macro_rules! impl_agg_hash_for_primitive_types { fn agg_hash(&self) -> u64 { let mut x = *self as u64; x ^= x >> 32; - x *= 0xd6e8feb86659fd93; + x = x.wrapping_mul(0xd6e8feb86659fd93); x ^= x >> 32; - x *= 0xd6e8feb86659fd93; + x = x.wrapping_mul(0xd6e8feb86659fd93); x ^= x >> 32; x } diff --git a/src/query/expression/src/utils/block_debug.rs b/src/query/expression/src/utils/block_debug.rs index b9e74e2ee540..bc67230cce3b 100644 --- a/src/query/expression/src/utils/block_debug.rs +++ b/src/query/expression/src/utils/block_debug.rs @@ -60,6 +60,19 @@ pub fn assert_block_value_eq(a: &DataBlock, b: &DataBlock) { } } +pub fn assert_block_value_sort_eq(a: &DataBlock, b: &DataBlock) { + assert!(a.num_columns() == b.num_columns()); + assert!(a.num_rows() == b.num_rows()); + + let a = pretty_format_blocks(&[a.clone()]).unwrap(); + let b = pretty_format_blocks(&[b.clone()]).unwrap(); + + let a: Vec<&str> = get_lines(&a); + let b: Vec<&str> = get_lines(&b); + + assert_eq!(a, b); +} + /// Assert with order insensitive. 
/// ['a', 'b'] equals ['b', 'a'] pub fn assert_blocks_sorted_eq_with_name(test_name: &str, expect: Vec<&str>, blocks: &[DataBlock]) { @@ -72,6 +85,16 @@ pub fn assert_blocks_sorted_eq_with_name(test_name: &str, expect: Vec<&str>, blo } let formatted = pretty_format_blocks(blocks).unwrap(); + let actual_lines: Vec<&str> = get_lines(&formatted); + + assert_eq!( + expected_lines, actual_lines, + "{:#?}\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + test_name, expected_lines, actual_lines + ); +} + +fn get_lines(formatted: &String) -> Vec<&str> { let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); // sort except for header + footer @@ -79,12 +102,7 @@ pub fn assert_blocks_sorted_eq_with_name(test_name: &str, expect: Vec<&str>, blo if num_lines > 3 { actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() } - - assert_eq!( - expected_lines, actual_lines, - "{:#?}\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - test_name, expected_lines, actual_lines - ); + actual_lines } /// Assert with order insensitive. diff --git a/src/query/functions/tests/it/aggregates/agg_hashtable.rs b/src/query/functions/tests/it/aggregates/agg_hashtable.rs index b47f41b7c2f6..acdb6320749d 100644 --- a/src/query/functions/tests/it/aggregates/agg_hashtable.rs +++ b/src/query/functions/tests/it/aggregates/agg_hashtable.rs @@ -26,6 +26,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use common_expression::block_debug::assert_block_value_sort_eq; use common_expression::types::ArgType; use common_expression::types::BooleanType; use common_expression::types::Float32Type; @@ -148,9 +149,8 @@ fn test_agg_hashtable() { UInt64Type::from_data(vec![urows / 2, urows / 2, urows / 2, urows / 2]), ]); - for (column, expected) in block.columns().iter().zip(expected_results.iter()) { - let column = column.value.as_column().unwrap(); - assert_eq!(column, expected) - } + let block_expected = DataBlock::new_from_columns(expected_results.clone()); + + assert_block_value_sort_eq(&block, &block_expected); } } From 52051876872251750b9067c6b7b9ab888607a78e Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 15 Nov 2023 23:06:28 +0800 Subject: [PATCH 17/28] feat(query): update --- .../expression/src/aggregate/aggregate_hashtable.rs | 6 ++---- .../expression/src/aggregate/partitioned_payload.rs | 9 ++------- src/query/expression/src/aggregate/payload.rs | 11 ++++++----- src/query/expression/src/utils/block_debug.rs | 2 +- 4 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index f6f122b4db81..ed0bea425a39 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -138,6 +138,7 @@ impl AggregateHashTable { state.addresses[i].add(self.payload.state_offset) as _, ) as usize) }; + debug_assert_ne!(usize::from(state.state_places[i]) % 8, 0); } for ((aggr, params), addr_offset) in self @@ -430,6 +431,7 @@ impl AggregateHashTable { self.current_radix_bits = current_max_radix_bits; self.payload = payload.repartition(1 << current_max_radix_bits, &mut state); + self.resize(self.capacity); } } @@ -439,10 +441,6 @@ impl AggregateHashTable { } pub fn resize(&mut self, new_capacity: usize) { - if new_capacity == self.capacity { - return; - } - let mask = (new_capacity - 1) as u64; let mut entries = vec![0; new_capacity]; diff --git 
a/src/query/expression/src/aggregate/partitioned_payload.rs b/src/query/expression/src/aggregate/partitioned_payload.rs index c6e2eddf82df..ebbfca7c51f1 100644 --- a/src/query/expression/src/aggregate/partitioned_payload.rs +++ b/src/query/expression/src/aggregate/partitioned_payload.rs @@ -165,7 +165,7 @@ impl PartitionedPayload { } } - pub fn combine_single(&mut self, other: Payload, state: &mut PayloadFlushState) { + pub fn combine_single(&mut self, mut other: Payload, state: &mut PayloadFlushState) { if other.len() == 0 { return; } @@ -181,15 +181,10 @@ impl PartitionedPayload { let payload = &mut self.payloads[partition]; if let Some(sel) = &state.probe_state.partition_entries.get_mut(&partition) { payload.copy_rows(&sel.0, sel.1, &state.addresses); - - payload.external_arena.push(other.arena.clone()); - payload - .external_arena - .extend_from_slice(&other.external_arena); + payload.fetch_arenas(&mut other); } } } - other.forget(); } } diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 9730b6137bbd..1de720e320e4 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -291,11 +291,10 @@ impl Payload { } pub fn combine(&mut self, mut other: Payload) { - other.state_move_out = true; - self.total_rows += other.pages.iter().map(|x| x.rows).sum::(); - self.external_arena.push(other.arena.clone()); self.pages.append(other.pages.as_mut()); + + self.fetch_arenas(&mut other); } pub fn copy_rows( @@ -330,8 +329,10 @@ impl Payload { ); } - pub fn forget(mut self) { - self.state_move_out = true; + pub fn fetch_arenas(&mut self, other: &mut Self) { + self.external_arena.push(other.arena.clone()); + self.external_arena.extend_from_slice(&other.external_arena); + other.state_move_out = true; } } diff --git a/src/query/expression/src/utils/block_debug.rs b/src/query/expression/src/utils/block_debug.rs index bc67230cce3b..a0c3d3f80f6e 100644 --- a/src/query/expression/src/utils/block_debug.rs +++ b/src/query/expression/src/utils/block_debug.rs @@ -94,7 +94,7 @@ pub fn assert_blocks_sorted_eq_with_name(test_name: &str, expect: Vec<&str>, blo ); } -fn get_lines(formatted: &String) -> Vec<&str> { +fn get_lines(formatted: &str) -> Vec<&str> { let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); // sort except for header + footer From 490435495c6a4eda32ebeef8dae4697f6f559751 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 15 Nov 2023 23:58:45 +0800 Subject: [PATCH 18/28] feat(query): update --- src/common/hashtable/src/lib.rs | 2 +- src/common/hashtable/src/utils.rs | 10 ++++++++++ src/query/expression/src/aggregate/payload_row.rs | 6 +++--- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/common/hashtable/src/lib.rs b/src/common/hashtable/src/lib.rs index 910fa287dd75..faf5ddb84397 100644 --- a/src/common/hashtable/src/lib.rs +++ b/src/common/hashtable/src/lib.rs @@ -112,4 +112,4 @@ pub use partitioned_hashtable::hash2bucket; pub type HashJoinHashMap = hashjoin_hashtable::HashJoinHashTable; pub type StringHashJoinHashMap = hashjoin_string_hashtable::HashJoinStringHashTable; pub use traits::HashJoinHashtableLike; -pub use utils::sse::memcmp_sse; +pub use utils::fast_memcmp; diff --git a/src/common/hashtable/src/utils.rs b/src/common/hashtable/src/utils.rs index bf97b9941abb..2d43578e376c 100644 --- a/src/common/hashtable/src/utils.rs +++ b/src/common/hashtable/src/utils.rs @@ -97,6 +97,16 @@ pub unsafe fn read_le(data: *const u8, 
len: usize) -> u64 { } } +#[inline] +pub fn fast_memcmp(a: &[u8], b: &[u8]) -> bool { + #[cfg(all(target_arch = "x86_64", target_feature = "sse4.2"))] + unsafe { + sse::compare_sse2(a, b) + } + #[cfg(not(all(any(target_arch = "x86_64"), target_feature = "sse4.2")))] + a == b +} + #[cfg(all(target_arch = "x86_64", target_feature = "sse4.2"))] pub mod sse { use std::arch::x86_64::*; diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index f05bc1041bb1..9316022d4815 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -14,7 +14,6 @@ use bumpalo::Bump; use common_arrow::arrow::bitmap::Bitmap; -use common_hashtable::memcmp_sse; use ethnum::i256; use crate::store; @@ -347,7 +346,7 @@ unsafe fn row_match_string_column( } else { let data_address = core::ptr::read::(address as _) as usize as *const u8; let scalar = std::slice::from_raw_parts(data_address, len); - equal = memcmp_sse(scalar, value); + equal = common_hashtable::fast_memcmp(scalar, value); } } else { equal = is_set == is_set2; @@ -375,7 +374,8 @@ unsafe fn row_match_string_column( } else { let data_address = core::ptr::read::(address as _) as usize as *const u8; let scalar = std::slice::from_raw_parts(data_address, len); - equal = memcmp_sse(scalar, value); + + equal = common_hashtable::fast_memcmp(scalar, value); } if equal { From e2706da7a998f999c000a9be73de4cd71bd4b35b Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Thu, 16 Nov 2023 00:10:19 +0800 Subject: [PATCH 19/28] feat(query): update --- src/common/hashtable/src/utils.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/common/hashtable/src/utils.rs b/src/common/hashtable/src/utils.rs index 2d43578e376c..ed0ac6070142 100644 --- a/src/common/hashtable/src/utils.rs +++ b/src/common/hashtable/src/utils.rs @@ -97,13 +97,15 @@ pub unsafe fn read_le(data: *const u8, len: usize) -> u64 { } } +#[cfg(all(target_arch = "x86_64", target_feature = "sse4.2"))] +#[inline] +pub fn fast_memcmp(a: &[u8], b: &[u8]) -> bool { + unsafe { sse::memcmp_sse(a, b) } +} + +#[cfg(not(all(any(target_arch = "x86_64"), target_feature = "sse4.2")))] #[inline] pub fn fast_memcmp(a: &[u8], b: &[u8]) -> bool { - #[cfg(all(target_arch = "x86_64", target_feature = "sse4.2"))] - unsafe { - sse::compare_sse2(a, b) - } - #[cfg(not(all(any(target_arch = "x86_64"), target_feature = "sse4.2")))] a == b } From d18ca5379869f76c21fefdbca69b4395d536f7e9 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Thu, 16 Nov 2023 04:13:38 +0800 Subject: [PATCH 20/28] feat(query): update --- src/query/expression/src/aggregate/aggregate_hashtable.rs | 2 +- src/query/expression/src/kernels/utils.rs | 2 +- src/query/service/tests/it/storages/testdata/settings_table.txt | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index ed0bea425a39..5ef492a43d95 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -138,7 +138,7 @@ impl AggregateHashTable { state.addresses[i].add(self.payload.state_offset) as _, ) as usize) }; - debug_assert_ne!(usize::from(state.state_places[i]) % 8, 0); + debug_assert_eq!(usize::from(state.state_places[i]) % 8, 0); } for ((aggr, params), addr_offset) in self diff --git 
a/src/query/expression/src/kernels/utils.rs b/src/query/expression/src/kernels/utils.rs index e18e96da7336..0db8517fd157 100644 --- a/src/query/expression/src/kernels/utils.rs +++ b/src/query/expression/src/kernels/utils.rs @@ -66,7 +66,7 @@ pub unsafe fn set_vec_len_by_ptr(vec: &mut Vec, ptr: *const T) { } /// # Safety -/// # As: copy_nonoverlapping +/// # As: core::ptr::write #[inline] pub unsafe fn store(val: T, ptr: *mut u8) { core::ptr::write(ptr as _, val) diff --git a/src/query/service/tests/it/storages/testdata/settings_table.txt b/src/query/service/tests/it/storages/testdata/settings_table.txt index f7d706596335..8ac35c1ddcfd 100644 --- a/src/query/service/tests/it/storages/testdata/settings_table.txt +++ b/src/query/service/tests/it/storages/testdata/settings_table.txt @@ -17,6 +17,7 @@ DB.Table: 'system'.'settings', Table: settings-table_id:1, ver:0, Engine: System | 'enable_distributed_recluster' | '0' | '0' | 'SESSION' | 'Enable distributed execution of table recluster.' | 'UInt64' | | 'enable_distributed_replace_into' | '0' | '0' | 'SESSION' | 'Enable distributed execution of replace into.' | 'UInt64' | | 'enable_dphyp' | '1' | '1' | 'SESSION' | 'Enables dphyp join order algorithm.' | 'UInt64' | +| 'enable_experimental_aggregate_hashtable' | '0' | '0' | 'SESSION' | 'Enables experimental aggregate hashtable' | 'UInt64' | | 'enable_experimental_merge_into' | '0' | '0' | 'SESSION' | 'Enable experimental merge into.' | 'UInt64' | | 'enable_hive_parquet_predict_pushdown' | '1' | '1' | 'SESSION' | 'Enable hive parquet predict pushdown by setting this variable to 1, default value: 1' | 'UInt64' | | 'enable_parquet_page_index' | '1' | '1' | 'SESSION' | 'Enables parquet page index' | 'UInt64' | From 3c39641d1935baaaeb3e210fbc9f3112946aff14 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Thu, 16 Nov 2023 04:26:11 +0800 Subject: [PATCH 21/28] feat(query): update --- src/query/expression/src/aggregate/partitioned_payload.rs | 4 +--- src/query/expression/src/aggregate/payload.rs | 8 ++------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/query/expression/src/aggregate/partitioned_payload.rs b/src/query/expression/src/aggregate/partitioned_payload.rs index ebbfca7c51f1..3db90fb34a78 100644 --- a/src/query/expression/src/aggregate/partitioned_payload.rs +++ b/src/query/expression/src/aggregate/partitioned_payload.rs @@ -13,9 +13,7 @@ // limitations under the License. 
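As an aside on the `agg_hash` finaliser rewritten in patch 16 above: switching from `*=` to `wrapping_mul` makes the intentional 64-bit overflow explicit, so debug builds no longer panic on overflow while release builds keep producing the same bits. A standalone sketch of that finaliser for primitive keys, runnable on its own:

    // Mirrors the mix used by `agg_hash` for integer keys: xor-shift, then two
    // rounds of multiplication by a large odd constant, with wrapping semantics.
    fn mix64(mut x: u64) -> u64 {
        x ^= x >> 32;
        x = x.wrapping_mul(0xd6e8feb86659fd93);
        x ^= x >> 32;
        x = x.wrapping_mul(0xd6e8feb86659fd93);
        x ^= x >> 32;
        x
    }

    fn main() {
        // The mix is a bijection on u64 (xor-shift and odd multiplication are both
        // invertible), so distinct keys never collide at this stage.
        assert_eq!(mix64(42), mix64(42));
        assert_ne!(mix64(1), mix64(2));
    }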
use std::alloc::Layout; -use std::sync::Arc; -use bumpalo::Bump; use itertools::Itertools; use super::payload::Payload; @@ -57,7 +55,7 @@ impl PartitionedPayload { debug_assert_eq!(1 << radix_bits, partition_count); let payloads = (0..partition_count) - .map(|_| Payload::new(Arc::new(Bump::new()), group_types.clone(), aggrs.clone())) + .map(|_| Payload::new(group_types.clone(), aggrs.clone())) .collect_vec(); let group_sizes = payloads[0].group_sizes.clone(); diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 1de720e320e4..9b65f2b9c819 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -74,11 +74,7 @@ pub type Pages = Vec; // TODO FIXME impl Payload { - pub fn new( - arena: Arc, - group_types: Vec, - aggrs: Vec, - ) -> Self { + pub fn new(group_types: Vec, aggrs: Vec) -> Self { let mut state_addr_offsets = Vec::new(); let state_layout = if !aggrs.is_empty() { Some(get_layout_offsets(&aggrs, &mut state_addr_offsets).unwrap()) @@ -120,7 +116,7 @@ impl Payload { let row_per_page = (u16::MAX as usize).min(MAX_PAGE_SIZE / tuple_size).max(1); Self { - arena, + arena: Arc::new(Bump::new()), external_arena: vec![], state_move_out: false, pages: vec![], From 01ac099a15d01003da2fa1193babeceb20cbb86f Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Thu, 16 Nov 2023 06:22:12 +0800 Subject: [PATCH 22/28] feat(query): update --- src/query/expression/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/query/expression/src/lib.rs b/src/query/expression/src/lib.rs index 5673a3f516b9..d5e8f9da7fdb 100755 --- a/src/query/expression/src/lib.rs +++ b/src/query/expression/src/lib.rs @@ -15,6 +15,7 @@ #![allow(clippy::uninlined_format_args)] #![allow(clippy::len_without_is_empty)] #![allow(clippy::needless_lifetimes)] +#![allow(clippy::arc_with_non_send_sync)] // FIXME: we should avoid this by implementing Ord correctly. #![allow(clippy::non_canonical_partial_ord_impl)] #![allow(incomplete_features)] From 38a99bd82f81bccc1b455cfa03d83d66d90c83b6 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Thu, 16 Nov 2023 19:48:39 +0800 Subject: [PATCH 23/28] feat(query): update --- .../pipelines/builders/builder_aggregate.rs | 19 ++-- .../aggregator/aggregator_params.rs | 3 + .../aggregator/transform_aggregate_final.rs | 96 +++++++++++-------- .../aggregator/transform_aggregate_partial.rs | 3 +- .../aggregator/transform_group_by_final.rs | 53 ++++++++++ .../aggregator/transform_group_by_partial.rs | 3 +- 6 files changed, 127 insertions(+), 50 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_aggregate.rs b/src/query/service/src/pipelines/builders/builder_aggregate.rs index 14598969f0c2..dd7bec4f568e 100644 --- a/src/query/service/src/pipelines/builders/builder_aggregate.rs +++ b/src/query/service/src/pipelines/builders/builder_aggregate.rs @@ -103,10 +103,16 @@ impl PipelineBuilder { self.build_pipeline(&aggregate.input)?; let max_block_size = self.settings.get_max_block_size()?; + let enable_experimental_aggregate_hashtable = self + .settings + .get_enable_experimental_aggregate_hashtable()? 
+ && self.ctx.get_cluster().is_empty(); + let params = Self::build_aggregator_params( aggregate.input.output_schema()?, &aggregate.group_by, &aggregate.agg_funcs, + enable_experimental_aggregate_hashtable, max_block_size as usize, None, )?; @@ -128,10 +134,6 @@ impl PipelineBuilder { } let efficiently_memory = self.settings.get_efficiently_memory_group_by()?; - let enable_experimental_aggregate_hashtable = self - .settings - .get_enable_experimental_aggregate_hashtable()? - && self.ctx.get_cluster().is_empty(); let group_cols = ¶ms.group_columns; let schema_before_group_by = params.input_schema.clone(); @@ -151,7 +153,6 @@ impl PipelineBuilder { output, params.clone(), partial_agg_config.clone(), - enable_experimental_aggregate_hashtable, ), }), false => with_mappedhash_method!(|T| match method.clone() { @@ -162,7 +163,6 @@ impl PipelineBuilder { output, params.clone(), partial_agg_config.clone(), - enable_experimental_aggregate_hashtable, ), }), }?; @@ -235,11 +235,16 @@ impl PipelineBuilder { pub(crate) fn build_aggregate_final(&mut self, aggregate: &AggregateFinal) -> Result<()> { let max_block_size = self.settings.get_max_block_size()?; + let enable_experimental_aggregate_hashtable = self + .settings + .get_enable_experimental_aggregate_hashtable()? + && self.ctx.get_cluster().is_empty(); let params = Self::build_aggregator_params( aggregate.before_group_by_schema.clone(), &aggregate.group_by, &aggregate.agg_funcs, + enable_experimental_aggregate_hashtable, max_block_size as usize, aggregate.limit, )?; @@ -339,6 +344,7 @@ impl PipelineBuilder { input_schema: DataSchemaRef, group_by: &[IndexType], agg_funcs: &[AggregateFunctionDesc], + enable_experimental_aggregate_hashtable: bool, max_block_size: usize, limit: Option, ) -> Result> { @@ -379,6 +385,7 @@ impl PipelineBuilder { &group_by, &aggs, &agg_args, + enable_experimental_aggregate_hashtable, max_block_size, limit, )?; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs index 81c6c9be8c4a..534e02ca0431 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs @@ -38,6 +38,7 @@ pub struct AggregatorParams { pub layout: Option, pub offsets_aggregate_states: Vec, + pub enable_experimental_aggregate_hashtable: bool, pub max_block_size: usize, // Limit is push down to AggregatorTransform pub limit: Option, @@ -50,6 +51,7 @@ impl AggregatorParams { group_columns: &[usize], agg_funcs: &[AggregateFunctionRef], agg_args: &[Vec], + enable_experimental_aggregate_hashtable: bool, max_block_size: usize, limit: Option, ) -> Result> { @@ -68,6 +70,7 @@ impl AggregatorParams { aggregate_functions_arguments: agg_args.to_vec(), layout: states_layout, offsets_aggregate_states: states_offsets, + enable_experimental_aggregate_hashtable, max_block_size, limit, })) diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs index d98896c831b9..757c288f8f27 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs @@ -65,6 +65,57 @@ impl TransformFinalAggregate { }, ))) } + + fn transform_agg_hashtable(&mut 
self, meta: AggregateMeta) -> Result { + let mut agg_hashtable: Option = None; + if let AggregateMeta::Partitioned { bucket: _, data } = meta { + for bucket_data in data { + match bucket_data { + AggregateMeta::AggregateHashTable(payload) => match agg_hashtable.as_mut() { + Some(ht) => { + ht.combine_payloads(&payload, &mut self.flush_state)?; + } + None => { + let capacity = + AggregateHashTable::get_capacity_for_count(payload.len()); + + let mut hashtable = AggregateHashTable::new_with_capacity( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + HashTableConfig::default().with_initial_radix_bits(0), + capacity, + ); + hashtable.combine_payloads(&payload, &mut self.flush_state)?; + agg_hashtable = Some(hashtable); + } + }, + _ => unreachable!(), + } + } + } + + if let Some(mut ht) = agg_hashtable { + let mut blocks = vec![]; + self.flush_state.clear(); + loop { + if ht.merge_result(&mut self.flush_state)? { + let mut cols = self.flush_state.take_aggregate_results(); + cols.extend_from_slice(&self.flush_state.group_columns); + + blocks.push(DataBlock::new_from_columns(cols)); + } else { + break; + } + } + + if blocks.is_empty() { + return Ok(DataBlock::empty()); + } + return DataBlock::concat(&blocks); + } + + Ok(DataBlock::empty()) + } } impl BlockMetaTransform> for TransformFinalAggregate @@ -73,6 +124,10 @@ where Method: HashMethodBounds const NAME: &'static str = "TransformFinalAggregate"; fn transform(&mut self, meta: AggregateMeta) -> Result { + if self.params.enable_experimental_aggregate_hashtable { + return self.transform_agg_hashtable(meta); + } + if let AggregateMeta::Partitioned { bucket, data } = meta { let mut reach_limit = false; let arena = Arc::new(Bump::new()); @@ -80,8 +135,6 @@ where Method: HashMethodBounds let _dropper = AggregateHashTableDropper::create(self.params.clone()); let mut hash_cell = HashTableCell::::create(hashtable, _dropper); - let mut agg_hashtable: Option = None; - for bucket_data in data { match bucket_data { AggregateMeta::Spilled(_) => unreachable!(), @@ -186,45 +239,8 @@ where Method: HashMethodBounds } } }, - AggregateMeta::AggregateHashTable(payload) => match agg_hashtable.as_mut() { - Some(ht) => { - ht.combine_payloads(&payload, &mut self.flush_state)?; - } - None => { - let capacity = - AggregateHashTable::get_capacity_for_count(payload.len()); - - let mut hashtable = AggregateHashTable::new_with_capacity( - self.params.group_data_types.clone(), - self.params.aggregate_functions.clone(), - HashTableConfig::default().with_initial_radix_bits(0), - capacity, - ); - hashtable.combine_payloads(&payload, &mut self.flush_state)?; - agg_hashtable = Some(hashtable); - } - }, - } - } - - if let Some(mut ht) = agg_hashtable { - let mut blocks = vec![]; - self.flush_state.clear(); - loop { - if ht.merge_result(&mut self.flush_state)? 
{ - let mut cols = self.flush_state.take_aggregate_results(); - cols.extend_from_slice(&self.flush_state.group_columns); - - blocks.push(DataBlock::new_from_columns(cols)); - } else { - break; - } - } - - if blocks.is_empty() { - return Ok(DataBlock::empty()); + AggregateMeta::AggregateHashTable(_) => unreachable!(), } - return DataBlock::concat(&blocks); } let keys_len = hash_cell.hashtable.len(); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs index 77a5adcd05ba..5ab20f2b8df6 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs @@ -122,9 +122,8 @@ impl TransformPartialAggregate { output: Arc, params: Arc, config: HashTableConfig, - enable_experimental_aggregate_hashtable: bool, ) -> Result> { - let hash_table = if !enable_experimental_aggregate_hashtable { + let hash_table = if !params.enable_experimental_aggregate_hashtable { let arena = Arc::new(Bump::new()); let hashtable = method.create_hash_table(arena)?; let _dropper = AggregateHashTableDropper::create(params.clone()); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs index 22bf43d22fb4..5b218679b637 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs @@ -59,6 +59,55 @@ impl TransformFinalGroupBy { }, ))) } + + fn transform_agg_hashtable(&mut self, meta: AggregateMeta) -> Result { + let mut agg_hashtable: Option = None; + if let AggregateMeta::Partitioned { bucket: _, data } = meta { + for bucket_data in data { + match bucket_data { + AggregateMeta::AggregateHashTable(payload) => match agg_hashtable.as_mut() { + Some(ht) => { + ht.combine_payloads(&payload, &mut self.flush_state)?; + } + None => { + let capacity = + AggregateHashTable::get_capacity_for_count(payload.len()); + let mut hashtable = AggregateHashTable::new_with_capacity( + self.params.group_data_types.clone(), + self.params.aggregate_functions.clone(), + HashTableConfig::default().with_initial_radix_bits(0), + capacity, + ); + hashtable.combine_payloads(&payload, &mut self.flush_state)?; + agg_hashtable = Some(hashtable); + } + }, + _ => unreachable!(), + } + } + } + + if let Some(mut ht) = agg_hashtable { + let mut blocks = vec![]; + self.flush_state.clear(); + loop { + if ht.merge_result(&mut self.flush_state)? 
{ + blocks.push(DataBlock::new_from_columns( + self.flush_state.take_group_columns(), + )); + } else { + break; + } + } + + if blocks.is_empty() { + return Ok(DataBlock::empty()); + } + + return DataBlock::concat(&blocks); + } + Ok(DataBlock::empty()) + } } impl BlockMetaTransform> for TransformFinalGroupBy @@ -67,6 +116,10 @@ where Method: HashMethodBounds const NAME: &'static str = "TransformFinalGroupBy"; fn transform(&mut self, meta: AggregateMeta) -> Result { + if self.params.enable_experimental_aggregate_hashtable { + return self.transform_agg_hashtable(meta); + } + if let AggregateMeta::Partitioned { bucket, data } = meta { let arena = Arc::new(Bump::new()); let mut hashtable = self.method.create_hash_table::<()>(arena)?; diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs index 31deed0a01ec..d70546fde379 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs @@ -117,9 +117,8 @@ impl TransformPartialGroupBy { output: Arc, params: Arc, config: HashTableConfig, - enable_experimental_aggregate_hashtable: bool, ) -> Result> { - let hash_table = if !enable_experimental_aggregate_hashtable { + let hash_table = if !params.enable_experimental_aggregate_hashtable { let arena = Arc::new(Bump::new()); let hashtable = method.create_hash_table(arena)?; let _dropper = GroupByHashTableDropper::::create(); From a9e678817e1c4ac2c2082a1180d8dce2276e1d91 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 17 Nov 2023 17:52:15 +0800 Subject: [PATCH 24/28] feat(query): update --- .../src/aggregate/aggregate_hashtable.rs | 21 ++++-- src/query/expression/src/aggregate/mod.rs | 9 ++- .../src/aggregate/partitioned_payload.rs | 72 +++++++++---------- src/query/expression/src/aggregate/payload.rs | 20 ++---- .../expression/src/aggregate/probe_state.rs | 20 +++--- 5 files changed, 73 insertions(+), 69 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 5ef492a43d95..a79a57486013 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -30,8 +30,9 @@ use crate::ColumnBuilder; use crate::HashTableConfig; use crate::Payload; use crate::StateAddr; +use crate::L2_MAX_ROWS_IN_HT; +use crate::L3_MAX_ROWS_IN_HT; use crate::LOAD_FACTOR; -use crate::MAX_ROWS_IN_HT; // The high 16 bits are the salt, the low 48 bits are the pointer address pub type Entry = u64; @@ -166,8 +167,6 @@ impl AggregateHashTable { group_columns: &[Column], row_count: usize, ) -> usize { - self.maybe_repartition(); - if self.current_radix_bits == self.config.max_radix_bits && self.should_disable_expand_hash_table() { @@ -390,7 +389,7 @@ impl AggregateHashTable { let mut new_radix_bits = self.current_radix_bits; // 256k - if bytes_per_partition > 256 * 1024 { + if bytes_per_partition >= 256 * 1024 { new_radix_bits += self.config.repartition_radix_bits_incr; // If reducion is small and input rows will be very large, directly repartition to max radix bits @@ -431,7 +430,6 @@ impl AggregateHashTable { self.current_radix_bits = current_max_radix_bits; self.payload = payload.repartition(1 << current_max_radix_bits, &mut state); - self.resize(self.capacity); } 
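The `resize` hunk that continues just below moves `maybe_repartition` into the resize path; the probe-and-reinsert loop it drives appears in full in patches 27–28 further down. A condensed sketch of that re-insert step, with `Option` standing in for the packed 64-bit entry (the real table packs a 16-bit salt and a 48-bit pointer into one `u64`, and the salt here is taken from the high hash bits purely for illustration):

    // Re-insert one row into a grown, power-of-two sized entry table by linear probing.
    fn reinsert(entries: &mut [Option<(u16, usize)>], hash: u64, row_ptr: usize) {
        let mask = entries.len() as u64 - 1;
        let salt = (hash >> 48) as u16;
        let mut slot = (hash & mask) as usize;
        while entries[slot].is_some() {
            slot += 1;
            if slot == entries.len() {
                slot = 0; // wrap around; the load factor guarantees a free slot exists
            }
        }
        entries[slot] = Some((salt, row_ptr));
    }

    fn main() {
        let mut entries = vec![None; 8];
        reinsert(&mut entries, 0x1234_5678_9abc_def0, 0xdead);
        assert!(entries.iter().any(|e| matches!(e, Some((_, 0xdead)))));
    }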
} @@ -441,6 +439,8 @@ impl AggregateHashTable { } pub fn resize(&mut self, new_capacity: usize) { + self.maybe_repartition(); + let mask = (new_capacity - 1) as u64; let mut entries = vec![0; new_capacity]; @@ -482,12 +482,19 @@ impl AggregateHashTable { return true; } - if !self.config.partial_agg || self.capacity < MAX_ROWS_IN_HT { + if !self.config.partial_agg || self.len() < L2_MAX_ROWS_IN_HT { return false; } let ratio = self.probe_input_rows as f64 / self.len() as f64; - self.disable_expand_ht = ratio <= self.config.min_reduction; + + let min_reduction = if self.len() >= L3_MAX_ROWS_IN_HT { + self.config.min_reductions[1] + } else { + self.config.min_reductions[0] + }; + + self.disable_expand_ht = ratio <= min_reduction; self.disable_expand_ht } diff --git a/src/query/expression/src/aggregate/mod.rs b/src/query/expression/src/aggregate/mod.rs index 1ccaf90f7bd1..1ef49e224b69 100644 --- a/src/query/expression/src/aggregate/mod.rs +++ b/src/query/expression/src/aggregate/mod.rs @@ -47,9 +47,11 @@ pub fn new_sel() -> SelectVector { // A batch size to probe, flush, repartition, etc. pub(crate) const BATCH_SIZE: usize = 2048; pub(crate) const LOAD_FACTOR: f64 = 1.5; -pub(crate) const MAX_ROWS_IN_HT: usize = 32 * 1024; pub(crate) const MAX_PAGE_SIZE: usize = 256 * 1024; +pub(crate) const L2_MAX_ROWS_IN_HT: usize = 1024 * 1024 / 8; +pub(crate) const L3_MAX_ROWS_IN_HT: usize = 16 * 1024 * 1024 / 8; + #[derive(Clone, Debug)] pub struct HashTableConfig { // Max radix bits across all threads, this is a hint to repartition @@ -60,7 +62,8 @@ pub struct HashTableConfig { pub block_fill_factor: f64, pub partial_agg: bool, // min reduction ratio to control whether to expand the ht - pub min_reduction: f64, + // {1024 * 1024, 1.1} / {1024 * 1024, 2.0}, + pub min_reductions: [f64; 2], } impl Default for HashTableConfig { @@ -72,7 +75,7 @@ impl Default for HashTableConfig { repartition_radix_bits_incr: 2, block_fill_factor: 1.8, partial_agg: false, - min_reduction: 2.1, + min_reductions: [1.1, 2.0], } } } diff --git a/src/query/expression/src/aggregate/partitioned_payload.rs b/src/query/expression/src/aggregate/partitioned_payload.rs index 3db90fb34a78..c82fd6cb71a4 100644 --- a/src/query/expression/src/aggregate/partitioned_payload.rs +++ b/src/query/expression/src/aggregate/partitioned_payload.rs @@ -13,7 +13,9 @@ // limitations under the License. 
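The `partitioned_payload.rs` hunk that follows gives the partitioned payload one shared bump arena plus an `arenas: Vec<Arc<Bump>>` field, and has `combine` move those handles across, so row pointers taken from an absorbed payload stay valid after it is dropped. A minimal sketch of that ownership idea (`RowSet` is hypothetical, not the real `PartitionedPayload`):

    use std::sync::Arc;
    use bumpalo::Bump;

    struct RowSet {
        rows: Vec<*const u8>,   // raw pointers into bump-allocated row memory
        arenas: Vec<Arc<Bump>>, // every arena those pointers may point into
    }

    impl RowSet {
        fn combine(&mut self, mut other: RowSet) {
            self.rows.append(&mut other.rows);
            // Moving the Arc handles over is what keeps `other`'s allocations
            // alive once `other` itself goes away.
            self.arenas.append(&mut other.arenas);
        }
    }

    fn main() {
        let arena = Arc::new(Bump::new());
        let key = arena.alloc_slice_copy(&b"group key"[..]);
        let ptr = key.as_ptr();
        let other = RowSet { rows: vec![ptr], arenas: vec![arena] };
        let mut base = RowSet { rows: vec![], arenas: vec![] };
        base.combine(other);
        // Still safe to read: the arena travelled together with the pointer.
        assert_eq!(unsafe { *base.rows[0] }, b'g');
    }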
use std::alloc::Layout; +use std::sync::Arc; +use bumpalo::Bump; use itertools::Itertools; use super::payload::Payload; @@ -37,6 +39,8 @@ pub struct PartitionedPayload { pub state_addr_offsets: Vec, pub state_layout: Option, + arenas: Vec>, + partition_count: u64, mask_v: u64, shift_v: u64, @@ -54,8 +58,10 @@ impl PartitionedPayload { let radix_bits = partition_count.trailing_zeros() as u64; debug_assert_eq!(1 << radix_bits, partition_count); + let arena = Arc::new(Bump::new()); + let payloads = (0..partition_count) - .map(|_| Payload::new(group_types.clone(), aggrs.clone())) + .map(|_| Payload::new(arena.clone(), group_types.clone(), aggrs.clone())) .collect_vec(); let group_sizes = payloads[0].group_sizes.clone(); @@ -78,6 +84,8 @@ impl PartitionedPayload { state_addr_offsets, state_layout, partition_count, + + arenas: vec![arena], mask_v: mask(radix_bits), shift_v: shift(radix_bits), } @@ -99,34 +107,28 @@ impl PartitionedPayload { ); } else { // generate partition selection indices - state.reset_partitions(); + state.reset_partitions(self.partition_count()); let select_vector = &state.empty_vector; for idx in select_vector.iter().take(new_group_rows).copied() { let hash = state.group_hashes[idx]; let partition_idx = ((hash & self.mask_v) >> self.shift_v) as usize; - match state.partition_entries.get_mut(&partition_idx) { - Some((v, count)) => { - v[*count] = idx; - *count += 1; - } - None => { - let mut v = [0; BATCH_SIZE]; - v[0] = idx; - state.partition_entries.insert(partition_idx, (v, 1)); - } - } + let sel = &mut state.partition_entries[partition_idx]; + + sel[state.partition_count[partition_idx]] = idx; + state.partition_count[partition_idx] += 1; } for partition_index in 0..self.payloads.len() { - if let Some((select_vector, count)) = - state.partition_entries.get_mut(&partition_index) - { + let count = state.partition_count[partition_index]; + if count > 0 { + let sel = &state.partition_entries[partition_index]; + self.payloads[partition_index].reserve_append_rows( - select_vector, + sel, &state.group_hashes, &mut state.addresses, - *count, + count, group_columns, ); } @@ -149,7 +151,7 @@ impl PartitionedPayload { new_partition_payload } - pub fn combine(&mut self, other: PartitionedPayload, state: &mut PayloadFlushState) { + pub fn combine(&mut self, mut other: PartitionedPayload, state: &mut PayloadFlushState) { if other.partition_count == self.partition_count { for (l, r) in self.payloads.iter_mut().zip(other.payloads.into_iter()) { l.combine(r); @@ -161,9 +163,10 @@ impl PartitionedPayload { self.combine_single(payload, state) } } + self.arenas.append(&mut other.arenas); } - pub fn combine_single(&mut self, mut other: Payload, state: &mut PayloadFlushState) { + fn combine_single(&mut self, mut other: Payload, state: &mut PayloadFlushState) { if other.len() == 0 { return; } @@ -177,12 +180,15 @@ impl PartitionedPayload { // copy rows for partition in 0..self.partition_count as usize { let payload = &mut self.payloads[partition]; - if let Some(sel) = &state.probe_state.partition_entries.get_mut(&partition) { - payload.copy_rows(&sel.0, sel.1, &state.addresses); - payload.fetch_arenas(&mut other); + let count = state.probe_state.partition_count[partition]; + + if count > 0 { + let sel = &state.probe_state.partition_entries[partition]; + payload.copy_rows(sel, count, &state.addresses); } } } + other.state_move_out = true; } } @@ -205,7 +211,7 @@ impl PartitionedPayload { let rows = end - state.flush_page_row; state.row_count = rows; - state.probe_state.reset_partitions(); + 
state.probe_state.reset_partitions(self.partition_count()); for idx in 0..rows { state.addresses[idx] = other.data_ptr(page, idx + state.flush_page_row); @@ -214,20 +220,10 @@ impl PartitionedPayload { unsafe { core::ptr::read::(state.addresses[idx].add(self.hash_offset) as _) }; let partition_idx = ((hash & self.mask_v) >> self.shift_v) as usize; - match state.probe_state.partition_entries.get_mut(&partition_idx) { - Some((v, count)) => { - v[*count] = idx; - *count += 1; - } - None => { - let mut v = [0; BATCH_SIZE]; - v[0] = idx; - state - .probe_state - .partition_entries - .insert(partition_idx, (v, 1)); - } - } + + let sel = &mut state.probe_state.partition_entries[partition_idx]; + sel[state.probe_state.partition_count[partition_idx]] = idx; + state.probe_state.partition_count[partition_idx] += 1; } state.flush_page_row = end; true diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 9b65f2b9c819..3fd9f440556d 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -37,7 +37,6 @@ use crate::MAX_PAGE_SIZE; // [STATE_ADDRS] is the state_addrs of the aggregate functions, 8 bytes each pub struct Payload { pub arena: Arc, - pub external_arena: Vec>, // if true, the states are moved out of the payload into other payload, and will not be dropped pub state_move_out: bool, pub group_types: Vec, @@ -74,7 +73,11 @@ pub type Pages = Vec; // TODO FIXME impl Payload { - pub fn new(group_types: Vec, aggrs: Vec) -> Self { + pub fn new( + arena: Arc, + group_types: Vec, + aggrs: Vec, + ) -> Self { let mut state_addr_offsets = Vec::new(); let state_layout = if !aggrs.is_empty() { Some(get_layout_offsets(&aggrs, &mut state_addr_offsets).unwrap()) @@ -116,8 +119,7 @@ impl Payload { let row_per_page = (u16::MAX as usize).min(MAX_PAGE_SIZE / tuple_size).max(1); Self { - arena: Arc::new(Bump::new()), - external_arena: vec![], + arena, state_move_out: false, pages: vec![], current_write_page: 0, @@ -146,7 +148,7 @@ impl Payload { } pub fn memory_size(&self) -> usize { - self.pages.iter().map(|x| x.data.capacity()).sum() + self.total_rows * self.tuple_size } #[inline] @@ -289,8 +291,6 @@ impl Payload { pub fn combine(&mut self, mut other: Payload) { self.total_rows += other.pages.iter().map(|x| x.rows).sum::(); self.pages.append(other.pages.as_mut()); - - self.fetch_arenas(&mut other); } pub fn copy_rows( @@ -324,12 +324,6 @@ impl Payload { self.pages.iter().map(|x| x.rows).sum::() ); } - - pub fn fetch_arenas(&mut self, other: &mut Self) { - self.external_arena.push(other.arena.clone()); - self.external_arena.extend_from_slice(&other.external_arena); - other.state_move_out = true; - } } impl Drop for Payload { diff --git a/src/query/expression/src/aggregate/probe_state.rs b/src/query/expression/src/aggregate/probe_state.rs index 500ee20e73d0..8e5932b5c022 100644 --- a/src/query/expression/src/aggregate/probe_state.rs +++ b/src/query/expression/src/aggregate/probe_state.rs @@ -12,10 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
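The `probe_state.rs` hunk that follows replaces the per-partition `HashMap` (keyed by a no-op hasher) with two plain vectors indexed by partition id; the scatter loops just above index them directly instead of going through a hash lookup. A condensed sketch of that scatter step, using growable `Vec`s instead of the fixed-size selection vectors and assuming the partition id is a contiguous run of hash bits:

    // Bucket row indices by the partition id carried in each row's hash.
    fn scatter_rows(hashes: &[u64], radix_bits: u32, shift: u32) -> Vec<Vec<usize>> {
        let partitions = 1usize << radix_bits;
        let mask = ((partitions as u64) - 1) << shift;
        let mut entries = vec![Vec::new(); partitions];
        for (row, hash) in hashes.iter().enumerate() {
            let p = ((hash & mask) >> shift) as usize;
            entries[p].push(row); // indexing by partition id replaces the old HashMap lookup
        }
        entries
    }

    fn main() {
        // 4 partitions taken from the low two hash bits (shift = 0 only for the example).
        let buckets = scatter_rows(&[0, 1, 2, 3, 4, 5, 6, 7], 2, 0);
        assert_eq!(buckets[1], vec![1, 5]);
    }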
-use std::collections::HashMap; - use crate::new_sel; -use crate::PerfectHashBuilder; use crate::SelectVector; use crate::StateAddr; use crate::BATCH_SIZE; @@ -33,7 +30,8 @@ pub struct ProbeState { pub temp_vector: SelectVector, pub row_count: usize, - pub partition_entries: HashMap, + pub partition_entries: Vec, + pub partition_count: Vec, } impl Default for ProbeState { @@ -46,7 +44,8 @@ impl Default for ProbeState { no_match_vector: new_sel(), empty_vector: new_sel(), temp_vector: new_sel(), - partition_entries: HashMap::with_hasher(PerfectHashBuilder), + partition_entries: vec![], + partition_count: vec![], row_count: 0, } } @@ -62,9 +61,14 @@ impl ProbeState { } } - pub fn reset_partitions(&mut self) { - for (_, (_, p)) in self.partition_entries.iter_mut() { - *p = 0; + pub fn reset_partitions(&mut self, partition_count: usize) { + if self.partition_entries.len() < partition_count { + self.partition_entries.resize(partition_count, new_sel()); + self.partition_count.resize(partition_count, 0); + } + + for i in 0..partition_count { + self.partition_count[i] = 0; } } } From 9bada690e95ee0aca7260e5ad989f5cb1fff2d47 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 17 Nov 2023 18:21:32 +0800 Subject: [PATCH 25/28] feat(query): update --- src/query/expression/src/aggregate/aggregate_hashtable.rs | 7 +++++-- src/query/expression/src/aggregate/mod.rs | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index a79a57486013..5ae521727ad5 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -494,8 +494,11 @@ impl AggregateHashTable { self.config.min_reductions[0] }; - self.disable_expand_ht = ratio <= min_reduction; - self.disable_expand_ht + if self.len() >= L3_MAX_ROWS_IN_HT { + self.disable_expand_ht = ratio <= min_reduction; + return self.disable_expand_ht; + } + ratio <= min_reduction } pub fn initial_capacity() -> usize { diff --git a/src/query/expression/src/aggregate/mod.rs b/src/query/expression/src/aggregate/mod.rs index 1ef49e224b69..700e3569fc4e 100644 --- a/src/query/expression/src/aggregate/mod.rs +++ b/src/query/expression/src/aggregate/mod.rs @@ -62,7 +62,7 @@ pub struct HashTableConfig { pub block_fill_factor: f64, pub partial_agg: bool, // min reduction ratio to control whether to expand the ht - // {1024 * 1024, 1.1} / {1024 * 1024, 2.0}, + // {1024 * 1024, 1.1} / {16 * 1024 * 1024, 2.0}, pub min_reductions: [f64; 2], } From 27f66f7f85f87fbd174cd346b3bca9b5726c8e35 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Sun, 19 Nov 2023 06:59:50 +0800 Subject: [PATCH 26/28] feat(query): update --- src/query/expression/src/aggregate/mod.rs | 66 ------------------- .../src/aggregate/partitioned_payload.rs | 4 +- src/query/expression/src/aggregate/payload.rs | 11 +++- .../aggregator/transform_partition_bucket.rs | 14 +++- 4 files changed, 22 insertions(+), 73 deletions(-) diff --git a/src/query/expression/src/aggregate/mod.rs b/src/query/expression/src/aggregate/mod.rs index 700e3569fc4e..4122915fcf5f 100644 --- a/src/query/expression/src/aggregate/mod.rs +++ b/src/query/expression/src/aggregate/mod.rs @@ -25,7 +25,6 @@ mod payload_flush; mod payload_row; mod probe_state; -use std::hash::Hasher; use std::sync::atomic::AtomicU64; use std::sync::Arc; @@ -92,68 +91,3 @@ impl HashTableConfig { self } } - -pub struct PerfectHashBuilder; - -// 
NOTE: This is a dummy hasher that just returns the value passed to it. -// This is only used for i8-i64, u8-u64, isize and usize keys. -pub struct PerfectHash { - val: u64, -} - -impl std::hash::BuildHasher for PerfectHashBuilder { - type Hasher = PerfectHash; - fn build_hasher(&self) -> PerfectHash { - PerfectHash { val: 0 } - } -} - -impl Hasher for PerfectHash { - fn finish(&self) -> u64 { - self.val - } - - fn write(&mut self, _bytes: &[u8]) { - unreachable!() - } - - fn write_u8(&mut self, i: u8) { - self.val = i as u64; - } - - fn write_u16(&mut self, i: u16) { - self.val = i as u64; - } - - fn write_u32(&mut self, i: u32) { - self.val = i as u64; - } - - fn write_u64(&mut self, i: u64) { - self.val = i; - } - - fn write_usize(&mut self, i: usize) { - self.val = i as u64; - } - - fn write_i8(&mut self, i: i8) { - self.val = i as u64; - } - - fn write_i16(&mut self, i: i16) { - self.val = i as u64; - } - - fn write_i32(&mut self, i: i32) { - self.val = i as u64; - } - - fn write_i64(&mut self, i: i64) { - self.val = i as u64; - } - - fn write_isize(&mut self, i: isize) { - self.val = i as u64; - } -} diff --git a/src/query/expression/src/aggregate/partitioned_payload.rs b/src/query/expression/src/aggregate/partitioned_payload.rs index c82fd6cb71a4..795df40c9253 100644 --- a/src/query/expression/src/aggregate/partitioned_payload.rs +++ b/src/query/expression/src/aggregate/partitioned_payload.rs @@ -39,7 +39,7 @@ pub struct PartitionedPayload { pub state_addr_offsets: Vec, pub state_layout: Option, - arenas: Vec>, + pub arenas: Vec>, partition_count: u64, mask_v: u64, @@ -166,7 +166,7 @@ impl PartitionedPayload { self.arenas.append(&mut other.arenas); } - fn combine_single(&mut self, mut other: Payload, state: &mut PayloadFlushState) { + pub fn combine_single(&mut self, mut other: Payload, state: &mut PayloadFlushState) { if other.len() == 0 { return; } diff --git a/src/query/expression/src/aggregate/payload.rs b/src/query/expression/src/aggregate/payload.rs index 3fd9f440556d..c2a69ca1eedf 100644 --- a/src/query/expression/src/aggregate/payload.rs +++ b/src/query/expression/src/aggregate/payload.rs @@ -303,6 +303,7 @@ impl Payload { let mut page = self.writable_page(); for i in 0..row_count { let index = select_vector[i]; + unsafe { std::ptr::copy_nonoverlapping( address[index], @@ -335,9 +336,13 @@ impl Drop for Payload { for page in self.pages.iter() { for row in 0..page.rows { unsafe { - let state_addr = self.data_ptr(page, row).add(self.state_offset); - aggr.drop_state(StateAddr::new(state_addr as usize + *addr_offset)) - }; + let state_place = StateAddr::new(core::ptr::read::( + self.data_ptr(page, row).add(self.state_offset) as _, + ) + as usize); + + aggr.drop_state(state_place.next(*addr_offset)); + } } } } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs index 9c5dbf533402..e26091a5cecc 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_partition_bucket.rs @@ -443,17 +443,27 @@ impl Processor let aggrs = payloads[0].aggrs.clone(); let mut payload_map = (0..self.max_partition_count).map(|_| vec![]).collect_vec(); - for payload in payloads.into_iter() { + + // All arenas should be kept in the bucket partition payload + let mut arenas = vec![]; + + for mut payload in payloads.into_iter() { 
for (bucket, p) in payload.payloads.into_iter().enumerate() { payload_map[bucket].push(p); } + arenas.append(&mut payload.arenas); } for (bucket, mut payloads) in payload_map.into_iter().enumerate() { let mut partition_payload = PartitionedPayload::new(group_types.clone(), aggrs.clone(), 1); - partition_payload.payloads.append(payloads.as_mut()); + for payload in payloads.drain(0..) { + partition_payload.combine_single(payload, &mut self.flush_state); + } + + partition_payload.arenas.extend_from_slice(&arenas); + self.buckets_blocks .insert(bucket as isize, vec![DataBlock::empty_with_meta( AggregateMeta::::create_agg_hashtable(partition_payload), From f2299d06b7ae94f9120747e89f48e6180a8eb6b0 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 22 Nov 2023 00:10:39 +0800 Subject: [PATCH 27/28] feat(query): update --- .../src/aggregate/aggregate_hashtable.rs | 64 ++++++++++++++----- .../expression/src/aggregate/group_hash.rs | 10 ++- .../expression/src/aggregate/payload_flush.rs | 13 ++-- .../expression/src/aggregate/payload_row.rs | 50 +++++++++++---- 4 files changed, 100 insertions(+), 37 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 5ae521727ad5..ba2a2d77f413 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -14,6 +14,7 @@ // A new AggregateHashtable which inspired by duckdb's https://duckdb.org/2022/03/07/aggregate-hashtable.html +use std::intrinsics::assume; use std::sync::atomic::Ordering; use common_exception::Result; @@ -213,10 +214,13 @@ impl AggregateHashTable { state.no_match_vector[i] }; + unsafe { assume(index < state.group_hashes.len()) }; let ht_offset = (state.group_hashes[index] as usize + iter_times) & (self.capacity - 1); let salt = state.group_hashes[index].get_salt(); + + unsafe { assume(ht_offset < entries.len()) }; let entry = &mut entries[ht_offset]; if entry.is_occupied() { @@ -390,12 +394,14 @@ impl AggregateHashTable { // 256k if bytes_per_partition >= 256 * 1024 { - new_radix_bits += self.config.repartition_radix_bits_incr; + // direct repartition to max radix bits + new_radix_bits = self.config.max_radix_bits; + // new_radix_bits += self.config.repartition_radix_bits_incr; // If reducion is small and input rows will be very large, directly repartition to max radix bits - if self.should_disable_expand_hash_table() { - new_radix_bits = self.config.max_radix_bits; - } + // if self.should_disable_expand_hash_table() { + // new_radix_bits = self.config.max_radix_bits; + // } } loop { @@ -445,6 +451,9 @@ impl AggregateHashTable { let mut entries = vec![0; new_capacity]; + #[cfg(all(target_arch = "x86_64", target_feature = "avx"))] + let zeros = unsafe { std::arch::x86_64::_mm256_setzero_si256() }; + // iterate over payloads and copy to new entries for payload in self.payload.payloads.iter() { for page in payload.pages.iter() { @@ -455,24 +464,47 @@ impl AggregateHashTable { let hash: u64 = unsafe { core::ptr::read(row_ptr.add(payload.hash_offset) as _) }; - let mut hash_slot = hash & mask; - while entries[hash_slot as usize].is_occupied() { + let mut hash_slot = (hash & mask) as usize; + unsafe { assume(hash_slot < entries.len()) }; + while entries[hash_slot].is_occupied() { hash_slot += 1; - if hash_slot >= new_capacity as u64 { + #[cfg(all(target_arch = "x86_64", target_feature = "avx"))] + { + unsafe { + while hash_slot + 4 < new_capacity { + let read = 
std::arch::x86_64::_mm256_loadu_si256( + entries.as_ptr().add(hash_slot) as *const _, + ); + let result = std::arch::x86_64::_mm256_cmpeq_epi64(read, zeros); + let mask = std::arch::x86_64::_mm256_movemask_epi8(result); + + // Check if the mask is zero, which indicates all values are non-zero. + if mask == 0 { + hash_slot += 4; + } else { + break; + } + } + } + } + + if hash_slot >= new_capacity { hash_slot = 0; } + unsafe { assume(hash_slot < entries.len()) }; } - debug_assert!(!entries[hash_slot as usize].is_occupied()); + debug_assert!(!entries[hash_slot].is_occupied()); // set value - entries[hash_slot as usize].set_salt(hash.get_salt()); - entries[hash_slot as usize].set_pointer(row_ptr); - debug_assert!(entries[hash_slot as usize].is_occupied()); - debug_assert_eq!(entries[hash_slot as usize].get_pointer(), row_ptr); - debug_assert_eq!(entries[hash_slot as usize].get_salt(), hash.get_salt()); + + unsafe { assume(hash_slot < entries.len()) }; + entries[hash_slot].set_salt(hash.get_salt()); + entries[hash_slot].set_pointer(row_ptr); + debug_assert!(entries[hash_slot].is_occupied()); + debug_assert_eq!(entries[hash_slot].get_pointer(), row_ptr); + debug_assert_eq!(entries[hash_slot].get_salt(), hash.get_salt()); } } } - self.entries = entries; self.capacity = new_capacity; } @@ -498,11 +530,11 @@ impl AggregateHashTable { self.disable_expand_ht = ratio <= min_reduction; return self.disable_expand_ht; } - ratio <= min_reduction + false } pub fn initial_capacity() -> usize { - 4096 + 8192 } pub fn get_capacity_for_count(count: usize) -> usize { diff --git a/src/query/expression/src/aggregate/group_hash.rs b/src/query/expression/src/aggregate/group_hash.rs index ab12b2b5c0c8..3f636abcf5a8 100644 --- a/src/query/expression/src/aggregate/group_hash.rs +++ b/src/query/expression/src/aggregate/group_hash.rs @@ -149,6 +149,7 @@ trait AggHash { // Rewrite using chatgpt impl AggHash for [u8] { + #[inline] fn agg_hash(&self) -> u64 { const M: u64 = 0xc6a4a7935bd1e995; const SEED: u64 = 0xe17a1465; @@ -184,7 +185,7 @@ impl AggHash for [u8] { macro_rules! 
impl_agg_hash_for_primitive_types { ($t: ty) => { impl AggHash for $t { - #[inline(always)] + #[inline] fn agg_hash(&self) -> u64 { let mut x = *self as u64; x ^= x >> 32; @@ -208,25 +209,28 @@ impl_agg_hash_for_primitive_types!(u64); impl_agg_hash_for_primitive_types!(i64); impl AggHash for bool { + #[inline] fn agg_hash(&self) -> u64 { *self as u64 } } impl AggHash for i128 { + #[inline] fn agg_hash(&self) -> u64 { self.to_le_bytes().agg_hash() } } impl AggHash for i256 { + #[inline] fn agg_hash(&self) -> u64 { self.to_le_bytes().agg_hash() } } impl AggHash for OrderedFloat { - #[inline(always)] + #[inline] fn agg_hash(&self) -> u64 { if self.is_nan() { f32::NAN.to_bits().agg_hash() @@ -237,7 +241,7 @@ impl AggHash for OrderedFloat { } impl AggHash for OrderedFloat { - #[inline(always)] + #[inline] fn agg_hash(&self) -> u64 { if self.is_nan() { f64::NAN.to_bits().agg_hash() diff --git a/src/query/expression/src/aggregate/payload_flush.rs b/src/query/expression/src/aggregate/payload_flush.rs index 97c965652a32..2dbaea2a4368 100644 --- a/src/query/expression/src/aggregate/payload_flush.rs +++ b/src/query/expression/src/aggregate/payload_flush.rs @@ -217,11 +217,16 @@ impl Payload { for idx in 0..len { let str_len = core::ptr::read::(state.addresses[idx].add(col_offset) as _) as usize; - let data_address = - core::ptr::read::(state.addresses[idx].add(col_offset + 4) as _) as usize - as *const u8; - let scalar = std::slice::from_raw_parts(data_address, str_len); + let scalar = if str_len <= 8 { + std::slice::from_raw_parts(state.addresses[idx].add(col_offset + 4), str_len) + } else { + let data_address = + core::ptr::read::(state.addresses[idx].add(col_offset + 4) as _) + as usize as *const u8; + + std::slice::from_raw_parts(data_address, str_len) + }; string_builder.put_slice(scalar); string_builder.commit_row(); diff --git a/src/query/expression/src/aggregate/payload_row.rs b/src/query/expression/src/aggregate/payload_row.rs index 9316022d4815..b9c2056e8bb7 100644 --- a/src/query/expression/src/aggregate/payload_row.rs +++ b/src/query/expression/src/aggregate/payload_row.rs @@ -99,13 +99,23 @@ pub unsafe fn serialize_column_to_rowformat( } Column::String(v) | Column::Bitmap(v) | Column::Variant(v) => { for index in select_vector.iter().take(rows).copied() { - let data = arena.alloc_slice_copy(v.index_unchecked(index)); - + let data = v.index_unchecked(index); store(data.len() as u32, address[index].add(offset) as *mut u8); - store( - data.as_ptr() as u64, - address[index].add(offset + 4) as *mut u8, - ); + + if data.len() <= 8 { + std::ptr::copy_nonoverlapping( + data.as_ptr(), + address[index].add(offset + 4) as *mut u8, + data.len(), + ); + } else { + let data = arena.alloc_slice_copy(v.index_unchecked(index)); + + store( + data.as_ptr() as u64, + address[index].add(offset + 4) as *mut u8, + ); + } } } Column::Timestamp(buffer) => { @@ -344,9 +354,15 @@ unsafe fn row_match_string_column( if len != value.len() { equal = false; } else { - let data_address = core::ptr::read::(address as _) as usize as *const u8; - let scalar = std::slice::from_raw_parts(data_address, len); - equal = common_hashtable::fast_memcmp(scalar, value); + if len <= 8 { + let scalar = std::slice::from_raw_parts(address, len); + equal = common_hashtable::fast_memcmp(scalar, value); + } else { + let data_address = + core::ptr::read::(address as _) as usize as *const u8; + let scalar = std::slice::from_raw_parts(data_address, len); + equal = common_hashtable::fast_memcmp(scalar, value); + } } } else { equal = is_set == 
is_set2; @@ -372,10 +388,14 @@ unsafe fn row_match_string_column( if len != value.len() { equal = false; } else { - let data_address = core::ptr::read::(address as _) as usize as *const u8; - let scalar = std::slice::from_raw_parts(data_address, len); - - equal = common_hashtable::fast_memcmp(scalar, value); + if len <= 8 { + let scalar = std::slice::from_raw_parts(address, len); + equal = common_hashtable::fast_memcmp(scalar, value); + } else { + let data_address = core::ptr::read::(address as _) as usize as *const u8; + let scalar = std::slice::from_raw_parts(data_address, len); + equal = common_hashtable::fast_memcmp(scalar, value); + } } if equal { @@ -452,7 +472,9 @@ unsafe fn row_match_column_type( } } } + if match_count > 0 { + select_vector[0..match_count].copy_from_slice(&temp_vector[0..match_count]); + } - std::mem::swap(select_vector, temp_vector); *count = match_count; } From d949963cbf167d516a7ec1be78f3a8495af10a09 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Wed, 22 Nov 2023 20:59:52 +0800 Subject: [PATCH 28/28] chore(query): update --- .../src/aggregate/aggregate_hashtable.rs | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index ba2a2d77f413..c19cdcd31548 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -451,9 +451,6 @@ impl AggregateHashTable { let mut entries = vec![0; new_capacity]; - #[cfg(all(target_arch = "x86_64", target_feature = "avx"))] - let zeros = unsafe { std::arch::x86_64::_mm256_setzero_si256() }; - // iterate over payloads and copy to new entries for payload in self.payload.payloads.iter() { for page in payload.pages.iter() { @@ -468,34 +465,12 @@ impl AggregateHashTable { unsafe { assume(hash_slot < entries.len()) }; while entries[hash_slot].is_occupied() { hash_slot += 1; - #[cfg(all(target_arch = "x86_64", target_feature = "avx"))] - { - unsafe { - while hash_slot + 4 < new_capacity { - let read = std::arch::x86_64::_mm256_loadu_si256( - entries.as_ptr().add(hash_slot) as *const _, - ); - let result = std::arch::x86_64::_mm256_cmpeq_epi64(read, zeros); - let mask = std::arch::x86_64::_mm256_movemask_epi8(result); - - // Check if the mask is zero, which indicates all values are non-zero. - if mask == 0 { - hash_slot += 4; - } else { - break; - } - } - } - } - if hash_slot >= new_capacity { hash_slot = 0; } - unsafe { assume(hash_slot < entries.len()) }; } debug_assert!(!entries[hash_slot].is_occupied()); // set value - unsafe { assume(hash_slot < entries.len()) }; entries[hash_slot].set_salt(hash.get_salt()); entries[hash_slot].set_pointer(row_ptr);
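As an aside on the short-string handling added in patch 27 above: group keys of at most 8 bytes are written directly into the 8-byte slot that otherwise holds a pointer into the arena, so short keys need no arena allocation and no pointer chase when probing. A standalone sketch of that encoding decision (simplified; the real row stores a 4-byte length followed by the 8-byte slot):

    // Inline the bytes when they fit in the pointer-sized slot, spill otherwise.
    enum ShortStr {
        Inline { len: u8, bytes: [u8; 8] },
        Spilled(Vec<u8>),
    }

    fn encode(s: &[u8]) -> ShortStr {
        if s.len() <= 8 {
            let mut bytes = [0u8; 8];
            bytes[..s.len()].copy_from_slice(s);
            ShortStr::Inline { len: s.len() as u8, bytes }
        } else {
            ShortStr::Spilled(s.to_vec())
        }
    }

    fn as_bytes(v: &ShortStr) -> &[u8] {
        match v {
            ShortStr::Inline { len, bytes } => &bytes[..*len as usize],
            ShortStr::Spilled(b) => b.as_slice(),
        }
    }

    fn main() {
        assert_eq!(as_bytes(&encode(b"abc")), b"abc");
        assert_eq!(as_bytes(&encode(b"a longer group key")), &b"a longer group key"[..]);
    }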