Skip to content

Commit

Permalink
Merge branch 'hyperloglog' of github.com:sundy-li/fuse-query into hyp…
Browse files Browse the repository at this point in the history
…erloglog
  • Loading branch information
sundy-li committed Feb 3, 2024
2 parents 0b9e656 + a2bade3 commit c63480d
Show file tree
Hide file tree
Showing 27 changed files with 734 additions and 282 deletions.
103 changes: 103 additions & 0 deletions .github/workflows/reuse.benchmark_query_meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
name: Databend Suites Benchmark

on:
workflow_call:
inputs:
sha:
description: Git sha of benchmark
required: true
type: string
run_id:
description: The run id of benchmark
required: true
type: string
source:
description: The source of benchmark, pr/release
required: true
type: string
source_id:
description: The source id of benchmark, pr number/release tag
required: true
type: string
version:
description: The version of databend to run
required: true
type: string
runner_provider:
description: 'Self-hosted runner provider, aws or gcp'
type: string
required: true

permissions:
id-token: write
pull-requests: write
contents: read

env:
BUILD_PROFILE: release
RUNNER_PROVIDER: ${{ inputs.runner_provider }}

jobs:
# Benchmark job: installs a databend binary on a self-hosted runner and runs the
# `benchmark_local` action once per dataset in the matrix (currently only `internal`).
# The job-level `if` below limits it to runs whose `source` input is 'release'.
local:
if: inputs.source == 'release'
timeout-minutes: 60
# NOTE(review): the runner labels are hard-coded to `aws`; `inputs.runner_provider` is only
# exported as the RUNNER_PROVIDER env var and does not influence runner selection here — confirm.
runs-on: [self-hosted, X64, Linux, 4c8g, aws]
strategy:
matrix:
dataset:
- internal
fail-fast: true
max-parallel: 1
steps:
- uses: actions/checkout@v4
if: inputs.source == 'release'
# NOTE(review): every step guarded by `inputs.source == 'pr'` is unreachable while the job
# itself is restricted to `inputs.source == 'release'` — either relax the job-level `if`
# or drop the pr-only steps.
- uses: actions/checkout@v4
if: inputs.source == 'pr'
with:
ref: "refs/pull/${{ inputs.source_id }}/merge"
- uses: ./.github/actions/setup_bendsql
- name: Download artifact for pr
if: inputs.source == 'pr'
uses: ./.github/actions/artifact_download
with:
sha: ${{ inputs.sha }}
target: x86_64-unknown-linux-gnu
# Release path: download the tarball for the tag given in `source_id` and unpack its
# bin/ directory into ./target/release.
- name: Download artifact for release
if: inputs.source == 'release'
env:
GH_TOKEN: ${{ github.token }}
run: |
version=${{ inputs.source_id }}
target=x86_64-unknown-linux-gnu
mkdir -p ./distro/
mkdir -p ./target/release/
gh release download ${version} --pattern "databend-${version}-${target}.tar.gz" --dir distro/
tar x -C ./target/release -f ./distro/databend-${version}-${target}.tar.gz --strip-components 1 bin/
chmod +x ./target/release/databend-*
- name: Setup Databend Binary
shell: bash
run: |
sudo cp ./target/release/databend-* /usr/local/bin/
databend-query --version
databend-meta --version
- uses: ./.github/actions/benchmark_local
timeout-minutes: 30
id: benchmark_query_meta
with:
sha: ${{ inputs.sha }}
run_id: ${{ inputs.run_id }}
dataset: ${{ matrix.dataset }}
source: ${{ inputs.source }}
source_id: ${{ inputs.source_id }}
# NOTE(review): this job appears to live inside reuse.benchmark_query_meta.yml yet calls that
# same file via `uses`, and its `needs` list names jobs (info, build, docker) that are not
# defined in this workflow. It looks like a caller-side job pasted into the reusable
# workflow — confirm, and move it to the calling (PR) workflow if so.
# NOTE(review): `runner_provider: github` is not one of the values ("aws or gcp") listed in
# the input's description — confirm the intended value.
# NOTE(review): `source: pr` would be skipped by the `local` job's `inputs.source == 'release'`
# guard, so this invocation would run no benchmark as written — confirm.
benchmark:
if: contains(github.event.pull_request.labels.*.name, 'ci-benchmark-suites')
needs: [ info, build, docker ]
uses: ./.github/workflows/reuse.benchmark_query_meta.yml
secrets: inherit
with:
sha: ${{ needs.info.outputs.sha }}
run_id: ${{ github.run_id }}
source: pr
source_id: ${{ github.event.pull_request.number }}
version: ${{ needs.docker.outputs.tag }}
runner_provider: github
24 changes: 24 additions & 0 deletions benchmark/clickbench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Benchmark Directory

This directory contains subdirectories dedicated to various performance tests, specifically TPCH tests, Hits tests, and internal query performance tests. Below is a brief overview of each subdirectory:

## 1. tpch

This subdirectory includes performance evaluation tools and scripts related to TPCH tests.

TPCH tests are designed to simulate complex query scenarios to assess the system's performance when handling large datasets. In this directory, you can find testing scripts, configuration files, and documentation for test results.

## 2. hits

Hits tests focus on specific queries or operations for performance testing.

In this subdirectory, you'll find scripts for Hits tests, sample queries, and performance analysis tools.

## 3. internal

The internal subdirectory contains testing tools and scripts dedicated to ensuring the performance of internal queries.

These tests are run to ensure the system performs well when handling internal queries, such as queries against system tables.

5 changes: 5 additions & 0 deletions benchmark/clickbench/benchmark_local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ function run_query() {
fi
}

# The "internal" dataset provides its own load.sh; run it before the query loop starts.
if [[ "${BENCHMARK_DATASET}" == "internal" ]]; then
    bash "${BENCHMARK_DATASET}/load.sh"
fi

TRIES=3
QUERY_NUM=0
while read -r query; do
Expand Down
10 changes: 10 additions & 0 deletions benchmark/clickbench/internal/load.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Fixture for the "internal" benchmark dataset: recreates the `test` database and
# fills it with 1000 two-column tables named test.t_1 .. test.t_1000.

# Send one SQL statement to bendsql (as root) on stdin.
run_sql() {
    echo "$1" | bendsql -uroot
}

run_sql "drop database if exists test"
run_sql "create database test"

for table_no in {1..1000}; do
    run_sql "create table if not exists test.t_${table_no}(id int comment 'tes\t', c2 string comment 'c2comment')"
done
3 changes: 3 additions & 0 deletions benchmark/clickbench/internal/queries.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- test system table query speed
select * from system.tables where database='test' ignore_result;
select * from system.columns where database='test' ignore_result;
66 changes: 45 additions & 21 deletions src/common/hashtable/src/hashjoin_hashtable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,56 +209,80 @@ where
count
}

// Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
fn early_filtering_probe(&self, hashes: &mut [u64], bitmap: Option<Bitmap>) -> usize {
// Perform early filtering probe, store matched indexes in `matched_selection` and store unmatched indexes
// in `unmatched_selection`, return the number of matched and unmatched indexes.
fn early_filtering_probe(
&self,
hashes: &mut [u64],
bitmap: Option<Bitmap>,
matched_selection: &mut [u32],
unmatched_selection: &mut [u32],
) -> (usize, usize) {
let mut valids = None;
if let Some(bitmap) = bitmap {
if bitmap.unset_bits() == bitmap.len() {
hashes.iter_mut().for_each(|hash| {
*hash = 0;
});
return 0;
unmatched_selection
.iter_mut()
.enumerate()
.for_each(|(idx, val)| {
*val = idx as u32;
});
return (0, hashes.len());
} else if bitmap.unset_bits() > 0 {
valids = Some(bitmap);
}
}
let mut count = 0;
let mut matched_idx = 0;
let mut unmatched_idx = 0;
match valids {
Some(valids) => {
valids
.iter()
.zip(hashes.iter_mut())
.for_each(|(valid, hash)| {
valids.iter().zip(hashes.iter_mut().enumerate()).for_each(
|(valid, (idx, hash))| {
if valid {
let header = self.pointers[(*hash >> self.hash_shift) as usize];
if header != 0 && early_filtering(header, *hash) {
*hash = remove_header_tag(header);
count += 1;
unsafe {
*matched_selection.get_unchecked_mut(matched_idx) = idx as u32
};
matched_idx += 1;
} else {
*hash = 0;
unsafe {
*unmatched_selection.get_unchecked_mut(unmatched_idx) =
idx as u32
};
unmatched_idx += 1;
}
} else {
*hash = 0;
unsafe {
*unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32
};
unmatched_idx += 1;
}
});
},
);
}
None => {
hashes.iter_mut().for_each(|hash| {
hashes.iter_mut().enumerate().for_each(|(idx, hash)| {
let header = self.pointers[(*hash >> self.hash_shift) as usize];
if header != 0 && early_filtering(header, *hash) {
*hash = remove_header_tag(header);
count += 1;
unsafe { *matched_selection.get_unchecked_mut(matched_idx) = idx as u32 };
matched_idx += 1;
} else {
*hash = 0;
unsafe {
*unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32
};
unmatched_idx += 1;
}
});
}
}
count
(matched_idx, unmatched_idx)
}

// Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
fn early_filtering_probe_with_selection(
// Perform early filtering probe and store matched indexes in `selection`, return the number of matched indexes.
fn early_filtering_matched_probe(
&self,
hashes: &mut [u64],
bitmap: Option<Bitmap>,
Expand Down
58 changes: 41 additions & 17 deletions src/common/hashtable/src/hashjoin_string_hashtable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,53 +138,77 @@ where A: Allocator + Clone + 'static
count
}

// Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
fn early_filtering_probe(&self, hashes: &mut [u64], bitmap: Option<Bitmap>) -> usize {
// Perform early filtering probe, store matched indexes in `matched_selection` and store unmatched indexes
// in `unmatched_selection`, return the number of matched and unmatched indexes.
fn early_filtering_probe(
&self,
hashes: &mut [u64],
bitmap: Option<Bitmap>,
matched_selection: &mut [u32],
unmatched_selection: &mut [u32],
) -> (usize, usize) {
let mut valids = None;
if let Some(bitmap) = bitmap {
if bitmap.unset_bits() == bitmap.len() {
hashes.iter_mut().for_each(|hash| {
*hash = 0;
});
return 0;
unmatched_selection
.iter_mut()
.enumerate()
.for_each(|(idx, val)| {
*val = idx as u32;
});
return (0, hashes.len());
} else if bitmap.unset_bits() > 0 {
valids = Some(bitmap);
}
}
let mut count = 0;
let mut matched_idx = 0;
let mut unmatched_idx = 0;
match valids {
Some(valids) => {
hashes.iter_mut().enumerate().for_each(|(idx, hash)| {
if unsafe { valids.get_bit_unchecked(idx) } {
let header = self.pointers[(*hash >> self.hash_shift) as usize];
if header != 0 && early_filtering(header, *hash) {
*hash = remove_header_tag(header);
count += 1;
unsafe {
*matched_selection.get_unchecked_mut(matched_idx) = idx as u32
};
matched_idx += 1;
} else {
*hash = 0;
unsafe {
*unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32
};
unmatched_idx += 1;
}
} else {
*hash = 0;
};
unsafe {
*unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32
};
unmatched_idx += 1;
}
});
}
None => {
hashes.iter_mut().for_each(|hash| {
hashes.iter_mut().enumerate().for_each(|(idx, hash)| {
let header = self.pointers[(*hash >> self.hash_shift) as usize];
if header != 0 && early_filtering(header, *hash) {
*hash = remove_header_tag(header);
count += 1;
unsafe { *matched_selection.get_unchecked_mut(matched_idx) = idx as u32 };
matched_idx += 1;
} else {
*hash = 0;
unsafe {
*unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32
};
unmatched_idx += 1;
}
});
}
}
count
(matched_idx, unmatched_idx)
}

// Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
fn early_filtering_probe_with_selection(
// Perform early filtering probe and store matched indexes in `selection`, return the number of matched indexes.
fn early_filtering_matched_probe(
&self,
hashes: &mut [u64],
bitmap: Option<Bitmap>,
Expand Down
29 changes: 13 additions & 16 deletions src/common/hashtable/src/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -524,24 +524,21 @@ pub trait HashtableLike {
pub trait HashJoinHashtableLike {
type Key: ?Sized;

// Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
// same with `early_filtering_probe`, but we don't use early_filter
// Probe hash table, use `hashes` to probe hash table and convert it in-place to pointers for memory reuse.
fn probe(&self, hashes: &mut [u64], bitmap: Option<Bitmap>) -> usize;

// Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
// 1. same with `early_filtering_probe_with_selection`, but we don't use selection to preserve the
// unfiltered indexes, we just set the filtered hashes as zero.
// 2. return the unfiltered counts.
fn early_filtering_probe(&self, hashes: &mut [u64], bitmap: Option<Bitmap>) -> usize;

// Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
// we use `early_filtering_probe_with_selection` to do the first round probe.
// 1. `hashes` is the hash value of probe block's rows. we will use this one to
// do early filtering. if we can't early filter one row(at idx), we will assign pointer in
// the bucket to hashes[idx] to reuse the memory.
// 2. `selection` is used to preserved the indexes which can't be early_filtered.
// 3. return the count of preserved the indexes in `selection`
fn early_filtering_probe_with_selection(
// Perform early filtering probe, store matched indexes in `matched_selection` and store unmatched indexes
// in `unmatched_selection`, return the number of matched and unmatched indexes.
fn early_filtering_probe(
&self,
hashes: &mut [u64],
valids: Option<Bitmap>,
matched_selection: &mut [u32],
unmatched_selection: &mut [u32],
) -> (usize, usize);

// Perform early filtering probe and store matched indexes in `selection`, return the number of matched indexes.
fn early_filtering_matched_probe(
&self,
hashes: &mut [u64],
valids: Option<Bitmap>,
Expand Down
Loading

0 comments on commit c63480d

Please sign in to comment.