Merge branch 'hyperloglog' of github.com:sundy-li/fuse-query into hyperloglog
databendlabs · Feb 3, 2024 · c63480d · c63480d
2 parents 0b9e656 + a2bade3
commit c63480d
Show file tree

Hide file tree

Showing 27 changed files with 734 additions and 282 deletions.
diff --git a/.github/workflows/reuse.benchmark_query_meta.yml b/.github/workflows/reuse.benchmark_query_meta.yml
@@ -0,0 +1,103 @@
+name: Databend Suites Benchmark
+
+on:
+  workflow_call:
+    inputs:
+      sha:
+        description: Git sha of benchmark
+        required: true
+        type: string
+      run_id:
+        description: The run id of benchmark
+        required: true
+        type: string
+      source:
+        description: The source of benchmark, pr/release
+        required: true
+        type: string
+      source_id:
+        description: The source id of benchmark, pr number/release tag
+        required: true
+        type: string
+      version:
+        description: The version of databend to run
+        required: true
+        type: string
+      runner_provider:
+        description: 'Self-hosted runner provider, aws or gcp'
+        type: string
+        required: true
+
+permissions:
+  id-token: write
+  pull-requests: write
+  contents: read
+
+env:
+  BUILD_PROFILE: release
+  RUNNER_PROVIDER: ${{ inputs.runner_provider }}
+
+jobs:
+  local:
+    if: inputs.source == 'release'
+    timeout-minutes: 60
+    runs-on: [self-hosted, X64, Linux, 4c8g, aws]
+    strategy:
+      matrix:
+        dataset:
+          - internal
+      fail-fast: true
+      max-parallel: 1
+    steps:
+      - uses: actions/checkout@v4
+        if: inputs.source == 'release'
+      - uses: actions/checkout@v4
+        if: inputs.source == 'pr'
+        with:
+          ref: "refs/pull/${{ inputs.source_id }}/merge"
+      - uses: ./.github/actions/setup_bendsql
+      - name: Download artifact for pr
+        if: inputs.source == 'pr'
+        uses: ./.github/actions/artifact_download
+        with:
+          sha: ${{ inputs.sha }}
+          target: x86_64-unknown-linux-gnu
+      - name: Download artifact for release
+        if: inputs.source == 'release'
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          version=${{ inputs.source_id }}
+          target=x86_64-unknown-linux-gnu
+          mkdir -p ./distro/
+          mkdir -p ./target/release/
+          gh release download ${version} --pattern "databend-${version}-${target}.tar.gz" --dir distro/
+          tar x -C ./target/release -f ./distro/databend-${version}-${target}.tar.gz --strip-components 1 bin/
+          chmod +x ./target/release/databend-*
+      - name: Setup Databend Binary
+        shell: bash
+        run: |
+          sudo cp ./target/release/databend-* /usr/local/bin/
+          databend-query --version
+          databend-meta --version
+      - uses: ./.github/actions/benchmark_local
+        timeout-minutes: 30
+        id: benchmark_query_meta
+        with:
+          sha: ${{ inputs.sha }}
+          run_id: ${{ inputs.run_id }}
+          dataset: ${{ matrix.dataset }}
+          source: ${{ inputs.source }}
+          source_id: ${{ inputs.source_id }}
+  benchmark:
+    if: contains(github.event.pull_request.labels.*.name, 'ci-benchmark-suites')
+    needs: [ info, build, docker ]
+    uses: ./.github/workflows/reuse.benchmark_query_meta.yml
+    secrets: inherit
+    with:
+      sha: ${{ needs.info.outputs.sha }}
+      run_id: ${{ github.run_id }}
+      source: pr
+      source_id: ${{ github.event.pull_request.number }}
+      version: ${{ needs.docker.outputs.tag }}
+      runner_provider: github
diff --git a/benchmark/clickbench/README.md b/benchmark/clickbench/README.md
@@ -0,0 +1,24 @@
+# Benchmark Directory
+
+This directory contains subdirectories dedicated to various performance tests.
+
+These cover TPCH tests, Hits tests, and internal query performance tests. Below is a brief overview of each subdirectory:
+
+## 1. tpch
+
+This subdirectory includes performance evaluation tools and scripts related to TPCH tests. 
+
+TPCH tests are designed to simulate complex query scenarios to assess the system's performance when handling large datasets. In this directory, you can find testing scripts, configuration files, and documentation for test results.
+
+## 2. hits
+
+Hits tests focus on specific queries or operations for performance testing. 
+
+In this subdirectory, you'll find scripts for Hits tests, sample queries, and performance analysis tools.
+
+## 3. internal
+
+The internal subdirectory contains testing tools and scripts dedicated to ensuring the performance of internal queries. 
+
+These tests are conducted to ensure the system performs well when handling specific internal queries.
+
diff --git a/benchmark/clickbench/benchmark_local.sh b/benchmark/clickbench/benchmark_local.sh
@@ -99,6 +99,11 @@ function run_query() {
     fi
 }
 
+if [ "${BENCHMARK_DATASET}" == "internal" ]
+then
+  bash "${BENCHMARK_DATASET}"/load.sh
+fi
+
 TRIES=3
 QUERY_NUM=0
 while read -r query; do

diff --git a/benchmark/clickbench/internal/load.sh b/benchmark/clickbench/internal/load.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Set up the `internal` benchmark dataset: recreate database `test` and create 1000 tables (t_1 .. t_1000) in it.
+
+echo "drop database if exists test" | bendsql -uroot
+echo "create database test" | bendsql -uroot
+for((i=1;i<=1000;i++));
+do
+	echo "create table if not exists test.t_$i(id int comment 'tes\t', c2 string comment 'c2comment')" | bendsql -uroot
+
+done
diff --git a/benchmark/clickbench/internal/queries.sql b/benchmark/clickbench/internal/queries.sql
@@ -0,0 +1,3 @@
+-- test system table query speed
+select * from system.tables  where database='test' ignore_result;
+select * from system.columns  where database='test' ignore_result;
diff --git a/src/common/hashtable/src/hashjoin_hashtable.rs b/src/common/hashtable/src/hashjoin_hashtable.rs
@@ -209,56 +209,80 @@ where
         count
     }
 
-    // Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
-    fn early_filtering_probe(&self, hashes: &mut [u64], bitmap: Option<Bitmap>) -> usize {
+    // Perform early filtering probe, store matched indexes in `matched_selection` and store unmatched indexes
+    // in `unmatched_selection`, return the number of matched and unmatched indexes.
+    fn early_filtering_probe(
+        &self,
+        hashes: &mut [u64],
+        bitmap: Option<Bitmap>,
+        matched_selection: &mut [u32],
+        unmatched_selection: &mut [u32],
+    ) -> (usize, usize) {
         let mut valids = None;
         if let Some(bitmap) = bitmap {
             if bitmap.unset_bits() == bitmap.len() {
-                hashes.iter_mut().for_each(|hash| {
-                    *hash = 0;
-                });
-                return 0;
+                unmatched_selection
+                    .iter_mut()
+                    .enumerate()
+                    .for_each(|(idx, val)| {
+                        *val = idx as u32;
+                    });
+                return (0, hashes.len());
             } else if bitmap.unset_bits() > 0 {
                 valids = Some(bitmap);
             }
         }
-        let mut count = 0;
+        let mut matched_idx = 0;
+        let mut unmatched_idx = 0;
         match valids {
             Some(valids) => {
-                valids
-                    .iter()
-                    .zip(hashes.iter_mut())
-                    .for_each(|(valid, hash)| {
+                valids.iter().zip(hashes.iter_mut().enumerate()).for_each(
+                    |(valid, (idx, hash))| {
                         if valid {
                             let header = self.pointers[(*hash >> self.hash_shift) as usize];
                             if header != 0 && early_filtering(header, *hash) {
                                 *hash = remove_header_tag(header);
-                                count += 1;
+                                unsafe {
+                                    *matched_selection.get_unchecked_mut(matched_idx) = idx as u32
+                                };
+                                matched_idx += 1;
                             } else {
-                                *hash = 0;
+                                unsafe {
+                                    *unmatched_selection.get_unchecked_mut(unmatched_idx) =
+                                        idx as u32
+                                };
+                                unmatched_idx += 1;
                             }
                         } else {
-                            *hash = 0;
+                            unsafe {
+                                *unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32
+                            };
+                            unmatched_idx += 1;
                         }
-                    });
+                    },
+                );
             }
             None => {
-                hashes.iter_mut().for_each(|hash| {
+                hashes.iter_mut().enumerate().for_each(|(idx, hash)| {
                     let header = self.pointers[(*hash >> self.hash_shift) as usize];
                     if header != 0 && early_filtering(header, *hash) {
                         *hash = remove_header_tag(header);
-                        count += 1;
+                        unsafe { *matched_selection.get_unchecked_mut(matched_idx) = idx as u32 };
+                        matched_idx += 1;
                     } else {
-                        *hash = 0;
+                        unsafe {
+                            *unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32
+                        };
+                        unmatched_idx += 1;
                     }
                 });
             }
         }
-        count
+        (matched_idx, unmatched_idx)
     }
 
-    // Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
-    fn early_filtering_probe_with_selection(
+    // Perform early filtering probe and store matched indexes in `selection`, return the number of matched indexes.
+    fn early_filtering_matched_probe(
         &self,
         hashes: &mut [u64],
         bitmap: Option<Bitmap>,

diff --git a/src/common/hashtable/src/hashjoin_string_hashtable.rs b/src/common/hashtable/src/hashjoin_string_hashtable.rs
@@ -138,53 +138,77 @@ where A: Allocator + Clone + 'static
         count
     }
 
-    // Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
-    fn early_filtering_probe(&self, hashes: &mut [u64], bitmap: Option<Bitmap>) -> usize {
+    // Perform early filtering probe, store matched indexes in `matched_selection` and store unmatched indexes
+    // in `unmatched_selection`, return the number of matched and unmatched indexes.
+    fn early_filtering_probe(
+        &self,
+        hashes: &mut [u64],
+        bitmap: Option<Bitmap>,
+        matched_selection: &mut [u32],
+        unmatched_selection: &mut [u32],
+    ) -> (usize, usize) {
         let mut valids = None;
         if let Some(bitmap) = bitmap {
             if bitmap.unset_bits() == bitmap.len() {
-                hashes.iter_mut().for_each(|hash| {
-                    *hash = 0;
-                });
-                return 0;
+                unmatched_selection
+                    .iter_mut()
+                    .enumerate()
+                    .for_each(|(idx, val)| {
+                        *val = idx as u32;
+                    });
+                return (0, hashes.len());
             } else if bitmap.unset_bits() > 0 {
                 valids = Some(bitmap);
             }
         }
-        let mut count = 0;
+        let mut matched_idx = 0;
+        let mut unmatched_idx = 0;
         match valids {
             Some(valids) => {
                 hashes.iter_mut().enumerate().for_each(|(idx, hash)| {
                     if unsafe { valids.get_bit_unchecked(idx) } {
                         let header = self.pointers[(*hash >> self.hash_shift) as usize];
                         if header != 0 && early_filtering(header, *hash) {
                             *hash = remove_header_tag(header);
-                            count += 1;
+                            unsafe {
+                                *matched_selection.get_unchecked_mut(matched_idx) = idx as u32
+                            };
+                            matched_idx += 1;
                         } else {
-                            *hash = 0;
+                            unsafe {
+                                *unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32
+                            };
+                            unmatched_idx += 1;
                         }
                     } else {
-                        *hash = 0;
-                    };
+                        unsafe {
+                            *unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32
+                        };
+                        unmatched_idx += 1;
+                    }
                 });
             }
             None => {
-                hashes.iter_mut().for_each(|hash| {
+                hashes.iter_mut().enumerate().for_each(|(idx, hash)| {
                     let header = self.pointers[(*hash >> self.hash_shift) as usize];
                     if header != 0 && early_filtering(header, *hash) {
                         *hash = remove_header_tag(header);
-                        count += 1;
+                        unsafe { *matched_selection.get_unchecked_mut(matched_idx) = idx as u32 };
+                        matched_idx += 1;
                     } else {
-                        *hash = 0;
+                        unsafe {
+                            *unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32
+                        };
+                        unmatched_idx += 1;
                     }
                 });
             }
         }
-        count
+        (matched_idx, unmatched_idx)
     }
 
-    // Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
-    fn early_filtering_probe_with_selection(
+    // Perform early filtering probe and store matched indexes in `selection`, return the number of matched indexes.
+    fn early_filtering_matched_probe(
         &self,
         hashes: &mut [u64],
         bitmap: Option<Bitmap>,

diff --git a/src/common/hashtable/src/traits.rs b/src/common/hashtable/src/traits.rs
@@ -524,24 +524,21 @@ pub trait HashtableLike {
 pub trait HashJoinHashtableLike {
     type Key: ?Sized;
 
-    // Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
-    // same with `early_filtering_probe`, but we don't use early_filter
+    // Probe the hash table with `hashes`, converting them in-place to pointers for memory reuse.
     fn probe(&self, hashes: &mut [u64], bitmap: Option<Bitmap>) -> usize;
 
-    // Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
-    // 1. same with `early_filtering_probe_with_selection`, but we don't use selection to preserve the
-    // unfiltered indexes, we just set the filtered hashes as zero.
-    // 2. return the unfiltered counts.
-    fn early_filtering_probe(&self, hashes: &mut [u64], bitmap: Option<Bitmap>) -> usize;
-
-    // Using hashes to probe hash table and converting them in-place to pointers for memory reuse.
-    // we use `early_filtering_probe_with_selection` to do the first round probe.
-    // 1. `hashes` is the hash value of probe block's rows. we will use this one to
-    // do early filtering. if we can't early filter one row(at idx), we will assign pointer in
-    // the bucket to hashes[idx] to reuse the memory.
-    // 2. `selection` is used to preserved the indexes which can't be early_filtered.
-    // 3. return the count of preserved the indexes in `selection`
-    fn early_filtering_probe_with_selection(
+    // Perform early filtering probe, store matched indexes in `matched_selection` and store unmatched indexes
+    // in `unmatched_selection`, return the number of matched and unmatched indexes.
+    fn early_filtering_probe(
+        &self,
+        hashes: &mut [u64],
+        valids: Option<Bitmap>,
+        matched_selection: &mut [u32],
+        unmatched_selection: &mut [u32],
+    ) -> (usize, usize);
+
+    // Perform early filtering probe and store matched indexes in `selection`, return the number of matched indexes.
+    fn early_filtering_matched_probe(
         &self,
         hashes: &mut [u64],
         valids: Option<Bitmap>,