Merge branch 'main' into fix-push-down-filter
leiysky authored Oct 12, 2023
2 parents 1bbb65f + e048513 commit 7f31d57
Showing 134 changed files with 1,056 additions and 453 deletions.
20 changes: 3 additions & 17 deletions .github/actions/build_bindings_python/action.yml
@@ -24,32 +24,24 @@ runs:
id: toolchain
shell: bash
run: |
bash ./scripts/setup/dev_setup.sh -yb
RUST_TOOLCHAIN=$(awk -F'[ ="]+' '$1 == "channel" { print $2 }' rust-toolchain.toml)
echo "RUST_TOOLCHAIN=${RUST_TOOLCHAIN}" >> $GITHUB_OUTPUT
# NOTE: for exporting ACTIONS_RUNTIME_TOKEN and ACTIONS_CACHE_URL
- name: Expose GitHub Runtime
uses: crazy-max/ghaction-github-runtime@v2
if: env.RUNNER_PROVIDER == 'github'

- name: Get opts
id: opts
shell: bash
run: |
echo "DOCKER_OPTS=--env RUSTC_WRAPPER=sccache --env SCCACHE_GHA_ENABLED=true" >> $GITHUB_OUTPUT
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
if [[ -z "${{ inputs.version }}" ]]; then
echo "BUILD_ARGS=--strip --out dist" >> $GITHUB_OUTPUT
echo "BUILD_PROFILE=debug" >> $GITHUB_ENV
else
echo "BUILD_ARGS=--release --strip --out dist" >> $GITHUB_OUTPUT
echo "BUILD_PROFILE=release" >> $GITHUB_ENV
fi
- name: Cross setup for macOS
if: endsWith(inputs.target, '-darwin')
shell: bash
run: |
bash ./scripts/setup/dev_setup.sh -yb
echo "JEMALLOC_SYS_WITH_LG_PAGE=14" >> $GITHUB_ENV
echo "JEMALLOC_SYS_WITH_MALLOC_CONF=oversize_threshold:0,dirty_decay_ms:5000,muzzy_decay_ms:5000" >> $GITHUB_ENV
@@ -63,7 +55,7 @@ runs:
# Keep them in one line due to https://github.com/PyO3/maturin-action/issues/153
rustup-components: rust-std rustfmt
args: ${{ steps.opts.outputs.BUILD_ARGS }}
docker-options: ${{ steps.opts.outputs.DOCKER_OPTS }}
docker-options: --env RUSTC_WRAPPER=sccache --env SCCACHE_GCS_RW_MODE=READ_WRITE --env SCCACHE_GCS_BUCKET=databend-ci --env SCCACHE_GCS_KEY_PREFIX=cache/sccache/
before-script-linux: ../../scripts/setup/dev_setup.sh -yb

- name: Run tests
@@ -74,9 +66,3 @@ runs:
pip install dist/*.whl
pip install pytest pyarrow pandas polars
pytest -v tests/*
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: dist
path: src/bendpy/dist/*.whl
14 changes: 3 additions & 11 deletions .github/actions/setup_bendsql/action.yml
@@ -6,15 +6,7 @@ runs:
- name: Download and Install
shell: bash
run: |
version=$(gh release view --repo datafuselabs/bendsql --json name | jq -r '.name')
deb_version=${version/v/}
wget -q https://github.com/datafuselabs/bendsql/releases/download/${version}/bendsql_${deb_version}_amd64.deb
sudo dpkg -i bendsql_${deb_version}_amd64.deb
sudo curl -L -o /etc/apt/sources.list.d/datafuselabs.sources https://repo.databend.rs/deb/datafuselabs.sources
sudo apt update
sudo apt install -y bendsql
bendsql --version
# sudo curl -L -o /usr/share/keyrings/datafuselabs-keyring.gpg https://repo.databend.rs/deb/datafuselabs.gpg
# sudo curl -L -o /etc/apt/sources.list.d/datafuselabs.list https://repo.databend.rs/deb/datafuselabs.list
# sudo apt update
# sudo apt install -y bendsql
# bendsql --version
53 changes: 18 additions & 35 deletions .github/workflows/bindings.python.yml
@@ -6,6 +6,7 @@ on:
- main
paths:
- "src/**"
- ".github/workflows/bindings.python.yml"
workflow_call:
inputs:
tag:
@@ -17,44 +18,34 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
cancel-in-progress: true

env:
RUNNER_PROVIDER: github

jobs:
build_linux:
name: build-${{ matrix.target }}
runs-on: ubuntu-latest
linux:
name: ${{ matrix.target }}
runs-on: [self-hosted, X64, Linux, 8c16g, gcp]
strategy:
matrix:
target:
- x86_64-unknown-linux-gnu
steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: false
docker-images: true
swap-storage: true
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: ./.github/actions/build_bindings_python
with:
target: ${{ matrix.target }}
version: ${{ inputs.tag }}
- name: Publish to PyPI
if: inputs.tag
uses: pypa/gh-action-pypi-publish@release/v1
with:
skip-existing: true
password: ${{ secrets.PYPI_PASSWORD }}
packages-dir: src/bendpy/dist

build_macos:
name: build-${{ matrix.target }}
if: github.event_name != 'pull_request'
runs-on: macos-11
macos:
if: inputs.tag
name: ${{ matrix.target }}
runs-on: macos-latest
strategy:
matrix:
target:
@@ -68,18 +59,10 @@ jobs:
with:
target: ${{ matrix.target }}
version: ${{ inputs.tag }}

release:
# publish the release only when the version ends with 0
# if: endsWith(inputs.tag, '0')
if: inputs.tag
name: Publish to PyPI
needs: [build_linux, build_macos]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
- name: Publish to PyPI
if: inputs.tag
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.pypi_password }}
skip-existing: true
password: ${{ secrets.PYPI_PASSWORD }}
packages-dir: src/bendpy/dist
2 changes: 2 additions & 0 deletions .github/workflows/dev.yml
@@ -21,6 +21,8 @@ jobs:
any_src_changed: ${{ steps.src.outputs.any_changed }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Check Source File Changes
uses: tj-actions/changed-files@v39
id: src
3 changes: 1 addition & 2 deletions benchmark/clickbench/hits/clear.sql
@@ -1,2 +1 @@
drop table hits;
VACUUM DROP TABLE retain 0 hours;
drop table hits all;
17 changes: 8 additions & 9 deletions benchmark/clickbench/tpch/clear.sql
@@ -1,9 +1,8 @@
drop table customer;
drop table lineitem;
drop table nation;
drop table orders;
drop table partsupp;
drop table part;
drop table region;
drop table supplier;
VACUUM DROP TABLE retain 0 hours;
drop table customer all;
drop table lineitem all;
drop table nation all;
drop table orders all;
drop table partsupp all;
drop table part all;
drop table region all;
drop table supplier all;
4 changes: 1 addition & 3 deletions benchmark/tpcds/load_data.sh
@@ -36,11 +36,9 @@ tables=(
# Clear Data
for t in ${tables[@]}
do
echo "DROP TABLE IF EXISTS $t" | $MYSQL_CLIENT_CONNECT
echo "DROP TABLE IF EXISTS $t ALL" | $MYSQL_CLIENT_CONNECT
done

echo "VACUUM DROP TABLE retain 0 hours" | $MYSQL_CLIENT_CONNECT

# Create Tables;
cat "$CURDIR"/tpcds.sql | $MYSQL_CLIENT_CONNECT

1 change: 1 addition & 0 deletions docs/doc/02-enterprise/10-enterprise-features.md
@@ -6,6 +6,7 @@ This page provides an updated list of available enterprise features. To access t

| Feature | Description |
|--------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Aggregating Index](../14-sql-commands/00-ddl/103-aggregating-index/index.md) | Elevate your query speed with aggregating indexes:<br/>- Supercharge queries through precomputed and indexed aggregations.<br/>- Customize the index to meet your unique data analysis requirements. |
| [Masking Policy](../14-sql-commands/00-ddl/102-mask-policy/index.md) | Enhance your data security with role-based masking feature:<br/>- Safeguard sensitive information through customizable data masking.<br/>- Preserve data usability while reinforcing security. |
| [Vacuum Dropped Table](../14-sql-commands/00-ddl/20-table/91-vacuum-drop-table.md) | Optimize storage and data management for dropped tables:<br/>- Efficiently free up storage by removing dropped tables' data files.<br/>- Utilize the 'Retain N hours' option to specify a time window during which dropped table data files are retained for potential recovery. <br/>- Safely preview the removal of data files using the dry-run option. |
| [Vacuum Historical Data](../14-sql-commands/00-ddl/20-table/91-vacuum-table.md) | Deep clean your storage space:<br/>- Remove orphan segment and block files. <br/>- Safely preview the removal of data files using the dry-run option. |
@@ -0,0 +1,3 @@
{
"label": "Aggregating Index"
}
@@ -0,0 +1,38 @@
---
title: CREATE AGGREGATING INDEX
---

import FunctionDescription from '@site/src/components/FunctionDescription';

<FunctionDescription description="Introduced or updated: v1.2.151"/>

import EEFeature from '@site/src/components/EEFeature';

<EEFeature featureName='AGGREGATING INDEX'/>

Creates a new aggregating index in Databend.

## Syntax

```sql
CREATE AGGREGATING INDEX <index_name> AS SELECT ...
```

- When creating aggregating indexes, use only standard aggregate functions (e.g., AVG, SUM, MIN, MAX, COUNT); GROUPING SETS, window functions, LIMIT, and ORDER BY are not accepted.

- The query filter scope defined when creating an aggregating index should either match or encompass the scope of your actual queries; see the sketch after this list.

- To confirm if an aggregating index works for a query, use the [EXPLAIN](../../90-explain-cmds/explain.md) command to analyze the query.

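For instance, here is a minimal sketch of the filter-scope rule, assuming a table like the one in the example below and that a filter may appear in the index's defining SELECT (the index name and predicates are illustrative only):

```sql
-- Index whose defining query carries a filter on b
CREATE AGGREGATING INDEX idx_scope AS SELECT MIN(a) FROM agg WHERE b > 1;

-- Can be served by idx_scope: the query's filter (b > 2) falls within the index's scope (b > 1)
SELECT MIN(a) FROM agg WHERE b > 2;

-- Cannot be served by idx_scope: the query's filter (b > 0) is wider than the index's scope
SELECT MIN(a) FROM agg WHERE b > 0;
```
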
## Examples

This example creates an aggregating index named *my_agg_index* for the query "SELECT MIN(a), MAX(c) FROM agg":

```sql
-- Prepare data
CREATE TABLE agg(a int, b int, c int);
INSERT INTO agg VALUES (1,1,4), (1,2,1), (1,2,4), (2,2,5);

-- Create an aggregating index
CREATE AGGREGATING INDEX my_agg_index AS SELECT MIN(a), MAX(c) FROM agg;
```
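
Before a newly created index can serve queries with up-to-date results, refresh it and then verify the rewrite with EXPLAIN; the commands below mirror the usage example on the [AGGREGATING INDEX](index.md) overview page:

```sql
-- Refresh the index so it reflects the latest table data
REFRESH AGGREGATING INDEX my_agg_index;

-- Confirm the index is used: look for "aggregating index" in the plan
EXPLAIN SELECT MIN(a), MAX(c) FROM agg;
```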
@@ -0,0 +1,27 @@
---
title: DROP AGGREGATING INDEX
---

import FunctionDescription from '@site/src/components/FunctionDescription';

<FunctionDescription description="Introduced or updated: v1.2.151"/>

import EEFeature from '@site/src/components/EEFeature';

<EEFeature featureName='AGGREGATING INDEX'/>

Deletes an existing aggregating index. Please note that deleting an aggregating index does NOT remove the associated storage blocks. To delete the blocks as well, use the [VACUUM TABLE](../20-table/91-vacuum-table.md) command. To disable the aggregating indexing feature, set 'enable_aggregating_index_scan' to 0.
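
A minimal sketch of that cleanup path, using the table from the examples and assuming `enable_aggregating_index_scan` can be changed with a session-level SET (the VACUUM TABLE syntax is sketched from the linked page):

```sql
-- Drop the index, then reclaim its associated storage blocks (enterprise feature)
DROP AGGREGATING INDEX my_agg_index;
VACUUM TABLE agg;

-- Assumed session setting to disable aggregating index scans entirely
SET enable_aggregating_index_scan = 0;
```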

## Syntax

```sql
DROP AGGREGATING INDEX <index_name>
```

## Examples

This example deletes an aggregating index named *my_agg_index*:

```sql
DROP AGGREGATING INDEX my_agg_index;
```
101 changes: 101 additions & 0 deletions docs/doc/14-sql-commands/00-ddl/103-aggregating-index/index.md
@@ -0,0 +1,101 @@
---
title: AGGREGATING INDEX
---
import IndexOverviewList from '@site/src/components/IndexOverviewList';
import EEFeature from '@site/src/components/EEFeature';

<EEFeature featureName='AGGREGATING INDEX'/>

### Why Aggregating Index?

The primary purpose of the aggregating index is to enhance query performance, especially in scenarios involving aggregation queries such as MIN, MAX, and SUM. It achieves this by precomputing and storing query results separately in blocks, eliminating the need to scan the entire table and thereby speeding up data retrieval.

The feature also incorporates a refresh mechanism that enables you to update and save the latest query results as needed, ensuring that the query responses consistently reflect the most current data. This manual control allows you to maintain data accuracy and reliability by refreshing the results when deemed necessary.

Please note the following when creating aggregating indexes:

- When creating aggregating indexes, use only standard aggregate functions (e.g., AVG, SUM, MIN, MAX, COUNT); GROUPING SETS, window functions, LIMIT, and ORDER BY are not accepted.

- The query filter scope defined when creating aggregating indexes should either match or encompass the scope of your actual queries.

- To confirm if an aggregating index works for a query, use the [EXPLAIN](../../90-explain-cmds/explain.md) command to analyze the query.

Databend recommends refreshing an aggregating index before executing a query that relies on it to retrieve the most up-to-date data (while Databend Cloud automatically refreshes aggregating indexes for you). If you no longer need an aggregating index, consider deleting it. Please note that deleting an aggregating index does NOT remove the associated storage blocks. To delete the blocks as well, use the [VACUUM TABLE](../20-table/91-vacuum-table.md) command. To disable the aggregating indexing feature, set 'enable_aggregating_index_scan' to 0.

### Implementing Aggregating Index

Databend provides the following commands to manage aggregating indexes:

<IndexOverviewList />

### Usage Example

This example demonstrates how to use aggregating indexes and illustrates their impact on the query execution plan.

```sql
-- Prepare data
CREATE TABLE agg(a int, b int, c int);
INSERT INTO agg VALUES (1,1,4), (1,2,1), (1,2,4), (2,2,5);

-- Create an aggregating index
CREATE AGGREGATING INDEX my_agg_index AS SELECT MIN(a), MAX(c) FROM agg;

-- Refresh the aggregating index
REFRESH AGGREGATING INDEX my_agg_index;

-- Verify if the aggregating index works
EXPLAIN SELECT MIN(a), MAX(c) FROM agg;

explain |
----------------------------------------------------------------------------------------------------------------------+
AggregateFinal |
├── output columns: [MIN(a) (#8), MAX(c) (#9)] |
├── group by: [] |
├── aggregate functions: [min(a), max(c)] |
├── estimated rows: 1.00 |
└── AggregatePartial |
├── output columns: [MIN(a) (#8), MAX(c) (#9)] |
├── group by: [] |
├── aggregate functions: [min(a), max(c)] |
├── estimated rows: 1.00 |
└── TableScan |
├── table: default.default.agg |
├── output columns: [a (#5), c (#7)] |
├── read rows: 4 |
├── read bytes: 61 |
├── partitions total: 1 |
├── partitions scanned: 1 |
├── pruning stats: [segments: <range pruning: 1 to 1>, blocks: <range pruning: 1 to 1, bloom pruning: 0 to 0>]|
├── push downs: [filters: [], limit: NONE] |
├── aggregating index: [SELECT MIN(a), MAX(c) FROM default.agg] |
├── rewritten query: [selection: [index_col_0 (#0), index_col_1 (#1)]] |
└── estimated rows: 4.00 |

-- Delete the aggregating index
DROP AGGREGATING INDEX my_agg_index;

EXPLAIN SELECT MIN(a), MAX(c) FROM agg;

explain |
----------------------------------------------------------------------------------------------------------------------+
AggregateFinal |
├── output columns: [MIN(a) (#3), MAX(c) (#4)] |
├── group by: [] |
├── aggregate functions: [min(a), max(c)] |
├── estimated rows: 1.00 |
└── AggregatePartial |
├── output columns: [MIN(a) (#3), MAX(c) (#4)] |
├── group by: [] |
├── aggregate functions: [min(a), max(c)] |
├── estimated rows: 1.00 |
└── TableScan |
├── table: default.default.agg |
├── output columns: [a (#0), c (#2)] |
├── read rows: 4 |
├── read bytes: 61 |
├── partitions total: 1 |
├── partitions scanned: 1 |
├── pruning stats: [segments: <range pruning: 1 to 1>, blocks: <range pruning: 1 to 1, bloom pruning: 0 to 0>]|
├── push downs: [filters: [], limit: NONE] |
└── estimated rows: 4.00 |
```