Merge branch 'main' into fix-push-down-filter
leiysky authored Oct 12, 2023
2 parents 1bbb65f + e048513 commit 7f31d57
Showing 134 changed files with 1,056 additions and 453 deletions.
20 changes: 3 additions & 17 deletions .github/actions/build_bindings_python/action.yml
@@ -24,32 +24,24 @@ runs:
id: toolchain
shell: bash
run: |
bash ./scripts/setup/dev_setup.sh -yb
RUST_TOOLCHAIN=$(awk -F'[ ="]+' '$1 == "channel" { print $2 }' rust-toolchain.toml)
echo "RUST_TOOLCHAIN=${RUST_TOOLCHAIN}" >> $GITHUB_OUTPUT
# NOTE: for exporting ACTIONS_RUNTIME_TOKEN and ACTIONS_CACHE_URL
- name: Expose GitHub Runtime
uses: crazy-max/ghaction-github-runtime@v2
if: env.RUNNER_PROVIDER == 'github'

- name: Get opts
id: opts
shell: bash
run: |
echo "DOCKER_OPTS=--env RUSTC_WRAPPER=sccache --env SCCACHE_GHA_ENABLED=true" >> $GITHUB_OUTPUT
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
if [[ -z "${{ inputs.version }}" ]]; then
echo "BUILD_ARGS=--strip --out dist" >> $GITHUB_OUTPUT
echo "BUILD_PROFILE=debug" >> $GITHUB_ENV
else
echo "BUILD_ARGS=--release --strip --out dist" >> $GITHUB_OUTPUT
echo "BUILD_PROFILE=release" >> $GITHUB_ENV
fi
- name: Cross setup for macOS
if: endsWith(inputs.target, '-darwin')
shell: bash
run: |
bash ./scripts/setup/dev_setup.sh -yb
echo "JEMALLOC_SYS_WITH_LG_PAGE=14" >> $GITHUB_ENV
echo "JEMALLOC_SYS_WITH_MALLOC_CONF=oversize_threshold:0,dirty_decay_ms:5000,muzzy_decay_ms:5000" >> $GITHUB_ENV
@@ -63,7 +55,7 @@ runs:
# Keep them in one line due to https://github.com/PyO3/maturin-action/issues/153
rustup-components: rust-std rustfmt
args: ${{ steps.opts.outputs.BUILD_ARGS }}
docker-options: ${{ steps.opts.outputs.DOCKER_OPTS }}
docker-options: --env RUSTC_WRAPPER=sccache --env SCCACHE_GCS_RW_MODE=READ_WRITE --env SCCACHE_GCS_BUCKET=databend-ci --env SCCACHE_GCS_KEY_PREFIX=cache/sccache/
before-script-linux: ../../scripts/setup/dev_setup.sh -yb

- name: Run tests
@@ -74,9 +66,3 @@ runs:
pip install dist/*.whl
pip install pytest pyarrow pandas polars
pytest -v tests/*
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: dist
path: src/bendpy/dist/*.whl
14 changes: 3 additions & 11 deletions .github/actions/setup_bendsql/action.yml
@@ -6,15 +6,7 @@ runs:
- name: Download and Install
shell: bash
run: |
version=$(gh release view --repo datafuselabs/bendsql --json name | jq -r '.name')
deb_version=${version/v/}
wget -q https://github.com/datafuselabs/bendsql/releases/download/${version}/bendsql_${deb_version}_amd64.deb
sudo dpkg -i bendsql_${deb_version}_amd64.deb
sudo curl -L -o /etc/apt/sources.list.d/datafuselabs.sources https://repo.databend.rs/deb/datafuselabs.sources
sudo apt update
sudo apt install -y bendsql
bendsql --version
# sudo curl -L -o /usr/share/keyrings/datafuselabs-keyring.gpg https://repo.databend.rs/deb/datafuselabs.gpg
# sudo curl -L -o /etc/apt/sources.list.d/datafuselabs.list https://repo.databend.rs/deb/datafuselabs.list
# sudo apt update
# sudo apt install -y bendsql
# bendsql --version
53 changes: 18 additions & 35 deletions .github/workflows/bindings.python.yml
@@ -6,6 +6,7 @@ on:
- main
paths:
- "src/**"
- ".github/workflows/bindings.python.yml"
workflow_call:
inputs:
tag:
@@ -17,44 +18,34 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
cancel-in-progress: true

env:
RUNNER_PROVIDER: github

jobs:
build_linux:
name: build-${{ matrix.target }}
runs-on: ubuntu-latest
linux:
name: ${{ matrix.target }}
runs-on: [self-hosted, X64, Linux, 8c16g, gcp]
strategy:
matrix:
target:
- x86_64-unknown-linux-gnu
steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: false
docker-images: true
swap-storage: true
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: ./.github/actions/build_bindings_python
with:
target: ${{ matrix.target }}
version: ${{ inputs.tag }}
- name: Publish to PyPI
if: inputs.tag
uses: pypa/gh-action-pypi-publish@release/v1
with:
skip-existing: true
password: ${{ secrets.PYPI_PASSWORD }}
packages-dir: src/bendpy/dist

build_macos:
name: build-${{ matrix.target }}
if: github.event_name != 'pull_request'
runs-on: macos-11
macos:
if: inputs.tag
name: ${{ matrix.target }}
runs-on: macos-latest
strategy:
matrix:
target:
@@ -68,18 +59,10 @@ jobs:
with:
target: ${{ matrix.target }}
version: ${{ inputs.tag }}

release:
# publish the release only when the version ends with 0
# if: endsWith(inputs.tag, '0')
if: inputs.tag
name: Publish to PyPI
needs: [build_linux, build_macos]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
- name: Publish to PyPI
if: inputs.tag
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.pypi_password }}
skip-existing: true
password: ${{ secrets.PYPI_PASSWORD }}
packages-dir: src/bendpy/dist
2 changes: 2 additions & 0 deletions .github/workflows/dev.yml
@@ -21,6 +21,8 @@ jobs:
any_src_changed: ${{ steps.src.outputs.any_changed }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Check Source File Changes
uses: tj-actions/changed-files@v39
id: src
3 changes: 1 addition & 2 deletions benchmark/clickbench/hits/clear.sql
@@ -1,2 +1 @@
drop table hits;
VACUUM DROP TABLE retain 0 hours;
drop table hits all;
17 changes: 8 additions & 9 deletions benchmark/clickbench/tpch/clear.sql
@@ -1,9 +1,8 @@
drop table customer;
drop table lineitem;
drop table nation;
drop table orders;
drop table partsupp;
drop table part;
drop table region;
drop table supplier;
VACUUM DROP TABLE retain 0 hours;
drop table customer all;
drop table lineitem all;
drop table nation all;
drop table orders all;
drop table partsupp all;
drop table part all;
drop table region all;
drop table supplier all;
4 changes: 1 addition & 3 deletions benchmark/tpcds/load_data.sh
@@ -36,11 +36,9 @@ tables=(
# Clear Data
for t in ${tables[@]}
do
echo "DROP TABLE IF EXISTS $t" | $MYSQL_CLIENT_CONNECT
echo "DROP TABLE IF EXISTS $t ALL" | $MYSQL_CLIENT_CONNECT
done

echo "VACUUM DROP TABLE retain 0 hours" | $MYSQL_CLIENT_CONNECT

# Create Tables;
cat "$CURDIR"/tpcds.sql | $MYSQL_CLIENT_CONNECT

1 change: 1 addition & 0 deletions docs/doc/02-enterprise/10-enterprise-features.md
@@ -6,6 +6,7 @@ This page provides an updated list of available enterprise features. To access t

| Feature | Description |
|--------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Aggregating Index](../14-sql-commands/00-ddl/103-aggregating-index/index.md) | Elevate your query speed with aggregating indexes:<br/>- Supercharge queries through precomputed and indexed aggregations.<br/>- Customize the index to meet your unique data analysis requirements. |
| [Masking Policy](../14-sql-commands/00-ddl/102-mask-policy/index.md) | Enhance your data security with role-based masking feature:<br/>- Safeguard sensitive information through customizable data masking.<br/>- Preserve data usability while reinforcing security. |
| [Vacuum Dropped Table](../14-sql-commands/00-ddl/20-table/91-vacuum-drop-table.md) | Optimize storage and data management for dropped tables:<br/>- Efficiently free up storage by removing dropped tables' data files.<br/>- Utilize the 'Retain N hours' option to specify a time window during which dropped table data files are retained for potential recovery. <br/>- Safely preview the removal of data files using the dry-run option. |
| [Vacuum Historical Data](../14-sql-commands/00-ddl/20-table/91-vacuum-table.md) | Deep clean your storage space:<br/>- Remove orphan segment and block files. <br/>- Safely preview the removal of data files using the dry-run option. |
@@ -0,0 +1,3 @@
{
"label": "Aggregating Index"
}
@@ -0,0 +1,38 @@
---
title: CREATE AGGREGATING INDEX
---

import FunctionDescription from '@site/src/components/FunctionDescription';

<FunctionDescription description="Introduced or updated: v1.2.151"/>

import EEFeature from '@site/src/components/EEFeature';

<EEFeature featureName='AGGREGATING INDEX'/>

Creates a new aggregating index in Databend.

## Syntax

```sql
CREATE AGGREGATING INDEX <index_name> AS SELECT ...
```

- When creating aggregating indexes, use only standard aggregate functions (e.g., AVG, SUM, MIN, MAX, COUNT); GROUPING SETS, window functions, LIMIT, and ORDER BY are not accepted.

- The query filter scope defined when creating an aggregating index should either match or encompass the scope of your actual queries; see the sketch after this list.

- To confirm if an aggregating index works for a query, use the [EXPLAIN](../../90-explain-cmds/explain.md) command to analyze the query.

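For instance, here is a minimal sketch of the filter-scope rule, assuming a table like the one in the example below and that a filter may appear in the index's defining SELECT (the index name and predicates are illustrative only):

```sql
-- Index whose defining query carries a filter on b
CREATE AGGREGATING INDEX idx_scope AS SELECT MIN(a) FROM agg WHERE b > 1;

-- Can be served by idx_scope: the query's filter (b > 2) falls within the index's scope (b > 1)
SELECT MIN(a) FROM agg WHERE b > 2;

-- Cannot be served by idx_scope: the query's filter (b > 0) is wider than the index's scope
SELECT MIN(a) FROM agg WHERE b > 0;
```
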
## Examples

This example creates an aggregating index named *my_agg_index* for the query "SELECT MIN(a), MAX(c) FROM agg":

```sql
-- Prepare data
CREATE TABLE agg(a int, b int, c int);
INSERT INTO agg VALUES (1,1,4), (1,2,1), (1,2,4), (2,2,5);

-- Create an aggregating index
CREATE AGGREGATING INDEX my_agg_index AS SELECT MIN(a), MAX(c) FROM agg;
```
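
Before a newly created index can serve queries with up-to-date results, refresh it and then verify the rewrite with EXPLAIN; the commands below mirror the usage example on the [AGGREGATING INDEX](index.md) overview page:

```sql
-- Refresh the index so it reflects the latest table data
REFRESH AGGREGATING INDEX my_agg_index;

-- Confirm the index is used: look for "aggregating index" in the plan
EXPLAIN SELECT MIN(a), MAX(c) FROM agg;
```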
@@ -0,0 +1,27 @@
---
title: DROP AGGREGATING INDEX
---

import FunctionDescription from '@site/src/components/FunctionDescription';

<FunctionDescription description="Introduced or updated: v1.2.151"/>

import EEFeature from '@site/src/components/EEFeature';

<EEFeature featureName='AGGREGATING INDEX'/>

Deletes an existing aggregating index. Please note that deleting an aggregating index does NOT remove the associated storage blocks. To delete the blocks as well, use the [VACUUM TABLE](../20-table/91-vacuum-table.md) command. To disable the aggregating indexing feature, set 'enable_aggregating_index_scan' to 0.
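
A minimal sketch of that cleanup path, using the table from the examples and assuming `enable_aggregating_index_scan` can be changed with a session-level SET (the VACUUM TABLE syntax is sketched from the linked page):

```sql
-- Drop the index, then reclaim its associated storage blocks (enterprise feature)
DROP AGGREGATING INDEX my_agg_index;
VACUUM TABLE agg;

-- Assumed session setting to disable aggregating index scans entirely
SET enable_aggregating_index_scan = 0;
```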

## Syntax

```sql
DROP AGGREGATING INDEX <index_name>
```

## Examples

This example deletes an aggregating index named *my_agg_index*:

```sql
DROP AGGREGATING INDEX my_agg_index;
```
101 changes: 101 additions & 0 deletions docs/doc/14-sql-commands/00-ddl/103-aggregating-index/index.md
@@ -0,0 +1,101 @@
---
title: AGGREGATING INDEX
---
import IndexOverviewList from '@site/src/components/IndexOverviewList';
import EEFeature from '@site/src/components/EEFeature';

<EEFeature featureName='AGGREGATING INDEX'/>

### Why Aggregating Index?

The primary purpose of the aggregating index is to enhance query performance, especially in scenarios involving aggregation queries such as MIN, MAX, and SUM. It achieves this by precomputing and storing query results separately in blocks, eliminating the need to scan the entire table and thereby speeding up data retrieval.

The feature also incorporates a refresh mechanism that enables you to update and save the latest query results as needed, ensuring that the query responses consistently reflect the most current data. This manual control allows you to maintain data accuracy and reliability by refreshing the results when deemed necessary.

Please note the following when creating aggregating indexes:

- When creating aggregating indexes, use only standard aggregate functions (e.g., AVG, SUM, MIN, MAX, COUNT); GROUPING SETS, window functions, LIMIT, and ORDER BY are not accepted.

- The query filter scope defined when creating aggregating indexes should either match or encompass the scope of your actual queries.

- To confirm if an aggregating index works for a query, use the [EXPLAIN](../../90-explain-cmds/explain.md) command to analyze the query.

Databend recommends refreshing an aggregating index before executing a query that relies on it to retrieve the most up-to-date data (while Databend Cloud automatically refreshes aggregating indexes for you). If you no longer need an aggregating index, consider deleting it. Please note that deleting an aggregating index does NOT remove the associated storage blocks. To delete the blocks as well, use the [VACUUM TABLE](../20-table/91-vacuum-table.md) command. To disable the aggregating indexing feature, set 'enable_aggregating_index_scan' to 0.

### Implementing Aggregating Index

Databend provides the following commands to manage aggregating indexes:

<IndexOverviewList />

### Usage Example

This example demonstrates how to use aggregating indexes and illustrates their impact on the query execution plan.

```sql
-- Prepare data
CREATE TABLE agg(a int, b int, c int);
INSERT INTO agg VALUES (1,1,4), (1,2,1), (1,2,4), (2,2,5);

-- Create an aggregating index
CREATE AGGREGATING INDEX my_agg_index AS SELECT MIN(a), MAX(c) FROM agg;

-- Refresh the aggregating index
REFRESH AGGREGATING INDEX my_agg_index;

-- Verify if the aggregating index works
EXPLAIN SELECT MIN(a), MAX(c) FROM agg;

explain |
----------------------------------------------------------------------------------------------------------------------+
AggregateFinal |
├── output columns: [MIN(a) (#8), MAX(c) (#9)] |
├── group by: [] |
├── aggregate functions: [min(a), max(c)] |
├── estimated rows: 1.00 |
└── AggregatePartial |
├── output columns: [MIN(a) (#8), MAX(c) (#9)] |
├── group by: [] |
├── aggregate functions: [min(a), max(c)] |
├── estimated rows: 1.00 |
└── TableScan |
├── table: default.default.agg |
├── output columns: [a (#5), c (#7)] |
├── read rows: 4 |
├── read bytes: 61 |
├── partitions total: 1 |
├── partitions scanned: 1 |
├── pruning stats: [segments: <range pruning: 1 to 1>, blocks: <range pruning: 1 to 1, bloom pruning: 0 to 0>]|
├── push downs: [filters: [], limit: NONE] |
├── aggregating index: [SELECT MIN(a), MAX(c) FROM default.agg] |
├── rewritten query: [selection: [index_col_0 (#0), index_col_1 (#1)]] |
└── estimated rows: 4.00 |

-- Delete the aggregating index
DROP AGGREGATING INDEX my_agg_index;

EXPLAIN SELECT MIN(a), MAX(c) FROM agg;

explain |
----------------------------------------------------------------------------------------------------------------------+
AggregateFinal |
├── output columns: [MIN(a) (#3), MAX(c) (#4)] |
├── group by: [] |
├── aggregate functions: [min(a), max(c)] |
├── estimated rows: 1.00 |
└── AggregatePartial |
├── output columns: [MIN(a) (#3), MAX(c) (#4)] |
├── group by: [] |
├── aggregate functions: [min(a), max(c)] |
├── estimated rows: 1.00 |
└── TableScan |
├── table: default.default.agg |
├── output columns: [a (#0), c (#2)] |
├── read rows: 4 |
├── read bytes: 61 |
├── partitions total: 1 |
├── partitions scanned: 1 |
├── pruning stats: [segments: <range pruning: 1 to 1>, blocks: <range pruning: 1 to 1, bloom pruning: 0 to 0>]|
├── push downs: [filters: [], limit: NONE] |
└── estimated rows: 4.00 |
```