diff --git a/.gitignore b/.gitignore index 72e3ed1487fe0..5f54a467b21b7 100644 --- a/.gitignore +++ b/.gitignore @@ -71,12 +71,12 @@ e2e_test/generated/* scale-test.tar.zst simulation-it-test.tar.zst - # hummock-trace .trace # spark binary e2e_test/iceberg/spark-*-bin* +e2e_test/iceberg/metastore_db **/poetry.lock diff --git a/.typos.toml b/.typos.toml index 4d4bbfca1c082..498d954a55d88 100644 --- a/.typos.toml +++ b/.typos.toml @@ -36,4 +36,5 @@ extend-exclude = [ # We don't want to fix "fals" here, but may want in other places. # Ideally, we should just ignore that line: https://github.com/crate-ci/typos/issues/316 "src/common/src/cast/mod.rs", + "src/tests/simulation/tests/integration_tests/scale/shared_source.rs", ] diff --git a/Cargo.lock b/Cargo.lock index c1b1ec57fdece..98d01dff60589 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6086,7 +6086,7 @@ dependencies = [ [[package]] name = "icelake" version = "0.3.141592654" -source = "git+https://github.com/risingwavelabs/icelake.git?rev=1860eb315183a5f3f72b4097c1e40d49407f8373#1860eb315183a5f3f72b4097c1e40d49407f8373" +source = "git+https://github.com/risingwavelabs/icelake.git?rev=3f4724158acee37a4785f56670a1427993a58739#3f4724158acee37a4785f56670a1427993a58739" dependencies = [ "anyhow", "apache-avro 0.17.0 (git+https://github.com/apache/avro.git)", @@ -9216,9 +9216,12 @@ version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55a6a9143ae25c25fa7b6a48d6cc08b10785372060009c25140a4e7c340e95af" dependencies = [ + "base64 0.22.0", "once_cell", "prost 0.13.1", "prost-types 0.13.1", + "serde", + "serde-value", ] [[package]] @@ -11210,6 +11213,7 @@ dependencies = [ "comfy-table", "crepe", "easy-ext", + "educe", "either", "enum-as-inner 0.6.0", "expect-test", @@ -11557,6 +11561,7 @@ dependencies = [ "madsim-etcd-client", "madsim-rdkafka", "madsim-tokio", + "maplit", "paste", "pin-project", "pretty_assertions", diff --git a/Cargo.toml b/Cargo.toml index a5da9b82b658c..46ab2695a4ebb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -135,7 +135,8 @@ tonic-build = { package = "madsim-tonic-build", version = "0.5" } otlp-embedded = { git = "https://github.com/risingwavelabs/otlp-embedded", rev = "e6cd165b9bc85783b42c106e99186b86b73e3507" } prost = { version = "0.13" } prost-build = { version = "0.13" } -icelake = { git = "https://github.com/risingwavelabs/icelake.git", rev = "1860eb315183a5f3f72b4097c1e40d49407f8373", features = [ +# branch dylan/fix_parquet_nested_type_field_id +icelake = { git = "https://github.com/risingwavelabs/icelake.git", rev = "3f4724158acee37a4785f56670a1427993a58739", features = [ "prometheus", ] } arrow-array-iceberg = { package = "arrow-array", version = "52" } diff --git a/README.md b/README.md index 7128dccede28b..4c0e043b71513 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ RisingWave is a Postgres-compatible SQL database engineered to provide the ingest millions of events per second, continuously join and analyze live data streams with historical tables, serve ad-hoc queries in real-time, and deliver fresh, consistent results wherever needed. 
-![RisingWave](./docs/dev/src/images/architecture_20240814.png) +![RisingWave](./docs/dev/src/images/architecture_20240908.png) ## Try it out in 60 seconds diff --git a/ci/build-ci-image.sh b/ci/build-ci-image.sh index 88542b4aa5f12..9d00b47bcd3aa 100755 --- a/ci/build-ci-image.sh +++ b/ci/build-ci-image.sh @@ -10,7 +10,7 @@ cat ../rust-toolchain # shellcheck disable=SC2155 # REMEMBER TO ALSO UPDATE ci/docker-compose.yml -export BUILD_ENV_VERSION=v20240812 +export BUILD_ENV_VERSION=v20240911 export BUILD_TAG="public.ecr.aws/w1p7b4n3/rw-build-env:${BUILD_ENV_VERSION}" diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml index 4b1954ff5ae2c..11d29d7236367 100644 --- a/ci/docker-compose.yml +++ b/ci/docker-compose.yml @@ -71,7 +71,7 @@ services: retries: 5 source-test-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240812 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 depends_on: - mysql - sqlserver-server @@ -85,7 +85,7 @@ services: - ..:/risingwave sink-test-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240812 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 depends_on: - mysql - db @@ -108,12 +108,12 @@ services: rw-build-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240812 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 volumes: - ..:/risingwave ci-flamegraph-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240812 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 # NOTE(kwannoel): This is used in order to permit # syscalls for `nperf` (perf_event_open), # so it can do CPU profiling. @@ -124,7 +124,7 @@ services: - ..:/risingwave regress-test-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240812 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 depends_on: db: condition: service_healthy diff --git a/ci/rust-toolchain b/ci/rust-toolchain index 6bc57a2a65d8f..158ecbbdb0dfd 100644 --- a/ci/rust-toolchain +++ b/ci/rust-toolchain @@ -4,4 +4,4 @@ # 3. (optional) **follow the instructions in lints/README.md** to update the toolchain and dependencies for lints [toolchain] -channel = "nightly-2024-06-06" +channel = "nightly-2024-07-19" diff --git a/ci/scripts/e2e-cassandra-sink-test.sh b/ci/scripts/e2e-cassandra-sink-test.sh index 0e1c9a98d49e8..b222e4a944967 100755 --- a/ci/scripts/e2e-cassandra-sink-test.sh +++ b/ci/scripts/e2e-cassandra-sink-test.sh @@ -42,8 +42,9 @@ tar xfvz cassandra_latest.tar.gz export LATEST_CASSANDRA_VERSION=$(get_latest_cassandra_version) export CASSANDRA_DIR="./apache-cassandra-${LATEST_CASSANDRA_VERSION}" # remove bundled packages, and use installed packages, because Python 3.12 has removed asyncore, but I failed to install libev support for bundled Python driver. 
-rm ${CASSANDRA_DIR}/lib/six-1.12.0-py2.py3-none-any.zip -rm ${CASSANDRA_DIR}/lib/cassandra-driver-internal-only-3.25.0.zip + +rm ${CASSANDRA_DIR}/lib/futures-2.1.6-py2.py3-none-any.zip +rm ${CASSANDRA_DIR}/lib/cassandra-driver-internal-only-3.29.0.zip apt-get install -y libev4 libev-dev pip3 install --break-system-packages cassandra-driver export CQLSH_HOST=cassandra-server diff --git a/ci/scripts/e2e-iceberg-sink-v2-test.sh b/ci/scripts/e2e-iceberg-sink-v2-test.sh index dd2f78037a5f2..1a46f30682bdd 100755 --- a/ci/scripts/e2e-iceberg-sink-v2-test.sh +++ b/ci/scripts/e2e-iceberg-sink-v2-test.sh @@ -45,6 +45,7 @@ poetry run python main.py -t ./test_case/partition_upsert.toml poetry run python main.py -t ./test_case/range_partition_append_only.toml poetry run python main.py -t ./test_case/range_partition_upsert.toml poetry run python main.py -t ./test_case/append_only_with_checkpoint_interval.toml +poetry run python main.py -t ./test_case/iceberg_select_empty_table.toml echo "--- Kill cluster" diff --git a/ci/scripts/e2e-source-test.sh b/ci/scripts/e2e-source-test.sh index 56a06ac756931..29f2a0ac7b5ce 100755 --- a/ci/scripts/e2e-source-test.sh +++ b/ci/scripts/e2e-source-test.sh @@ -130,7 +130,7 @@ echo "> inserted new rows into postgres" # start cluster w/o clean-data unset RISINGWAVE_CI -export RUST_LOG="risingwave_stream=debug,risingwave_batch=info,risingwave_storage=info" \ +export RUST_LOG="risingwave_stream=debug,risingwave_batch=info,risingwave_storage=info" risedev dev ci-1cn-1fe-with-recovery echo "> wait for cluster recovery finish" diff --git a/docker/docker-compose-distributed-etcd.yml b/docker/docker-compose-distributed-etcd.yml index 1e23484c22f72..5fbfcf11e461c 100644 --- a/docker/docker-compose-distributed-etcd.yml +++ b/docker/docker-compose-distributed-etcd.yml @@ -1,7 +1,7 @@ --- version: "3" x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: compactor-0: <<: *image diff --git a/docker/docker-compose-distributed.yml b/docker/docker-compose-distributed.yml index 8de40728fd963..6eea5a1a1fb37 100644 --- a/docker/docker-compose-distributed.yml +++ b/docker/docker-compose-distributed.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: compactor-0: <<: *image diff --git a/docker/docker-compose-etcd.yml b/docker/docker-compose-etcd.yml index ef444fa2f0d82..f44646f49768e 100644 --- a/docker/docker-compose-etcd.yml +++ b/docker/docker-compose-etcd.yml @@ -1,7 +1,7 @@ --- version: "3" x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-azblob.yml b/docker/docker-compose-with-azblob.yml index 7c6a30e1f336c..490ac3eecc07e 100644 --- a/docker/docker-compose-with-azblob.yml +++ b/docker/docker-compose-with-azblob.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-gcs.yml b/docker/docker-compose-with-gcs.yml index 9327e6b4ee8cb..9787e405df046 100644 --- a/docker/docker-compose-with-gcs.yml +++ b/docker/docker-compose-with-gcs.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: 
${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-local-fs.yml b/docker/docker-compose-with-local-fs.yml index d52a2adc911fd..d55156dd0cfd2 100644 --- a/docker/docker-compose-with-local-fs.yml +++ b/docker/docker-compose-with-local-fs.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-obs.yml b/docker/docker-compose-with-obs.yml index d6beb4f86e89e..14184a828a8a6 100644 --- a/docker/docker-compose-with-obs.yml +++ b/docker/docker-compose-with-obs.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-oss.yml b/docker/docker-compose-with-oss.yml index 74e4ec15d8f3e..04a1d05852633 100644 --- a/docker/docker-compose-with-oss.yml +++ b/docker/docker-compose-with-oss.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-s3.yml b/docker/docker-compose-with-s3.yml index c6ca1a885b448..ea9647092007a 100644 --- a/docker/docker-compose-with-s3.yml +++ b/docker/docker-compose-with-s3.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-sqlite.yml b/docker/docker-compose-with-sqlite.yml index a4b008c1374cd..0dcdb6c11a814 100644 --- a/docker/docker-compose-with-sqlite.yml +++ b/docker/docker-compose-with-sqlite.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index bce57e69147f4..781e3e9a476f0 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image @@ -60,7 +60,7 @@ services: ENABLE_TELEMETRY: ${ENABLE_TELEMETRY:-true} RW_TELEMETRY_TYPE: ${RW_TELEMETRY_TYPE:-"docker-compose"} RW_SECRET_STORE_PRIVATE_KEY_HEX: ${RW_SECRET_STORE_PRIVATE_KEY_HEX:-0123456789abcdef} - RW_LICENSE_KEY: ${RW_LICENSE_KEY:-""} + RW_LICENSE_KEY: ${RW_LICENSE_KEY:-} container_name: risingwave-standalone healthcheck: test: @@ -113,7 +113,7 @@ services: - "./grafana-risedev-datasource.yml:/etc/grafana/provisioning/datasources/grafana-risedev-datasource.yml" - "./grafana-risedev-dashboard.yml:/etc/grafana/provisioning/dashboards/grafana-risedev-dashboard.yml" - "./dashboards:/dashboards" - environment: { } + environment: {} container_name: grafana-0 healthcheck: test: @@ -187,7 +187,7 @@ services: volumes: - "prometheus-0:/prometheus" - "./prometheus.yaml:/etc/prometheus/prometheus.yml" - environment: { } + environment: {} container_name: prometheus-0 healthcheck: test: @@ -229,7 +229,7 @@ services: depends_on: [ ] volumes: - "message_queue:/var/lib/redpanda/data" - environment: { } + environment: 
{} container_name: message_queue healthcheck: test: curl -f localhost:9644/v1/status/ready diff --git a/docs/dev/src/images/architecture_20240908.png b/docs/dev/src/images/architecture_20240908.png new file mode 100644 index 0000000000000..40ba8b8174c68 Binary files /dev/null and b/docs/dev/src/images/architecture_20240908.png differ diff --git a/e2e_test/backup_restore/tpch_snapshot_create.slt b/e2e_test/backup_restore/tpch_snapshot_create.slt index c1fad2a2e0759..bb14dd369b837 100644 --- a/e2e_test/backup_restore/tpch_snapshot_create.slt +++ b/e2e_test/backup_restore/tpch_snapshot_create.slt @@ -1,5 +1,8 @@ include ../tpch/create_tables.slt.part +statement ok +CREATE SECRET secret1 WITH (backend = 'meta') AS 'demo-secret' + # First, insert the data into the tables include ../tpch/insert_customer.slt.part include ../tpch/insert_lineitem.slt.part diff --git a/e2e_test/backup_restore/tpch_snapshot_drop.slt b/e2e_test/backup_restore/tpch_snapshot_drop.slt index 0e593371347b7..27d271c35c617 100644 --- a/e2e_test/backup_restore/tpch_snapshot_drop.slt +++ b/e2e_test/backup_restore/tpch_snapshot_drop.slt @@ -1,3 +1,6 @@ +statement ok +DROP SECRET secret1; + statement ok drop materialized view tpch_q7; diff --git a/e2e_test/batch/catalog/pg_settings.slt.part b/e2e_test/batch/catalog/pg_settings.slt.part index 3482ce4850246..e05d466c3a4d6 100644 --- a/e2e_test/batch/catalog/pg_settings.slt.part +++ b/e2e_test/batch/catalog/pg_settings.slt.part @@ -22,6 +22,7 @@ user backfill_rate_limit user background_ddl user batch_enable_distributed_dml user batch_parallelism +user bypass_cluster_limits user bytea_output user cdc_source_wait_streaming_start_timeout user client_encoding diff --git a/e2e_test/commands/risectl b/e2e_test/commands/risectl new file mode 100755 index 0000000000000..2bb462d83fbab --- /dev/null +++ b/e2e_test/commands/risectl @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +RUST_LOG="error" .risingwave/bin/risingwave/risectl "$@" diff --git a/e2e_test/iceberg/main.py b/e2e_test/iceberg/main.py index 01017f3db783d..4279b899c5c1d 100644 --- a/e2e_test/iceberg/main.py +++ b/e2e_test/iceberg/main.py @@ -55,16 +55,23 @@ def execute_slt(args, slt): def verify_result(args, verify_sql, verify_schema, verify_data): tc = unittest.TestCase() - print(f"Executing sql: {verify_sql}") + + time.sleep(3) + print(f"verify_result:\nExecuting sql: {verify_sql}") spark = get_spark(args) df = spark.sql(verify_sql).collect() + print(f"Result:") + print(f"================") for row in df: print(row) + print(f"================") rows = verify_data.splitlines() - tc.assertEqual(len(df), len(rows)) + tc.assertEqual(len(df), len(rows), "row length mismatch") + tc.assertEqual(len(verify_schema), len(df[0]), "column length mismatch") for row1, row2 in zip(df, rows): print(f"Row1: {row1}, Row 2: {row2}") - row2 = row2.split(",") + # New parsing logic for row2 + row2 = parse_row(row2) for idx, ty in enumerate(verify_schema): if ty == "int" or ty == "long": tc.assertEqual(row1[idx], int(row2[idx])) @@ -89,7 +96,7 @@ def verify_result(args, verify_sql, verify_schema, verify_data): else: tc.assertEqual(row1[idx], decimal.Decimal(row2[idx])) else: - tc.fail(f"Unsupported type {ty}") + tc.assertEqual(str(row1[idx]), str(row2[idx])) def compare_sql(args, cmp_sqls): assert len(cmp_sqls) == 2 @@ -113,6 +120,32 @@ def drop_table(args, drop_sqls): spark.sql(sql) +def parse_row(row): + result = [] + current = "" + parenthesis_count = {"{": 0, "[": 0, "(": 0} + for char in row: + if char in parenthesis_count: + parenthesis_count[char] += 
1 + elif char == "}": + parenthesis_count["{"] -= 1 + elif char == "]": + parenthesis_count["["] -= 1 + elif char == ")": + parenthesis_count["("] -= 1 + + if char == "," and all(value == 0 for value in parenthesis_count.values()): + result.append(current.strip()) + current = "" + else: + current += char + + if current: + result.append(current.strip()) + + return result + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Test script for iceberg") parser.add_argument("-t", dest="test_case", type=str, help="Test case file") @@ -151,4 +184,3 @@ def drop_table(args, drop_sqls): execute_slt(config, verify_slt) if drop_sqls is not None and drop_sqls != "": drop_table(config, drop_sqls) - diff --git a/e2e_test/iceberg/start_spark_connect_server.sh b/e2e_test/iceberg/start_spark_connect_server.sh index 345653778b14c..f0f3f19a1fab7 100755 --- a/e2e_test/iceberg/start_spark_connect_server.sh +++ b/e2e_test/iceberg/start_spark_connect_server.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + set -ex ICEBERG_VERSION=1.4.3 diff --git a/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt b/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt index 0dc937303a852..b0e433c819f83 100644 --- a/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt +++ b/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt @@ -1,6 +1,3 @@ -statement ok -set sink_decouple = false; - statement ok set streaming_parallelism=4; @@ -37,7 +34,6 @@ CREATE SINK sink1 AS select * from mv1 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); statement ok @@ -54,7 +50,6 @@ CREATE SINK sink2 AS select * from mv1 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); sleep 20s diff --git a/e2e_test/iceberg/test_case/cdc/load.slt b/e2e_test/iceberg/test_case/cdc/load.slt index df0c319990374..6e6850725f98a 100644 --- a/e2e_test/iceberg/test_case/cdc/load.slt +++ b/e2e_test/iceberg/test_case/cdc/load.slt @@ -1,4 +1,6 @@ # CDC source basic test +statement ok +set sink_decouple = false; statement ok create source mysql_mydb with ( diff --git a/e2e_test/iceberg/test_case/iceberg_select_empty_table.slt b/e2e_test/iceberg/test_case/iceberg_select_empty_table.slt new file mode 100644 index 0000000000000..832a7b781f7fb --- /dev/null +++ b/e2e_test/iceberg/test_case/iceberg_select_empty_table.slt @@ -0,0 +1,60 @@ +statement ok +set sink_decouple = false; + +statement ok +set streaming_parallelism=4; + +statement ok +CREATE TABLE s1 (i1 int, i2 varchar, i3 varchar); + +statement ok +CREATE MATERIALIZED VIEW mv1 AS SELECT * FROM s1; + +statement ok +CREATE SINK sink1 AS select * from mv1 WITH ( + connector = 'iceberg', + type = 'append-only', + force_append_only = 'true', + database.name = 'demo_db', + table.name = 't1', + catalog.name = 'demo', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + commit_checkpoint_interval = 1, + create_table_if_not_exists = 'true' +); + +statement ok +CREATE SOURCE iceberg_t1_source +WITH ( + connector = 'iceberg', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + database.name 
= 'demo_db', + table.name = 't1', +); + +statement ok +flush; + +query I +select count(*) from iceberg_t1_source; +---- +0 + +statement ok +DROP SINK sink1; + +statement ok +DROP SOURCE iceberg_t1_source; + +statement ok +DROP TABLE s1 cascade; diff --git a/e2e_test/iceberg/test_case/iceberg_select_empty_table.toml b/e2e_test/iceberg/test_case/iceberg_select_empty_table.toml new file mode 100644 index 0000000000000..fa6eeff134c26 --- /dev/null +++ b/e2e_test/iceberg/test_case/iceberg_select_empty_table.toml @@ -0,0 +1,11 @@ +init_sqls = [ + 'CREATE SCHEMA IF NOT EXISTS demo_db', + 'DROP TABLE IF EXISTS demo_db.t1', +] + +slt = 'test_case/iceberg_select_empty_table.slt' + +drop_sqls = [ + 'DROP TABLE IF EXISTS demo_db.t1', + 'DROP SCHEMA IF EXISTS demo_db', +] diff --git a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt index a83173fc48ab6..d57c3096cc1ee 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt @@ -16,7 +16,10 @@ v_bool boolean, v_date date, v_timestamp timestamptz, v_ts_ntz timestamp, -v_decimal decimal +v_decimal decimal, +v_map map(int, int), +v_array int[], +v_struct struct ); statement ok @@ -36,15 +39,14 @@ CREATE SINK s6 AS select * from mv6 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); statement ok INSERT INTO t6 VALUES -(1, 1, 1000, 1.1, 1.11, '1-1', true, '2022-03-11', '2022-03-11 01:00:00Z'::timestamptz, '2022-03-11 01:00:00',1.11), -(2, 2, 2000, 2.2, 2.22, '2-2', false, '2022-03-12', '2022-03-12 02:00:00Z'::timestamptz, '2022-03-12 02:00:00',2.22), -(3, 3, 3000, 3.3, 3.33, '3-3', true, '2022-03-13', '2022-03-13 03:00:00Z'::timestamptz, '2022-03-13 03:00:00','inf'), -(4, 4, 4000, 4.4, 4.44, '4-4', false, '2022-03-14', '2022-03-14 04:00:00Z'::timestamptz, '2022-03-14 04:00:00','-inf'); +(1, 1, 1000, 1.1, 1.11, '1-1', true, '2022-03-11', '2022-03-11 01:00:00Z'::timestamptz, '2022-03-11 01:00:00',1.11, map {1:100,2:200}, array[1,2,3], row(1,2)), +(2, 2, 2000, 2.2, 2.22, '2-2', false, '2022-03-12', '2022-03-12 02:00:00Z'::timestamptz, '2022-03-12 02:00:00',2.22, map {3:300}, array[1,null,3], row(3,null)), +(3, 3, 3000, 3.3, 3.33, '3-3', true, '2022-03-13', '2022-03-13 03:00:00Z'::timestamptz, '2022-03-13 03:00:00','inf', null, null, null), +(4, 4, 4000, 4.4, 4.44, '4-4', false, '2022-03-14', '2022-03-14 04:00:00Z'::timestamptz, '2022-03-14 04:00:00','-inf', null, null, null); statement ok FLUSH; @@ -53,7 +55,7 @@ sleep 5s statement ok INSERT INTO t6 VALUES -(5, 5, 5000, 5.5, 5.55, '5-5', true, '2022-03-15', '2022-03-15 05:00:00Z'::timestamptz, '2022-03-15 05:00:00','nan'); +(5, 5, 5000, 5.5, 5.55, '5-5', true, '2022-03-15', '2022-03-15 05:00:00Z'::timestamptz, '2022-03-15 05:00:00','nan', null, null, null); statement ok FLUSH; diff --git a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt index de96205a2debf..73d953bc2937a 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt @@ -25,7 +25,6 @@ CREATE SINK s6 AS select mv6.id as id, mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', primary_key = 'v1', - 
commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt index 72f0bce46d183..3a27df42903ee 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt @@ -36,7 +36,6 @@ CREATE SINK s6 AS select * from mv6 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt index 2b213a77175bd..39f170a834382 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt @@ -25,7 +25,6 @@ CREATE SINK s6 AS select mv6.id as id, mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', primary_key = 'v1', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt index 46670ac362599..f0cf9f5fa3133 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt @@ -36,7 +36,6 @@ CREATE SINK s6 AS select * from mv6 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt index 5637ce34c940f..f43e2788a020a 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt @@ -25,7 +25,6 @@ CREATE SINK s6 AS select mv6.id as id, mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', primary_key = 'v1', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/no_partition_append_only.toml b/e2e_test/iceberg/test_case/no_partition_append_only.toml index 7d2952c508756..9d49b7a29d17f 100644 --- a/e2e_test/iceberg/test_case/no_partition_append_only.toml +++ b/e2e_test/iceberg/test_case/no_partition_append_only.toml @@ -13,24 +13,27 @@ init_sqls = [ v_date date, v_timestamp timestamp, v_ts_ntz timestamp_ntz, - v_decimal decimal(10,5) + v_decimal decimal(10,5), + v_map map, + v_array array, + v_struct struct ) USING iceberg TBLPROPERTIES ('format-version'='2'); ''' ] slt = 'test_case/iceberg_sink_no_partition_append_only_table.slt' -verify_schema = ['long', 'int', 'long', 'float', 'double', 'string', 'boolean', 'date', 'timestamp', 'timestamp_ntz','decimal'] +verify_schema = ['long', 'int', 'long', 'float', 'double', 'string', 'boolean', 'date', 'timestamp', 'timestamp_ntz','decimal', 'map', 'array', 'struct'] verify_sql = 'SELECT * FROM demo_db.no_partition_append_only_table ORDER BY id ASC' verify_data = """ -1,1,1000,1.1,1.11,1-1,true,2022-03-11,2022-03-11 01:00:00+00:00,2022-03-11 01:00:00,1.11 -2,2,2000,2.2,2.22,2-2,false,2022-03-12,2022-03-12 02:00:00+00:00,2022-03-12 02:00:00,2.22 
-3,3,3000,3.3,3.33,3-3,true,2022-03-13,2022-03-13 03:00:00+00:00,2022-03-13 03:00:00,99999.99999 -4,4,4000,4.4,4.44,4-4,false,2022-03-14,2022-03-14 04:00:00+00:00,2022-03-14 04:00:00,-99999.99999 -5,5,5000,5.5,5.55,5-5,true,2022-03-15,2022-03-15 05:00:00+00:00,2022-03-15 05:00:00,none +1,1,1000,1.1,1.11,1-1,true,2022-03-11,2022-03-11 01:00:00+00:00,2022-03-11 01:00:00,1.11,{1: 100, 2: 200},[1, 2, 3],Row(a=1, b=2) +2,2,2000,2.2,2.22,2-2,false,2022-03-12,2022-03-12 02:00:00+00:00,2022-03-12 02:00:00,2.22,{3: 300},[1, None, 3],Row(a=3, b=None) +3,3,3000,3.3,3.33,3-3,true,2022-03-13,2022-03-13 03:00:00+00:00,2022-03-13 03:00:00,99999.99999,None,None,None +4,4,4000,4.4,4.44,4-4,false,2022-03-14,2022-03-14 04:00:00+00:00,2022-03-14 04:00:00,-99999.99999,None,None,None +5,5,5000,5.5,5.55,5-5,true,2022-03-15,2022-03-15 05:00:00+00:00,2022-03-15 05:00:00,none,None,None,None """ verify_slt = 'test_case/iceberg_sink_no_partition_append_only_table_verify.slt' diff --git a/e2e_test/iceberg/test_case/no_partition_upsert.toml b/e2e_test/iceberg/test_case/no_partition_upsert.toml index 24444e025f6fe..0c5d63e88216e 100644 --- a/e2e_test/iceberg/test_case/no_partition_upsert.toml +++ b/e2e_test/iceberg/test_case/no_partition_upsert.toml @@ -15,7 +15,7 @@ init_sqls = [ slt = 'test_case/iceberg_sink_no_partition_upsert_table.slt' -verify_schema = ['int','int','long','string'] +verify_schema = ['int','int','long','string','date'] verify_sql = 'SELECT * FROM demo_db.no_partition_upsert_table ORDER BY id, v1 ASC' diff --git a/e2e_test/iceberg/test_case/partition_upsert.toml b/e2e_test/iceberg/test_case/partition_upsert.toml index 38e6455fa9b0a..52cb1c40ea344 100644 --- a/e2e_test/iceberg/test_case/partition_upsert.toml +++ b/e2e_test/iceberg/test_case/partition_upsert.toml @@ -16,7 +16,7 @@ init_sqls = [ slt = 'test_case/iceberg_sink_partition_upsert_table.slt' -verify_schema = ['int','int','long','string'] +verify_schema = ['int','int','long','string', 'date'] verify_sql = 'SELECT * FROM demo_db.partition_upsert_table ORDER BY id, v1 ASC' diff --git a/e2e_test/iceberg/test_case/range_partition_upsert.toml b/e2e_test/iceberg/test_case/range_partition_upsert.toml index 0e63c4218eadc..ceea071d9c8a2 100644 --- a/e2e_test/iceberg/test_case/range_partition_upsert.toml +++ b/e2e_test/iceberg/test_case/range_partition_upsert.toml @@ -16,7 +16,7 @@ init_sqls = [ slt = 'test_case/iceberg_sink_range_partition_upsert_table.slt' -verify_schema = ['int','int','long','string'] +verify_schema = ['int','int','long','string','date'] verify_sql = 'SELECT * FROM demo_db.range_partition_upsert_table ORDER BY id, v1 ASC' diff --git a/e2e_test/s3/fs_parquet_source_and_sink.py b/e2e_test/s3/fs_parquet_source_and_sink.py index 033cb73ffbe70..6425ef1d3a9d6 100644 --- a/e2e_test/s3/fs_parquet_source_and_sink.py +++ b/e2e_test/s3/fs_parquet_source_and_sink.py @@ -116,6 +116,7 @@ def _table(): return 's3_test_parquet' # Execute a SELECT statement + cur.execute(f'''set sink_decouple = false;''') cur.execute(f'''CREATE sink test_file_sink as select id, name, diff --git a/e2e_test/sink/clickhouse_sink.slt b/e2e_test/sink/clickhouse_sink.slt index e037618bb460e..e5bac0d8d521d 100644 --- a/e2e_test/sink/clickhouse_sink.slt +++ b/e2e_test/sink/clickhouse_sink.slt @@ -17,7 +17,6 @@ CREATE SINK s6 AS select mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, mv6.v4 as v4, clickhouse.password = '', clickhouse.database = 'default', clickhouse.table='demo_test', - commit_checkpoint_interval = 1, ); statement ok diff --git a/e2e_test/sink/create_sink_as.slt 
b/e2e_test/sink/create_sink_as.slt index 5c66c5623553e..dc6d0f61419c6 100644 --- a/e2e_test/sink/create_sink_as.slt +++ b/e2e_test/sink/create_sink_as.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t4 (v1 int primary key, v2 int); diff --git a/e2e_test/sink/deltalake_rust_sink.slt b/e2e_test/sink/deltalake_rust_sink.slt index 74dca623a9d0a..cb9f9e7817212 100644 --- a/e2e_test/sink/deltalake_rust_sink.slt +++ b/e2e_test/sink/deltalake_rust_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamptz, v9 boolean, v10 decimal, v11 decimal[]); diff --git a/e2e_test/sink/doris_sink.slt b/e2e_test/sink/doris_sink.slt index 3242206badaea..3e6a4aca9d9f6 100644 --- a/e2e_test/sink/doris_sink.slt +++ b/e2e_test/sink/doris_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamp, v9 boolean, v10 jsonb); diff --git a/e2e_test/sink/iceberg_sink.slt b/e2e_test/sink/iceberg_sink.slt index e3917908f651b..b08abd8a4918c 100644 --- a/e2e_test/sink/iceberg_sink.slt +++ b/e2e_test/sink/iceberg_sink.slt @@ -31,7 +31,6 @@ CREATE SINK s6 AS select mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3 from mv6 WITH catalog.type = 'storage', database.name='demo_db', table.name='e2e_demo_table', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/sink/mongodb_sink.slt b/e2e_test/sink/mongodb_sink.slt index 2122993e3003a..ddc5a91a20c3f 100644 --- a/e2e_test/sink/mongodb_sink.slt +++ b/e2e_test/sink/mongodb_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok create table t1( a smallint, diff --git a/e2e_test/sink/redis_cluster_sink.slt b/e2e_test/sink/redis_cluster_sink.slt index 03d197485777a..3effd7795d039 100644 --- a/e2e_test/sink/redis_cluster_sink.slt +++ b/e2e_test/sink/redis_cluster_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 int); diff --git a/e2e_test/sink/redis_sink.slt b/e2e_test/sink/redis_sink.slt index 7475a80ae696e..8828c22b80d27 100644 --- a/e2e_test/sink/redis_sink.slt +++ b/e2e_test/sink/redis_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamptz, v9 boolean); diff --git a/e2e_test/sink/remote/types.slt b/e2e_test/sink/remote/types.slt index f2421eabec906..e511d5e6a6ee7 100644 --- a/e2e_test/sink/remote/types.slt +++ b/e2e_test/sink/remote/types.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok create table t5 (v1 smallint primary key, v2 int, v3 bigint, v4 float, v5 double, v6 decimal, v7 varchar, v8 timestamp, v9 boolean); diff --git a/e2e_test/sink/sqlserver_sink.slt b/e2e_test/sink/sqlserver_sink.slt index 156b8b865ffc8..08bbd3364ed9a 100644 --- a/e2e_test/sink/sqlserver_sink.slt +++ b/e2e_test/sink/sqlserver_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok create table t_many_data_type_rw ( k1 int, k2 int, diff --git a/e2e_test/sink/starrocks_sink.slt b/e2e_test/sink/starrocks_sink.slt index dedb01755cbbe..0aceac592618a 100644 --- a/e2e_test/sink/starrocks_sink.slt +++ b/e2e_test/sink/starrocks_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + 
statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamp, v9 boolean, v10 jsonb, v11 decimal); diff --git a/e2e_test/source/opendal/posix_fs.slt b/e2e_test/source/opendal/posix_fs.slt index 3fc572a1a1cc8..1bf026aed2744 100644 --- a/e2e_test/source/opendal/posix_fs.slt +++ b/e2e_test/source/opendal/posix_fs.slt @@ -2,21 +2,22 @@ statement ok SET RW_IMPLICIT_FLUSH TO true; statement ok -CREATE TABLE diamonds ( +CREATE TABLE diamonds_recursive_read ( carat FLOAT, cut TEXT, color TEXT, depth FLOAT, ) WITH ( - connector = 'posix_fs', - match_pattern = 'data*.csv', - posix_fs.root = 'e2e_test/source/opendal/data', + connector = 'posix_fs', + match_pattern = 'data*.csv', + posix_fs.root = 'e2e_test/source/opendal/data', + recursive_scan = 'true', ) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ','); sleep 10s query TTTT rowsort -select * from diamonds; +select * from diamonds_recursive_read; ---- 0.22 Premium I 62 0.23 Very Good H 57.5 @@ -29,5 +30,26 @@ select * from diamonds; 1.28 Good J 63.1 1.3 Fair E 64.7 +statement ok +CREATE TABLE diamonds ( + carat FLOAT, + cut TEXT, + color TEXT, + depth FLOAT, +) WITH ( + connector = 'posix_fs', + match_pattern = 'data*.csv', + posix_fs.root = 'e2e_test/source/opendal', +) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ','); + +sleep 10s + +query TTTT rowsort +select * from diamonds; +---- + statement ok DROP TABLE diamonds; + +statement ok +DROP TABLE diamonds_recursive_read; diff --git a/e2e_test/source_inline/kafka/protobuf/recover.slt b/e2e_test/source_inline/kafka/protobuf/recover.slt new file mode 100644 index 0000000000000..3babf26793f2a --- /dev/null +++ b/e2e_test/source_inline/kafka/protobuf/recover.slt @@ -0,0 +1,97 @@ +control substitution on + +system ok +rpk topic create 'test-pb-struct' + + +system ok +jq -sR '{"schema":.,"schemaType":"PROTOBUF"}' << EOF | curl -X POST -H 'content-type: application/json' -d @- "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value/versions" +syntax = "proto3"; +package test; +message User { + int32 id = 1; + Name name = 2; +} +message Name { + string first_name = 1; + string last_name = 2; +} +EOF + + +# create a source with v1 schema +statement ok +create source s with ( + ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, + topic = 'test-pb-struct') +format plain encode protobuf ( + schema.registry = '${RISEDEV_SCHEMA_REGISTRY_URL}', + message = 'test.User'); + + +# register a v2 schema +system ok +jq -sR '{"schema":.,"schemaType":"PROTOBUF"}' << EOF | curl -X POST -H 'content-type: application/json' -d @- "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value/versions" +syntax = "proto3"; +package test; +message User { + int32 id = 1; + Name name = 2; +} +message Name { + string first_name = 1; + string last_name = 2; + string middle_name = 3; +} +EOF + + +# trigger recovery +statement ok +recover; + + +sleep 2s + + +# produce a v2 message +statement ok +create sink sk as select + 1 as id, + row('Alan', 'Turing', 'Mathison')::struct as name +with ( + ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, + topic = 'test-pb-struct') +format plain encode protobuf ( + schema.registry = '${RISEDEV_SCHEMA_REGISTRY_URL}', + message = 'test.User'); + + +sleep 1s + + +# reading as v1 shall not panic +query IT +select * from s; +---- +1 (Alan,Turing) + + +statement ok +drop sink sk; + + +statement ok +drop source s; + + +system ok +curl -X DELETE "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value" + + +system ok 
+curl -X DELETE "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value?permanent=true" + + +system ok +rpk topic delete 'test-pb-struct' diff --git a/e2e_test/source_inline/kafka/shared_source.slt b/e2e_test/source_inline/kafka/shared_source.slt index 51a9f1e5ee1b3..5d1072df2cfaa 100644 --- a/e2e_test/source_inline/kafka/shared_source.slt +++ b/e2e_test/source_inline/kafka/shared_source.slt @@ -230,6 +230,87 @@ internal_table.mjs --name s0 --type source 3,"{""split_info"": {""partition"": 3, ""start_offset"": 11, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" +# # Note: the parallelism depends on the risedev profile. +# # So scale tests below are commented out. + +# query ??? +# select name, flags, parallelism from rw_fragments JOIN rw_relations ON rw_fragments.table_id = rw_relations.id order by name; +# ---- +# mv_1 {MVIEW,SOURCE_SCAN} 5 +# mv_2 {MVIEW,SOURCE_SCAN} 5 +# s0 {SOURCE} 5 + + +# system ok +# risectl meta source-split-info --ignore-id +# ---- +# Table +# Fragment (Source) +# Actor (1 splits): [0] +# Actor (1 splits): [2] +# Actor (1 splits): [3] +# Actor (1 splits): [1] +# Actor (0 splits): [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] + + +# # scale down +# statement ok +# ALTER MATERIALIZED VIEW mv_1 SET PARALLELISM TO 2; + +# # should have no effect, because of NoShuffle +# # TODO: support ALTER SOURCE SET PARALLELISM, then we can +# query ??? 
+# select name, flags, parallelism from rw_fragments JOIN rw_relations ON rw_fragments.table_id = rw_relations.id order by name; +# ---- +# mv_1 {MVIEW,SOURCE_SCAN} 5 +# mv_2 {MVIEW,SOURCE_SCAN} 5 +# s0 {SOURCE} 5 + +# system ok +# risectl meta source-split-info --ignore-id +# ---- +# Table +# Fragment (Source) +# Actor (1 splits): [0] +# Actor (1 splits): [2] +# Actor (1 splits): [3] +# Actor (1 splits): [1] +# Actor (0 splits): [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] + + +# # Manual test: change the parallelism of the compute node, kill and restart, and check +# # risedev ctl meta source-split-info --ignore-id +# # risedev psql -c "select name, flags, parallelism from rw_fragments JOIN rw_relations ON rw_fragments.table_id = rw_relations.id order by name;" + + statement ok drop source s0 cascade; diff --git a/e2e_test/time_travel/syntax.slt b/e2e_test/time_travel/syntax.slt index 6c3408a276763..5895f6d9b9e8b 100644 --- a/e2e_test/time_travel/syntax.slt +++ b/e2e_test/time_travel/syntax.slt @@ -7,6 +7,10 @@ SET QUERY_MODE TO local; statement ok CREATE TABLE t (k INT); +query I +SELECT * FROM t; +---- + query error SELECT * FROM t FOR SYSTEM_TIME AS OF 963716300; ---- diff --git a/integration_tests/big-query-sink/create_sink.sql b/integration_tests/big-query-sink/create_sink.sql index a41fe0243120d..01fb5e340d545 100644 --- a/integration_tests/big-query-sink/create_sink.sql +++ b/integration_tests/big-query-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + -- create sink with local file CREATE SINK bhv_big_query_sink FROM diff --git a/integration_tests/cassandra-and-scylladb-sink/create_sink.sql b/integration_tests/cassandra-and-scylladb-sink/create_sink.sql index a0a305aebd0e0..fdda994d01427 100644 --- a/integration_tests/cassandra-and-scylladb-sink/create_sink.sql +++ b/integration_tests/cassandra-and-scylladb-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_cassandra_sink FROM bhv_mv WITH ( diff --git a/integration_tests/clickhouse-sink/create_sink.sql b/integration_tests/clickhouse-sink/create_sink.sql index 5f730ed6ff910..b913a246b286e 100644 --- a/integration_tests/clickhouse-sink/create_sink.sql +++ b/integration_tests/clickhouse-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_clickhouse_sink FROM bhv_mv WITH ( diff --git a/integration_tests/deltalake-sink/create_sink.sql b/integration_tests/deltalake-sink/create_sink.sql index f42b09d726e56..17c1c44aea255 100644 --- a/integration_tests/deltalake-sink/create_sink.sql +++ b/integration_tests/deltalake-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + create sink delta_lake_sink from source with ( connector = 'deltalake', diff --git a/integration_tests/doris-sink/create_sink.sql b/integration_tests/doris-sink/create_sink.sql index d4702219fed09..d6b28148c083d 100644 --- a/integration_tests/doris-sink/create_sink.sql +++ b/integration_tests/doris-sink/create_sink.sql @@ -1,3 
+1,5 @@ +set sink_decouple = false; + create secret doris_secret with (backend = 'meta') as '123456'; CREATE SINK bhv_doris_sink diff --git a/integration_tests/dynamodb/create_sink.sql b/integration_tests/dynamodb/create_sink.sql index 6de71404a9da1..43cb2be6d1447 100644 --- a/integration_tests/dynamodb/create_sink.sql +++ b/integration_tests/dynamodb/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK dyn_sink FROM movies diff --git a/integration_tests/elasticsearch-sink/create_sink.sql b/integration_tests/elasticsearch-sink/create_sink.sql index 07046507d117d..f72f8f0e6ec3b 100644 --- a/integration_tests/elasticsearch-sink/create_sink.sql +++ b/integration_tests/elasticsearch-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_es7_sink FROM bhv_mv WITH ( diff --git a/integration_tests/kafka-cdc-sink/create_sink.sql b/integration_tests/kafka-cdc-sink/create_sink.sql index 349aac0ca9b0a..0c25553adebba 100644 --- a/integration_tests/kafka-cdc-sink/create_sink.sql +++ b/integration_tests/kafka-cdc-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK IF NOT EXISTS counts_sink FROM counts WITH ( diff --git a/integration_tests/mqtt/create_sink.sql b/integration_tests/mqtt/create_sink.sql index 69b6886943944..27b84aa354250 100644 --- a/integration_tests/mqtt/create_sink.sql +++ b/integration_tests/mqtt/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK mqtt_sink FROM personnel diff --git a/integration_tests/mysql-sink/create_sink.sql b/integration_tests/mysql-sink/create_sink.sql index 9776360df2914..f73b92e8ce259 100644 --- a/integration_tests/mysql-sink/create_sink.sql +++ b/integration_tests/mysql-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK target_count_mysql_sink FROM target_count WITH ( diff --git a/integration_tests/nats/create_sink.sql b/integration_tests/nats/create_sink.sql index beee01afcecfb..fda1ab1c77621 100644 --- a/integration_tests/nats/create_sink.sql +++ b/integration_tests/nats/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE TABLE personnel (id integer, name varchar); diff --git a/integration_tests/postgres-sink/create_sink.sql b/integration_tests/postgres-sink/create_sink.sql index 5041f1a36b741..ec76f16ac3037 100644 --- a/integration_tests/postgres-sink/create_sink.sql +++ b/integration_tests/postgres-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK target_count_postgres_sink FROM target_count WITH ( diff --git a/integration_tests/redis-sink/create_sink.sql b/integration_tests/redis-sink/create_sink.sql index 61ffb67326227..f88a68aca2110 100644 --- a/integration_tests/redis-sink/create_sink.sql +++ b/integration_tests/redis-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_redis_sink_1 FROM bhv_mv WITH ( diff --git a/integration_tests/starrocks-sink/create_sink.sql b/integration_tests/starrocks-sink/create_sink.sql index 8d7ebf98dfb20..7cfe69ef21973 100644 --- a/integration_tests/starrocks-sink/create_sink.sql +++ b/integration_tests/starrocks-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + create secret starrocks_secret with (backend = 'meta') as '123456'; CREATE SINK bhv_starrocks_sink_primary diff --git a/integration_tests/twitter-pulsar/pb/create_source.sql b/integration_tests/twitter-pulsar/pb/create_source.sql index bf41939b40d91..22c4927ab3bb9 100644 --- a/integration_tests/twitter-pulsar/pb/create_source.sql +++ 
b/integration_tests/twitter-pulsar/pb/create_source.sql @@ -1,5 +1,6 @@ CREATE SOURCE twitter WITH ( connector = 'pulsar', pulsar.topic = 'twitter', - pulsar.service.url = 'pulsar://message_queue:6650' + pulsar.service.url = 'pulsar://message_queue:6650', + subscription.name.prefix = 'custom_prefix' ) ROW FORMAT PROTOBUF MESSAGE 'twitter.schema.Event' ROW SCHEMA LOCATION 'http://file_server:8080/schema'; \ No newline at end of file diff --git a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java index 10aa371c50aec..02297a4ea57dd 100644 --- a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java +++ b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java @@ -71,12 +71,13 @@ public JDBCSink(JDBCSinkConfig config, TableSchema tableSchema) { .collect(Collectors.toList()); LOG.info( - "schema = {}, table = {}, tableSchema = {}, columnSqlTypes = {}, pkIndices = {}", + "schema = {}, table = {}, tableSchema = {}, columnSqlTypes = {}, pkIndices = {}, queryTimeout = {}", config.getSchemaName(), config.getTableName(), tableSchema, columnSqlTypes, - pkIndices); + pkIndices, + config.getQueryTimeout()); if (factory.isPresent()) { this.jdbcDialect = factory.get().create(columnSqlTypes, pkIndices); @@ -92,7 +93,7 @@ public JDBCSink(JDBCSinkConfig config, TableSchema tableSchema) { // Commit the `getTransactionIsolation` conn.commit(); - jdbcStatements = new JdbcStatements(conn); + jdbcStatements = new JdbcStatements(conn, config.getQueryTimeout()); } catch (SQLException e) { throw Status.INTERNAL .withDescription( @@ -173,7 +174,7 @@ public boolean write(Iterable rows) { conn = JdbcUtils.getConnection(config.getJdbcUrl()); // reset the flag since we will retry to prepare the batch again updateFlag = false; - jdbcStatements = new JdbcStatements(conn); + jdbcStatements = new JdbcStatements(conn, config.getQueryTimeout()); } else { throw io.grpc.Status.INTERNAL .withDescription( @@ -206,13 +207,15 @@ public boolean write(Iterable rows) { * across multiple batches if only the JDBC connection is valid. 
*/ class JdbcStatements implements AutoCloseable { + private final int queryTimeoutSecs; private PreparedStatement deleteStatement; private PreparedStatement upsertStatement; private PreparedStatement insertStatement; private final Connection conn; - public JdbcStatements(Connection conn) throws SQLException { + public JdbcStatements(Connection conn, int queryTimeoutSecs) throws SQLException { + this.queryTimeoutSecs = queryTimeoutSecs; this.conn = conn; var schemaTableName = jdbcDialect.createSchemaTableName( @@ -339,6 +342,9 @@ private void executeStatement(PreparedStatement stmt) throws SQLException { if (stmt == null) { return; } + // if timeout occurs, a SQLTimeoutException will be thrown + // and we will retry to write the stream chunk in `JDBCSink.write` + stmt.setQueryTimeout(queryTimeoutSecs); LOG.debug("Executing statement: {}", stmt); stmt.executeBatch(); stmt.clearParameters(); diff --git a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java index ca74ac6a8eb74..94eb5cdc7e0ff 100644 --- a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java +++ b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java @@ -32,6 +32,9 @@ public class JDBCSinkConfig extends CommonSinkConfig { @JsonProperty(value = "schema.name") private String schemaName; + @JsonProperty(value = "jdbc.query.timeout") + private int queryTimeoutSeconds = 600; + @JsonCreator public JDBCSinkConfig( @JsonProperty(value = "jdbc.url") String jdbcUrl, @@ -62,4 +65,8 @@ public String getSinkType() { public boolean isUpsertSink() { return this.isUpsertSink; } + + public int getQueryTimeout() { + return queryTimeoutSeconds; + } } diff --git a/lints/Cargo.lock b/lints/Cargo.lock index e3b748e6da670..aa1e1e4ef9b32 100644 --- a/lints/Cargo.lock +++ b/lints/Cargo.lock @@ -162,7 +162,8 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clippy_config" -version = "0.1.80" +version = "0.1.81" +source = "git+https://github.com/risingwavelabs/clippy?rev=5135d0218365e85f3371405b604a7fb1459eb256#5135d0218365e85f3371405b604a7fb1459eb256" dependencies = [ "rustc-semver", "serde", @@ -171,12 +172,14 @@ dependencies = [ [[package]] name = "clippy_utils" -version = "0.1.80" +version = "0.1.81" +source = "git+https://github.com/risingwavelabs/clippy?rev=5135d0218365e85f3371405b604a7fb1459eb256#5135d0218365e85f3371405b604a7fb1459eb256" dependencies = [ "arrayvec", "clippy_config", "itertools", "rustc-semver", + "rustc_apfloat", ] [[package]] @@ -869,6 +872,16 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5be1bdc7edf596692617627bbfeaba522131b18e06ca4df2b6b689e3c5d5ce84" +[[package]] +name = "rustc_apfloat" +version = "0.2.1+llvm-462a31f5a5ab" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "886d94c63c812a8037c4faca2607453a0fa4cf82f734665266876b022244543f" +dependencies = [ + "bitflags 1.3.2", + "smallvec", +] + [[package]] name = "rustfix" version = "0.6.1" @@ -975,6 +988,12 @@ dependencies = [ "digest", ] +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + [[package]] name = "syn" version = "2.0.39" diff --git a/lints/Cargo.toml 
b/lints/Cargo.toml index 43ece1f6fc5b7..e0b8fe5d96664 100644 --- a/lints/Cargo.toml +++ b/lints/Cargo.toml @@ -14,7 +14,7 @@ path = "ui/format_error.rs" # See `README.md` before bumping the version. # Remember to update the version in `ci/Dockerfile` as well. [dependencies] -clippy_utils = { git = "https://github.com/risingwavelabs/clippy", rev = "5e2a7c6adebdb0478ee6d5b67ab4ee94153b2997" } +clippy_utils = { git = "https://github.com/risingwavelabs/clippy", rev = "61e1d2fd7062e46ccf1237707ee6da5aac018f70" } dylint_linting = "3.1.0" itertools = "0.12" diff --git a/lints/rust-toolchain b/lints/rust-toolchain index a146af66cd637..31dbc57d04b2b 100644 --- a/lints/rust-toolchain +++ b/lints/rust-toolchain @@ -1,5 +1,5 @@ # See `README.md` before bumping the version. [toolchain] -channel = "nightly-2024-06-06" +channel = "nightly-2024-07-19" components = ["llvm-tools-preview", "rustc-dev"] diff --git a/proto/connector_service.proto b/proto/connector_service.proto index 964d227452548..99d9c58d4f1ed 100644 --- a/proto/connector_service.proto +++ b/proto/connector_service.proto @@ -229,9 +229,15 @@ message CoordinateRequest { SinkMetadata metadata = 2; } + message UpdateVnodeBitmapRequest { + common.Buffer vnode_bitmap = 1; + } + oneof msg { StartCoordinationRequest start_request = 1; CommitRequest commit_request = 2; + UpdateVnodeBitmapRequest update_vnode_request = 3; + bool stop = 4; } } diff --git a/proto/hummock.proto b/proto/hummock.proto index 19b7e036c9686..7956b4515dce8 100644 --- a/proto/hummock.proto +++ b/proto/hummock.proto @@ -104,6 +104,11 @@ message GroupTableChange { message GroupDestroy {} +message GroupMerge { + uint64 left_group_id = 1; + uint64 right_group_id = 2; +} + message GroupDelta { oneof delta_type { IntraLevelDelta intra_level = 1; @@ -111,6 +116,7 @@ message GroupDelta { GroupDestroy group_destroy = 3; GroupMetaChange group_meta_change = 4 [deprecated = true]; GroupTableChange group_table_change = 5 [deprecated = true]; + GroupMerge group_merge = 6; } } @@ -744,6 +750,7 @@ message PinVersionResponse { message SplitCompactionGroupRequest { uint64 group_id = 1; repeated uint32 table_ids = 2; + uint32 partition_vnode_count = 3; } message SplitCompactionGroupResponse { @@ -833,12 +840,20 @@ message CancelCompactTaskResponse { message GetVersionByEpochRequest { uint64 epoch = 1; + uint32 table_id = 2; } message GetVersionByEpochResponse { HummockVersion version = 1; } +message MergeCompactionGroupRequest { + uint64 left_group_id = 1; + uint64 right_group_id = 2; +} + +message MergeCompactionGroupResponse {} + service HummockManagerService { rpc UnpinVersionBefore(UnpinVersionBeforeRequest) returns (UnpinVersionBeforeResponse); rpc GetCurrentVersion(GetCurrentVersionRequest) returns (GetCurrentVersionResponse); @@ -880,6 +895,7 @@ service HummockManagerService { rpc CancelCompactTask(CancelCompactTaskRequest) returns (CancelCompactTaskResponse); rpc ListChangeLogEpochs(ListChangeLogEpochsRequest) returns (ListChangeLogEpochsResponse); rpc GetVersionByEpoch(GetVersionByEpochRequest) returns (GetVersionByEpochResponse); + rpc MergeCompactionGroup(MergeCompactionGroupRequest) returns (MergeCompactionGroupResponse); } message CompactionConfig { diff --git a/proto/meta.proto b/proto/meta.proto index 8932dcbc9e033..98a7f267c0124 100644 --- a/proto/meta.proto +++ b/proto/meta.proto @@ -791,3 +791,30 @@ message RelationIdInfos { // relation_id -> FragmentIdToActorIdMap map map = 1; } + +message ActorCountPerParallelism { + message WorkerActorCount { + uint64 actor_count = 1; + 
uint64 parallelism = 2; + } + map worker_id_to_actor_count = 1; + uint64 hard_limit = 2; + uint64 soft_limit = 3; +} + +message ClusterLimit { + oneof limit { + ActorCountPerParallelism actor_count = 1; + // TODO: limit DDL using compaction pending bytes + } +} + +message GetClusterLimitsRequest {} + +message GetClusterLimitsResponse { + repeated ClusterLimit active_limits = 1; +} + +service ClusterLimitService { + rpc GetClusterLimits(GetClusterLimitsRequest) returns (GetClusterLimitsResponse); +} diff --git a/proto/stream_service.proto b/proto/stream_service.proto index 54ffc3d5ff79c..ce727ba9cc55c 100644 --- a/proto/stream_service.proto +++ b/proto/stream_service.proto @@ -17,16 +17,6 @@ message BuildActorInfo { map related_subscriptions = 2; } -message DropActorsRequest { - string request_id = 1; - repeated uint32 actor_ids = 2; -} - -message DropActorsResponse { - string request_id = 1; - common.Status status = 2; -} - message InjectBarrierRequest { string request_id = 1; stream_plan.Barrier barrier = 2; @@ -109,7 +99,6 @@ message StreamingControlStreamResponse { } service StreamService { - rpc DropActors(DropActorsRequest) returns (DropActorsResponse); rpc WaitEpochCommit(WaitEpochCommitRequest) returns (WaitEpochCommitResponse); rpc StreamingControlStream(stream StreamingControlStreamRequest) returns (stream StreamingControlStreamResponse); } diff --git a/src/batch/src/executor/join/distributed_lookup_join.rs b/src/batch/src/executor/join/distributed_lookup_join.rs index 1068ffd7f3349..74d7843013e4d 100644 --- a/src/batch/src/executor/join/distributed_lookup_join.rs +++ b/src/batch/src/executor/join/distributed_lookup_join.rs @@ -17,8 +17,9 @@ use std::mem::swap; use futures::pin_mut; use itertools::Itertools; +use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnDesc, ColumnId, Field, Schema}; -use risingwave_common::hash::{HashKey, HashKeyDispatcher}; +use risingwave_common::hash::{HashKey, HashKeyDispatcher, VirtualNode}; use risingwave_common::memory::MemoryContext; use risingwave_common::row::OwnedRow; use risingwave_common::types::{DataType, Datum}; @@ -30,7 +31,7 @@ use risingwave_pb::batch_plan::plan_node::NodeBody; use risingwave_pb::common::BatchQueryEpoch; use risingwave_storage::store::PrefetchOptions; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::{TableDistribution, TableIter}; +use risingwave_storage::table::TableIter; use risingwave_storage::{dispatch_state_store, StateStore}; use crate::error::Result; @@ -194,7 +195,8 @@ impl BoxedExecutorBuilder for DistributedLookupJoinExecutorBuilder { .collect(); // Lookup Join always contains distribution key, so we don't need vnode bitmap - let vnodes = Some(TableDistribution::all_vnodes()); + // TODO(var-vnode): use vnode count from table desc + let vnodes = Some(Bitmap::ones(VirtualNode::COUNT).into()); dispatch_state_store!(source.context().state_store(), state_store, { let table = StorageTable::new_partial(state_store, column_ids, vnodes, table_desc); let inner_side_builder = InnerSideExecutorBuilder::new( diff --git a/src/batch/src/executor/join/hash_join.rs b/src/batch/src/executor/join/hash_join.rs index 3bfb583d6459d..863e53035626a 100644 --- a/src/batch/src/executor/join/hash_join.rs +++ b/src/batch/src/executor/join/hash_join.rs @@ -162,9 +162,8 @@ impl<'a> Iterator for RowIdIter<'a> { type Item = RowId; fn next(&mut self) -> Option { - self.current_row_id.map(|row_id| { - self.current_row_id = self.next_row_id[row_id]; - row_id + 
self.current_row_id.inspect(|row_id| { + self.current_row_id = self.next_row_id[*row_id]; }) } } diff --git a/src/batch/src/executor/join/local_lookup_join.rs b/src/batch/src/executor/join/local_lookup_join.rs index a3be00fc39a22..7c7a08af5d873 100644 --- a/src/batch/src/executor/join/local_lookup_join.rs +++ b/src/batch/src/executor/join/local_lookup_join.rs @@ -17,7 +17,7 @@ use std::marker::PhantomData; use anyhow::Context; use itertools::Itertools; -use risingwave_common::bitmap::BitmapBuilder; +use risingwave_common::bitmap::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::{ColumnDesc, Field, Schema}; use risingwave_common::hash::table_distribution::TableDistribution; use risingwave_common::hash::{ @@ -408,12 +408,11 @@ impl BoxedExecutorBuilder for LocalLookupJoinExecutorBuilder { }) .collect(); + // TODO(var-vnode): use vnode count from table desc + let vnodes = Some(Bitmap::ones(VirtualNode::COUNT).into()); let inner_side_builder = InnerSideExecutorBuilder { table_desc: table_desc.clone(), - table_distribution: TableDistribution::new_from_storage_table_desc( - Some(TableDistribution::all_vnodes()), - table_desc, - ), + table_distribution: TableDistribution::new_from_storage_table_desc(vnodes, table_desc), vnode_mapping, outer_side_key_types, inner_side_schema, diff --git a/src/batch/src/executor/log_row_seq_scan.rs b/src/batch/src/executor/log_row_seq_scan.rs index 7106eaec1b760..be2a11b756946 100644 --- a/src/batch/src/executor/log_row_seq_scan.rs +++ b/src/batch/src/executor/log_row_seq_scan.rs @@ -22,13 +22,14 @@ use prometheus::Histogram; use risingwave_common::array::{DataChunk, Op}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnId, Field, Schema}; +use risingwave_common::hash::VirtualNode; use risingwave_common::row::{Row, RowExt}; use risingwave_common::types::ScalarImpl; use risingwave_pb::batch_plan::plan_node::NodeBody; use risingwave_pb::common::{batch_query_epoch, BatchQueryEpoch}; use risingwave_pb::plan_common::StorageTableDesc; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::{collect_data_chunk, TableDistribution}; +use risingwave_storage::table::collect_data_chunk; use risingwave_storage::{dispatch_state_store, StateStore}; use super::{BoxedDataChunkStream, BoxedExecutor, BoxedExecutorBuilder, Executor, ExecutorBuilder}; @@ -106,7 +107,8 @@ impl BoxedExecutorBuilder for LogStoreRowSeqScanExecutorBuilder { Some(vnodes) => Some(Bitmap::from(vnodes).into()), // This is possible for dml. vnode_bitmap is not filled by scheduler. // Or it's single distribution, e.g., distinct agg. We scan in a single executor. 
- None => Some(TableDistribution::all_vnodes()), + // TODO(var-vnode): use vnode count from table desc + None => Some(Bitmap::ones(VirtualNode::COUNT).into()), }; let chunk_size = source.context.get_config().developer.chunk_size as u32; diff --git a/src/batch/src/executor/row_seq_scan.rs b/src/batch/src/executor/row_seq_scan.rs index b897dbd813787..7c7244d954764 100644 --- a/src/batch/src/executor/row_seq_scan.rs +++ b/src/batch/src/executor/row_seq_scan.rs @@ -21,6 +21,7 @@ use prometheus::Histogram; use risingwave_common::array::DataChunk; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnId, Schema}; +use risingwave_common::hash::VirtualNode; use risingwave_common::row::{OwnedRow, Row}; use risingwave_common::types::{DataType, Datum}; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; @@ -32,7 +33,6 @@ use risingwave_pb::plan_common::as_of::AsOfType; use risingwave_pb::plan_common::{as_of, PbAsOf, StorageTableDesc}; use risingwave_storage::store::PrefetchOptions; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::TableDistribution; use risingwave_storage::{dispatch_state_store, StateStore}; use crate::error::{BatchError, Result}; @@ -210,7 +210,8 @@ impl BoxedExecutorBuilder for RowSeqScanExecutorBuilder { Some(vnodes) => Some(Bitmap::from(vnodes).into()), // This is possible for dml. vnode_bitmap is not filled by scheduler. // Or it's single distribution, e.g., distinct agg. We scan in a single executor. - None => Some(TableDistribution::all_vnodes()), + // TODO(var-vnode): use vnode count from table desc + None => Some(Bitmap::ones(VirtualNode::COUNT).into()), }; let scan_ranges = { diff --git a/src/batch/src/lib.rs b/src/batch/src/lib.rs index 414f27b33b4a7..9b88c3be9cd68 100644 --- a/src/batch/src/lib.rs +++ b/src/batch/src/lib.rs @@ -20,7 +20,6 @@ #![feature(coroutines)] #![feature(proc_macro_hygiene, stmt_expr_attributes)] #![feature(iterator_try_collect)] -#![feature(lint_reasons)] #![feature(is_sorted)] #![recursion_limit = "256"] #![feature(let_chains)] diff --git a/src/batch/src/task/consistent_hash_shuffle_channel.rs b/src/batch/src/task/consistent_hash_shuffle_channel.rs index ad0fdbaa8b70a..32d91a7acc09b 100644 --- a/src/batch/src/task/consistent_hash_shuffle_channel.rs +++ b/src/batch/src/task/consistent_hash_shuffle_channel.rs @@ -59,6 +59,7 @@ fn generate_hash_values( .iter() .map(|idx| *idx as usize) .collect::>(), + consistent_hash_info.vmap.len(), ); let hash_values = vnodes diff --git a/src/batch/src/worker_manager/worker_node_manager.rs b/src/batch/src/worker_manager/worker_node_manager.rs index 80cd2806f2b64..fd4d0e37bbbc4 100644 --- a/src/batch/src/worker_manager/worker_node_manager.rs +++ b/src/batch/src/worker_manager/worker_node_manager.rs @@ -19,7 +19,7 @@ use std::time::Duration; use rand::seq::SliceRandom; use risingwave_common::bail; use risingwave_common::catalog::OBJECT_ID_PLACEHOLDER; -use risingwave_common::hash::{WorkerSlotId, WorkerSlotMapping}; +use risingwave_common::hash::{VirtualNode, WorkerSlotId, WorkerSlotMapping}; use risingwave_common::vnode_mapping::vnode_placement::place_vnode; use risingwave_pb::common::{WorkerNode, WorkerType}; @@ -374,7 +374,9 @@ impl WorkerNodeSelector { }; // 2. Temporary mapping that filters out unavailable workers. 
let new_workers = self.apply_worker_node_mask(self.manager.list_serving_worker_nodes()); - let masked_mapping = place_vnode(hint.as_ref(), &new_workers, parallelism); + // TODO(var-vnode): use vnode count from config + let masked_mapping = + place_vnode(hint.as_ref(), &new_workers, parallelism, VirtualNode::COUNT); masked_mapping.ok_or_else(|| BatchError::EmptyWorkerNodes) } } diff --git a/src/common/benches/bench_data_chunk_encoding.rs b/src/common/benches/bench_data_chunk_encoding.rs index 96413a4305205..4b09aeaeed5c2 100644 --- a/src/common/benches/bench_data_chunk_encoding.rs +++ b/src/common/benches/bench_data_chunk_encoding.rs @@ -55,7 +55,7 @@ fn bench_data_chunk_encoding(c: &mut Criterion) { for null_ratio in NULL_RATIOS { for chunk_size in CHUNK_SIZES { let chunk = rand_chunk::gen_chunk(&case.data_types, *chunk_size, SEED, *null_ratio); - let mut group = c.benchmark_group(&format!( + let mut group = c.benchmark_group(format!( "data chunk encoding: {}, {} rows, Pr[null]={}", case.name, chunk_size, null_ratio )); diff --git a/src/common/benches/bench_sequencer.rs b/src/common/benches/bench_sequencer.rs index 12e92f1f3332d..591b5fd64ee3a 100644 --- a/src/common/benches/bench_sequencer.rs +++ b/src/common/benches/bench_sequencer.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(lint_reasons)] - use std::cell::RefCell; use std::hint::black_box; use std::sync::atomic::{AtomicUsize, Ordering}; diff --git a/src/common/common_service/src/lib.rs b/src/common/common_service/src/lib.rs index 2cf9a56e076f3..ecf89a84fce88 100644 --- a/src/common/common_service/src/lib.rs +++ b/src/common/common_service/src/lib.rs @@ -14,7 +14,6 @@ // This is a stub lib.rs. -#![feature(lint_reasons)] #![feature(impl_trait_in_assoc_type)] #![feature(error_generic_member_access)] diff --git a/src/common/metrics/src/guarded_metrics.rs b/src/common/metrics/src/guarded_metrics.rs index 27710748ae359..9b16cc778938c 100644 --- a/src/common/metrics/src/guarded_metrics.rs +++ b/src/common/metrics/src/guarded_metrics.rs @@ -83,6 +83,22 @@ macro_rules! register_guarded_int_gauge_vec_with_registry { }}; } +#[macro_export] +macro_rules! register_guarded_uint_gauge_vec_with_registry { + ($NAME:expr, $HELP:expr, $LABELS_NAMES:expr, $REGISTRY:expr $(,)?) => {{ + let inner = prometheus::core::GenericGaugeVec::::new( + prometheus::opts!($NAME, $HELP), + $LABELS_NAMES, + ); + inner.and_then(|inner| { + let inner = $crate::__extract_gauge_builder(inner); + let label_guarded = $crate::LabelGuardedUintGaugeVec::new(inner, { $LABELS_NAMES }); + let result = ($REGISTRY).register(Box::new(label_guarded.clone())); + result.map(move |()| label_guarded) + }) + }}; +} + #[macro_export] macro_rules! register_guarded_int_counter_vec_with_registry { ($NAME:expr, $HELP:expr, $LABELS_NAMES:expr, $REGISTRY:expr $(,)?) 
=> {{ @@ -131,6 +147,8 @@ pub type LabelGuardedIntCounterVec = LabelGuardedMetricVec, N>; pub type LabelGuardedIntGaugeVec = LabelGuardedMetricVec, N>; +pub type LabelGuardedUintGaugeVec = + LabelGuardedMetricVec, N>; pub type LabelGuardedGaugeVec = LabelGuardedMetricVec, N>; diff --git a/src/common/src/array/arrow/arrow_impl.rs b/src/common/src/array/arrow/arrow_impl.rs index acc39bc951975..8fa3e2abb6b5f 100644 --- a/src/common/src/array/arrow/arrow_impl.rs +++ b/src/common/src/array/arrow/arrow_impl.rs @@ -448,12 +448,17 @@ pub trait ToArrow { #[inline] fn map_type_to_arrow(&self, map_type: &MapType) -> Result { let sorted = false; - let list_type = map_type.clone().into_list(); + // "key" is always non-null + let key = self + .to_arrow_field("key", map_type.key())? + .with_nullable(false); + let value = self.to_arrow_field("value", map_type.value())?; Ok(arrow_schema::DataType::Map( Arc::new(arrow_schema::Field::new( "entries", - self.list_type_to_arrow(&list_type)?, - true, + arrow_schema::DataType::Struct([Arc::new(key), Arc::new(value)].into()), + // "entries" is always non-null + false, )), sorted, )) diff --git a/src/common/src/array/arrow/arrow_udf.rs b/src/common/src/array/arrow/arrow_udf.rs index e461f49e576a6..a5296ca21cab8 100644 --- a/src/common/src/array/arrow/arrow_udf.rs +++ b/src/common/src/array/arrow/arrow_udf.rs @@ -125,6 +125,7 @@ impl FromArrow for UdfArrowConvert { #[cfg(test)] mod tests { + use super::*; use crate::array::*; @@ -205,4 +206,120 @@ mod tests { .unwrap(); assert_eq!(rw_array.as_list(), &array); } + + #[test] + fn map() { + let map_type = MapType::from_kv(DataType::Varchar, DataType::Int32); + let rw_map_type = DataType::Map(map_type.clone()); + let mut builder = MapArrayBuilder::with_type(3, rw_map_type.clone()); + builder.append_owned(Some( + MapValue::try_from_kv( + ListValue::from_str("{a,b,c}", &DataType::List(Box::new(DataType::Varchar))) + .unwrap(), + ListValue::from_str("{1,2,3}", &DataType::List(Box::new(DataType::Int32))).unwrap(), + ) + .unwrap(), + )); + builder.append_owned(None); + builder.append_owned(Some( + MapValue::try_from_kv( + ListValue::from_str("{a,c}", &DataType::List(Box::new(DataType::Varchar))).unwrap(), + ListValue::from_str("{1,3}", &DataType::List(Box::new(DataType::Int32))).unwrap(), + ) + .unwrap(), + )); + let rw_array = builder.finish(); + + let arrow_map_type = UdfArrowConvert::default() + .map_type_to_arrow(&map_type) + .unwrap(); + expect_test::expect![[r#" + Map( + Field { + name: "entries", + data_type: Struct( + [ + Field { + name: "key", + data_type: Utf8, + nullable: false, + dict_id: 0, + dict_is_ordered: false, + metadata: {}, + }, + Field { + name: "value", + data_type: Int32, + nullable: true, + dict_id: 0, + dict_is_ordered: false, + metadata: {}, + }, + ], + ), + nullable: false, + dict_id: 0, + dict_is_ordered: false, + metadata: {}, + }, + false, + ) + "#]] + .assert_debug_eq(&arrow_map_type); + let rw_map_type_new = UdfArrowConvert::default() + .from_field(&arrow_schema::Field::new( + "map", + arrow_map_type.clone(), + true, + )) + .unwrap(); + assert_eq!(rw_map_type, rw_map_type_new); + let arrow = UdfArrowConvert::default() + .map_to_arrow(&arrow_map_type, &rw_array) + .unwrap(); + expect_test::expect![[r#" + MapArray + [ + StructArray + [ + -- child 0: "key" (Utf8) + StringArray + [ + "a", + "b", + "c", + ] + -- child 1: "value" (Int32) + PrimitiveArray + [ + 1, + 2, + 3, + ] + ], + null, + StructArray + [ + -- child 0: "key" (Utf8) + StringArray + [ + "a", + "c", + ] + -- child 1: "value" (Int32) 
+ PrimitiveArray + [ + 1, + 3, + ] + ], + ] + "#]] + .assert_debug_eq(&arrow); + + let rw_array_new = UdfArrowConvert::default() + .from_map_array(arrow.as_any().downcast_ref().unwrap()) + .unwrap(); + assert_eq!(&rw_array, rw_array_new.as_map()); + } } diff --git a/src/common/src/bitmap.rs b/src/common/src/bitmap.rs index 7ef6bf039f47d..ae07105164408 100644 --- a/src/common/src/bitmap.rs +++ b/src/common/src/bitmap.rs @@ -685,6 +685,12 @@ impl From<&PbBuffer> for Bitmap { } } +impl From<PbBuffer> for Bitmap { + fn from(buf: PbBuffer) -> Self { + Self::from(&buf) + } +} + /// Bitmap iterator. pub struct BitmapIter<'a> { bits: Option<&'a [usize]>, diff --git a/src/common/src/config.rs b/src/common/src/config.rs index 88ea110869b79..e2b4dd7b0f97c 100644 --- a/src/common/src/config.rs +++ b/src/common/src/config.rs @@ -33,7 +33,6 @@ use serde_default::DefaultFromSerde; use serde_json::Value; use crate::for_all_params; -use crate::hash::VirtualNode; /// Use the maximum value for HTTP/2 connection window size to avoid deadlock among multiplexed /// streams on the same connection. @@ -427,16 +426,13 @@ impl<'de> Deserialize<'de> for DefaultParallelism { ))) } } - Parallelism::Int(i) => Ok(DefaultParallelism::Default(if i > VirtualNode::COUNT { - Err(serde::de::Error::custom(format!( - "default parallelism should be not great than {}", - VirtualNode::COUNT - )))? - } else { + Parallelism::Int(i) => Ok(DefaultParallelism::Default( + // Note: we won't check whether this exceeds the maximum parallelism (i.e., vnode count) + // here because it requires extra context. The check will be done when scheduling jobs. NonZeroUsize::new(i).ok_or_else(|| { - serde::de::Error::custom("default parallelism should be greater than 0") - })? - })), + serde::de::Error::custom("default parallelism should not be 0") + })?, + )), } } } @@ -466,6 +462,16 @@ pub struct MetaDeveloperConfig { #[serde(default = "default::developer::max_get_task_probe_times")] pub max_get_task_probe_times: usize, + + /// Max number of actors allowed per parallelism (default = 100). + /// CREATE MV/Table will raise a notice when the number of actors exceeds this limit. + #[serde(default = "default::developer::actor_cnt_per_worker_parallelism_soft_limit")] + pub actor_cnt_per_worker_parallelism_soft_limit: usize, + + /// Max number of actors allowed per parallelism (default = 400). + /// CREATE MV/Table will be rejected when the number of actors exceeds this limit. + #[serde(default = "default::developer::actor_cnt_per_worker_parallelism_hard_limit")] + pub actor_cnt_per_worker_parallelism_hard_limit: usize, } /// The section `[server]` in `risingwave.toml`.
@@ -693,6 +699,9 @@ pub struct StorageConfig { #[serde(default)] pub prefetch_buffer_capacity_mb: Option<usize>, + #[serde(default)] + pub max_cached_recent_versions_number: Option<usize>, + /// max prefetch block number #[serde(default = "default::storage::max_prefetch_block_number")] pub max_prefetch_block_number: usize, @@ -1859,6 +1868,14 @@ pub mod default { 5 } + pub fn actor_cnt_per_worker_parallelism_soft_limit() -> usize { + 100 + } + + pub fn actor_cnt_per_worker_parallelism_hard_limit() -> usize { + 400 + } + pub fn memory_controller_threshold_aggressive() -> f64 { 0.9 } diff --git a/src/common/src/hash/consistent_hash/bitmap.rs b/src/common/src/hash/consistent_hash/bitmap.rs index 773231ba36a89..eee6a64a2b42c 100644 --- a/src/common/src/hash/consistent_hash/bitmap.rs +++ b/src/common/src/hash/consistent_hash/bitmap.rs @@ -15,6 +15,7 @@ use std::ops::RangeInclusive; use crate::bitmap::Bitmap; +use crate::hash::table_distribution::SINGLETON_VNODE; use crate::hash::VirtualNode; /// An extension trait for `Bitmap` to support virtual node operations. @@ -36,4 +37,17 @@ impl Bitmap { self.high_ranges() .map(|r| (VirtualNode::from_index(*r.start())..=VirtualNode::from_index(*r.end()))) } + + /// Returns whether only the [`SINGLETON_VNODE`] is set in the bitmap. + /// + /// Note that this method returning `true` does not imply that the bitmap was created by + /// [`VnodeBitmapExt::singleton`], or that the bitmap has length 1. + pub fn is_singleton(&self) -> bool { + self.count_ones() == 1 && self.iter_vnodes().next().unwrap() == SINGLETON_VNODE + } + + /// Creates a bitmap with length 1 and the single bit set. + pub fn singleton() -> Self { + Self::ones(1) + } } diff --git a/src/common/src/hash/consistent_hash/mapping.rs b/src/common/src/hash/consistent_hash/mapping.rs index a462acb291853..0ab8f9e18fd2e 100644 --- a/src/common/src/hash/consistent_hash/mapping.rs +++ b/src/common/src/hash/consistent_hash/mapping.rs @@ -105,26 +105,26 @@ impl VnodeMapping { /// /// For example, if `items` is `[0, 1, 2]`, and the total vnode count is 10, we'll generate /// mapping like `[0, 0, 0, 0, 1, 1, 1, 2, 2, 2]`. - pub fn new_uniform(items: impl ExactSizeIterator<Item = T::Item>) -> Self { + pub fn new_uniform(items: impl ExactSizeIterator<Item = T::Item>, vnode_count: usize) -> Self { // If the number of items is greater than the total vnode count, no vnode will be mapped to // some items and the mapping will be invalid. - assert!(items.len() <= VirtualNode::COUNT); + assert!(items.len() <= vnode_count); let mut original_indices = Vec::with_capacity(items.len()); let mut data = Vec::with_capacity(items.len()); - let hash_shard_size = VirtualNode::COUNT / items.len(); - let mut one_more_count = VirtualNode::COUNT % items.len(); + let hash_shard_size = vnode_count / items.len(); + let mut one_more_count = vnode_count % items.len(); let mut init_bound = 0; for item in items { - let vnode_count = if one_more_count > 0 { + let count = if one_more_count > 0 { one_more_count -= 1; hash_shard_size + 1 } else { hash_shard_size }; - init_bound += vnode_count; + init_bound += count; original_indices.push(init_bound as u32 - 1); data.push(item); @@ -141,10 +141,11 @@ impl VnodeMapping { /// Create a vnode mapping where all vnodes are mapped to the same single item. pub fn new_single(item: T::Item) -> Self { - Self::new_uniform(std::iter::once(item)) + // TODO(var-vnode): always 1 correct? + Self::new_uniform(std::iter::once(item), 1) } - /// The length of the vnode in this mapping, typically [`VirtualNode::COUNT`].
+ /// The length (or count) of the vnodes in this mapping. pub fn len(&self) -> usize { self.original_indices .last() @@ -204,12 +205,13 @@ impl VnodeMapping { /// Convert this vnode mapping to a mapping from items to bitmaps, where each bitmap represents /// the vnodes mapped to the item. pub fn to_bitmaps(&self) -> HashMap<T::Item, Bitmap> { + let vnode_count = self.len(); let mut vnode_bitmaps = HashMap::new(); for (vnode, item) in self.iter_with_vnode() { vnode_bitmaps .entry(item) - .or_insert_with(|| BitmapBuilder::zeroed(VirtualNode::COUNT)) + .or_insert_with(|| BitmapBuilder::zeroed(vnode_count)) .set(vnode.to_index(), true); } @@ -222,10 +224,11 @@ impl VnodeMapping { /// Create a vnode mapping from the given mapping from items to bitmaps, where each bitmap /// represents the vnodes mapped to the item. pub fn from_bitmaps(bitmaps: &HashMap<T::Item, Bitmap>) -> Self { - let mut items = vec![None; VirtualNode::COUNT]; + let vnode_count = bitmaps.values().next().expect("empty bitmaps").len(); + let mut items = vec![None; vnode_count]; for (&item, bitmap) in bitmaps { - assert_eq!(bitmap.len(), VirtualNode::COUNT); + assert_eq!(bitmap.len(), vnode_count); for idx in bitmap.iter_ones() { if let Some(prev) = items[idx].replace(item) { panic!("mapping at index `{idx}` is set to both `{prev:?}` and `{item:?}`"); @@ -241,9 +244,8 @@ impl VnodeMapping { Self::from_expanded(&items) } - /// Create a vnode mapping from the expanded slice of items with length [`VirtualNode::COUNT`]. + /// Create a vnode mapping from the expanded slice of items. pub fn from_expanded(items: &[T::Item]) -> Self { - assert_eq!(items.len(), VirtualNode::COUNT); let (original_indices, data) = compress_data(items); Self { original_indices, @@ -251,7 +253,7 @@ impl VnodeMapping { } } - /// Convert this vnode mapping to a expanded vector of items with length [`VirtualNode::COUNT`]. + /// Convert this vnode mapping to an expanded vector of items. pub fn to_expanded(&self) -> ExpandedMapping { self.iter().collect() } @@ -353,8 +355,8 @@ impl ActorMapping { impl WorkerSlotMapping { /// Create a uniform worker mapping from the given worker ids - pub fn build_from_ids(worker_slot_ids: &[WorkerSlotId]) -> Self { - Self::new_uniform(worker_slot_ids.iter().cloned()) + pub fn build_from_ids(worker_slot_ids: &[WorkerSlotId], vnode_count: usize) -> Self { + Self::new_uniform(worker_slot_ids.iter().cloned(), vnode_count) } /// Create a worker mapping from the protobuf representation.
@@ -403,18 +405,18 @@ mod tests { type TestMapping = VnodeMapping; type Test2Mapping = VnodeMapping; - const COUNTS: &[usize] = &[1, 3, 12, 42, VirtualNode::COUNT]; + const COUNTS: &[usize] = &[1, 3, 12, 42, VirtualNode::COUNT_FOR_TEST]; fn uniforms() -> impl Iterator { COUNTS .iter() - .map(|&count| TestMapping::new_uniform(0..count as u32)) + .map(|&count| TestMapping::new_uniform(0..count as u32, VirtualNode::COUNT_FOR_TEST)) } fn randoms() -> impl Iterator { COUNTS.iter().map(|&count| { let raw = repeat_with(|| rand::thread_rng().gen_range(0..count as u32)) - .take(VirtualNode::COUNT) + .take(VirtualNode::COUNT_FOR_TEST) .collect_vec(); TestMapping::from_expanded(&raw) }) @@ -427,7 +429,7 @@ mod tests { #[test] fn test_uniform() { for vnode_mapping in uniforms() { - assert_eq!(vnode_mapping.len(), VirtualNode::COUNT); + assert_eq!(vnode_mapping.len(), VirtualNode::COUNT_FOR_TEST); let item_count = vnode_mapping.iter_unique().count(); let mut check: HashMap> = HashMap::new(); diff --git a/src/common/src/hash/consistent_hash/vnode.rs b/src/common/src/hash/consistent_hash/vnode.rs index f528544689f31..685f99d6cf4f4 100644 --- a/src/common/src/hash/consistent_hash/vnode.rs +++ b/src/common/src/hash/consistent_hash/vnode.rs @@ -30,26 +30,45 @@ use crate::util::row_id::extract_vnode_id_from_row_id; pub struct VirtualNode(VirtualNodeInner); /// The internal representation of a virtual node id. +/// +/// Note: not all bits of the inner representation might be used. type VirtualNodeInner = u16; -static_assertions::const_assert!(VirtualNodeInner::BITS >= VirtualNode::BITS as u32); -impl From for VirtualNode { - fn from(hash_code: Crc32HashCode) -> Self { +/// `vnode_count` must be provided to convert a hash code to a virtual node. +/// +/// Use [`Crc32HashCodeToVnodeExt::to_vnode`] instead. +impl !From for VirtualNode {} + +#[easy_ext::ext(Crc32HashCodeToVnodeExt)] +impl Crc32HashCode { + /// Converts the hash code to a virtual node, based on the given total count of vnodes. + fn to_vnode(self, vnode_count: usize) -> VirtualNode { // Take the least significant bits of the hash code. // TODO: should we use the most significant bits? - let inner = (hash_code.value() % Self::COUNT as u64) as VirtualNodeInner; + let inner = (self.value() % vnode_count as u64) as VirtualNodeInner; VirtualNode(inner) } } impl VirtualNode { - /// The number of bits used to represent a virtual node. - /// - /// Note: Not all bits of the inner representation are used. One should rely on this constant - /// to determine the count of virtual nodes. - pub const BITS: usize = 8; /// The total count of virtual nodes. - pub const COUNT: usize = 1 << Self::BITS; + // TODO(var-vnode): remove this and only keep `COUNT_FOR_TEST` + pub const COUNT: usize = 1 << 8; + /// The maximum value of the virtual node. + // TODO(var-vnode): remove this and only keep `MAX_FOR_TEST` + pub const MAX: VirtualNode = VirtualNode::from_index(Self::COUNT - 1); +} + +impl VirtualNode { + /// The total count of virtual nodes, for testing purposes. + pub const COUNT_FOR_TEST: usize = Self::COUNT; + /// The maximum value of the virtual node, for testing purposes. + pub const MAX_FOR_TEST: VirtualNode = Self::MAX; +} + +impl VirtualNode { + /// The maximum count of virtual nodes that fits in [`VirtualNodeInner`]. + pub const MAX_COUNT: usize = 1 << VirtualNodeInner::BITS; /// The size of a virtual node in bytes, in memory or serialized representation. 
pub const SIZE: usize = std::mem::size_of::(); } @@ -58,8 +77,6 @@ impl VirtualNode { pub type AllVirtualNodeIter = std::iter::Map, fn(usize) -> VirtualNode>; impl VirtualNode { - /// The maximum value of the virtual node. - pub const MAX: VirtualNode = VirtualNode::from_index(Self::COUNT - 1); /// We may use `VirtualNode` as a datum in a stream, or store it as a column. /// Hence this reifies it as a RW datatype. pub const RW_TYPE: DataType = DataType::Int16; @@ -68,7 +85,7 @@ impl VirtualNode { /// Creates a virtual node from the `usize` index. pub const fn from_index(index: usize) -> Self { - debug_assert!(index < Self::COUNT); + debug_assert!(index < Self::MAX_COUNT); Self(index as _) } @@ -79,7 +96,6 @@ impl VirtualNode { /// Creates a virtual node from the given scalar representation. Used by `VNODE` expression. pub const fn from_scalar(scalar: i16) -> Self { - debug_assert!((scalar as usize) < Self::COUNT); Self(scalar as _) } @@ -99,7 +115,6 @@ impl VirtualNode { /// Creates a virtual node from the given big-endian bytes representation. pub const fn from_be_bytes(bytes: [u8; Self::SIZE]) -> Self { let inner = VirtualNodeInner::from_be_bytes(bytes); - debug_assert!((inner as usize) < Self::COUNT); Self(inner) } @@ -109,8 +124,8 @@ impl VirtualNode { } /// Iterates over all virtual nodes. - pub fn all() -> AllVirtualNodeIter { - (0..Self::COUNT).map(Self::from_index) + pub fn all(vnode_count: usize) -> AllVirtualNodeIter { + (0..vnode_count).map(Self::from_index) } } @@ -119,7 +134,11 @@ impl VirtualNode { // chunk. When only one column is provided and its type is `Serial`, we consider the column to // be the one that contains RowId, and use a special method to skip the calculation of Hash // and directly extract the `VirtualNode` from `RowId`. - pub fn compute_chunk(data_chunk: &DataChunk, keys: &[usize]) -> Vec { + pub fn compute_chunk( + data_chunk: &DataChunk, + keys: &[usize], + vnode_count: usize, + ) -> Vec { if let Ok(idx) = keys.iter().exactly_one() && let ArrayImpl::Serial(serial_array) = &**data_chunk.column_at(*idx) { @@ -135,7 +154,7 @@ impl VirtualNode { // This process doesn’t guarantee the order of rows, producing indeterminate results in some cases, // such as when `distinct on` is used without an `order by`. let (row, _) = data_chunk.row_at(idx); - row.hash(Crc32FastBuilder).into() + row.hash(Crc32FastBuilder).to_vnode(vnode_count) } }) .collect(); @@ -144,19 +163,29 @@ impl VirtualNode { data_chunk .get_hash_values(keys, Crc32FastBuilder) .into_iter() - .map(|hash| hash.into()) + .map(|hash| hash.to_vnode(vnode_count)) .collect() } + /// Equivalent to [`Self::compute_chunk`] with [`VirtualNode::COUNT_FOR_TEST`] as the vnode count. + pub fn compute_chunk_for_test(data_chunk: &DataChunk, keys: &[usize]) -> Vec { + Self::compute_chunk(data_chunk, keys, Self::COUNT_FOR_TEST) + } + // `compute_row` is used to calculate the `VirtualNode` for the corresponding columns in a // `Row`. Similar to `compute_chunk`, it also contains special handling for serial columns. 
- pub fn compute_row(row: impl Row, indices: &[usize]) -> VirtualNode { + pub fn compute_row(row: impl Row, indices: &[usize], vnode_count: usize) -> VirtualNode { let project = row.project(indices); if let Ok(Some(ScalarRefImpl::Serial(s))) = project.iter().exactly_one().as_ref() { return extract_vnode_id_from_row_id(s.as_row_id()); } - project.hash(Crc32FastBuilder).into() + project.hash(Crc32FastBuilder).to_vnode(vnode_count) + } + + /// Equivalent to [`Self::compute_row`] with [`VirtualNode::COUNT_FOR_TEST`] as the vnode count. + pub fn compute_row_for_test(row: impl Row, indices: &[usize]) -> VirtualNode { + Self::compute_row(row, indices, Self::COUNT_FOR_TEST) } } @@ -179,7 +208,7 @@ mod tests { ); let chunk = DataChunk::from_pretty(chunk.as_str()); - let vnodes = VirtualNode::compute_chunk(&chunk, &[0]); + let vnodes = VirtualNode::compute_chunk_for_test(&chunk, &[0]); assert_eq!( vnodes.as_slice(), @@ -195,7 +224,7 @@ mod tests { Some(ScalarImpl::Int64(12345)), ]); - let vnode = VirtualNode::compute_row(&row, &[0]); + let vnode = VirtualNode::compute_row_for_test(&row, &[0]); assert_eq!(vnode, VirtualNode::from_index(100)); } @@ -216,7 +245,7 @@ mod tests { ); let chunk = DataChunk::from_pretty(chunk.as_str()); - let vnodes = VirtualNode::compute_chunk(&chunk, &[0]); + let vnodes = VirtualNode::compute_chunk_for_test(&chunk, &[0]); assert_eq!( vnodes.as_slice(), diff --git a/src/common/src/hash/table_distribution.rs b/src/common/src/hash/table_distribution.rs index 9be9cd2abafb2..5275aca04adb3 100644 --- a/src/common/src/hash/table_distribution.rs +++ b/src/common/src/hash/table_distribution.rs @@ -13,30 +13,34 @@ // limitations under the License. use std::mem::replace; -use std::ops::Deref; use std::sync::{Arc, LazyLock}; use itertools::Itertools; use risingwave_pb::plan_common::StorageTableDesc; -use tracing::warn; use crate::array::{Array, DataChunk, PrimitiveArray}; -use crate::bitmap::{Bitmap, BitmapBuilder}; +use crate::bitmap::Bitmap; use crate::hash::VirtualNode; use crate::row::Row; use crate::util::iter_util::ZipEqFast; -/// For tables without distribution (singleton), the `DEFAULT_VNODE` is encoded. -pub const DEFAULT_VNODE: VirtualNode = VirtualNode::ZERO; +/// For tables without distribution (singleton), the `SINGLETON_VNODE` is encoded. +pub const SINGLETON_VNODE: VirtualNode = VirtualNode::ZERO; + +use super::VnodeBitmapExt; #[derive(Debug, Clone)] enum ComputeVnode { Singleton, DistKeyIndices { + /// Virtual nodes that the table is partitioned into. + vnodes: Arc, /// Indices of distribution key for computing vnode, based on the pk columns of the table. dist_key_in_pk_indices: Vec, }, VnodeColumnIndex { + /// Virtual nodes that the table is partitioned into. + vnodes: Arc, /// Index of vnode column. vnode_col_idx_in_pk: usize, }, @@ -47,13 +51,8 @@ enum ComputeVnode { pub struct TableDistribution { /// The way to compute vnode provided primary key compute_vnode: ComputeVnode, - - /// Virtual nodes that the table is partitioned into. 
- vnodes: Arc, } -pub const SINGLETON_VNODE: VirtualNode = DEFAULT_VNODE; - impl TableDistribution { pub fn new_from_storage_table_desc( vnodes: Option>, @@ -75,69 +74,32 @@ impl TableDistribution { ) -> Self { let compute_vnode = if let Some(vnode_col_idx_in_pk) = vnode_col_idx_in_pk { ComputeVnode::VnodeColumnIndex { + vnodes: vnodes.unwrap_or_else(|| Bitmap::singleton().into()), vnode_col_idx_in_pk, } } else if !dist_key_in_pk_indices.is_empty() { ComputeVnode::DistKeyIndices { + vnodes: vnodes.expect("vnodes must be `Some` as dist key indices are set"), dist_key_in_pk_indices, } } else { ComputeVnode::Singleton }; - let vnodes = vnodes.unwrap_or_else(Self::singleton_vnode_bitmap); - if let ComputeVnode::Singleton = &compute_vnode { - if &vnodes != Self::singleton_vnode_bitmap_ref() && &vnodes != Self::all_vnodes_ref() { - warn!( - ?vnodes, - "singleton distribution get non-singleton vnode bitmap" - ); - } - } - - Self { - compute_vnode, - vnodes, - } + Self { compute_vnode } } pub fn is_singleton(&self) -> bool { matches!(&self.compute_vnode, ComputeVnode::Singleton) } - pub fn singleton_vnode_bitmap_ref() -> &'static Arc { - /// A bitmap that only the default vnode is set. - static SINGLETON_VNODES: LazyLock> = LazyLock::new(|| { - let mut vnodes = BitmapBuilder::zeroed(VirtualNode::COUNT); - vnodes.set(SINGLETON_VNODE.to_index(), true); - vnodes.finish().into() - }); - - SINGLETON_VNODES.deref() - } - - pub fn singleton_vnode_bitmap() -> Arc { - Self::singleton_vnode_bitmap_ref().clone() - } - - pub fn all_vnodes_ref() -> &'static Arc { - /// A bitmap that all vnodes are set. - static ALL_VNODES: LazyLock> = - LazyLock::new(|| Bitmap::ones(VirtualNode::COUNT).into()); - &ALL_VNODES - } - - pub fn all_vnodes() -> Arc { - Self::all_vnodes_ref().clone() - } - /// Distribution that accesses all vnodes, mainly used for tests. - pub fn all(dist_key_in_pk_indices: Vec) -> Self { + pub fn all(dist_key_in_pk_indices: Vec, vnode_count: usize) -> Self { Self { compute_vnode: ComputeVnode::DistKeyIndices { + vnodes: Bitmap::ones(vnode_count).into(), dist_key_in_pk_indices, }, - vnodes: Self::all_vnodes(), } } @@ -145,20 +107,39 @@ impl TableDistribution { pub fn singleton() -> Self { Self { compute_vnode: ComputeVnode::Singleton, - vnodes: Self::singleton_vnode_bitmap(), } } pub fn update_vnode_bitmap(&mut self, new_vnodes: Arc) -> Arc { - if self.is_singleton() && &new_vnodes != Self::singleton_vnode_bitmap_ref() { - warn!(?new_vnodes, "update vnode on singleton distribution"); + match &mut self.compute_vnode { + ComputeVnode::Singleton => { + if !new_vnodes.is_singleton() { + panic!( + "update vnode bitmap on singleton distribution to non-singleton: {:?}", + new_vnodes + ); + } + self.vnodes().clone() // not updated + } + + ComputeVnode::DistKeyIndices { vnodes, .. } + | ComputeVnode::VnodeColumnIndex { vnodes, .. } => { + assert_eq!(vnodes.len(), new_vnodes.len()); + replace(vnodes, new_vnodes) + } } - assert_eq!(self.vnodes.len(), new_vnodes.len()); - replace(&mut self.vnodes, new_vnodes) } + /// Get vnode bitmap if distributed, or a dummy [`Bitmap::singleton()`] if singleton. pub fn vnodes(&self) -> &Arc { - &self.vnodes + static SINGLETON_VNODES: LazyLock> = + LazyLock::new(|| Bitmap::singleton().into()); + + match &self.compute_vnode { + ComputeVnode::DistKeyIndices { vnodes, .. } => vnodes, + ComputeVnode::VnodeColumnIndex { vnodes, .. } => vnodes, + ComputeVnode::Singleton => &SINGLETON_VNODES, + } } /// Get vnode value with given primary key. 
@@ -166,11 +147,13 @@ impl TableDistribution { match &self.compute_vnode { ComputeVnode::Singleton => SINGLETON_VNODE, ComputeVnode::DistKeyIndices { + vnodes, dist_key_in_pk_indices, - } => compute_vnode(pk, dist_key_in_pk_indices, &self.vnodes), + } => compute_vnode(pk, dist_key_in_pk_indices, vnodes), ComputeVnode::VnodeColumnIndex { + vnodes, vnode_col_idx_in_pk, - } => get_vnode_from_row(pk, *vnode_col_idx_in_pk, &self.vnodes), + } => get_vnode_from_row(pk, *vnode_col_idx_in_pk, vnodes), } } @@ -178,22 +161,20 @@ impl TableDistribution { match &self.compute_vnode { ComputeVnode::Singleton => Some(SINGLETON_VNODE), ComputeVnode::DistKeyIndices { + vnodes, dist_key_in_pk_indices, } => dist_key_in_pk_indices .iter() .all(|&d| d < pk_prefix.len()) - .then(|| compute_vnode(pk_prefix, dist_key_in_pk_indices, &self.vnodes)), + .then(|| compute_vnode(pk_prefix, dist_key_in_pk_indices, vnodes)), ComputeVnode::VnodeColumnIndex { + vnodes, vnode_col_idx_in_pk, } => { if *vnode_col_idx_in_pk >= pk_prefix.len() { None } else { - Some(get_vnode_from_row( - pk_prefix, - *vnode_col_idx_in_pk, - &self.vnodes, - )) + Some(get_vnode_from_row(pk_prefix, *vnode_col_idx_in_pk, vnodes)) } } } @@ -203,7 +184,7 @@ impl TableDistribution { /// Get vnode value with `indices` on the given `row`. pub fn compute_vnode(row: impl Row, indices: &[usize], vnodes: &Bitmap) -> VirtualNode { assert!(!indices.is_empty()); - let vnode = VirtualNode::compute_row(&row, indices); + let vnode = VirtualNode::compute_row(&row, indices, vnodes.len()); check_vnode_is_set(vnode, vnodes); tracing::debug!(target: "events::storage::storage_table", "compute vnode: {:?} key {:?} => {}", row, indices, vnode); @@ -230,6 +211,7 @@ impl TableDistribution { vec![SINGLETON_VNODE; chunk.capacity()] } ComputeVnode::DistKeyIndices { + vnodes, dist_key_in_pk_indices, } => { let dist_key_indices = dist_key_in_pk_indices @@ -237,19 +219,20 @@ impl TableDistribution { .map(|idx| pk_indices[*idx]) .collect_vec(); - VirtualNode::compute_chunk(chunk, &dist_key_indices) + VirtualNode::compute_chunk(chunk, &dist_key_indices, vnodes.len()) .into_iter() .zip_eq_fast(chunk.visibility().iter()) .map(|(vnode, vis)| { // Ignore the invisible rows. 
if vis { - check_vnode_is_set(vnode, &self.vnodes); + check_vnode_is_set(vnode, vnodes); } vnode }) .collect() } ComputeVnode::VnodeColumnIndex { + vnodes, vnode_col_idx_in_pk, } => { let array: &PrimitiveArray = @@ -262,7 +245,7 @@ impl TableDistribution { let vnode = VirtualNode::from_scalar(vnode); if vis { assert!(exist); - check_vnode_is_set(vnode, &self.vnodes); + check_vnode_is_set(vnode, vnodes); } vnode }) diff --git a/src/common/src/lib.rs b/src/common/src/lib.rs index 8d47d0c621646..e3417853b0201 100644 --- a/src/common/src/lib.rs +++ b/src/common/src/lib.rs @@ -23,7 +23,6 @@ #![feature(test)] #![feature(trusted_len)] #![feature(allocator_api)] -#![feature(lint_reasons)] #![feature(coroutines)] #![feature(map_try_insert)] #![feature(error_generic_member_access)] @@ -76,7 +75,7 @@ pub mod memory; pub use risingwave_common_metrics::{ monitor, register_guarded_gauge_vec_with_registry, register_guarded_histogram_vec_with_registry, register_guarded_int_counter_vec_with_registry, - register_guarded_int_gauge_vec_with_registry, + register_guarded_int_gauge_vec_with_registry, register_guarded_uint_gauge_vec_with_registry, }; pub use { risingwave_common_metrics as metrics, risingwave_common_secret as secret, diff --git a/src/common/src/session_config/mod.rs b/src/common/src/session_config/mod.rs index ffdbe6753acb5..163aa18799390 100644 --- a/src/common/src/session_config/mod.rs +++ b/src/common/src/session_config/mod.rs @@ -292,6 +292,12 @@ pub struct SessionConfig { #[parameter(default = "hex", check_hook = check_bytea_output)] bytea_output: String, + + /// Bypass checks on cluster limits + /// + /// When enabled, `CREATE MATERIALIZED VIEW` will not fail if the cluster limit is hit. + #[parameter(default = false)] + bypass_cluster_limits: bool, } fn check_timezone(val: &str) -> Result<(), String> { diff --git a/src/common/src/util/cluster_limit.rs b/src/common/src/util/cluster_limit.rs new file mode 100644 index 0000000000000..048ea4fdab305 --- /dev/null +++ b/src/common/src/util/cluster_limit.rs @@ -0,0 +1,134 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::fmt::{self, Display, Formatter}; + +use risingwave_pb::meta::actor_count_per_parallelism::PbWorkerActorCount; +use risingwave_pb::meta::cluster_limit::PbLimit; +use risingwave_pb::meta::{PbActorCountPerParallelism, PbClusterLimit}; +pub enum ClusterLimit { + ActorCount(ActorCountPerParallelism), +} + +impl From for PbClusterLimit { + fn from(limit: ClusterLimit) -> Self { + match limit { + ClusterLimit::ActorCount(actor_count_per_parallelism) => PbClusterLimit { + limit: Some(PbLimit::ActorCount(actor_count_per_parallelism.into())), + }, + } + } +} + +impl From for ClusterLimit { + fn from(pb_limit: PbClusterLimit) -> Self { + match pb_limit.limit.unwrap() { + PbLimit::ActorCount(actor_count_per_parallelism) => { + ClusterLimit::ActorCount(actor_count_per_parallelism.into()) + } + } + } +} + +#[derive(Debug)] +pub struct WorkerActorCount { + pub actor_count: usize, + pub parallelism: usize, +} + +impl From for PbWorkerActorCount { + fn from(worker_actor_count: WorkerActorCount) -> Self { + PbWorkerActorCount { + actor_count: worker_actor_count.actor_count as u64, + parallelism: worker_actor_count.parallelism as u64, + } + } +} + +impl From for WorkerActorCount { + fn from(pb_worker_actor_count: PbWorkerActorCount) -> Self { + WorkerActorCount { + actor_count: pb_worker_actor_count.actor_count as usize, + parallelism: pb_worker_actor_count.parallelism as usize, + } + } +} + +pub struct ActorCountPerParallelism { + pub worker_id_to_actor_count: HashMap, + pub hard_limit: usize, + pub soft_limit: usize, +} + +impl From for PbActorCountPerParallelism { + fn from(actor_count_per_parallelism: ActorCountPerParallelism) -> Self { + PbActorCountPerParallelism { + worker_id_to_actor_count: actor_count_per_parallelism + .worker_id_to_actor_count + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + hard_limit: actor_count_per_parallelism.hard_limit as u64, + soft_limit: actor_count_per_parallelism.soft_limit as u64, + } + } +} + +impl From for ActorCountPerParallelism { + fn from(pb_actor_count_per_parallelism: PbActorCountPerParallelism) -> Self { + ActorCountPerParallelism { + worker_id_to_actor_count: pb_actor_count_per_parallelism + .worker_id_to_actor_count + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + hard_limit: pb_actor_count_per_parallelism.hard_limit as usize, + soft_limit: pb_actor_count_per_parallelism.soft_limit as usize, + } + } +} + +impl ActorCountPerParallelism { + pub fn exceed_hard_limit(&self) -> bool { + self.worker_id_to_actor_count + .values() + .any(|v| v.actor_count > self.hard_limit.saturating_mul(v.parallelism)) + } + + pub fn exceed_soft_limit(&self) -> bool { + self.worker_id_to_actor_count + .values() + .any(|v| v.actor_count > self.soft_limit.saturating_mul(v.parallelism)) + } + + pub fn exceed_limit(&self) -> bool { + self.exceed_soft_limit() || self.exceed_hard_limit() + } +} + +impl Display for ActorCountPerParallelism { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let worker_id_to_actor_count_str: Vec<_> = self + .worker_id_to_actor_count + .iter() + .map(|(k, v)| format!("{} -> {:?}", k, v)) + .collect(); + write!( + f, + "ActorCountPerParallelism {{ critical limit: {:?}, recommended limit: {:?}. 
worker_id_to_actor_count: {:?} }}", + self.hard_limit, self.soft_limit, worker_id_to_actor_count_str + ) + } +} diff --git a/src/common/src/util/mod.rs b/src/common/src/util/mod.rs index 20dac5906c91d..bfa15c8327037 100644 --- a/src/common/src/util/mod.rs +++ b/src/common/src/util/mod.rs @@ -42,3 +42,4 @@ pub mod tracing; pub mod value_encoding; pub mod worker_util; pub use tokio_util; +pub mod cluster_limit; diff --git a/src/common/src/util/row_id.rs b/src/common/src/util/row_id.rs index 508f418903413..7f22c17e925e4 100644 --- a/src/common/src/util/row_id.rs +++ b/src/common/src/util/row_id.rs @@ -52,6 +52,7 @@ pub struct RowIdGenerator { pub type RowId = i64; +// TODO(var-vnode): how should we handle this for different virtual node counts? #[inline] pub fn extract_vnode_id_from_row_id(id: RowId) -> VirtualNode { let vnode_id = ((id >> VNODE_ID_SHIFT_BITS) & (VNODE_ID_UPPER_BOUND as i64 - 1)) as u32; diff --git a/src/common/src/util/scan_range.rs b/src/common/src/util/scan_range.rs index fd056f1790444..cfe209cf2c22a 100644 --- a/src/common/src/util/scan_range.rs +++ b/src/common/src/util/scan_range.rs @@ -159,7 +159,7 @@ mod tests { let pk = vec![1, 3, 2]; let dist_key_idx_in_pk = crate::catalog::get_dist_key_in_pk_indices(&dist_key, &pk).unwrap(); - let dist = TableDistribution::all(dist_key_idx_in_pk); + let dist = TableDistribution::all(dist_key_idx_in_pk, VirtualNode::COUNT_FOR_TEST); let mut scan_range = ScanRange::full_table_scan(); assert!(scan_range.try_compute_vnode(&dist).is_none()); @@ -173,7 +173,7 @@ mod tests { Some(ScalarImpl::from(514)), ]); - let vnode = VirtualNode::compute_row(&row, &[0, 1]); + let vnode = VirtualNode::compute_row_for_test(&row, &[0, 1]); assert_eq!(scan_range.try_compute_vnode(&dist), Some(vnode)); } @@ -185,7 +185,7 @@ mod tests { let pk = vec![1, 3, 2]; let dist_key_idx_in_pk = crate::catalog::get_dist_key_in_pk_indices(&dist_key, &pk).unwrap(); - let dist = TableDistribution::all(dist_key_idx_in_pk); + let dist = TableDistribution::all(dist_key_idx_in_pk, VirtualNode::COUNT_FOR_TEST); let mut scan_range = ScanRange::full_table_scan(); assert!(scan_range.try_compute_vnode(&dist).is_none()); @@ -203,7 +203,7 @@ mod tests { Some(ScalarImpl::from(114514)), ]); - let vnode = VirtualNode::compute_row(&row, &[2, 1]); + let vnode = VirtualNode::compute_row_for_test(&row, &[2, 1]); assert_eq!(scan_range.try_compute_vnode(&dist), Some(vnode)); } diff --git a/src/common/src/vnode_mapping/vnode_placement.rs b/src/common/src/vnode_mapping/vnode_placement.rs index 5619ffc6e0f96..1f9235bb862ae 100644 --- a/src/common/src/vnode_mapping/vnode_placement.rs +++ b/src/common/src/vnode_mapping/vnode_placement.rs @@ -30,7 +30,12 @@ pub fn place_vnode( hint_worker_slot_mapping: Option<&WorkerSlotMapping>, workers: &[WorkerNode], max_parallelism: Option, + vnode_count: usize, ) -> Option { + if let Some(mapping) = hint_worker_slot_mapping { + assert_eq!(mapping.len(), vnode_count); + } + // Get all serving worker slots from all available workers, grouped by worker id and ordered // by worker slot id in each group. let mut worker_slots: LinkedList<_> = workers @@ -44,7 +49,7 @@ pub fn place_vnode( // `max_parallelism` and total number of virtual nodes. 
let serving_parallelism = std::cmp::min( worker_slots.iter().map(|slots| slots.len()).sum(), - std::cmp::min(max_parallelism.unwrap_or(usize::MAX), VirtualNode::COUNT), + std::cmp::min(max_parallelism.unwrap_or(usize::MAX), vnode_count), ); // Select `serving_parallelism` worker slots in a round-robin fashion, to distribute workload @@ -79,14 +84,14 @@ pub fn place_vnode( is_temp: bool, } - let (expected, mut remain) = VirtualNode::COUNT.div_rem(&selected_slots.len()); + let (expected, mut remain) = vnode_count.div_rem(&selected_slots.len()); let mut balances: HashMap = HashMap::default(); for slot in &selected_slots { let mut balance = Balance { slot: *slot, balance: -(expected as i32), - builder: BitmapBuilder::zeroed(VirtualNode::COUNT), + builder: BitmapBuilder::zeroed(vnode_count), is_temp: false, }; @@ -102,7 +107,7 @@ pub fn place_vnode( let mut temp_slot = Balance { slot: WorkerSlotId::new(0u32, usize::MAX), /* This id doesn't matter for `temp_slot`. It's distinguishable via `is_temp`. */ balance: 0, - builder: BitmapBuilder::zeroed(VirtualNode::COUNT), + builder: BitmapBuilder::zeroed(vnode_count), is_temp: true, }; match hint_worker_slot_mapping { @@ -123,7 +128,7 @@ pub fn place_vnode( } None => { // No hint is provided, assign all vnodes to `temp_pu`. - for vnode in VirtualNode::all() { + for vnode in VirtualNode::all(vnode_count) { temp_slot.balance += 1; temp_slot.builder.set(vnode.to_index(), true); } @@ -158,7 +163,7 @@ pub fn place_vnode( let mut dst = balances.pop_back().unwrap(); let n = std::cmp::min(src.balance.abs(), dst.balance.abs()); let mut moved = 0; - for idx in 0..VirtualNode::COUNT { + for idx in 0..vnode_count { if moved >= n { break; } @@ -189,7 +194,7 @@ pub fn place_vnode( for (worker_slot, bitmap) in results { worker_result .entry(worker_slot) - .or_insert(BitmapBuilder::zeroed(VirtualNode::COUNT).finish()) + .or_insert(Bitmap::zeros(vnode_count)) .bitor_assign(&bitmap); } @@ -204,10 +209,24 @@ mod tests { use risingwave_pb::common::WorkerNode; use crate::hash::VirtualNode; - use crate::vnode_mapping::vnode_placement::place_vnode; + + /// [`super::place_vnode`] with [`VirtualNode::COUNT_FOR_TEST`] as the vnode count. + fn place_vnode( + hint_worker_slot_mapping: Option<&WorkerSlotMapping>, + workers: &[WorkerNode], + max_parallelism: Option, + ) -> Option { + super::place_vnode( + hint_worker_slot_mapping, + workers, + max_parallelism, + VirtualNode::COUNT_FOR_TEST, + ) + } + #[test] fn test_place_vnode() { - assert_eq!(VirtualNode::COUNT, 256); + assert_eq!(VirtualNode::COUNT_FOR_TEST, 256); let serving_property = Property { is_unschedulable: false, @@ -220,7 +239,7 @@ mod tests { assert_eq!(wm1.len(), 256); assert_eq!(wm2.len(), 256); let mut count: usize = 0; - for idx in 0..VirtualNode::COUNT { + for idx in 0..VirtualNode::COUNT_FOR_TEST { let vnode = VirtualNode::from_index(idx); if wm1.get(vnode) == wm2.get(vnode) { count += 1; diff --git a/src/compute/src/lib.rs b/src/compute/src/lib.rs index d91fb56b1cb88..1336a84980cea 100644 --- a/src/compute/src/lib.rs +++ b/src/compute/src/lib.rs @@ -16,7 +16,6 @@ #![feature(coroutines)] #![feature(type_alias_impl_trait)] #![feature(let_chains)] -#![feature(lint_reasons)] #![feature(impl_trait_in_assoc_type)] #![cfg_attr(coverage, feature(coverage_attribute))] @@ -103,8 +102,9 @@ pub struct ComputeNodeOpts { pub role: Role, /// Used for control the metrics level, similar to log level. 
- /// 0 = disable metrics - /// >0 = enable metrics + /// + /// level = 0: disable metrics + /// level > 0: enable metrics #[clap(long, hide = true, env = "RW_METRICS_LEVEL")] #[override_opts(path = server.metrics_level)] pub metrics_level: Option, diff --git a/src/compute/src/rpc/service/stream_service.rs b/src/compute/src/rpc/service/stream_service.rs index eb055a174b3ea..6253cfe74c730 100644 --- a/src/compute/src/rpc/service/stream_service.rs +++ b/src/compute/src/rpc/service/stream_service.rs @@ -40,20 +40,6 @@ impl StreamService for StreamServiceImpl { type StreamingControlStreamStream = impl Stream>; - #[cfg_attr(coverage, coverage(off))] - async fn drop_actors( - &self, - request: Request, - ) -> std::result::Result, Status> { - let req = request.into_inner(); - let actors = req.actor_ids; - self.mgr.drop_actors(actors).await?; - Ok(Response::new(DropActorsResponse { - request_id: req.request_id, - status: None, - })) - } - #[cfg_attr(coverage, coverage(off))] async fn wait_epoch_commit( &self, diff --git a/src/config/docs.md b/src/config/docs.md index 47905d71e5e0c..bcce61d8bb456 100644 --- a/src/config/docs.md +++ b/src/config/docs.md @@ -119,6 +119,7 @@ This page is automatically generated by `./risedev generate-example-config` | enable_fast_compaction | | true | | high_priority_ratio_in_percent | DEPRECATED: This config will be deprecated in the future version, use `storage.cache.block_cache_eviction.high_priority_ratio_in_percent` with `storage.cache.block_cache_eviction.algorithm = "Lru"` instead. | | | imm_merge_threshold | The threshold for the number of immutable memtables to merge to a new imm. | 0 | +| max_cached_recent_versions_number | | | | max_concurrent_compaction_task_number | | 16 | | max_prefetch_block_number | max prefetch block number | 16 | | max_preload_io_retry_times | | 3 | diff --git a/src/config/example.toml b/src/config/example.toml index c81b35163eafa..f3c127cdc7825 100644 --- a/src/config/example.toml +++ b/src/config/example.toml @@ -81,6 +81,8 @@ meta_enable_trivial_move = true meta_enable_check_task_level_overlap = false meta_max_trivial_move_task_count_per_loop = 256 meta_max_get_task_probe_times = 5 +meta_actor_cnt_per_worker_parallelism_soft_limit = 100 +meta_actor_cnt_per_worker_parallelism_hard_limit = 400 [batch] enable_barrier_read = false diff --git a/src/connector/Cargo.toml b/src/connector/Cargo.toml index d87e89c1cf65d..a77e9cb929d17 100644 --- a/src/connector/Cargo.toml +++ b/src/connector/Cargo.toml @@ -103,7 +103,7 @@ pg_bigdecimal = { git = "https://github.com/risingwavelabs/rust-pg_bigdecimal", postgres-openssl = "0.5.0" prometheus = { version = "0.13", features = ["process"] } prost = { workspace = true, features = ["no-recursion-limit"] } -prost-reflect = "0.14" +prost-reflect = { version = "0.14", features = ["serde"] } prost-types = "0.13" protobuf-native = "0.2.2" pulsar = { version = "6.3", default-features = false, features = [ diff --git a/src/connector/codec/src/decoder/mod.rs b/src/connector/codec/src/decoder/mod.rs index 814e06a166c6c..bbfdbf0a90d79 100644 --- a/src/connector/codec/src/decoder/mod.rs +++ b/src/connector/codec/src/decoder/mod.rs @@ -38,6 +38,9 @@ pub enum AccessError { #[error("Unsupported additional column `{name}`")] UnsupportedAdditionalColumn { name: String }, + #[error("Fail to convert protobuf Any into jsonb: {0}")] + ProtobufAnyToJson(#[source] serde_json::Error), + /// Errors that are not categorized into variants above. 
#[error("{message}")] Uncategorized { message: String }, diff --git a/src/connector/codec/src/lib.rs b/src/connector/codec/src/lib.rs index cbf0ad14046f7..2119c1ece4e57 100644 --- a/src/connector/codec/src/lib.rs +++ b/src/connector/codec/src/lib.rs @@ -21,7 +21,6 @@ #![feature(stmt_expr_attributes)] #![feature(box_patterns)] #![feature(trait_alias)] -#![feature(lint_reasons)] #![feature(let_chains)] #![feature(box_into_inner)] #![feature(type_alias_impl_trait)] diff --git a/src/connector/src/connector_common/common.rs b/src/connector/src/connector_common/common.rs index b522ae2eda560..9f4211aedd4d9 100644 --- a/src/connector/src/connector_common/common.rs +++ b/src/connector/src/connector_common/common.rs @@ -192,14 +192,26 @@ pub struct KafkaCommon { #[serde(rename = "properties.ssl.ca.location")] ssl_ca_location: Option, + /// CA certificate string (PEM format) for verifying the broker's key. + #[serde(rename = "properties.ssl.ca.pem")] + ssl_ca_pem: Option, + /// Path to client's certificate file (PEM). #[serde(rename = "properties.ssl.certificate.location")] ssl_certificate_location: Option, + /// Client's public key string (PEM format) used for authentication. + #[serde(rename = "properties.ssl.certificate.pem")] + ssl_certificate_pem: Option, + /// Path to client's private key file (PEM). #[serde(rename = "properties.ssl.key.location")] ssl_key_location: Option, + /// Client's private key string (PEM format) used for authentication. + #[serde(rename = "properties.ssl.key.pem")] + ssl_key_pem: Option, + /// Passphrase of client's private key. #[serde(rename = "properties.ssl.key.password")] ssl_key_password: Option, @@ -325,12 +337,21 @@ impl KafkaCommon { if let Some(ssl_ca_location) = self.ssl_ca_location.as_ref() { config.set("ssl.ca.location", ssl_ca_location); } + if let Some(ssl_ca_pem) = self.ssl_ca_pem.as_ref() { + config.set("ssl.ca.pem", ssl_ca_pem); + } if let Some(ssl_certificate_location) = self.ssl_certificate_location.as_ref() { config.set("ssl.certificate.location", ssl_certificate_location); } + if let Some(ssl_certificate_pem) = self.ssl_certificate_pem.as_ref() { + config.set("ssl.certificate.pem", ssl_certificate_pem); + } if let Some(ssl_key_location) = self.ssl_key_location.as_ref() { config.set("ssl.key.location", ssl_key_location); } + if let Some(ssl_key_pem) = self.ssl_key_pem.as_ref() { + config.set("ssl.key.pem", ssl_key_pem); + } if let Some(ssl_key_password) = self.ssl_key_password.as_ref() { config.set("ssl.key.password", ssl_key_password); } diff --git a/src/connector/src/lib.rs b/src/connector/src/lib.rs index 6ee28a2161aa1..f66b5116c110b 100644 --- a/src/connector/src/lib.rs +++ b/src/connector/src/lib.rs @@ -19,7 +19,6 @@ #![feature(stmt_expr_attributes)] #![feature(box_patterns)] #![feature(trait_alias)] -#![feature(lint_reasons)] #![feature(let_chains)] #![feature(box_into_inner)] #![feature(type_alias_impl_trait)] diff --git a/src/connector/src/parser/plain_parser.rs b/src/connector/src/parser/plain_parser.rs index f1ac65d79a654..e9c9436fd295f 100644 --- a/src/connector/src/parser/plain_parser.rs +++ b/src/connector/src/parser/plain_parser.rs @@ -297,10 +297,9 @@ mod tests { .unwrap() .into_iter() .filter(|c| c.cardinality() > 0) - .map(|c| { + .inspect(|c| { // 5 data messages in a single chunk assert_eq!(5, c.cardinality()); - c }) .collect_vec(); diff --git a/src/connector/src/parser/protobuf/parser.rs b/src/connector/src/parser/protobuf/parser.rs index 8be25074f6295..bbd1d3f0da1e3 100644 --- a/src/connector/src/parser/protobuf/parser.rs +++ 
b/src/connector/src/parser/protobuf/parser.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; - use anyhow::Context; use itertools::Itertools; use prost_reflect::{ @@ -22,19 +20,16 @@ use prost_reflect::{ }; use risingwave_common::array::{ListValue, StructValue}; use risingwave_common::types::{ - DataType, Datum, DatumCow, Decimal, JsonbRef, JsonbVal, ScalarImpl, ScalarRefImpl, ToDatumRef, - ToOwnedDatum, F32, F64, + DataType, DatumCow, Decimal, JsonbVal, ScalarImpl, ToOwnedDatum, F32, F64, }; use risingwave_common::{bail, try_match_expand}; use risingwave_pb::plan_common::{AdditionalColumn, ColumnDesc, ColumnDescVersion}; use thiserror::Error; -use thiserror_ext::{AsReport, Macro}; +use thiserror_ext::Macro; use crate::error::ConnectorResult; use crate::parser::unified::protobuf::ProtobufAccess; -use crate::parser::unified::{ - bail_uncategorized, uncategorized, AccessError, AccessImpl, AccessResult, -}; +use crate::parser::unified::{uncategorized, AccessError, AccessImpl, AccessResult}; use crate::parser::util::bytes_from_url; use crate::parser::{AccessBuilder, EncodingProperties}; use crate::schema::schema_registry::{extract_schema_id, handle_sr_list, Client, WireFormatError}; @@ -44,7 +39,6 @@ use crate::schema::SchemaLoader; pub struct ProtobufAccessBuilder { confluent_wire_type: bool, message_descriptor: MessageDescriptor, - descriptor_pool: Arc, } impl AccessBuilder for ProtobufAccessBuilder { @@ -59,10 +53,7 @@ impl AccessBuilder for ProtobufAccessBuilder { let message = DynamicMessage::decode(self.message_descriptor.clone(), payload) .context("failed to parse message")?; - Ok(AccessImpl::Protobuf(ProtobufAccess::new( - message, - Arc::clone(&self.descriptor_pool), - ))) + Ok(AccessImpl::Protobuf(ProtobufAccess::new(message))) } } @@ -71,13 +62,11 @@ impl ProtobufAccessBuilder { let ProtobufParserConfig { confluent_wire_type, message_descriptor, - descriptor_pool, } = config; Ok(Self { confluent_wire_type, message_descriptor, - descriptor_pool, }) } } @@ -86,8 +75,6 @@ impl ProtobufAccessBuilder { pub struct ProtobufParserConfig { confluent_wire_type: bool, pub(crate) message_descriptor: MessageDescriptor, - /// Note that the pub(crate) here is merely for testing - pub(crate) descriptor_pool: Arc, } impl ProtobufParserConfig { @@ -132,7 +119,6 @@ impl ProtobufParserConfig { Ok(Self { message_descriptor, confluent_wire_type: protobuf_config.use_schema_registry, - descriptor_pool: Arc::new(pool), }) } @@ -216,141 +202,10 @@ fn detect_loop_and_push( Ok(()) } -fn extract_any_info(dyn_msg: &DynamicMessage) -> (String, Value) { - debug_assert!( - dyn_msg.fields().count() == 2, - "Expected only two fields for Any Type MessageDescriptor" - ); - - let type_url = dyn_msg - .get_field_by_name("type_url") - .expect("Expect type_url in dyn_msg") - .to_string() - .split('/') - .nth(1) - .map(|part| part[..part.len() - 1].to_string()) - .unwrap_or_default(); - - let payload = dyn_msg - .get_field_by_name("value") - .expect("Expect value (payload) in dyn_msg") - .as_ref() - .clone(); - - (type_url, payload) -} - -/// TODO: Resolve the potential naming conflict in the map -/// i.e., If the two anonymous type shares the same key (e.g., "Int32"), -/// the latter will overwrite the former one in `serde_json::Map`. -/// Possible solution, maintaining a global id map, for the same types -/// In the same level of fields, add the unique id at the tail of the name. 
-/// e.g., "Int32.1" & "Int32.2" in the above example -fn recursive_parse_json( - fields: &[Datum], - full_name_vec: Option>, - full_name: Option, -) -> serde_json::Value { - // Note that the key is of no order - let mut ret: serde_json::Map = serde_json::Map::new(); - - // The hidden type hint for user's convenience - // i.e., `"_type": message.full_name()` - if let Some(full_name) = full_name { - ret.insert("_type".to_string(), serde_json::Value::String(full_name)); - } - - for (idx, field) in fields.iter().enumerate() { - let mut key; - if let Some(k) = full_name_vec.as_ref() { - key = k[idx].to_string(); - } else { - key = "".to_string(); - } - - match field.clone() { - Some(ScalarImpl::Int16(v)) => { - if key.is_empty() { - key = "Int16".to_string(); - } - ret.insert(key, serde_json::Value::Number(serde_json::Number::from(v))); - } - Some(ScalarImpl::Int32(v)) => { - if key.is_empty() { - key = "Int32".to_string(); - } - ret.insert(key, serde_json::Value::Number(serde_json::Number::from(v))); - } - Some(ScalarImpl::Int64(v)) => { - if key.is_empty() { - key = "Int64".to_string(); - } - ret.insert(key, serde_json::Value::Number(serde_json::Number::from(v))); - } - Some(ScalarImpl::Bool(v)) => { - if key.is_empty() { - key = "Bool".to_string(); - } - ret.insert(key, serde_json::Value::Bool(v)); - } - Some(ScalarImpl::Bytea(v)) => { - if key.is_empty() { - key = "Bytea".to_string(); - } - let s = String::from_utf8(v.to_vec()).unwrap(); - ret.insert(key, serde_json::Value::String(s)); - } - Some(ScalarImpl::Float32(v)) => { - if key.is_empty() { - key = "Int16".to_string(); - } - ret.insert( - key, - serde_json::Value::Number( - serde_json::Number::from_f64(v.into_inner() as f64).unwrap(), - ), - ); - } - Some(ScalarImpl::Float64(v)) => { - if key.is_empty() { - key = "Float64".to_string(); - } - ret.insert( - key, - serde_json::Value::Number( - serde_json::Number::from_f64(v.into_inner()).unwrap(), - ), - ); - } - Some(ScalarImpl::Utf8(v)) => { - if key.is_empty() { - key = "Utf8".to_string(); - } - ret.insert(key, serde_json::Value::String(v.to_string())); - } - Some(ScalarImpl::Struct(v)) => { - if key.is_empty() { - key = "Struct".to_string(); - } - ret.insert(key, recursive_parse_json(v.fields(), None, None)); - } - Some(ScalarImpl::Jsonb(v)) => { - if key.is_empty() { - key = "Jsonb".to_string(); - } - ret.insert(key, v.take()); - } - r#type => panic!("Not yet support ScalarImpl type: {:?}", r#type), - } - } - - serde_json::Value::Object(ret) -} - pub fn from_protobuf_value<'a>( field_desc: &FieldDescriptor, value: &'a Value, - descriptor_pool: &Arc, + type_expected: &DataType, ) -> AccessResult> { let kind = field_desc.kind(); @@ -382,91 +237,46 @@ pub fn from_protobuf_value<'a>( } Value::Message(dyn_msg) => { if dyn_msg.descriptor().full_name() == "google.protobuf.Any" { - // If the fields are not presented, default value is an empty string - if !dyn_msg.has_field_by_name("type_url") || !dyn_msg.has_field_by_name("value") { - borrowed!(JsonbRef::empty_string()); - } - - // Sanity check - debug_assert!( - dyn_msg.has_field_by_name("type_url") && dyn_msg.has_field_by_name("value"), - "`type_url` & `value` must exist in fields of `dyn_msg`" - ); - - // The message is of type `Any` - let (type_url, payload) = extract_any_info(dyn_msg); - - let payload_field_desc = dyn_msg.descriptor().get_field_by_name("value").unwrap(); - - let payload = from_protobuf_value(&payload_field_desc, &payload, descriptor_pool)?; - let Some(ScalarRefImpl::Bytea(payload)) = payload.to_datum_ref() else { - 
bail_uncategorized!("expected bytes for dynamic message payload"); - }; - - // Get the corresponding schema from the descriptor pool - let msg_desc = descriptor_pool - .get_message_by_name(&type_url) - .ok_or_else(|| { - uncategorized!("message `{type_url}` not found in descriptor pool") - })?; - - let f = msg_desc - .clone() - .fields() - .map(|f| f.name().to_string()) - .collect::>(); - - let full_name = msg_desc.clone().full_name().to_string(); - - // Decode the payload based on the `msg_desc` - let decoded_value = DynamicMessage::decode(msg_desc, payload).unwrap(); - let decoded_value = from_protobuf_value( - field_desc, - &Value::Message(decoded_value), - descriptor_pool, - )? - .to_owned_datum() - .unwrap(); - - // Extract the struct value - let ScalarImpl::Struct(v) = decoded_value else { - panic!("Expect ScalarImpl::Struct"); + ScalarImpl::Jsonb(JsonbVal::from( + serde_json::to_value(dyn_msg).map_err(AccessError::ProtobufAnyToJson)?, + )) + } else { + let desc = dyn_msg.descriptor(); + let DataType::Struct(st) = type_expected else { + return Err(AccessError::TypeError { + expected: type_expected.to_string(), + got: desc.full_name().to_string(), + value: value.to_string(), // Protobuf TEXT + }); }; - ScalarImpl::Jsonb(JsonbVal::from(serde_json::json!(recursive_parse_json( - v.fields(), - Some(f), - Some(full_name), - )))) - } else { - let mut rw_values = Vec::with_capacity(dyn_msg.descriptor().fields().len()); - // fields is a btree map in descriptor - // so it's order is the same as datatype - for field_desc in dyn_msg.descriptor().fields() { - // missing field - if !dyn_msg.has_field(&field_desc) - && field_desc.cardinality() == Cardinality::Required - { - return Err(AccessError::Undefined { - name: field_desc.name().to_owned(), - path: dyn_msg.descriptor().full_name().to_owned(), - }); - } - // use default value if dyn_msg doesn't has this field + let mut rw_values = Vec::with_capacity(st.len()); + for (name, expected_field_type) in st.iter() { + let Some(field_desc) = desc.get_field_by_name(name) else { + // Field deleted in protobuf. Fallback to SQL NULL (of proper RW type). + rw_values.push(None); + continue; + }; let value = dyn_msg.get_field(&field_desc); rw_values.push( - from_protobuf_value(&field_desc, &value, descriptor_pool)?.to_owned_datum(), + from_protobuf_value(&field_desc, &value, expected_field_type)? 
+ .to_owned_datum(), ); } ScalarImpl::Struct(StructValue::new(rw_values)) } } Value::List(values) => { - let data_type = protobuf_type_mapping(field_desc, &mut vec![]) - .map_err(|e| uncategorized!("{}", e.to_report_string()))?; - let mut builder = data_type.as_list().create_array_builder(values.len()); + let DataType::List(element_type) = type_expected else { + return Err(AccessError::TypeError { + expected: type_expected.to_string(), + got: format!("repeated {:?}", kind), + value: value.to_string(), // Protobuf TEXT + }); + }; + let mut builder = element_type.create_array_builder(values.len()); for value in values { - builder.append(from_protobuf_value(field_desc, value, descriptor_pool)?); + builder.append(from_protobuf_value(field_desc, value, element_type)?); } ScalarImpl::List(ListValue::new(builder.finish())) } @@ -498,25 +308,18 @@ fn protobuf_type_mapping( } Kind::Uint64 | Kind::Fixed64 => DataType::Decimal, Kind::String => DataType::Varchar, - Kind::Message(m) => { - let fields = m - .fields() - .map(|f| protobuf_type_mapping(&f, parse_trace)) - .try_collect()?; - let field_names = m.fields().map(|f| f.name().to_string()).collect_vec(); - - // Note that this part is useful for actual parsing - // Since RisingWave will parse message to `ScalarImpl::Jsonb` - // Please do NOT modify it - if field_names.len() == 2 - && field_names.contains(&"value".to_string()) - && field_names.contains(&"type_url".to_string()) - { - DataType::Jsonb - } else { + Kind::Message(m) => match m.full_name() { + // Well-Known Types are identified by their full name + "google.protobuf.Any" => DataType::Jsonb, + _ => { + let fields = m + .fields() + .map(|f| protobuf_type_mapping(&f, parse_trace)) + .try_collect()?; + let field_names = m.fields().map(|f| f.name().to_string()).collect_vec(); DataType::new_struct(fields, field_names) } - } + }, Kind::Enum(_) => DataType::Varchar, Kind::Bytes => DataType::Bytea, }; @@ -597,6 +400,7 @@ mod test { use risingwave_pb::data::data_type::PbTypeName; use risingwave_pb::plan_common::{PbEncodeType, PbFormatType}; use serde_json::json; + use thiserror_ext::AsReport as _; use super::*; use crate::parser::protobuf::recursive::all_types::{EnumType, ExampleOneof, NestedMessage}; @@ -904,7 +708,8 @@ mod test { } fn pb_eq(a: &ProtobufAccess, field_name: &str, value: ScalarImpl) { - let dummy_type = DataType::Varchar; + let field = a.descriptor().get_field_by_name(field_name).unwrap(); + let dummy_type = protobuf_type_mapping(&field, &mut vec![]).unwrap(); let d = a.access_owned(&[field_name], &dummy_type).unwrap().unwrap(); assert_eq!(d, value, "field: {} value: {:?}", field_name, d); } @@ -964,49 +769,35 @@ mod test { println!("Current conf: {:#?}", conf); println!("---------------------------"); - let value = + let message = DynamicMessage::decode(conf.message_descriptor.clone(), ANY_GEN_PROTO_DATA).unwrap(); - println!("Test ANY_GEN_PROTO_DATA, current value: {:#?}", value); + println!("Test ANY_GEN_PROTO_DATA, current value: {:#?}", message); println!("---------------------------"); - // This is of no use - let field = value.fields().next().unwrap().0; - - if let Some(ret) = - from_protobuf_value(&field, &Value::Message(value), &conf.descriptor_pool) - .unwrap() - .to_owned_datum() - { - println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); - println!("---------------------------"); - - let ScalarImpl::Struct(struct_value) = ret else { - panic!("Expected ScalarImpl::Struct"); - }; - - let fields = struct_value.fields(); + let field = conf + .message_descriptor + 
.get_field_by_name("any_value") + .unwrap(); + let value = message.get_field(&field); - match fields[0].clone() { - Some(ScalarImpl::Int32(v)) => { - println!("Successfully decode field[0]"); - assert_eq!(v, 12345); - } - _ => panic!("Expected ScalarImpl::Int32"), - } + let ret = from_protobuf_value(&field, &value, &DataType::Jsonb) + .unwrap() + .to_owned_datum(); + println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); + println!("---------------------------"); - match fields[1].clone() { - Some(ScalarImpl::Jsonb(jv)) => { - assert_eq!( - jv, - JsonbVal::from(json!({ - "_type": "test.StringValue", - "value": "John Doe" - })) - ); - } - _ => panic!("Expected ScalarImpl::Jsonb"), + match ret { + Some(ScalarImpl::Jsonb(jv)) => { + assert_eq!( + jv, + JsonbVal::from(json!({ + "@type": "type.googleapis.com/test.StringValue", + "value": "John Doe" + })) + ); } + _ => panic!("Expected ScalarImpl::Jsonb"), } Ok(()) @@ -1027,49 +818,35 @@ mod test { println!("Current conf: {:#?}", conf); println!("---------------------------"); - let value = + let message = DynamicMessage::decode(conf.message_descriptor.clone(), ANY_GEN_PROTO_DATA_1).unwrap(); - println!("Current Value: {:#?}", value); + println!("Current Value: {:#?}", message); println!("---------------------------"); - // This is of no use - let field = value.fields().next().unwrap().0; - - if let Some(ret) = - from_protobuf_value(&field, &Value::Message(value), &conf.descriptor_pool) - .unwrap() - .to_owned_datum() - { - println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); - println!("---------------------------"); - - let ScalarImpl::Struct(struct_value) = ret else { - panic!("Expected ScalarImpl::Struct"); - }; - - let fields = struct_value.fields(); + let field = conf + .message_descriptor + .get_field_by_name("any_value") + .unwrap(); + let value = message.get_field(&field); - match fields[0].clone() { - Some(ScalarImpl::Int32(v)) => { - println!("Successfully decode field[0]"); - assert_eq!(v, 12345); - } - _ => panic!("Expected ScalarImpl::Int32"), - } + let ret = from_protobuf_value(&field, &value, &DataType::Jsonb) + .unwrap() + .to_owned_datum(); + println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); + println!("---------------------------"); - match fields[1].clone() { - Some(ScalarImpl::Jsonb(jv)) => { - assert_eq!( - jv, - JsonbVal::from(json!({ - "_type": "test.Int32Value", - "value": 114514 - })) - ); - } - _ => panic!("Expected ScalarImpl::Jsonb"), + match ret { + Some(ScalarImpl::Jsonb(jv)) => { + assert_eq!( + jv, + JsonbVal::from(json!({ + "@type": "type.googleapis.com/test.Int32Value", + "value": 114514 + })) + ); } + _ => panic!("Expected ScalarImpl::Jsonb"), } Ok(()) @@ -1098,60 +875,80 @@ mod test { println!("Current conf: {:#?}", conf); println!("---------------------------"); - let value = DynamicMessage::decode( + let message = DynamicMessage::decode( conf.message_descriptor.clone(), ANY_RECURSIVE_GEN_PROTO_DATA, ) .unwrap(); - println!("Current Value: {:#?}", value); + println!("Current Value: {:#?}", message); + println!("---------------------------"); + + let field = conf + .message_descriptor + .get_field_by_name("any_value") + .unwrap(); + let value = message.get_field(&field); + + let ret = from_protobuf_value(&field, &value, &DataType::Jsonb) + .unwrap() + .to_owned_datum(); + println!("Decoded Value for ANY_RECURSIVE_GEN_PROTO_DATA: {:#?}", ret); println!("---------------------------"); - // This is of no use - let field = value.fields().next().unwrap().0; + match ret { + 
Some(ScalarImpl::Jsonb(jv)) => { + assert_eq!( + jv, + JsonbVal::from(json!({ + "@type": "type.googleapis.com/test.AnyValue", + "anyValue1": { + "@type": "type.googleapis.com/test.StringValue", + "value": "114514", + }, + "anyValue2": { + "@type": "type.googleapis.com/test.Int32Value", + "value": 114514, + } + })) + ); + } + _ => panic!("Expected ScalarImpl::Jsonb"), + } - if let Some(ret) = - from_protobuf_value(&field, &Value::Message(value), &conf.descriptor_pool) - .unwrap() - .to_owned_datum() - { - println!("Decoded Value for ANY_RECURSIVE_GEN_PROTO_DATA: {:#?}", ret); - println!("---------------------------"); + Ok(()) + } - let ScalarImpl::Struct(struct_value) = ret else { - panic!("Expected ScalarImpl::Struct"); - }; + // id: 12345 + // any_value: { + // type_url: "type.googleapis.com/test.StringXalue" + // value: "\n\010John Doe" + // } + static ANY_GEN_PROTO_DATA_INVALID: &[u8] = b"\x08\xb9\x60\x12\x32\x0a\x24\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x53\x74\x72\x69\x6e\x67\x58\x61\x6c\x75\x65\x12\x0a\x0a\x08\x4a\x6f\x68\x6e\x20\x44\x6f\x65"; - let fields = struct_value.fields(); + #[tokio::test] + async fn test_any_invalid() -> crate::error::ConnectorResult<()> { + let conf = create_recursive_pb_parser_config("/any-schema.pb", "test.TestAny").await; - match fields[0].clone() { - Some(ScalarImpl::Int32(v)) => { - println!("Successfully decode field[0]"); - assert_eq!(v, 12345); - } - _ => panic!("Expected ScalarImpl::Int32"), - } + let message = + DynamicMessage::decode(conf.message_descriptor.clone(), ANY_GEN_PROTO_DATA_INVALID) + .unwrap(); - match fields[1].clone() { - Some(ScalarImpl::Jsonb(jv)) => { - assert_eq!( - jv, - JsonbVal::from(json!({ - "_type": "test.AnyValue", - "any_value_1": { - "_type": "test.StringValue", - "value": "114514", - }, - "any_value_2": { - "_type": "test.Int32Value", - "value": 114514, - } - })) - ); - } - _ => panic!("Expected ScalarImpl::Jsonb"), - } - } + let field = conf + .message_descriptor + .get_field_by_name("any_value") + .unwrap(); + let value = message.get_field(&field); + + let err = from_protobuf_value(&field, &value, &DataType::Jsonb).unwrap_err(); + + let expected = expect_test::expect![[r#" + Fail to convert protobuf Any into jsonb + + Caused by: + message 'test.StringXalue' not found + "#]]; + expected.assert_eq(err.to_report_string_pretty().as_str()); Ok(()) } diff --git a/src/connector/src/parser/unified/mod.rs b/src/connector/src/parser/unified/mod.rs index 8045ce0132401..fdfe3aae6aaee 100644 --- a/src/connector/src/parser/unified/mod.rs +++ b/src/connector/src/parser/unified/mod.rs @@ -17,9 +17,7 @@ use auto_impl::auto_impl; use risingwave_common::types::{DataType, DatumCow}; use risingwave_connector_codec::decoder::avro::AvroAccess; -pub use risingwave_connector_codec::decoder::{ - bail_uncategorized, uncategorized, Access, AccessError, AccessResult, -}; +pub use risingwave_connector_codec::decoder::{uncategorized, Access, AccessError, AccessResult}; use self::bytes::BytesAccess; use self::json::JsonAccess; diff --git a/src/connector/src/parser/unified/protobuf.rs b/src/connector/src/parser/unified/protobuf.rs index 02febc22db247..3ebeebca44373 100644 --- a/src/connector/src/parser/unified/protobuf.rs +++ b/src/connector/src/parser/unified/protobuf.rs @@ -13,9 +13,9 @@ // limitations under the License. 
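// Illustrative aside, not part of the patch: the hunks above drop the
// hand-rolled `recursive_parse_json`/`extract_any_info` path in favor of
// prost-reflect's serde support. With the `serde` feature enabled, a
// `DynamicMessage` holding a `google.protobuf.Any` serializes through the
// canonical protobuf JSON mapping, which is where the `"@type"` keys in the
// updated tests come from. A minimal sketch of that conversion (the function
// name here is hypothetical):
use prost_reflect::DynamicMessage;
use serde_json::Value;

fn any_to_json_value(dyn_msg: &DynamicMessage) -> serde_json::Result<Value> {
    // prost-reflect resolves the packed payload against the descriptor pool
    // reachable from the message itself, so the explicit `descriptor_pool`
    // parameter threaded through the old code is no longer needed.
    serde_json::to_value(dyn_msg)
}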
use std::borrow::Cow; -use std::sync::{Arc, LazyLock}; +use std::sync::LazyLock; -use prost_reflect::{DescriptorPool, DynamicMessage, ReflectMessage}; +use prost_reflect::{DynamicMessage, ReflectMessage}; use risingwave_common::log::LogSuppresser; use risingwave_common::types::{DataType, DatumCow, ToOwnedDatum}; use thiserror_ext::AsReport; @@ -26,24 +26,21 @@ use crate::parser::unified::uncategorized; pub struct ProtobufAccess { message: DynamicMessage, - descriptor_pool: Arc, } impl ProtobufAccess { - pub fn new(message: DynamicMessage, descriptor_pool: Arc) -> Self { - Self { - message, - descriptor_pool, - } + pub fn new(message: DynamicMessage) -> Self { + Self { message } + } + + #[cfg(test)] + pub fn descriptor(&self) -> prost_reflect::MessageDescriptor { + self.message.descriptor() } } impl Access for ProtobufAccess { - fn access<'a>( - &'a self, - path: &[&str], - _type_expected: &DataType, - ) -> AccessResult> { + fn access<'a>(&'a self, path: &[&str], type_expected: &DataType) -> AccessResult> { debug_assert_eq!(1, path.len()); let field_desc = self .message @@ -59,10 +56,10 @@ impl Access for ProtobufAccess { })?; match self.message.get_field(&field_desc) { - Cow::Borrowed(value) => from_protobuf_value(&field_desc, value, &self.descriptor_pool), + Cow::Borrowed(value) => from_protobuf_value(&field_desc, value, type_expected), // `Owned` variant occurs only if there's no such field and the default value is returned. - Cow::Owned(value) => from_protobuf_value(&field_desc, &value, &self.descriptor_pool) + Cow::Owned(value) => from_protobuf_value(&field_desc, &value, type_expected) // enforce `Owned` variant to avoid returning a reference to a temporary value .map(|d| d.to_owned_datum().into()), } diff --git a/src/connector/src/sink/clickhouse.rs b/src/connector/src/sink/clickhouse.rs index 6b3e78f6a7b9d..07db42790f581 100644 --- a/src/connector/src/sink/clickhouse.rs +++ b/src/connector/src/sink/clickhouse.rs @@ -25,7 +25,6 @@ use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::Schema; use risingwave_common::row::Row; -use risingwave_common::session_config::sink_decouple::SinkDecouple; use risingwave_common::types::{DataType, Decimal, ScalarRefImpl, Serial}; use serde::ser::{SerializeSeq, SerializeStruct}; use serde::Serialize; @@ -38,12 +37,10 @@ use with_options::WithOptions; use super::decouple_checkpoint_log_sink::{ default_commit_checkpoint_interval, DecoupleCheckpointLogSinkerOf, - DEFAULT_COMMIT_CHECKPOINT_INTERVAL, }; use super::writer::SinkWriter; use super::{DummySinkCommitCoordinator, SinkWriterParam}; use crate::error::ConnectorResult; -use crate::sink::catalog::desc::SinkDesc; use crate::sink::{ Result, Sink, SinkError, SinkParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT, }; @@ -497,29 +494,6 @@ impl Sink for ClickHouseSink { const SINK_NAME: &'static str = CLICKHOUSE_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - if let Some(interval) = desc.properties.get("commit_checkpoint_interval") { - interval - .parse::() - .unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - } else { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL - }; - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if commit_checkpoint_interval > 1 { - return Err(SinkError::Config(anyhow!( - "config conflict: Clickhouse config `commit_checkpoint_interval` larger than 1 means that sink decouple 
must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn validate(&self) -> Result<()> { // For upsert clickhouse sink, the primary key must be defined. if !self.is_append_only && self.pk_indices.is_empty() { diff --git a/src/connector/src/sink/coordinate.rs b/src/connector/src/sink/coordinate.rs index c069167870101..fcfb8c0877d6b 100644 --- a/src/connector/src/sink/coordinate.rs +++ b/src/connector/src/sink/coordinate.rs @@ -15,10 +15,12 @@ use std::sync::Arc; use anyhow::anyhow; +use futures::FutureExt; use risingwave_common::array::StreamChunk; use risingwave_common::bitmap::Bitmap; use risingwave_pb::connector_service::SinkMetadata; use risingwave_rpc_client::CoordinatorStreamHandle; +use thiserror_ext::AsReport; use tracing::warn; use super::SinkCoordinationRpcClientEnum; @@ -81,6 +83,23 @@ impl>> SinkWriter for Coordi } async fn update_vnode_bitmap(&mut self, vnode_bitmap: Arc) -> Result<()> { + self.coordinator_stream_handle + .update_vnode_bitmap(&vnode_bitmap) + .await?; self.inner.update_vnode_bitmap(vnode_bitmap).await } } + +impl>> Drop for CoordinatedSinkWriter { + fn drop(&mut self) { + match self.coordinator_stream_handle.stop().now_or_never() { + None => { + warn!("unable to send stop due to channel full") + } + Some(Err(e)) => { + warn!(e = ?e.as_report(), "failed to stop the coordinator"); + } + Some(Ok(_)) => {} + } + } +} diff --git a/src/connector/src/sink/decouple_checkpoint_log_sink.rs b/src/connector/src/sink/decouple_checkpoint_log_sink.rs index 4ba57e3adda7a..59e3335eb36db 100644 --- a/src/connector/src/sink/decouple_checkpoint_log_sink.rs +++ b/src/connector/src/sink/decouple_checkpoint_log_sink.rs @@ -20,10 +20,12 @@ use async_trait::async_trait; use crate::sink::log_store::{LogStoreReadItem, TruncateOffset}; use crate::sink::writer::SinkWriter; use crate::sink::{LogSinker, Result, SinkLogReader, SinkMetrics}; -pub const DEFAULT_COMMIT_CHECKPOINT_INTERVAL: u64 = 10; +pub const DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE: u64 = 10; +pub const DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITHOUT_SINK_DECOUPLE: u64 = 1; +pub const COMMIT_CHECKPOINT_INTERVAL: &str = "commit_checkpoint_interval"; pub fn default_commit_checkpoint_interval() -> u64 { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE } /// The `LogSinker` implementation used for commit-decoupled sinks (such as `Iceberg`, `DeltaLake` and `StarRocks`). @@ -65,7 +67,7 @@ impl> LogSinker for DecoupleCheckpointLogSink EpochBegun { curr_epoch: u64 }, /// Mark that the consumer has just received a barrier - BarrierReceived { prev_epoch: u64 }, + BarrierReceived { prev_epoch: u64, committed: bool }, } let mut state = LogConsumerState::Uninitialized; @@ -75,15 +77,34 @@ impl> LogSinker for DecoupleCheckpointLogSink loop { let (epoch, item): (u64, LogStoreReadItem) = log_reader.next_item().await?; - if let LogStoreReadItem::UpdateVnodeBitmap(_) = &item { - match &state { - LogConsumerState::BarrierReceived { .. 
} => {} + if let LogStoreReadItem::UpdateVnodeBitmap(vnode_bitmap) = &item { + match &mut state { + LogConsumerState::BarrierReceived { + committed, + prev_epoch, + } => { + if !*committed { + // force commit on update vnode bitmap + let start_time = Instant::now(); + sink_writer.barrier(true).await?; + sink_metrics + .sink_commit_duration_metrics + .observe(start_time.elapsed().as_millis() as f64); + log_reader.truncate(TruncateOffset::Barrier { epoch: *prev_epoch })?; + current_checkpoint = 0; + *committed = true; + } + sink_writer + .update_vnode_bitmap(vnode_bitmap.clone()) + .await?; + } _ => unreachable!( "update vnode bitmap can be accepted only right after \ barrier, but current state is {:?}", state ), } + continue; } // begin_epoch when not previously began state = match state { @@ -100,7 +121,7 @@ impl> LogSinker for DecoupleCheckpointLogSink ); LogConsumerState::EpochBegun { curr_epoch: epoch } } - LogConsumerState::BarrierReceived { prev_epoch } => { + LogConsumerState::BarrierReceived { prev_epoch, .. } => { assert!( epoch > prev_epoch, "new epoch {} should be greater than prev epoch {}", @@ -123,7 +144,7 @@ impl> LogSinker for DecoupleCheckpointLogSink LogConsumerState::EpochBegun { curr_epoch } => curr_epoch, _ => unreachable!("epoch must have begun before handling barrier"), }; - if is_checkpoint { + let committed = if is_checkpoint { current_checkpoint += 1; if current_checkpoint >= commit_checkpoint_interval.get() { let start_time = Instant::now(); @@ -133,16 +154,22 @@ impl> LogSinker for DecoupleCheckpointLogSink .observe(start_time.elapsed().as_millis() as f64); log_reader.truncate(TruncateOffset::Barrier { epoch })?; current_checkpoint = 0; + true } else { sink_writer.barrier(false).await?; + false } } else { sink_writer.barrier(false).await?; + false + }; + state = LogConsumerState::BarrierReceived { + prev_epoch, + committed, } - state = LogConsumerState::BarrierReceived { prev_epoch } } - LogStoreReadItem::UpdateVnodeBitmap(vnode_bitmap) => { - sink_writer.update_vnode_bitmap(vnode_bitmap).await?; + LogStoreReadItem::UpdateVnodeBitmap(_) => { + unreachable!("should have been handle earlier") } } } diff --git a/src/connector/src/sink/deltalake.rs b/src/connector/src/sink/deltalake.rs index 2dedffa3469e3..494adb2dd6fed 100644 --- a/src/connector/src/sink/deltalake.rs +++ b/src/connector/src/sink/deltalake.rs @@ -31,7 +31,6 @@ use risingwave_common::array::StreamChunk; use risingwave_common::bail; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::Schema; -use risingwave_common::session_config::sink_decouple::SinkDecouple; use risingwave_common::types::DataType; use risingwave_common::util::iter_util::ZipEqFast; use risingwave_pb::connector_service::sink_metadata::Metadata::Serialized; @@ -41,11 +40,9 @@ use serde_derive::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use with_options::WithOptions; -use super::catalog::desc::SinkDesc; use super::coordinate::CoordinatedSinkWriter; use super::decouple_checkpoint_log_sink::{ default_commit_checkpoint_interval, DecoupleCheckpointLogSinkerOf, - DEFAULT_COMMIT_CHECKPOINT_INTERVAL, }; use super::writer::SinkWriter; use super::{ @@ -285,29 +282,6 @@ impl Sink for DeltaLakeSink { const SINK_NAME: &'static str = DELTALAKE_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - if let Some(interval) = desc.properties.get("commit_checkpoint_interval") { - interval - .parse::() - 
.unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - } else { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL - }; - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if commit_checkpoint_interval > 1 { - return Err(SinkError::Config(anyhow!( - "config conflict: DeltaLake config `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result { let inner = DeltaLakeSinkWriter::new( self.config.clone(), diff --git a/src/connector/src/sink/google_pubsub.rs b/src/connector/src/sink/google_pubsub.rs index ea0e0e4776318..ff9079591a2f5 100644 --- a/src/connector/src/sink/google_pubsub.rs +++ b/src/connector/src/sink/google_pubsub.rs @@ -14,11 +14,7 @@ use std::collections::BTreeMap; -use anyhow::{anyhow, Context}; -use futures::future::try_join_all; -use futures::prelude::future::FutureExt; -use futures::prelude::TryFuture; -use futures::TryFutureExt; +use anyhow::anyhow; use google_cloud_gax::conn::Environment; use google_cloud_googleapis::pubsub::v1::PubsubMessage; use google_cloud_pubsub::apiv1; @@ -26,7 +22,7 @@ use google_cloud_pubsub::client::google_cloud_auth::credentials::CredentialsFile use google_cloud_pubsub::client::google_cloud_auth::project; use google_cloud_pubsub::client::google_cloud_auth::token::DefaultTokenSourceProvider; use google_cloud_pubsub::client::{Client, ClientConfig}; -use google_cloud_pubsub::publisher::{Awaiter, Publisher}; +use google_cloud_pubsub::publisher::Publisher; use risingwave_common::array::StreamChunk; use risingwave_common::catalog::Schema; use serde_derive::Deserialize; @@ -46,19 +42,33 @@ use crate::dispatch_sink_formatter_str_key_impl; pub const PUBSUB_SINK: &str = "google_pubsub"; const PUBSUB_SEND_FUTURE_BUFFER_MAX_SIZE: usize = 65536; -fn may_delivery_future(awaiter: Vec) -> GooglePubSubSinkDeliveryFuture { - try_join_all(awaiter.into_iter().map(|awaiter| { - awaiter.get().map(|result| { - result - .context("Google Pub/Sub sink error") - .map_err(SinkError::GooglePubSub) - .map(|_| ()) - }) - })) - .map_ok(|_: Vec<()>| ()) - .boxed() +mod delivery_future { + use anyhow::Context; + use futures::future::try_join_all; + use futures::{FutureExt, TryFuture, TryFutureExt}; + use google_cloud_pubsub::publisher::Awaiter; + + use crate::sink::SinkError; + + pub type GooglePubSubSinkDeliveryFuture = + impl TryFuture + Unpin + 'static; + + pub(super) fn may_delivery_future(awaiter: Vec) -> GooglePubSubSinkDeliveryFuture { + try_join_all(awaiter.into_iter().map(|awaiter| { + awaiter.get().map(|result| { + result + .context("Google Pub/Sub sink error") + .map_err(SinkError::GooglePubSub) + .map(|_| ()) + }) + })) + .map_ok(|_: Vec<()>| ()) + .boxed() + } } +use delivery_future::*; + #[serde_as] #[derive(Clone, Debug, Deserialize, WithOptions)] pub struct GooglePubSubConfig { @@ -172,9 +182,6 @@ struct GooglePubSubPayloadWriter<'w> { add_future: DeliveryFutureManagerAddFuture<'w, GooglePubSubSinkDeliveryFuture>, } -pub type GooglePubSubSinkDeliveryFuture = - impl TryFuture + Unpin + 'static; - impl GooglePubSubSinkWriter { pub async fn new( config: GooglePubSubConfig, diff --git a/src/connector/src/sink/iceberg/mod.rs b/src/connector/src/sink/iceberg/mod.rs index b68e74b1f5d95..e295938a45a61 100644 --- a/src/connector/src/sink/iceberg/mod.rs +++ b/src/connector/src/sink/iceberg/mod.rs @@ -65,10 +65,8 @@ use with_options::WithOptions; use 
self::mock_catalog::MockCatalog; use self::prometheus::monitored_base_file_writer::MonitoredBaseFileWriterBuilder; use self::prometheus::monitored_position_delete_writer::MonitoredPositionDeleteWriterBuilder; -use super::catalog::desc::SinkDesc; use super::decouple_checkpoint_log_sink::{ default_commit_checkpoint_interval, DecoupleCheckpointLogSinkerOf, - DEFAULT_COMMIT_CHECKPOINT_INTERVAL, }; use super::{ Sink, SinkError, SinkWriterParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT, @@ -76,7 +74,7 @@ use super::{ use crate::error::ConnectorResult; use crate::sink::coordinate::CoordinatedSinkWriter; use crate::sink::writer::SinkWriter; -use crate::sink::{Result, SinkCommitCoordinator, SinkDecouple, SinkParam}; +use crate::sink::{Result, SinkCommitCoordinator, SinkParam}; use crate::{ deserialize_bool_from_string, deserialize_optional_bool_from_string, deserialize_optional_string_seq_from_string, @@ -843,31 +841,6 @@ impl Sink for IcebergSink { const SINK_NAME: &'static str = ICEBERG_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - desc.properties - .get("commit_checkpoint_interval") - .map(|interval| { - interval - .parse::() - .unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - }); - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if let Some(commit_checkpoint_interval) = commit_checkpoint_interval - && commit_checkpoint_interval > 1 - { - return Err(SinkError::Config(anyhow!( - "config conflict: Iceberg config `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn validate(&self) -> Result<()> { if "glue".eq_ignore_ascii_case(self.config.catalog_type()) { risingwave_common::license::Feature::IcebergSinkWithGlue @@ -1375,15 +1348,21 @@ pub fn try_matches_arrow_schema( (ArrowDataType::Decimal128(_, _), ArrowDataType::Decimal128(_, _)) => true, (ArrowDataType::Binary, ArrowDataType::LargeBinary) => true, (ArrowDataType::LargeBinary, ArrowDataType::Binary) => true, - (left, right) => left == right, + // cases where left != right (metadata, field name mismatch) + // + // all nested types: in iceberg `field_id` will always be present, but RW doesn't have it: + // {"PARQUET:field_id": ".."} + // + // map: The standard name in arrow is "entries", "key", "value". 
+ // in iceberg-rs, it's called "key_value" + (left, right) => left.equals_datatype(right), }; if !compatible { - bail!("Field {}'s type not compatible, risingwave converted data type {}, iceberg's data type: {}", + bail!("field {}'s type is incompatible\nRisingWave converted data type: {}\niceberg's data type: {}", arrow_field.name(), converted_arrow_data_type, arrow_field.data_type() ); } } - Ok(()) } @@ -1393,7 +1372,7 @@ mod test { use risingwave_common::catalog::Field; - use crate::sink::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL; + use crate::sink::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE; use crate::sink::iceberg::IcebergConfig; use crate::source::DataType; @@ -1476,7 +1455,7 @@ mod test { .into_iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(), - commit_checkpoint_interval: DEFAULT_COMMIT_CHECKPOINT_INTERVAL, + commit_checkpoint_interval: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE, create_table_if_not_exists: false, }; diff --git a/src/connector/src/sink/iceberg/prometheus/monitored_partition_writer.rs b/src/connector/src/sink/iceberg/prometheus/monitored_partition_writer.rs index d85d712c41ac3..463b1f3c9dbd4 100644 --- a/src/connector/src/sink/iceberg/prometheus/monitored_partition_writer.rs +++ b/src/connector/src/sink/iceberg/prometheus/monitored_partition_writer.rs @@ -27,7 +27,6 @@ pub struct MonitoredFanoutPartitionedWriterBuilder { } impl MonitoredFanoutPartitionedWriterBuilder { - #[expect(dead_code)] pub fn new( inner: FanoutPartitionedWriterBuilder, partition_num: LabelGuardedIntGauge<2>, diff --git a/src/connector/src/sink/iceberg/prometheus/monitored_write_writer.rs b/src/connector/src/sink/iceberg/prometheus/monitored_write_writer.rs index dc44434e5d9c2..aebb5939ff143 100644 --- a/src/connector/src/sink/iceberg/prometheus/monitored_write_writer.rs +++ b/src/connector/src/sink/iceberg/prometheus/monitored_write_writer.rs @@ -28,7 +28,6 @@ pub struct MonitoredWriteWriterBuilder { impl MonitoredWriteWriterBuilder { /// Create writer context. 
- #[expect(dead_code)] pub fn new( inner: B, write_qps: LabelGuardedIntCounter<2>, diff --git a/src/connector/src/sink/mod.rs b/src/connector/src/sink/mod.rs index dafbc856207a9..b453af53cca41 100644 --- a/src/connector/src/sink/mod.rs +++ b/src/connector/src/sink/mod.rs @@ -53,6 +53,13 @@ use ::deltalake::DeltaTableError; use ::redis::RedisError; use anyhow::anyhow; use async_trait::async_trait; +use clickhouse::CLICKHOUSE_SINK; +use decouple_checkpoint_log_sink::{ + COMMIT_CHECKPOINT_INTERVAL, DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITHOUT_SINK_DECOUPLE, + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE, +}; +use deltalake::DELTALAKE_SINK; +use iceberg::ICEBERG_SINK; use opendal::Error as OpendalError; use risingwave_common::array::ArrayError; use risingwave_common::bitmap::Bitmap; @@ -66,6 +73,7 @@ use risingwave_pb::catalog::PbSinkType; use risingwave_pb::connector_service::{PbSinkParam, SinkMetadata, TableSchema}; use risingwave_rpc_client::error::RpcError; use risingwave_rpc_client::MetaClient; +use starrocks::STARROCKS_SINK; use thiserror::Error; use thiserror_ext::AsReport; pub use tracing; @@ -366,13 +374,54 @@ impl SinkWriterParam { } } +fn is_sink_support_commit_checkpoint_interval(sink_name: &str) -> bool { + matches!( + sink_name, + ICEBERG_SINK | CLICKHOUSE_SINK | STARROCKS_SINK | DELTALAKE_SINK + ) +} pub trait Sink: TryFrom { const SINK_NAME: &'static str; type LogSinker: LogSinker; type Coordinator: SinkCommitCoordinator; + fn set_default_commit_checkpoint_interval( + desc: &mut SinkDesc, + user_specified: &SinkDecouple, + ) -> Result<()> { + if is_sink_support_commit_checkpoint_interval(Self::SINK_NAME) { + match desc.properties.get(COMMIT_CHECKPOINT_INTERVAL) { + Some(commit_checkpoint_interval) => { + let commit_checkpoint_interval = commit_checkpoint_interval + .parse::() + .map_err(|e| SinkError::Config(anyhow!(e)))?; + if matches!(user_specified, SinkDecouple::Disable) + && commit_checkpoint_interval > 1 + { + return Err(SinkError::Config(anyhow!("config conflict: `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled"))); + } + } + None => match user_specified { + SinkDecouple::Default | SinkDecouple::Enable => { + desc.properties.insert( + COMMIT_CHECKPOINT_INTERVAL.to_string(), + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE.to_string(), + ); + } + SinkDecouple::Disable => { + desc.properties.insert( + COMMIT_CHECKPOINT_INTERVAL.to_string(), + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITHOUT_SINK_DECOUPLE.to_string(), + ); + } + }, + } + } + Ok(()) + } + /// `user_specified` is the value of `sink_decouple` config. 
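// Illustrative aside, not part of the patch: the new
// `set_default_commit_checkpoint_interval` above centralizes the
// `commit_checkpoint_interval` handling that the deleted per-sink
// `is_sink_decouple` overrides used to duplicate. The decision table, sketched
// with plain types (names below are hypothetical, not the connector API):
use std::collections::BTreeMap;

const INTERVAL_KEY: &str = "commit_checkpoint_interval";

fn fill_commit_checkpoint_interval(
    props: &mut BTreeMap<String, String>,
    decouple_disabled: bool,
) -> Result<(), String> {
    match props.get(INTERVAL_KEY) {
        // A user-specified interval larger than 1 only makes sense with sink
        // decoupling, so it conflicts with an explicit `sink_decouple = false`.
        Some(v) => {
            let interval: u64 = v.parse().map_err(|e| e.to_string())?;
            if decouple_disabled && interval > 1 {
                return Err("commit_checkpoint_interval > 1 requires sink decouple".into());
            }
        }
        // No value supplied: default to 10 with decoupling and 1 without,
        // matching the two new constants in decouple_checkpoint_log_sink.rs.
        None => {
            let default = if decouple_disabled { 1 } else { 10 };
            props.insert(INTERVAL_KEY.to_owned(), default.to_string());
        }
    }
    Ok(())
}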
- fn is_sink_decouple(_desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { + fn is_sink_decouple(user_specified: &SinkDecouple) -> Result { match user_specified { SinkDecouple::Default | SinkDecouple::Enable => Ok(true), SinkDecouple::Disable => Ok(false), diff --git a/src/connector/src/sink/redis.rs b/src/connector/src/sink/redis.rs index 49207e668e41b..763d7e9bba49a 100644 --- a/src/connector/src/sink/redis.rs +++ b/src/connector/src/sink/redis.rs @@ -288,7 +288,7 @@ impl RedisSinkPayloadWriter { return Ok(()); } } - self.pipe.query(self.conn.as_mut().unwrap()).await?; + self.pipe.query::<()>(self.conn.as_mut().unwrap()).await?; self.pipe.clear(); Ok(()) } diff --git a/src/connector/src/sink/remote.rs b/src/connector/src/sink/remote.rs index 6fcef5d41b654..aa8ca0625d05f 100644 --- a/src/connector/src/sink/remote.rs +++ b/src/connector/src/sink/remote.rs @@ -23,7 +23,6 @@ use async_trait::async_trait; use await_tree::InstrumentAwait; use futures::future::select; use futures::TryStreamExt; -use itertools::Itertools; use jni::JavaVM; use prost::Message; use risingwave_common::array::StreamChunk; @@ -60,7 +59,6 @@ use tracing::warn; use super::elasticsearch::{is_es_sink, StreamChunkConverter, ES_OPTION_DELIMITER}; use crate::error::ConnectorResult; -use crate::sink::catalog::desc::SinkDesc; use crate::sink::coordinate::CoordinatedSinkWriter; use crate::sink::log_store::{LogStoreReadItem, LogStoreResult, TruncateOffset}; use crate::sink::writer::{LogSinkerOf, SinkWriter, SinkWriterExt}; @@ -116,7 +114,7 @@ def_remote_sink!(); pub trait RemoteSinkTrait: Send + Sync + 'static { const SINK_NAME: &'static str; - fn default_sink_decouple(_desc: &SinkDesc) -> bool { + fn default_sink_decouple() -> bool { true } } @@ -144,9 +142,9 @@ impl Sink for RemoteSink { const SINK_NAME: &'static str = R::SINK_NAME; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { + fn is_sink_decouple(user_specified: &SinkDecouple) -> Result { match user_specified { - SinkDecouple::Default => Ok(R::default_sink_decouple(desc)), + SinkDecouple::Default => Ok(R::default_sink_decouple()), SinkDecouple::Enable => Ok(true), SinkDecouple::Disable => Ok(false), } @@ -175,7 +173,7 @@ async fn validate_remote_sink(param: &SinkParam, sink_name: &str) -> ConnectorRe bail!("Es sink only supports single pk or pk with delimiter option"); } // FIXME: support struct and array in stream sink - param.columns.iter().map(|col| { + param.columns.iter().try_for_each(|col| { match &col.data_type { DataType::Int16 | DataType::Int32 @@ -218,7 +216,7 @@ async fn validate_remote_sink(param: &SinkParam, sink_name: &str) -> ConnectorRe "remote sink supports Int16, Int32, Int64, Float32, Float64, Boolean, Decimal, Time, Date, Interval, Jsonb, Timestamp, Timestamptz, Bytea, List and Varchar, (Es sink support Struct) got {:?}: {:?}", col.name, col.data_type, - )))}}).try_collect()?; + )))}})?; let jvm = JVM.get_or_init()?; let sink_param = param.to_proto(); diff --git a/src/connector/src/sink/starrocks.rs b/src/connector/src/sink/starrocks.rs index 21a4fc371b940..5c3e724721d18 100644 --- a/src/connector/src/sink/starrocks.rs +++ b/src/connector/src/sink/starrocks.rs @@ -24,7 +24,6 @@ use mysql_async::Opts; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::Schema; -use risingwave_common::session_config::sink_decouple::SinkDecouple; use risingwave_common::types::DataType; use risingwave_pb::connector_service::sink_metadata::Metadata::Serialized; use 
risingwave_pb::connector_service::sink_metadata::SerializedMetadata; @@ -38,7 +37,7 @@ use tokio::task::JoinHandle; use url::form_urlencoded; use with_options::WithOptions; -use super::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL; +use super::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE; use super::doris_starrocks_connector::{ HeaderBuilder, InserterInner, StarrocksTxnRequestBuilder, STARROCKS_DELETE_SIGN, STARROCKS_SUCCESS_STATUS, @@ -48,7 +47,6 @@ use super::{ SinkCommitCoordinator, SinkError, SinkParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT, }; -use crate::sink::catalog::desc::SinkDesc; use crate::sink::coordinate::CoordinatedSinkWriter; use crate::sink::decouple_checkpoint_log_sink::DecoupleCheckpointLogSinkerOf; use crate::sink::{Result, Sink, SinkWriter, SinkWriterParam}; @@ -118,7 +116,7 @@ pub struct StarrocksConfig { } fn default_commit_checkpoint_interval() -> u64 { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE } impl StarrocksConfig { @@ -264,29 +262,6 @@ impl Sink for StarrocksSink { const SINK_NAME: &'static str = STARROCKS_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - if let Some(interval) = desc.properties.get("commit_checkpoint_interval") { - interval - .parse::() - .unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - } else { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL - }; - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if commit_checkpoint_interval > 1 { - return Err(SinkError::Config(anyhow!( - "config conflict: Starrocks config `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn validate(&self) -> Result<()> { if !self.is_append_only && self.pk_indices.is_empty() { return Err(SinkError::Config(anyhow!( diff --git a/src/connector/src/sink/trivial.rs b/src/connector/src/sink/trivial.rs index 5c5e093c8e0f0..e19f99943338c 100644 --- a/src/connector/src/sink/trivial.rs +++ b/src/connector/src/sink/trivial.rs @@ -17,7 +17,6 @@ use std::marker::PhantomData; use async_trait::async_trait; use risingwave_common::session_config::sink_decouple::SinkDecouple; -use super::catalog::desc::SinkDesc; use crate::sink::log_store::{LogStoreReadItem, TruncateOffset}; use crate::sink::{ DummySinkCommitCoordinator, LogSinker, Result, Sink, SinkError, SinkLogReader, SinkParam, @@ -67,7 +66,7 @@ impl Sink for TrivialSink { const SINK_NAME: &'static str = T::SINK_NAME; // Disable sink decoupling for all trivial sinks because it introduces overhead without any benefit - fn is_sink_decouple(_desc: &SinkDesc, _user_specified: &SinkDecouple) -> Result { + fn is_sink_decouple(_user_specified: &SinkDecouple) -> Result { Ok(false) } diff --git a/src/connector/src/source/filesystem/opendal_source/azblob_source.rs b/src/connector/src/source/filesystem/opendal_source/azblob_source.rs index 2ee050f21f812..8c6dac01ab87b 100644 --- a/src/connector/src/source/filesystem/opendal_source/azblob_source.rs +++ b/src/connector/src/source/filesystem/opendal_source/azblob_source.rs @@ -66,7 +66,6 @@ impl OpendalEnumerator { }; let compression_format = azblob_properties.compression_format; - Ok(Self { op, prefix, diff --git a/src/connector/src/source/filesystem/opendal_source/gcs_source.rs 
b/src/connector/src/source/filesystem/opendal_source/gcs_source.rs index 768f19fc36722..9a6d883f3c922 100644 --- a/src/connector/src/source/filesystem/opendal_source/gcs_source.rs +++ b/src/connector/src/source/filesystem/opendal_source/gcs_source.rs @@ -60,7 +60,6 @@ impl OpendalEnumerator { }; let compression_format = gcs_properties.compression_format; - Ok(Self { op, prefix, diff --git a/src/connector/src/source/filesystem/opendal_source/mod.rs b/src/connector/src/source/filesystem/opendal_source/mod.rs index cbb3c2a9c7b85..cea4972def92c 100644 --- a/src/connector/src/source/filesystem/opendal_source/mod.rs +++ b/src/connector/src/source/filesystem/opendal_source/mod.rs @@ -47,6 +47,10 @@ pub struct FsSourceCommon { #[serde(rename = "refresh.interval.sec")] #[serde_as(as = "Option")] pub refresh_interval_sec: Option, + + #[serde(rename = "recursive_scan", default)] + #[serde_as(as = "Option")] + pub recursive_scan: Option, } #[derive(Clone, Debug, Deserialize, PartialEq, WithOptions)] diff --git a/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs b/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs index 7396eac2ea38e..a9cb4b6c3f7f0 100644 --- a/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs +++ b/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs @@ -66,13 +66,13 @@ impl SplitEnumerator for OpendalEnumerator { } impl OpendalEnumerator { - pub async fn list(&self) -> ConnectorResult { + pub async fn list(&self, recursive_scan: bool) -> ConnectorResult { let prefix = self.prefix.as_deref().unwrap_or("/"); let object_lister = self .op .lister_with(prefix) - .recursive(false) + .recursive(recursive_scan) .metakey(Metakey::ContentLength | Metakey::LastModified) .await?; let stream = stream::unfold(object_lister, |mut object_lister| async move { diff --git a/src/connector/src/source/iceberg/mod.rs b/src/connector/src/source/iceberg/mod.rs index f101ff9ed6d4b..d65929faafba1 100644 --- a/src/connector/src/source/iceberg/mod.rs +++ b/src/connector/src/source/iceberg/mod.rs @@ -206,6 +206,17 @@ impl IcebergSplitEnumerator { bail!("Batch parallelism is 0. Cannot split the iceberg files."); } let table = self.config.load_table_v2().await?; + let current_snapshot = table.metadata().current_snapshot(); + if current_snapshot.is_none() { + // If there is no snapshot, we will return a mock `IcebergSplit` with empty files. + return Ok(vec![IcebergSplit { + split_id: 0, + snapshot_id: 0, // unused + table_meta: TableMetadataJsonStr::serialize(table.metadata()), + files: vec![], + }]); + } + let snapshot_id = match time_traval_info { Some(IcebergTimeTravelInfo::Version(version)) => { let Some(snapshot) = table.metadata().snapshot_by_id(version) else { @@ -232,10 +243,10 @@ impl IcebergSplitEnumerator { } } } - None => match table.metadata().current_snapshot() { - Some(snapshot) => snapshot.snapshot_id(), - None => bail!("Cannot find the current snapshot id in the iceberg table."), - }, + None => { + assert!(current_snapshot.is_some()); + current_snapshot.unwrap().snapshot_id() + } }; let mut files = vec![]; diff --git a/src/connector/src/source/kafka/stats.rs b/src/connector/src/source/kafka/stats.rs index 679f5c24bd2a1..7a36c4d1fffea 100644 --- a/src/connector/src/source/kafka/stats.rs +++ b/src/connector/src/source/kafka/stats.rs @@ -12,34 +12,37 @@ // See the License for the specific language governing permissions and // limitations under the License. 
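// Illustrative aside, not part of the patch: the stats.rs rework below replaces
// plain prometheus vectors with RisingWave's label-guarded wrappers
// (`LabelGuardedIntGaugeVec` / `LabelGuardedUintGaugeVec`), so per-broker and
// per-partition label sets are tracked through guard handles (the usual
// motivation for the guarded variants). A sketch of the register/report
// pattern as it appears in the hunks; this is RisingWave-internal API and the
// metric name below is made up:
use prometheus::Registry;
use risingwave_common::metrics::LabelGuardedIntGaugeVec;
use risingwave_common::register_guarded_int_gauge_vec_with_registry;

fn register_example_gauge(registry: Registry) -> LabelGuardedIntGaugeVec<2> {
    register_guarded_int_gauge_vec_with_registry!(
        "rdkafka_example_state_age",
        "Example gauge keyed by (id, client_id)",
        &["id", "client_id"],
        registry
    )
    .unwrap()
}

fn report_example(gauge: &LabelGuardedIntGaugeVec<2>, id: &str, client_id: &str, value: i64) {
    // `with_guarded_label_values` hands back a guarded child gauge; the guard
    // is what lets stale label combinations be cleaned up later.
    gauge.with_guarded_label_values(&[id, client_id]).set(value);
}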
-use prometheus::core::{AtomicU64, GenericGaugeVec}; -use prometheus::{register_int_gauge_vec_with_registry, IntGaugeVec, Registry}; +use prometheus::core::AtomicU64; +use prometheus::Registry; use rdkafka::statistics::{Broker, ConsumerGroup, Partition, Topic, Window}; use rdkafka::Statistics; -use risingwave_common::metrics::register_uint_gauge_vec_with_registry; +use risingwave_common::metrics::{LabelGuardedIntGaugeVec, LabelGuardedUintGaugeVec}; +use risingwave_common::{ + register_guarded_int_gauge_vec_with_registry, register_guarded_uint_gauge_vec_with_registry, +}; #[derive(Debug, Clone)] pub struct RdKafkaStats { pub registry: Registry, - pub ts: IntGaugeVec, - pub time: IntGaugeVec, - pub age: IntGaugeVec, - pub replyq: IntGaugeVec, - pub msg_cnt: GenericGaugeVec, - pub msg_size: GenericGaugeVec, - pub msg_max: GenericGaugeVec, - pub msg_size_max: GenericGaugeVec, - pub tx: IntGaugeVec, - pub tx_bytes: IntGaugeVec, - pub rx: IntGaugeVec, - pub rx_bytes: IntGaugeVec, - pub tx_msgs: IntGaugeVec, - pub tx_msgs_bytes: IntGaugeVec, - pub rx_msgs: IntGaugeVec, - pub rx_msgs_bytes: IntGaugeVec, - pub simple_cnt: IntGaugeVec, - pub metadata_cache_cnt: IntGaugeVec, + pub ts: LabelGuardedIntGaugeVec<2>, + pub time: LabelGuardedIntGaugeVec<2>, + pub age: LabelGuardedIntGaugeVec<2>, + pub replyq: LabelGuardedIntGaugeVec<2>, + pub msg_cnt: LabelGuardedUintGaugeVec<2>, + pub msg_size: LabelGuardedUintGaugeVec<2>, + pub msg_max: LabelGuardedUintGaugeVec<2>, + pub msg_size_max: LabelGuardedUintGaugeVec<2>, + pub tx: LabelGuardedIntGaugeVec<2>, + pub tx_bytes: LabelGuardedIntGaugeVec<2>, + pub rx: LabelGuardedIntGaugeVec<2>, + pub rx_bytes: LabelGuardedIntGaugeVec<2>, + pub tx_msgs: LabelGuardedIntGaugeVec<2>, + pub tx_msgs_bytes: LabelGuardedIntGaugeVec<2>, + pub rx_msgs: LabelGuardedIntGaugeVec<2>, + pub rx_msgs_bytes: LabelGuardedIntGaugeVec<2>, + pub simple_cnt: LabelGuardedIntGaugeVec<2>, + pub metadata_cache_cnt: LabelGuardedIntGaugeVec<2>, pub broker_stats: BrokerStats, pub topic_stats: TopicStats, @@ -50,29 +53,29 @@ pub struct RdKafkaStats { pub struct BrokerStats { pub registry: Registry, - pub state_age: IntGaugeVec, - pub outbuf_cnt: IntGaugeVec, - pub outbuf_msg_cnt: IntGaugeVec, - pub waitresp_cnt: IntGaugeVec, - pub waitresp_msg_cnt: IntGaugeVec, - pub tx: GenericGaugeVec, - pub tx_bytes: GenericGaugeVec, - pub tx_errs: GenericGaugeVec, - pub tx_retries: GenericGaugeVec, - pub tx_idle: IntGaugeVec, - pub req_timeouts: GenericGaugeVec, - pub rx: GenericGaugeVec, - pub rx_bytes: GenericGaugeVec, - pub rx_errs: GenericGaugeVec, - pub rx_corriderrs: GenericGaugeVec, - pub rx_partial: GenericGaugeVec, - pub rx_idle: IntGaugeVec, - pub req: IntGaugeVec, - pub zbuf_grow: GenericGaugeVec, - pub buf_grow: GenericGaugeVec, - pub wakeups: GenericGaugeVec, - pub connects: IntGaugeVec, - pub disconnects: IntGaugeVec, + pub state_age: LabelGuardedIntGaugeVec<4>, + pub outbuf_cnt: LabelGuardedIntGaugeVec<4>, + pub outbuf_msg_cnt: LabelGuardedIntGaugeVec<4>, + pub waitresp_cnt: LabelGuardedIntGaugeVec<4>, + pub waitresp_msg_cnt: LabelGuardedIntGaugeVec<4>, + pub tx: LabelGuardedUintGaugeVec<4>, + pub tx_bytes: LabelGuardedUintGaugeVec<4>, + pub tx_errs: LabelGuardedUintGaugeVec<4>, + pub tx_retries: LabelGuardedUintGaugeVec<4>, + pub tx_idle: LabelGuardedIntGaugeVec<4>, + pub req_timeouts: LabelGuardedUintGaugeVec<4>, + pub rx: LabelGuardedUintGaugeVec<4>, + pub rx_bytes: LabelGuardedUintGaugeVec<4>, + pub rx_errs: LabelGuardedUintGaugeVec<4>, + pub rx_corriderrs: LabelGuardedUintGaugeVec<4>, + pub 
rx_partial: LabelGuardedUintGaugeVec<4>, + pub rx_idle: LabelGuardedIntGaugeVec<4>, + pub req: LabelGuardedIntGaugeVec<5>, + pub zbuf_grow: LabelGuardedUintGaugeVec<4>, + pub buf_grow: LabelGuardedUintGaugeVec<4>, + pub wakeups: LabelGuardedUintGaugeVec<4>, + pub connects: LabelGuardedIntGaugeVec<4>, + pub disconnects: LabelGuardedIntGaugeVec<4>, pub int_latency: StatsWindow, pub outbuf_latency: StatsWindow, pub rtt: StatsWindow, @@ -83,7 +86,7 @@ pub struct BrokerStats { pub struct TopicStats { pub registry: Registry, - pub metadata_age: IntGaugeVec, + pub metadata_age: LabelGuardedIntGaugeVec<3>, pub batch_size: StatsWindow, pub batch_cnt: StatsWindow, pub partitions: PartitionStats, @@ -93,58 +96,58 @@ pub struct TopicStats { pub struct StatsWindow { pub registry: Registry, - pub min: IntGaugeVec, - pub max: IntGaugeVec, - pub avg: IntGaugeVec, - pub sum: IntGaugeVec, - pub cnt: IntGaugeVec, - pub stddev: IntGaugeVec, - pub hdr_size: IntGaugeVec, - pub p50: IntGaugeVec, - pub p75: IntGaugeVec, - pub p90: IntGaugeVec, - pub p95: IntGaugeVec, - pub p99: IntGaugeVec, - pub p99_99: IntGaugeVec, - pub out_of_range: IntGaugeVec, + pub min: LabelGuardedIntGaugeVec<4>, + pub max: LabelGuardedIntGaugeVec<4>, + pub avg: LabelGuardedIntGaugeVec<4>, + pub sum: LabelGuardedIntGaugeVec<4>, + pub cnt: LabelGuardedIntGaugeVec<4>, + pub stddev: LabelGuardedIntGaugeVec<4>, + pub hdr_size: LabelGuardedIntGaugeVec<4>, + pub p50: LabelGuardedIntGaugeVec<4>, + pub p75: LabelGuardedIntGaugeVec<4>, + pub p90: LabelGuardedIntGaugeVec<4>, + pub p95: LabelGuardedIntGaugeVec<4>, + pub p99: LabelGuardedIntGaugeVec<4>, + pub p99_99: LabelGuardedIntGaugeVec<4>, + pub out_of_range: LabelGuardedIntGaugeVec<4>, } #[derive(Debug, Clone)] pub struct ConsumerGroupStats { pub registry: Registry, - pub state_age: IntGaugeVec, + pub state_age: LabelGuardedIntGaugeVec<3>, // todo: (do not know value set) join_state: IntGaugeVec, - pub rebalance_age: IntGaugeVec, - pub rebalance_cnt: IntGaugeVec, + pub rebalance_age: LabelGuardedIntGaugeVec<3>, + pub rebalance_cnt: LabelGuardedIntGaugeVec<3>, // todo: (cannot handle string) rebalance_reason, - pub assignment_size: IntGaugeVec, + pub assignment_size: LabelGuardedIntGaugeVec<3>, } impl ConsumerGroupStats { pub fn new(registry: Registry) -> Self { - let state_age = register_int_gauge_vec_with_registry!( + let state_age = register_guarded_int_gauge_vec_with_registry!( "rdkafka_consumer_group_state_age", "Age of the consumer group state in seconds", &["id", "client_id", "state"], registry ) .unwrap(); - let rebalance_age = register_int_gauge_vec_with_registry!( + let rebalance_age = register_guarded_int_gauge_vec_with_registry!( "rdkafka_consumer_group_rebalance_age", "Age of the last rebalance in seconds", &["id", "client_id", "state"], registry ) .unwrap(); - let rebalance_cnt = register_int_gauge_vec_with_registry!( + let rebalance_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_consumer_group_rebalance_cnt", "Number of rebalances", &["id", "client_id", "state"], registry ) .unwrap(); - let assignment_size = register_int_gauge_vec_with_registry!( + let assignment_size = register_guarded_int_gauge_vec_with_registry!( "rdkafka_consumer_group_assignment_size", "Number of assigned partitions", &["id", "client_id", "state"], @@ -164,16 +167,16 @@ impl ConsumerGroupStats { pub fn report(&self, id: &str, client_id: &str, stats: &ConsumerGroup) { let state = stats.state.as_str(); self.state_age - .with_label_values(&[id, client_id, state]) + .with_guarded_label_values(&[id, 
client_id, state]) .set(stats.stateage); self.rebalance_age - .with_label_values(&[id, client_id, state]) + .with_guarded_label_values(&[id, client_id, state]) .set(stats.rebalance_age); self.rebalance_cnt - .with_label_values(&[id, client_id, state]) + .with_guarded_label_values(&[id, client_id, state]) .set(stats.rebalance_cnt); self.assignment_size - .with_label_values(&[id, client_id, state]) + .with_guarded_label_values(&[id, client_id, state]) .set(stats.assignment_size as i64); } } @@ -181,98 +184,98 @@ impl ConsumerGroupStats { impl StatsWindow { pub fn new(registry: Registry, path: &str) -> Self { let get_metric_name = |name: &str| format!("rdkafka_{}_{}", path, name); - let min = register_int_gauge_vec_with_registry!( + let min = register_guarded_int_gauge_vec_with_registry!( get_metric_name("min"), "Minimum value", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let max = register_int_gauge_vec_with_registry!( + let max = register_guarded_int_gauge_vec_with_registry!( get_metric_name("max"), "Maximum value", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let avg = register_int_gauge_vec_with_registry!( + let avg = register_guarded_int_gauge_vec_with_registry!( get_metric_name("avg"), "Average value", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let sum = register_int_gauge_vec_with_registry!( + let sum = register_guarded_int_gauge_vec_with_registry!( get_metric_name("sum"), "Sum of values", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let cnt = register_int_gauge_vec_with_registry!( + let cnt = register_guarded_int_gauge_vec_with_registry!( get_metric_name("cnt"), "Count of values", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let stddev = register_int_gauge_vec_with_registry!( + let stddev = register_guarded_int_gauge_vec_with_registry!( get_metric_name("stddev"), "Standard deviation", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let hdr_size = register_int_gauge_vec_with_registry!( + let hdr_size = register_guarded_int_gauge_vec_with_registry!( get_metric_name("hdrsize"), "Size of the histogram header", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p50 = register_int_gauge_vec_with_registry!( + let p50 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p50"), "50th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p75 = register_int_gauge_vec_with_registry!( + let p75 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p75"), "75th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p90 = register_int_gauge_vec_with_registry!( + let p90 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p90"), "90th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p95 = register_int_gauge_vec_with_registry!( + let p95 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p95"), "95th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p99 = register_int_gauge_vec_with_registry!( + let p99 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p99"), "99th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p99_99 = register_int_gauge_vec_with_registry!( + let p99_99 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p99_99"), "99.99th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let 
out_of_range = register_int_gauge_vec_with_registry!( + let out_of_range = register_guarded_int_gauge_vec_with_registry!( get_metric_name("out_of_range"), "Out of range values", &["id", "client_id", "broker", "topic"], @@ -302,26 +305,32 @@ impl StatsWindow { pub fn report(&self, id: &str, client_id: &str, broker: &str, topic: &str, stats: &Window) { let labels = [id, client_id, broker, topic]; - self.min.with_label_values(&labels).set(stats.min); - self.max.with_label_values(&labels).set(stats.max); - self.avg.with_label_values(&labels).set(stats.avg); - self.sum.with_label_values(&labels).set(stats.sum); - self.cnt.with_label_values(&labels).set(stats.cnt); - self.stddev.with_label_values(&labels).set(stats.stddev); - self.hdr_size.with_label_values(&labels).set(stats.hdrsize); - self.p50.with_label_values(&labels).set(stats.p50); - self.p75.with_label_values(&labels).set(stats.p75); - self.p90.with_label_values(&labels).set(stats.p90); - self.p99_99.with_label_values(&labels).set(stats.p99_99); + self.min.with_guarded_label_values(&labels).set(stats.min); + self.max.with_guarded_label_values(&labels).set(stats.max); + self.avg.with_guarded_label_values(&labels).set(stats.avg); + self.sum.with_guarded_label_values(&labels).set(stats.sum); + self.cnt.with_guarded_label_values(&labels).set(stats.cnt); + self.stddev + .with_guarded_label_values(&labels) + .set(stats.stddev); + self.hdr_size + .with_guarded_label_values(&labels) + .set(stats.hdrsize); + self.p50.with_guarded_label_values(&labels).set(stats.p50); + self.p75.with_guarded_label_values(&labels).set(stats.p75); + self.p90.with_guarded_label_values(&labels).set(stats.p90); + self.p99_99 + .with_guarded_label_values(&labels) + .set(stats.p99_99); self.out_of_range - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.outofrange); } } impl TopicStats { pub fn new(registry: Registry) -> Self { - let metadata_age = register_int_gauge_vec_with_registry!( + let metadata_age = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_metadata_age", "Age of the topic metadata in milliseconds", &["id", "client_id", "topic"], @@ -348,7 +357,7 @@ impl TopicStats { fn report_inner(&self, id: &str, client_id: &str, topic: &str, stats: &Topic) { self.metadata_age - .with_label_values(&[id, client_id, topic]) + .with_guarded_label_values(&[id, client_id, topic]) .set(stats.metadata_age); self.batch_size .report(id, client_id, "", topic, &stats.batchsize); @@ -362,212 +371,212 @@ impl TopicStats { pub struct PartitionStats { pub registry: Registry, - pub msgq_cnt: IntGaugeVec, - pub msgq_bytes: GenericGaugeVec, - pub xmit_msgq_cnt: IntGaugeVec, - pub xmit_msgq_bytes: GenericGaugeVec, - pub fetchq_cnt: IntGaugeVec, - pub fetchq_size: GenericGaugeVec, - pub query_offset: IntGaugeVec, - pub next_offset: IntGaugeVec, - pub app_offset: IntGaugeVec, - pub stored_offset: IntGaugeVec, - pub committed_offset: IntGaugeVec, - pub eof_offset: IntGaugeVec, - pub lo_offset: IntGaugeVec, - pub hi_offset: IntGaugeVec, - pub consumer_lag: IntGaugeVec, - pub consumer_lag_store: IntGaugeVec, - pub txmsgs: GenericGaugeVec, - pub txbytes: GenericGaugeVec, - pub rxmsgs: GenericGaugeVec, - pub rxbytes: GenericGaugeVec, - pub msgs: GenericGaugeVec, - pub rx_ver_drops: GenericGaugeVec, - pub msgs_inflight: IntGaugeVec, - pub next_ack_seq: IntGaugeVec, - pub next_err_seq: IntGaugeVec, - pub acked_msgid: GenericGaugeVec, + pub msgq_cnt: LabelGuardedIntGaugeVec<4>, + pub msgq_bytes: LabelGuardedUintGaugeVec<4>, + pub xmit_msgq_cnt: 
LabelGuardedIntGaugeVec<4>, + pub xmit_msgq_bytes: LabelGuardedUintGaugeVec<4>, + pub fetchq_cnt: LabelGuardedIntGaugeVec<4>, + pub fetchq_size: LabelGuardedUintGaugeVec<4>, + pub query_offset: LabelGuardedIntGaugeVec<4>, + pub next_offset: LabelGuardedIntGaugeVec<4>, + pub app_offset: LabelGuardedIntGaugeVec<4>, + pub stored_offset: LabelGuardedIntGaugeVec<4>, + pub committed_offset: LabelGuardedIntGaugeVec<4>, + pub eof_offset: LabelGuardedIntGaugeVec<4>, + pub lo_offset: LabelGuardedIntGaugeVec<4>, + pub hi_offset: LabelGuardedIntGaugeVec<4>, + pub consumer_lag: LabelGuardedIntGaugeVec<4>, + pub consumer_lag_store: LabelGuardedIntGaugeVec<4>, + pub txmsgs: LabelGuardedUintGaugeVec<4>, + pub txbytes: LabelGuardedUintGaugeVec<4>, + pub rxmsgs: LabelGuardedUintGaugeVec<4>, + pub rxbytes: LabelGuardedUintGaugeVec<4>, + pub msgs: LabelGuardedUintGaugeVec<4>, + pub rx_ver_drops: LabelGuardedUintGaugeVec<4>, + pub msgs_inflight: LabelGuardedIntGaugeVec<4>, + pub next_ack_seq: LabelGuardedIntGaugeVec<4>, + pub next_err_seq: LabelGuardedIntGaugeVec<4>, + pub acked_msgid: LabelGuardedUintGaugeVec<4>, } impl PartitionStats { pub fn new(registry: Registry) -> Self { - let msgq_cnt = register_int_gauge_vec_with_registry!( + let msgq_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_msgq_cnt", "Number of messages in the producer queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let msgq_bytes = register_uint_gauge_vec_with_registry!( + let msgq_bytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_msgq_bytes", "Size of messages in the producer queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let xmit_msgq_cnt = register_int_gauge_vec_with_registry!( + let xmit_msgq_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_xmit_msgq_cnt", "Number of messages in the transmit queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let xmit_msgq_bytes = register_uint_gauge_vec_with_registry!( + let xmit_msgq_bytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_xmit_msgq_bytes", "Size of messages in the transmit queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let fetchq_cnt = register_int_gauge_vec_with_registry!( + let fetchq_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_fetchq_cnt", "Number of messages in the fetch queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let fetchq_size = register_uint_gauge_vec_with_registry!( + let fetchq_size = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_fetchq_size", "Size of messages in the fetch queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let query_offset = register_int_gauge_vec_with_registry!( + let query_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_query_offset", "Current query offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let next_offset = register_int_gauge_vec_with_registry!( + let next_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_next_offset", "Next offset to query", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let app_offset = register_int_gauge_vec_with_registry!( + let app_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_app_offset", "Last acknowledged offset", &["id", "client_id", 
"topic", "partition"], registry ) .unwrap(); - let stored_offset = register_int_gauge_vec_with_registry!( + let stored_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_stored_offset", "Last stored offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let committed_offset = register_int_gauge_vec_with_registry!( + let committed_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_committed_offset", "Last committed offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let eof_offset = register_int_gauge_vec_with_registry!( + let eof_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_eof_offset", "Last offset in broker log", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let lo_offset = register_int_gauge_vec_with_registry!( + let lo_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_lo_offset", "Low offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let hi_offset = register_int_gauge_vec_with_registry!( + let hi_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_hi_offset", "High offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let consumer_lag = register_int_gauge_vec_with_registry!( + let consumer_lag = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_consumer_lag", "Consumer lag", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let consumer_lag_store = register_int_gauge_vec_with_registry!( + let consumer_lag_store = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_consumer_lag_store", "Consumer lag stored", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let txmsgs = register_uint_gauge_vec_with_registry!( + let txmsgs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_txmsgs", "Number of transmitted messages", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let txbytes = register_uint_gauge_vec_with_registry!( + let txbytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_txbytes", "Number of transmitted bytes", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let rxmsgs = register_uint_gauge_vec_with_registry!( + let rxmsgs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_rxmsgs", "Number of received messages", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let rxbytes = register_uint_gauge_vec_with_registry!( + let rxbytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_rxbytes", "Number of received bytes", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let msgs = register_uint_gauge_vec_with_registry!( + let msgs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_msgs", "Number of messages in partition", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let rx_ver_drops = register_uint_gauge_vec_with_registry!( + let rx_ver_drops = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_rx_ver_drops", "Number of received messages dropped due to version mismatch", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let msgs_inflight = register_int_gauge_vec_with_registry!( + let msgs_inflight = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_msgs_inflight", 
"Number of messages in-flight", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let next_ack_seq = register_int_gauge_vec_with_registry!( + let next_ack_seq = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_next_ack_seq", "Next ack sequence number", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let next_err_seq = register_int_gauge_vec_with_registry!( + let next_err_seq = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_next_err_seq", "Next error sequence number", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let acked_msgid = register_uint_gauge_vec_with_registry!( + let acked_msgid = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_acked_msgid", "Acknowledged message ID", &["id", "client_id", "topic", "partition"], @@ -615,78 +624,88 @@ impl PartitionStats { fn report_inner(&self, id: &str, client_id: &str, topic: &str, stats: &Partition) { let labels = [id, client_id, topic, &stats.partition.to_string()]; - self.msgq_cnt.with_label_values(&labels).set(stats.msgq_cnt); + self.msgq_cnt + .with_guarded_label_values(&labels) + .set(stats.msgq_cnt); self.msgq_bytes - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.msgq_bytes); self.xmit_msgq_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.xmit_msgq_cnt); self.xmit_msgq_bytes - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.xmit_msgq_bytes); self.fetchq_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.fetchq_cnt); self.fetchq_size - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.fetchq_size); self.query_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.query_offset); self.next_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.next_offset); self.app_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.app_offset); self.stored_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.stored_offset); self.committed_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.committed_offset); self.eof_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.eof_offset); self.lo_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.lo_offset); self.hi_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.hi_offset); self.consumer_lag - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.consumer_lag); self.consumer_lag_store - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.consumer_lag_stored); - self.txmsgs.with_label_values(&labels).set(stats.txmsgs); - self.txbytes.with_label_values(&labels).set(stats.txbytes); - self.rxmsgs.with_label_values(&labels).set(stats.rxmsgs); - self.rxbytes.with_label_values(&labels).set(stats.rxbytes); - self.msgs.with_label_values(&labels).set(stats.msgs); + self.txmsgs + .with_guarded_label_values(&labels) + .set(stats.txmsgs); + self.txbytes + .with_guarded_label_values(&labels) + .set(stats.txbytes); + self.rxmsgs + .with_guarded_label_values(&labels) + .set(stats.rxmsgs); + self.rxbytes + .with_guarded_label_values(&labels) + .set(stats.rxbytes); + self.msgs.with_guarded_label_values(&labels).set(stats.msgs); 
self.rx_ver_drops - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.rx_ver_drops); self.msgs_inflight - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.msgs_inflight); self.next_ack_seq - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.next_ack_seq); self.next_err_seq - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.next_err_seq); self.acked_msgid - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.acked_msgid); } } impl RdKafkaStats { pub fn new(registry: Registry) -> Self { - let ts = register_int_gauge_vec_with_registry!( + let ts = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_ts", "librdkafka's internal monotonic clock (microseconds)", // we cannot tell whether it is for consumer or producer, @@ -695,119 +714,119 @@ impl RdKafkaStats { registry ) .unwrap(); - let time = register_int_gauge_vec_with_registry!( + let time = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_time", "Wall clock time in seconds since the epoch", &["id", "client_id"], registry ) .unwrap(); - let age = register_int_gauge_vec_with_registry!( + let age = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_age", "Age of the topic metadata in milliseconds", &["id", "client_id"], registry ) .unwrap(); - let replyq = register_int_gauge_vec_with_registry!( + let replyq = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_replyq", "Number of replies waiting to be served", &["id", "client_id"], registry ) .unwrap(); - let msg_cnt = register_uint_gauge_vec_with_registry!( + let msg_cnt = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_top_msg_cnt", "Number of messages in all topics", &["id", "client_id"], registry ) .unwrap(); - let msg_size = register_uint_gauge_vec_with_registry!( + let msg_size = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_top_msg_size", "Size of messages in all topics", &["id", "client_id"], registry ) .unwrap(); - let msg_max = register_uint_gauge_vec_with_registry!( + let msg_max = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_top_msg_max", "Maximum message size in all topics", &["id", "client_id"], registry ) .unwrap(); - let msg_size_max = register_uint_gauge_vec_with_registry!( + let msg_size_max = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_top_msg_size_max", "Maximum message size in all topics", &["id", "client_id"], registry ) .unwrap(); - let tx = register_int_gauge_vec_with_registry!( + let tx = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_tx", "Number of transmitted messages", &["id", "client_id"], registry ) .unwrap(); - let tx_bytes = register_int_gauge_vec_with_registry!( + let tx_bytes = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_tx_bytes", "Number of transmitted bytes", &["id", "client_id"], registry ) .unwrap(); - let rx = register_int_gauge_vec_with_registry!( + let rx = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_rx", "Number of received messages", &["id", "client_id"], registry ) .unwrap(); - let rx_bytes = register_int_gauge_vec_with_registry!( + let rx_bytes = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_rx_bytes", "Number of received bytes", &["id", "client_id"], registry ) .unwrap(); - let tx_msgs = register_int_gauge_vec_with_registry!( + let tx_msgs = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_tx_msgs", "Number of transmitted messages", &["id", "client_id"], registry ) 
.unwrap(); - let tx_msgs_bytes = register_int_gauge_vec_with_registry!( + let tx_msgs_bytes = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_tx_msgs_bytes", "Number of transmitted bytes", &["id", "client_id"], registry ) .unwrap(); - let rx_msgs = register_int_gauge_vec_with_registry!( + let rx_msgs = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_rx_msgs", "Number of received messages", &["id", "client_id"], registry ) .unwrap(); - let rx_msgs_bytes = register_int_gauge_vec_with_registry!( + let rx_msgs_bytes = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_rx_msgs_bytes", "Number of received bytes", &["id", "client_id"], registry ) .unwrap(); - let simple_cnt = register_int_gauge_vec_with_registry!( + let simple_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_simple_cnt", "Number of simple consumer queues", &["id", "client_id"], registry ) .unwrap(); - let metadata_cache_cnt = register_int_gauge_vec_with_registry!( + let metadata_cache_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_metadata_cache_cnt", "Number of entries in the metadata cache", &["id", "client_id"], @@ -846,51 +865,59 @@ impl RdKafkaStats { pub fn report(&self, id: &str, stats: &Statistics) { let client_id = stats.name.as_str(); - self.ts.with_label_values(&[id, client_id]).set(stats.ts); + self.ts + .with_guarded_label_values(&[id, client_id]) + .set(stats.ts); self.time - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.time); - self.age.with_label_values(&[id, client_id]).set(stats.age); + self.age + .with_guarded_label_values(&[id, client_id]) + .set(stats.age); self.replyq - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.replyq); self.msg_cnt - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.msg_cnt); self.msg_size - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.msg_size); self.msg_max - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.msg_max); self.msg_size_max - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.msg_size_max); - self.tx.with_label_values(&[id, client_id]).set(stats.tx); + self.tx + .with_guarded_label_values(&[id, client_id]) + .set(stats.tx); self.tx_bytes - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.tx_bytes); - self.rx.with_label_values(&[id, client_id]).set(stats.rx); + self.rx + .with_guarded_label_values(&[id, client_id]) + .set(stats.rx); self.rx_bytes - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.rx_bytes); self.tx_msgs - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.txmsgs); self.tx_msgs_bytes - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.txmsg_bytes); self.rx_msgs - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.rxmsgs); self.rx_msgs_bytes - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.rxmsg_bytes); self.simple_cnt - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.simple_cnt); self.metadata_cache_cnt - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) 
.set(stats.metadata_cache_cnt); self.broker_stats.report(id, client_id, stats); @@ -903,161 +930,161 @@ impl RdKafkaStats { impl BrokerStats { pub fn new(registry: Registry) -> Self { - let state_age = register_int_gauge_vec_with_registry!( + let state_age = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_state_age", "Age of the broker state in seconds", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let outbuf_cnt = register_int_gauge_vec_with_registry!( + let outbuf_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_outbuf_cnt", "Number of messages waiting to be sent to broker", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let outbuf_msg_cnt = register_int_gauge_vec_with_registry!( + let outbuf_msg_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_outbuf_msg_cnt", "Number of messages waiting to be sent to broker", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let waitresp_cnt = register_int_gauge_vec_with_registry!( + let waitresp_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_waitresp_cnt", "Number of requests waiting for response", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let waitresp_msg_cnt = register_int_gauge_vec_with_registry!( + let waitresp_msg_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_waitresp_msg_cnt", "Number of messages waiting for response", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let tx = register_uint_gauge_vec_with_registry!( + let tx = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_tx", "Number of transmitted messages", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let tx_bytes = register_uint_gauge_vec_with_registry!( + let tx_bytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_tx_bytes", "Number of transmitted bytes", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let tx_errs = register_uint_gauge_vec_with_registry!( + let tx_errs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_tx_errs", "Number of failed transmitted messages", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let tx_retries = register_uint_gauge_vec_with_registry!( + let tx_retries = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_tx_retries", "Number of message retries", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let tx_idle = register_int_gauge_vec_with_registry!( + let tx_idle = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_tx_idle", "Number of idle transmit connections", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let req_timeouts = register_uint_gauge_vec_with_registry!( + let req_timeouts = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_req_timeouts", "Number of request timeouts", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx = register_uint_gauge_vec_with_registry!( + let rx = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_rx", "Number of received messages", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx_bytes = register_uint_gauge_vec_with_registry!( + let rx_bytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_rx_bytes", "Number of received bytes", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx_errs = register_uint_gauge_vec_with_registry!( + let rx_errs = 
register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_rx_errs", "Number of failed received messages", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx_corriderrs = register_uint_gauge_vec_with_registry!( + let rx_corriderrs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_rx_corriderrs", "Number of received messages with invalid correlation id", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx_partial = register_uint_gauge_vec_with_registry!( + let rx_partial = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_rx_partial", "Number of partial messages received", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx_idle = register_int_gauge_vec_with_registry!( + let rx_idle = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_rx_idle", "Number of idle receive connections", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let req = register_int_gauge_vec_with_registry!( + let req = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_req", "Number of requests in flight", &["id", "client_id", "broker", "state", "type"], registry ) .unwrap(); - let zbuf_grow = register_uint_gauge_vec_with_registry!( + let zbuf_grow = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_zbuf_grow", "Number of times the broker's output buffer has been reallocated", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let buf_grow = register_uint_gauge_vec_with_registry!( + let buf_grow = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_buf_grow", "Number of times the broker's input buffer has been reallocated", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let wakeups = register_uint_gauge_vec_with_registry!( + let wakeups = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_wakeups", "Number of wakeups", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let connects = register_int_gauge_vec_with_registry!( + let connects = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_connects", "Number of connection attempts", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let disconnects = register_int_gauge_vec_with_registry!( + let disconnects = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_disconnects", "Number of disconnects", &["id", "client_id", "broker", "state"], @@ -1113,57 +1140,75 @@ impl BrokerStats { let labels = [id, client_id, broker, state]; self.state_age - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.stateage); self.outbuf_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.outbuf_cnt); self.outbuf_msg_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.outbuf_msg_cnt); self.waitresp_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.waitresp_cnt); self.waitresp_msg_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.waitresp_msg_cnt); - self.tx.with_label_values(&labels).set(stats.tx); - self.tx_bytes.with_label_values(&labels).set(stats.txbytes); - self.tx_errs.with_label_values(&labels).set(stats.txerrs); + self.tx.with_guarded_label_values(&labels).set(stats.tx); + self.tx_bytes + .with_guarded_label_values(&labels) + .set(stats.txbytes); + self.tx_errs + .with_guarded_label_values(&labels) + .set(stats.txerrs); self.tx_retries - .with_label_values(&labels) + 
.with_guarded_label_values(&labels) .set(stats.txretries); - self.tx_idle.with_label_values(&labels).set(stats.txidle); + self.tx_idle + .with_guarded_label_values(&labels) + .set(stats.txidle); self.req_timeouts - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.req_timeouts); - self.rx.with_label_values(&labels).set(stats.rx); - self.rx_bytes.with_label_values(&labels).set(stats.rxbytes); - self.rx_errs.with_label_values(&labels).set(stats.rxerrs); + self.rx.with_guarded_label_values(&labels).set(stats.rx); + self.rx_bytes + .with_guarded_label_values(&labels) + .set(stats.rxbytes); + self.rx_errs + .with_guarded_label_values(&labels) + .set(stats.rxerrs); self.rx_corriderrs - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.rxcorriderrs); self.rx_partial - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.rxpartial); - self.rx_idle.with_label_values(&labels).set(stats.rxidle); + self.rx_idle + .with_guarded_label_values(&labels) + .set(stats.rxidle); for (req_type, req_cnt) in &stats.req { self.req - .with_label_values(&[id, client_id, broker, state, req_type]) + .with_guarded_label_values(&[id, client_id, broker, state, req_type]) .set(*req_cnt); } self.zbuf_grow - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.zbuf_grow); - self.buf_grow.with_label_values(&labels).set(stats.buf_grow); + self.buf_grow + .with_guarded_label_values(&labels) + .set(stats.buf_grow); if let Some(wakeups) = stats.wakeups { - self.wakeups.with_label_values(&labels).set(wakeups); + self.wakeups.with_guarded_label_values(&labels).set(wakeups); } if let Some(connects) = stats.connects { - self.connects.with_label_values(&labels).set(connects); + self.connects + .with_guarded_label_values(&labels) + .set(connects); } if let Some(disconnects) = stats.disconnects { - self.disconnects.with_label_values(&labels).set(disconnects); + self.disconnects + .with_guarded_label_values(&labels) + .set(disconnects); } if let Some(int_latency) = &stats.int_latency { self.int_latency diff --git a/src/connector/src/source/pulsar/mod.rs b/src/connector/src/source/pulsar/mod.rs index 5d6d111b13bff..ffbc3be495bf9 100644 --- a/src/connector/src/source/pulsar/mod.rs +++ b/src/connector/src/source/pulsar/mod.rs @@ -74,6 +74,16 @@ pub struct PulsarProperties { #[serde(rename = "iceberg.bucket", default)] pub iceberg_bucket: Option, + /// Specify a custom consumer group id prefix for the source. + /// Defaults to `rw-consumer`. + /// + /// Notes: + /// - Each job (materialized view) will have multiple subscriptions and + /// contains a generated suffix in the subscription name. + /// The subscription name will be `{subscription_name_prefix}-{fragment_id}-{actor_id}`. 
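A tiny sketch makes the composed subscription name concrete; the constant and the formatting mirror the Pulsar reader changes further down in this patch, and the `subscription.name.prefix` field itself is declared just below.

```rust
// Illustration only, not a public API; mirrors the reader's formatting.
const PULSAR_DEFAULT_SUBSCRIPTION_PREFIX: &str = "rw-consumer";

fn subscription_name(prefix: Option<&str>, fragment_id: u32, actor_id: u32) -> String {
    format!(
        "{}-{}-{}",
        prefix.unwrap_or(PULSAR_DEFAULT_SUBSCRIPTION_PREFIX),
        fragment_id,
        actor_id
    )
}

fn main() {
    assert_eq!(subscription_name(None, 3, 7), "rw-consumer-3-7");
    assert_eq!(subscription_name(Some("analytics"), 3, 7), "analytics-3-7");
}
```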
+ #[serde(rename = "subscription.name.prefix")] + pub subscription_name_prefix: Option, + #[serde(flatten)] pub unknown_fields: HashMap, } diff --git a/src/connector/src/source/pulsar/source/reader.rs b/src/connector/src/source/pulsar/source/reader.rs index 212c459388b25..20f6872474e88 100644 --- a/src/connector/src/source/pulsar/source/reader.rs +++ b/src/connector/src/source/pulsar/source/reader.rs @@ -42,6 +42,8 @@ use crate::source::{ SplitMetaData, SplitReader, }; +const PULSAR_DEFAULT_SUBSCRIPTION_PREFIX: &str = "rw-consumer"; + pub enum PulsarSplitReader { Broker(PulsarBrokerReader), Iceberg(PulsarIcebergReader), @@ -174,8 +176,12 @@ impl SplitReader for PulsarBrokerReader { .with_topic(&topic) .with_subscription_type(SubType::Exclusive) .with_subscription(format!( - "rw-consumer-{}-{}", - source_ctx.fragment_id, source_ctx.actor_id + "{}-{}-{}", + props + .subscription_name_prefix + .unwrap_or(PULSAR_DEFAULT_SUBSCRIPTION_PREFIX.to_string()), + source_ctx.fragment_id, + source_ctx.actor_id )); let builder = match split.start_offset.clone() { diff --git a/src/connector/src/source/reader/reader.rs b/src/connector/src/source/reader/reader.rs index 95764792c0025..9a7cb1e440e9f 100644 --- a/src/connector/src/source/reader/reader.rs +++ b/src/connector/src/source/reader/reader.rs @@ -93,27 +93,47 @@ impl SourceReader { match config { ConnectorProperties::Gcs(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_gcs_source(*prop)?; - Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) + Ok(build_opendal_fs_list_stream( + lister, + list_interval_sec, + recursive_scan, + )) } ConnectorProperties::OpendalS3(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_s3_source(prop.s3_properties, prop.assume_role)?; - Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) + Ok(build_opendal_fs_list_stream( + lister, + list_interval_sec, + recursive_scan, + )) } ConnectorProperties::Azblob(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_azblob_source(*prop)?; - Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) + Ok(build_opendal_fs_list_stream( + lister, + list_interval_sec, + recursive_scan, + )) } ConnectorProperties::PosixFs(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_posix_fs_source(*prop)?; - Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) + Ok(build_opendal_fs_list_stream( + lister, + list_interval_sec, + recursive_scan, + )) } other => bail!("Unsupported source: {:?}", other), } @@ -264,10 +284,11 @@ impl SourceReader { async fn build_opendal_fs_list_stream( lister: OpendalEnumerator, list_interval_sec: u64, + recursive_scan: bool, ) { loop { let matcher = lister.get_matcher(); - let mut object_metadata_iter = lister.list().await?; + let mut object_metadata_iter = lister.list(recursive_scan).await?; while let Some(list_res) = object_metadata_iter.next().await { match list_res { @@ -294,9 
+315,12 @@ async fn build_opendal_fs_list_stream( } #[try_stream(boxed, ok = OpendalFsSplit, error = crate::error::ConnectorError)] -pub async fn build_opendal_fs_list_for_batch(lister: OpendalEnumerator) { +pub async fn build_opendal_fs_list_for_batch( + lister: OpendalEnumerator, + recursive_scan: bool, +) { let matcher = lister.get_matcher(); - let mut object_metadata_iter = lister.list().await?; + let mut object_metadata_iter = lister.list(recursive_scan).await?; while let Some(list_res) = object_metadata_iter.next().await { match list_res { diff --git a/src/connector/with_options_sink.yaml b/src/connector/with_options_sink.yaml index cc92f9a0a664a..1af3435eaea24 100644 --- a/src/connector/with_options_sink.yaml +++ b/src/connector/with_options_sink.yaml @@ -115,7 +115,7 @@ ClickHouseConfig: field_type: u64 comments: Commit every n(>0) checkpoints, default is 10. required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: r#type field_type: String required: true @@ -143,7 +143,7 @@ DeltaLakeConfig: field_type: u64 comments: Commit every n(>0) checkpoints, default is 10. required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: r#type field_type: String required: true @@ -339,7 +339,7 @@ IcebergConfig: field_type: u64 comments: Commit every n(>0) checkpoints, default is 10. required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: create_table_if_not_exists field_type: bool required: false @@ -373,14 +373,26 @@ KafkaConfig: field_type: String comments: Path to CA certificate file for verifying the broker's key. required: false + - name: properties.ssl.ca.pem + field_type: String + comments: CA certificate string (PEM format) for verifying the broker's key. + required: false - name: properties.ssl.certificate.location field_type: String comments: Path to client's certificate file (PEM). required: false + - name: properties.ssl.certificate.pem + field_type: String + comments: Client's public key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.location field_type: String comments: Path to client's private key file (PEM). required: false + - name: properties.ssl.key.pem + field_type: String + comments: Client's private key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.password field_type: String comments: Passphrase of client's private key. @@ -1009,7 +1021,7 @@ StarrocksConfig: also, in this time, the `sink_decouple` option should be enabled as well. 
Defaults to 10 if commit_checkpoint_interval <= 0 required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: starrocks.partial_update field_type: String comments: Enable partial update diff --git a/src/connector/with_options_source.yaml b/src/connector/with_options_source.yaml index 4eaf1e0d3db4b..c54dce97ad1cd 100644 --- a/src/connector/with_options_source.yaml +++ b/src/connector/with_options_source.yaml @@ -24,6 +24,10 @@ AzblobProperties: - name: refresh.interval.sec field_type: u64 required: false + - name: recursive_scan + field_type: bool + required: false + default: Default::default - name: compression_format field_type: CompressionFormat required: false @@ -75,6 +79,10 @@ GcsProperties: - name: refresh.interval.sec field_type: u64 required: false + - name: recursive_scan + field_type: bool + required: false + default: Default::default - name: compression_format field_type: CompressionFormat required: false @@ -199,14 +207,26 @@ KafkaProperties: field_type: String comments: Path to CA certificate file for verifying the broker's key. required: false + - name: properties.ssl.ca.pem + field_type: String + comments: CA certificate string (PEM format) for verifying the broker's key. + required: false - name: properties.ssl.certificate.location field_type: String comments: Path to client's certificate file (PEM). required: false + - name: properties.ssl.certificate.pem + field_type: String + comments: Client's public key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.location field_type: String comments: Path to client's private key file (PEM). required: false + - name: properties.ssl.key.pem + field_type: String + comments: Client's private key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.password field_type: String comments: Passphrase of client's private key. @@ -828,6 +848,10 @@ OpendalS3Properties: - name: refresh.interval.sec field_type: u64 required: false + - name: recursive_scan + field_type: bool + required: false + default: Default::default PosixFsProperties: fields: - name: posix_fs.root @@ -842,6 +866,10 @@ PosixFsProperties: - name: refresh.interval.sec field_type: u64 required: false + - name: recursive_scan + field_type: bool + required: false + default: Default::default - name: compression_format field_type: CompressionFormat required: false @@ -988,6 +1016,17 @@ PulsarProperties: field_type: String required: false default: Default::default + - name: subscription.name.prefix + field_type: String + comments: |- + Specify a custom consumer group id prefix for the source. + Defaults to `rw-consumer`. + + Notes: + - Each job (materialized view) will have multiple subscriptions and + contains a generated suffix in the subscription name. + The subscription name will be `{subscription_name_prefix}-{fragment_id}-{actor_id}`. 
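The `recursive_scan` option added to the Gcs, S3, Azblob and PosixFs sources above is read from `fs_common` and passed to the lister; unset means `false`, which I take to be the previous non-recursive listing behaviour. A reduced sketch of that shared plumbing (the struct and enum names are invented for illustration):

```rust
// Reduced sketch; the real code calls `OpendalEnumerator::list(recursive_scan)`
// for each fs-style connector after reading the flag the same way.
#[derive(Default)]
struct FsCommon {
    recursive_scan: Option<bool>,
}

enum FsSource {
    Gcs(FsCommon),
    OpendalS3(FsCommon),
    Azblob(FsCommon),
    PosixFs(FsCommon),
}

fn recursive_scan(source: &FsSource) -> bool {
    let fs_common = match source {
        FsSource::Gcs(c)
        | FsSource::OpendalS3(c)
        | FsSource::Azblob(c)
        | FsSource::PosixFs(c) => c,
    };
    // Matches `prop.fs_common.recursive_scan.unwrap_or_default()` in the patch.
    fs_common.recursive_scan.unwrap_or_default()
}

fn main() {
    assert!(!recursive_scan(&FsSource::Gcs(FsCommon::default())));
    assert!(recursive_scan(&FsSource::PosixFs(FsCommon {
        recursive_scan: Some(true)
    })));
}
```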
+ required: false S3Properties: fields: - name: s3.region_name diff --git a/src/ctl/src/cmd_impl/hummock/compaction_group.rs b/src/ctl/src/cmd_impl/hummock/compaction_group.rs index a0395d236d504..c41b4c6e25b9e 100644 --- a/src/ctl/src/cmd_impl/hummock/compaction_group.rs +++ b/src/ctl/src/cmd_impl/hummock/compaction_group.rs @@ -131,10 +131,11 @@ pub async fn split_compaction_group( context: &CtlContext, group_id: CompactionGroupId, table_ids_to_new_group: &[StateTableId], + partition_vnode_count: u32, ) -> anyhow::Result<()> { let meta_client = context.meta_client().await?; let new_group_id = meta_client - .split_compaction_group(group_id, table_ids_to_new_group) + .split_compaction_group(group_id, table_ids_to_new_group, partition_vnode_count) .await?; println!( "Succeed: split compaction group {}. tables {:#?} are moved to new group {}.", @@ -284,3 +285,15 @@ pub async fn cancel_compact_task(context: &CtlContext, task_id: u64) -> anyhow:: Ok(()) } + +pub async fn merge_compaction_group( + context: &CtlContext, + left_group_id: CompactionGroupId, + right_group_id: CompactionGroupId, +) -> anyhow::Result<()> { + let meta_client = context.meta_client().await?; + meta_client + .merge_compaction_group(left_group_id, right_group_id) + .await?; + Ok(()) +} diff --git a/src/ctl/src/cmd_impl/meta/cluster_info.rs b/src/ctl/src/cmd_impl/meta/cluster_info.rs index cbc21ca6ec610..76b91d37fbd3c 100644 --- a/src/ctl/src/cmd_impl/meta/cluster_info.rs +++ b/src/ctl/src/cmd_impl/meta/cluster_info.rs @@ -31,7 +31,7 @@ pub async fn get_cluster_info(context: &CtlContext) -> anyhow::Result anyhow::Result<()> { +pub async fn source_split_info(context: &CtlContext, ignore_id: bool) -> anyhow::Result<()> { let GetClusterInfoResponse { worker_nodes: _, source_infos: _, @@ -40,37 +40,113 @@ pub async fn source_split_info(context: &CtlContext) -> anyhow::Result<()> { revision: _, } = get_cluster_info(context).await?; + let mut actor_splits_map: BTreeMap = BTreeMap::new(); + + // build actor_splits_map for table_fragment in &table_fragments { if table_fragment.actor_splits.is_empty() { continue; } - println!("Table #{}", table_fragment.table_id); - for fragment in table_fragment.fragments.values() { let fragment_type_mask = fragment.fragment_type_mask; if fragment_type_mask & FragmentTypeFlag::Source as u32 == 0 - || fragment_type_mask & FragmentTypeFlag::Dml as u32 != 0 + && fragment_type_mask & FragmentTypeFlag::SourceScan as u32 == 0 { + // no source or source backfill + continue; + } + if fragment_type_mask & FragmentTypeFlag::Dml as u32 != 0 { // skip dummy source for dml fragment continue; } - println!("\tFragment #{}", fragment.fragment_id); for actor in &fragment.actors { if let Some(ConnectorSplits { splits }) = actor_splits.remove(&actor.actor_id) { let splits = splits .iter() .map(|split| SplitImpl::try_from(split).unwrap()) .map(|split| split.id()) - .collect_vec(); + .collect_vec() + .join(","); + actor_splits_map.insert(actor.actor_id, (splits.len(), splits)); + } + } + } + } + // print in the second iteration. 
Otherwise we don't have upstream splits info + for table_fragment in &table_fragments { + if table_fragment.actor_splits.is_empty() { + continue; + } + if ignore_id { + println!("Table"); + } else { + println!("Table #{}", table_fragment.table_id); + } + for fragment in table_fragment.fragments.values() { + let fragment_type_mask = fragment.fragment_type_mask; + if fragment_type_mask & FragmentTypeFlag::Source as u32 == 0 + && fragment_type_mask & FragmentTypeFlag::SourceScan as u32 == 0 + { + // no source or source backfill + continue; + } + if fragment_type_mask & FragmentTypeFlag::Dml as u32 != 0 { + // skip dummy source for dml fragment + continue; + } + + println!( + "\tFragment{} ({})", + if ignore_id { + "".to_string() + } else { + format!(" #{}", fragment.fragment_id) + }, + if fragment_type_mask == FragmentTypeFlag::Source as u32 { + "Source" + } else { + "SourceScan" + } + ); + for actor in &fragment.actors { + if let Some((split_count, splits)) = actor_splits_map.get(&actor.actor_id) { println!( - "\t\tActor #{:<3} ({}): [{}]", - actor.actor_id, - splits.len(), - splits.join(",") + "\t\tActor{} ({} splits): [{}]{}", + if ignore_id { + "".to_string() + } else { + format!(" #{:<3}", actor.actor_id,) + }, + split_count, + splits, + if !actor.upstream_actor_id.is_empty() { + assert!( + actor.upstream_actor_id.len() == 1, + "should have only one upstream actor, got {actor:?}" + ); + let upstream_splits = + actor_splits_map.get(&actor.upstream_actor_id[0]).unwrap(); + format!( + " <- Upstream Actor{}: [{}]", + if ignore_id { + "".to_string() + } else { + format!(" #{}", actor.upstream_actor_id[0]) + }, + upstream_splits.1 + ) + } else { + "".to_string() + } ); + } else { + println!( + "\t\tError: Actor #{:<3} (not found in actor_splits)", + actor.actor_id, + ) } } } diff --git a/src/ctl/src/cmd_impl/table/scan.rs b/src/ctl/src/cmd_impl/table/scan.rs index e5bba170bf97a..f5cee710a40fc 100644 --- a/src/ctl/src/cmd_impl/table/scan.rs +++ b/src/ctl/src/cmd_impl/table/scan.rs @@ -14,6 +14,8 @@ use anyhow::{anyhow, Result}; use futures::{pin_mut, StreamExt}; +use risingwave_common::bitmap::Bitmap; +use risingwave_common::hash::VirtualNode; use risingwave_frontend::TableCatalog; use risingwave_hummock_sdk::HummockReadEpoch; use risingwave_rpc_client::MetaClient; @@ -63,7 +65,8 @@ pub async fn make_state_table(hummock: S, table: &TableCatalog) - .collect(), table.pk().iter().map(|x| x.order_type).collect(), table.pk().iter().map(|x| x.column_index).collect(), - TableDistribution::all(table.distribution_key().to_vec()), // scan all vnodes + // TODO(var-vnode): use vnode count from table desc + TableDistribution::all(table.distribution_key().to_vec(), VirtualNode::COUNT), // scan all vnodes Some(table.value_indices.clone()), ) .await @@ -81,7 +84,8 @@ pub fn make_storage_table( Ok(StorageTable::new_partial( hummock, output_columns_ids, - Some(TableDistribution::all_vnodes()), + // TODO(var-vnode): use vnode count from table desc + Some(Bitmap::ones(VirtualNode::COUNT).into()), &table.table_desc().try_to_protobuf()?, )) } diff --git a/src/ctl/src/lib.rs b/src/ctl/src/lib.rs index d1deba4f99140..b35b8d1e42cb2 100644 --- a/src/ctl/src/lib.rs +++ b/src/ctl/src/lib.rs @@ -276,6 +276,8 @@ enum HummockCommands { compaction_group_id: u64, #[clap(long, value_delimiter = ',')] table_ids: Vec, + #[clap(long, default_value_t = 0)] + partition_vnode_count: u32, }, /// Pause version checkpoint, which subsequently pauses GC of delta log and SST object. 
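Stepping back to the `source-split-info` rewrite above: splits are now collected into `actor_splits_map` in a first pass and printed in a second, so that a `SourceScan` (backfill) actor can also show the splits of its single upstream `Source` actor. A reduced, runnable sketch of that two-pass shape (the flattened input type is invented for illustration):

```rust
use std::collections::BTreeMap;

// Reduced sketch: actor id, its (at most one) upstream actor, and its split ids.
fn print_split_info(actors: &[(u32, Option<u32>, Vec<&str>)]) {
    // Pass 1: index every actor's splits.
    let mut splits_by_actor: BTreeMap<u32, String> = BTreeMap::new();
    for (actor_id, _, splits) in actors {
        splits_by_actor.insert(*actor_id, splits.join(","));
    }
    // Pass 2: print, now that a backfill actor can look up its upstream's splits.
    for (actor_id, upstream, _) in actors {
        let own = &splits_by_actor[actor_id];
        match upstream {
            None => println!("Actor #{actor_id}: [{own}]"),
            Some(up) => println!(
                "Actor #{actor_id}: [{own}] <- Upstream Actor #{up}: [{}]",
                splits_by_actor[up]
            ),
        }
    }
}

fn main() {
    print_split_info(&[
        (1, None, vec!["topic-0", "topic-1"]),
        (2, Some(1), vec!["topic-0", "topic-1"]),
    ]);
}
```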
PauseVersionCheckpoint, @@ -340,6 +342,12 @@ enum HummockCommands { #[clap(long)] record_hybrid_fetch_threshold_ms: Option, }, + MergeCompactionGroup { + #[clap(long)] + left_group_id: u64, + #[clap(long)] + right_group_id: u64, + }, } #[derive(Subcommand)] @@ -404,7 +412,10 @@ enum MetaCommands { /// get cluster info ClusterInfo, /// get source split info - SourceSplitInfo, + SourceSplitInfo { + #[clap(long)] + ignore_id: bool, + }, /// Reschedule the actors in the stream graph /// /// The format is `fragment_id-[worker_id:count]+[worker_id:count]` @@ -708,9 +719,15 @@ async fn start_impl(opts: CliOpts, context: &CtlContext) -> Result<()> { Commands::Hummock(HummockCommands::SplitCompactionGroup { compaction_group_id, table_ids, + partition_vnode_count, }) => { - cmd_impl::hummock::split_compaction_group(context, compaction_group_id, &table_ids) - .await?; + cmd_impl::hummock::split_compaction_group( + context, + compaction_group_id, + &table_ids, + partition_vnode_count, + ) + .await?; } Commands::Hummock(HummockCommands::PauseVersionCheckpoint) => { cmd_impl::hummock::pause_version_checkpoint(context).await?; @@ -787,6 +804,13 @@ async fn start_impl(opts: CliOpts, context: &CtlContext) -> Result<()> { ) .await? } + Commands::Hummock(HummockCommands::MergeCompactionGroup { + left_group_id, + right_group_id, + }) => { + cmd_impl::hummock::merge_compaction_group(context, left_group_id, right_group_id) + .await? + } Commands::Table(TableCommands::Scan { mv_name, data_dir, @@ -808,8 +832,8 @@ async fn start_impl(opts: CliOpts, context: &CtlContext) -> Result<()> { Commands::Meta(MetaCommands::Pause) => cmd_impl::meta::pause(context).await?, Commands::Meta(MetaCommands::Resume) => cmd_impl::meta::resume(context).await?, Commands::Meta(MetaCommands::ClusterInfo) => cmd_impl::meta::cluster_info(context).await?, - Commands::Meta(MetaCommands::SourceSplitInfo) => { - cmd_impl::meta::source_split_info(context).await? + Commands::Meta(MetaCommands::SourceSplitInfo { ignore_id }) => { + cmd_impl::meta::source_split_info(context, ignore_id).await? } Commands::Meta(MetaCommands::Reschedule { from, diff --git a/src/dml/src/lib.rs b/src/dml/src/lib.rs index a15a4dfb3fba9..f0034a630a823 100644 --- a/src/dml/src/lib.rs +++ b/src/dml/src/lib.rs @@ -14,7 +14,6 @@ #![allow(clippy::derive_partial_eq_without_eq)] #![feature(trait_alias)] -#![feature(lint_reasons)] #![feature(coroutines)] #![feature(hash_extract_if)] #![feature(type_alias_impl_trait)] diff --git a/src/error/src/lib.rs b/src/error/src/lib.rs index 4dde816be458b..010308bf95cc8 100644 --- a/src/error/src/lib.rs +++ b/src/error/src/lib.rs @@ -21,7 +21,6 @@ //! access if `risingwave_common` is already a dependency. #![feature(error_generic_member_access)] -#![feature(lint_reasons)] #![feature(register_tool)] #![register_tool(rw)] #![feature(trait_alias)] diff --git a/src/expr/core/src/lib.rs b/src/expr/core/src/lib.rs index d45d4ca11f80a..73e3b6a6ed2e3 100644 --- a/src/expr/core/src/lib.rs +++ b/src/expr/core/src/lib.rs @@ -13,7 +13,6 @@ // limitations under the License. 
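`risectl meta source-split-info` also gains an `--ignore-id` switch; omitting table, fragment and actor ids presumably keeps the output stable across runs (useful when diffing it in tests). A minimal sketch of the conditional formatting:

```rust
// Minimal sketch of the `--ignore-id` formatting; the padding mirrors the patch.
fn actor_label(actor_id: u32, ignore_id: bool) -> String {
    format!(
        "\t\tActor{}",
        if ignore_id {
            "".to_string()
        } else {
            format!(" #{actor_id:<3}")
        }
    )
}

fn main() {
    println!("{}", actor_label(7, false)); // prints "\t\tActor #7  " (id padded to width 3)
    println!("{}", actor_label(7, true)); // prints "\t\tActor"
}
```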
#![feature(let_chains)] -#![feature(lint_reasons)] #![feature(iterator_try_collect)] #![feature(coroutines)] #![feature(never_type)] diff --git a/src/expr/impl/src/lib.rs b/src/expr/impl/src/lib.rs index e5c69c2660eeb..e710749a122d6 100644 --- a/src/expr/impl/src/lib.rs +++ b/src/expr/impl/src/lib.rs @@ -23,7 +23,6 @@ #![allow(non_snake_case)] // for `ctor` generated code #![feature(let_chains)] #![feature(assert_matches)] -#![feature(lint_reasons)] #![feature(iterator_try_collect)] #![feature(coroutines)] #![feature(test)] diff --git a/src/expr/impl/src/scalar/array.rs b/src/expr/impl/src/scalar/array.rs index d5f53213bf277..7b7d272000597 100644 --- a/src/expr/impl/src/scalar/array.rs +++ b/src/expr/impl/src/scalar/array.rs @@ -15,7 +15,7 @@ use risingwave_common::array::{ListValue, StructValue}; use risingwave_common::row::Row; use risingwave_common::types::{ - DataType, ListRef, MapRef, MapType, MapValue, ScalarRefImpl, ToOwnedDatum, + DataType, ListRef, MapRef, MapType, MapValue, ScalarRef, ScalarRefImpl, ToOwnedDatum, }; use risingwave_expr::expr::Context; use risingwave_expr::{function, ExprError}; @@ -241,6 +241,60 @@ fn map_delete(map: MapRef<'_>, key: Option>) -> MapValue { MapValue::delete(map, key) } +/// # Example +/// +/// ```slt +/// query T +/// select map_keys(map{'a':1, 'b':2, 'c':3}); +/// ---- +/// {a,b,c} +/// ``` +#[function( + "map_keys(anymap) -> anyarray", + type_infer = "|args|{ + Ok(DataType::List(Box::new(args[0].as_map().key().clone()))) + }" +)] +fn map_keys(map: MapRef<'_>) -> ListValue { + map.into_kv().0.to_owned_scalar() +} + +/// # Example +/// +/// ```slt +/// query T +/// select map_values(map{'a':1, 'b':2, 'c':3}); +/// ---- +/// {1,2,3} +/// ``` +#[function( + "map_values(anymap) -> anyarray", + type_infer = "|args|{ + Ok(DataType::List(Box::new(args[0].as_map().value().clone()))) + }" +)] +fn map_values(map: MapRef<'_>) -> ListValue { + map.into_kv().1.to_owned_scalar() +} + +/// # Example +/// +/// ```slt +/// query T +/// select map_entries(map{'a':1, 'b':2, 'c':3}); +/// ---- +/// {"(a,1)","(b,2)","(c,3)"} +/// ``` +#[function( + "map_entries(anymap) -> anyarray", + type_infer = "|args|{ + Ok(args[0].as_map().clone().into_list()) + }" +)] +fn map_entries(map: MapRef<'_>) -> ListValue { + map.into_inner().to_owned() +} + #[cfg(test)] mod tests { use risingwave_common::array::DataChunk; diff --git a/src/expr/impl/src/scalar/vnode.rs b/src/expr/impl/src/scalar/vnode.rs index e544c39f62499..edd4caa39970e 100644 --- a/src/expr/impl/src/scalar/vnode.rs +++ b/src/expr/impl/src/scalar/vnode.rs @@ -43,7 +43,8 @@ impl Expression for VnodeExpression { } async fn eval(&self, input: &DataChunk) -> Result { - let vnodes = VirtualNode::compute_chunk(input, &self.dist_key_indices); + // TODO(var-vnode): get vnode count from context + let vnodes = VirtualNode::compute_chunk(input, &self.dist_key_indices, VirtualNode::COUNT); let mut builder = I16ArrayBuilder::new(input.capacity()); vnodes .into_iter() @@ -52,8 +53,9 @@ impl Expression for VnodeExpression { } async fn eval_row(&self, input: &OwnedRow) -> Result { + // TODO(var-vnode): get vnode count from context Ok(Some( - VirtualNode::compute_row(input, &self.dist_key_indices) + VirtualNode::compute_row(input, &self.dist_key_indices, VirtualNode::COUNT) .to_scalar() .into(), )) diff --git a/src/expr/macro/src/lib.rs b/src/expr/macro/src/lib.rs index 8fd03e344db89..630c82a87701b 100644 --- a/src/expr/macro/src/lib.rs +++ b/src/expr/macro/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language 
governing permissions and // limitations under the License. -#![feature(lint_reasons)] #![feature(let_chains)] use std::vec; diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs index 879e375e2b762..5e3261c06d186 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs @@ -59,3 +59,4 @@ mod rw_worker_nodes; mod rw_actor_id_to_ddl; mod rw_fragment_id_to_ddl; +mod rw_worker_actor_count; diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_ddl_progress.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_ddl_progress.rs index 9f592d4e4f6b3..032b0f82907ef 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_ddl_progress.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_ddl_progress.rs @@ -31,7 +31,7 @@ struct RwDdlProgress { #[system_catalog(table, "rw_catalog.rw_ddl_progress")] async fn read(reader: &SysCatalogReaderImpl) -> Result> { - let ddl_progresses = reader.meta_client.list_ddl_progress().await?; + let ddl_progresses = reader.meta_client.get_ddl_progress().await?; let table_ids = ddl_progresses .iter() diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_worker_actor_count.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_worker_actor_count.rs new file mode 100644 index 0000000000000..a336f69b2029f --- /dev/null +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_worker_actor_count.rs @@ -0,0 +1,31 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use risingwave_common::types::Fields; +use risingwave_frontend_macro::system_catalog; + +#[system_catalog( + view, + "rw_catalog.rw_worker_actor_count", + "SELECT t2.id as worker_id, parallelism, count(*) as actor_count + FROM rw_actors t1, rw_worker_nodes t2 + where t1.worker_id = t2.id + GROUP BY t2.id, t2.parallelism;" +)] +#[derive(Fields)] +struct RwWorkerActorCount { + worker_id: i32, + parallelism: i32, + actor_count: i64, +} diff --git a/src/frontend/src/expr/mod.rs b/src/frontend/src/expr/mod.rs index f650fa3cb521b..c7acdfa5c4a3c 100644 --- a/src/frontend/src/expr/mod.rs +++ b/src/frontend/src/expr/mod.rs @@ -988,10 +988,9 @@ impl ExprImpl { _ => return None, }; let list: Vec<_> = inputs - .map(|expr| { + .inspect(|expr| { // Non constant IN will be bound to OR assert!(expr.is_const()); - expr }) .collect(); diff --git a/src/frontend/src/handler/alter_parallelism.rs b/src/frontend/src/handler/alter_parallelism.rs index 3c6ab52f51e39..ee3c26708908c 100644 --- a/src/frontend/src/handler/alter_parallelism.rs +++ b/src/frontend/src/handler/alter_parallelism.rs @@ -103,21 +103,23 @@ pub async fn handle_alter_parallelism( .filter(|w| w.is_streaming_schedulable()) .map(|w| w.parallelism) .sum::(); + // TODO(var-vnode): use vnode count from config + let max_parallelism = VirtualNode::COUNT; let mut builder = RwPgResponse::builder(stmt_type); match &target_parallelism.parallelism { Some(Parallelism::Adaptive(_)) | Some(Parallelism::Auto(_)) => { - if available_parallelism > VirtualNode::COUNT as u32 { - builder = builder.notice(format!("Available parallelism exceeds the maximum parallelism limit, the actual parallelism will be limited to {}", VirtualNode::COUNT)); + if available_parallelism > max_parallelism as u32 { + builder = builder.notice(format!("Available parallelism exceeds the maximum parallelism limit, the actual parallelism will be limited to {max_parallelism}")); } } Some(Parallelism::Fixed(FixedParallelism { parallelism })) => { - if *parallelism > VirtualNode::COUNT as u32 { - builder = builder.notice(format!("Provided parallelism exceeds the maximum parallelism limit, resetting to FIXED({})", VirtualNode::COUNT)); + if *parallelism > max_parallelism as u32 { + builder = builder.notice(format!("Provided parallelism exceeds the maximum parallelism limit, resetting to FIXED({max_parallelism})")); target_parallelism = PbTableParallelism { parallelism: Some(PbParallelism::Fixed(FixedParallelism { - parallelism: VirtualNode::COUNT as u32, + parallelism: max_parallelism as u32, })), }; } diff --git a/src/frontend/src/handler/create_index.rs b/src/frontend/src/handler/create_index.rs index ee6429a85e32e..a6cc1e20548f7 100644 --- a/src/frontend/src/handler/create_index.rs +++ b/src/frontend/src/handler/create_index.rs @@ -25,7 +25,6 @@ use risingwave_common::acl::AclMode; use risingwave_common::catalog::{IndexId, TableDesc, TableId}; use risingwave_common::util::sort_util::{ColumnOrder, OrderType}; use risingwave_pb::catalog::{PbIndex, PbIndexColumnProperties, PbStreamJobStatus, PbTable}; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_pb::user::grant_privilege::Object; use risingwave_sqlparser::ast; use risingwave_sqlparser::ast::{Ident, ObjectName, OrderByExpr}; @@ -448,14 +447,8 @@ pub async fn handle_create_index( include, distributed_by, )?; - let mut graph = build_graph(plan)?; - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); + let graph = 
build_graph(plan)?; + (graph, index_table, index) }; diff --git a/src/frontend/src/handler/create_mv.rs b/src/frontend/src/handler/create_mv.rs index 4399d80811c19..1c8a866db3e06 100644 --- a/src/frontend/src/handler/create_mv.rs +++ b/src/frontend/src/handler/create_mv.rs @@ -20,7 +20,6 @@ use pgwire::pg_response::{PgResponse, StatementType}; use risingwave_common::acl::AclMode; use risingwave_common::catalog::TableId; use risingwave_pb::catalog::PbTable; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_sqlparser::ast::{EmitMode, Ident, ObjectName, Query}; use super::privilege::resolve_relation_privileges; @@ -205,6 +204,9 @@ pub async fn handle_create_mv_bound( ) -> Result { let session = handler_args.session.clone(); + // Check cluster limits + session.check_cluster_limits().await?; + if let Either::Right(resp) = session.check_relation_name_duplicated( name.clone(), StatementType::CREATE_MATERIALIZED_VIEW, @@ -240,18 +242,7 @@ It only indicates the physical clustering of the data, which may improve the per emit_mode, )?; - let context = plan.plan_base().ctx().clone(); - let mut graph = build_graph(plan)?; - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); - // Set the timezone for the stream context - let ctx = graph.ctx.as_mut().unwrap(); - ctx.timezone = context.get_session_timezone(); + let graph = build_graph(plan)?; (table, graph) }; diff --git a/src/frontend/src/handler/create_sink.rs b/src/frontend/src/handler/create_sink.rs index d0bd1d0cc8f2f..9f4f2f63975f1 100644 --- a/src/frontend/src/handler/create_sink.rs +++ b/src/frontend/src/handler/create_sink.rs @@ -35,7 +35,6 @@ use risingwave_connector::sink::{ }; use risingwave_pb::catalog::{PbSink, PbSource, Table}; use risingwave_pb::ddl_service::{ReplaceTablePlan, TableJobType}; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_pb::stream_plan::stream_node::{NodeBody, PbNodeBody}; use risingwave_pb::stream_plan::{MergeNode, StreamFragmentGraph, StreamNode}; use risingwave_sqlparser::ast::{ @@ -419,6 +418,8 @@ pub async fn handle_create_sink( ) -> Result { let session = handle_args.session.clone(); + session.check_cluster_limits().await?; + if let Either::Right(resp) = session.check_relation_name_duplicated( stmt.sink_name.clone(), StatementType::CREATE_SINK, @@ -443,15 +444,7 @@ pub async fn handle_create_sink( ); } - let mut graph = build_graph(plan)?; - - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); + let graph = build_graph(plan)?; (sink, graph, target_table_catalog) }; diff --git a/src/frontend/src/handler/create_source.rs b/src/frontend/src/handler/create_source.rs index 432f814cd4c41..5186c8322095d 100644 --- a/src/frontend/src/handler/create_source.rs +++ b/src/frontend/src/handler/create_source.rs @@ -62,7 +62,6 @@ use risingwave_connector::WithPropertiesExt; use risingwave_pb::catalog::{PbSchemaRegistryNameStrategy, StreamSourceInfo, WatermarkDesc}; use risingwave_pb::plan_common::additional_column::ColumnType as AdditionalColumnType; use risingwave_pb::plan_common::{EncodeType, FormatType}; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_sqlparser::ast::{ get_delimiter, AstString, ColumnDef, ConnectorSchema, CreateSourceStatement, Encode, Format, ObjectName, ProtobufSchema, SourceWatermark, TableConstraint, @@ -1697,15 +1696,7 
@@ pub async fn handle_create_source( )?; let stream_plan = source_node.to_stream(&mut ToStreamContext::new(false))?; - let mut graph = build_graph(stream_plan)?; - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); - graph + build_graph(stream_plan)? }; catalog_writer .create_source_with_graph(source, graph) diff --git a/src/frontend/src/handler/create_table.rs b/src/frontend/src/handler/create_table.rs index a10453a43ea4e..6b3da5d001e60 100644 --- a/src/frontend/src/handler/create_table.rs +++ b/src/frontend/src/handler/create_table.rs @@ -41,7 +41,6 @@ use risingwave_pb::plan_common::column_desc::GeneratedOrDefaultColumn; use risingwave_pb::plan_common::{ AdditionalColumn, ColumnDescVersion, DefaultColumnDesc, GeneratedColumnDesc, }; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_pb::stream_plan::StreamFragmentGraph; use risingwave_sqlparser::ast::{ CdcTableInfo, ColumnDef, ColumnOption, ConnectorSchema, DataType as AstDataType, @@ -1235,6 +1234,8 @@ pub async fn handle_create_table( session.notice_to_user("APPEND ONLY TABLE is currently an experimental feature."); } + session.check_cluster_limits().await?; + if let Either::Right(resp) = session.check_relation_name_duplicated( table_name.clone(), StatementType::CREATE_TABLE, @@ -1261,14 +1262,8 @@ pub async fn handle_create_table( ) .await?; - let mut graph = build_graph(plan)?; - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); + let graph = build_graph(plan)?; + (graph, source, table, job_type) }; @@ -1313,7 +1308,7 @@ pub fn check_create_table_with_source( #[allow(clippy::too_many_arguments)] pub async fn generate_stream_graph_for_table( - session: &Arc, + _session: &Arc, table_name: ObjectName, original_catalog: &Arc, source_schema: Option, @@ -1428,15 +1423,7 @@ pub async fn generate_stream_graph_for_table( ))? } - let graph = StreamFragmentGraph { - parallelism: session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }), - ..build_graph(plan)? - }; + let graph = build_graph(plan)?; // Fill the original table ID. 
let table = Table { diff --git a/src/frontend/src/handler/create_table_as.rs b/src/frontend/src/handler/create_table_as.rs index bb00be2dfa486..27c527969f9b2 100644 --- a/src/frontend/src/handler/create_table_as.rs +++ b/src/frontend/src/handler/create_table_as.rs @@ -16,7 +16,6 @@ use either::Either; use pgwire::pg_response::StatementType; use risingwave_common::catalog::{ColumnCatalog, ColumnDesc}; use risingwave_pb::ddl_service::TableJobType; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_sqlparser::ast::{ColumnDef, ObjectName, OnConflict, Query, Statement}; use super::{HandlerArgs, RwPgResponse}; @@ -110,14 +109,8 @@ pub async fn handle_create_as( with_version_column, Some(col_id_gen.into_version()), )?; - let mut graph = build_graph(plan)?; - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); + let graph = build_graph(plan)?; + (graph, None, table) }; diff --git a/src/frontend/src/handler/show.rs b/src/frontend/src/handler/show.rs index 6cd8b95f95b49..1821ccc289ebc 100644 --- a/src/frontend/src/handler/show.rs +++ b/src/frontend/src/handler/show.rs @@ -450,7 +450,7 @@ pub async fn handle_show_object( .into()); } ShowObject::Jobs => { - let resp = session.env().meta_client().list_ddl_progress().await?; + let resp = session.env().meta_client().get_ddl_progress().await?; let rows = resp.into_iter().map(|job| ShowJobRow { id: job.id as i64, statement: job.statement, diff --git a/src/frontend/src/lib.rs b/src/frontend/src/lib.rs index d8b484e3d6fa2..d3d5d1623bd58 100644 --- a/src/frontend/src/lib.rs +++ b/src/frontend/src/lib.rs @@ -23,7 +23,6 @@ #![feature(if_let_guard)] #![feature(let_chains)] #![feature(assert_matches)] -#![feature(lint_reasons)] #![feature(box_patterns)] #![feature(macro_metavar_expr)] #![feature(min_specialization)] @@ -142,8 +141,9 @@ pub struct FrontendOpts { pub config_path: String, /// Used for control the metrics level, similar to log level. 
- /// 0 = disable metrics - /// >0 = enable metrics + /// + /// level = 0: disable metrics + /// level > 0: enable metrics #[clap(long, hide = true, env = "RW_METRICS_LEVEL")] #[override_opts(path = server.metrics_level)] pub metrics_level: Option, diff --git a/src/frontend/src/meta_client.rs b/src/frontend/src/meta_client.rs index 60fa992bdbe2d..c58dcc365f431 100644 --- a/src/frontend/src/meta_client.rs +++ b/src/frontend/src/meta_client.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use anyhow::Context; use risingwave_common::session_config::SessionConfig; use risingwave_common::system_param::reader::SystemParamsReader; +use risingwave_common::util::cluster_limit::ClusterLimit; use risingwave_hummock_sdk::version::{HummockVersion, HummockVersionDelta}; use risingwave_hummock_sdk::HummockVersionId; use risingwave_pb::backup_service::MetaSnapshotMetadata; @@ -90,7 +91,7 @@ pub trait FrontendMetaClient: Send + Sync { async fn set_session_param(&self, param: String, value: Option) -> Result; - async fn list_ddl_progress(&self) -> Result>; + async fn get_ddl_progress(&self) -> Result>; async fn get_tables(&self, table_ids: &[u32]) -> Result>; @@ -136,6 +137,8 @@ pub trait FrontendMetaClient: Send + Sync { ) -> Result>; async fn get_cluster_recovery_status(&self) -> Result; + + async fn get_cluster_limits(&self) -> Result>; } pub struct FrontendMetaClientImpl(pub MetaClient); @@ -229,7 +232,7 @@ impl FrontendMetaClient for FrontendMetaClientImpl { self.0.set_session_param(param, value).await } - async fn list_ddl_progress(&self) -> Result> { + async fn get_ddl_progress(&self) -> Result> { let ddl_progress = self.0.get_ddl_progress().await?; Ok(ddl_progress) } @@ -345,4 +348,8 @@ impl FrontendMetaClient for FrontendMetaClientImpl { async fn get_cluster_recovery_status(&self) -> Result { self.0.get_cluster_recovery_status().await } + + async fn get_cluster_limits(&self) -> Result> { + self.0.get_cluster_limits().await + } } diff --git a/src/frontend/src/optimizer/delta_join_solver.rs b/src/frontend/src/optimizer/delta_join_solver.rs index 5dc1bb30cc9f9..470fc0426d7d5 100644 --- a/src/frontend/src/optimizer/delta_join_solver.rs +++ b/src/frontend/src/optimizer/delta_join_solver.rs @@ -66,7 +66,8 @@ //! possible that every lookup path produces different distribution. We need to shuffle them //! before feeding data to union. 
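The attribute swap just below replaces `#![expect(dead_code)]` with `#![allow(dead_code)]` to sidestep the rust-analyzer false positive referenced in the FIXME. A minimal standalone sketch of the difference between the two attributes, assuming a toolchain where `#[expect]` works without the `lint_reasons` feature gate (which this change removes elsewhere):

    // `allow` is unconditional; `expect` additionally warns when the expected
    // lint never fires, which is the part rust-analyzer currently mis-reports.
    #[allow(dead_code)] // never produces a diagnostic
    fn unused_with_allow() {}

    #[expect(dead_code)] // would warn "unfulfilled lint expectation" if this fn were actually used
    fn unused_with_expect() {}

    fn main() {}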
-#![expect(dead_code)] +// FIXME: https://github.com/rust-lang/rust-analyzer/issues/17685 +#![allow(dead_code)] use std::collections::{BTreeMap, BTreeSet}; diff --git a/src/frontend/src/optimizer/plan_node/logical_over_window.rs b/src/frontend/src/optimizer/plan_node/logical_over_window.rs index 7a81b164fbafe..bb78380482752 100644 --- a/src/frontend/src/optimizer/plan_node/logical_over_window.rs +++ b/src/frontend/src/optimizer/plan_node/logical_over_window.rs @@ -548,11 +548,10 @@ impl ColPrunable for LogicalOverWindow { let new_window_functions = req_cols_win_func_part .indices() .map(|idx| self.window_functions()[idx - input_len].clone()) - .map(|func| { + .inspect(|func| { tmp.extend(func.args.iter().map(|x| x.index())); tmp.extend(func.partition_by.iter().map(|x| x.index())); tmp.extend(func.order_by.iter().map(|x| x.column_index)); - func }) .collect_vec(); (tmp, new_window_functions) diff --git a/src/frontend/src/optimizer/plan_node/stream_sink.rs b/src/frontend/src/optimizer/plan_node/stream_sink.rs index 2717c454e6435..3e34475c8d4bb 100644 --- a/src/frontend/src/optimizer/plan_node/stream_sink.rs +++ b/src/frontend/src/optimizer/plan_node/stream_sink.rs @@ -212,7 +212,7 @@ impl StreamSink { partition_info: Option, ) -> Result { let columns = derive_columns(input.schema(), out_names, &user_cols)?; - let (input, sink) = Self::derive_sink_desc( + let (input, mut sink) = Self::derive_sink_desc( input, user_distributed_by, name, @@ -241,8 +241,11 @@ impl StreamSink { if connector == TABLE_SINK && sink.target_table.is_none() { unsupported_sink(TABLE_SINK) } else { + SinkType::set_default_commit_checkpoint_interval( + &mut sink, + &input.ctx().session_ctx().config().sink_decouple(), + )?; SinkType::is_sink_decouple( - &sink, &input.ctx().session_ctx().config().sink_decouple(), ) } diff --git a/src/frontend/src/optimizer/rule/index_selection_rule.rs b/src/frontend/src/optimizer/rule/index_selection_rule.rs index 548fda7b92af4..a995dd9878620 100644 --- a/src/frontend/src/optimizer/rule/index_selection_rule.rs +++ b/src/frontend/src/optimizer/rule/index_selection_rule.rs @@ -48,7 +48,7 @@ use std::cmp::min; use std::collections::hash_map::Entry::{Occupied, Vacant}; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::rc::Rc; use itertools::Itertools; @@ -962,17 +962,6 @@ impl ExprVisitor for TableScanIoEstimator<'_> { } } -#[derive(Default)] -struct ExprInputRefFinder { - pub input_ref_index_set: HashSet, -} - -impl ExprVisitor for ExprInputRefFinder { - fn visit_input_ref(&mut self, input_ref: &InputRef) { - self.input_ref_index_set.insert(input_ref.index); - } -} - struct ShiftInputRefRewriter { offset: usize, } diff --git a/src/frontend/src/scheduler/distributed/query_manager.rs b/src/frontend/src/scheduler/distributed/query_manager.rs index 86a54cf9c0f98..2d977cfb675e6 100644 --- a/src/frontend/src/scheduler/distributed/query_manager.rs +++ b/src/frontend/src/scheduler/distributed/query_manager.rs @@ -230,14 +230,13 @@ impl QueryManager { self.query_metrics.clone(), ) .await - .map_err(|err| { + .inspect_err(|_| { // Clean up query execution on error. 
context .session() .env() .query_manager() .delete_query(&query_id); - err })?; Ok(query_result_fetcher.stream_from_channel()) } diff --git a/src/frontend/src/scheduler/distributed/stage.rs b/src/frontend/src/scheduler/distributed/stage.rs index bb18e2143aa7f..e933d3f271108 100644 --- a/src/frontend/src/scheduler/distributed/stage.rs +++ b/src/frontend/src/scheduler/distributed/stage.rs @@ -1028,7 +1028,7 @@ impl StageRunner { .expect("no partition info for seq scan") .into_table() .expect("PartitionInfo should be TablePartitionInfo"); - scan_node.vnode_bitmap = Some(partition.vnode_bitmap); + scan_node.vnode_bitmap = Some(partition.vnode_bitmap.to_protobuf()); scan_node.scan_ranges = partition.scan_ranges; PbPlanNode { children: vec![], @@ -1045,7 +1045,7 @@ impl StageRunner { .expect("no partition info for seq scan") .into_table() .expect("PartitionInfo should be TablePartitionInfo"); - scan_node.vnode_bitmap = Some(partition.vnode_bitmap); + scan_node.vnode_bitmap = Some(partition.vnode_bitmap.to_protobuf()); PbPlanNode { children: vec![], identity, diff --git a/src/frontend/src/scheduler/local.rs b/src/frontend/src/scheduler/local.rs index a727ddd9db7dd..fcd15368bb5fc 100644 --- a/src/frontend/src/scheduler/local.rs +++ b/src/frontend/src/scheduler/local.rs @@ -500,7 +500,7 @@ impl LocalQueryExecution { let partition = partition .into_table() .expect("PartitionInfo should be TablePartitionInfo here"); - scan_node.vnode_bitmap = Some(partition.vnode_bitmap); + scan_node.vnode_bitmap = Some(partition.vnode_bitmap.to_protobuf()); scan_node.scan_ranges = partition.scan_ranges; } } @@ -522,7 +522,7 @@ impl LocalQueryExecution { let partition = partition .into_table() .expect("PartitionInfo should be TablePartitionInfo here"); - scan_node.vnode_bitmap = Some(partition.vnode_bitmap); + scan_node.vnode_bitmap = Some(partition.vnode_bitmap.to_protobuf()); } } _ => unreachable!(), diff --git a/src/frontend/src/scheduler/plan_fragmenter.rs b/src/frontend/src/scheduler/plan_fragmenter.rs index 09e4cbc0bfa03..63b6eef38da71 100644 --- a/src/frontend/src/scheduler/plan_fragmenter.rs +++ b/src/frontend/src/scheduler/plan_fragmenter.rs @@ -30,7 +30,7 @@ use risingwave_common::bail; use risingwave_common::bitmap::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::{Schema, TableDesc}; use risingwave_common::hash::table_distribution::TableDistribution; -use risingwave_common::hash::{VirtualNode, WorkerSlotId, WorkerSlotMapping}; +use risingwave_common::hash::{WorkerSlotId, WorkerSlotMapping}; use risingwave_common::util::scan_range::ScanRange; use risingwave_connector::source::filesystem::opendal_source::opendal_enumerator::OpendalEnumerator; use risingwave_connector::source::filesystem::opendal_source::{ @@ -44,7 +44,6 @@ use risingwave_connector::source::{ }; use risingwave_pb::batch_plan::plan_node::NodeBody; use risingwave_pb::batch_plan::{ExchangeInfo, ScanRange as ScanRangeProto}; -use risingwave_pb::common::Buffer; use risingwave_pb::plan_common::Field as PbField; use risingwave_sqlparser::ast::AsOf; use serde::ser::SerializeStruct; @@ -311,9 +310,11 @@ impl SourceScanInfo { Ok(SourceScanInfo::Complete(split_info)) } ConnectorProperties::OpendalS3(prop) => { + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); + let lister: OpendalEnumerator = OpendalEnumerator::new_s3_source(prop.s3_properties, prop.assume_role)?; - let stream = build_opendal_fs_list_for_batch(lister); + let stream = build_opendal_fs_list_for_batch(lister, recursive_scan); let batch_res: Vec<_> = 
stream.try_collect().await?; let res = batch_res @@ -324,18 +325,22 @@ impl SourceScanInfo { Ok(SourceScanInfo::Complete(res)) } ConnectorProperties::Gcs(prop) => { + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); + let lister: OpendalEnumerator = OpendalEnumerator::new_gcs_source(*prop)?; - let stream = build_opendal_fs_list_for_batch(lister); + let stream = build_opendal_fs_list_for_batch(lister, recursive_scan); let batch_res: Vec<_> = stream.try_collect().await?; let res = batch_res.into_iter().map(SplitImpl::Gcs).collect_vec(); Ok(SourceScanInfo::Complete(res)) } ConnectorProperties::Azblob(prop) => { + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); + let lister: OpendalEnumerator = OpendalEnumerator::new_azblob_source(*prop)?; - let stream = build_opendal_fs_list_for_batch(lister); + let stream = build_opendal_fs_list_for_batch(lister, recursive_scan); let batch_res: Vec<_> = stream.try_collect().await?; let res = batch_res.into_iter().map(SplitImpl::Azblob).collect_vec(); @@ -437,7 +442,7 @@ impl TableScanInfo { #[derive(Clone, Debug)] pub struct TablePartitionInfo { - pub vnode_bitmap: Buffer, + pub vnode_bitmap: Bitmap, pub scan_ranges: Vec, } @@ -922,8 +927,7 @@ impl BatchPlanFragmenter { .drain() .take(1) .update(|(_, info)| { - info.vnode_bitmap = - Bitmap::ones(VirtualNode::COUNT).to_protobuf(); + info.vnode_bitmap = Bitmap::ones(info.vnode_bitmap.len()); }) .collect(); } @@ -1230,7 +1234,7 @@ fn derive_partitions( table_desc: &TableDesc, vnode_mapping: &WorkerSlotMapping, ) -> SchedulerResult> { - let num_vnodes = vnode_mapping.len(); + let vnode_count = vnode_mapping.len(); let mut partitions: HashMap)> = HashMap::new(); if scan_ranges.is_empty() { @@ -1241,7 +1245,7 @@ fn derive_partitions( ( k, TablePartitionInfo { - vnode_bitmap: vnode_bitmap.to_protobuf(), + vnode_bitmap, scan_ranges: vec![], }, ) @@ -1250,7 +1254,7 @@ fn derive_partitions( } let table_distribution = TableDistribution::new_from_storage_table_desc( - Some(TableDistribution::all_vnodes()), + Some(Bitmap::ones(vnode_count).into()), &table_desc.try_to_protobuf()?, ); @@ -1263,7 +1267,7 @@ fn derive_partitions( |(worker_slot_id, vnode_bitmap)| { let (bitmap, scan_ranges) = partitions .entry(worker_slot_id) - .or_insert_with(|| (BitmapBuilder::zeroed(num_vnodes), vec![])); + .or_insert_with(|| (BitmapBuilder::zeroed(vnode_count), vec![])); vnode_bitmap .iter() .enumerate() @@ -1277,7 +1281,7 @@ fn derive_partitions( let worker_slot_id = vnode_mapping[vnode]; let (bitmap, scan_ranges) = partitions .entry(worker_slot_id) - .or_insert_with(|| (BitmapBuilder::zeroed(num_vnodes), vec![])); + .or_insert_with(|| (BitmapBuilder::zeroed(vnode_count), vec![])); bitmap.set(vnode.to_index(), true); scan_ranges.push(scan_range.to_protobuf()); } @@ -1290,7 +1294,7 @@ fn derive_partitions( ( k, TablePartitionInfo { - vnode_bitmap: bitmap.finish().to_protobuf(), + vnode_bitmap: bitmap.finish(), scan_ranges, }, ) diff --git a/src/frontend/src/session.rs b/src/frontend/src/session.rs index 16f0c7226be21..a1150798951cb 100644 --- a/src/frontend/src/session.rs +++ b/src/frontend/src/session.rs @@ -59,9 +59,10 @@ use risingwave_common::telemetry::manager::TelemetryManager; use risingwave_common::telemetry::telemetry_env_enabled; use risingwave_common::types::DataType; use risingwave_common::util::addr::HostAddr; +use risingwave_common::util::cluster_limit::ActorCountPerParallelism; use risingwave_common::util::iter_util::ZipEqFast; -use risingwave_common::util::resource_util; use 
risingwave_common::util::runtime::BackgroundShutdownRuntime; +use risingwave_common::util::{cluster_limit, resource_util}; use risingwave_common::{GIT_SHA, RW_VERSION}; use risingwave_common_heap_profiling::HeapProfiler; use risingwave_common_service::{MetricsManager, ObserverManager}; @@ -1194,6 +1195,47 @@ impl SessionImpl { pub fn temporary_source_manager(&self) -> TemporarySourceManager { self.temporary_source_manager.lock().clone() } + + pub async fn check_cluster_limits(&self) -> Result<()> { + if self.config().bypass_cluster_limits() { + return Ok(()); + } + + let gen_message = |violated_limit: &ActorCountPerParallelism, + exceed_hard_limit: bool| + -> String { + let (limit_type, action) = if exceed_hard_limit { + ("critical", "Please scale the cluster before proceeding!") + } else { + ("recommended", "Scaling the cluster is recommended.") + }; + format!( + "\n- {}\n- {}\n- {}\n- {}\n- {}\n{}", + format_args!("Actor count per parallelism exceeds the {} limit.", limit_type), + format_args!("Depending on your workload, this may overload the cluster and cause performance/stability issues. {}", action), + "Contact us via slack or https://risingwave.com/contact-us/ for further enquiry.", + "You can bypass this check via SQL `SET bypass_cluster_limits TO true`.", + "You can check actor count distribution via SQL `SELECT * FROM rw_worker_actor_count`.", + violated_limit, + ) + }; + + let limits = self.env().meta_client().get_cluster_limits().await?; + for limit in limits { + match limit { + cluster_limit::ClusterLimit::ActorCount(l) => { + if l.exceed_hard_limit() { + return Err(RwError::from(ErrorCode::ProtocolError(gen_message( + &l, true, + )))); + } else if l.exceed_soft_limit() { + self.notice_to_user(gen_message(&l, false)); + } + } + } + } + Ok(()) + } } pub static SESSION_MANAGER: std::sync::OnceLock> = diff --git a/src/frontend/src/stream_fragmenter/graph/fragment_graph.rs b/src/frontend/src/stream_fragmenter/graph/fragment_graph.rs index d1251f2295642..9ab491ec3a41f 100644 --- a/src/frontend/src/stream_fragmenter/graph/fragment_graph.rs +++ b/src/frontend/src/stream_fragmenter/graph/fragment_graph.rs @@ -19,8 +19,7 @@ use risingwave_pb::stream_plan::stream_fragment_graph::{ StreamFragment as StreamFragmentProto, StreamFragmentEdge as StreamFragmentEdgeProto, }; use risingwave_pb::stream_plan::{ - DispatchStrategy, FragmentTypeFlag, StreamContext, - StreamFragmentGraph as StreamFragmentGraphProto, StreamNode, + DispatchStrategy, FragmentTypeFlag, StreamFragmentGraph as StreamFragmentGraphProto, StreamNode, }; use thiserror_ext::AsReport; @@ -92,9 +91,6 @@ pub struct StreamFragmentGraph { /// stores edges between fragments: (upstream, downstream) => edge. edges: HashMap<(LocalFragmentId, LocalFragmentId), StreamFragmentEdgeProto>, - - /// Stores the streaming context for the streaming plan - ctx: StreamContext, } impl StreamFragmentGraph { @@ -106,8 +102,9 @@ impl StreamFragmentGraph { .map(|(k, v)| (*k, v.to_protobuf())) .collect(), edges: self.edges.values().cloned().collect(), - ctx: Some(self.ctx.clone()), - // To be filled later + + // Following fields will be filled later in `build_graph` based on session context. 
+ ctx: None, dependent_table_ids: vec![], table_ids_cnt: 0, parallelism: None, diff --git a/src/frontend/src/stream_fragmenter/mod.rs b/src/frontend/src/stream_fragmenter/mod.rs index 66e9d5aff9e54..790f18d109a75 100644 --- a/src/frontend/src/stream_fragmenter/mod.rs +++ b/src/frontend/src/stream_fragmenter/mod.rs @@ -16,6 +16,7 @@ mod graph; use graph::*; use risingwave_common::util::recursive::{self, Recurse as _}; use risingwave_connector::WithPropertiesExt; +use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_pb::stream_plan::stream_node::NodeBody; mod rewrite; @@ -26,12 +27,13 @@ use educe::Educe; use risingwave_common::catalog::TableId; use risingwave_pb::plan_common::JoinType; use risingwave_pb::stream_plan::{ - DispatchStrategy, DispatcherType, ExchangeNode, FragmentTypeFlag, NoOpNode, + DispatchStrategy, DispatcherType, ExchangeNode, FragmentTypeFlag, NoOpNode, StreamContext, StreamFragmentGraph as StreamFragmentGraphProto, StreamNode, StreamScanType, }; use self::rewrite::build_delta_join_without_arrange; use crate::error::Result; +use crate::optimizer::plan_node::generic::GenericPlanRef; use crate::optimizer::plan_node::reorganize_elements_id; use crate::optimizer::PlanRef; use crate::scheduler::SchedulerResult; @@ -116,18 +118,38 @@ impl BuildFragmentGraphState { } pub fn build_graph(plan_node: PlanRef) -> SchedulerResult { + let ctx = plan_node.plan_base().ctx(); let plan_node = reorganize_elements_id(plan_node); let mut state = BuildFragmentGraphState::default(); let stream_node = plan_node.to_stream_prost(&mut state)?; generate_fragment_graph(&mut state, stream_node).unwrap(); let mut fragment_graph = state.fragment_graph.to_protobuf(); + + // Set table ids. fragment_graph.dependent_table_ids = state .dependent_table_ids .into_iter() .map(|id| id.table_id) .collect(); fragment_graph.table_ids_cnt = state.next_table_id; + + // Set parallelism. + { + let config = ctx.session_ctx().config(); + fragment_graph.parallelism = + config + .streaming_parallelism() + .map(|parallelism| Parallelism { + parallelism: parallelism.get(), + }); + } + + // Set timezone. 
+ fragment_graph.ctx = Some(StreamContext { + timezone: ctx.get_session_timezone(), + }); + Ok(fragment_graph) } diff --git a/src/frontend/src/test_utils.rs b/src/frontend/src/test_utils.rs index ee6ff589e0cdb..6123889262155 100644 --- a/src/frontend/src/test_utils.rs +++ b/src/frontend/src/test_utils.rs @@ -30,6 +30,7 @@ use risingwave_common::catalog::{ }; use risingwave_common::session_config::SessionConfig; use risingwave_common::system_param::reader::SystemParamsReader; +use risingwave_common::util::cluster_limit::ClusterLimit; use risingwave_common::util::column_index_mapping::ColIndexMapping; use risingwave_hummock_sdk::version::{HummockVersion, HummockVersionDelta}; use risingwave_pb::backup_service::MetaSnapshotMetadata; @@ -1012,7 +1013,7 @@ impl FrontendMetaClient for MockFrontendMetaClient { Ok("".to_string()) } - async fn list_ddl_progress(&self) -> RpcResult> { + async fn get_ddl_progress(&self) -> RpcResult> { Ok(vec![]) } @@ -1065,7 +1066,7 @@ impl FrontendMetaClient for MockFrontendMetaClient { } async fn list_all_nodes(&self) -> RpcResult> { - unimplemented!() + Ok(vec![]) } async fn list_compact_task_progress(&self) -> RpcResult> { @@ -1097,6 +1098,10 @@ impl FrontendMetaClient for MockFrontendMetaClient { ) -> RpcResult> { unimplemented!() } + + async fn get_cluster_limits(&self) -> RpcResult> { + Ok(vec![]) + } } #[cfg(test)] diff --git a/src/jni_core/src/lib.rs b/src/jni_core/src/lib.rs index 419f4ffd21cb5..8b771629df315 100644 --- a/src/jni_core/src/lib.rs +++ b/src/jni_core/src/lib.rs @@ -320,6 +320,7 @@ impl<'a> Deref for JavaBindingIterator<'a> { #[no_mangle] extern "system" fn Java_com_risingwave_java_binding_Binding_vnodeCount(_env: EnvParam<'_>) -> jint { + // TODO(var-vnode): use vnode count from config VirtualNode::COUNT as jint } diff --git a/src/meta/Cargo.toml b/src/meta/Cargo.toml index 4511e9f61d894..a7f37bf505910 100644 --- a/src/meta/Cargo.toml +++ b/src/meta/Cargo.toml @@ -28,6 +28,7 @@ clap = { workspace = true } comfy-table = "7" crepe = "0.1" easy-ext = "1" +educe = "0.6" either = "1" enum-as-inner = "0.6" etcd-client = { workspace = true } diff --git a/src/meta/model_v2/migration/src/lib.rs b/src/meta/model_v2/migration/src/lib.rs index 08291e5b163d5..0b09f3c4d4e11 100644 --- a/src/meta/model_v2/migration/src/lib.rs +++ b/src/meta/model_v2/migration/src/lib.rs @@ -20,6 +20,7 @@ mod m20240702_080451_system_param_value; mod m20240702_084927_unnecessary_fk; mod m20240726_063833_auto_schema_change; mod m20240806_143329_add_rate_limit_to_source_catalog; +mod m20240820_081248_add_time_travel_per_table_epoch; pub struct Migrator; @@ -45,6 +46,7 @@ impl MigratorTrait for Migrator { Box::new(m20240702_084927_unnecessary_fk::Migration), Box::new(m20240726_063833_auto_schema_change::Migration), Box::new(m20240806_143329_add_rate_limit_to_source_catalog::Migration), + Box::new(m20240820_081248_add_time_travel_per_table_epoch::Migration), ] } } diff --git a/src/meta/model_v2/migration/src/m20240820_081248_add_time_travel_per_table_epoch.rs b/src/meta/model_v2/migration/src/m20240820_081248_add_time_travel_per_table_epoch.rs new file mode 100644 index 0000000000000..85d9475aa8f01 --- /dev/null +++ b/src/meta/model_v2/migration/src/m20240820_081248_add_time_travel_per_table_epoch.rs @@ -0,0 +1,197 @@ +use sea_orm_migration::prelude::*; + +#[derive(DeriveMigrationName)] +pub struct Migration; + +const TABLE_NAME: &str = "hummock_epoch_to_version"; + +#[async_trait::async_trait] +impl MigrationTrait for Migration { + async fn up(&self, manager: &SchemaManager) -> 
Result<(), DbErr> { + // modify PK + match manager.get_database_backend() { + sea_orm::DatabaseBackend::MySql => { + manager + .alter_table( + Table::alter() + .table(HummockEpochToVersion::Table) + .add_column( + ColumnDef::new(HummockEpochToVersion::TableId).big_integer(), + ) + .to_owned(), + ) + .await?; + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::MySql, + format!("ALTER TABLE {TABLE_NAME} DROP PRIMARY KEY, ADD PRIMARY KEY (epoch, table_id)"), + )) + .await?; + } + sea_orm::DatabaseBackend::Postgres => { + manager + .alter_table( + Table::alter() + .table(HummockEpochToVersion::Table) + .add_column( + ColumnDef::new(HummockEpochToVersion::TableId).big_integer(), + ) + .to_owned(), + ) + .await?; + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::Postgres, + format!("ALTER TABLE {TABLE_NAME} DROP CONSTRAINT {TABLE_NAME}_pkey"), + )) + .await?; + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::Postgres, + format!("ALTER TABLE {TABLE_NAME} ADD PRIMARY KEY (epoch, table_id)"), + )) + .await?; + } + sea_orm::DatabaseBackend::Sqlite => { + // sqlite is not for prod usage, so recreating the table is fine. + manager + .drop_table( + sea_orm_migration::prelude::Table::drop() + .table(HummockEpochToVersion::Table) + .if_exists() + .cascade() + .to_owned(), + ) + .await?; + + manager + .create_table( + Table::create() + .table(HummockEpochToVersion::Table) + .if_not_exists() + .col( + ColumnDef::new(HummockEpochToVersion::Epoch) + .big_integer() + .not_null(), + ) + .col( + ColumnDef::new(HummockEpochToVersion::TableId) + .big_integer() + .not_null(), + ) + .col( + ColumnDef::new(HummockEpochToVersion::VersionId) + .big_integer() + .not_null(), + ) + .primary_key( + Index::create() + .col(HummockEpochToVersion::Epoch) + .col(HummockEpochToVersion::TableId), + ) + .to_owned(), + ) + .await?; + } + } + Ok(()) + } + + async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> { + // The downgrade for MySql and Postgres may not work due to PK conflicts.
+ match manager.get_database_backend() { + sea_orm::DatabaseBackend::MySql => { + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::MySql, + format!("ALTER TABLE {TABLE_NAME} DROP PRIMARY KEY"), + )) + .await?; + manager + .alter_table( + Table::alter() + .table(HummockEpochToVersion::Table) + .drop_column(HummockEpochToVersion::TableId) + .to_owned(), + ) + .await?; + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::MySql, + format!("ALTER TABLE {TABLE_NAME} ADD PRIMARY KEY (epoch)"), + )) + .await?; + } + sea_orm::DatabaseBackend::Postgres => { + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::Postgres, + format!("ALTER TABLE {TABLE_NAME} DROP CONSTRAINT {TABLE_NAME}_pkey"), + )) + .await?; + manager + .alter_table( + Table::alter() + .table(HummockEpochToVersion::Table) + .drop_column(HummockEpochToVersion::TableId) + .to_owned(), + ) + .await?; + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::Postgres, + format!("ALTER TABLE {TABLE_NAME} ADD PRIMARY KEY (epoch)"), + )) + .await?; + } + sea_orm::DatabaseBackend::Sqlite => { + manager + .drop_table( + sea_orm_migration::prelude::Table::drop() + .table(HummockEpochToVersion::Table) + .if_exists() + .cascade() + .to_owned(), + ) + .await?; + + manager + .create_table( + Table::create() + .table(HummockEpochToVersion::Table) + .if_not_exists() + .col( + ColumnDef::new(HummockEpochToVersion::Epoch) + .big_integer() + .not_null() + .primary_key(), + ) + .col( + ColumnDef::new(HummockEpochToVersion::VersionId) + .big_integer() + .not_null(), + ) + .to_owned(), + ) + .await?; + } + } + + Ok(()) + } +} + +#[derive(DeriveIden)] +enum HummockEpochToVersion { + Table, + Epoch, + TableId, + VersionId, +} diff --git a/src/meta/model_v2/src/hummock_epoch_to_version.rs b/src/meta/model_v2/src/hummock_epoch_to_version.rs index 181b1b320bc54..f54551aa80178 100644 --- a/src/meta/model_v2/src/hummock_epoch_to_version.rs +++ b/src/meta/model_v2/src/hummock_epoch_to_version.rs @@ -22,6 +22,8 @@ use crate::{Epoch, HummockVersionId}; pub struct Model { #[sea_orm(primary_key, auto_increment = false)] pub epoch: Epoch, + #[sea_orm(primary_key, auto_increment = false)] + pub table_id: i64, pub version_id: HummockVersionId, } diff --git a/src/meta/node/src/lib.rs b/src/meta/node/src/lib.rs index 049519372c81e..6fa88fd412e31 100644 --- a/src/meta/node/src/lib.rs +++ b/src/meta/node/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
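The migration and model change above widen the `hummock_epoch_to_version` primary key from `epoch` alone to `(epoch, table_id)`. A toy sketch of the per-table lookup shape this enables, using plain std types rather than the meta-store model:

    use std::collections::BTreeMap;

    fn main() {
        // Key is now (epoch, table_id), so two tables committed at the same
        // epoch can map to different hummock versions.
        let mut epoch_to_version: BTreeMap<(u64, i64), u64> = BTreeMap::new();
        epoch_to_version.insert((100, 1), 7);
        epoch_to_version.insert((100, 2), 8);

        // Per-table time-travel lookup: newest entry at or below the queried epoch.
        let (query_epoch, query_table) = (100u64, 2i64);
        let version = epoch_to_version
            .range(..=(query_epoch, query_table))
            .rev()
            .find(|((_, table_id), _)| *table_id == query_table)
            .map(|(_, v)| *v);
        assert_eq!(version, Some(8));
    }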
-#![feature(lint_reasons)] #![feature(let_chains)] #![cfg_attr(coverage, feature(coverage_attribute))] @@ -457,6 +456,14 @@ pub fn start( table_info_statistic_history_times: config .storage .table_info_statistic_history_times, + actor_cnt_per_worker_parallelism_hard_limit: config + .meta + .developer + .actor_cnt_per_worker_parallelism_hard_limit, + actor_cnt_per_worker_parallelism_soft_limit: config + .meta + .developer + .actor_cnt_per_worker_parallelism_soft_limit, }, config.system.into_init_system_params(), Default::default(), diff --git a/src/meta/node/src/server.rs b/src/meta/node/src/server.rs index 1f0f7f6a3fe8e..11b22014f9f98 100644 --- a/src/meta/node/src/server.rs +++ b/src/meta/node/src/server.rs @@ -27,7 +27,6 @@ use risingwave_common::telemetry::manager::TelemetryManager; use risingwave_common::telemetry::{report_scarf_enabled, report_to_scarf, telemetry_env_enabled}; use risingwave_common::util::tokio_util::sync::CancellationToken; use risingwave_common_service::{MetricsManager, TracingExtractLayer}; -use risingwave_meta::barrier::StreamRpcManager; use risingwave_meta::controller::catalog::CatalogController; use risingwave_meta::controller::cluster::ClusterController; use risingwave_meta::manager::{ @@ -40,6 +39,7 @@ use risingwave_meta::stream::ScaleController; use risingwave_meta::MetaStoreBackend; use risingwave_meta_service::backup_service::BackupServiceImpl; use risingwave_meta_service::cloud_service::CloudServiceImpl; +use risingwave_meta_service::cluster_limit_service::ClusterLimitServiceImpl; use risingwave_meta_service::cluster_service::ClusterServiceImpl; use risingwave_meta_service::ddl_service::DdlServiceImpl; use risingwave_meta_service::event_log_service::EventLogServiceImpl; @@ -63,6 +63,7 @@ use risingwave_pb::connector_service::sink_coordination_service_server::SinkCoor use risingwave_pb::ddl_service::ddl_service_server::DdlServiceServer; use risingwave_pb::health::health_server::HealthServer; use risingwave_pb::hummock::hummock_manager_service_server::HummockManagerServiceServer; +use risingwave_pb::meta::cluster_limit_service_server::ClusterLimitServiceServer; use risingwave_pb::meta::cluster_service_server::ClusterServiceServer; use risingwave_pb::meta::event_log_service_server::EventLogServiceServer; use risingwave_pb::meta::heartbeat_service_server::HeartbeatServiceServer; @@ -550,12 +551,9 @@ pub async fn start_service_as_election_leader( // TODO(shutdown): remove this as there's no need to gracefully shutdown some of these sub-tasks. 
let mut sub_tasks = vec![shutdown_handle]; - let stream_rpc_manager = StreamRpcManager::new(env.clone()); - let scale_controller = Arc::new(ScaleController::new( &metadata_manager, source_manager.clone(), - stream_rpc_manager.clone(), env.clone(), )); @@ -567,7 +565,6 @@ pub async fn start_service_as_election_leader( source_manager.clone(), sink_manager.clone(), meta_metrics.clone(), - stream_rpc_manager.clone(), scale_controller.clone(), ) .await; @@ -585,7 +582,6 @@ pub async fn start_service_as_election_leader( metadata_manager.clone(), barrier_scheduler.clone(), source_manager.clone(), - stream_rpc_manager, scale_controller.clone(), ) .unwrap(), @@ -657,6 +653,7 @@ pub async fn start_service_as_election_leader( ServingServiceImpl::new(serving_vnode_mapping.clone(), metadata_manager.clone()); let cloud_srv = CloudServiceImpl::new(metadata_manager.clone(), aws_cli); let event_log_srv = EventLogServiceImpl::new(env.event_log_manager_ref()); + let cluster_limit_srv = ClusterLimitServiceImpl::new(env.clone(), metadata_manager.clone()); if let Some(prometheus_addr) = address_info.prometheus_addr { MetricsManager::boot_metrics_service(prometheus_addr.to_string()) @@ -795,7 +792,8 @@ pub async fn start_service_as_election_leader( .add_service(ServingServiceServer::new(serving_srv)) .add_service(CloudServiceServer::new(cloud_srv)) .add_service(SinkCoordinationServiceServer::new(sink_coordination_srv)) - .add_service(EventLogServiceServer::new(event_log_srv)); + .add_service(EventLogServiceServer::new(event_log_srv)) + .add_service(ClusterLimitServiceServer::new(cluster_limit_srv)); #[cfg(not(madsim))] // `otlp-embedded` does not use madsim-patched tonic let server_builder = server_builder.add_service(TraceServiceServer::new(trace_srv)); diff --git a/src/meta/service/src/cluster_limit_service.rs b/src/meta/service/src/cluster_limit_service.rs new file mode 100644 index 0000000000000..df19b24b234e6 --- /dev/null +++ b/src/meta/service/src/cluster_limit_service.rs @@ -0,0 +1,107 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
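For the cluster-limit checks introduced in this change (the frontend's `check_cluster_limits`, the `actor_cnt_per_worker_parallelism_*` knobs on the meta node, and the `ClusterLimitService` added in this file), a standalone sketch of the per-worker decision; the numbers and helper names are illustrative, not the real defaults or types from `risingwave_common::util::cluster_limit`:

    // Toy version of the actor-count-per-parallelism check.
    struct WorkerActorCount {
        actor_count: usize,
        parallelism: usize,
    }

    fn exceeds(w: &WorkerActorCount, limit_per_parallelism: usize) -> bool {
        w.actor_count > w.parallelism * limit_per_parallelism
    }

    fn main() {
        let worker = WorkerActorCount { actor_count: 4_000, parallelism: 8 };
        let (soft, hard) = (100, 400); // made-up limits
        if exceeds(&worker, hard) {
            // The frontend turns this into an error and rejects the DDL.
            println!("hard limit exceeded");
        } else if exceeds(&worker, soft) {
            // Surfaced as a notice pointing at `SELECT * FROM rw_worker_actor_count`
            // and the `SET bypass_cluster_limits TO true` escape hatch.
            println!("soft limit exceeded");
        }
    }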
+ +use std::collections::HashMap; + +use risingwave_common::util::cluster_limit::{ + ActorCountPerParallelism, ClusterLimit, WorkerActorCount, +}; +use risingwave_meta::manager::{MetaSrvEnv, MetadataManager, WorkerId}; +use risingwave_meta::MetaResult; +use risingwave_pb::common::worker_node::State; +use risingwave_pb::common::WorkerType; +use risingwave_pb::meta::cluster_limit_service_server::ClusterLimitService; +use risingwave_pb::meta::{GetClusterLimitsRequest, GetClusterLimitsResponse}; +use tonic::{Request, Response, Status}; + +#[derive(Clone)] +pub struct ClusterLimitServiceImpl { + env: MetaSrvEnv, + metadata_manager: MetadataManager, +} + +impl ClusterLimitServiceImpl { + pub fn new(env: MetaSrvEnv, metadata_manager: MetadataManager) -> Self { + ClusterLimitServiceImpl { + env, + metadata_manager, + } + } + + async fn get_active_actor_limit(&self) -> MetaResult> { + let (soft_limit, hard_limit) = ( + self.env.opts.actor_cnt_per_worker_parallelism_soft_limit, + self.env.opts.actor_cnt_per_worker_parallelism_hard_limit, + ); + + let running_worker_parallelism: HashMap = self + .metadata_manager + .list_worker_node(Some(WorkerType::ComputeNode), Some(State::Running)) + .await? + .into_iter() + .map(|e| (e.id, e.parallelism())) + .collect(); + let worker_actor_count: HashMap = self + .metadata_manager + .worker_actor_count() + .await? + .into_iter() + .filter_map(|(worker_id, actor_count)| { + running_worker_parallelism + .get(&worker_id) + .map(|parallelism| { + ( + worker_id, + WorkerActorCount { + actor_count, + parallelism: *parallelism, + }, + ) + }) + }) + .collect(); + + let limit = ActorCountPerParallelism { + worker_id_to_actor_count: worker_actor_count, + hard_limit, + soft_limit, + }; + + if limit.exceed_limit() { + Ok(Some(ClusterLimit::ActorCount(limit))) + } else { + Ok(None) + } + } +} + +#[async_trait::async_trait] +impl ClusterLimitService for ClusterLimitServiceImpl { + #[cfg_attr(coverage, coverage(off))] + async fn get_cluster_limits( + &self, + _request: Request, + ) -> Result, Status> { + // TODO: support more limits + match self.get_active_actor_limit().await { + Ok(Some(limit)) => Ok(Response::new(GetClusterLimitsResponse { + active_limits: vec![limit.into()], + })), + Ok(None) => Ok(Response::new(GetClusterLimitsResponse { + active_limits: vec![], + })), + Err(e) => Err(e.into()), + } + } +} diff --git a/src/meta/service/src/hummock_service.rs b/src/meta/service/src/hummock_service.rs index 21e203d8440bd..c3fc2da229585 100644 --- a/src/meta/service/src/hummock_service.rs +++ b/src/meta/service/src/hummock_service.rs @@ -457,7 +457,7 @@ impl HummockManagerService for HummockServiceImpl { let req = request.into_inner(); let new_group_id = self .hummock_manager - .split_compaction_group(req.group_id, &req.table_ids) + .split_compaction_group(req.group_id, &req.table_ids, req.partition_vnode_count) .await?; Ok(Response::new(SplitCompactionGroupResponse { new_group_id })) } @@ -710,12 +710,26 @@ impl HummockManagerService for HummockServiceImpl { &self, request: Request, ) -> Result, Status> { - let GetVersionByEpochRequest { epoch } = request.into_inner(); - let version = self.hummock_manager.epoch_to_version(epoch).await?; + let GetVersionByEpochRequest { epoch, table_id } = request.into_inner(); + let version = self + .hummock_manager + .epoch_to_version(epoch, table_id) + .await?; Ok(Response::new(GetVersionByEpochResponse { version: Some(version.to_protobuf()), })) } + + async fn merge_compaction_group( + &self, + request: Request, + ) -> Result, Status> { + 
let req = request.into_inner(); + self.hummock_manager + .merge_compaction_group(req.left_group_id, req.right_group_id) + .await?; + Ok(Response::new(MergeCompactionGroupResponse {})) + } } #[cfg(test)] diff --git a/src/meta/service/src/lib.rs b/src/meta/service/src/lib.rs index 9ab248802772e..2e327dc47a59e 100644 --- a/src/meta/service/src/lib.rs +++ b/src/meta/service/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(lint_reasons)] #![feature(let_chains)] #![feature(impl_trait_in_assoc_type)] #![cfg_attr(coverage, feature(coverage_attribute))] @@ -21,6 +20,7 @@ use risingwave_meta::*; pub mod backup_service; pub mod cloud_service; +pub mod cluster_limit_service; pub mod cluster_service; pub mod ddl_service; pub mod event_log_service; diff --git a/src/meta/src/backup_restore/restore_impl/v2.rs b/src/meta/src/backup_restore/restore_impl/v2.rs index a887293e0c8ef..938050ce4d300 100644 --- a/src/meta/src/backup_restore/restore_impl/v2.rs +++ b/src/meta/src/backup_restore/restore_impl/v2.rs @@ -106,8 +106,8 @@ impl Writer for WriterModelV2ToMetaStoreV2 { insert_models(metadata.workers.clone(), db).await?; insert_models(metadata.worker_properties.clone(), db).await?; insert_models(metadata.users.clone(), db).await?; - insert_models(metadata.user_privileges.clone(), db).await?; insert_models(metadata.objects.clone(), db).await?; + insert_models(metadata.user_privileges.clone(), db).await?; insert_models(metadata.object_dependencies.clone(), db).await?; insert_models(metadata.databases.clone(), db).await?; insert_models(metadata.schemas.clone(), db).await?; diff --git a/src/meta/src/barrier/command.rs b/src/meta/src/barrier/command.rs index 6e4ebe40b93b0..577a0bef25360 100644 --- a/src/meta/src/barrier/command.rs +++ b/src/meta/src/barrier/command.rs @@ -16,7 +16,6 @@ use std::collections::{HashMap, HashSet}; use std::fmt::Formatter; use futures::future::try_join_all; -use itertools::Itertools; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::TableId; use risingwave_common::hash::ActorMapping; @@ -147,8 +146,10 @@ impl ReplaceTablePlan { } } -#[derive(Debug, Clone)] +#[derive(educe::Educe, Clone)] +#[educe(Debug)] pub struct CreateStreamingJobCommandInfo { + #[educe(Debug(ignore))] pub table_fragments: TableFragments, /// Refer to the doc on [`MetadataManager::get_upstream_root_fragments`] for the meaning of "root". pub upstream_root_actors: HashMap>, @@ -959,19 +960,6 @@ impl Command { } impl CommandContext { - /// Clean up actors in CNs if needed, used by drop, cancel and reschedule commands. - async fn clean_up(&self, actors: Vec) -> MetaResult<()> { - self.barrier_manager_context - .stream_rpc_manager - .drop_actors( - &self.node_map, - self.node_map - .keys() - .map(|worker_id| (*worker_id, actors.clone())), - ) - .await - } - pub async fn wait_epoch_commit(&self, epoch: HummockEpoch) -> MetaResult<()> { let futures = self.node_map.values().map(|worker_node| async { let client = self @@ -1021,13 +1009,9 @@ impl CommandContext { } Command::DropStreamingJobs { - actors, unregistered_state_table_ids, .. } => { - // Tell compute nodes to drop actors. 
- self.clean_up(actors.clone()).await?; - self.barrier_manager_context .hummock_manager .unregister_table_ids(unregistered_state_table_ids.iter().cloned()) @@ -1036,7 +1020,6 @@ impl CommandContext { Command::CancelStreamingJob(table_fragments) => { tracing::debug!(id = ?table_fragments.table_id(), "cancelling stream job"); - self.clean_up(table_fragments.actor_ids()).await?; // NOTE(kwannoel): At this point, meta has already registered the table ids. // We should unregister them. @@ -1136,8 +1119,6 @@ impl CommandContext { .. }) = job_type { - self.clean_up(old_table_fragments.actor_ids()).await?; - // Drop fragment info in meta store. mgr.fragment_manager .post_replace_table( @@ -1164,13 +1145,9 @@ impl CommandContext { new_table_fragments, dispatchers, init_split_assignment, - old_table_fragments, .. }) = job_type { - // Tell compute nodes to drop actors. - self.clean_up(old_table_fragments.actor_ids()).await?; - mgr.catalog_controller .post_collect_table_fragments( new_table_fragments.table_id().table_id as _, @@ -1201,11 +1178,6 @@ impl CommandContext { table_parallelism, .. } => { - let removed_actors = reschedules - .values() - .flat_map(|reschedule| reschedule.removed_actors.clone().into_iter()) - .collect_vec(); - self.clean_up(removed_actors).await?; self.barrier_manager_context .scale_controller .post_apply_reschedule(reschedules, table_parallelism) @@ -1220,8 +1192,6 @@ impl CommandContext { init_split_assignment, .. }) => { - self.clean_up(old_table_fragments.actor_ids()).await?; - match &self.barrier_manager_context.metadata_manager { MetadataManager::V1(mgr) => { // Drop fragment info in meta store. diff --git a/src/meta/src/barrier/mod.rs b/src/meta/src/barrier/mod.rs index daa82306bff6d..0772bac6699e1 100644 --- a/src/meta/src/barrier/mod.rs +++ b/src/meta/src/barrier/mod.rs @@ -86,7 +86,6 @@ pub use self::command::{ Reschedule, SnapshotBackfillInfo, }; pub use self::info::InflightSubscriptionInfo; -pub use self::rpc::StreamRpcManager; pub use self::schedule::BarrierScheduler; pub use self::trace::TracedEpoch; @@ -172,8 +171,6 @@ pub struct GlobalBarrierManagerContext { pub(super) metrics: Arc, - stream_rpc_manager: StreamRpcManager, - env: MetaSrvEnv, } @@ -596,7 +593,6 @@ impl GlobalBarrierManager { source_manager: SourceManagerRef, sink_manager: SinkCoordinatorManager, metrics: Arc, - stream_rpc_manager: StreamRpcManager, scale_controller: ScaleControllerRef, ) -> Self { let enable_recovery = env.opts.enable_recovery; @@ -624,7 +620,6 @@ impl GlobalBarrierManager { scale_controller, sink_manager, metrics, - stream_rpc_manager, env: env.clone(), }; @@ -768,7 +763,9 @@ impl GlobalBarrierManager { if let Some(request) = request { match request { BarrierManagerRequest::GetDdlProgress(result_tx) => { + // Progress of normal backfill let mut progress = self.checkpoint_control.create_mview_tracker.gen_ddl_progress(); + // Progress of snapshot backfill for creating_job in self.checkpoint_control.creating_streaming_job_controls.values() { progress.extend([(creating_job.info.table_fragments.table_id().table_id, creating_job.gen_ddl_progress())]); } @@ -1639,6 +1636,7 @@ impl GlobalBarrierManagerContext { Ok(info) } + /// Serving `SHOW JOBS / SELECT * FROM rw_ddl_progress` pub async fn get_ddl_progress(&self) -> MetaResult> { let mut ddl_progress = { let (tx, rx) = oneshot::channel(); diff --git a/src/meta/src/barrier/progress.rs b/src/meta/src/barrier/progress.rs index 5754e4c60e364..2e1b6f9dc397a 100644 --- a/src/meta/src/barrier/progress.rs +++ 
b/src/meta/src/barrier/progress.rs @@ -55,6 +55,7 @@ pub(super) struct Progress { upstream_mv_count: HashMap, /// Total key count in the upstream materialized view + /// TODO: implement this for source backfill upstream_total_key_count: u64, /// Consumed rows @@ -122,6 +123,12 @@ impl Progress { /// Returns whether all backfill executors are done. fn is_done(&self) -> bool { + tracing::trace!( + "Progress::is_done? {}, {}, {:?}", + self.done_count, + self.states.len(), + self.states + ); self.done_count == self.states.len() } @@ -274,6 +281,7 @@ pub(super) struct TrackingCommand { /// 4. With `actor_map` we can use an actor's `ActorId` to find the ID of the `StreamJob`. #[derive(Default, Debug)] pub(super) struct CreateMviewProgressTracker { + // TODO: add a specialized progress for source /// Progress of the create-mview DDL indicated by the `TableId`. progress_map: HashMap, @@ -494,6 +502,7 @@ impl CreateMviewProgressTracker { replace_table: Option<&ReplaceTablePlan>, version_stats: &HummockVersionStats, ) -> Option { + tracing::trace!(?info, "add job to track"); let (info, actors, replace_table_info) = { let CreateStreamingJobCommandInfo { table_fragments, .. @@ -596,6 +605,7 @@ impl CreateMviewProgressTracker { progress: &CreateMviewProgress, version_stats: &HummockVersionStats, ) -> Option { + tracing::trace!(?progress, "update progress"); let actor = progress.backfill_actor_id; let Some(table_id) = self.actor_map.get(&actor).copied() else { // On restart, backfill will ALWAYS notify CreateMviewProgressTracker, diff --git a/src/meta/src/barrier/recovery.rs b/src/meta/src/barrier/recovery.rs index 25fe1fd2ceff7..63cd4c16d9aaf 100644 --- a/src/meta/src/barrier/recovery.rs +++ b/src/meta/src/barrier/recovery.rs @@ -1121,6 +1121,14 @@ impl GlobalBarrierManagerContext { return Err(anyhow!("actors dropped during update").into()); } + { + for (node_id, actors) in &info.actor_map { + if !actors.is_empty() && !all_node_actors.contains_key(node_id) { + return Err(anyhow!("streaming job dropped during update").into()); + } + } + } + Ok(all_node_actors) } } diff --git a/src/meta/src/barrier/rpc.rs b/src/meta/src/barrier/rpc.rs index 14ee8b0c15f7b..97b3636e8dba3 100644 --- a/src/meta/src/barrier/rpc.rs +++ b/src/meta/src/barrier/rpc.rs @@ -14,14 +14,13 @@ use std::collections::{HashMap, HashSet}; use std::error::Error; -use std::future::Future; use std::time::Duration; use anyhow::anyhow; use fail::fail_point; use futures::future::try_join_all; use futures::stream::{BoxStream, FuturesUnordered}; -use futures::{pin_mut, FutureExt, StreamExt}; +use futures::StreamExt; use itertools::Itertools; use risingwave_common::catalog::TableId; use risingwave_common::hash::ActorId; @@ -34,11 +33,9 @@ use risingwave_pb::stream_service::build_actor_info::SubscriptionIds; use risingwave_pb::stream_service::streaming_control_stream_request::RemovePartialGraphRequest; use risingwave_pb::stream_service::{ streaming_control_stream_request, streaming_control_stream_response, BarrierCompleteResponse, - BuildActorInfo, DropActorsRequest, InjectBarrierRequest, StreamingControlStreamRequest, + BuildActorInfo, InjectBarrierRequest, StreamingControlStreamRequest, StreamingControlStreamResponse, }; -use risingwave_rpc_client::error::RpcError; -use risingwave_rpc_client::StreamClient; use rw_futures_util::pending_on_none; use thiserror_ext::AsReport; use tokio::sync::mpsc::UnboundedSender; @@ -50,7 +47,7 @@ use uuid::Uuid; use super::command::CommandContext; use super::{BarrierKind, GlobalBarrierManagerContext, TracedEpoch}; 
use crate::barrier::info::InflightGraphInfo; -use crate::manager::{MetaSrvEnv, WorkerId}; +use crate::manager::WorkerId; use crate::{MetaError, MetaResult}; const COLLECT_ERROR_TIMEOUT: Duration = Duration::from_secs(3); @@ -60,33 +57,47 @@ struct ControlStreamNode { sender: UnboundedSender, } -fn into_future( - worker_id: WorkerId, - stream: BoxStream< - 'static, - risingwave_rpc_client::error::Result, - >, -) -> ResponseStreamFuture { - stream.into_future().map(move |(opt, stream)| { - ( - worker_id, - stream, - opt.ok_or_else(|| anyhow!("end of stream").into()) - .and_then(|result| result.map_err(|e| e.into())), - ) - }) +mod response_stream_future { + use std::future::Future; + + use anyhow::anyhow; + use futures::stream::BoxStream; + use futures::{FutureExt, StreamExt}; + use risingwave_pb::stream_service::StreamingControlStreamResponse; + + use crate::manager::WorkerId; + use crate::MetaResult; + + pub(super) fn into_future( + worker_id: WorkerId, + stream: BoxStream< + 'static, + risingwave_rpc_client::error::Result, + >, + ) -> ResponseStreamFuture { + stream.into_future().map(move |(opt, stream)| { + ( + worker_id, + stream, + opt.ok_or_else(|| anyhow!("end of stream").into()) + .and_then(|result| result.map_err(|e| e.into())), + ) + }) + } + + pub(super) type ResponseStreamFuture = impl Future< + Output = ( + WorkerId, + BoxStream< + 'static, + risingwave_rpc_client::error::Result, + >, + MetaResult, + ), + > + 'static; } -type ResponseStreamFuture = impl Future< - Output = ( - WorkerId, - BoxStream< - 'static, - risingwave_rpc_client::error::Result, - >, - MetaResult, - ), - > + 'static; +use response_stream_future::*; pub(super) struct ControlStreamManager { context: GlobalBarrierManagerContext, @@ -362,7 +373,7 @@ impl ControlStreamManager { self.nodes .iter_mut() - .map(|(node_id, node)| { + .try_for_each(|(node_id, node)| { let actor_ids_to_collect: Vec<_> = pre_applied_graph_info .actor_ids_to_collect(*node_id) .collect(); @@ -393,7 +404,7 @@ impl ControlStreamManager { request: Some( streaming_control_stream_request::Request::InjectBarrier( InjectBarrierRequest { - request_id: StreamRpcManager::new_request_id(), + request_id: Uuid::new_v4().to_string(), barrier: Some(barrier), actor_ids_to_collect, table_ids_to_sync, @@ -429,7 +440,6 @@ impl ControlStreamManager { Result::<_, MetaError>::Ok(()) } }) - .try_collect() .inspect_err(|e| { // Record failure in event log. 
use risingwave_pb::meta::event_log; @@ -512,95 +522,6 @@ impl GlobalBarrierManagerContext { } } -#[derive(Clone)] -pub struct StreamRpcManager { - env: MetaSrvEnv, -} - -impl StreamRpcManager { - pub fn new(env: MetaSrvEnv) -> Self { - Self { env } - } - - async fn make_request> + 'static>( - &self, - request: impl Iterator, - f: impl Fn(StreamClient, REQ) -> Fut, - ) -> MetaResult> { - let pool = self.env.stream_client_pool(); - let f = &f; - let iters = request.map(|(node, input)| async move { - let client = pool.get(node).await.map_err(|e| (node.id, e))?; - f(client, input).await.map_err(|e| (node.id, e)) - }); - let result = try_join_all_with_error_timeout(iters, COLLECT_ERROR_TIMEOUT).await; - result.map_err(|results_err| merge_node_rpc_errors("merged RPC Error", results_err)) - } - - fn new_request_id() -> String { - Uuid::new_v4().to_string() - } - - pub async fn drop_actors( - &self, - node_map: &HashMap, - node_actors: impl Iterator)>, - ) -> MetaResult<()> { - self.make_request( - node_actors - .map(|(worker_id, actor_ids)| (node_map.get(&worker_id).unwrap(), actor_ids)), - |client, actor_ids| async move { - client - .drop_actors(DropActorsRequest { - request_id: Self::new_request_id(), - actor_ids, - }) - .await - }, - ) - .await?; - Ok(()) - } -} - -/// This function is similar to `try_join_all`, but it attempts to collect as many error as possible within `error_timeout`. -async fn try_join_all_with_error_timeout( - iters: I, - error_timeout: Duration, -) -> Result, Vec> -where - I: IntoIterator, - F: Future>, -{ - let stream = FuturesUnordered::from_iter(iters); - pin_mut!(stream); - let mut results_ok = vec![]; - let mut results_err = vec![]; - while let Some(result) = stream.next().await { - match result { - Ok(rsp) => { - results_ok.push(rsp); - } - Err(err) => { - results_err.push(err); - break; - } - } - } - if results_err.is_empty() { - return Ok(results_ok); - } - let _ = timeout(error_timeout, async { - while let Some(result) = stream.next().await { - if let Err(err) = result { - results_err.push(err); - } - } - }) - .await; - Err(results_err) -} - pub(super) fn merge_node_rpc_errors( message: &str, errors: impl IntoIterator, diff --git a/src/meta/src/controller/fragment.rs b/src/meta/src/controller/fragment.rs index 16228a06d0a9a..31575e72804f9 100644 --- a/src/meta/src/controller/fragment.rs +++ b/src/meta/src/controller/fragment.rs @@ -1411,7 +1411,7 @@ mod tests { use std::collections::{BTreeMap, HashMap}; use itertools::Itertools; - use risingwave_common::hash::ActorMapping; + use risingwave_common::hash::{ActorMapping, VirtualNode}; use risingwave_common::util::iter_util::ZipEqDebug; use risingwave_common::util::stream_graph_visitor::visit_stream_node; use risingwave_meta_model_v2::actor::ActorStatus; @@ -1497,8 +1497,11 @@ mod tests { }) .collect(); - let actor_bitmaps = - ActorMapping::new_uniform((0..actor_count).map(|i| i as _)).to_bitmaps(); + let actor_bitmaps = ActorMapping::new_uniform( + (0..actor_count).map(|i| i as _), + VirtualNode::COUNT_FOR_TEST, + ) + .to_bitmaps(); let pb_actors = (0..actor_count) .map(|actor_id| { @@ -1610,8 +1613,11 @@ mod tests { }) .collect(); - let mut actor_bitmaps = - ActorMapping::new_uniform((0..actor_count).map(|i| i as _)).to_bitmaps(); + let mut actor_bitmaps = ActorMapping::new_uniform( + (0..actor_count).map(|i| i as _), + VirtualNode::COUNT_FOR_TEST, + ) + .to_bitmaps(); let actors = (0..actor_count) .map(|actor_id| { diff --git a/src/meta/src/hummock/manager/checkpoint.rs 
b/src/meta/src/hummock/manager/checkpoint.rs index bc3701a6b9d82..f678014d440c8 100644 --- a/src/meta/src/hummock/manager/checkpoint.rs +++ b/src/meta/src/hummock/manager/checkpoint.rs @@ -156,8 +156,8 @@ impl HummockManager { .hummock_version_deltas .range((Excluded(old_checkpoint_id), Included(new_checkpoint_id))) { - for group_deltas in version_delta.group_deltas.values() { - let summary = summarize_group_deltas(group_deltas); + for (group_id, group_deltas) in &version_delta.group_deltas { + let summary = summarize_group_deltas(group_deltas, *group_id); object_sizes.extend( summary .insert_table_infos diff --git a/src/meta/src/hummock/manager/commit_epoch.rs b/src/meta/src/hummock/manager/commit_epoch.rs index 08428e5472e23..e92e91c8503d0 100644 --- a/src/meta/src/hummock/manager/commit_epoch.rs +++ b/src/meta/src/hummock/manager/commit_epoch.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use risingwave_common::catalog::TableId; use risingwave_hummock_sdk::change_log::ChangeLogDelta; @@ -220,23 +220,8 @@ impl HummockManager { NewTableFragmentInfo::None => (HashMap::new(), None, None), }; - let mut group_members_table_ids: HashMap> = HashMap::new(); - { - // expand group_members_table_ids - for (table_id, group_id) in &table_compaction_group_mapping { - group_members_table_ids - .entry(*group_id) - .or_default() - .insert(*table_id); - } - } - let commit_sstables = self - .correct_commit_ssts( - sstables, - &table_compaction_group_mapping, - &group_members_table_ids, - ) + .correct_commit_ssts(sstables, &table_compaction_group_mapping) .await?; let modified_compaction_groups: Vec<_> = commit_sstables.keys().cloned().collect(); @@ -247,7 +232,7 @@ impl HummockManager { is_visible_table_committed_epoch, new_compaction_group, commit_sstables, - new_table_ids, + &new_table_ids, new_table_watermarks, change_log_delta, ); @@ -304,6 +289,9 @@ impl HummockManager { .values() .map(|g| (g.group_id, g.parent_group_id)) .collect(); + let time_travel_tables_to_commit = table_compaction_group_mapping + .iter() + .filter(|(table_id, _)| tables_to_commit.contains(table_id)); let mut txn = sql_store.conn.begin().await?; let version_snapshot_sst_ids = self .write_time_travel_metadata( @@ -312,6 +300,8 @@ impl HummockManager { time_travel_delta, &group_parents, &versioning.last_time_travel_snapshot_sst_ids, + time_travel_tables_to_commit, + committed_epoch, ) .await?; commit_multi_var_with_provided_txn!( @@ -389,7 +379,6 @@ impl HummockManager { &self, sstables: Vec, table_compaction_group_mapping: &HashMap, - group_members_table_ids: &HashMap>, ) -> Result>> { let mut new_sst_id_number = 0; let mut sst_to_cg_vec = Vec::with_capacity(sstables.len()); @@ -413,7 +402,7 @@ impl HummockManager { } } - new_sst_id_number += group_table_ids.len(); + new_sst_id_number += group_table_ids.len() * 2; // `split_sst` will split the SST into two parts and consumer 2 SST IDs sst_to_cg_vec.push((commit_sst, group_table_ids)); } @@ -424,17 +413,16 @@ impl HummockManager { let mut commit_sstables: BTreeMap> = BTreeMap::new(); for (mut sst, group_table_ids) in sst_to_cg_vec { - for (group_id, match_ids) in group_table_ids { - let group_members_table_ids = group_members_table_ids.get(&group_id).unwrap(); - if match_ids - .iter() - .all(|id| group_members_table_ids.contains(&TableId::new(*id))) - { + let len = group_table_ids.len(); + 
for (index, (group_id, match_ids)) in group_table_ids.into_iter().enumerate() { + if sst.sst_info.table_ids == match_ids { + // The SST contains all the tables in the group should be last key + assert!(index == len - 1); commit_sstables .entry(group_id) .or_default() - .push(sst.sst_info.clone()); - continue; + .push(sst.sst_info); + break; } let origin_sst_size = sst.sst_info.sst_size; diff --git a/src/meta/src/hummock/manager/compaction_group_manager.rs b/src/meta/src/hummock/manager/compaction/compaction_group_manager.rs similarity index 96% rename from src/meta/src/hummock/manager/compaction_group_manager.rs rename to src/meta/src/hummock/manager/compaction/compaction_group_manager.rs index c68fc4222f283..807ba6f3fd35f 100644 --- a/src/meta/src/hummock/manager/compaction_group_manager.rs +++ b/src/meta/src/hummock/manager/compaction/compaction_group_manager.rs @@ -54,7 +54,7 @@ use crate::model::{ type CompactionGroupTransaction<'a> = BTreeMapTransaction<'a, CompactionGroupId, CompactionGroup>; impl CompactionGroupManager { - pub(super) async fn new(env: &MetaSrvEnv) -> Result { + pub(crate) async fn new(env: &MetaSrvEnv) -> Result { let default_config = match env.opts.compaction_config.as_ref() { None => CompactionConfigBuilder::new().build(), Some(opt) => CompactionConfigBuilder::with_opt(opt).build(), @@ -62,7 +62,7 @@ impl CompactionGroupManager { Self::new_with_config(env, default_config).await } - pub(super) async fn new_with_config( + pub(crate) async fn new_with_config( env: &MetaSrvEnv, default_config: CompactionConfig, ) -> Result { @@ -231,12 +231,9 @@ impl HummockManager { let mut is_group_init = false; group_id = *new_compaction_group_id .get_or_try_init(|| async { - next_compaction_group_id(&self.env) - .await - .map(|new_group_id| { - is_group_init = true; - new_group_id - }) + next_compaction_group_id(&self.env).await.inspect(|_| { + is_group_init = true; + }) }) .await?; if is_group_init { @@ -428,24 +425,6 @@ impl HummockManager { results } - /// Splits a compaction group into two. The new one will contain `table_ids`. - /// Returns the newly created compaction group id. - pub async fn split_compaction_group( - &self, - parent_group_id: CompactionGroupId, - table_ids: &[StateTableId], - ) -> Result { - let result = self - .move_state_table_to_compaction_group( - parent_group_id, - table_ids, - self.env.opts.partition_vnode_count, - ) - .await?; - - Ok(result) - } - /// move some table to another compaction-group. Create a new compaction group if it does not /// exist. pub async fn move_state_table_to_compaction_group( @@ -651,7 +630,7 @@ impl HummockManager { infos } - pub(super) async fn initial_compaction_group_config_after_load( + pub(crate) async fn initial_compaction_group_config_after_load( &self, versioning_guard: &Versioning, compaction_group_manager: &mut CompactionGroupManager, @@ -675,7 +654,7 @@ impl HummockManager { /// 1. initialize default static compaction group. /// 2. register new table to new compaction group. /// 3. move existent table to new compaction group. -pub(super) struct CompactionGroupManager { +pub(crate) struct CompactionGroupManager { compaction_groups: BTreeMap, default_config: Arc, /// Tables that write limit is trigger for. @@ -709,7 +688,7 @@ impl CompactionGroupManager { } /// Tries to get compaction group config for `compaction_group_id`. 
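// A small sketch of why the rename above (manager/compaction_group_manager.rs ->
// manager/compaction/compaction_group_manager.rs) forces the visibility bump from
// `pub(super)` to `pub(crate)`: after the move, `super` refers to `compaction` rather
// than `manager`, so siblings under `manager` would lose access. Module and function
// names below are illustrative stand-ins only.
mod manager {
    pub mod compaction {
        pub mod compaction_group_manager {
            // With `pub(super)` this would only be visible inside `manager::compaction`;
            // `pub(crate)` keeps it reachable from the rest of the crate, e.g. the
            // sibling module `manager::other_user` below.
            pub(crate) fn try_get_config() -> Option<u32> {
                Some(42)
            }
        }
    }

    pub mod other_user {
        pub fn use_it() -> Option<u32> {
            super::compaction::compaction_group_manager::try_get_config()
        }
    }
}

fn main() {
    assert_eq!(manager::other_user::use_it(), Some(42));
}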
- pub(super) fn try_get_compaction_group_config( + pub(crate) fn try_get_compaction_group_config( &self, compaction_group_id: CompactionGroupId, ) -> Option { @@ -717,7 +696,7 @@ impl CompactionGroupManager { } /// Tries to get compaction group config for `compaction_group_id`. - pub(super) fn default_compaction_config(&self) -> Arc { + pub(crate) fn default_compaction_config(&self) -> Arc { self.default_config.clone() } } @@ -814,7 +793,7 @@ impl<'a> CompactionGroupTransaction<'a> { } /// Tries to get compaction group config for `compaction_group_id`. - pub(super) fn try_get_compaction_group_config( + pub(crate) fn try_get_compaction_group_config( &self, compaction_group_id: CompactionGroupId, ) -> Option<&CompactionGroup> { @@ -822,7 +801,7 @@ impl<'a> CompactionGroupTransaction<'a> { } /// Removes stale group configs. - fn purge(&mut self, existing_groups: HashSet) { + pub fn purge(&mut self, existing_groups: HashSet) { let stale_group = self .tree_ref() .keys() @@ -837,7 +816,7 @@ impl<'a> CompactionGroupTransaction<'a> { } } - pub(super) fn update_compaction_config( + pub(crate) fn update_compaction_config( &mut self, compaction_group_ids: &[CompactionGroupId], config_to_update: &[MutableConfig], diff --git a/src/meta/src/hummock/manager/compaction/compaction_group_schedule.rs b/src/meta/src/hummock/manager/compaction/compaction_group_schedule.rs new file mode 100644 index 0000000000000..93103ca87abf5 --- /dev/null +++ b/src/meta/src/hummock/manager/compaction/compaction_group_schedule.rs @@ -0,0 +1,359 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::ops::DerefMut; + +use itertools::Itertools; +use risingwave_common::catalog::TableId; +use risingwave_hummock_sdk::compact_task::ReportTask; +use risingwave_hummock_sdk::compaction_group::hummock_version_ext::TableGroupInfo; +use risingwave_hummock_sdk::compaction_group::{StateTableId, StaticCompactionGroupId}; +use risingwave_hummock_sdk::version::{GroupDelta, GroupDeltas}; +use risingwave_hummock_sdk::{can_concat, CompactionGroupId}; +use risingwave_pb::hummock::compact_task::TaskStatus; +use risingwave_pb::hummock::{PbGroupMerge, PbStateTableInfoDelta}; +use thiserror_ext::AsReport; + +use crate::hummock::error::{Error, Result}; +use crate::hummock::manager::transaction::HummockVersionTransaction; +use crate::hummock::manager::{commit_multi_var, HummockManager}; +use crate::hummock::metrics_utils::remove_compaction_group_in_sst_stat; + +impl HummockManager { + /// Splits a compaction group into two. The new one will contain `table_ids`. + /// Returns the newly created compaction group id. 
+ pub async fn split_compaction_group( + &self, + parent_group_id: CompactionGroupId, + table_ids: &[StateTableId], + partition_vnode_count: u32, + ) -> Result { + let result = self + .move_state_table_to_compaction_group(parent_group_id, table_ids, partition_vnode_count) + .await?; + + Ok(result) + } + + pub async fn merge_compaction_group( + &self, + group_1: CompactionGroupId, + group_2: CompactionGroupId, + ) -> Result<()> { + let compaction_guard = self.compaction.write().await; + let mut versioning_guard = self.versioning.write().await; + let versioning = versioning_guard.deref_mut(); + // Validate parameters. + if !versioning.current_version.levels.contains_key(&group_1) { + return Err(Error::CompactionGroup(format!("invalid group {}", group_1))); + } + + if !versioning.current_version.levels.contains_key(&group_2) { + return Err(Error::CompactionGroup(format!("invalid group {}", group_2))); + } + + let state_table_info = versioning.current_version.state_table_info.clone(); + let mut member_table_ids_1 = state_table_info + .compaction_group_member_table_ids(group_1) + .iter() + .cloned() + .collect_vec(); + + let mut member_table_ids_2 = state_table_info + .compaction_group_member_table_ids(group_2) + .iter() + .cloned() + .collect_vec(); + + debug_assert!(!member_table_ids_1.is_empty()); + debug_assert!(!member_table_ids_2.is_empty()); + assert!(member_table_ids_1.is_sorted()); + assert!(member_table_ids_2.is_sorted()); + + // Make sure `member_table_ids_1` is smaller than `member_table_ids_2` + let (left_group_id, right_group_id) = + if member_table_ids_1.first().unwrap() < member_table_ids_2.first().unwrap() { + (group_1, group_2) + } else { + std::mem::swap(&mut member_table_ids_1, &mut member_table_ids_2); + (group_2, group_1) + }; + + // We can only merge two groups with non-overlapping member table ids + if member_table_ids_1.last().unwrap() >= member_table_ids_2.first().unwrap() { + return Err(Error::CompactionGroup(format!( + "invalid merge group_1 {} group_2 {}", + left_group_id, right_group_id + ))); + } + + let combined_member_table_ids = member_table_ids_1 + .iter() + .chain(member_table_ids_2.iter()) + .collect_vec(); + assert!(combined_member_table_ids.is_sorted()); + + // check duplicated sst_id + let mut sst_id_set = HashSet::new(); + for sst_id in versioning + .current_version + .get_sst_ids_by_group_id(left_group_id) + .chain( + versioning + .current_version + .get_sst_ids_by_group_id(right_group_id), + ) + { + if !sst_id_set.insert(sst_id) { + return Err(Error::CompactionGroup(format!( + "invalid merge group_1 {} group_2 {} duplicated sst_id {}", + left_group_id, right_group_id, sst_id + ))); + } + } + + // check branched sst on non-overlap level + { + let left_levels = versioning + .current_version + .get_compaction_group_levels(group_1); + + let right_levels = versioning + .current_version + .get_compaction_group_levels(group_2); + + // we can not check the l0 sub level, because the sub level id will be rewritten when merge + // This check will ensure that other non-overlapping level ssts can be concat and that the key_range is correct. 
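// A simplified, self-contained analogue of the `can_concat` guard referred to in the
// comment above: within a non-overlapping level, SSTs may be concatenated only if their
// key ranges are ordered and pairwise disjoint. Key ranges here are plain integer pairs
// instead of `SstableInfo` key ranges, so this is a sketch of the idea, not the real check.
#[derive(Clone, Copy)]
struct KeyRange {
    left: u64,
    right: u64, // inclusive
}

fn can_concat(ssts: &[KeyRange]) -> bool {
    ssts.windows(2).all(|pair| pair[0].right < pair[1].left)
}

fn main() {
    // Checking only the boundary pair of the two groups is enough, because each
    // group's own level is already known to be well-ordered internally.
    let left_last = KeyRange { left: 10, right: 20 };
    let right_first = KeyRange { left: 21, right: 30 };
    assert!(can_concat(&[left_last, right_first]));

    let overlapping = KeyRange { left: 15, right: 30 };
    assert!(!can_concat(&[left_last, overlapping]));
}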
+ let max_level = std::cmp::max(left_levels.levels.len(), right_levels.levels.len()); + for level_idx in 1..=max_level { + let left_level = left_levels.get_level(level_idx); + let right_level = right_levels.get_level(level_idx); + if left_level.table_infos.is_empty() || right_level.table_infos.is_empty() { + continue; + } + + let left_last_sst = left_level.table_infos.last().unwrap().clone(); + let right_first_sst = right_level.table_infos.first().unwrap().clone(); + let left_sst_id = left_last_sst.sst_id; + let right_sst_id = right_first_sst.sst_id; + let left_obj_id = left_last_sst.object_id; + let right_obj_id = right_first_sst.object_id; + + // Since the sst key_range within a group is legal, we only need to check the ssts adjacent to the two groups. + if !can_concat(&[left_last_sst, right_first_sst]) { + return Err(Error::CompactionGroup(format!( + "invalid merge group_1 {} group_2 {} level_idx {} left_last_sst_id {} right_first_sst_id {} left_obj_id {} right_obj_id {}", + left_group_id, right_group_id, level_idx, left_sst_id, right_sst_id, left_obj_id, right_obj_id + ))); + } + } + } + + let mut version = HummockVersionTransaction::new( + &mut versioning.current_version, + &mut versioning.hummock_version_deltas, + self.env.notification_manager(), + &self.metrics, + ); + let mut new_version_delta = version.new_delta(); + + let target_compaction_group_id = { + // merge right_group_id to left_group_id and remove right_group_id + new_version_delta.group_deltas.insert( + left_group_id, + GroupDeltas { + group_deltas: vec![GroupDelta::GroupMerge(PbGroupMerge { + left_group_id, + right_group_id, + })], + }, + ); + left_group_id + }; + + // TODO: remove compaciton group_id from state_table_info + // rewrite compaction_group_id for all tables + new_version_delta.with_latest_version(|version, new_version_delta| { + for table_id in combined_member_table_ids { + let table_id = TableId::new(table_id.table_id()); + let info = version + .state_table_info + .info() + .get(&table_id) + .expect("have check exist previously"); + assert!(new_version_delta + .state_table_info_delta + .insert( + table_id, + PbStateTableInfoDelta { + committed_epoch: info.committed_epoch, + safe_epoch: info.safe_epoch, + compaction_group_id: target_compaction_group_id, + } + ) + .is_none()); + } + }); + + { + let mut compaction_group_manager = self.compaction_group_manager.write().await; + let mut compaction_groups_txn = compaction_group_manager.start_compaction_groups_txn(); + + // for metrics reclaim + { + let right_group_max_level = new_version_delta + .latest_version() + .get_compaction_group_levels(right_group_id) + .levels + .len(); + + remove_compaction_group_in_sst_stat( + &self.metrics, + right_group_id, + right_group_max_level, + ); + } + + new_version_delta.pre_apply(); + + // remove right_group_id + compaction_groups_txn.remove(right_group_id); + commit_multi_var!(self.meta_store_ref(), version, compaction_groups_txn)?; + } + + // Instead of handling DeltaType::GroupConstruct for time travel, simply enforce a version snapshot. 
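// A minimal sketch of the bookkeeping effect of the `GroupMerge` applied above, using
// plain maps instead of version deltas: every member table of the right group is
// re-pointed at the left group, and the right group's config is dropped. The types are
// simplified stand-ins for the state-table info and compaction-group transaction.
use std::collections::HashMap;

type TableId = u32;
type GroupId = u64;

fn apply_group_merge(
    table_to_group: &mut HashMap<TableId, GroupId>,
    group_configs: &mut HashMap<GroupId, String>,
    left: GroupId,
    right: GroupId,
) {
    for group in table_to_group.values_mut() {
        if *group == right {
            *group = left;
        }
    }
    // Mirrors `compaction_groups_txn.remove(right_group_id)` above.
    group_configs.remove(&right);
}

fn main() {
    let mut tables = HashMap::from([(100, 2u64), (101, 2), (102, 3)]);
    let mut configs = HashMap::from([(2u64, "left".to_string()), (3, "right".to_string())]);
    apply_group_merge(&mut tables, &mut configs, 2, 3);
    assert_eq!(tables[&102], 2);
    assert!(!configs.contains_key(&3));
}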
+ versioning.mark_next_time_travel_version_snapshot(); + + // cancel tasks + let mut canceled_tasks = vec![]; + // after merge, all tasks in right_group_id should be canceled + // otherwise, pending size calculation by level handler will make some mistake + for task_assignment in compaction_guard.compact_task_assignment.values() { + if let Some(task) = task_assignment.compact_task.as_ref() { + let need_cancel = task.compaction_group_id == right_group_id; + if need_cancel { + canceled_tasks.push(ReportTask { + task_id: task.task_id, + task_status: TaskStatus::ManualCanceled, + table_stats_change: HashMap::default(), + sorted_output_ssts: vec![], + }); + } + } + } + + drop(versioning_guard); + drop(compaction_guard); + self.report_compact_tasks(canceled_tasks).await?; + + Ok(()) + } + + pub async fn try_split_compaction_group( + &self, + table_write_throughput: &HashMap>, + checkpoint_secs: u64, + group: &TableGroupInfo, + created_tables: &HashSet, + ) { + // split high throughput table to dedicated compaction group + for (table_id, table_size) in &group.table_statistic { + self.try_move_table_to_dedicated_cg( + table_write_throughput, + table_id, + table_size, + !created_tables.contains(table_id), + checkpoint_secs, + group.group_id, + group.group_size, + ) + .await; + } + } + + pub async fn try_move_table_to_dedicated_cg( + &self, + table_write_throughput: &HashMap>, + table_id: &u32, + table_size: &u64, + is_creating_table: bool, + checkpoint_secs: u64, + parent_group_id: u64, + group_size: u64, + ) { + let default_group_id: CompactionGroupId = StaticCompactionGroupId::StateDefault.into(); + let mv_group_id: CompactionGroupId = StaticCompactionGroupId::MaterializedView.into(); + let partition_vnode_count = self.env.opts.partition_vnode_count; + let window_size = + self.env.opts.table_info_statistic_history_times / (checkpoint_secs as usize); + + let mut is_high_write_throughput = false; + let mut is_low_write_throughput = true; + if let Some(history) = table_write_throughput.get(table_id) { + if history.len() >= window_size { + is_high_write_throughput = history.iter().all(|throughput| { + *throughput / checkpoint_secs > self.env.opts.table_write_throughput_threshold + }); + is_low_write_throughput = history.iter().any(|throughput| { + *throughput / checkpoint_secs < self.env.opts.min_table_split_write_throughput + }); + } + } + + let state_table_size = *table_size; + + // 1. Avoid splitting a creating table + // 2. Avoid splitting a is_low_write_throughput creating table + // 3. Avoid splitting a non-high throughput medium-sized table + if is_creating_table + || (is_low_write_throughput) + || (state_table_size < self.env.opts.min_table_split_size && !is_high_write_throughput) + { + return; + } + + // do not split a large table and a small table because it would increase IOPS + // of small table. 
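// A small sketch of the throughput classification used in `try_move_table_to_dedicated_cg`
// above, with the history window as a plain slice and the thresholds passed in explicitly
// (in the real code they come from `self.env.opts`). Parameter names are illustrative.
fn classify_throughput(
    history: &[u64],     // bytes written per checkpoint interval
    checkpoint_secs: u64,
    window_size: usize,
    high_threshold: u64, // e.g. table_write_throughput_threshold
    low_threshold: u64,  // e.g. min_table_split_write_throughput
) -> (bool, bool) {
    let mut is_high = false;
    let mut is_low = true; // default mirrors the code above: treat an unknown table as low
    if history.len() >= window_size {
        is_high = history.iter().all(|t| *t / checkpoint_secs > high_threshold);
        is_low = history.iter().any(|t| *t / checkpoint_secs < low_threshold);
    }
    (is_high, is_low)
}

fn main() {
    // Sustained high write rate over the whole window => candidate for a dedicated
    // compaction group; a single slow interval keeps the table classified as "low".
    assert_eq!(classify_throughput(&[900, 950, 1000], 1, 3, 800, 100), (true, false));
    assert_eq!(classify_throughput(&[900, 50, 1000], 1, 3, 800, 100), (false, true));
}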
+ if parent_group_id != default_group_id && parent_group_id != mv_group_id { + let rest_group_size = group_size - state_table_size; + if rest_group_size < state_table_size + && rest_group_size < self.env.opts.min_table_split_size + { + return; + } + } + + let ret = self + .move_state_table_to_compaction_group( + parent_group_id, + &[*table_id], + partition_vnode_count, + ) + .await; + match ret { + Ok(new_group_id) => { + tracing::info!("move state table [{}] from group-{} to group-{} success table_vnode_partition_count {:?}", table_id, parent_group_id, new_group_id, partition_vnode_count); + } + Err(e) => { + tracing::info!( + error = %e.as_report(), + "failed to move state table [{}] from group-{}", + table_id, + parent_group_id, + ) + } + } + } +} diff --git a/src/meta/src/hummock/manager/compaction.rs b/src/meta/src/hummock/manager/compaction/mod.rs similarity index 95% rename from src/meta/src/hummock/manager/compaction.rs rename to src/meta/src/hummock/manager/compaction/mod.rs index 4696c07452018..8f2ecc33c60b0 100644 --- a/src/meta/src/hummock/manager/compaction.rs +++ b/src/meta/src/hummock/manager/compaction/mod.rs @@ -27,7 +27,7 @@ // limitations under the License. use std::cmp::min; -use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use std::time::{Instant, SystemTime}; @@ -43,7 +43,6 @@ use rand::thread_rng; use risingwave_common::util::epoch::Epoch; use risingwave_hummock_sdk::compact_task::{CompactTask, ReportTask}; use risingwave_hummock_sdk::compaction_group::hummock_version_ext::HummockLevelsExt; -use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; use risingwave_hummock_sdk::key_range::KeyRange; use risingwave_hummock_sdk::level::{InputLevel, Level, Levels}; use risingwave_hummock_sdk::sstable_info::SstableInfo; @@ -96,6 +95,9 @@ use crate::hummock::{commit_multi_var, start_measure_real_process_timer, Hummock use crate::manager::{MetadataManager, META_NODE_ID}; use crate::model::BTreeMapTransaction; +pub mod compaction_group_manager; +pub mod compaction_group_schedule; + const MAX_SKIP_TIMES: usize = 8; const MAX_REPORT_COUNT: usize = 16; @@ -1567,80 +1569,6 @@ impl HummockManager { .retain(|table_id, _| compact_task.existing_table_ids.contains(table_id)); } } - - pub async fn try_move_table_to_dedicated_cg( - &self, - table_write_throughput: &HashMap>, - table_id: &u32, - table_size: &u64, - is_creating_table: bool, - checkpoint_secs: u64, - parent_group_id: u64, - group_size: u64, - ) { - let default_group_id: CompactionGroupId = StaticCompactionGroupId::StateDefault.into(); - let mv_group_id: CompactionGroupId = StaticCompactionGroupId::MaterializedView.into(); - let partition_vnode_count = self.env.opts.partition_vnode_count; - let window_size = - self.env.opts.table_info_statistic_history_times / (checkpoint_secs as usize); - - let mut is_high_write_throughput = false; - let mut is_low_write_throughput = true; - if let Some(history) = table_write_throughput.get(table_id) { - if history.len() >= window_size { - is_high_write_throughput = history.iter().all(|throughput| { - *throughput / checkpoint_secs > self.env.opts.table_write_throughput_threshold - }); - is_low_write_throughput = history.iter().any(|throughput| { - *throughput / checkpoint_secs < self.env.opts.min_table_split_write_throughput - }); - } - } - - let state_table_size = *table_size; - - // 1. Avoid splitting a creating table - // 2. 
Avoid splitting a is_low_write_throughput creating table - // 3. Avoid splitting a non-high throughput medium-sized table - if is_creating_table - || (is_low_write_throughput) - || (state_table_size < self.env.opts.min_table_split_size && !is_high_write_throughput) - { - return; - } - - // do not split a large table and a small table because it would increase IOPS - // of small table. - if parent_group_id != default_group_id && parent_group_id != mv_group_id { - let rest_group_size = group_size - state_table_size; - if rest_group_size < state_table_size - && rest_group_size < self.env.opts.min_table_split_size - { - return; - } - } - - let ret = self - .move_state_table_to_compaction_group( - parent_group_id, - &[*table_id], - partition_vnode_count, - ) - .await; - match ret { - Ok(new_group_id) => { - tracing::info!("move state table [{}] from group-{} to group-{} success table_vnode_partition_count {:?}", table_id, parent_group_id, new_group_id, partition_vnode_count); - } - Err(e) => { - tracing::info!( - error = %e.as_report(), - "failed to move state table [{}] from group-{}", - table_id, - parent_group_id, - ) - } - } - } } #[cfg(any(test, feature = "test"))] diff --git a/src/meta/src/hummock/manager/mod.rs b/src/meta/src/hummock/manager/mod.rs index ded8d507dbffc..d43b1cc6f5421 100644 --- a/src/meta/src/hummock/manager/mod.rs +++ b/src/meta/src/hummock/manager/mod.rs @@ -50,7 +50,6 @@ use crate::manager::{MetaSrvEnv, MetaStoreImpl, MetadataManager}; use crate::model::{ClusterId, MetadataModel, MetadataModelError}; use crate::rpc::metrics::MetaMetrics; -mod compaction_group_manager; mod context; mod gc; mod tests; diff --git a/src/meta/src/hummock/manager/tests.rs b/src/meta/src/hummock/manager/tests.rs index 56b4836f585a1..d0183d84d23c5 100644 --- a/src/meta/src/hummock/manager/tests.rs +++ b/src/meta/src/hummock/manager/tests.rs @@ -17,7 +17,6 @@ use std::borrow::Borrow; use std::cmp::Ordering; use std::collections::HashMap; -use std::sync::Arc; use itertools::Itertools; use prometheus::Registry; @@ -1327,7 +1326,22 @@ async fn test_split_compaction_group_on_commit() { sst_size: 100, ..Default::default() }, - table_stats: Default::default(), + table_stats: HashMap::from([ + ( + 100, + TableStats { + total_compressed_size: 50, + ..Default::default() + }, + ), + ( + 101, + TableStats { + total_compressed_size: 50, + ..Default::default() + }, + ), + ]), }; hummock_manager .commit_epoch_for_test(30, vec![sst_1], HashMap::from([(10, context_id)])) @@ -1378,13 +1392,13 @@ async fn test_split_compaction_group_on_demand_basic() { assert_eq!(original_groups, vec![2, 3]); let err = hummock_manager - .split_compaction_group(100, &[0]) + .split_compaction_group(100, &[0], 0) .await .unwrap_err(); assert_eq!("compaction group error: invalid group 100", err.to_string()); let err = hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap_err(); assert_eq!( @@ -1446,7 +1460,7 @@ async fn test_split_compaction_group_on_demand_basic() { .unwrap(); let err = hummock_manager - .split_compaction_group(2, &[100, 101]) + .split_compaction_group(2, &[100, 101], 0) .await .unwrap_err(); assert_eq!( @@ -1462,7 +1476,7 @@ async fn test_split_compaction_group_on_demand_basic() { .unwrap(); hummock_manager - .split_compaction_group(2, &[100, 101]) + .split_compaction_group(2, &[100, 101], 0) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; @@ -1530,7 +1544,7 @@ async fn test_split_compaction_group_on_demand_non_trivial() { 
.unwrap(); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); @@ -1658,7 +1672,7 @@ async fn test_split_compaction_group_trivial_expired() { .unwrap(); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); @@ -1830,7 +1844,7 @@ async fn test_split_compaction_group_on_demand_bottom_levels() { ); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; @@ -1939,7 +1953,7 @@ async fn test_compaction_task_expiration_due_to_split_group() { let compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 2); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); @@ -2023,7 +2037,7 @@ async fn test_move_tables_between_compaction_group() { ); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; @@ -2122,11 +2136,9 @@ async fn test_partition_level() { .level0_overlapping_sub_level_compact_level_count(3) .build(); let registry = Registry::new(); - let (_env, hummock_manager, _, worker_node) = + let (env, hummock_manager, _, worker_node) = setup_compute_env_with_metric(80, config.clone(), Some(MetaMetrics::for_test(®istry))) .await; - let config = Arc::new(config); - let context_id = worker_node.id; hummock_manager @@ -2161,7 +2173,7 @@ async fn test_partition_level() { .unwrap()); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], env.opts.partition_vnode_count) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; @@ -2303,7 +2315,7 @@ async fn test_unregister_moved_table() { .unwrap(); let new_group_id = hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); assert_ne!(new_group_id, 2); diff --git a/src/meta/src/hummock/manager/time_travel.rs b/src/meta/src/hummock/manager/time_travel.rs index 61c1e820fab0c..0b6ef73e52605 100644 --- a/src/meta/src/hummock/manager/time_travel.rs +++ b/src/meta/src/hummock/manager/time_travel.rs @@ -16,6 +16,7 @@ use std::collections::{HashMap, HashSet, VecDeque}; use anyhow::anyhow; use itertools::Itertools; +use risingwave_common::catalog::TableId; use risingwave_common::system_param::reader::SystemParamsRead; use risingwave_common::util::epoch::Epoch; use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; @@ -36,7 +37,7 @@ use risingwave_pb::hummock::{PbHummockVersion, PbHummockVersionDelta}; use sea_orm::sea_query::OnConflict; use sea_orm::ActiveValue::Set; use sea_orm::{ - ColumnTrait, DatabaseTransaction, EntityTrait, QueryFilter, QueryOrder, QuerySelect, + ColumnTrait, Condition, DatabaseTransaction, EntityTrait, QueryFilter, QueryOrder, QuerySelect, TransactionTrait, }; @@ -101,6 +102,7 @@ impl HummockManager { .lt(risingwave_meta_model_v2::Epoch::try_from(epoch_watermark).unwrap()), ) .order_by_desc(hummock_epoch_to_version::Column::Epoch) + .order_by_asc(hummock_epoch_to_version::Column::VersionId) .one(&txn) .await?; let Some(version_watermark) = version_watermark else { @@ -275,9 +277,19 @@ impl HummockManager { /// The version is retrieved from `hummock_epoch_to_version`, selecting the entry with the largest epoch that's lte 
`query_epoch`. /// /// The resulted version is complete, i.e. with correct `SstableInfo`. - pub async fn epoch_to_version(&self, query_epoch: HummockEpoch) -> Result { + pub async fn epoch_to_version( + &self, + query_epoch: HummockEpoch, + table_id: u32, + ) -> Result { let sql_store = self.sql_store().ok_or_else(require_sql_meta_store_err)?; let epoch_to_version = hummock_epoch_to_version::Entity::find() + .filter( + Condition::any() + .add(hummock_epoch_to_version::Column::TableId.eq(i64::from(table_id))) + // for backward compatibility + .add(hummock_epoch_to_version::Column::TableId.eq(0)), + ) .filter( hummock_epoch_to_version::Column::Epoch .lte(risingwave_meta_model_v2::Epoch::try_from(query_epoch).unwrap()), @@ -362,7 +374,19 @@ impl HummockManager { delta: HummockVersionDelta, group_parents: &HashMap, skip_sst_ids: &HashSet, + tables_to_commit: impl Iterator, + committed_epoch: u64, ) -> Result>> { + let select_groups = group_parents + .iter() + .filter_map(|(cg_id, _)| { + if should_ignore_group(find_root_group(*cg_id, group_parents)) { + None + } else { + Some(*cg_id) + } + }) + .collect::>(); async fn write_sstable_infos( sst_infos: impl Iterator, txn: &DatabaseTransaction, @@ -388,35 +412,23 @@ impl HummockManager { Ok(count) } - let epoch = delta.visible_table_committed_epoch(); - let version_id: u64 = delta.id.to_u64(); - let m = hummock_epoch_to_version::ActiveModel { - epoch: Set(epoch.try_into().unwrap()), - version_id: Set(version_id.try_into().unwrap()), - }; - hummock_epoch_to_version::Entity::insert(m) - .on_conflict( - OnConflict::column(hummock_epoch_to_version::Column::Epoch) - // The existing row must be inserted by the common committed epoch of created MVs. - // While any duplicate row must be inserted by MVs still in creation. - // So the row shouldn't be updated. - .do_nothing() - .to_owned(), - ) - .do_nothing() - .exec(txn) - .await?; + for (table_id, cg_id) in tables_to_commit { + if !select_groups.contains(cg_id) { + continue; + } + let version_id: u64 = delta.id.to_u64(); + let m = hummock_epoch_to_version::ActiveModel { + epoch: Set(committed_epoch.try_into().unwrap()), + table_id: Set(table_id.table_id.into()), + version_id: Set(version_id.try_into().unwrap()), + }; + // There should be no conflict rows. 
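// A minimal in-memory sketch of the lookup semantics the SQL above implements: for a
// given table, pick the mapping row with the largest epoch that is <= the query epoch,
// while also accepting the legacy rows stored under table id 0 for backward
// compatibility. A BTreeMap keyed by (table_id, epoch) stands in for the
// `hummock_epoch_to_version` table; names are illustrative only.
use std::collections::BTreeMap;

type Epoch = u64;
type VersionId = u64;

fn epoch_to_version(
    per_table: &BTreeMap<(u32, Epoch), VersionId>,
    table_id: u32,
    query_epoch: Epoch,
) -> Option<VersionId> {
    let lookup = |tid: u32| {
        per_table
            .range((tid, 0)..=(tid, query_epoch))
            .next_back()
            .map(|(&(_, epoch), &v)| (epoch, v))
    };
    // The query accepts rows for either the given table id or the legacy table id 0,
    // and keeps the candidate with the largest epoch <= query_epoch.
    [lookup(table_id), lookup(0)]
        .into_iter()
        .flatten()
        .max_by_key(|(epoch, _)| *epoch)
        .map(|(_, v)| v)
}

fn main() {
    let mut m = BTreeMap::new();
    m.insert((0u32, 100u64), 1u64); // legacy row keyed by table id 0
    m.insert((7, 200), 2);
    m.insert((7, 300), 3);
    assert_eq!(epoch_to_version(&m, 7, 250), Some(2));
    assert_eq!(epoch_to_version(&m, 7, 150), Some(1)); // falls back to the legacy row
    assert_eq!(epoch_to_version(&m, 9, 50), None);
}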
+ hummock_epoch_to_version::Entity::insert(m) + .exec(txn) + .await?; + } + let mut version_sst_ids = None; - let select_groups = group_parents - .iter() - .filter_map(|(cg_id, _)| { - if should_ignore_group(find_root_group(*cg_id, group_parents)) { - None - } else { - Some(*cg_id) - } - }) - .collect::>(); if let Some(version) = version { version_sst_ids = Some( version diff --git a/src/meta/src/hummock/manager/timer_task.rs b/src/meta/src/hummock/manager/timer_task.rs index ec0f77ac88a8a..94537e9c33e1f 100644 --- a/src/meta/src/hummock/manager/timer_task.rs +++ b/src/meta/src/hummock/manager/timer_task.rs @@ -43,7 +43,7 @@ impl HummockManager { const COMPACTION_HEARTBEAT_PERIOD_SEC: u64 = 1; pub enum HummockTimerEvent { - GroupSplit, + GroupSchedule, CheckDeadTask, Report, CompactionHeartBeatExpiredCheck, @@ -158,7 +158,7 @@ impl HummockManager { .set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); let split_group_trigger = IntervalStream::new(split_group_trigger_interval) - .map(|_| HummockTimerEvent::GroupSplit); + .map(|_| HummockTimerEvent::GroupSchedule); triggers.push(Box::pin(split_group_trigger)); } @@ -189,12 +189,12 @@ impl HummockManager { hummock_manager.check_dead_task().await; } - HummockTimerEvent::GroupSplit => { + HummockTimerEvent::GroupSchedule => { if hummock_manager.env.opts.compaction_deterministic_test { continue; } - hummock_manager.on_handle_check_split_multi_group().await; + hummock_manager.on_handle_schedule_group().await; } HummockTimerEvent::Report => { @@ -443,7 +443,7 @@ impl HummockManager { /// throughput keep larger than `table_write_throughput_threshold` for a long time. /// * For state-table whose throughput less than `min_table_split_write_throughput`, do not /// increase it size of base-level. - async fn on_handle_check_split_multi_group(&self) { + async fn on_handle_schedule_group(&self) { let params = self.env.system_params_reader().await; let barrier_interval_ms = params.barrier_interval_ms() as u64; let checkpoint_secs = std::cmp::max( @@ -469,18 +469,13 @@ impl HummockManager { continue; } - for (table_id, table_size) in &group.table_statistic { - self.try_move_table_to_dedicated_cg( - &table_write_throughput, - table_id, - table_size, - !created_tables.contains(table_id), - checkpoint_secs, - group.group_id, - group.group_size, - ) - .await; - } + self.try_split_compaction_group( + &table_write_throughput, + checkpoint_secs, + group, + &created_tables, + ) + .await; } } diff --git a/src/meta/src/hummock/manager/transaction.rs b/src/meta/src/hummock/manager/transaction.rs index aa0ead3cef2aa..9a795608f7e1a 100644 --- a/src/meta/src/hummock/manager/transaction.rs +++ b/src/meta/src/hummock/manager/transaction.rs @@ -122,7 +122,7 @@ impl<'a> HummockVersionTransaction<'a> { is_visible_table_committed_epoch: bool, new_compaction_group: Option<(CompactionGroupId, CompactionConfig)>, commit_sstables: BTreeMap>, - new_table_ids: HashMap, + new_table_ids: &HashMap, new_table_watermarks: HashMap, change_log_delta: HashMap, ) -> HummockVersionDelta { @@ -175,7 +175,7 @@ impl<'a> HummockVersionTransaction<'a> { // update state table info new_version_delta.with_latest_version(|version, delta| { - for (table_id, cg_id) in &new_table_ids { + for (table_id, cg_id) in new_table_ids { assert!( !version.state_table_info.info().contains_key(table_id), "newly added table exists previously: {:?}", diff --git a/src/meta/src/hummock/mock_hummock_meta_client.rs b/src/meta/src/hummock/mock_hummock_meta_client.rs index 1cdd8547c8247..499d9df0958c4 100644 
--- a/src/meta/src/hummock/mock_hummock_meta_client.rs +++ b/src/meta/src/hummock/mock_hummock_meta_client.rs @@ -348,7 +348,11 @@ impl HummockMetaClient for MockHummockMetaClient { )) } - async fn get_version_by_epoch(&self, _epoch: HummockEpoch) -> Result { + async fn get_version_by_epoch( + &self, + _epoch: HummockEpoch, + _table_id: u32, + ) -> Result { unimplemented!() } } diff --git a/src/meta/src/lib.rs b/src/meta/src/lib.rs index 61e29b2fb1129..eab9dd1287ebf 100644 --- a/src/meta/src/lib.rs +++ b/src/meta/src/lib.rs @@ -15,7 +15,6 @@ #![allow(clippy::derive_partial_eq_without_eq)] #![feature(trait_alias)] #![feature(type_alias_impl_trait)] -#![feature(lint_reasons)] #![feature(map_try_insert)] #![feature(extract_if)] #![feature(hash_extract_if)] diff --git a/src/meta/src/manager/catalog/mod.rs b/src/meta/src/manager/catalog/mod.rs index 12c1596841f67..4db6711862810 100644 --- a/src/meta/src/manager/catalog/mod.rs +++ b/src/meta/src/manager/catalog/mod.rs @@ -1811,15 +1811,11 @@ impl CatalogManager { all_table_ids.extend(index_table_ids.iter().cloned()); for index_table_id in &index_table_ids { - let internal_table_ids = match fragment_manager + let internal_table_ids = fragment_manager .select_table_fragments_by_table_id(&(index_table_id.into())) .await .map(|fragments| fragments.internal_table_ids()) - { - Ok(v) => v, - // Handle backwards compat with no state persistence. - Err(_) => vec![], - }; + .unwrap_or_default(); // 1 should be used by table scan. if internal_table_ids.len() == 1 { @@ -1901,15 +1897,11 @@ impl CatalogManager { } all_table_ids.insert(index.index_table_id); - let internal_table_ids = match fragment_manager + let internal_table_ids = fragment_manager .select_table_fragments_by_table_id(&(index.index_table_id.into())) .await .map(|fragments| fragments.internal_table_ids()) - { - Ok(v) => v, - // Handle backwards compat with no state persistence. - Err(_) => vec![], - }; + .unwrap_or_default(); // 1 should be used by table scan. if internal_table_ids.len() == 1 { diff --git a/src/meta/src/manager/catalog/user.rs b/src/meta/src/manager/catalog/user.rs index 81181b0fc1e17..68e5e31395c0d 100644 --- a/src/meta/src/manager/catalog/user.rs +++ b/src/meta/src/manager/catalog/user.rs @@ -74,6 +74,7 @@ impl UserManager { .values() .map(|connection| connection.owner), ) + .chain(database.secrets.values().map(|secret| secret.owner)) .for_each(|owner_id| user_manager.increase_ref(owner_id)); Ok(user_manager) diff --git a/src/meta/src/manager/env.rs b/src/meta/src/manager/env.rs index 22f88bd9c0a75..ed18be6b0f483 100644 --- a/src/meta/src/manager/env.rs +++ b/src/meta/src/manager/env.rs @@ -294,6 +294,10 @@ pub struct MetaOpts { pub temp_secret_file_dir: String, pub table_info_statistic_history_times: usize, + + // Cluster limits + pub actor_cnt_per_worker_parallelism_hard_limit: usize, + pub actor_cnt_per_worker_parallelism_soft_limit: usize, } impl MetaOpts { @@ -358,6 +362,8 @@ impl MetaOpts { secret_store_private_key: Some("0123456789abcdef".as_bytes().to_vec()), temp_secret_file_dir: "./secrets".to_string(), table_info_statistic_history_times: 240, + actor_cnt_per_worker_parallelism_hard_limit: usize::MAX, + actor_cnt_per_worker_parallelism_soft_limit: usize::MAX, } } } @@ -408,9 +414,11 @@ impl MetaSrvEnv { (ClusterId::new(), true) }; - // For new clusters, the name of the object store needs to be prefixed according to the object id. - // For old clusters, the prefix is ​​not divided for the sake of compatibility. 
- + // For new clusters: + // - the name of the object store needs to be prefixed according to the object id. + // + // For old clusters + // - the prefix is ​​not divided for the sake of compatibility. init_system_params.use_new_object_prefix_strategy = Some(cluster_first_launch); let system_params_manager = Arc::new( SystemParamsManager::new( @@ -455,7 +463,7 @@ impl MetaSrvEnv { } } MetaStoreImpl::Sql(sql_meta_store) => { - let is_sql_backend_cluster_first_launch = + let cluster_first_launch = is_first_launch_for_sql_backend_cluster(sql_meta_store).await?; // Try to upgrade if any new model changes are added. Migrator::up(&sql_meta_store.conn, None) @@ -469,10 +477,14 @@ impl MetaSrvEnv { .await? .map(|c| c.cluster_id.to_string().into()) .unwrap(); - init_system_params.use_new_object_prefix_strategy = - Some(is_sql_backend_cluster_first_launch); - // For new clusters, the name of the object store needs to be prefixed according to the object id. - // For old clusters, the prefix is ​​not divided for the sake of compatibility. + + // For new clusters: + // - the name of the object store needs to be prefixed according to the object id. + // + // For old clusters + // - the prefix is ​​not divided for the sake of compatibility. + init_system_params.use_new_object_prefix_strategy = Some(cluster_first_launch); + let system_param_controller = Arc::new( SystemParamsController::new( sql_meta_store.clone(), diff --git a/src/meta/src/manager/metadata.rs b/src/meta/src/manager/metadata.rs index 52fc811787d30..935d4773865ed 100644 --- a/src/meta/src/manager/metadata.rs +++ b/src/meta/src/manager/metadata.rs @@ -917,6 +917,7 @@ impl MetadataManager { &self, job: &StreamingJob, ) -> MetaResult { + tracing::debug!("wait_streaming_job_finished: {job:?}"); match self { MetadataManager::V1(mgr) => mgr.wait_streaming_job_finished(job).await, MetadataManager::V2(mgr) => mgr.wait_streaming_job_finished(job.id() as _).await, diff --git a/src/meta/src/manager/sink_coordination/coordinator_worker.rs b/src/meta/src/manager/sink_coordination/coordinator_worker.rs index 8409e714852c2..8ed063e5325c0 100644 --- a/src/meta/src/manager/sink_coordination/coordinator_worker.rs +++ b/src/meta/src/manager/sink_coordination/coordinator_worker.rs @@ -12,64 +12,191 @@ // See the License for the specific language governing permissions and // limitations under the License. 
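// The rewritten coordinator worker below keys commit readiness on vnode coverage: each
// writer reports a vnode bitmap, overlapping bitmaps are rejected, and an epoch may
// commit once the union covers every vnode. A minimal sketch of that check, with a
// plain Vec<bool> in place of `risingwave_common::bitmap::Bitmap` and a tiny vnode count.
const VNODE_COUNT: usize = 8;

struct Coverage {
    covered: Vec<bool>,
}

impl Coverage {
    fn new() -> Self {
        Self { covered: vec![false; VNODE_COUNT] }
    }

    /// Returns an error on overlap, mirroring the "duplicate vnode" check below.
    fn add(&mut self, writer_vnodes: &[usize]) -> Result<(), String> {
        for &vnode in writer_vnodes {
            if self.covered[vnode] {
                return Err(format!("duplicate vnode {vnode}"));
            }
        }
        for &vnode in writer_vnodes {
            self.covered[vnode] = true;
        }
        Ok(())
    }

    fn can_commit(&self) -> bool {
        self.covered.iter().all(|c| *c)
    }
}

fn main() {
    let mut coverage = Coverage::new();
    coverage.add(&[0, 1, 2, 3]).unwrap();
    assert!(!coverage.can_commit());
    assert!(coverage.add(&[3, 4]).is_err()); // overlaps an earlier writer
    coverage.add(&[4, 5, 6, 7]).unwrap();
    assert!(coverage.can_commit());
}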
-use std::collections::HashSet; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::future::{poll_fn, Future}; use std::pin::pin; +use std::task::Poll; +use std::time::{Duration, Instant}; use anyhow::anyhow; use futures::future::{select, Either}; -use futures::stream::FuturesUnordered; -use futures::{StreamExt, TryStreamExt}; +use futures::pin_mut; +use itertools::Itertools; use risingwave_common::bitmap::Bitmap; -use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; +use risingwave_common::hash::VirtualNode; use risingwave_connector::dispatch_sink; use risingwave_connector::sink::{build_sink, Sink, SinkCommitCoordinator, SinkParam}; -use risingwave_pb::connector_service::coordinate_request::CommitRequest; -use risingwave_pb::connector_service::coordinate_response::{ - CommitResponse, StartCoordinationResponse, -}; -use risingwave_pb::connector_service::{ - coordinate_request, coordinate_response, CoordinateRequest, CoordinateResponse, SinkMetadata, -}; +use risingwave_pb::connector_service::SinkMetadata; use thiserror_ext::AsReport; +use tokio::select; use tokio::sync::mpsc::UnboundedReceiver; +use tokio::time::sleep; use tonic::Status; use tracing::{error, warn}; -use crate::manager::sink_coordination::{ - NewSinkWriterRequest, SinkCoordinatorResponseSender, SinkWriterRequestStream, -}; +use crate::manager::sink_coordination::handle::SinkWriterCoordinationHandle; -macro_rules! send_await_with_err_check { - ($tx:expr, $msg:expr) => { - if $tx.send($msg).await.is_err() { - error!("unable to send msg"); +async fn run_future_with_periodic_fn( + future: F, + interval: Duration, + mut f: impl FnMut(), +) -> F::Output { + pin_mut!(future); + loop { + match select(&mut future, pin!(sleep(interval))).await { + Either::Left((output, _)) => { + break output; + } + Either::Right(_) => f(), } - }; + } } -pub struct CoordinatorWorker { +struct EpochCommitRequests { + epoch: u64, + metadatas: Vec, + handle_ids: HashSet, + bitmap: Bitmap, +} + +impl EpochCommitRequests { + fn new(epoch: u64) -> Self { + Self { + epoch, + metadatas: vec![], + handle_ids: Default::default(), + bitmap: Bitmap::zeros(VirtualNode::COUNT), + } + } + + fn add_new_request( + &mut self, + handle_id: usize, + metadata: SinkMetadata, + vnode_bitmap: Bitmap, + ) -> anyhow::Result<()> { + self.metadatas.push(metadata); + assert!(self.handle_ids.insert(handle_id)); + let check_bitmap = (&self.bitmap) & &vnode_bitmap; + if check_bitmap.count_ones() > 0 { + return Err(anyhow!( + "duplicate vnode {:?} on epoch {}. 
request vnode: {:?}, prev vnode: {:?}", + check_bitmap.iter_ones().collect_vec(), + self.epoch, + vnode_bitmap, + self.bitmap + )); + } + self.bitmap |= &vnode_bitmap; + Ok(()) + } + + fn can_commit(&self) -> bool { + self.bitmap.count_ones() == VirtualNode::COUNT + } +} + +struct CoordinationHandleManager { param: SinkParam, - request_streams: Vec, - response_senders: Vec, - request_rx: UnboundedReceiver, + writer_handles: HashMap, + next_handle_id: usize, + request_rx: UnboundedReceiver, +} + +impl CoordinationHandleManager { + fn ack_commit( + &mut self, + epoch: u64, + handle_ids: impl IntoIterator, + ) -> anyhow::Result<()> { + for handle_id in handle_ids { + let handle = self.writer_handles.get_mut(&handle_id).ok_or_else(|| { + anyhow!( + "fail to find handle for {} when ack commit on epoch {}", + handle_id, + epoch + ) + })?; + handle.ack_commit(epoch).map_err(|_| { + anyhow!( + "fail to ack commit on epoch {} for handle {}", + epoch, + handle_id + ) + })?; + } + Ok(()) + } + + async fn next_commit_request_inner( + writer_handles: &mut HashMap, + ) -> anyhow::Result<(usize, Bitmap, u64, SinkMetadata)> { + poll_fn(|cx| 'outer: loop { + for (handle_id, handle) in writer_handles.iter_mut() { + if let Poll::Ready(result) = handle.poll_next_commit_request(cx) { + match result { + Ok(Some((epoch, metadata))) => { + return Poll::Ready(Ok(( + *handle_id, + handle.vnode_bitmap().clone(), + epoch, + metadata, + ))); + } + Ok(None) => { + let handle_id = *handle_id; + writer_handles.remove(&handle_id); + continue 'outer; + } + Err(e) => { + return Poll::Ready(Err(e)); + } + } + } + } + return Poll::Pending; + }) + .await + } + + async fn next_commit_request(&mut self) -> anyhow::Result<(usize, Bitmap, u64, SinkMetadata)> { + loop { + select! { + handle = self.request_rx.recv() => { + let mut handle = handle.ok_or_else(|| anyhow!("end of writer request stream"))?; + if handle.param() != &self.param { + warn!(prev_param = ?self.param, new_param = ?handle.param(), "sink param mismatch"); + } + handle.start()?; + let handle_id = self.next_handle_id; + self.next_handle_id += 1; + self.writer_handles.insert(handle_id, handle); + } + result = Self::next_commit_request_inner(&mut self.writer_handles) => { + break result; + } + } + } + } +} + +pub struct CoordinatorWorker { + handle_manager: CoordinationHandleManager, + pending_epochs: BTreeMap, } impl CoordinatorWorker { pub async fn run( - first_writer_request: NewSinkWriterRequest, - request_rx: UnboundedReceiver, + param: SinkParam, + request_rx: UnboundedReceiver, ) { - let sink = match build_sink(first_writer_request.param.clone()) { + let sink = match build_sink(param.clone()) { Ok(sink) => sink, Err(e) => { error!( error = %e.as_report(), "unable to build sink with param {:?}", - first_writer_request.param - ); - send_await_with_err_check!( - first_writer_request.response_tx, - Err(Status::invalid_argument("failed to build sink")) + param ); return; } @@ -81,247 +208,77 @@ impl CoordinatorWorker { error!( error = %e.as_report(), "unable to build coordinator with param {:?}", - first_writer_request.param - ); - send_await_with_err_check!( - first_writer_request.response_tx, - Err(Status::invalid_argument("failed to build coordinator")) + param ); return; } }; - Self::execute_coordinator(first_writer_request, request_rx, coordinator).await + Self::execute_coordinator(param, request_rx, coordinator).await }); } pub async fn execute_coordinator( - first_writer_request: NewSinkWriterRequest, - request_rx: UnboundedReceiver, + param: SinkParam, + 
request_rx: UnboundedReceiver, coordinator: impl SinkCommitCoordinator, ) { let mut worker = CoordinatorWorker { - param: first_writer_request.param, - request_streams: vec![first_writer_request.request_stream], - response_senders: vec![first_writer_request.response_tx], - request_rx, + handle_manager: CoordinationHandleManager { + param, + writer_handles: HashMap::new(), + next_handle_id: 0, + request_rx, + }, + pending_epochs: Default::default(), }; - if let Err(e) = worker - .wait_for_writers(first_writer_request.vnode_bitmap) - .await - { - error!(error = %e.as_report(), "failed to wait for all writers"); - worker - .send_to_all_sink_writers(|| { - Err(Status::cancelled("failed to wait for all writers")) - }) - .await; - } - - worker.start_coordination(coordinator).await; - } - - async fn send_to_all_sink_writers( - &mut self, - new_msg: impl Fn() -> Result, - ) { - for sender in &self.response_senders { - send_await_with_err_check!(sender, new_msg()); - } - } - - async fn next_new_writer(&mut self) -> anyhow::Result { - // TODO: add timeout log - match select( - pin!(self.request_rx.recv()), - pin!(FuturesUnordered::from_iter( - self.request_streams - .iter_mut() - .map(|stream| stream.try_next()), - ) - .next()), - ) - .await - { - Either::Left((Some(req), _)) => Ok(req), - Either::Left((None, _)) => Err(anyhow!("manager request stream reaches the end")), - Either::Right((Some(Ok(Some(request))), _)) => Err(anyhow!( - "get new request from sink writer before initialize: {:?}", - request - )), - Either::Right((Some(Ok(None)), _)) => Err(anyhow!( - "one sink writer stream reaches the end before initialize" - )), - Either::Right((Some(Err(e)), _)) => { - Err(anyhow!(e).context("unable to poll one sink writer stream")) + if let Err(e) = worker.run_coordination(coordinator).await { + for handle in worker.handle_manager.writer_handles.into_values() { + handle.abort(Status::internal(format!( + "failed to run coordination: {:?}", + e.as_report() + ))) } - Either::Right((None, _)) => unreachable!("request_streams must not be empty"), } } - async fn wait_for_writers(&mut self, first_vnode_bitmap: Bitmap) -> anyhow::Result<()> { - let mut remaining_count = VirtualNode::COUNT; - let mut registered_vnode = HashSet::with_capacity(VirtualNode::COUNT); - - for vnode in first_vnode_bitmap.iter_vnodes() { - remaining_count -= 1; - registered_vnode.insert(vnode); - } - - while remaining_count > 0 { - let new_writer_request = self.next_new_writer().await?; - if self.param != new_writer_request.param { - // TODO: may return error. 
- warn!( - "get different param {:?} while current param {:?}", - new_writer_request.param, self.param - ); - } - self.request_streams.push(new_writer_request.request_stream); - self.response_senders.push(new_writer_request.response_tx); - - for vnode in new_writer_request.vnode_bitmap.iter_vnodes() { - if registered_vnode.contains(&vnode) { - return Err(anyhow!( - "get overlapped vnode: {}, current vnode {:?}", - vnode, - registered_vnode - )); - } - registered_vnode.insert(vnode); - remaining_count -= 1; - } - } - - self.send_to_all_sink_writers(|| { - Ok(CoordinateResponse { - msg: Some(coordinate_response::Msg::StartResponse( - StartCoordinationResponse {}, - )), - }) - }) - .await; - Ok(()) - } - - async fn collect_all_metadata(&mut self) -> anyhow::Result<(u64, Vec)> { - let mut epoch = None; - let mut metadata_list = Vec::with_capacity(self.request_streams.len()); - let mut uncollected_futures = FuturesUnordered::from_iter( - self.request_streams - .iter_mut() - .map(|stream| stream.try_next()), - ); - + async fn run_coordination( + &mut self, + mut coordinator: impl SinkCommitCoordinator, + ) -> anyhow::Result<()> { + coordinator.init().await?; loop { - match select( - pin!(self.request_rx.recv()), - pin!(uncollected_futures.next()), - ) - .await + let (handle_id, vnode_bitmap, epoch, metadata) = + self.handle_manager.next_commit_request().await?; + self.pending_epochs + .entry(epoch) + .or_insert_with(|| EpochCommitRequests::new(epoch)) + .add_new_request(handle_id, metadata, vnode_bitmap)?; + if self + .pending_epochs + .first_key_value() + .expect("non-empty") + .1 + .can_commit() { - Either::Left((Some(new_request), _)) => { - warn!("get new writer request while collecting metadata"); - send_await_with_err_check!( - new_request.response_tx, - Err(Status::already_exists( - "coordinator already running, should not get new request" - )) - ); - continue; - } - Either::Left((None, _)) => { - return Err(anyhow!( - "coordinator get notified to stop while collecting metadata" - )); - } - Either::Right((Some(next_result), _)) => match next_result { - Ok(Some(CoordinateRequest { - msg: - Some(coordinate_request::Msg::CommitRequest(CommitRequest { - epoch: request_epoch, - metadata: Some(metadata), - })), - })) => { - match &epoch { - Some(epoch) => { - if *epoch != request_epoch { - warn!( - "current epoch is {} but get request from {}", - epoch, request_epoch - ); - } - } - None => { - epoch = Some(request_epoch); - } - } - metadata_list.push(metadata); - } - Ok(Some(req)) => { - return Err(anyhow!("expect commit request but get {:?}", req)); - } - Ok(None) => { - return Err(anyhow!( - "sink writer input reaches the end while collecting metadata" - )); - } - Err(e) => { - return Err( - anyhow!(e).context("failed to poll one of the writer request streams") - ); - } - }, - Either::Right((None, _)) => { - break; - } - } - } - Ok(( - epoch.expect("should not be empty when have at least one writer"), - metadata_list, - )) - } - - async fn start_coordination(&mut self, mut coordinator: impl SinkCommitCoordinator) { - let result: Result<(), String> = try { - coordinator.init().await.map_err(|e| { - error!(error = %e.as_report(), "failed to initialize coordinator"); - format!("failed to initialize coordinator: {:?}", e.as_report()) - })?; - loop { - let (epoch, metadata_list) = self.collect_all_metadata().await.map_err(|e| { - error!(error = %e.as_report(), "failed to collect all metadata"); - format!("failed to collect all metadata: {:?}", e.as_report()) - })?; + let (epoch, requests) = 
self.pending_epochs.pop_first().expect("non-empty"); // TODO: measure commit time - coordinator - .commit(epoch, metadata_list) - .await - .map_err(|e| { - error!(epoch, error = %e.as_report(), "failed to commit metadata of epoch"); - format!("failed to commit: {:?}", e.as_report()) - })?; - - self.send_to_all_sink_writers(|| { - Ok(CoordinateResponse { - msg: Some(coordinate_response::Msg::CommitResponse(CommitResponse { - epoch, - })), - }) - }) - .await; + let start_time = Instant::now(); + run_future_with_periodic_fn( + coordinator.commit(epoch, requests.metadatas), + Duration::from_secs(5), + || { + warn!( + elapsed = ?start_time.elapsed(), + sink_id = self.handle_manager.param.sink_id.sink_id, + "committing" + ); + }, + ) + .await + .map_err(|e| anyhow!(e))?; + self.handle_manager.ack_commit(epoch, requests.handle_ids)?; } - }; - - if let Err(err_str) = result { - self.send_to_all_sink_writers(|| { - Err(Status::aborted(format!( - "failed to run coordination: {}", - err_str - ))) - }) - .await; } } } diff --git a/src/meta/src/manager/sink_coordination/handle.rs b/src/meta/src/manager/sink_coordination/handle.rs new file mode 100644 index 0000000000000..60b49cfd623ab --- /dev/null +++ b/src/meta/src/manager/sink_coordination/handle.rs @@ -0,0 +1,139 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
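// A condensed sketch of the commit scheduling in `run_coordination` above: requests are
// grouped per epoch in a BTreeMap, and only the *smallest* pending epoch is ever popped
// and committed, so commits stay in epoch order even when writers report different
// epochs concurrently. Metadata and vnode bitmaps are elided; a simple `needed` counter
// stands in for full vnode coverage.
use std::collections::BTreeMap;

struct PendingEpoch {
    received: usize,
    needed: usize,
}

fn on_request(pending: &mut BTreeMap<u64, PendingEpoch>, epoch: u64, needed: usize) -> Option<u64> {
    pending
        .entry(epoch)
        .or_insert(PendingEpoch { received: 0, needed })
        .received += 1;
    // Mirrors `pending_epochs.first_key_value()` + `pop_first()`: commit only when the
    // earliest pending epoch is complete.
    let (first_epoch, state) = pending.first_key_value()?;
    if state.received == state.needed {
        let first_epoch = *first_epoch;
        pending.pop_first();
        Some(first_epoch)
    } else {
        None
    }
}

fn main() {
    let mut pending = BTreeMap::new();
    assert_eq!(on_request(&mut pending, 10, 2), None);
    assert_eq!(on_request(&mut pending, 20, 2), None);
    assert_eq!(on_request(&mut pending, 10, 2), Some(10)); // epoch 10 commits first
    assert_eq!(on_request(&mut pending, 20, 2), Some(20)); // then epoch 20
}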
+ +use std::pin::pin; +use std::task::{Context, Poll}; + +use anyhow::anyhow; +use futures::{Future, TryStreamExt}; +use risingwave_common::bitmap::Bitmap; +use risingwave_connector::sink::SinkParam; +use risingwave_pb::connector_service::coordinate_response::{ + CommitResponse, StartCoordinationResponse, +}; +use risingwave_pb::connector_service::{ + coordinate_request, coordinate_response, CoordinateResponse, SinkMetadata, +}; +use tonic::Status; + +use crate::manager::sink_coordination::{SinkCoordinatorResponseSender, SinkWriterRequestStream}; + +pub(super) struct SinkWriterCoordinationHandle { + request_stream: SinkWriterRequestStream, + response_tx: SinkCoordinatorResponseSender, + param: SinkParam, + vnode_bitmap: Bitmap, + prev_epoch: Option, +} + +impl SinkWriterCoordinationHandle { + pub(super) fn new( + request_stream: SinkWriterRequestStream, + response_tx: SinkCoordinatorResponseSender, + param: SinkParam, + vnode_bitmap: Bitmap, + ) -> Self { + Self { + request_stream, + response_tx, + param, + vnode_bitmap, + prev_epoch: None, + } + } + + pub(super) fn param(&self) -> &SinkParam { + &self.param + } + + pub(super) fn vnode_bitmap(&self) -> &Bitmap { + &self.vnode_bitmap + } + + pub(super) fn start(&mut self) -> anyhow::Result<()> { + self.response_tx + .send(Ok(CoordinateResponse { + msg: Some(coordinate_response::Msg::StartResponse( + StartCoordinationResponse {}, + )), + })) + .map_err(|_| anyhow!("fail to send start response")) + } + + pub(super) fn abort(self, status: Status) { + let _ = self.response_tx.send(Err(status)); + } + + pub(super) fn ack_commit(&mut self, epoch: u64) -> anyhow::Result<()> { + self.response_tx + .send(Ok(CoordinateResponse { + msg: Some(coordinate_response::Msg::CommitResponse(CommitResponse { + epoch, + })), + })) + .map_err(|_| anyhow!("fail to send commit response of epoch {}", epoch)) + } + + pub(super) fn poll_next_commit_request( + &mut self, + cx: &mut Context<'_>, + ) -> Poll>> { + let future = self.next_commit_request(); + let future = pin!(future); + future.poll(cx) + } + + async fn next_commit_request(&mut self) -> anyhow::Result> { + loop { + let request = self + .request_stream + .try_next() + .await? + .ok_or_else(|| anyhow!("end of request stream"))?; + match request.msg.ok_or_else(|| anyhow!("None msg in request"))? { + coordinate_request::Msg::StartRequest(_) => { + return Err(anyhow!("should have started")); + } + coordinate_request::Msg::CommitRequest(request) => { + if let Some(prev_epoch) = self.prev_epoch { + if request.epoch < prev_epoch { + return Err(anyhow!( + "invalid commit epoch {}, prev_epoch {}", + request.epoch, + prev_epoch + )); + } + } + let Some(metadata) = request.metadata else { + return Err(anyhow!("empty commit metadata")); + }; + self.prev_epoch = Some(request.epoch); + return Ok(Some((request.epoch, metadata))); + } + coordinate_request::Msg::UpdateVnodeRequest(request) => { + let bitmap = Bitmap::from( + &request + .vnode_bitmap + .ok_or_else(|| anyhow!("empty vnode bitmap"))?, + ); + self.vnode_bitmap = bitmap; + continue; + } + coordinate_request::Msg::Stop(_) => { + return Ok(None); + } + } + } + } +} diff --git a/src/meta/src/manager/sink_coordination/manager.rs b/src/meta/src/manager/sink_coordination/manager.rs index fd2b986be28e7..2fe2e8bfb3b8c 100644 --- a/src/meta/src/manager/sink_coordination/manager.rs +++ b/src/meta/src/manager/sink_coordination/manager.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
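// `poll_next_commit_request` above adapts an `async fn` to a hand-rolled `poll_*` method
// by constructing the future on each call, pinning it on the stack with `pin!`, and
// polling it once. A stripped-down sketch of that adapter pattern; the pattern assumes
// the inner future can be dropped and rebuilt between polls without losing data (here it
// is trivially ready). The demo uses the `futures` crate's no-op waker.
use std::future::Future;
use std::pin::pin;
use std::task::{Context, Poll};

struct Handle {
    next_value: u64,
}

impl Handle {
    async fn next_request(&mut self) -> u64 {
        // In the real handle this awaits the writer's request stream.
        self.next_value += 1;
        self.next_value
    }

    fn poll_next_request(&mut self, cx: &mut Context<'_>) -> Poll<u64> {
        let future = self.next_request();
        let future = pin!(future);
        future.poll(cx)
    }
}

fn main() {
    let waker = futures::task::noop_waker();
    let mut cx = Context::from_waker(&waker);
    let mut handle = Handle { next_value: 0 };
    assert_eq!(handle.poll_next_request(&mut cx), Poll::Ready(1));
    assert_eq!(handle.poll_next_request(&mut cx), Poll::Ready(2));
}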
-use std::collections::hash_map::Entry; use std::collections::HashMap; use std::pin::pin; @@ -30,12 +29,13 @@ use tokio::sync::mpsc; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::sync::oneshot::{channel, Receiver, Sender}; use tokio::task::{JoinError, JoinHandle}; -use tokio_stream::wrappers::ReceiverStream; +use tokio_stream::wrappers::UnboundedReceiverStream; use tonic::Status; use tracing::{debug, error, info, warn}; use crate::manager::sink_coordination::coordinator_worker::CoordinatorWorker; -use crate::manager::sink_coordination::{NewSinkWriterRequest, SinkWriterRequestStream}; +use crate::manager::sink_coordination::handle::SinkWriterCoordinationHandle; +use crate::manager::sink_coordination::SinkWriterRequestStream; macro_rules! send_with_err_check { ($tx:expr, $msg:expr) => { @@ -56,7 +56,7 @@ macro_rules! send_await_with_err_check { const BOUNDED_CHANNEL_SIZE: usize = 16; enum ManagerRequest { - NewSinkWriter(NewSinkWriterRequest), + NewSinkWriter(SinkWriterCoordinationHandle), StopCoordinator { finish_notifier: Sender<()>, /// sink id to stop. When `None`, stop all sink coordinator @@ -71,11 +71,8 @@ pub struct SinkCoordinatorManager { impl SinkCoordinatorManager { pub fn start_worker() -> (Self, (JoinHandle<()>, Sender<()>)) { - Self::start_worker_with_spawn_worker(|writer_request, manager_request_stream| { - tokio::spawn(CoordinatorWorker::run( - writer_request, - manager_request_stream, - )) + Self::start_worker_with_spawn_worker(|param, manager_request_stream| { + tokio::spawn(CoordinatorWorker::run(param, manager_request_stream)) }) } @@ -111,14 +108,11 @@ impl SinkCoordinatorManager { ))); } }; - let (response_tx, response_rx) = mpsc::channel(BOUNDED_CHANNEL_SIZE); + let (response_tx, response_rx) = mpsc::unbounded_channel(); self.request_tx - .send(ManagerRequest::NewSinkWriter(NewSinkWriterRequest { - request_stream, - response_tx, - param, - vnode_bitmap, - })) + .send(ManagerRequest::NewSinkWriter( + SinkWriterCoordinationHandle::new(request_stream, response_tx, param, vnode_bitmap), + )) .await .map_err(|_| { Status::unavailable( @@ -126,7 +120,7 @@ impl SinkCoordinatorManager { ) })?; - Ok(ReceiverStream::new(response_rx)) + Ok(UnboundedReceiverStream::new(response_rx)) } async fn stop_coordinator(&self, sink_id: Option) { @@ -155,7 +149,7 @@ impl SinkCoordinatorManager { struct CoordinatorWorkerHandle { /// Sender to coordinator worker. 
Drop the sender as a stop signal - request_sender: Option>, + request_sender: Option>, /// Notify when the coordinator worker stops finish_notifiers: Vec>, } @@ -163,7 +157,7 @@ struct CoordinatorWorkerHandle { struct ManagerWorker { request_rx: mpsc::Receiver, // Make it option so that it can be polled with &mut SinkManagerWorker - shutdown_rx: Option>, + shutdown_rx: Receiver<()>, running_coordinator_worker_join_handles: FuturesUnordered)>>, @@ -178,7 +172,7 @@ enum ManagerEvent { }, } -trait SpawnCoordinatorFn = FnMut(NewSinkWriterRequest, UnboundedReceiver) -> JoinHandle<()> +trait SpawnCoordinatorFn = FnMut(SinkParam, UnboundedReceiver) -> JoinHandle<()> + Send + 'static; @@ -186,7 +180,7 @@ impl ManagerWorker { fn new(request_rx: mpsc::Receiver, shutdown_rx: Receiver<()>) -> Self { ManagerWorker { request_rx, - shutdown_rx: Some(shutdown_rx), + shutdown_rx, running_coordinator_worker_join_handles: Default::default(), running_coordinator_worker: Default::default(), } @@ -237,7 +231,6 @@ impl ManagerWorker { } async fn next_event(&mut self) -> Option { - let shutdown_rx = self.shutdown_rx.take().expect("should not be empty"); match select( select( pin!(self.request_rx.recv()), @@ -245,23 +238,20 @@ impl ManagerWorker { self.running_coordinator_worker_join_handles.next() )), ), - shutdown_rx, + &mut self.shutdown_rx, ) .await { - Either::Left((either, shutdown_rx)) => { - self.shutdown_rx = Some(shutdown_rx); - match either { - Either::Left((Some(request), _)) => Some(ManagerEvent::NewRequest(request)), - Either::Left((None, _)) => None, - Either::Right(((sink_id, join_result), _)) => { - Some(ManagerEvent::CoordinatorWorkerFinished { - sink_id, - join_result, - }) - } + Either::Left((either, _)) => match either { + Either::Left((Some(request), _)) => Some(ManagerEvent::NewRequest(request)), + Either::Left((None, _)) => None, + Either::Right(((sink_id, join_result), _)) => { + Some(ManagerEvent::CoordinatorWorkerFinished { + sink_id, + join_result, + }) } - } + }, Either::Right(_) => None, } } @@ -309,39 +299,39 @@ impl ManagerWorker { fn handle_new_sink_writer( &mut self, - request: NewSinkWriterRequest, + new_writer: SinkWriterCoordinationHandle, spawn_coordinator_worker: &mut impl SpawnCoordinatorFn, ) { - let param = &request.param; + let param = new_writer.param(); let sink_id = param.sink_id; - // Launch the coordinator worker task if it is the first - match self.running_coordinator_worker.entry(param.sink_id) { - Entry::Occupied(mut entry) => { - if let Some(sender) = entry.get_mut().request_sender.as_mut() { - send_with_err_check!(sender, request); - } else { - warn!( - "handle a new request while the sink coordinator is being stopped: {:?}", - param - ); - drop(request.response_tx); - } - } - Entry::Vacant(entry) => { + let handle = self + .running_coordinator_worker + .entry(param.sink_id) + .or_insert_with(|| { + // Launch the coordinator worker task if it is the first let (request_tx, request_rx) = unbounded_channel(); - let join_handle = spawn_coordinator_worker(request, request_rx); + let join_handle = spawn_coordinator_worker(param.clone(), request_rx); self.running_coordinator_worker_join_handles.push( join_handle .map(move |join_result| (sink_id, join_result)) .boxed(), ); - entry.insert(CoordinatorWorkerHandle { + CoordinatorWorkerHandle { request_sender: Some(request_tx), finish_notifiers: Vec::new(), - }); - } - }; + } + }); + + if let Some(sender) = handle.request_sender.as_mut() { + send_with_err_check!(sender, new_writer); + } else { + warn!( + "handle a new 
request while the sink coordinator is being stopped: {:?}", + param + ); + new_writer.abort(Status::internal("the sink is being stopped")); + } } } @@ -357,7 +347,7 @@ mod tests { use futures::{FutureExt, StreamExt}; use itertools::Itertools; use rand::seq::SliceRandom; - use risingwave_common::bitmap::{Bitmap, BitmapBuilder}; + use risingwave_common::bitmap::BitmapBuilder; use risingwave_common::hash::VirtualNode; use risingwave_connector::sink::catalog::{SinkId, SinkType}; use risingwave_connector::sink::{SinkCommitCoordinator, SinkError, SinkParam}; @@ -367,7 +357,7 @@ mod tests { use tokio_stream::wrappers::ReceiverStream; use crate::manager::sink_coordination::coordinator_worker::CoordinatorWorker; - use crate::manager::sink_coordination::{NewSinkWriterRequest, SinkCoordinatorManager}; + use crate::manager::sink_coordination::SinkCoordinatorManager; struct MockCoordinator, &mut C) -> Result<(), SinkError>> { context: C, @@ -434,16 +424,16 @@ mod tests { let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker_with_spawn_worker({ - let param = param.clone(); + let expected_param = param.clone(); let metadata = metadata.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx| { - let param = param.clone(); + move |param, new_writer_rx| { let metadata = metadata.clone(); + let expected_param = expected_param.clone(); tokio::spawn(async move { // validate the start request - assert_eq!(first_request.param, param); + assert_eq!(param, expected_param); CoordinatorWorker::execute_coordinator( - first_request, + param.clone(), new_writer_rx, MockCoordinator::new(0, |epoch, metadata_list, count: &mut usize| { *count += 1; @@ -497,14 +487,8 @@ mod tests { .unwrap() }; - let mut build_client_future1 = pin!(build_client(vnode1)); - assert!( - poll_fn(|cx| Poll::Ready(build_client_future1.as_mut().poll(cx))) - .await - .is_pending() - ); let (mut client1, mut client2) = - join(build_client_future1, pin!(build_client(vnode2))).await; + join(build_client(vnode1), pin!(build_client(vnode2))).await; { // commit epoch1 @@ -598,16 +582,16 @@ mod tests { let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker_with_spawn_worker({ - let param = param.clone(); + let expected_param = param.clone(); let metadata = metadata.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx| { - let param = param.clone(); + move |param, new_writer_rx| { let metadata = metadata.clone(); + let expected_param = expected_param.clone(); tokio::spawn(async move { // validate the start request - assert_eq!(first_request.param, param); + assert_eq!(param, expected_param); CoordinatorWorker::execute_coordinator( - first_request, + param.clone(), new_writer_rx, MockCoordinator::new(0, |epoch, metadata_list, count: &mut usize| { *count += 1; @@ -686,46 +670,6 @@ mod tests { .unwrap(); } - #[tokio::test] - async fn test_drop_sink_while_init() { - let sink_id = SinkId::from(1); - let param = SinkParam { - sink_id, - sink_name: "test".into(), - properties: Default::default(), - columns: vec![], - downstream_pk: vec![], - sink_type: SinkType::AppendOnly, - format_desc: None, - db_name: "test".into(), - sink_from_name: "test".into(), - }; - - let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker(); - - let mut build_client_future1 = pin!(CoordinatorStreamHandle::new_with_init_stream( - param.to_proto(), - Bitmap::zeros(VirtualNode::COUNT), - |rx| async { - Ok(tonic::Response::new( - manager - 
.handle_new_request(ReceiverStream::new(rx).map(Ok).boxed()) - .await - .unwrap() - .boxed(), - )) - }, - )); - assert!( - poll_fn(|cx| Poll::Ready(build_client_future1.as_mut().poll(cx))) - .await - .is_pending() - ); - manager.stop_sink_coordinator(sink_id).await; - - assert!(build_client_future1.await.is_err()); - } - #[tokio::test] async fn test_partial_commit() { let param = SinkParam { @@ -757,14 +701,14 @@ mod tests { let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker_with_spawn_worker({ - let param = param.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx| { - let param = param.clone(); + let expected_param = param.clone(); + move |param, new_writer_rx| { + let expected_param = expected_param.clone(); tokio::spawn(async move { // validate the start request - assert_eq!(first_request.param, param); + assert_eq!(param, expected_param); CoordinatorWorker::execute_coordinator( - first_request, + param, new_writer_rx, MockCoordinator::new((), |_, _, _| unreachable!()), ) @@ -836,14 +780,14 @@ mod tests { let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker_with_spawn_worker({ - let param = param.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx| { - let param = param.clone(); + let expected_param = param.clone(); + move |param, new_writer_rx| { + let expected_param = expected_param.clone(); tokio::spawn(async move { // validate the start request - assert_eq!(first_request.param, param); + assert_eq!(param, expected_param); CoordinatorWorker::execute_coordinator( - first_request, + param, new_writer_rx, MockCoordinator::new((), |_, _, _| { Err(SinkError::Coordinator(anyhow!("failed to commit"))) @@ -897,4 +841,269 @@ mod tests { assert!(result1.is_err()); assert!(result2.is_err()); } + + #[tokio::test] + async fn test_update_vnode_bitmap() { + let param = SinkParam { + sink_id: SinkId::from(1), + sink_name: "test".into(), + properties: Default::default(), + columns: vec![], + downstream_pk: vec![], + sink_type: SinkType::AppendOnly, + format_desc: None, + db_name: "test".into(), + sink_from_name: "test".into(), + }; + + let epoch1 = 233; + let epoch2 = 234; + let epoch3 = 235; + let epoch4 = 236; + + let mut all_vnode = (0..VirtualNode::COUNT).collect_vec(); + all_vnode.shuffle(&mut rand::thread_rng()); + let (first, second) = all_vnode.split_at(VirtualNode::COUNT / 2); + let build_bitmap = |indexes: &[usize]| { + let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT); + for i in indexes { + builder.set(*i, true); + } + builder.finish() + }; + let vnode1 = build_bitmap(first); + let vnode2 = build_bitmap(second); + + let metadata = [ + [vec![1u8, 2u8], vec![3u8, 4u8]], + [vec![5u8, 6u8], vec![7u8, 8u8]], + ]; + + let metadata_scale_out = [vec![9u8, 10u8], vec![11u8, 12u8], vec![13u8, 14u8]]; + let metadata_scale_in = [vec![13u8, 14u8], vec![15u8, 16u8]]; + + let (manager, (_join_handle, _stop_tx)) = + SinkCoordinatorManager::start_worker_with_spawn_worker({ + let expected_param = param.clone(); + let metadata = metadata.clone(); + let metadata_scale_out = metadata_scale_out.clone(); + let metadata_scale_in = metadata_scale_in.clone(); + move |param, new_writer_rx| { + let metadata = metadata.clone(); + let metadata_scale_out = metadata_scale_out.clone(); + let metadata_scale_in = metadata_scale_in.clone(); + let expected_param = expected_param.clone(); + tokio::spawn(async move { + // validate the start request + assert_eq!(param, expected_param); + CoordinatorWorker::execute_coordinator( + 
param.clone(), + new_writer_rx, + MockCoordinator::new(0, |epoch, metadata_list, count: &mut usize| { + *count += 1; + let mut metadata_list = metadata_list + .into_iter() + .map(|metadata| match metadata { + SinkMetadata { + metadata: + Some(Metadata::Serialized(SerializedMetadata { + metadata, + })), + } => metadata, + _ => unreachable!(), + }) + .collect_vec(); + metadata_list.sort(); + let (expected_epoch, expected_metadata_list) = match *count { + 1 => (epoch1, metadata[0].as_slice()), + 2 => (epoch2, metadata[1].as_slice()), + 3 => (epoch3, metadata_scale_out.as_slice()), + 4 => (epoch4, metadata_scale_in.as_slice()), + _ => unreachable!(), + }; + assert_eq!(expected_epoch, epoch); + assert_eq!(expected_metadata_list, &metadata_list); + Ok(()) + }), + ) + .await; + }) + } + }); + + let build_client = |vnode| async { + CoordinatorStreamHandle::new_with_init_stream(param.to_proto(), vnode, |rx| async { + Ok(tonic::Response::new( + manager + .handle_new_request(ReceiverStream::new(rx).map(Ok).boxed()) + .await + .unwrap() + .boxed(), + )) + }) + .await + .unwrap() + }; + + let (mut client1, mut client2) = + join(build_client(vnode1), pin!(build_client(vnode2))).await; + + { + // commit epoch1 + let mut commit_future = pin!(client2 + .commit( + epoch1, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata[0][1].clone(), + })), + }, + ) + .map(|result| result.unwrap())); + assert!(poll_fn(|cx| Poll::Ready(commit_future.as_mut().poll(cx))) + .await + .is_pending()); + join( + commit_future, + client1 + .commit( + epoch1, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata[0][0].clone(), + })), + }, + ) + .map(|result| result.unwrap()), + ) + .await; + } + + let (vnode1, vnode2, vnode3) = { + let (first, second) = all_vnode.split_at(VirtualNode::COUNT / 3); + let (second, third) = second.split_at(VirtualNode::COUNT / 3); + ( + build_bitmap(first), + build_bitmap(second), + build_bitmap(third), + ) + }; + + let mut client3 = build_client(vnode3).await; + { + let mut commit_future3 = pin!(client3 + .commit( + epoch3, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata_scale_out[2].clone(), + })), + }, + ) + .map(|result| result.unwrap())); + assert!(poll_fn(|cx| Poll::Ready(commit_future3.as_mut().poll(cx))) + .await + .is_pending()); + + { + // commit epoch2 + let mut commit_future = pin!(client1 + .commit( + epoch2, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata[1][0].clone(), + })), + }, + ) + .map(|result| result.unwrap())); + assert!(poll_fn(|cx| Poll::Ready(commit_future.as_mut().poll(cx))) + .await + .is_pending()); + join( + commit_future, + client2 + .commit( + epoch2, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata[1][1].clone(), + })), + }, + ) + .map(|result| result.unwrap()), + ) + .await; + } + + client1.update_vnode_bitmap(&vnode1).await.unwrap(); + client2.update_vnode_bitmap(&vnode2).await.unwrap(); + let mut commit_future1 = pin!(client1 + .commit( + epoch3, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata_scale_out[0].clone(), + })), + }, + ) + .map(|result| result.unwrap())); + assert!(poll_fn(|cx| Poll::Ready(commit_future1.as_mut().poll(cx))) + .await + .is_pending()); + assert!(poll_fn(|cx| Poll::Ready(commit_future3.as_mut().poll(cx))) + .await + .is_pending()); + client2 + .commit( + epoch3, + 
SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata_scale_out[1].clone(), + })), + }, + ) + .map(|result| result.unwrap()) + .await; + } + + let (vnode2, vnode3) = { + let (first, second) = all_vnode.split_at(VirtualNode::COUNT / 3); + (build_bitmap(first), build_bitmap(second)) + }; + + // client1.stop().await.unwrap(); + client2.update_vnode_bitmap(&vnode2).await.unwrap(); + client3.update_vnode_bitmap(&vnode3).await.unwrap(); + + { + let mut commit_future = pin!(client2 + .commit( + epoch4, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata_scale_in[0].clone(), + })), + }, + ) + .map(|result| result.unwrap())); + assert!(poll_fn(|cx| Poll::Ready(commit_future.as_mut().poll(cx))) + .await + .is_pending()); + join( + commit_future, + client3 + .commit( + epoch4, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata_scale_in[1].clone(), + })), + }, + ) + .map(|result| result.unwrap()), + ) + .await; + } + } } diff --git a/src/meta/src/manager/sink_coordination/mod.rs b/src/meta/src/manager/sink_coordination/mod.rs index ab44965891d5f..2f5f4d6ba62b1 100644 --- a/src/meta/src/manager/sink_coordination/mod.rs +++ b/src/meta/src/manager/sink_coordination/mod.rs @@ -13,22 +13,14 @@ // limitations under the License. mod coordinator_worker; +mod handle; mod manager; use futures::stream::BoxStream; pub use manager::SinkCoordinatorManager; -use risingwave_common::bitmap::Bitmap; -use risingwave_connector::sink::SinkParam; use risingwave_pb::connector_service::{CoordinateRequest, CoordinateResponse}; -use tokio::sync::mpsc::Sender; +use tokio::sync::mpsc::UnboundedSender; use tonic::Status; pub type SinkWriterRequestStream = BoxStream<'static, Result>; -pub type SinkCoordinatorResponseSender = Sender>; - -pub struct NewSinkWriterRequest { - pub request_stream: SinkWriterRequestStream, - pub response_tx: SinkCoordinatorResponseSender, - pub param: SinkParam, - pub vnode_bitmap: Bitmap, -} +pub type SinkCoordinatorResponseSender = UnboundedSender>; diff --git a/src/meta/src/model/stream.rs b/src/meta/src/model/stream.rs index 447cf5cf85645..aaff076688785 100644 --- a/src/meta/src/model/stream.rs +++ b/src/meta/src/model/stream.rs @@ -363,7 +363,9 @@ impl TableFragments { return vec![]; } if (fragment.fragment_type_mask - & (FragmentTypeFlag::Values as u32 | FragmentTypeFlag::StreamScan as u32)) + & (FragmentTypeFlag::Values as u32 + | FragmentTypeFlag::StreamScan as u32 + | FragmentTypeFlag::SourceScan as u32)) != 0 { actor_ids.extend(fragment.actors.iter().map(|actor| actor.actor_id)); diff --git a/src/meta/src/rpc/ddl_controller.rs b/src/meta/src/rpc/ddl_controller.rs index feb7a959083bb..4c1988a37d44c 100644 --- a/src/meta/src/rpc/ddl_controller.rs +++ b/src/meta/src/rpc/ddl_controller.rs @@ -368,12 +368,14 @@ impl DdlController { } } + #[tracing::instrument(skip(self), level = "debug")] pub async fn alter_parallelism( &self, table_id: u32, parallelism: PbTableParallelism, mut deferred: bool, ) -> MetaResult<()> { + tracing::info!("alter parallelism"); if self.barrier_manager.check_status_running().is_err() { tracing::info!( "alter parallelism is set to deferred mode because the system is in recovery state" @@ -1612,6 +1614,7 @@ impl DdlController { let parallelism = self.resolve_stream_parallelism(specified_parallelism, &cluster_info)?; + // TODO(var-vnode): use vnode count from config const MAX_PARALLELISM: NonZeroUsize = 
NonZeroUsize::new(VirtualNode::COUNT).unwrap(); let parallelism_limited = parallelism > MAX_PARALLELISM; @@ -1643,7 +1646,7 @@ impl DdlController { // Otherwise, it defaults to FIXED based on deduction. let table_parallelism = match (specified_parallelism, &self.env.opts.default_parallelism) { (None, DefaultParallelism::Full) if parallelism_limited => { - tracing::warn!("Parallelism limited to 256 in ADAPTIVE mode"); + tracing::warn!("Parallelism limited to {MAX_PARALLELISM} in ADAPTIVE mode"); TableParallelism::Adaptive } (None, DefaultParallelism::Full) => TableParallelism::Adaptive, diff --git a/src/meta/src/serving/mod.rs b/src/meta/src/serving/mod.rs index 69e17a978212e..30f1466eae7f7 100644 --- a/src/meta/src/serving/mod.rs +++ b/src/meta/src/serving/mod.rs @@ -16,7 +16,7 @@ use std::collections::HashMap; use std::sync::Arc; use parking_lot::RwLock; -use risingwave_common::hash::WorkerSlotMapping; +use risingwave_common::hash::{VirtualNode, WorkerSlotMapping}; use risingwave_common::vnode_mapping::vnode_placement::place_vnode; use risingwave_pb::common::{WorkerNode, WorkerType}; use risingwave_pb::meta::subscribe_response::{Info, Operation}; @@ -57,7 +57,8 @@ impl ServingVnodeMapping { } else { None }; - place_vnode(old_mapping, workers, max_parallelism) + // TODO(var-vnode): use vnode count from config + place_vnode(old_mapping, workers, max_parallelism, VirtualNode::COUNT) }; match new_mapping { None => { @@ -192,7 +193,16 @@ pub async fn start_serving_vnode_mapping_worker( continue; } let (workers, streaming_parallelisms) = fetch_serving_infos(&metadata_manager).await; - let (upserted, failed) = serving_vnode_mapping.upsert(streaming_parallelisms, &workers); + let filtered_streaming_parallelisms = fragment_ids.iter().filter_map(|frag_id|{ + match streaming_parallelisms.get(frag_id) { + Some(parallelism) => Some((*frag_id, *parallelism)), + None => { + tracing::warn!(fragment_id = *frag_id, "streaming parallelism not found"); + None + } + } + }).collect(); + let (upserted, failed) = serving_vnode_mapping.upsert(filtered_streaming_parallelisms, &workers); if !upserted.is_empty() { tracing::debug!("Update serving vnode mapping for fragments {:?}.", upserted.keys()); notification_manager.notify_frontend_without_version(Operation::Update, Info::ServingWorkerSlotMappings(FragmentWorkerSlotMappings{ mappings: to_fragment_worker_slot_mapping(&upserted) })); diff --git a/src/meta/src/stream/scale.rs b/src/meta/src/stream/scale.rs index d10fa83710d85..08a36ce3f7275 100644 --- a/src/meta/src/stream/scale.rs +++ b/src/meta/src/stream/scale.rs @@ -31,7 +31,7 @@ use risingwave_common::catalog::TableId; use risingwave_common::hash::{ActorMapping, VirtualNode}; use risingwave_common::util::iter_util::ZipEqDebug; use risingwave_meta_model_v2::{actor, fragment, ObjectId, StreamingParallelism}; -use risingwave_pb::common::{Buffer, PbActorLocation, WorkerNode, WorkerType}; +use risingwave_pb::common::{PbActorLocation, WorkerNode, WorkerType}; use risingwave_pb::meta::subscribe_response::{Info, Operation}; use risingwave_pb::meta::table_fragments::actor_status::ActorState; use risingwave_pb::meta::table_fragments::fragment::{ @@ -49,7 +49,7 @@ use tokio::sync::{oneshot, RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::task::JoinHandle; use tokio::time::{Instant, MissedTickBehavior}; -use crate::barrier::{Command, Reschedule, StreamRpcManager}; +use crate::barrier::{Command, Reschedule}; use crate::controller::scale::RescheduleWorkingSet; use crate::manager::{ IdCategory, IdGenManagerImpl, 
LocalNotification, MetaSrvEnv, MetadataManager, @@ -126,7 +126,8 @@ pub struct CustomActorInfo { pub fragment_id: u32, pub dispatcher: Vec, pub upstream_actor_id: Vec, - pub vnode_bitmap: Option, + /// `None` if singleton. + pub vnode_bitmap: Option, } impl From<&PbStreamActor> for CustomActorInfo { @@ -145,7 +146,7 @@ impl From<&PbStreamActor> for CustomActorInfo { fragment_id: *fragment_id, dispatcher: dispatcher.clone(), upstream_actor_id: upstream_actor_id.clone(), - vnode_bitmap: vnode_bitmap.clone(), + vnode_bitmap: vnode_bitmap.as_ref().map(Bitmap::from), } } } @@ -183,17 +184,26 @@ impl CustomFragmentInfo { } } +use educe::Educe; + +// The debug implementation is arbitrary. Just used in debug logs. +#[derive(Educe)] +#[educe(Debug)] pub struct RescheduleContext { /// Meta information for all Actors + #[educe(Debug(ignore))] actor_map: HashMap, /// Status of all Actors, used to find the location of the `Actor` actor_status: BTreeMap, /// Meta information of all `Fragment`, used to find the `Fragment`'s `Actor` + #[educe(Debug(ignore))] fragment_map: HashMap, /// Index of all `Actor` upstreams, specific to `Dispatcher` upstream_dispatchers: HashMap>, - /// Fragments with stream source + /// Fragments with `StreamSource` stream_source_fragment_ids: HashSet, + /// Fragments with `StreamSourceBackfill` + stream_source_backfill_fragment_ids: HashSet, /// Target fragments in `NoShuffle` relation no_shuffle_target_fragment_ids: HashSet, /// Source fragments in `NoShuffle` relation @@ -252,6 +262,13 @@ pub fn rebalance_actor_vnode( let target_actor_count = actors.len() - actors_to_remove.len() + actors_to_create.len(); assert!(target_actor_count > 0); + // `vnode_bitmap` must be set on distributed fragments. + let vnode_count = actors[0] + .vnode_bitmap + .as_ref() + .expect("vnode bitmap unset") + .len(); + // represents the balance of each actor, used to sort later #[derive(Debug)] struct Balance { @@ -259,7 +276,7 @@ pub fn rebalance_actor_vnode( balance: i32, builder: BitmapBuilder, } - let (expected, mut remain) = VirtualNode::COUNT.div_rem(&target_actor_count); + let (expected, mut remain) = vnode_count.div_rem(&target_actor_count); tracing::debug!( "expected {}, remain {}, prev actors {}, target actors {}", @@ -271,11 +288,11 @@ pub fn rebalance_actor_vnode( let (mut removed, mut rest): (Vec<_>, Vec<_>) = actors .iter() - .filter_map(|actor| { - actor - .vnode_bitmap - .as_ref() - .map(|buffer| (actor.actor_id as ActorId, Bitmap::from(buffer))) + .map(|actor| { + ( + actor.actor_id as ActorId, + actor.vnode_bitmap.clone().expect("vnode bitmap unset"), + ) }) .partition(|(actor_id, _)| actors_to_remove.contains(actor_id)); @@ -294,7 +311,7 @@ pub fn rebalance_actor_vnode( builder }; - let (prev_expected, _) = VirtualNode::COUNT.div_rem(&actors.len()); + let (prev_expected, _) = vnode_count.div_rem(&actors.len()); let prev_remain = removed .iter() @@ -327,7 +344,7 @@ pub fn rebalance_actor_vnode( .map(|actor_id| Balance { actor_id: *actor_id, balance: -(expected as i32), - builder: BitmapBuilder::zeroed(VirtualNode::COUNT), + builder: BitmapBuilder::zeroed(vnode_count), }) .collect_vec(); @@ -389,7 +406,7 @@ pub fn rebalance_actor_vnode( let n = min(abs(src.balance), abs(dst.balance)); let mut moved = 0; - for idx in (0..VirtualNode::COUNT).rev() { + for idx in (0..vnode_count).rev() { if moved >= n { break; } @@ -437,10 +454,10 @@ pub struct ScaleController { pub source_manager: SourceManagerRef, - pub stream_rpc_manager: StreamRpcManager, - pub env: MetaSrvEnv, + /// We will acquire 
lock during DDL to prevent scaling operations on jobs that are in the creating state. + /// e.g., a MV cannot be rescheduled during foreground backfill. pub reschedule_lock: RwLock<()>, } @@ -448,11 +465,9 @@ impl ScaleController { pub fn new( metadata_manager: &MetadataManager, source_manager: SourceManagerRef, - stream_rpc_manager: StreamRpcManager, env: MetaSrvEnv, ) -> Self { Self { - stream_rpc_manager, metadata_manager: metadata_manager.clone(), source_manager, env, @@ -605,7 +620,7 @@ impl ScaleController { .flatten() .map(|id| *id as _) .collect(), - vnode_bitmap: vnode_bitmap.map(|bitmap| bitmap.to_protobuf()), + vnode_bitmap: vnode_bitmap.map(|b| Bitmap::from(&b.to_protobuf())), }; actor_map.insert(actor_id as _, actor_info.clone()); @@ -657,7 +672,7 @@ impl ScaleController { fragment_id: fragment_id as _, dispatcher, upstream_actor_id, - vnode_bitmap, + vnode_bitmap: vnode_bitmap.map(|b| b.to_protobuf()), // todo, we need to fill this part mview_definition: "".to_string(), expr_context: expr_contexts @@ -770,6 +785,7 @@ impl ScaleController { } let mut stream_source_fragment_ids = HashSet::new(); + let mut stream_source_backfill_fragment_ids = HashSet::new(); let mut no_shuffle_reschedule = HashMap::new(); for (fragment_id, WorkerReschedule { worker_actor_diff }) in &*reschedule { let fragment = fragment_map @@ -798,6 +814,7 @@ impl ScaleController { // correspondence, so we need to clone the reschedule plan to the downstream of all // cascading relations. if no_shuffle_source_fragment_ids.contains(fragment_id) { + // This fragment is a NoShuffle's upstream. let mut queue: VecDeque<_> = fragment_dispatcher_map .get(fragment_id) .unwrap() @@ -887,6 +904,17 @@ impl ScaleController { "reschedule plan rewritten with NoShuffle reschedule {:?}", no_shuffle_reschedule ); + + for noshuffle_downstream in no_shuffle_reschedule.keys() { + let fragment = fragment_map.get(noshuffle_downstream).unwrap(); + // SourceScan is always a NoShuffle downstream, rescheduled together with the upstream Source. + if (fragment.get_fragment_type_mask() & FragmentTypeFlag::SourceScan as u32) != 0 { + let stream_node = fragment.actor_template.nodes.as_ref().unwrap(); + if stream_node.find_source_backfill().is_some() { + stream_source_backfill_fragment_ids.insert(fragment.fragment_id); + } + } + } } // Modifications for NoShuffle downstream. @@ -898,6 +926,7 @@ impl ScaleController { fragment_map, upstream_dispatchers, stream_source_fragment_ids, + stream_source_backfill_fragment_ids, no_shuffle_target_fragment_ids, no_shuffle_source_fragment_ids, fragment_dispatcher_map, @@ -924,9 +953,11 @@ impl ScaleController { HashMap, HashMap>, )> { + tracing::debug!("build_reschedule_context, reschedules: {:#?}", reschedules); let ctx = self .build_reschedule_context(&mut reschedules, options, table_parallelisms) .await?; + tracing::debug!("reschedule context: {:#?}", ctx); let reschedules = reschedules; // Here, the plan for both upstream and downstream of the NO_SHUFFLE Fragment should already have been populated. @@ -1264,9 +1295,9 @@ impl ScaleController { } } - // For stream source fragments, we need to reallocate the splits. + // For stream source & source backfill fragments, we need to reallocate the splits. 
// Because we are in the Pause state, so it's no problem to reallocate - let mut fragment_stream_source_actor_splits = HashMap::new(); + let mut fragment_actor_splits = HashMap::new(); for fragment_id in reschedules.keys() { let actors_after_reschedule = fragment_actors_after_reschedule.get(fragment_id).unwrap(); @@ -1284,13 +1315,51 @@ impl ScaleController { let actor_splits = self .source_manager - .migrate_splits(*fragment_id, &prev_actor_ids, &curr_actor_ids) + .migrate_splits_for_source_actors( + *fragment_id, + &prev_actor_ids, + &curr_actor_ids, + ) .await?; - fragment_stream_source_actor_splits.insert(*fragment_id, actor_splits); + tracing::debug!( + "source actor splits: {:?}, fragment_id: {}", + actor_splits, + fragment_id + ); + fragment_actor_splits.insert(*fragment_id, actor_splits); + } + } + // We use 2 iterations to make sure source actors are migrated first, and then align backfill actors + if !ctx.stream_source_backfill_fragment_ids.is_empty() { + for fragment_id in reschedules.keys() { + let actors_after_reschedule = + fragment_actors_after_reschedule.get(fragment_id).unwrap(); + + if ctx + .stream_source_backfill_fragment_ids + .contains(fragment_id) + { + let fragment = ctx.fragment_map.get(fragment_id).unwrap(); + + let curr_actor_ids = actors_after_reschedule.keys().cloned().collect_vec(); + + let actor_splits = self.source_manager.migrate_splits_for_backfill_actors( + *fragment_id, + &fragment.upstream_fragment_ids, + &curr_actor_ids, + &fragment_actor_splits, + &no_shuffle_upstream_actor_map, + )?; + tracing::debug!( + "source backfill actor splits: {:?}, fragment_id: {}", + actor_splits, + fragment_id + ); + fragment_actor_splits.insert(*fragment_id, actor_splits); + } } } - // TODO: support migrate splits for SourceBackfill // Generate fragment reschedule plan let mut reschedule_fragment: HashMap = @@ -1409,9 +1478,7 @@ impl ScaleController { if let Some(actor) = ctx.actor_map.get(actor_id) { let bitmap = vnode_bitmap_updates.get(actor_id).unwrap(); - if let Some(buffer) = actor.vnode_bitmap.as_ref() { - let prev_bitmap = Bitmap::from(buffer); - + if let Some(prev_bitmap) = actor.vnode_bitmap.as_ref() { if prev_bitmap.eq(bitmap) { vnode_bitmap_updates.remove(actor_id); } @@ -1428,7 +1495,7 @@ impl ScaleController { let upstream_fragment_dispatcher_ids = upstream_fragment_dispatcher_set.into_iter().collect_vec(); - let actor_splits = fragment_stream_source_actor_splits + let actor_splits = fragment_actor_splits .get(&fragment_id) .cloned() .unwrap_or_default(); @@ -1479,6 +1546,8 @@ impl ScaleController { .pre_apply_reschedules(fragment_created_actors) .await; + tracing::debug!("analyze_reschedule_plan result: {:#?}", reschedule_fragment); + Ok((reschedule_fragment, applied_reschedules)) } @@ -1813,6 +1882,9 @@ impl ScaleController { &self, policy: TableResizePolicy, ) -> MetaResult> { + // TODO(var-vnode): use vnode count from config + let max_parallelism = VirtualNode::COUNT; + let TableResizePolicy { worker_ids, table_parallelisms, @@ -1867,12 +1939,12 @@ impl ScaleController { actor_location: &mut HashMap, table_fragment_id_map: &mut HashMap>, fragment_actor_id_map: &mut HashMap>, - table_fragments: &BTreeMap, + all_table_fragments: &BTreeMap, ) -> MetaResult<()> { // This is only for assertion purposes and will be removed once the dispatcher_id is guaranteed to always correspond to the downstream fragment_id, // such as through the foreign key constraints in the SQL backend. 
let mut actor_fragment_id_map_for_check = HashMap::new(); - for table_fragments in table_fragments.values() { + for table_fragments in all_table_fragments.values() { for (fragment_id, fragment) in &table_fragments.fragments { for actor in &fragment.actors { let prev = @@ -1883,7 +1955,7 @@ impl ScaleController { } } - for (table_id, table_fragments) in table_fragments { + for (table_id, table_fragments) in all_table_fragments { for (fragment_id, fragment) in &table_fragments.fragments { for actor in &fragment.actors { fragment_actor_id_map @@ -1911,8 +1983,15 @@ impl ScaleController { dispatcher.dispatcher_id as FragmentId ); } else { + tracing::error!( + "downstream actor id {} from actor {} (fragment {}) not found in actor_fragment_id_map_for_check: {actor_fragment_id_map_for_check:?}\n\ndispatchers: {:#?}", + downstream_actor_id, + actor.actor_id, + actor.fragment_id, + actor.dispatcher + ); bail!( - "downstream actor id {} from actor {} not found in fragment_actor_id_map", + "downstream actor id {} from actor {} not found", downstream_actor_id, actor.actor_id, ); @@ -2029,6 +2108,17 @@ impl ScaleController { .await?; } } + tracing::debug!( + ?worker_ids, + ?table_parallelisms, + ?no_shuffle_source_fragment_ids, + ?no_shuffle_target_fragment_ids, + ?fragment_distribution_map, + ?actor_location, + ?table_fragment_id_map, + ?fragment_actor_id_map, + "generate_table_resize_plan, after build_index" + ); let mut target_plan = HashMap::new(); @@ -2096,12 +2186,12 @@ impl ScaleController { } FragmentDistributionType::Hash => match parallelism { TableParallelism::Adaptive => { - if all_available_slots > VirtualNode::COUNT { - tracing::warn!("available parallelism for table {table_id} is larger than VirtualNode::COUNT, force limit to VirtualNode::COUNT"); - // force limit to VirtualNode::COUNT + if all_available_slots > max_parallelism { + tracing::warn!("available parallelism for table {table_id} is larger than max parallelism, force limit to {max_parallelism}"); + // force limit to `max_parallelism` let target_worker_slots = schedule_units_for_slots( &schedulable_worker_slots, - VirtualNode::COUNT, + max_parallelism, table_id, )?; @@ -2123,10 +2213,10 @@ impl ScaleController { } } TableParallelism::Fixed(mut n) => { - if n > VirtualNode::COUNT { + if n > max_parallelism { // This should be unreachable, but we still intercept it to prevent accidental modifications. - tracing::warn!("parallelism {n} for table {table_id} is larger than VirtualNode::COUNT, force limit to VirtualNode::COUNT"); - n = VirtualNode::COUNT + tracing::warn!("specified parallelism {n} for table {table_id} is larger than max parallelism, force limit to {max_parallelism}"); + n = max_parallelism } let target_worker_slots = @@ -2149,7 +2239,10 @@ impl ScaleController { } target_plan.retain(|_, plan| !plan.worker_actor_diff.is_empty()); - + tracing::debug!( + ?target_plan, + "generate_table_resize_plan finished target_plan" + ); Ok(target_plan) } @@ -2380,6 +2473,7 @@ impl ScaleController { /// At present, for table level scaling, we use the strategy `TableResizePolicy`. /// Currently, this is used as an internal interface, so it won’t be included in Protobuf. 
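Both the ADAPTIVE and FIXED branches above now clamp against `max_parallelism` (still `VirtualNode::COUNT` for the moment, per the `var-vnode` TODO) instead of a hard-coded 256. A simplified sketch of the clamping rule follows; the names are illustrative, and the real code additionally spreads the resulting slot count across workers with `schedule_units_for_slots`:

```rust
// requested = None models ADAPTIVE, Some(n) models FIXED(n).
fn capped_parallelism(requested: Option<usize>, available_slots: usize, max_parallelism: usize) -> usize {
    match requested {
        // ADAPTIVE: use everything available, but never exceed the vnode count.
        None => available_slots.min(max_parallelism),
        // FIXED(n): clamp an oversized request instead of rejecting it.
        Some(n) => n.min(max_parallelism),
    }
}

fn main() {
    assert_eq!(capped_parallelism(None, 512, 256), 256);
    assert_eq!(capped_parallelism(Some(42), 512, 256), 42);
    assert_eq!(capped_parallelism(Some(400), 512, 256), 256);
}
```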
+#[derive(Debug)] pub struct TableResizePolicy { pub(crate) worker_ids: BTreeSet, pub(crate) table_parallelisms: HashMap, diff --git a/src/meta/src/stream/source_manager.rs b/src/meta/src/stream/source_manager.rs index a383bfee8e46a..ae5ca2a610b9c 100644 --- a/src/meta/src/stream/source_manager.rs +++ b/src/meta/src/stream/source_manager.rs @@ -188,10 +188,9 @@ impl ConnectorSourceWorker

{ let source_is_up = |res: i64| { self.source_is_up.set(res); }; - let splits = self.enumerator.list_splits().await.map_err(|e| { + let splits = self.enumerator.list_splits().await.inspect_err(|_| { source_is_up(0); self.fail_cnt += 1; - e })?; source_is_up(1); self.fail_cnt = 0; @@ -231,7 +230,8 @@ pub struct SourceManagerCore { /// `source_id` -> `(fragment_id, upstream_fragment_id)` backfill_fragments: HashMap>, - /// Splits assigned per actor + /// Splits assigned per actor, + /// incl. both `Source` and `SourceBackfill`. actor_splits: HashMap>, } @@ -468,13 +468,13 @@ impl Default for SplitDiffOptions { } /// Reassigns splits if there are new splits or dropped splits, -/// i.e., `actor_splits` and `discovered_splits` differ. +/// i.e., `actor_splits` and `discovered_splits` differ, or actors are rescheduled. /// /// The existing splits will remain unmoved in their currently assigned actor. /// /// If an actor has an upstream actor, it should be a backfill executor, -/// and its splits should be aligned with the upstream actor. `reassign_splits` should not be used in this case. -/// Use `align_backfill_splits` instead. +/// and its splits should be aligned with the upstream actor. **`reassign_splits` should not be used in this case. +/// Use `align_backfill_splits` instead.** /// /// - `fragment_id`: just for logging /// @@ -790,11 +790,10 @@ impl SourceManager { /// Migrates splits from previous actors to the new actors for a rescheduled fragment. /// - /// Very occasionally split removal may happen - /// during scaling, in which case we need to use the old splits for reallocation instead of the - /// latest splits (which may be missing), so that we can resolve the split removal in the next - /// command. - pub async fn migrate_splits( + /// Very occasionally split removal may happen during scaling, in which case we need to + /// use the old splits for reallocation instead of the latest splits (which may be missing), + /// so that we can resolve the split removal in the next command. + pub async fn migrate_splits_for_source_actors( &self, fragment_id: FragmentId, prev_actor_ids: &[ActorId], @@ -817,7 +816,7 @@ impl SourceManager { fragment_id, empty_actor_splits, &prev_splits, - // pre-allocate splits is the first time getting splits and it does not have scale in scene + // pre-allocate splits is the first time getting splits and it does not have scale-in scene SplitDiffOptions::default(), ) .unwrap_or_default(); @@ -825,6 +824,43 @@ impl SourceManager { Ok(diff) } + /// Migrates splits from previous actors to the new actors for a rescheduled fragment. 
+ pub fn migrate_splits_for_backfill_actors( + &self, + fragment_id: FragmentId, + upstream_fragment_ids: &Vec, + curr_actor_ids: &[ActorId], + fragment_actor_splits: &HashMap>>, + no_shuffle_upstream_actor_map: &HashMap>, + ) -> MetaResult>> { + // align splits for backfill fragments with its upstream source fragment + debug_assert!(upstream_fragment_ids.len() == 1); + let upstream_fragment_id = upstream_fragment_ids[0]; + let actors = no_shuffle_upstream_actor_map + .iter() + .filter(|(id, _)| curr_actor_ids.contains(id)) + .map(|(id, upstream_fragment_actors)| { + debug_assert!(upstream_fragment_actors.len() == 1); + ( + *id, + vec![*upstream_fragment_actors.get(&upstream_fragment_id).unwrap()], + ) + }); + let upstream_assignment = fragment_actor_splits.get(&upstream_fragment_id).unwrap(); + tracing::info!( + fragment_id, + upstream_fragment_id, + ?upstream_assignment, + "migrate_splits_for_backfill_actors" + ); + Ok(align_backfill_splits( + actors, + upstream_assignment, + fragment_id, + upstream_fragment_id, + )?) + } + /// Allocates splits to actors for a newly created source executor. pub async fn allocate_splits(&self, table_id: &TableId) -> MetaResult { let core = self.core.lock().await; diff --git a/src/meta/src/stream/stream_graph/schedule.rs b/src/meta/src/stream/stream_graph/schedule.rs index 0f9e473c26486..d054beb0772b0 100644 --- a/src/meta/src/stream/stream_graph/schedule.rs +++ b/src/meta/src/stream/stream_graph/schedule.rs @@ -25,7 +25,7 @@ use either::Either; use enum_as_inner::EnumAsInner; use itertools::Itertools; use risingwave_common::bitmap::Bitmap; -use risingwave_common::hash::{ActorMapping, WorkerSlotId, WorkerSlotMapping}; +use risingwave_common::hash::{ActorMapping, VirtualNode, WorkerSlotId, WorkerSlotMapping}; use risingwave_common::{bail, hash}; use risingwave_pb::common::{ActorInfo, WorkerNode}; use risingwave_pb::meta::table_fragments::fragment::{ @@ -235,7 +235,9 @@ impl Scheduler { assert_eq!(scheduled_worker_slots.len(), parallelism); // Build the default hash mapping uniformly. 
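The backfill split migration above does no split discovery of its own: each backfill actor has exactly one no-shuffle upstream source actor and simply inherits whatever splits that upstream actor was assigned in the first pass. A simplified, self-contained sketch of that alignment (plain types and a hypothetical name, not the in-tree `align_backfill_splits`):

```rust
use std::collections::HashMap;

type ActorId = u32;
type SplitId = String;

// Every backfill actor copies the split assignment of its single no-shuffle
// upstream source actor. Missing upstream entries are treated as empty here
// purely for brevity.
fn align_backfill_with_upstream(
    backfill_to_upstream: &HashMap<ActorId, ActorId>,
    upstream_assignment: &HashMap<ActorId, Vec<SplitId>>,
) -> HashMap<ActorId, Vec<SplitId>> {
    backfill_to_upstream
        .iter()
        .map(|(backfill_actor, upstream_actor)| {
            let splits = upstream_assignment
                .get(upstream_actor)
                .cloned()
                .unwrap_or_default();
            (*backfill_actor, splits)
        })
        .collect()
}
```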
- let default_hash_mapping = WorkerSlotMapping::build_from_ids(&scheduled_worker_slots); + // TODO(var-vnode): use vnode count from config + let default_hash_mapping = + WorkerSlotMapping::build_from_ids(&scheduled_worker_slots, VirtualNode::COUNT); let single_scheduled = schedule_units_for_slots(&slots, 1, streaming_job_id)?; let default_single_worker_id = single_scheduled.keys().exactly_one().cloned().unwrap(); diff --git a/src/meta/src/stream/stream_manager.rs b/src/meta/src/stream/stream_manager.rs index a8e8bc47752a5..5dc174106197c 100644 --- a/src/meta/src/stream/stream_manager.rs +++ b/src/meta/src/stream/stream_manager.rs @@ -31,7 +31,7 @@ use tracing::Instrument; use super::{Locations, RescheduleOptions, ScaleControllerRef, TableResizePolicy}; use crate::barrier::{ BarrierScheduler, Command, CreateStreamingJobCommandInfo, CreateStreamingJobType, - ReplaceTablePlan, SnapshotBackfillInfo, StreamRpcManager, + ReplaceTablePlan, SnapshotBackfillInfo, }; use crate::manager::{DdlType, MetaSrvEnv, MetadataManager, NotificationVersion, StreamingJob}; use crate::model::{ActorId, FragmentId, MetadataModel, TableFragments, TableParallelism}; @@ -203,8 +203,6 @@ pub struct GlobalStreamManager { creating_job_info: CreatingStreamingJobInfoRef, pub scale_controller: ScaleControllerRef, - - pub stream_rpc_manager: StreamRpcManager, } impl GlobalStreamManager { @@ -213,7 +211,6 @@ impl GlobalStreamManager { metadata_manager: MetadataManager, barrier_scheduler: BarrierScheduler, source_manager: SourceManagerRef, - stream_rpc_manager: StreamRpcManager, scale_controller: ScaleControllerRef, ) -> MetaResult { Ok(Self { @@ -223,7 +220,6 @@ impl GlobalStreamManager { source_manager, creating_job_info: Arc::new(CreatingStreamingJobInfo::default()), scale_controller, - stream_rpc_manager, }) } @@ -764,8 +760,7 @@ mod tests { use std::time::Duration; use futures::{Stream, TryStreamExt}; - use risingwave_common::hash; - use risingwave_common::hash::{ActorMapping, WorkerSlotId}; + use risingwave_common::hash::{self, ActorMapping, VirtualNode, WorkerSlotId}; use risingwave_common::system_param::reader::SystemParamsRead; use risingwave_pb::common::{HostAddress, WorkerType}; use risingwave_pb::meta::add_worker_node_request::Property; @@ -816,13 +811,6 @@ mod tests { type StreamingControlStreamStream = impl Stream>; - async fn drop_actors( - &self, - _request: Request, - ) -> std::result::Result, Status> { - Ok(Response::new(DropActorsResponse::default())) - } - async fn streaming_control_stream( &self, request: Request>, @@ -989,11 +977,9 @@ mod tests { let (sink_manager, _) = SinkCoordinatorManager::start_worker(); - let stream_rpc_manager = StreamRpcManager::new(env.clone()); let scale_controller = Arc::new(ScaleController::new( &metadata_manager, source_manager.clone(), - stream_rpc_manager.clone(), env.clone(), )); @@ -1005,7 +991,6 @@ mod tests { source_manager.clone(), sink_manager, meta_metrics.clone(), - stream_rpc_manager.clone(), scale_controller.clone(), ) .await; @@ -1015,7 +1000,6 @@ mod tests { metadata_manager, barrier_scheduler.clone(), source_manager.clone(), - stream_rpc_manager, scale_controller.clone(), )?; @@ -1137,12 +1121,14 @@ mod tests { } fn make_mview_stream_actors(table_id: &TableId, count: usize) -> Vec { - let mut actor_bitmaps: HashMap<_, _> = - ActorMapping::new_uniform((0..count).map(|i| i as hash::ActorId)) - .to_bitmaps() - .into_iter() - .map(|(actor_id, bitmap)| (actor_id, bitmap.to_protobuf())) - .collect(); + let mut actor_bitmaps: HashMap<_, _> = ActorMapping::new_uniform( 
+ (0..count).map(|i| i as hash::ActorId), + VirtualNode::COUNT_FOR_TEST, + ) + .to_bitmaps() + .into_iter() + .map(|(actor_id, bitmap)| (actor_id, bitmap.to_protobuf())) + .collect(); (0..count) .map(|i| StreamActor { diff --git a/src/meta/src/stream/test_scale.rs b/src/meta/src/stream/test_scale.rs index 0dc0bced84005..589abb5bdab66 100644 --- a/src/meta/src/stream/test_scale.rs +++ b/src/meta/src/stream/test_scale.rs @@ -26,7 +26,7 @@ mod tests { use crate::stream::CustomActorInfo; fn simulated_parallelism(min: Option, max: Option) -> Vec { - let mut raw = vec![1, 3, 12, 42, VirtualNode::COUNT]; + let mut raw = vec![1, 3, 12, 42, VirtualNode::COUNT_FOR_TEST]; if let Some(min) = min { raw.retain(|n| *n > min); raw.push(min); @@ -39,23 +39,23 @@ mod tests { } fn build_fake_actors(actor_ids: Vec) -> Vec { - let actor_bitmaps = ActorMapping::new_uniform(actor_ids.clone().into_iter()).to_bitmaps(); + let actor_bitmaps = + ActorMapping::new_uniform(actor_ids.clone().into_iter(), VirtualNode::COUNT_FOR_TEST) + .to_bitmaps(); actor_ids .iter() .map(|actor_id| CustomActorInfo { actor_id: *actor_id, - vnode_bitmap: actor_bitmaps - .get(actor_id) - .map(|bitmap| bitmap.to_protobuf()), + vnode_bitmap: actor_bitmaps.get(actor_id).cloned(), ..Default::default() }) .collect() } fn check_affinity_for_scale_in(bitmap: &Bitmap, actor: &CustomActorInfo) { - let prev_bitmap = Bitmap::from(actor.vnode_bitmap.as_ref().unwrap()); + let prev_bitmap = actor.vnode_bitmap.as_ref().unwrap(); - for idx in 0..VirtualNode::COUNT { + for idx in 0..VirtualNode::COUNT_FOR_TEST { if prev_bitmap.is_set(idx) { assert!(bitmap.is_set(idx)); } @@ -63,7 +63,9 @@ mod tests { } fn check_bitmaps(bitmaps: &HashMap) { - let mut target = (0..VirtualNode::COUNT).map(|_| false).collect_vec(); + let mut target = (0..VirtualNode::COUNT_FOR_TEST) + .map(|_| false) + .collect_vec(); for bitmap in bitmaps.values() { for (idx, pos) in target.iter_mut().enumerate() { @@ -89,9 +91,10 @@ mod tests { fn test_build_actor_mapping() { for parallelism in simulated_parallelism(None, None) { let actor_ids = (0..parallelism as ActorId).collect_vec(); - let actor_mapping = ActorMapping::new_uniform(actor_ids.into_iter()); + let actor_mapping = + ActorMapping::new_uniform(actor_ids.into_iter(), VirtualNode::COUNT_FOR_TEST); - assert_eq!(actor_mapping.len(), VirtualNode::COUNT); + assert_eq!(actor_mapping.len(), VirtualNode::COUNT_FOR_TEST); let mut check: HashMap> = HashMap::new(); for (vnode, actor_id) in actor_mapping.iter_with_vnode() { @@ -120,7 +123,7 @@ mod tests { .map(|actor| { ( actor.actor_id as ActorId, - Bitmap::from(actor.vnode_bitmap.as_ref().unwrap()), + actor.vnode_bitmap.unwrap().clone(), ) }) .collect(); @@ -178,7 +181,7 @@ mod tests { #[test] fn test_rebalance_scale_out() { - for parallelism in simulated_parallelism(Some(3), Some(VirtualNode::COUNT - 1)) { + for parallelism in simulated_parallelism(Some(3), Some(VirtualNode::COUNT_FOR_TEST - 1)) { let actors = build_fake_actors((0..parallelism as ActorId).collect_vec()); // add 1 @@ -189,8 +192,9 @@ mod tests { let actors = build_fake_actors((0..parallelism as ActorId).collect_vec()); - // add to VirtualNode::COUNT - let actors_to_add = (parallelism as ActorId..VirtualNode::COUNT as ActorId).collect(); + // add to VirtualNode::COUNT_FOR_TEST + let actors_to_add = + (parallelism as ActorId..VirtualNode::COUNT_FOR_TEST as ActorId).collect(); let result = rebalance_actor_vnode(&actors, &BTreeSet::new(), &actors_to_add); assert_eq!(result.len(), actors.len() + actors_to_add.len()); 
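A side note on the test changes in this area: with the vnode count now an explicit argument, mappings in tests are built over `VirtualNode::COUNT_FOR_TEST` rather than the global `VirtualNode::COUNT`. A small sketch of the updated API, assuming only the `risingwave_common` calls that appear in this diff (`new_uniform`, `len`, `iter_with_vnode`):

```rust
use std::collections::HashMap;

use risingwave_common::hash::{ActorId, ActorMapping, VirtualNode};

fn uniform_mapping_example() {
    // The vnode count is passed explicitly instead of being a global constant.
    let mapping = ActorMapping::new_uniform(
        (0..4).map(|i| i as ActorId),
        VirtualNode::COUNT_FOR_TEST,
    );
    assert_eq!(mapping.len(), VirtualNode::COUNT_FOR_TEST);

    // Every vnode is owned by exactly one actor.
    let mut owners = HashMap::new();
    for (vnode, actor_id) in mapping.iter_with_vnode() {
        assert!(owners.insert(vnode, actor_id).is_none());
    }
    assert_eq!(owners.len(), VirtualNode::COUNT_FOR_TEST);
}
```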
check_bitmaps(&result); @@ -220,7 +224,7 @@ mod tests { } let target_bitmap = result.get(&actor.actor_id).unwrap(); - let prev_bitmap = Bitmap::from(actor.vnode_bitmap.as_ref().unwrap()); + let prev_bitmap = actor.vnode_bitmap.as_ref().unwrap(); assert!(prev_bitmap.eq(target_bitmap)); } } @@ -275,7 +279,7 @@ mod tests { #[test] fn test_rebalance_scale_real() { - let actor_ids = (0..(VirtualNode::COUNT - 1) as ActorId).collect_vec(); + let actor_ids = (0..(VirtualNode::COUNT_FOR_TEST - 1) as ActorId).collect_vec(); let actors = build_fake_actors(actor_ids); let actors_to_remove = btreeset! {0, 1}; let actors_to_add = btreeset! {255}; diff --git a/src/object_store/src/lib.rs b/src/object_store/src/lib.rs index d9e768b7f0290..c70d38eb90a90 100644 --- a/src/object_store/src/lib.rs +++ b/src/object_store/src/lib.rs @@ -14,7 +14,6 @@ #![feature(trait_alias)] #![feature(type_alias_impl_trait)] -#![feature(lint_reasons)] #![feature(error_generic_member_access)] #![feature(let_chains)] diff --git a/src/prost/build.rs b/src/prost/build.rs index 18bc2d4ae9494..da595949b4427 100644 --- a/src/prost/build.rs +++ b/src/prost/build.rs @@ -179,6 +179,7 @@ fn main() -> Result<(), Box> { .type_attribute("hummock.GroupDestroy", "#[derive(Eq)]") .type_attribute("hummock.GroupMetaChange", "#[derive(Eq)]") .type_attribute("hummock.GroupTableChange", "#[derive(Eq)]") + .type_attribute("hummock.GroupMerge", "#[derive(Eq)]") .type_attribute("hummock.GroupDelta", "#[derive(Eq)]") .type_attribute("hummock.LevelHandler.RunningCompactTask", "#[derive(Eq)]") .type_attribute("hummock.LevelHandler", "#[derive(Eq)]") diff --git a/src/prost/src/lib.rs b/src/prost/src/lib.rs index c8ad9de582edc..e965f76282da4 100644 --- a/src/prost/src/lib.rs +++ b/src/prost/src/lib.rs @@ -15,7 +15,6 @@ // for derived code of `Message` #![expect(clippy::all)] #![expect(clippy::doc_markdown)] -#![feature(lint_reasons)] use std::str::FromStr; diff --git a/src/risedevtool/src/lib.rs b/src/risedevtool/src/lib.rs index 57294e5a7eafa..e7b2fdf56f777 100644 --- a/src/risedevtool/src/lib.rs +++ b/src/risedevtool/src/lib.rs @@ -15,7 +15,6 @@ #![allow(clippy::derive_partial_eq_without_eq)] #![feature(exit_status_error)] #![feature(let_chains)] -#![feature(lint_reasons)] mod config; pub use config::*; diff --git a/src/risedevtool/src/task/task_kafka_ready_check.rs b/src/risedevtool/src/task/task_kafka_ready_check.rs index 79838bf8eca66..b749822a1ebe2 100644 --- a/src/risedevtool/src/task/task_kafka_ready_check.rs +++ b/src/risedevtool/src/task/task_kafka_ready_check.rs @@ -42,7 +42,7 @@ impl Task for KafkaReadyCheckTask { let mut config = ClientConfig::new(); config.set( "bootstrap.servers", - &format!("{}:{}", self.config.address, self.config.port), + format!("{}:{}", self.config.address, self.config.port), ); let rt = tokio::runtime::Builder::new_current_thread() diff --git a/src/rpc_client/src/hummock_meta_client.rs b/src/rpc_client/src/hummock_meta_client.rs index df42a0da3ff35..bb62875b3fae1 100644 --- a/src/rpc_client/src/hummock_meta_client.rs +++ b/src/rpc_client/src/hummock_meta_client.rs @@ -66,5 +66,9 @@ pub trait HummockMetaClient: Send + Sync + 'static { BoxStream<'static, CompactionEventItem>, )>; - async fn get_version_by_epoch(&self, epoch: HummockEpoch) -> Result; + async fn get_version_by_epoch( + &self, + epoch: HummockEpoch, + table_id: u32, + ) -> Result; } diff --git a/src/rpc_client/src/meta_client.rs b/src/rpc_client/src/meta_client.rs index b4e06d8690b72..c7b7204bff7c8 100644 --- a/src/rpc_client/src/meta_client.rs +++ 
b/src/rpc_client/src/meta_client.rs @@ -22,6 +22,7 @@ use std::time::{Duration, SystemTime}; use anyhow::{anyhow, Context}; use async_trait::async_trait; +use cluster_limit_service_client::ClusterLimitServiceClient; use either::Either; use futures::stream::BoxStream; use lru::LruCache; @@ -1245,10 +1246,12 @@ impl MetaClient { &self, group_id: CompactionGroupId, table_ids_to_new_group: &[StateTableId], + partition_vnode_count: u32, ) -> Result { let req = SplitCompactionGroupRequest { group_id, table_ids: table_ids_to_new_group.to_vec(), + partition_vnode_count, }; let resp = self.inner.split_compaction_group(req).await?; Ok(resp.new_group_id) @@ -1431,11 +1434,36 @@ impl MetaClient { Ok(resp.ret) } - pub async fn get_version_by_epoch(&self, epoch: HummockEpoch) -> Result { - let req = GetVersionByEpochRequest { epoch }; + pub async fn get_version_by_epoch( + &self, + epoch: HummockEpoch, + table_id: u32, + ) -> Result { + let req = GetVersionByEpochRequest { epoch, table_id }; let resp = self.inner.get_version_by_epoch(req).await?; Ok(resp.version.unwrap()) } + + pub async fn get_cluster_limits( + &self, + ) -> Result> { + let req = GetClusterLimitsRequest {}; + let resp = self.inner.get_cluster_limits(req).await?; + Ok(resp.active_limits.into_iter().map(|l| l.into()).collect()) + } + + pub async fn merge_compaction_group( + &self, + left_group_id: CompactionGroupId, + right_group_id: CompactionGroupId, + ) -> Result<()> { + let req = MergeCompactionGroupRequest { + left_group_id, + right_group_id, + }; + self.inner.merge_compaction_group(req).await?; + Ok(()) + } } #[async_trait] @@ -1598,8 +1626,12 @@ impl HummockMetaClient for MetaClient { Ok((request_sender, Box::pin(stream))) } - async fn get_version_by_epoch(&self, epoch: HummockEpoch) -> Result { - self.get_version_by_epoch(epoch).await + async fn get_version_by_epoch( + &self, + epoch: HummockEpoch, + table_id: u32, + ) -> Result { + self.get_version_by_epoch(epoch, table_id).await } } @@ -1636,6 +1668,7 @@ struct GrpcMetaClientCore { cloud_client: CloudServiceClient, sink_coordinate_client: SinkCoordinationRpcClient, event_log_client: EventLogServiceClient, + cluster_limit_client: ClusterLimitServiceClient, } impl GrpcMetaClientCore { @@ -1662,7 +1695,8 @@ impl GrpcMetaClientCore { let serving_client = ServingServiceClient::new(channel.clone()); let cloud_client = CloudServiceClient::new(channel.clone()); let sink_coordinate_client = SinkCoordinationServiceClient::new(channel.clone()); - let event_log_client = EventLogServiceClient::new(channel); + let event_log_client = EventLogServiceClient::new(channel.clone()); + let cluster_limit_client = ClusterLimitServiceClient::new(channel); GrpcMetaClientCore { cluster_client, @@ -1682,6 +1716,7 @@ impl GrpcMetaClientCore { cloud_client, sink_coordinate_client, event_log_client, + cluster_limit_client, } } } @@ -2105,6 +2140,7 @@ macro_rules! for_all_meta_rpc { ,{ hummock_client, cancel_compact_task, CancelCompactTaskRequest, CancelCompactTaskResponse} ,{ hummock_client, list_change_log_epochs, ListChangeLogEpochsRequest, ListChangeLogEpochsResponse } ,{ hummock_client, get_version_by_epoch, GetVersionByEpochRequest, GetVersionByEpochResponse } + ,{ hummock_client, merge_compaction_group, MergeCompactionGroupRequest, MergeCompactionGroupResponse } ,{ user_client, create_user, CreateUserRequest, CreateUserResponse } ,{ user_client, update_user, UpdateUserRequest, UpdateUserResponse } ,{ user_client, drop_user, DropUserRequest, DropUserResponse } @@ -2126,6 +2162,7 @@ macro_rules! 
for_all_meta_rpc { ,{ cloud_client, rw_cloud_validate_source, RwCloudValidateSourceRequest, RwCloudValidateSourceResponse } ,{ event_log_client, list_event_log, ListEventLogRequest, ListEventLogResponse } ,{ event_log_client, add_event_log, AddEventLogRequest, AddEventLogResponse } + ,{ cluster_limit_client, get_cluster_limits, GetClusterLimitsRequest, GetClusterLimitsResponse } } }; } diff --git a/src/rpc_client/src/sink_coordinate_client.rs b/src/rpc_client/src/sink_coordinate_client.rs index 06602ef4db3b7..8823dd440bc77 100644 --- a/src/rpc_client/src/sink_coordinate_client.rs +++ b/src/rpc_client/src/sink_coordinate_client.rs @@ -18,7 +18,7 @@ use anyhow::anyhow; use futures::{Stream, TryStreamExt}; use risingwave_common::bitmap::Bitmap; use risingwave_pb::connector_service::coordinate_request::{ - CommitRequest, StartCoordinationRequest, + CommitRequest, StartCoordinationRequest, UpdateVnodeBitmapRequest, }; use risingwave_pb::connector_service::{ coordinate_request, coordinate_response, CoordinateRequest, CoordinateResponse, PbSinkParam, @@ -99,4 +99,24 @@ impl CoordinatorStreamHandle { msg => Err(anyhow!("should get commit response but get {:?}", msg)), } } + + pub async fn update_vnode_bitmap(&mut self, vnode_bitmap: &Bitmap) -> anyhow::Result<()> { + self.send_request(CoordinateRequest { + msg: Some(coordinate_request::Msg::UpdateVnodeRequest( + UpdateVnodeBitmapRequest { + vnode_bitmap: Some(vnode_bitmap.to_protobuf()), + }, + )), + }) + .await?; + Ok(()) + } + + pub async fn stop(&mut self) -> anyhow::Result<()> { + self.send_request(CoordinateRequest { + msg: Some(coordinate_request::Msg::Stop(true)), + }) + .await?; + Ok(()) + } } diff --git a/src/rpc_client/src/stream_client.rs b/src/rpc_client/src/stream_client.rs index 920b6f0777f37..40a6d48dacb37 100644 --- a/src/rpc_client/src/stream_client.rs +++ b/src/rpc_client/src/stream_client.rs @@ -70,8 +70,7 @@ pub type StreamClientPoolRef = Arc; macro_rules! for_all_stream_rpc { ($macro:ident) => { $macro! { - { 0, drop_actors, DropActorsRequest, DropActorsResponse } - ,{ 0, wait_epoch_commit, WaitEpochCommitRequest, WaitEpochCommitResponse } + { 0, wait_epoch_commit, WaitEpochCommitRequest, WaitEpochCommitResponse } } }; } diff --git a/src/sqlparser/src/lib.rs b/src/sqlparser/src/lib.rs index a102e5428edae..07967d4cf75a7 100644 --- a/src/sqlparser/src/lib.rs +++ b/src/sqlparser/src/lib.rs @@ -31,7 +31,6 @@ //! ``` #![cfg_attr(not(feature = "std"), no_std)] -#![feature(lint_reasons)] #![feature(let_chains)] #![expect(clippy::doc_markdown)] #![expect(clippy::upper_case_acronyms)] diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index 2886c4e4e23f7..6a6bde4b146e0 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -96,7 +96,7 @@ workspace-hack = { path = "../workspace-hack" } bincode = "1" criterion = { workspace = true, features = ["async_futures", "async_tokio"] } expect-test = "1" -risingwave_hummock_sdk = { workspace = true } +risingwave_hummock_sdk = { workspace = true, features = ["test"] } risingwave_test_runner = { workspace = true } uuid = { version = "1", features = ["v4"] } diff --git a/src/storage/backup/integration_tests/test_basic.sh b/src/storage/backup/integration_tests/test_basic.sh index afaee3ac6c507..9674807e62c6e 100644 --- a/src/storage/backup/integration_tests/test_basic.sh +++ b/src/storage/backup/integration_tests/test_basic.sh @@ -34,12 +34,20 @@ if ! psql -h localhost -p 4566 -d dev -U root -c "show materialized views;" | gr echo "expect 0 MV" exit 1 fi +if ! 
psql -h localhost -p 4566 -d dev -U root -c "show secrets;" | grep -q "0 row"; then + echo "expect 0 SECRET" + exit 1 +fi echo "restore snapshot ${job_id_1} succeeded" restore "${job_id_2}" start_cluster if ! psql -h localhost -p 4566 -d dev -U root -c "show materialized views;" | grep -q "1 row"; then - echo "expect 1 MVs" + echo "expect 1 MV" + exit 1 +fi +if ! psql -h localhost -p 4566 -d dev -U root -c "show secrets;" | grep -q "1 row"; then + echo "expect 1 SECRET" exit 1 fi echo "restore snapshot ${job_id_2} succeeded" @@ -55,6 +63,10 @@ if ! psql -h localhost -p 4566 -d dev -U root -c "show materialized views;" | gr echo "expect 0 MV" exit 1 fi +if ! psql -h localhost -p 4566 -d dev -U root -c "show secrets;" | grep -q "0 row"; then + echo "expect 0 SECRET" + exit 1 +fi echo "restore snapshot ${job_id_3} succeeded" echo "test succeeded" diff --git a/src/storage/backup/src/lib.rs b/src/storage/backup/src/lib.rs index 8dfba1b62a181..e543d139b44f0 100644 --- a/src/storage/backup/src/lib.rs +++ b/src/storage/backup/src/lib.rs @@ -17,7 +17,6 @@ #![feature(type_alias_impl_trait)] #![feature(extract_if)] #![feature(custom_test_frameworks)] -#![feature(lint_reasons)] #![feature(map_try_insert)] #![feature(hash_extract_if)] #![feature(btree_extract_if)] diff --git a/src/storage/benches/bench_table_watermarks.rs b/src/storage/benches/bench_table_watermarks.rs index 4a9e1c5edda0b..5153dd0f9fe38 100644 --- a/src/storage/benches/bench_table_watermarks.rs +++ b/src/storage/benches/bench_table_watermarks.rs @@ -166,7 +166,7 @@ fn bench_table_watermarks(c: &mut Criterion) { let mut pinned_version = PinnedVersion::new(versions.pop_front().unwrap(), unbounded_channel().0); while let Some(version) = versions.pop_front() { - pinned_version = pinned_version.new_pin_version(version); + pinned_version = pinned_version.new_pin_version(version).unwrap(); } }, BatchSize::SmallInput, diff --git a/src/storage/compactor/src/lib.rs b/src/storage/compactor/src/lib.rs index 22e70ac759aed..4c503f3d7a8d5 100644 --- a/src/storage/compactor/src/lib.rs +++ b/src/storage/compactor/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
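Stepping back to the `sink_coordinate_client.rs` hunk above: the two methods added to `CoordinatorStreamHandle` let a sink writer report new vnode ownership after scaling and shut the coordination stream down cleanly. A minimal, hypothetical usage sketch (only the two new calls come from this diff; the surrounding function and the `bitmap` value are assumptions):

    // Hypothetical caller of the new CoordinatorStreamHandle methods.
    async fn rescale_then_stop(
        handle: &mut CoordinatorStreamHandle,
        bitmap: &risingwave_common::bitmap::Bitmap,
    ) -> anyhow::Result<()> {
        // Tell the sink coordinator which vnodes this writer now owns.
        handle.update_vnode_bitmap(bitmap).await?;
        // Ask the coordinator to tear the stream down gracefully.
        handle.stop().await?;
        Ok(())
    }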
-#![feature(lint_reasons)] - mod compactor_observer; mod rpc; pub mod server; diff --git a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index ca6585f46fd51..682cb107f3395 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -22,12 +22,13 @@ use itertools::Itertools; use risingwave_common::catalog::TableId; use risingwave_common::hash::VnodeBitmapExt; use risingwave_pb::hummock::{ - CompactionConfig, CompatibilityVersion, GroupConstruct, GroupDestroy, GroupMetaChange, + CompactionConfig, CompatibilityVersion, GroupConstruct, GroupMerge, GroupMetaChange, GroupTableChange, PbLevelType, }; use tracing::warn; -use super::StateTableId; +use super::group_split::get_sub_level_insert_hint; +use super::{group_split, StateTableId}; use crate::change_log::TableChangeLog; use crate::compaction_group::StaticCompactionGroupId; use crate::key_range::KeyRangeCommon; @@ -47,13 +48,17 @@ pub struct GroupDeltasSummary { pub insert_sub_level_id: u64, pub insert_table_infos: Vec, pub group_construct: Option, - pub group_destroy: Option, + pub group_destroy: Option, pub group_meta_changes: Vec, pub group_table_change: Option, pub new_vnode_partition_count: u32, + pub group_merge: Option, } -pub fn summarize_group_deltas(group_deltas: &GroupDeltas) -> GroupDeltasSummary { +pub fn summarize_group_deltas( + group_deltas: &GroupDeltas, + compaction_group_id: CompactionGroupId, +) -> GroupDeltasSummary { let mut delete_sst_levels = Vec::with_capacity(group_deltas.group_deltas.len()); let mut delete_sst_ids_set = HashSet::new(); let mut insert_sst_level_id = u32::MAX; @@ -64,6 +69,7 @@ pub fn summarize_group_deltas(group_deltas: &GroupDeltas) -> GroupDeltasSummary let mut group_meta_changes = vec![]; let mut group_table_change = None; let mut new_vnode_partition_count = 0; + let mut group_merge = None; for group_delta in &group_deltas.group_deltas { match group_delta { @@ -83,9 +89,9 @@ pub fn summarize_group_deltas(group_deltas: &GroupDeltas) -> GroupDeltasSummary assert!(group_construct.is_none()); group_construct = Some(construct_delta.clone()); } - GroupDelta::GroupDestroy(destroy_delta) => { + GroupDelta::GroupDestroy(_) => { assert!(group_destroy.is_none()); - group_destroy = Some(*destroy_delta); + group_destroy = Some(compaction_group_id); } GroupDelta::GroupMetaChange(meta_delta) => { group_meta_changes.push(meta_delta.clone()); @@ -93,6 +99,11 @@ pub fn summarize_group_deltas(group_deltas: &GroupDeltas) -> GroupDeltasSummary GroupDelta::GroupTableChange(meta_delta) => { group_table_change = Some(meta_delta.clone()); } + GroupDelta::GroupMerge(merge_delta) => { + assert!(group_merge.is_none()); + group_merge = Some(*merge_delta); + group_destroy = Some(merge_delta.right_group_id); + } } } @@ -110,6 +121,7 @@ pub fn summarize_group_deltas(group_deltas: &GroupDeltas) -> GroupDeltasSummary group_meta_changes, group_table_change, new_vnode_partition_count, + group_merge, } } @@ -173,6 +185,25 @@ impl HummockVersion { })) } + // only scan the sst infos from levels in the specified compaction group (without table change log) + pub fn get_sst_ids_by_group_id( + &self, + compaction_group_id: CompactionGroupId, + ) -> impl Iterator + '_ { + self.levels + .iter() + .filter_map(move |(cg_id, level)| { + if *cg_id == compaction_group_id { + Some(level) + } else { + None + } + }) + .flat_map(|level| 
level.l0.sub_levels.iter().rev().chain(level.levels.iter())) + .flat_map(|level| level.table_infos.iter()) + .map(|s| s.sst_id) + } + /// `get_sst_infos_from_groups` doesn't guarantee that all returned sst info belongs to `select_group`. /// i.e. `select_group` is just a hint. /// We separate `get_sst_infos_from_groups` and `get_sst_infos` because `get_sst_infos_from_groups` may be further customized in the future. @@ -354,7 +385,7 @@ impl HummockVersion { &mut self, parent_group_id: CompactionGroupId, group_id: CompactionGroupId, - member_table_ids: HashSet, + member_table_ids: BTreeSet, new_sst_start_id: u64, ) { let mut new_sst_id = new_sst_start_id; @@ -386,23 +417,6 @@ impl HummockVersion { { for sub_level in &mut l0.sub_levels { let target_l0 = &mut cur_levels.l0; - // When `insert_hint` is `Ok(idx)`, it means that the sub level `idx` in `target_l0` - // will extend these SSTs. When `insert_hint` is `Err(idx)`, it - // means that we will add a new sub level `idx` into `target_l0`. - let mut insert_hint = Err(target_l0.sub_levels.len()); - for (idx, other) in target_l0.sub_levels.iter_mut().enumerate() { - match other.sub_level_id.cmp(&sub_level.sub_level_id) { - Ordering::Less => {} - Ordering::Equal => { - insert_hint = Ok(idx); - break; - } - Ordering::Greater => { - insert_hint = Err(idx); - break; - } - } - } // Remove SST from sub level may result in empty sub level. It will be purged // whenever another compaction task is finished. let insert_table_infos = @@ -419,7 +433,7 @@ impl HummockVersion { if insert_table_infos.is_empty() { continue; } - match insert_hint { + match get_sub_level_insert_hint(&target_l0.sub_levels, sub_level) { Ok(idx) => { add_ssts_to_sub_level(target_l0, idx, insert_table_infos); } @@ -570,7 +584,7 @@ impl HummockVersion { // apply to `levels`, which is different compaction groups for (compaction_group_id, group_deltas) in &version_delta.group_deltas { - let summary = summarize_group_deltas(group_deltas); + let summary = summarize_group_deltas(group_deltas, *compaction_group_id); if let Some(group_construct) = &summary.group_construct { let mut new_levels = build_initial_compaction_group_levels( *compaction_group_id, @@ -594,7 +608,7 @@ impl HummockVersion { } else { #[expect(deprecated)] // for backward-compatibility of previous hummock version delta - HashSet::from_iter(group_construct.table_ids.clone()) + BTreeSet::from_iter(group_construct.table_ids.clone()) }; self.init_with_parent_group( @@ -614,7 +628,7 @@ impl HummockVersion { self.init_with_parent_group( group_change.origin_group_id, group_change.target_group_id, - HashSet::from_iter(group_change.table_ids.clone()), + BTreeSet::from_iter(group_change.table_ids.clone()), group_change.new_sst_start_id, ); @@ -635,14 +649,19 @@ impl HummockVersion { .expect("compaction group should exist") .member_table_ids .append(&mut moving_tables); + } else if let Some(group_merge) = &summary.group_merge { + tracing::info!( + "group_merge left {:?} right {:?}", + group_merge.left_group_id, + group_merge.right_group_id + ); + self.merge_compaction_group(group_merge.left_group_id, group_merge.right_group_id) } - let has_destroy = summary.group_destroy.is_some(); let visible_table_committed_epoch = self.visible_table_committed_epoch(); - let levels = self - .levels - .get_mut(compaction_group_id) - .expect("compaction group should exist"); - + let group_destroy = summary.group_destroy; + let levels = self.levels.get_mut(compaction_group_id).unwrap_or_else(|| { + panic!("compaction group {} does not exist", 
compaction_group_id) + }); #[expect(deprecated)] // for backward-compatibility of previous hummock version delta for group_meta_delta in &summary.group_meta_changes { levels @@ -669,7 +688,8 @@ impl HummockVersion { } = summary; assert!( - delete_sst_levels.is_empty() && delete_sst_ids_set.is_empty() || has_destroy, + delete_sst_levels.is_empty() && delete_sst_ids_set.is_empty() + || group_destroy.is_some(), "no sst should be deleted when committing an epoch" ); for group_delta in &group_deltas.group_deltas { @@ -703,8 +723,8 @@ impl HummockVersion { .compaction_group_member_table_ids(*compaction_group_id), ); } - if has_destroy { - self.levels.remove(compaction_group_id); + if let Some(destroy_group_id) = &group_destroy { + self.levels.remove(destroy_group_id); } } self.id = version_delta.id; @@ -835,6 +855,45 @@ impl HummockVersion { } ret } + + pub fn merge_compaction_group( + &mut self, + left_group_id: CompactionGroupId, + right_group_id: CompactionGroupId, + ) { + // Double check + let left_group_id_table_ids = self + .state_table_info + .compaction_group_member_table_ids(left_group_id) + .iter() + .map(|table_id| table_id.table_id); + let right_group_id_table_ids = self + .state_table_info + .compaction_group_member_table_ids(right_group_id) + .iter() + .map(|table_id| table_id.table_id); + + assert!(left_group_id_table_ids + .chain(right_group_id_table_ids) + .is_sorted()); + + let total_cg = self.levels.keys().cloned().collect::>(); + let right_levels = self.levels.remove(&right_group_id).unwrap_or_else(|| { + panic!( + "compaction group should exist right {} all {:?}", + right_group_id, total_cg + ) + }); + + let left_levels = self.levels.get_mut(&left_group_id).unwrap_or_else(|| { + panic!( + "compaction group should exist left {} all {:?}", + left_group_id, total_cg + ) + }); + + group_split::merge_levels(left_levels, right_levels); + } } #[easy_ext::ext(HummockLevelsExt)] @@ -998,7 +1057,7 @@ pub fn build_initial_compaction_group_levels( } fn split_sst_info_for_level( - member_table_ids: &HashSet, + member_table_ids: &BTreeSet, level: &mut Level, new_sst_id: &mut u64, ) -> Vec { @@ -1228,6 +1287,14 @@ pub fn object_size_map(version: &HummockVersion) -> HashMap, + new_table_ids: Vec, ) -> SstableInfo { let mut branch_table_info = sst_info.clone(); branch_table_info.sst_id = *new_sst_id; branch_table_info.sst_size = new_sst_size; + *new_sst_id += 1; - sst_info.sst_id = *new_sst_id + 1; + sst_info.sst_id = *new_sst_id; sst_info.sst_size = old_sst_size; + *new_sst_id += 1; { // related github.com/risingwavelabs/risingwave/pull/17898/ // This is a temporary implementation that will update `table_ids`` based on the new split rule after PR 17898 - - let set1: HashSet<_> = sst_info.table_ids.iter().cloned().collect(); - let set2: HashSet<_> = new_sst_table_ids.iter().cloned().collect(); + // sst_info.table_ids = vec[1, 2, 3]; + // new_table_ids = vec[2, 3, 4]; + // branch_table_info.table_ids = vec[1, 2, 3] ∩ vec[2, 3, 4] = vec[2, 3] + let set1: BTreeSet<_> = sst_info.table_ids.iter().cloned().collect(); + let set2: BTreeSet<_> = new_table_ids.into_iter().collect(); let intersection: Vec<_> = set1.intersection(&set2).cloned().collect(); // Update table_ids @@ -1362,8 +1433,6 @@ pub fn split_sst( .retain(|table_id| !branch_table_info.table_ids.contains(table_id)); } - *new_sst_id += 1; - branch_table_info } @@ -1371,9 +1440,15 @@ pub fn split_sst( mod tests { use std::collections::HashMap; + use bytes::Bytes; + use risingwave_common::catalog::TableId; + use 
risingwave_common::hash::VirtualNode; use risingwave_pb::hummock::{CompactionConfig, GroupConstruct, GroupDestroy, LevelType}; + use crate::compaction_group::group_split; use crate::compaction_group::hummock_version_ext::build_initial_compaction_group_levels; + use crate::key::{gen_key_from_str, FullKey}; + use crate::key_range::KeyRange; use crate::level::{Level, Levels, OverlappingLevel}; use crate::sstable_info::SstableInfo; use crate::version::{ @@ -1531,4 +1606,404 @@ mod tests { version }); } + + fn gen_sst_info(object_id: u64, table_ids: Vec, left: Bytes, right: Bytes) -> SstableInfo { + SstableInfo { + object_id, + sst_id: object_id, + key_range: KeyRange { + left, + right, + right_exclusive: false, + }, + table_ids, + file_size: 100, + sst_size: 100, + uncompressed_file_size: 100, + ..Default::default() + } + } + + #[test] + fn test_merge_levels() { + let mut left_levels = build_initial_compaction_group_levels( + 1, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ); + + let mut right_levels = build_initial_compaction_group_levels( + 2, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ); + + left_levels.levels[0] = Level { + level_idx: 1, + level_type: LevelType::Nonoverlapping, + table_infos: vec![ + gen_sst_info( + 1, + vec![3], + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + ), + gen_sst_info( + 10, + vec![3, 4], + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(201), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(4), + gen_key_from_str(VirtualNode::from_index(10), "1"), + 0, + ) + .encode() + .into(), + ), + gen_sst_info( + 11, + vec![4], + FullKey::for_test( + TableId::new(4), + gen_key_from_str(VirtualNode::from_index(11), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(4), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + ), + ], + total_file_size: 300, + ..Default::default() + }; + + left_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 3, + vec![3], + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 101, + level_type: LevelType::Overlapping, + total_file_size: 100, + ..Default::default() + }); + + left_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 3, + vec![3], + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 103, + level_type: LevelType::Overlapping, + total_file_size: 100, + ..Default::default() + }); + + left_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 3, + vec![3], + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 105, + 
level_type: LevelType::Nonoverlapping, + total_file_size: 100, + ..Default::default() + }); + + right_levels.levels[0] = Level { + level_idx: 1, + level_type: LevelType::Nonoverlapping, + table_infos: vec![ + gen_sst_info( + 1, + vec![5], + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + ), + gen_sst_info( + 10, + vec![5, 6], + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(201), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(6), + gen_key_from_str(VirtualNode::from_index(10), "1"), + 0, + ) + .encode() + .into(), + ), + gen_sst_info( + 11, + vec![6], + FullKey::for_test( + TableId::new(6), + gen_key_from_str(VirtualNode::from_index(11), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(6), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + ), + ], + total_file_size: 300, + ..Default::default() + }; + + right_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 3, + vec![5], + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 101, + level_type: LevelType::Overlapping, + total_file_size: 100, + ..Default::default() + }); + + right_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 5, + vec![5], + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 102, + level_type: LevelType::Overlapping, + total_file_size: 100, + ..Default::default() + }); + + right_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 3, + vec![5], + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 103, + level_type: LevelType::Nonoverlapping, + total_file_size: 100, + ..Default::default() + }); + + { + // test empty + let mut left_levels = Levels::default(); + let right_levels = Levels::default(); + + group_split::merge_levels(&mut left_levels, right_levels); + } + + { + // test empty left + let mut left_levels = build_initial_compaction_group_levels( + 1, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ); + let right_levels = right_levels.clone(); + + group_split::merge_levels(&mut left_levels, right_levels); + + assert!(left_levels.l0.sub_levels.len() == 3); + assert!(left_levels.l0.sub_levels[0].sub_level_id == 101); + assert_eq!(100, left_levels.l0.sub_levels[0].total_file_size); + assert!(left_levels.l0.sub_levels[1].sub_level_id == 102); + assert_eq!(100, left_levels.l0.sub_levels[1].total_file_size); + assert!(left_levels.l0.sub_levels[2].sub_level_id == 103); + assert_eq!(100, left_levels.l0.sub_levels[2].total_file_size); + + assert!(left_levels.levels[0].level_idx == 1); + assert_eq!(300, 
left_levels.levels[0].total_file_size); + } + + { + // test empty right + let mut left_levels = left_levels.clone(); + let right_levels = build_initial_compaction_group_levels( + 2, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ); + + group_split::merge_levels(&mut left_levels, right_levels); + + assert!(left_levels.l0.sub_levels.len() == 3); + assert!(left_levels.l0.sub_levels[0].sub_level_id == 101); + assert_eq!(100, left_levels.l0.sub_levels[0].total_file_size); + assert!(left_levels.l0.sub_levels[1].sub_level_id == 103); + assert_eq!(100, left_levels.l0.sub_levels[1].total_file_size); + assert!(left_levels.l0.sub_levels[2].sub_level_id == 105); + assert_eq!(100, left_levels.l0.sub_levels[2].total_file_size); + + assert!(left_levels.levels[0].level_idx == 1); + assert_eq!(300, left_levels.levels[0].total_file_size); + } + + { + let mut left_levels = left_levels.clone(); + let right_levels = right_levels.clone(); + + group_split::merge_levels(&mut left_levels, right_levels); + + assert!(left_levels.l0.sub_levels.len() == 6); + assert!(left_levels.l0.sub_levels[0].sub_level_id == 101); + assert_eq!(100, left_levels.l0.sub_levels[0].total_file_size); + assert!(left_levels.l0.sub_levels[1].sub_level_id == 103); + assert_eq!(100, left_levels.l0.sub_levels[1].total_file_size); + assert!(left_levels.l0.sub_levels[2].sub_level_id == 105); + assert_eq!(100, left_levels.l0.sub_levels[2].total_file_size); + assert!(left_levels.l0.sub_levels[3].sub_level_id == 106); + assert_eq!(100, left_levels.l0.sub_levels[3].total_file_size); + assert!(left_levels.l0.sub_levels[4].sub_level_id == 107); + assert_eq!(100, left_levels.l0.sub_levels[4].total_file_size); + assert!(left_levels.l0.sub_levels[5].sub_level_id == 108); + assert_eq!(100, left_levels.l0.sub_levels[5].total_file_size); + + assert!(left_levels.levels[0].level_idx == 1); + assert_eq!(600, left_levels.levels[0].total_file_size); + } + } } diff --git a/src/storage/hummock_sdk/src/compaction_group/mod.rs b/src/storage/hummock_sdk/src/compaction_group/mod.rs index 973cc3e3c6140..94ef89b8046e2 100644 --- a/src/storage/hummock_sdk/src/compaction_group/mod.rs +++ b/src/storage/hummock_sdk/src/compaction_group/mod.rs @@ -43,3 +43,115 @@ impl From for CompactionGroupId { cg as CompactionGroupId } } + +pub mod group_split { + use std::cmp::Ordering; + + use super::hummock_version_ext::insert_new_sub_level; + use crate::can_concat; + use crate::level::{Level, Levels}; + + pub fn merge_levels(left_levels: &mut Levels, right_levels: Levels) { + let right_l0 = right_levels.l0; + + let mut max_left_sub_level_id = left_levels + .l0 + .sub_levels + .iter() + .map(|sub_level| sub_level.sub_level_id + 1) + .max() + .unwrap_or(0); // If there are no sub levels, the max sub level id is 0. + let need_rewrite_right_sub_level_id = max_left_sub_level_id != 0; + + for mut right_sub_level in right_l0.sub_levels { + // Rewrite the sub level id of the right sub level to avoid conflict with left sub levels. (conflict level type) + // e.g.
left sub levels: [0, 1, 2], right sub levels: [0, 1, 2], after rewrite, right sub levels: [3, 4, 5] + if need_rewrite_right_sub_level_id { + right_sub_level.sub_level_id = max_left_sub_level_id; + max_left_sub_level_id += 1; + } + + insert_new_sub_level( + &mut left_levels.l0, + right_sub_level.sub_level_id, + right_sub_level.level_type, + right_sub_level.table_infos, + None, + ); + } + + assert!( + left_levels + .l0 + .sub_levels + .is_sorted_by_key(|sub_level| sub_level.sub_level_id), + "{}", + format!("left_levels.l0.sub_levels: {:?}", left_levels.l0.sub_levels) + ); + + // Reinitialise `vnode_partition_count` to avoid misaligned hierarchies + // caused by the merge of different compaction groups.(picker might reject the different `vnode_partition_count` sub_level to compact) + left_levels + .l0 + .sub_levels + .iter_mut() + .for_each(|sub_level| sub_level.vnode_partition_count = 0); + + for (idx, level) in right_levels.levels.into_iter().enumerate() { + if level.table_infos.is_empty() { + continue; + } + + let insert_table_infos = level.table_infos; + left_levels.levels[idx].total_file_size += insert_table_infos + .iter() + .map(|sst| sst.sst_size) + .sum::(); + left_levels.levels[idx].uncompressed_file_size += insert_table_infos + .iter() + .map(|sst| sst.uncompressed_file_size) + .sum::(); + + left_levels.levels[idx] + .table_infos + .extend(insert_table_infos); + left_levels.levels[idx] + .table_infos + .sort_by(|sst1, sst2| sst1.key_range.cmp(&sst2.key_range)); + assert!( + can_concat(&left_levels.levels[idx].table_infos), + "{}", + format!( + "left-group {} right-group {} left_levels.levels[{}].table_infos: {:?} level_idx {:?}", + left_levels.group_id, + right_levels.group_id, + idx, + left_levels.levels[idx].table_infos, + left_levels.levels[idx].level_idx + ) + ); + } + } + + // When `insert_hint` is `Ok(idx)`, it means that the sub level `idx` in `target_l0` + // will extend these SSTs. When `insert_hint` is `Err(idx)`, it + // means that we will add a new sub level `idx` into `target_l0`. + pub fn get_sub_level_insert_hint( + target_levels: &Vec, + sub_level: &Level, + ) -> Result { + for (idx, other) in target_levels.iter().enumerate() { + match other.sub_level_id.cmp(&sub_level.sub_level_id) { + Ordering::Less => {} + Ordering::Equal => { + return Ok(idx); + } + Ordering::Greater => { + return Err(idx); + } + } + } + + Err(target_levels.len()) + } +} diff --git a/src/storage/hummock_sdk/src/lib.rs b/src/storage/hummock_sdk/src/lib.rs index 9e6962ab117aa..921ab18fcf7cd 100644 --- a/src/storage/hummock_sdk/src/lib.rs +++ b/src/storage/hummock_sdk/src/lib.rs @@ -15,7 +15,6 @@ #![feature(async_closure)] #![feature(extract_if)] #![feature(hash_extract_if)] -#![feature(lint_reasons)] #![feature(map_many_mut)] #![feature(type_alias_impl_trait)] #![feature(impl_trait_in_assoc_type)] @@ -130,6 +129,7 @@ pub const FIRST_VERSION_ID: HummockVersionId = HummockVersionId(1); pub const SPLIT_TABLE_COMPACTION_GROUP_ID_HEAD: u64 = 1u64 << 56; pub const SINGLE_TABLE_COMPACTION_GROUP_ID_HEAD: u64 = 2u64 << 56; pub const OBJECT_SUFFIX: &str = "data"; +pub const HUMMOCK_SSTABLE_OBJECT_ID_MAX_DECIMAL_LENGTH: usize = 20; #[macro_export] /// This is wrapper for `info` log. 
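The `Ok`/`Err` convention of `get_sub_level_insert_hint` above follows `binary_search`: `Ok(idx)` names an existing sub level with the same `sub_level_id` to extend, while `Err(idx)` is the position at which a brand-new sub level should be inserted. A rough sketch of the intended call site, mirroring the `init_with_parent_group` hunk earlier in this diff (`add_ssts_to_sub_level` and `insert_new_sub_level` are the existing helpers in `hummock_version_ext.rs`; the exact `Err` arm here is an assumption):

    // Sketch: route SSTs either into an existing sub level (Ok) or into a
    // newly created sub level at position `idx` (Err), keeping ids sorted.
    match group_split::get_sub_level_insert_hint(&target_l0.sub_levels, sub_level) {
        Ok(idx) => add_ssts_to_sub_level(target_l0, idx, insert_table_infos),
        Err(idx) => insert_new_sub_level(
            target_l0,
            sub_level.sub_level_id,
            sub_level.level_type,
            insert_table_infos,
            Some(idx),
        ),
    }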
@@ -359,3 +359,14 @@ impl EpochWithGap { self.0 & EPOCH_SPILL_TIME_MASK } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_object_id_decimal_max_length() { + let len = HummockSstableObjectId::MAX.to_string().len(); + assert_eq!(len, HUMMOCK_SSTABLE_OBJECT_ID_MAX_DECIMAL_LENGTH) + } +} diff --git a/src/storage/hummock_sdk/src/sstable_info.rs b/src/storage/hummock_sdk/src/sstable_info.rs index 2f64508e57314..20943e4dd101a 100644 --- a/src/storage/hummock_sdk/src/sstable_info.rs +++ b/src/storage/hummock_sdk/src/sstable_info.rs @@ -63,6 +63,7 @@ impl SstableInfo { impl From for SstableInfo { fn from(pb_sstable_info: PbSstableInfo) -> Self { + assert!(pb_sstable_info.table_ids.is_sorted()); Self { object_id: pb_sstable_info.object_id, sst_id: pb_sstable_info.sst_id, @@ -100,6 +101,7 @@ impl From for SstableInfo { impl From<&PbSstableInfo> for SstableInfo { fn from(pb_sstable_info: &PbSstableInfo) -> Self { + assert!(pb_sstable_info.table_ids.is_sorted()); Self { object_id: pb_sstable_info.object_id, sst_id: pb_sstable_info.sst_id, @@ -136,7 +138,8 @@ impl From<&PbSstableInfo> for SstableInfo { impl From for PbSstableInfo { fn from(sstable_info: SstableInfo) -> Self { - assert_ne!(0, sstable_info.sst_size); + assert!(sstable_info.sst_size > 0 || sstable_info.is_stripped()); + assert!(sstable_info.table_ids.is_sorted()); PbSstableInfo { object_id: sstable_info.object_id, sst_id: sstable_info.sst_id, @@ -174,7 +177,8 @@ impl From for PbSstableInfo { impl From<&SstableInfo> for PbSstableInfo { fn from(sstable_info: &SstableInfo) -> Self { - assert_ne!(0, sstable_info.sst_size); + assert!(sstable_info.sst_size > 0 || sstable_info.is_stripped()); + assert!(sstable_info.table_ids.is_sorted()); PbSstableInfo { object_id: sstable_info.object_id, sst_id: sstable_info.sst_id, @@ -212,3 +216,10 @@ impl SstableInfo { self.key_range = KeyRange::default(); } } + +// Time travel +impl SstableInfo { + pub fn is_stripped(&self) -> bool { + self.object_id == 0 + } +} diff --git a/src/storage/hummock_sdk/src/version.rs b/src/storage/hummock_sdk/src/version.rs index e418250f0b6bf..1c8cfd1e310b4 100644 --- a/src/storage/hummock_sdk/src/version.rs +++ b/src/storage/hummock_sdk/src/version.rs @@ -24,9 +24,9 @@ use risingwave_common::util::epoch::INVALID_EPOCH; use risingwave_pb::hummock::group_delta::PbDeltaType; use risingwave_pb::hummock::hummock_version_delta::PbGroupDeltas; use risingwave_pb::hummock::{ - CompactionConfig, PbGroupConstruct, PbGroupDelta, PbGroupDestroy, PbGroupMetaChange, - PbGroupTableChange, PbHummockVersion, PbHummockVersionDelta, PbIntraLevelDelta, - PbStateTableInfo, StateTableInfo, StateTableInfoDelta, + CompactionConfig, PbGroupConstruct, PbGroupDelta, PbGroupDestroy, PbGroupMerge, + PbGroupMetaChange, PbGroupTableChange, PbHummockVersion, PbHummockVersionDelta, + PbIntraLevelDelta, PbStateTableInfo, StateTableInfo, StateTableInfoDelta, }; use tracing::warn; @@ -501,12 +501,10 @@ impl HummockVersionDelta { .flat_map(|group_deltas| { group_deltas.group_deltas.iter().flat_map(|group_delta| { static EMPTY_VEC: Vec = Vec::new(); - let sst_slice = match group_delta { - GroupDelta::IntraLevel(level_delta) => &level_delta.inserted_table_infos, - GroupDelta::GroupConstruct(_) - | GroupDelta::GroupDestroy(_) - | GroupDelta::GroupMetaChange(_) - | GroupDelta::GroupTableChange(_) => &EMPTY_VEC, + let sst_slice = if let GroupDelta::IntraLevel(level_delta) = &group_delta { + &level_delta.inserted_table_infos + } else { + &EMPTY_VEC }; sst_slice.iter().map(|sst| sst.object_id) }) @@ 
-526,12 +524,10 @@ impl HummockVersionDelta { let ssts_from_group_deltas = self.group_deltas.values().flat_map(|group_deltas| { group_deltas.group_deltas.iter().flat_map(|group_delta| { static EMPTY_VEC: Vec = Vec::new(); - let sst_slice = match group_delta { - GroupDelta::IntraLevel(level_delta) => &level_delta.inserted_table_infos, - GroupDelta::GroupConstruct(_) - | GroupDelta::GroupDestroy(_) - | GroupDelta::GroupMetaChange(_) - | GroupDelta::GroupTableChange(_) => &EMPTY_VEC, + let sst_slice = if let GroupDelta::IntraLevel(level_delta) = &group_delta { + &level_delta.inserted_table_infos + } else { + &EMPTY_VEC }; sst_slice.iter() }) @@ -564,12 +560,10 @@ impl HummockVersionDelta { .flat_map(|group_deltas| { group_deltas.group_deltas.iter().flat_map(|group_delta| { static EMPTY_VEC: Vec = Vec::new(); - let sst_slice = match group_delta { - GroupDelta::IntraLevel(level_delta) => &level_delta.inserted_table_infos, - GroupDelta::GroupConstruct(_) - | GroupDelta::GroupDestroy(_) - | GroupDelta::GroupMetaChange(_) - | GroupDelta::GroupTableChange(_) => &EMPTY_VEC, + let sst_slice = if let GroupDelta::IntraLevel(level_delta) = &group_delta { + &level_delta.inserted_table_infos + } else { + &EMPTY_VEC }; sst_slice.iter() }) @@ -881,6 +875,8 @@ pub enum GroupDelta { #[allow(dead_code)] GroupTableChange(PbGroupTableChange), + + GroupMerge(PbGroupMerge), } impl From for GroupDelta { @@ -901,6 +897,7 @@ impl From for GroupDelta { Some(PbDeltaType::GroupTableChange(pb_group_table_change)) => { GroupDelta::GroupTableChange(pb_group_table_change) } + Some(PbDeltaType::GroupMerge(pb_group_merge)) => GroupDelta::GroupMerge(pb_group_merge), None => panic!("delta_type is not set"), } } @@ -924,6 +921,9 @@ impl From for PbGroupDelta { GroupDelta::GroupTableChange(pb_group_table_change) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupTableChange(pb_group_table_change)), }, + GroupDelta::GroupMerge(pb_group_merge) => PbGroupDelta { + delta_type: Some(PbDeltaType::GroupMerge(pb_group_merge)), + }, } } } @@ -946,6 +946,9 @@ impl From<&GroupDelta> for PbGroupDelta { GroupDelta::GroupTableChange(pb_group_table_change) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupTableChange(pb_group_table_change.clone())), }, + GroupDelta::GroupMerge(pb_group_merge) => PbGroupDelta { + delta_type: Some(PbDeltaType::GroupMerge(*pb_group_merge)), + }, } } } @@ -968,6 +971,9 @@ impl From<&PbGroupDelta> for GroupDelta { Some(PbDeltaType::GroupTableChange(pb_group_table_change)) => { GroupDelta::GroupTableChange(pb_group_table_change.clone()) } + Some(PbDeltaType::GroupMerge(pb_group_merge)) => { + GroupDelta::GroupMerge(*pb_group_merge) + } None => panic!("delta_type is not set"), } } diff --git a/src/storage/hummock_test/src/bin/replay/main.rs b/src/storage/hummock_test/src/bin/replay/main.rs index 9181e37c992e2..7760d7ce530c6 100644 --- a/src/storage/hummock_test/src/bin/replay/main.rs +++ b/src/storage/hummock_test/src/bin/replay/main.rs @@ -31,7 +31,7 @@ use clap::Parser; use foyer::HybridCacheBuilder; use replay_impl::{get_replay_notification_client, GlobalReplayImpl}; use risingwave_common::config::{ - extract_storage_memory_config, load_config, NoOverride, ObjectStoreConfig, StorageConfig, + extract_storage_memory_config, load_config, NoOverride, ObjectStoreConfig, }; use risingwave_common::system_param::reader::SystemParamsReader; use risingwave_hummock_trace::{ @@ -46,7 +46,6 @@ use risingwave_storage::filter_key_extractor::{ use risingwave_storage::hummock::{HummockStorage, SstableStore, 
SstableStoreConfig}; use risingwave_storage::monitor::{CompactorMetrics, HummockStateStoreMetrics, ObjectStoreMetrics}; use risingwave_storage::opts::StorageOpts; -use serde::{Deserialize, Serialize}; // use a large offset to avoid collision with real sstables const SST_OFFSET: u64 = 2147383647000; @@ -183,8 +182,3 @@ async fn create_replay_hummock(r: Record, args: &Args) -> Result, notification_client: impl NotificationClient, hummock_manager_ref: &HummockManagerRef, - table_id: TableId, + table_ids: &[u32], ) -> HummockStorage { let remote_dir = "hummock_001_test".to_string(); let options = Arc::new(StorageOpts { @@ -117,7 +117,7 @@ pub(crate) mod tests { register_tables_with_id_for_test( hummock.filter_key_extractor_manager(), hummock_manager_ref, - &[table_id.table_id()], + table_ids, ) .await; @@ -189,7 +189,6 @@ pub(crate) mod tests { local.seal_current_epoch(u64::MAX, SealCurrentEpochOptions::for_test()); } let res = storage.seal_and_sync_epoch(epoch).await.unwrap(); - hummock_meta_client.commit_epoch(epoch, res).await.unwrap(); } } @@ -236,7 +235,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - Default::default(), + &[0], ) .await; let rpc_filter_key_extractor_manager = match storage.filter_key_extractor_manager().clone() @@ -406,7 +405,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - Default::default(), + &[0], ) .await; @@ -604,7 +603,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; @@ -885,7 +884,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; @@ -1090,7 +1089,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; @@ -1290,7 +1289,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; let (compact_ctx, filter_key_extractor_manager) = @@ -1505,7 +1504,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; hummock_manager_ref.get_new_sst_ids(10).await.unwrap(); @@ -1680,7 +1679,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; hummock_manager_ref.get_new_sst_ids(10).await.unwrap(); @@ -1798,7 +1797,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; 
hummock_manager_ref.get_new_sst_ids(10).await.unwrap(); @@ -1980,4 +1979,504 @@ pub(crate) mod tests { count += 1; } } + + #[tokio::test] + async fn test_split_and_merge() { + let (env, hummock_manager_ref, _cluster_manager_ref, worker_node) = + setup_compute_env(8080).await; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager_ref.clone(), + worker_node.id, + )); + + let table_id_1 = TableId::from(1); + let table_id_2 = TableId::from(2); + + let storage = get_hummock_storage( + hummock_meta_client.clone(), + get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), + &hummock_manager_ref, + &[table_id_1.table_id(), table_id_2.table_id()], + ) + .await; + + // basic cg2 -> [1, 2] + let rpc_filter_key_extractor_manager = match storage.filter_key_extractor_manager().clone() + { + FilterKeyExtractorManager::RpcFilterKeyExtractorManager( + rpc_filter_key_extractor_manager, + ) => rpc_filter_key_extractor_manager, + FilterKeyExtractorManager::StaticFilterKeyExtractorManager(_) => unreachable!(), + }; + + let mut key = BytesMut::default(); + key.put_u16(1); + key.put_slice(b"key_prefix"); + let key_prefix = key.freeze(); + + rpc_filter_key_extractor_manager.update( + table_id_1.table_id(), + Arc::new(FilterKeyExtractorImpl::FixedLength( + FixedLengthFilterKeyExtractor::new(TABLE_PREFIX_LEN + key_prefix.len()), + )), + ); + rpc_filter_key_extractor_manager.update( + table_id_2.table_id(), + Arc::new(FilterKeyExtractorImpl::FixedLength( + FixedLengthFilterKeyExtractor::new(TABLE_PREFIX_LEN + key_prefix.len()), + )), + ); + + let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( + rpc_filter_key_extractor_manager, + ); + let compact_ctx = get_compactor_context(&storage); + let sstable_object_id_manager = Arc::new(SstableObjectIdManager::new( + hummock_meta_client.clone(), + storage + .storage_opts() + .clone() + .sstable_id_remote_fetch_number, + )); + + let base_epoch = Epoch::now(); + let mut epoch: u64 = base_epoch.0; + let millisec_interval_epoch: u64 = (1 << 16) * 100; + + let mut local_1 = storage + .new_local(NewLocalOptions::for_test(table_id_1)) + .await; + let mut local_2 = storage + .new_local(NewLocalOptions::for_test(table_id_2)) + .await; + + let val = Bytes::from(b"0"[..].to_vec()); + + async fn write_data( + storage: &HummockStorage, + local_1: (&mut LocalHummockStorage, bool), + local_2: (&mut LocalHummockStorage, bool), + epoch: &mut u64, + val: Bytes, + kv_count: u64, + millisec_interval_epoch: u64, + key_prefix: Bytes, + hummock_meta_client: Arc, + is_init: &mut bool, + ) { + let table_id_set = + HashSet::from_iter(vec![local_1.0.table_id(), local_2.0.table_id()].into_iter()); + + storage.start_epoch(*epoch, table_id_set.clone()); + for i in 0..kv_count { + if i == 0 && *is_init { + local_1.0.init_for_test(*epoch).await.unwrap(); + local_2.0.init_for_test(*epoch).await.unwrap(); + + *is_init = false; + } + let next_epoch = *epoch + millisec_interval_epoch; + storage.start_epoch(next_epoch, table_id_set.clone()); + + let ramdom_key = + [key_prefix.as_ref(), &rand::thread_rng().gen::<[u8; 32]>()].concat(); + + if local_1.1 { + local_1 + .0 + .insert(TableKey(Bytes::from(ramdom_key.clone())), val.clone(), None) + .unwrap(); + } + local_1.0.flush().await.unwrap(); + local_1 + .0 + .seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); + + if local_2.1 { + local_2 + .0 + .insert(TableKey(Bytes::from(ramdom_key.clone())), val.clone(), None) + .unwrap(); + } + 
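+                // Flush and seal `local_2` as well, then sync this epoch and
+                // commit it through the mock meta client before advancing to
+                // the next epoch.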
local_2.0.flush().await.unwrap(); + local_2 + .0 + .seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); + + let res = storage.seal_and_sync_epoch(*epoch).await.unwrap(); + hummock_meta_client.commit_epoch(*epoch, res).await.unwrap(); + *epoch += millisec_interval_epoch; + } + } + + let mut is_init = true; + write_data( + &storage, + (&mut local_1, true), + (&mut local_2, true), + &mut epoch, + val.clone(), + 1, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + epoch += millisec_interval_epoch; + + let parent_group_id = 2; + let split_table_ids = vec![table_id_2.table_id()]; + + async fn compact_once( + group_id: CompactionGroupId, + level: usize, + hummock_manager_ref: HummockManagerRef, + compact_ctx: CompactorContext, + filter_key_extractor_manager: FilterKeyExtractorManager, + sstable_object_id_manager: Arc, + ) { + // compact left group + let manual_compcation_option = ManualCompactionOption { + level, + ..Default::default() + }; + // 2. get compact task + let compact_task = hummock_manager_ref + .manual_get_compact_task(group_id, manual_compcation_option) + .await + .unwrap(); + + if compact_task.is_none() { + return; + } + + let mut compact_task = compact_task.unwrap(); + + let compaction_filter_flag = + CompactionFilterFlag::STATE_CLEAN | CompactionFilterFlag::TTL; + compact_task.compaction_filter_mask = compaction_filter_flag.bits(); + compact_task.current_epoch_time = hummock_manager_ref + .get_current_version() + .await + .max_committed_epoch(); + + // 3. compact + let (_tx, rx) = tokio::sync::oneshot::channel(); + let ((result_task, task_stats), _) = compact( + compact_ctx, + compact_task.clone(), + rx, + Box::new(sstable_object_id_manager.clone()), + filter_key_extractor_manager.clone(), + ) + .await; + + hummock_manager_ref + .report_compact_task( + result_task.task_id, + result_task.task_status, + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) + .await + .unwrap(); + } + + // compact + compact_once( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + let new_cg_id = hummock_manager_ref + .split_compaction_group(parent_group_id, &split_table_ids, 0) + .await + .unwrap(); + + assert_ne!(parent_group_id, new_cg_id); + assert!(hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .is_err()); + + write_data( + &storage, + (&mut local_1, true), + (&mut local_2, true), + &mut epoch, + val.clone(), + 100, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + epoch += millisec_interval_epoch; + + compact_once( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + compact_once( + new_cg_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + // try merge + hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .unwrap(); + + let new_cg_id = hummock_manager_ref + .split_compaction_group(parent_group_id, &split_table_ids, 0) + .await + .unwrap(); + + compact_once( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + 
.await; + + compact_once( + new_cg_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + // write left + write_data( + &storage, + (&mut local_1, true), + (&mut local_2, false), + &mut epoch, + val.clone(), + 16, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + + epoch += millisec_interval_epoch; + + // try merge + hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .unwrap(); + + // compact + compact_once( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + // try split + let new_cg_id = hummock_manager_ref + .split_compaction_group(parent_group_id, &split_table_ids, 0) + .await + .unwrap(); + + // write right + write_data( + &storage, + (&mut local_1, false), + (&mut local_2, true), + &mut epoch, + val.clone(), + 16, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + + epoch += millisec_interval_epoch; + + hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .unwrap(); + + // write left and right + + write_data( + &storage, + (&mut local_1, true), + (&mut local_2, true), + &mut epoch, + val.clone(), + 1, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + + epoch += millisec_interval_epoch; + + compact_once( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + compact_once( + new_cg_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + async fn compact_all( + group_id: CompactionGroupId, + level: usize, + hummock_manager_ref: HummockManagerRef, + compact_ctx: CompactorContext, + filter_key_extractor_manager: FilterKeyExtractorManager, + sstable_object_id_manager: Arc, + ) { + loop { + let manual_compcation_option = ManualCompactionOption { + level, + ..Default::default() + }; + let compact_task = hummock_manager_ref + .manual_get_compact_task(group_id, manual_compcation_option) + .await + .unwrap(); + + if compact_task.is_none() { + break; + } + + let mut compact_task = compact_task.unwrap(); + let compaction_filter_flag = + CompactionFilterFlag::STATE_CLEAN | CompactionFilterFlag::TTL; + compact_task.compaction_filter_mask = compaction_filter_flag.bits(); + compact_task.current_epoch_time = hummock_manager_ref + .get_current_version() + .await + .max_committed_epoch(); + + // 3. 
compact + let (_tx, rx) = tokio::sync::oneshot::channel(); + let ((result_task, task_stats), _) = compact( + compact_ctx.clone(), + compact_task.clone(), + rx, + Box::new(sstable_object_id_manager.clone()), + filter_key_extractor_manager.clone(), + ) + .await; + + hummock_manager_ref + .report_compact_task( + result_task.task_id, + result_task.task_status, + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) + .await + .unwrap(); + } + } + + // try split + let new_cg_id = hummock_manager_ref + .split_compaction_group(parent_group_id, &split_table_ids, 0) + .await + .unwrap(); + + // try merge + assert!(hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .is_err()); + + // write left and write + + write_data( + &storage, + (&mut local_1, true), + (&mut local_2, true), + &mut epoch, + val.clone(), + 200, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + + compact_all( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + compact_all( + new_cg_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + // try merge + hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .unwrap(); + } } diff --git a/src/storage/hummock_test/src/hummock_storage_tests.rs b/src/storage/hummock_test/src/hummock_storage_tests.rs index 7f3d35f16b80b..fc0fd6ae97b4f 100644 --- a/src/storage/hummock_test/src/hummock_storage_tests.rs +++ b/src/storage/hummock_test/src/hummock_storage_tests.rs @@ -31,6 +31,7 @@ use risingwave_hummock_sdk::key::{ gen_key_from_bytes, prefixed_range_with_vnode, FullKey, TableKey, UserKey, TABLE_PREFIX_LEN, }; use risingwave_hummock_sdk::sstable_info::SstableInfo; +use risingwave_hummock_sdk::table_stats::TableStats; use risingwave_hummock_sdk::table_watermark::{ TableWatermarksIndex, VnodeWatermark, WatermarkDirection, }; @@ -2510,8 +2511,20 @@ async fn test_commit_multi_epoch() { new_table_watermarks: Default::default(), sst_to_context: context_id_map(&[sst.object_id]), sstables: vec![LocalSstableInfo { + table_stats: sst + .table_ids + .iter() + .map(|&table_id| { + ( + table_id, + TableStats { + total_compressed_size: 10, + ..Default::default() + }, + ) + }) + .collect(), sst_info: sst, - table_stats: Default::default(), }], new_table_fragment_info, change_log_delta: Default::default(), diff --git a/src/storage/hummock_test/src/state_store_tests.rs b/src/storage/hummock_test/src/state_store_tests.rs index 35f3d08a9ed8a..67da2150735af 100644 --- a/src/storage/hummock_test/src/state_store_tests.rs +++ b/src/storage/hummock_test/src/state_store_tests.rs @@ -24,7 +24,6 @@ use futures::{pin_mut, StreamExt}; use itertools::Itertools; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{TableId, TableOption}; -use risingwave_common::hash::table_distribution::TableDistribution; use risingwave_common::hash::VirtualNode; use risingwave_common::util::epoch::{test_epoch, EpochExt, MAX_EPOCH}; use risingwave_hummock_sdk::key::{prefixed_range_with_vnode, TableKeyRange}; @@ -1565,7 +1564,7 @@ async fn test_iter_log() { }, table_option: Default::default(), is_replicated: false, - vnodes: TableDistribution::all_vnodes(), + vnodes: Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into(), }) .await; @@ -1580,7 +1579,7 @@ async 
fn test_iter_log() { }, table_option: Default::default(), is_replicated: false, - vnodes: TableDistribution::all_vnodes(), + vnodes: Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into(), }) .await; // flush for about 10 times per epoch diff --git a/src/storage/hummock_test/src/sync_point_tests.rs b/src/storage/hummock_test/src/sync_point_tests.rs index f5ee41783813d..008c667ccedf5 100644 --- a/src/storage/hummock_test/src/sync_point_tests.rs +++ b/src/storage/hummock_test/src/sync_point_tests.rs @@ -242,7 +242,7 @@ async fn test_syncpoints_get_in_delete_range_boundary() { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; let (compact_ctx, filter_key_extractor_manager) = diff --git a/src/storage/hummock_trace/src/opts.rs b/src/storage/hummock_trace/src/opts.rs index 5d480cca96b58..ff8b43c15c458 100644 --- a/src/storage/hummock_trace/src/opts.rs +++ b/src/storage/hummock_trace/src/opts.rs @@ -109,7 +109,7 @@ pub struct TracedReadOptions { pub retention_seconds: Option, pub table_id: TracedTableId, pub read_version_from_backup: bool, - pub read_version_from_time_travel: bool, + pub read_committed: bool, } impl TracedReadOptions { @@ -125,7 +125,7 @@ impl TracedReadOptions { retention_seconds: None, table_id: TracedTableId { table_id }, read_version_from_backup: false, - read_version_from_time_travel: false, + read_committed: false, } } } diff --git a/src/storage/src/hummock/event_handler/hummock_event_handler.rs b/src/storage/src/hummock/event_handler/hummock_event_handler.rs index f2aa2ea7fd88d..1c8abc78ddffc 100644 --- a/src/storage/src/hummock/event_handler/hummock_event_handler.rs +++ b/src/storage/src/hummock/event_handler/hummock_event_handler.rs @@ -50,6 +50,7 @@ use crate::hummock::event_handler::{ ReadOnlyRwLockRef, }; use crate::hummock::local_version::pinned_version::PinnedVersion; +use crate::hummock::local_version::recent_versions::RecentVersions; use crate::hummock::store::version::{ HummockReadVersion, StagingData, StagingSstableInfo, VersionUpdate, }; @@ -197,7 +198,7 @@ pub struct HummockEventHandler { local_read_version_mapping: HashMap, version_update_notifier_tx: Arc>, - pinned_version: Arc>, + recent_versions: Arc>, write_conflict_detector: Option>, uploader: HummockUploader, @@ -355,7 +356,10 @@ impl HummockEventHandler { hummock_event_rx, version_update_rx, version_update_notifier_tx, - pinned_version: Arc::new(ArcSwap::from_pointee(pinned_version)), + recent_versions: Arc::new(ArcSwap::from_pointee(RecentVersions::new( + pinned_version, + storage_opts.max_cached_recent_versions_number, + ))), write_conflict_detector, read_version_mapping, local_read_version_mapping: Default::default(), @@ -371,8 +375,8 @@ impl HummockEventHandler { self.version_update_notifier_tx.clone() } - pub fn pinned_version(&self) -> Arc> { - self.pinned_version.clone() + pub fn recent_versions(&self) -> Arc> { + self.recent_versions.clone() } pub fn read_version_mapping(&self) -> ReadOnlyReadVersionMapping { @@ -529,17 +533,18 @@ impl HummockEventHandler { .await .expect("should not be empty"); let prev_version_id = latest_version_ref.id(); - let new_version = Self::resolve_version_update_info( + if let Some(new_version) = Self::resolve_version_update_info( latest_version_ref.clone(), version_update, None, - ); - info!( - ?prev_version_id, - new_version_id = ?new_version.id(), - "recv new version" - ); - latest_version = Some(new_version); + 
) { + info!( + ?prev_version_id, + new_version_id = ?new_version.id(), + "recv new version" + ); + latest_version = Some(new_version); + } } self.apply_version_update( @@ -582,21 +587,21 @@ impl HummockEventHandler { .unwrap_or_else(|| self.uploader.hummock_version().clone()); let mut sst_delta_infos = vec![]; - let new_pinned_version = Self::resolve_version_update_info( + if let Some(new_pinned_version) = Self::resolve_version_update_info( pinned_version.clone(), version_payload, Some(&mut sst_delta_infos), - ); - - self.refiller - .start_cache_refill(sst_delta_infos, pinned_version, new_pinned_version); + ) { + self.refiller + .start_cache_refill(sst_delta_infos, pinned_version, new_pinned_version); + } } fn resolve_version_update_info( pinned_version: PinnedVersion, version_payload: HummockVersionUpdate, mut sst_delta_infos: Option<&mut Vec>, - ) -> PinnedVersion { + ) -> Option { let newly_pinned_version = match version_payload { HummockVersionUpdate::VersionDeltas(version_deltas) => { let mut version_to_apply = pinned_version.version().clone(); @@ -629,8 +634,9 @@ impl HummockEventHandler { .metrics .event_handler_on_apply_version_update .start_timer(); - self.pinned_version - .store(Arc::new(new_pinned_version.clone())); + self.recent_versions.rcu(|prev_recent_versions| { + prev_recent_versions.with_new_version(new_pinned_version.clone()) + }); { self.for_each_read_version( @@ -663,7 +669,10 @@ impl HummockEventHandler { // TODO: should we change the logic when supporting partial ckpt? if let Some(sstable_object_id_manager) = &self.sstable_object_id_manager { sstable_object_id_manager.remove_watermark_object_id(TrackerId::Epoch( - self.pinned_version.load().visible_table_committed_epoch(), + self.recent_versions + .load() + .latest_version() + .visible_table_committed_epoch(), )); } @@ -789,13 +798,13 @@ impl HummockEventHandler { is_replicated, vnodes, } => { - let pinned_version = self.pinned_version.load(); + let pinned_version = self.recent_versions.load().latest_version().clone(); let instance_id = self.generate_instance_id(); let basic_read_version = Arc::new(RwLock::new( HummockReadVersion::new_with_replication_option( table_id, instance_id, - (**pinned_version).clone(), + pinned_version, is_replicated, vnodes, ), @@ -992,7 +1001,7 @@ mod tests { ); let event_tx = event_handler.event_sender(); - let latest_version = event_handler.pinned_version.clone(); + let latest_version = event_handler.recent_versions.clone(); let latest_version_update_tx = event_handler.version_update_notifier_tx.clone(); let send_clear = |version_id| { @@ -1018,12 +1027,15 @@ mod tests { let (old_version, new_version, refill_finish_tx) = refill_task_rx.recv().await.unwrap(); assert_eq!(old_version.version(), initial_version.version()); assert_eq!(new_version.version(), &version1); - assert_eq!(latest_version.load().version(), initial_version.version()); + assert_eq!( + latest_version.load().latest_version().version(), + initial_version.version() + ); let mut changed = latest_version_update_tx.subscribe(); refill_finish_tx.send(()).unwrap(); changed.changed().await.unwrap(); - assert_eq!(latest_version.load().version(), &version1); + assert_eq!(latest_version.load().latest_version().version(), &version1); } // test recovery with pending refill task @@ -1050,11 +1062,11 @@ mod tests { refill_task_rx.recv().await.unwrap(); assert_eq!(old_version3.version(), &version2); assert_eq!(new_version3.version(), &version3); - assert_eq!(latest_version.load().version(), &version1); + 
assert_eq!(latest_version.load().latest_version().version(), &version1); let rx = send_clear(version3.id); rx.await.unwrap(); - assert_eq!(latest_version.load().version(), &version3); + assert_eq!(latest_version.load().latest_version().version(), &version3); } async fn assert_pending(fut: &mut (impl Future + Unpin)) { @@ -1081,7 +1093,7 @@ mod tests { ))) .unwrap(); rx.await.unwrap(); - assert_eq!(latest_version.load().version(), &version5); + assert_eq!(latest_version.load().latest_version().version(), &version5); } } diff --git a/src/storage/src/hummock/event_handler/uploader/mod.rs b/src/storage/src/hummock/event_handler/uploader/mod.rs index 4494049d93b0b..90e6a9306930a 100644 --- a/src/storage/src/hummock/event_handler/uploader/mod.rs +++ b/src/storage/src/hummock/event_handler/uploader/mod.rs @@ -1643,7 +1643,8 @@ pub(crate) mod tests { let new_pinned_version = uploader .context .pinned_version - .new_pin_version(test_hummock_version(epoch1)); + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); uploader.update_pinned_version(new_pinned_version); assert_eq!(epoch1, uploader.max_committed_epoch()); } @@ -1672,7 +1673,8 @@ pub(crate) mod tests { let new_pinned_version = uploader .context .pinned_version - .new_pin_version(test_hummock_version(epoch1)); + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); uploader.update_pinned_version(new_pinned_version); assert!(uploader.data().syncing_data.is_empty()); assert_eq!(epoch1, uploader.max_committed_epoch()); @@ -1706,7 +1708,8 @@ pub(crate) mod tests { let new_pinned_version = uploader .context .pinned_version - .new_pin_version(test_hummock_version(epoch1)); + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); uploader.update_pinned_version(new_pinned_version); assert!(uploader.data().syncing_data.is_empty()); assert_eq!(epoch1, uploader.max_committed_epoch()); @@ -1730,11 +1733,21 @@ pub(crate) mod tests { let epoch4 = epoch3.next_epoch(); let epoch5 = epoch4.next_epoch(); let epoch6 = epoch5.next_epoch(); - let version1 = initial_pinned_version.new_pin_version(test_hummock_version(epoch1)); - let version2 = initial_pinned_version.new_pin_version(test_hummock_version(epoch2)); - let version3 = initial_pinned_version.new_pin_version(test_hummock_version(epoch3)); - let version4 = initial_pinned_version.new_pin_version(test_hummock_version(epoch4)); - let version5 = initial_pinned_version.new_pin_version(test_hummock_version(epoch5)); + let version1 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); + let version2 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch2)) + .unwrap(); + let version3 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch3)) + .unwrap(); + let version4 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch4)) + .unwrap(); + let version5 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch5)) + .unwrap(); uploader.start_epochs_for_test([epoch6]); uploader.init_instance(TEST_LOCAL_INSTANCE_ID, TEST_TABLE_ID, epoch6); diff --git a/src/storage/src/hummock/hummock_meta_client.rs b/src/storage/src/hummock/hummock_meta_client.rs index 4445a74884d5a..d123558acc50b 100644 --- a/src/storage/src/hummock/hummock_meta_client.rs +++ b/src/storage/src/hummock/hummock_meta_client.rs @@ -130,7 +130,11 @@ impl HummockMetaClient for MonitoredHummockMetaClient { self.meta_client.subscribe_compaction_event().await } - async fn get_version_by_epoch(&self, epoch: HummockEpoch) -> Result { - 
self.meta_client.get_version_by_epoch(epoch).await + async fn get_version_by_epoch( + &self, + epoch: HummockEpoch, + table_id: u32, + ) -> Result { + self.meta_client.get_version_by_epoch(epoch, table_id).await } } diff --git a/src/storage/src/hummock/iterator/change_log.rs b/src/storage/src/hummock/iterator/change_log.rs index 6fc99f29a80f3..ae8061c37b07d 100644 --- a/src/storage/src/hummock/iterator/change_log.rs +++ b/src/storage/src/hummock/iterator/change_log.rs @@ -527,8 +527,9 @@ mod tests { use bytes::Bytes; use itertools::Itertools; + use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::TableId; - use risingwave_common::hash::table_distribution::TableDistribution; + use risingwave_common::hash::VirtualNode; use risingwave_common::util::epoch::test_epoch; use risingwave_hummock_sdk::key::{TableKey, UserKey}; use risingwave_hummock_sdk::EpochWithGap; @@ -699,7 +700,7 @@ mod tests { }, table_option: Default::default(), is_replicated: false, - vnodes: TableDistribution::all_vnodes(), + vnodes: Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into(), }) .await; let logs = gen_test_data(epoch_count, 10000, 0.05, 0.2); diff --git a/src/storage/src/hummock/local_version/mod.rs b/src/storage/src/hummock/local_version/mod.rs index 578e123c6574e..4a45c8dc9075c 100644 --- a/src/storage/src/hummock/local_version/mod.rs +++ b/src/storage/src/hummock/local_version/mod.rs @@ -13,3 +13,4 @@ // limitations under the License. pub mod pinned_version; +pub mod recent_versions; diff --git a/src/storage/src/hummock/local_version/pinned_version.rs b/src/storage/src/hummock/local_version/pinned_version.rs index 5ef53edcd26ef..afaafdf7cbe8a 100644 --- a/src/storage/src/hummock/local_version/pinned_version.rs +++ b/src/storage/src/hummock/local_version/pinned_version.rs @@ -92,22 +92,25 @@ impl PinnedVersion { } } - pub fn new_pin_version(&self, version: HummockVersion) -> Self { + pub fn new_pin_version(&self, version: HummockVersion) -> Option { assert!( version.id >= self.version.id, "pinning a older version {}. Current is {}", version.id, self.version.id ); + if version.id == self.version.id { + return None; + } let version_id = version.id; - PinnedVersion { + Some(PinnedVersion { version: Arc::new(version), guard: Arc::new(PinnedVersionGuard::new( version_id, self.guard.pinned_version_manager_tx.clone(), )), - } + }) } pub fn id(&self) -> HummockVersionId { diff --git a/src/storage/src/hummock/local_version/recent_versions.rs b/src/storage/src/hummock/local_version/recent_versions.rs new file mode 100644 index 0000000000000..8d3f1a015ad6a --- /dev/null +++ b/src/storage/src/hummock/local_version/recent_versions.rs @@ -0,0 +1,321 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
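The new `recent_versions` module introduced below keeps a bounded history of pinned versions so a committed read at a slightly older epoch can be answered locally before falling back to a time-travel fetch. A minimal standalone sketch of the lookup rule, using plain epoch integers in place of the real `PinnedVersion` and table-info types (all names here are illustrative only):

fn pick_version(committed_epochs: &[u64], read_epoch: u64) -> Option<usize> {
    // `committed_epochs[i]` is the committed epoch of the i-th cached version,
    // ordered oldest to newest. Find the first version whose committed epoch
    // exceeds `read_epoch` ...
    let idx = committed_epochs.partition_point(|&ce| ce <= read_epoch);
    // ... and take the newest version before it, if any. `None` means even the
    // oldest cached version is too new, so the caller must fetch elsewhere.
    idx.checked_sub(1)
}

fn main() {
    let committed = [10, 20, 30];
    assert_eq!(pick_version(&committed, 5), None);     // all cached versions too new
    assert_eq!(pick_version(&committed, 20), Some(1)); // exact committed epoch
    assert_eq!(pick_version(&committed, 25), Some(1)); // between two versions
    assert_eq!(pick_version(&committed, 99), Some(2)); // newer than everything cached
}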
+ +use std::cmp::Ordering; + +use risingwave_common::catalog::TableId; +use risingwave_hummock_sdk::HummockEpoch; + +use crate::hummock::local_version::pinned_version::PinnedVersion; + +pub struct RecentVersions { + latest_version: PinnedVersion, + is_latest_committed: bool, + recent_versions: Vec, // earlier version at the front + max_version_num: usize, +} + +impl RecentVersions { + pub fn new(version: PinnedVersion, max_version_num: usize) -> Self { + assert!(max_version_num > 0); + Self { + latest_version: version, + is_latest_committed: true, // The first version is always treated as committed epochs + recent_versions: Vec::new(), + max_version_num, + } + } + + fn has_table_committed(&self, new_version: &PinnedVersion) -> bool { + let mut has_table_committed = false; + for (table_id, info) in new_version.version().state_table_info.info() { + if let Some(prev_info) = self + .latest_version + .version() + .state_table_info + .info() + .get(table_id) + { + match info.committed_epoch.cmp(&prev_info.committed_epoch) { + Ordering::Less => { + unreachable!( + "table {} has regress committed epoch {}, prev committed epoch {}", + table_id, info.committed_epoch, prev_info.committed_epoch + ); + } + Ordering::Equal => {} + Ordering::Greater => { + has_table_committed = true; + } + } + } else { + has_table_committed = true; + } + } + has_table_committed + } + + #[must_use] + pub fn with_new_version(&self, version: PinnedVersion) -> Self { + assert!(version.version().id > self.latest_version.version().id); + let is_committed = self.has_table_committed(&version); + let recent_versions = if self.is_latest_committed { + let prev_recent_versions = if self.recent_versions.len() >= self.max_version_num { + assert_eq!(self.recent_versions.len(), self.max_version_num); + &self.recent_versions[1..] + } else { + &self.recent_versions[..] + }; + let mut recent_versions = Vec::with_capacity(prev_recent_versions.len() + 1); + recent_versions.extend(prev_recent_versions.iter().cloned()); + recent_versions.push(self.latest_version.clone()); + recent_versions + } else { + self.recent_versions.clone() + }; + Self { + latest_version: version, + is_latest_committed: is_committed, + recent_versions, + max_version_num: self.max_version_num, + } + } + + pub fn latest_version(&self) -> &PinnedVersion { + &self.latest_version + } + + /// Return the latest version that is safe to read `epoch` on `table_id`. 
+ /// + /// `safe to read` means that the `committed_epoch` of the `table_id` in the version won't be greater than the given `epoch` + pub fn get_safe_version( + &self, + table_id: TableId, + epoch: HummockEpoch, + ) -> Option { + if let Some(info) = self + .latest_version + .version() + .state_table_info + .info() + .get(&table_id) + { + if info.committed_epoch <= epoch { + Some(self.latest_version.clone()) + } else { + self.get_safe_version_from_recent(table_id, epoch) + } + } else { + None + } + } + + fn get_safe_version_from_recent( + &self, + table_id: TableId, + epoch: HummockEpoch, + ) -> Option { + if cfg!(debug_assertions) { + assert!( + epoch + < self + .latest_version + .version() + .state_table_info + .info() + .get(&table_id) + .expect("should exist") + .committed_epoch + ); + } + let result = self.recent_versions.binary_search_by(|version| { + let committed_epoch = version + .version() + .state_table_info + .info() + .get(&table_id) + .map(|info| info.committed_epoch); + if let Some(committed_epoch) = committed_epoch { + committed_epoch.cmp(&epoch) + } else { + // We have ensured that the table_id exists in the latest version, so if the table_id does not exist in a + // previous version, the table must have not created yet, and therefore has less ordering. + Ordering::Less + } + }); + match result { + Ok(index) => Some(self.recent_versions[index].clone()), + Err(index) => { + // `index` is index of the first version that has `committed_epoch` greater than `epoch` + // or `index` equals `recent_version.len()` when `epoch` is greater than all `committed_epoch` + let version = if index >= self.recent_versions.len() { + assert_eq!(index, self.recent_versions.len()); + self.recent_versions.last().cloned() + } else if index == 0 { + // The earliest version has a higher committed epoch + None + } else { + self.recent_versions.get(index - 1).cloned() + }; + version.and_then(|version| { + if version + .version() + .state_table_info + .info() + .contains_key(&table_id) + { + Some(version) + } else { + // if the table does not exist in the version, return `None` to try get a time travel version + None + } + }) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use risingwave_common::catalog::TableId; + use risingwave_hummock_sdk::version::HummockVersion; + use risingwave_pb::hummock::{PbHummockVersion, StateTableInfo}; + use tokio::sync::mpsc::unbounded_channel; + + use crate::hummock::local_version::pinned_version::PinnedVersion; + use crate::hummock::local_version::recent_versions::RecentVersions; + + const TEST_TABLE_ID1: TableId = TableId::new(233); + const TEST_TABLE_ID2: TableId = TableId::new(234); + + fn gen_pin_version( + version_id: u64, + table_committed_epoch: impl IntoIterator, + ) -> PinnedVersion { + PinnedVersion::new( + HummockVersion::from_rpc_protobuf(&PbHummockVersion { + id: version_id, + state_table_info: HashMap::from_iter(table_committed_epoch.into_iter().map( + |(table_id, committed_epoch)| { + ( + table_id.table_id, + StateTableInfo { + committed_epoch, + safe_epoch: 0, + compaction_group_id: 0, + }, + ) + }, + )), + ..Default::default() + }), + unbounded_channel().0, + ) + } + + fn assert_query_equal( + recent_version: &RecentVersions, + expected: &[(TableId, u64, Option<&PinnedVersion>)], + ) { + for (table_id, epoch, expected_version) in expected + .iter() + .cloned() + .chain([(TEST_TABLE_ID1, 0, None), (TEST_TABLE_ID2, 0, None)]) + { + let version = recent_version.get_safe_version(table_id, epoch); + assert_eq!( + 
version.as_ref().map(|version| version.id()), + expected_version.map(|version| version.id()) + ); + } + } + + #[test] + fn test_basic() { + let epoch1 = 233; + let epoch0 = epoch1 - 1; + let epoch2 = epoch1 + 1; + let epoch3 = epoch2 + 1; + let epoch4 = epoch3 + 1; + let version1 = gen_pin_version(1, [(TEST_TABLE_ID1, epoch1)]); + // with at most 2 historical versions + let recent_versions = RecentVersions::new(version1.clone(), 2); + assert!(recent_versions.recent_versions.is_empty()); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, Some(&version1)), + (TEST_TABLE_ID1, epoch2, Some(&version1)), + ], + ); + + let recent_versions = + recent_versions.with_new_version(gen_pin_version(2, [(TEST_TABLE_ID1, epoch1)])); + assert_eq!(recent_versions.recent_versions.len(), 1); + assert!(!recent_versions.is_latest_committed); + + let version3 = gen_pin_version(3, [(TEST_TABLE_ID1, epoch2)]); + let recent_versions = recent_versions.with_new_version(version3.clone()); + assert_eq!(recent_versions.recent_versions.len(), 1); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, Some(&version1)), + (TEST_TABLE_ID1, epoch2, Some(&version3)), + (TEST_TABLE_ID1, epoch3, Some(&version3)), + ], + ); + + let version4 = gen_pin_version(4, [(TEST_TABLE_ID2, epoch1), (TEST_TABLE_ID1, epoch2)]); + let recent_versions = recent_versions.with_new_version(version4.clone()); + assert_eq!(recent_versions.recent_versions.len(), 2); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, Some(&version1)), + (TEST_TABLE_ID1, epoch2, Some(&version4)), + (TEST_TABLE_ID1, epoch3, Some(&version4)), + (TEST_TABLE_ID2, epoch0, None), + (TEST_TABLE_ID2, epoch1, Some(&version4)), + (TEST_TABLE_ID2, epoch2, Some(&version4)), + ], + ); + + let version5 = gen_pin_version(5, [(TEST_TABLE_ID2, epoch1), (TEST_TABLE_ID1, epoch3)]); + let recent_versions = recent_versions.with_new_version(version5.clone()); + assert_eq!(recent_versions.recent_versions.len(), 2); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, None), + (TEST_TABLE_ID1, epoch2, Some(&version4)), + (TEST_TABLE_ID1, epoch3, Some(&version5)), + (TEST_TABLE_ID1, epoch4, Some(&version5)), + (TEST_TABLE_ID2, epoch0, None), + (TEST_TABLE_ID2, epoch1, Some(&version5)), + (TEST_TABLE_ID2, epoch2, Some(&version5)), + ], + ); + } +} diff --git a/src/storage/src/hummock/mod.rs b/src/storage/src/hummock/mod.rs index 14ac9532c8cb3..f10b6deee503e 100644 --- a/src/storage/src/hummock/mod.rs +++ b/src/storage/src/hummock/mod.rs @@ -172,8 +172,7 @@ pub fn get_from_batch( read_options: &ReadOptions, local_stats: &mut StoreLocalStatistic, ) -> Option<(HummockValue, EpochWithGap)> { - imm.get(table_key, read_epoch, read_options).map(|v| { + imm.get(table_key, read_epoch, read_options).inspect(|_| { local_stats.get_shared_buffer_hit_counts += 1; - v }) } diff --git a/src/storage/src/hummock/sstable/bloom.rs b/src/storage/src/hummock/sstable/bloom.rs index f2ca47ba00e12..b38a4c10ada30 100644 --- a/src/storage/src/hummock/sstable/bloom.rs +++ b/src/storage/src/hummock/sstable/bloom.rs @@ -102,7 +102,7 @@ impl BloomFilterReader { true } else { let nbits = self.data.bit_len(); - let delta 
= (h >> 17) | (h << 15); + let delta = h.rotate_left(15); for _ in 0..self.k { let bit_pos = h % (nbits as u32); if !self.data.get_bit(bit_pos as usize) { @@ -171,7 +171,7 @@ impl FilterBuilder for BloomFilterBuilder { filter.resize(nbytes, 0); for h in &self.key_hash_entries { let mut h = *h; - let delta = (h >> 17) | (h << 15); + let delta = h.rotate_left(15); for _ in 0..k { let bit_pos = (h as usize) % nbits; filter.set_bit(bit_pos, true); diff --git a/src/storage/src/hummock/sstable_store.rs b/src/storage/src/hummock/sstable_store.rs index b9f29c5740e4b..d1367b92a9ce8 100644 --- a/src/storage/src/hummock/sstable_store.rs +++ b/src/storage/src/hummock/sstable_store.rs @@ -26,7 +26,9 @@ use foyer::{ use futures::{future, StreamExt}; use itertools::Itertools; use risingwave_hummock_sdk::sstable_info::SstableInfo; -use risingwave_hummock_sdk::{HummockSstableObjectId, OBJECT_SUFFIX}; +use risingwave_hummock_sdk::{ + HummockSstableObjectId, HUMMOCK_SSTABLE_OBJECT_ID_MAX_DECIMAL_LENGTH, OBJECT_SUFFIX, +}; use risingwave_hummock_trace::TracedCachePolicy; use risingwave_object_store::object::{ ObjectError, ObjectMetadataIter, ObjectResult, ObjectStoreRef, ObjectStreamingUploader, @@ -519,10 +521,21 @@ impl SstableStore { let obj_prefix = self .store .get_object_prefix(object_id, self.use_new_object_prefix_strategy); - format!( - "{}/{}{}.{}", - self.path, obj_prefix, object_id, OBJECT_SUFFIX - ) + let mut path = String::with_capacity( + self.path.len() + + "/".len() + + obj_prefix.len() + + HUMMOCK_SSTABLE_OBJECT_ID_MAX_DECIMAL_LENGTH + + ".".len() + + OBJECT_SUFFIX.len(), + ); + path.push_str(&self.path); + path.push('/'); + path.push_str(&obj_prefix); + path.push_str(&object_id.to_string()); + path.push('.'); + path.push_str(OBJECT_SUFFIX); + path } pub fn get_object_id_from_path(path: &str) -> HummockSstableObjectId { diff --git a/src/storage/src/hummock/store/hummock_storage.rs b/src/storage/src/hummock/store/hummock_storage.rs index b64752fca7fd6..b4924a5dca60f 100644 --- a/src/storage/src/hummock/store/hummock_storage.rs +++ b/src/storage/src/hummock/store/hummock_storage.rs @@ -14,7 +14,7 @@ use std::collections::HashSet; use std::future::Future; -use std::ops::{Bound, Deref}; +use std::ops::Bound; use std::sync::Arc; use arc_swap::ArcSwap; @@ -50,9 +50,10 @@ use crate::hummock::event_handler::{ }; use crate::hummock::iterator::change_log::ChangeLogIterator; use crate::hummock::local_version::pinned_version::{start_pinned_version_worker, PinnedVersion}; +use crate::hummock::local_version::recent_versions::RecentVersions; use crate::hummock::observer_manager::HummockObserverNode; use crate::hummock::time_travel_version_cache::SimpleTimeTravelVersionCache; -use crate::hummock::utils::{validate_safe_epoch, wait_for_epoch}; +use crate::hummock::utils::wait_for_epoch; use crate::hummock::write_limiter::{WriteLimiter, WriteLimiterRef}; use crate::hummock::{ HummockEpoch, HummockError, HummockResult, HummockStorageIterator, HummockStorageRevIterator, @@ -97,7 +98,7 @@ pub struct HummockStorage { version_update_notifier_tx: Arc>, - pinned_version: Arc>, + recent_versions: Arc>, hummock_version_reader: HummockVersionReader, @@ -223,7 +224,7 @@ impl HummockStorage { version_update_notifier_tx: hummock_event_handler.version_update_notifier_tx(), hummock_event_sender: event_tx.clone(), _version_update_sender: version_update_tx, - pinned_version: hummock_event_handler.pinned_version(), + recent_versions: hummock_event_handler.recent_versions(), hummock_version_reader: HummockVersionReader::new( 
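The `bloom.rs` change a little above is behavior-preserving: for a `u32` hash `h`, the old `(h >> 17) | (h << 15)` is exactly a 15-bit left rotation, since `rotate_left(15)` computes `(h << 15) | (h >> (32 - 15))`. A quick standalone check:

fn main() {
    for h in [0u32, 1, 0x8000_0001, 0xdead_beef, u32::MAX] {
        // Manual shift-or form used before the change ...
        let manual = (h >> 17) | (h << 15);
        // ... equals the clearer intrinsic rotation used after it.
        assert_eq!(manual, h.rotate_left(15));
    }
}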
sstable_store, state_store_metrics.clone(), @@ -260,15 +261,9 @@ impl HummockStorage { ) -> StorageResult> { let key_range = (Bound::Included(key.clone()), Bound::Included(key.clone())); - let (key_range, read_version_tuple) = if read_options.read_version_from_time_travel { - self.build_read_version_by_time_travel(epoch, read_options.table_id, key_range) - .await? - } else if read_options.read_version_from_backup { - self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) - .await? - } else { - self.build_read_version_tuple(epoch, read_options.table_id, key_range)? - }; + let (key_range, read_version_tuple) = self + .build_read_version_tuple(epoch, key_range, &read_options) + .await?; if is_empty_key_range(&key_range) { return Ok(None); @@ -285,15 +280,9 @@ impl HummockStorage { epoch: u64, read_options: ReadOptions, ) -> StorageResult { - let (key_range, read_version_tuple) = if read_options.read_version_from_time_travel { - self.build_read_version_by_time_travel(epoch, read_options.table_id, key_range) - .await? - } else if read_options.read_version_from_backup { - self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) - .await? - } else { - self.build_read_version_tuple(epoch, read_options.table_id, key_range)? - }; + let (key_range, read_version_tuple) = self + .build_read_version_tuple(epoch, key_range, &read_options) + .await?; self.hummock_version_reader .iter(key_range, epoch, read_options, read_version_tuple) @@ -306,36 +295,28 @@ impl HummockStorage { epoch: u64, read_options: ReadOptions, ) -> StorageResult { - let (key_range, read_version_tuple) = if read_options.read_version_from_time_travel { - self.build_read_version_by_time_travel(epoch, read_options.table_id, key_range) - .await? - } else if read_options.read_version_from_backup { - self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) - .await? - } else { - self.build_read_version_tuple(epoch, read_options.table_id, key_range)? 
- }; + let (key_range, read_version_tuple) = self + .build_read_version_tuple(epoch, key_range, &read_options) + .await?; self.hummock_version_reader .rev_iter(key_range, epoch, read_options, read_version_tuple, None) .await } - async fn build_read_version_by_time_travel( + async fn get_time_travel_version( &self, epoch: u64, table_id: TableId, - key_range: TableKeyRange, - ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { + ) -> StorageResult { let fetch = async { let pb_version = self .hummock_meta_client - .get_version_by_epoch(epoch) + .get_version_by_epoch(epoch, table_id.table_id()) .await .inspect_err(|e| tracing::error!("{}", e.to_report_string())) .map_err(|e| HummockError::meta_error(e.to_report_string()))?; let version = HummockVersion::from_rpc_protobuf(&pb_version); - validate_safe_epoch(&version, table_id, epoch)?; let (tx, _rx) = unbounded_channel(); Ok(PinnedVersion::new(version, tx)) }; @@ -343,9 +324,24 @@ impl HummockStorage { .simple_time_travel_version_cache .get_or_insert(epoch, fetch) .await?; - Ok(get_committed_read_version_tuple( - version, table_id, key_range, epoch, - )) + Ok(version) + } + + async fn build_read_version_tuple( + &self, + epoch: u64, + key_range: TableKeyRange, + read_options: &ReadOptions, + ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { + if read_options.read_version_from_backup { + self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) + .await + } else if read_options.read_committed { + self.build_read_version_tuple_from_committed(epoch, read_options.table_id, key_range) + .await + } else { + self.build_read_version_tuple_from_all(epoch, read_options.table_id, key_range) + } } async fn build_read_version_tuple_from_backup( @@ -359,16 +355,12 @@ impl HummockStorage { .try_get_hummock_version(table_id, epoch) .await { - Ok(Some(backup_version)) => { - validate_safe_epoch(backup_version.version(), table_id, epoch)?; - - Ok(get_committed_read_version_tuple( - backup_version, - table_id, - key_range, - epoch, - )) - } + Ok(Some(backup_version)) => Ok(get_committed_read_version_tuple( + backup_version, + table_id, + key_range, + epoch, + )), Ok(None) => Err(HummockError::read_backup_error(format!( "backup include epoch {} not found", epoch @@ -378,27 +370,47 @@ impl HummockStorage { } } - fn build_read_version_tuple( + async fn build_read_version_tuple_from_committed( &self, epoch: u64, table_id: TableId, key_range: TableKeyRange, ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { - let pinned_version = self.pinned_version.load(); - validate_safe_epoch(pinned_version.version(), table_id, epoch)?; - let table_committed_epoch = pinned_version + let version = match self + .recent_versions + .load() + .get_safe_version(table_id, epoch) + { + Some(version) => version, + None => self.get_time_travel_version(epoch, table_id).await?, + }; + Ok(get_committed_read_version_tuple( + version, table_id, key_range, epoch, + )) + } + + fn build_read_version_tuple_from_all( + &self, + epoch: u64, + table_id: TableId, + key_range: TableKeyRange, + ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { + let pinned_version = self.recent_versions.load().latest_version().clone(); + let info = pinned_version .version() .state_table_info .info() - .get(&table_id) - .map(|info| info.committed_epoch); + .get(&table_id); // check epoch if lower mce - let ret = if let Some(table_committed_epoch) = table_committed_epoch - && epoch <= table_committed_epoch + let ret = if let Some(info) = info + && epoch <= info.committed_epoch { 
+ if epoch < info.safe_epoch { + return Err(HummockError::expired_epoch(table_id, info.safe_epoch, epoch).into()); + } // read committed_version directly without build snapshot - get_committed_read_version_tuple((**pinned_version).clone(), table_id, key_range, epoch) + get_committed_read_version_tuple(pinned_version, table_id, key_range, epoch) } else { let vnode = vnode(&key_range); let mut matched_replicated_read_version_cnt = 0; @@ -431,6 +443,7 @@ impl HummockStorage { // When the system has just started and no state has been created, the memory state // may be empty if read_version_vec.is_empty() { + let table_committed_epoch = info.map(|info| info.committed_epoch); if matched_replicated_read_version_cnt > 0 { tracing::warn!( "Read(table_id={} vnode={} epoch={}) is not allowed on replicated read version ({} found). Fall back to committed version (epoch={:?})", @@ -449,12 +462,7 @@ impl HummockStorage { table_committed_epoch ); } - get_committed_read_version_tuple( - (**pinned_version).clone(), - table_id, - key_range, - epoch, - ) + get_committed_read_version_tuple(pinned_version, table_id, key_range, epoch) } else { if read_version_vec.len() != 1 { let read_version_vnodes = read_version_vec @@ -538,7 +546,7 @@ impl HummockStorage { } pub fn get_pinned_version(&self) -> PinnedVersion { - self.pinned_version.load().deref().deref().clone() + self.recent_versions.load().latest_version().clone() } pub fn backup_reader(&self) -> BackupReaderRef { @@ -604,7 +612,7 @@ impl StateStoreRead for HummockStorage { key_range: TableKeyRange, options: ReadLogOptions, ) -> StorageResult { - let version = (**self.pinned_version.load()).clone(); + let version = self.recent_versions.load().latest_version().clone(); let iter = self .hummock_version_reader .iter_log(version, epoch_range, key_range, options) @@ -655,8 +663,9 @@ impl HummockStorage { epoch: u64, ) -> StorageResult { let table_ids = self - .pinned_version + .recent_versions .load() + .latest_version() .version() .state_table_info .info() @@ -675,7 +684,7 @@ impl HummockStorage { .send(HummockVersionUpdate::PinnedVersion(Box::new(version))) .unwrap(); loop { - if self.pinned_version.load().id() >= version_id { + if self.recent_versions.load().latest_version().id() >= version_id { break; } @@ -686,7 +695,7 @@ impl HummockStorage { pub async fn wait_version(&self, version: HummockVersion) { use tokio::task::yield_now; loop { - if self.pinned_version.load().id() >= version.id { + if self.recent_versions.load().latest_version().id() >= version.id { break; } @@ -736,7 +745,7 @@ impl HummockStorage { pub async fn wait_version_update(&self, old_id: HummockVersionId) -> HummockVersionId { use tokio::task::yield_now; loop { - let cur_id = self.pinned_version.load().id(); + let cur_id = self.recent_versions.load().latest_version().id(); if cur_id > old_id { return cur_id; } diff --git a/src/storage/src/hummock/utils.rs b/src/storage/src/hummock/utils.rs index 3f2d1f989f529..c2f6cbafed79b 100644 --- a/src/storage/src/hummock/utils.rs +++ b/src/storage/src/hummock/utils.rs @@ -30,11 +30,10 @@ use risingwave_hummock_sdk::key::{ bound_table_key_range, EmptySliceRef, FullKey, TableKey, UserKey, }; use risingwave_hummock_sdk::sstable_info::SstableInfo; -use risingwave_hummock_sdk::version::HummockVersion; use risingwave_hummock_sdk::{can_concat, HummockEpoch}; use tokio::sync::oneshot::{channel, Receiver, Sender}; -use super::{HummockError, HummockResult, SstableStoreRef}; +use super::{HummockError, SstableStoreRef}; use crate::error::StorageResult; use 
crate::hummock::CachePolicy; use crate::mem_table::{KeyOp, MemTableError}; @@ -72,24 +71,6 @@ where !too_left && !too_right } -pub fn validate_safe_epoch( - version: &HummockVersion, - table_id: TableId, - epoch: u64, -) -> HummockResult<()> { - if let Some(info) = version.state_table_info.info().get(&table_id) - && epoch < info.safe_epoch - { - return Err(HummockError::expired_epoch( - table_id, - info.safe_epoch, - epoch, - )); - } - - Ok(()) -} - pub fn filter_single_sst(info: &SstableInfo, table_id: TableId, table_key_range: &R) -> bool where R: RangeBounds>, diff --git a/src/storage/src/lib.rs b/src/storage/src/lib.rs index e11d3e1cee1ca..779062767c7ae 100644 --- a/src/storage/src/lib.rs +++ b/src/storage/src/lib.rs @@ -18,7 +18,6 @@ #![feature(extract_if)] #![feature(coroutines)] #![feature(hash_extract_if)] -#![feature(lint_reasons)] #![feature(proc_macro_hygiene)] #![feature(stmt_expr_attributes)] #![feature(strict_provenance)] diff --git a/src/storage/src/opts.rs b/src/storage/src/opts.rs index f6d6f31fb3a4f..a3a787f55c97d 100644 --- a/src/storage/src/opts.rs +++ b/src/storage/src/opts.rs @@ -63,6 +63,8 @@ pub struct StorageOpts { /// max memory usage for large query. pub prefetch_buffer_capacity_mb: usize, + pub max_cached_recent_versions_number: usize, + pub max_prefetch_block_number: usize, pub disable_remote_compactor: bool, @@ -170,6 +172,10 @@ impl From<(&RwConfig, &SystemParamsReader, &StorageMemoryConfig)> for StorageOpt meta_cache_shard_num: s.meta_cache_shard_num, meta_cache_eviction_config: s.meta_cache_eviction_config.clone(), prefetch_buffer_capacity_mb: s.prefetch_buffer_capacity_mb, + max_cached_recent_versions_number: c + .storage + .max_cached_recent_versions_number + .unwrap_or(60), max_prefetch_block_number: c.storage.max_prefetch_block_number, disable_remote_compactor: c.storage.disable_remote_compactor, share_buffer_upload_concurrency: c.storage.share_buffer_upload_concurrency, diff --git a/src/storage/src/store.rs b/src/storage/src/store.rs index 91f79231f6939..ab80f712570ca 100644 --- a/src/storage/src/store.rs +++ b/src/storage/src/store.rs @@ -502,7 +502,7 @@ pub struct ReadOptions { /// Read from historical hummock version of meta snapshot backup. /// It should only be used by `StorageTable` for batch query. 
pub read_version_from_backup: bool, - pub read_version_from_time_travel: bool, + pub read_committed: bool, } impl From for ReadOptions { @@ -515,7 +515,7 @@ impl From for ReadOptions { retention_seconds: value.retention_seconds, table_id: value.table_id.into(), read_version_from_backup: value.read_version_from_backup, - read_version_from_time_travel: value.read_version_from_time_travel, + read_committed: value.read_committed, } } } @@ -530,7 +530,7 @@ impl From for TracedReadOptions { retention_seconds: value.retention_seconds, table_id: value.table_id.into(), read_version_from_backup: value.read_version_from_backup, - read_version_from_time_travel: value.read_version_from_time_travel, + read_committed: value.read_committed, } } } diff --git a/src/storage/src/table/batch_table/storage_table.rs b/src/storage/src/table/batch_table/storage_table.rs index 7a0ad76cce4a5..8c5f432f46c57 100644 --- a/src/storage/src/table/batch_table/storage_table.rs +++ b/src/storage/src/table/batch_table/storage_table.rs @@ -361,7 +361,10 @@ impl StorageTableInner { ) -> StorageResult> { let epoch = wait_epoch.get_epoch(); let read_backup = matches!(wait_epoch, HummockReadEpoch::Backup(_)); - let read_time_travel = matches!(wait_epoch, HummockReadEpoch::TimeTravel(_)); + let read_committed = matches!( + wait_epoch, + HummockReadEpoch::TimeTravel(_) | HummockReadEpoch::Committed(_) + ); self.store.try_wait_epoch(wait_epoch).await?; let serialized_pk = serialize_pk_with_vnode( &pk, @@ -382,7 +385,7 @@ impl StorageTableInner { retention_seconds: self.table_option.retention_seconds, table_id: self.table_id, read_version_from_backup: read_backup, - read_version_from_time_travel: read_time_travel, + read_committed, cache_policy: CachePolicy::Fill(CacheContext::Default), ..Default::default() }; @@ -487,14 +490,17 @@ impl StorageTableInner { let iterators: Vec<_> = try_join_all(table_key_ranges.map(|table_key_range| { let prefix_hint = prefix_hint.clone(); let read_backup = matches!(wait_epoch, HummockReadEpoch::Backup(_)); - let read_time_travel = matches!(wait_epoch, HummockReadEpoch::TimeTravel(_)); + let read_committed = matches!( + wait_epoch, + HummockReadEpoch::TimeTravel(_) | HummockReadEpoch::Committed(_) + ); async move { let read_options = ReadOptions { prefix_hint, retention_seconds: self.table_option.retention_seconds, table_id: self.table_id, read_version_from_backup: read_backup, - read_version_from_time_travel: read_time_travel, + read_committed, prefetch_options, cache_policy, ..Default::default() diff --git a/src/stream/src/common/log_store_impl/kv_log_store/reader.rs b/src/stream/src/common/log_store_impl/kv_log_store/reader.rs index 5497b989a0873..c84db97002b02 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/reader.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/reader.rs @@ -16,7 +16,7 @@ use std::future::Future; use std::ops::Bound; use std::ops::Bound::{Excluded, Included, Unbounded}; use std::pin::Pin; -use std::time::{Duration, Instant}; +use std::time::Duration; use anyhow::anyhow; use await_tree::InstrumentAwait; @@ -53,18 +53,28 @@ use crate::common::log_store_impl::kv_log_store::serde::{ }; use crate::common::log_store_impl::kv_log_store::KvLogStoreMetrics; -type RewindBackoffPolicy = impl Iterator; pub(crate) const REWIND_BASE_DELAY: Duration = Duration::from_secs(1); pub(crate) const REWIND_BACKOFF_FACTOR: u64 = 2; pub(crate) const REWIND_MAX_DELAY: Duration = Duration::from_secs(180); -fn initial_rewind_backoff_policy() -> RewindBackoffPolicy { - 
tokio_retry::strategy::ExponentialBackoff::from_millis(REWIND_BASE_DELAY.as_millis() as _) - .factor(REWIND_BACKOFF_FACTOR) - .max_delay(REWIND_MAX_DELAY) - .map(tokio_retry::strategy::jitter) +mod rewind_backoff_policy { + use std::time::Duration; + + use crate::common::log_store_impl::kv_log_store::{ + REWIND_BACKOFF_FACTOR, REWIND_BASE_DELAY, REWIND_MAX_DELAY, + }; + + pub(super) type RewindBackoffPolicy = impl Iterator; + pub(super) fn initial_rewind_backoff_policy() -> RewindBackoffPolicy { + tokio_retry::strategy::ExponentialBackoff::from_millis(REWIND_BASE_DELAY.as_millis() as _) + .factor(REWIND_BACKOFF_FACTOR) + .max_delay(REWIND_MAX_DELAY) + .map(tokio_retry::strategy::jitter) + } } +use rewind_backoff_policy::*; + struct RewindDelay { last_rewind_truncate_offset: Option, backoff_policy: RewindBackoffPolicy, @@ -218,58 +228,71 @@ impl bool> AutoRebuildStateStoreReadIter } } -type TimeoutAutoRebuildIter = - AutoRebuildStateStoreReadIter bool + Send>; +mod timeout_auto_rebuild { + use std::time::{Duration, Instant}; -async fn iter_with_timeout_rebuild( - state_store: S, - range: TableKeyRange, - epoch: HummockEpoch, - options: ReadOptions, - timeout: Duration, -) -> StorageResult> { - const CHECK_TIMEOUT_PERIOD: usize = 100; - // use a struct here to avoid accidental copy instead of move on primitive usize - struct Count(usize); - let mut check_count = Count(0); - let mut total_count = Count(0); - let mut curr_iter_item_count = Count(0); - let mut start_time = Instant::now(); - let initial_start_time = start_time; - AutoRebuildStateStoreReadIter::new( - state_store, - move || { - check_count.0 += 1; - curr_iter_item_count.0 += 1; - total_count.0 += 1; - if check_count.0 == CHECK_TIMEOUT_PERIOD { - check_count.0 = 0; - if start_time.elapsed() > timeout { - let prev_iter_item_count = curr_iter_item_count.0; - curr_iter_item_count.0 = 0; - start_time = Instant::now(); - info!( - table_id = options.table_id.table_id, - iter_exist_time_secs = initial_start_time.elapsed().as_secs(), - prev_iter_item_count, - total_iter_item_count = total_count.0, - "kv log store iter is rebuilt" - ); - true + use risingwave_hummock_sdk::key::TableKeyRange; + use risingwave_hummock_sdk::HummockEpoch; + use risingwave_storage::error::StorageResult; + use risingwave_storage::store::{ReadOptions, StateStoreRead}; + + use crate::common::log_store_impl::kv_log_store::reader::AutoRebuildStateStoreReadIter; + + pub(super) type TimeoutAutoRebuildIter = + AutoRebuildStateStoreReadIter bool + Send>; + + pub(super) async fn iter_with_timeout_rebuild( + state_store: S, + range: TableKeyRange, + epoch: HummockEpoch, + options: ReadOptions, + timeout: Duration, + ) -> StorageResult> { + const CHECK_TIMEOUT_PERIOD: usize = 100; + // use a struct here to avoid accidental copy instead of move on primitive usize + struct Count(usize); + let mut check_count = Count(0); + let mut total_count = Count(0); + let mut curr_iter_item_count = Count(0); + let mut start_time = Instant::now(); + let initial_start_time = start_time; + AutoRebuildStateStoreReadIter::new( + state_store, + move || { + check_count.0 += 1; + curr_iter_item_count.0 += 1; + total_count.0 += 1; + if check_count.0 == CHECK_TIMEOUT_PERIOD { + check_count.0 = 0; + if start_time.elapsed() > timeout { + let prev_iter_item_count = curr_iter_item_count.0; + curr_iter_item_count.0 = 0; + start_time = Instant::now(); + info!( + table_id = options.table_id.table_id, + iter_exist_time_secs = initial_start_time.elapsed().as_secs(), + prev_iter_item_count, + 
total_iter_item_count = total_count.0, + "kv log store iter is rebuilt" + ); + true + } else { + false + } } else { false } - } else { - false - } - }, - range, - epoch, - options, - ) - .await + }, + range, + epoch, + options, + ) + .await + } } +use timeout_auto_rebuild::*; + impl bool + Send> StateStoreIter for AutoRebuildStateStoreReadIter { diff --git a/src/stream/src/common/log_store_impl/kv_log_store/serde.rs b/src/stream/src/common/log_store_impl/kv_log_store/serde.rs index 92a3caf4cd2e3..17ab103d758b4 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/serde.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/serde.rs @@ -25,7 +25,7 @@ use itertools::Itertools; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::ColumnDesc; -use risingwave_common::hash::VirtualNode; +use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_common::row::{OwnedRow, Row, RowExt}; use risingwave_common::types::{DataType, ScalarImpl}; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; @@ -42,7 +42,7 @@ use risingwave_storage::error::StorageResult; use risingwave_storage::row_serde::row_serde_util::{serialize_pk, serialize_pk_with_vnode}; use risingwave_storage::row_serde::value_serde::ValueRowSerdeNew; use risingwave_storage::store::{StateStoreIterExt, StateStoreReadIter}; -use risingwave_storage::table::{compute_vnode, TableDistribution, SINGLETON_VNODE}; +use risingwave_storage::table::{compute_vnode, SINGLETON_VNODE}; use rw_futures_util::select_all; use crate::common::log_store_impl::kv_log_store::{ @@ -201,8 +201,7 @@ impl LogStoreRowSerde { let vnodes = match vnodes { Some(vnodes) => vnodes, - - None => TableDistribution::singleton_vnode_bitmap(), + None => Bitmap::singleton().into(), }; // epoch and seq_id. 
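The `reader.rs` refactor above moves each `impl Trait` type alias together with its single defining function into a small private module, presumably so the alias's defining scope stays narrow under the stricter opaque-type rules of the bumped nightly toolchain. A nightly-only sketch of the same pattern, assuming the crate's `type_alias_impl_trait` feature and with made-up backoff values:

#![feature(type_alias_impl_trait)]

mod backoff {
    use std::time::Duration;

    // The opaque alias and the one function allowed to define it live together,
    // so nothing else in the enclosing file is treated as a defining use.
    pub(super) type Backoff = impl Iterator<Item = Duration>;

    pub(super) fn initial_backoff() -> Backoff {
        [1u64, 2, 4, 8].into_iter().map(Duration::from_millis)
    }
}
use backoff::initial_backoff;

fn main() {
    let first: Vec<_> = initial_backoff().take(2).collect();
    assert_eq!(
        first,
        vec![
            std::time::Duration::from_millis(1),
            std::time::Duration::from_millis(2),
        ]
    );
}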
The seq_id of barrier is set null, and therefore the second order type @@ -216,7 +215,7 @@ impl LogStoreRowSerde { ); let dist_key_indices = if dist_key_indices.is_empty() { - if &vnodes != TableDistribution::singleton_vnode_bitmap_ref() { + if !vnodes.is_singleton() { warn!( ?vnodes, "singleton log store gets non-singleton vnode bitmap" @@ -946,7 +945,7 @@ mod tests { use risingwave_storage::store::{ FromStreamStateStoreIter, StateStoreIterItem, StateStoreReadIter, }; - use risingwave_storage::table::DEFAULT_VNODE; + use risingwave_storage::table::SINGLETON_VNODE; use tokio::sync::oneshot; use tokio::sync::oneshot::Sender; @@ -1024,7 +1023,7 @@ mod tests { seq_id += 1; } - let (key, encoded_barrier) = serde.serialize_barrier(epoch, DEFAULT_VNODE, false); + let (key, encoded_barrier) = serde.serialize_barrier(epoch, SINGLETON_VNODE, false); let key = remove_vnode_prefix(&key.0); match serde.deserialize(&encoded_barrier).unwrap() { (decoded_epoch, LogStoreRowOp::Barrier { is_checkpoint }) => { @@ -1062,7 +1061,8 @@ mod tests { seq_id += 1; } - let (key, encoded_checkpoint_barrier) = serde.serialize_barrier(epoch, DEFAULT_VNODE, true); + let (key, encoded_checkpoint_barrier) = + serde.serialize_barrier(epoch, SINGLETON_VNODE, true); let key = remove_vnode_prefix(&key.0); match serde.deserialize(&encoded_checkpoint_barrier).unwrap() { (decoded_epoch, LogStoreRowOp::Barrier { is_checkpoint }) => { @@ -1200,7 +1200,7 @@ mod tests { ) { let (ops, rows) = gen_test_data(base); let first_barrier = { - let (key, value) = serde.serialize_barrier(EPOCH0, DEFAULT_VNODE, true); + let (key, value) = serde.serialize_barrier(EPOCH0, SINGLETON_VNODE, true); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH0), value)) }; let stream = stream::once(async move { first_barrier }); @@ -1210,7 +1210,7 @@ mod tests { let stream = stream.chain(stream::once({ let serde = serde.clone(); async move { - let (key, value) = serde.serialize_barrier(EPOCH1, DEFAULT_VNODE, false); + let (key, value) = serde.serialize_barrier(EPOCH1, SINGLETON_VNODE, false); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH1), value)) } })); @@ -1218,7 +1218,7 @@ mod tests { gen_row_stream(serde.clone(), ops.clone(), rows.clone(), EPOCH2, seq_id); let stream = stream.chain(row_stream).chain(stream::once({ async move { - let (key, value) = serde.serialize_barrier(EPOCH2, DEFAULT_VNODE, true); + let (key, value) = serde.serialize_barrier(EPOCH2, SINGLETON_VNODE, true); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH2), value)) } })); diff --git a/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs b/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs index 5fc10cd0cc58a..3114c22e63323 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs @@ -143,7 +143,7 @@ pub(crate) fn gen_multi_vnode_stream_chunks( .collect_vec(); let (ops, rows) = gen_sized_test_data(base, max_count); for (op, row) in zip_eq(ops, rows) { - let vnode = VirtualNode::compute_row(&row, &[TEST_SCHEMA_DIST_KEY_INDEX]); + let vnode = VirtualNode::compute_row_for_test(&row, &[TEST_SCHEMA_DIST_KEY_INDEX]); let (ops, builder) = &mut data_builder[vnode.to_index() % MOD_COUNT]; ops.push(op); assert!(builder.append_one_row(row).is_none()); @@ -177,9 +177,9 @@ pub(crate) fn gen_test_log_store_table(pk_info: &'static KvLogStorePkInfo) -> Pb pub(crate) fn calculate_vnode_bitmap<'a>( test_data: impl Iterator)>, ) -> Bitmap { - let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT); - for 
vnode in - test_data.map(|(_, row)| VirtualNode::compute_row(row, &[TEST_SCHEMA_DIST_KEY_INDEX])) + let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT_FOR_TEST); + for vnode in test_data + .map(|(_, row)| VirtualNode::compute_row_for_test(row, &[TEST_SCHEMA_DIST_KEY_INDEX])) { builder.set(vnode.to_index(), true); } diff --git a/src/stream/src/common/table/test_state_table.rs b/src/stream/src/common/table/test_state_table.rs index 098548c21ac93..dde0d8a581406 100644 --- a/src/stream/src/common/table/test_state_table.rs +++ b/src/stream/src/common/table/test_state_table.rs @@ -27,7 +27,7 @@ use risingwave_common::util::value_encoding::BasicSerde; use risingwave_hummock_test::test_utils::prepare_hummock_test_env; use risingwave_storage::hummock::HummockStorage; use risingwave_storage::store::PrefetchOptions; -use risingwave_storage::table::DEFAULT_VNODE; +use risingwave_storage::table::SINGLETON_VNODE; use crate::common::table::state_table::{ ReplicatedStateTable, StateTable, WatermarkCacheStateTable, @@ -445,7 +445,7 @@ async fn test_state_table_iter_with_pk_range() { std::ops::Bound::Included(OwnedRow::new(vec![Some(4_i32.into())])), ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &pk_range, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &pk_range, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -470,7 +470,7 @@ async fn test_state_table_iter_with_pk_range() { std::ops::Bound::::Unbounded, ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &pk_range, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &pk_range, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -1976,11 +1976,11 @@ async fn test_replicated_state_table_replication() { std::ops::Bound::Included(OwnedRow::new(vec![Some(2_i32.into())])), ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); let replicated_iter = replicated_state_table - .iter_with_vnode_and_output_indices(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -2039,7 +2039,7 @@ async fn test_replicated_state_table_replication() { ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); @@ -2048,7 +2048,7 @@ async fn test_replicated_state_table_replication() { std::ops::Bound::Unbounded, ); let replicated_iter = replicated_state_table - .iter_with_vnode_and_output_indices(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -2079,7 +2079,7 @@ async fn test_replicated_state_table_replication() { let range_bounds: (Bound, Bound) = (std::ops::Bound::Unbounded, std::ops::Bound::Unbounded); let replicated_iter = replicated_state_table - .iter_with_vnode_and_output_indices(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); pin_mut!(replicated_iter); diff --git a/src/stream/src/executor/backfill/arrangement_backfill.rs b/src/stream/src/executor/backfill/arrangement_backfill.rs index e3979496731b5..540ffe1a020fc 100644 --- a/src/stream/src/executor/backfill/arrangement_backfill.rs +++ 
b/src/stream/src/executor/backfill/arrangement_backfill.rs @@ -34,7 +34,7 @@ use crate::executor::backfill::utils::{ update_pos_by_vnode, BackfillProgressPerVnode, BackfillRateLimiter, BackfillState, }; use crate::executor::prelude::*; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; type Builders = HashMap; @@ -56,7 +56,7 @@ pub struct ArrangementBackfillExecutor { /// The column indices need to be forwarded to the downstream from the upstream and table scan. output_indices: Vec, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, actor_id: ActorId, @@ -79,7 +79,7 @@ where upstream: Executor, state_table: StateTable, output_indices: Vec, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, metrics: Arc, chunk_size: usize, rate_limit: Option, diff --git a/src/stream/src/executor/backfill/cdc/cdc_backfill.rs b/src/stream/src/executor/backfill/cdc/cdc_backfill.rs index bfffa066fc265..066dc86ba551c 100644 --- a/src/stream/src/executor/backfill/cdc/cdc_backfill.rs +++ b/src/stream/src/executor/backfill/cdc/cdc_backfill.rs @@ -43,7 +43,7 @@ use crate::executor::backfill::CdcScanOptions; use crate::executor::monitor::CdcBackfillMetrics; use crate::executor::prelude::*; use crate::executor::UpdateMutation; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; /// `split_id`, `is_finished`, `row_count`, `cdc_offset` all occupy 1 column each. const METADATA_STATE_LEN: usize = 4; @@ -68,7 +68,7 @@ pub struct CdcBackfillExecutor { // TODO: introduce a CdcBackfillProgress to report finish to Meta // This object is just a stub right now - progress: Option, + progress: Option, metrics: CdcBackfillMetrics, @@ -86,7 +86,7 @@ impl CdcBackfillExecutor { upstream: Executor, output_indices: Vec, output_columns: Vec, - progress: Option, + progress: Option, metrics: Arc, state_table: StateTable, rate_limit_rps: Option, diff --git a/src/stream/src/executor/backfill/no_shuffle_backfill.rs b/src/stream/src/executor/backfill/no_shuffle_backfill.rs index 761aedfa55ee3..d8de07375d721 100644 --- a/src/stream/src/executor/backfill/no_shuffle_backfill.rs +++ b/src/stream/src/executor/backfill/no_shuffle_backfill.rs @@ -30,7 +30,7 @@ use crate::executor::backfill::utils::{ METADATA_STATE_LEN, }; use crate::executor::prelude::*; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; /// Schema: | vnode | pk ... | `backfill_finished` | `row_count` | /// We can decode that into `BackfillState` on recovery. @@ -76,7 +76,7 @@ pub struct BackfillExecutor { output_indices: Vec, /// PTAL at the docstring for `CreateMviewProgress` to understand how we compute it. 
- progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, actor_id: ActorId, @@ -100,7 +100,7 @@ where upstream: Executor, state_table: Option>, output_indices: Vec, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, metrics: Arc, chunk_size: usize, rate_limit: Option, diff --git a/src/stream/src/executor/backfill/snapshot_backfill.rs b/src/stream/src/executor/backfill/snapshot_backfill.rs index ac625f53a02dd..593a13df9cbcd 100644 --- a/src/stream/src/executor/backfill/snapshot_backfill.rs +++ b/src/stream/src/executor/backfill/snapshot_backfill.rs @@ -43,7 +43,7 @@ use crate::executor::{ DispatcherBarrier, DispatcherMessage, Execute, Executor, Message, Mutation, StreamExecutorError, StreamExecutorResult, }; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; pub struct SnapshotBackfillExecutor { /// Upstream table @@ -55,7 +55,7 @@ pub struct SnapshotBackfillExecutor { /// The column indices need to be forwarded to the downstream from the upstream and table scan. output_indices: Vec, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, chunk_size: usize, rate_limit: Option, @@ -73,7 +73,7 @@ impl SnapshotBackfillExecutor { upstream: Executor, output_indices: Vec, actor_ctx: ActorContextRef, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, chunk_size: usize, rate_limit: Option, barrier_rx: UnboundedReceiver, @@ -617,7 +617,7 @@ async fn make_consume_snapshot_stream<'a, S: StateStore>( rate_limit: Option, barrier_rx: &'a mut UnboundedReceiver, output_indices: &'a [usize], - mut progress: CreateMviewProgress, + mut progress: CreateMviewProgressReporter, first_recv_barrier: Barrier, ) { let mut barrier_epoch = first_recv_barrier.epoch; diff --git a/src/stream/src/executor/chain.rs b/src/stream/src/executor/chain.rs index 6f198ff2b7e12..ca06319e11bfb 100644 --- a/src/stream/src/executor/chain.rs +++ b/src/stream/src/executor/chain.rs @@ -13,7 +13,7 @@ // limitations under the License. use crate::executor::prelude::*; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; /// [`ChainExecutor`] is an executor that enables synchronization between the existing stream and /// newly appended executors. 
Currently, [`ChainExecutor`] is mainly used to implement MV on MV @@ -24,7 +24,7 @@ pub struct ChainExecutor { upstream: Executor, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, actor_id: ActorId, @@ -36,7 +36,7 @@ impl ChainExecutor { pub fn new( snapshot: Executor, upstream: Executor, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, upstream_only: bool, ) -> Self { Self { @@ -115,12 +115,12 @@ mod test { use super::ChainExecutor; use crate::executor::test_utils::MockSource; use crate::executor::{AddMutation, Barrier, Execute, Message, Mutation, PkIndices}; - use crate::task::{CreateMviewProgress, LocalBarrierManager}; + use crate::task::{CreateMviewProgressReporter, LocalBarrierManager}; #[tokio::test] async fn test_basic() { let barrier_manager = LocalBarrierManager::for_test(); - let progress = CreateMviewProgress::for_test(barrier_manager); + let progress = CreateMviewProgressReporter::for_test(barrier_manager); let actor_id = progress.actor_id(); let schema = Schema::new(vec![Field::unnamed(DataType::Int64)]); diff --git a/src/stream/src/executor/dispatch.rs b/src/stream/src/executor/dispatch.rs index 82d11db49513b..bb1db4662b0d7 100644 --- a/src/stream/src/executor/dispatch.rs +++ b/src/stream/src/executor/dispatch.rs @@ -755,7 +755,8 @@ impl Dispatcher for HashDataDispatcher { let num_outputs = self.outputs.len(); // get hash value of every line by its key - let vnodes = VirtualNode::compute_chunk(chunk.data_chunk(), &self.keys); + let vnode_count = self.hash_mapping.len(); + let vnodes = VirtualNode::compute_chunk(chunk.data_chunk(), &self.keys, vnode_count); tracing::debug!(target: "events::stream::dispatch::hash", "\n{}\n keys {:?} => {:?}", chunk.to_pretty(), self.keys, vnodes); @@ -1102,8 +1103,8 @@ mod tests { } async fn test_hash_dispatcher_complex_inner() { - // This test only works when VirtualNode::COUNT is 256. - static_assertions::const_assert_eq!(VirtualNode::COUNT, 256); + // This test only works when vnode count is 256. + assert_eq!(VirtualNode::COUNT_FOR_TEST, 256); let num_outputs = 2; // actor id ranges from 1 to 2 let key_indices = &[0, 2]; @@ -1118,9 +1119,9 @@ mod tests { }) .collect::>(); let mut hash_mapping = (1..num_outputs + 1) - .flat_map(|id| vec![id as ActorId; VirtualNode::COUNT / num_outputs]) + .flat_map(|id| vec![id as ActorId; VirtualNode::COUNT_FOR_TEST / num_outputs]) .collect_vec(); - hash_mapping.resize(VirtualNode::COUNT, num_outputs as u32); + hash_mapping.resize(VirtualNode::COUNT_FOR_TEST, num_outputs as u32); let mut hash_dispatcher = HashDataDispatcher::new( outputs, key_indices.to_vec(), @@ -1225,6 +1226,32 @@ mod tests { ) .unwrap(); + let dispatcher_updates = maplit::hashmap! 
{ + actor_id => vec![PbDispatcherUpdate { + actor_id, + dispatcher_id: broadcast_dispatcher_id, + added_downstream_actor_id: vec![new], + removed_downstream_actor_id: vec![old], + hash_mapping: Default::default(), + }] + }; + let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( + UpdateMutation { + dispatchers: dispatcher_updates, + merges: Default::default(), + vnode_bitmaps: Default::default(), + dropped_actors: Default::default(), + actor_splits: Default::default(), + actor_new_dispatchers: Default::default(), + }, + )); + barrier_test_env.inject_barrier(&b1, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let executor = Box::new(DispatchExecutor::new( input, vec![broadcast_dispatcher, simple_dispatcher], @@ -1253,27 +1280,6 @@ mod tests { .await .unwrap(); - // 4. Send a configuration change barrier for broadcast dispatcher. - let dispatcher_updates = maplit::hashmap! { - actor_id => vec![PbDispatcherUpdate { - actor_id, - dispatcher_id: broadcast_dispatcher_id, - added_downstream_actor_id: vec![new], - removed_downstream_actor_id: vec![old], - hash_mapping: Default::default(), - }] - }; - let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( - UpdateMutation { - dispatchers: dispatcher_updates, - merges: Default::default(), - vnode_bitmaps: Default::default(), - dropped_actors: Default::default(), - actor_splits: Default::default(), - actor_new_dispatchers: Default::default(), - }, - )); - barrier_test_env.inject_barrier(&b1, [actor_id]); tx.send(Message::Barrier(b1.clone().into_dispatcher())) .await .unwrap(); @@ -1359,6 +1365,9 @@ mod tests { #[tokio::test] async fn test_hash_dispatcher() { + // This test only works when vnode count is 256. 
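The `HashDataDispatcher` change above derives the vnode count from `hash_mapping.len()` instead of the global `VirtualNode::COUNT`, and the tests now assert `VirtualNode::COUNT_FOR_TEST == 256` explicitly. The routing these tests exercise reduces to: hash the key columns, take the hash modulo the vnode count, and map that vnode to its owning actor. A simplified standalone sketch with a made-up mapping:

fn route(row_hash: u64, hash_mapping: &[u32]) -> u32 {
    // The vnode count is whatever the mapping says, not a hard-coded constant.
    let vnode_count = hash_mapping.len();
    let vnode = row_hash as usize % vnode_count;
    // Each vnode is owned by exactly one downstream actor.
    hash_mapping[vnode]
}

fn main() {
    // Four vnodes split evenly between actors 1 and 2.
    let mapping = [1, 1, 2, 2];
    assert_eq!(route(5, &mapping), 1); // 5 % 4 == 1 -> actor 1
    assert_eq!(route(7, &mapping), 2); // 7 % 4 == 3 -> actor 2
}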
+ assert_eq!(VirtualNode::COUNT_FOR_TEST, 256); + let num_outputs = 5; // actor id ranges from 1 to 5 let cardinality = 10; let dimension = 4; @@ -1374,9 +1383,9 @@ mod tests { }) .collect::>(); let mut hash_mapping = (1..num_outputs + 1) - .flat_map(|id| vec![id as ActorId; VirtualNode::COUNT / num_outputs]) + .flat_map(|id| vec![id as ActorId; VirtualNode::COUNT_FOR_TEST / num_outputs]) .collect_vec(); - hash_mapping.resize(VirtualNode::COUNT, num_outputs as u32); + hash_mapping.resize(VirtualNode::COUNT_FOR_TEST, num_outputs as u32); let mut hash_dispatcher = HashDataDispatcher::new( outputs, key_indices.to_vec(), @@ -1410,7 +1419,7 @@ mod tests { hasher.update(&bytes); } let output_idx = - hash_mapping[hasher.finish() as usize % VirtualNode::COUNT] as usize - 1; + hash_mapping[hasher.finish() as usize % VirtualNode::COUNT_FOR_TEST] as usize - 1; for (builder, val) in builders.iter_mut().zip_eq_fast(one_row.iter()) { builder.append(Some(*val)); } diff --git a/src/stream/src/executor/error.rs b/src/stream/src/executor/error.rs index fa625d8bb8cec..66070ba81e90c 100644 --- a/src/stream/src/executor/error.rs +++ b/src/stream/src/executor/error.rs @@ -67,7 +67,12 @@ pub enum ErrorKind { ), #[error("Sink error: sink_id={1}, error: {0}")] - SinkError(SinkError, u32), + SinkError( + #[source] + #[backtrace] + SinkError, + u32, + ), #[error(transparent)] RpcError( @@ -90,7 +95,11 @@ pub enum ErrorKind { AlignBarrier(Box, Box), #[error("Connector error: {0}")] - ConnectorError(BoxedError), + ConnectorError( + #[source] + #[backtrace] + BoxedError, + ), #[error(transparent)] DmlError( diff --git a/src/stream/src/executor/exchange/input.rs b/src/stream/src/executor/exchange/input.rs index e00a0da45979a..7ecac2c625e69 100644 --- a/src/stream/src/executor/exchange/input.rs +++ b/src/stream/src/executor/exchange/input.rs @@ -15,16 +15,13 @@ use std::pin::Pin; use std::task::{Context, Poll}; -use anyhow::{anyhow, Context as _}; -use futures::pin_mut; -use futures_async_stream::try_stream; +use anyhow::anyhow; +use local_input::LocalInputStreamInner; use pin_project::pin_project; use risingwave_common::util::addr::{is_local_address, HostAddr}; -use risingwave_pb::task_service::{permits, GetStreamResponse}; use risingwave_rpc_client::ComputeClientPool; use tokio::sync::mpsc; -use super::error::ExchangeChannelClosed; use super::permit::Receiver; use crate::executor::prelude::*; use crate::executor::{DispatcherBarrier, DispatcherMessage}; @@ -64,7 +61,6 @@ pub struct LocalInput { actor_id: ActorId, } -type LocalInputStreamInner = impl MessageStream; async fn process_msg<'a>( msg: DispatcherMessage, @@ -110,7 +106,7 @@ impl LocalInput { local_barrier_manager: LocalBarrierManager, ) -> Self { Self { - inner: Self::run( + inner: local_input::run( channel, upstream_actor_id, self_actor_id, @@ -119,9 +115,36 @@ impl LocalInput { actor_id: upstream_actor_id, } } +} + +mod local_input { + use await_tree::InstrumentAwait; + + use crate::executor::exchange::error::ExchangeChannelClosed; + use crate::executor::exchange::input::process_msg; + use crate::executor::exchange::permit::Receiver; + use crate::executor::prelude::try_stream; + use crate::executor::{Message, StreamExecutorError}; + use crate::task::{ActorId, LocalBarrierManager}; + + pub(super) type LocalInputStreamInner = impl crate::executor::MessageStream; + + pub(super) fn run( + channel: Receiver, + upstream_actor_id: ActorId, + self_actor_id: ActorId, + local_barrier_manager: LocalBarrierManager, + ) -> LocalInputStreamInner { + run_inner( + channel, + 
upstream_actor_id, + self_actor_id, + local_barrier_manager, + ) + } #[try_stream(ok = Message, error = StreamExecutorError)] - async fn run( + async fn run_inner( mut channel: Receiver, upstream_actor_id: ActorId, self_actor_id: ActorId, @@ -166,7 +189,8 @@ pub struct RemoteInput { actor_id: ActorId, } -type RemoteInputStreamInner = impl MessageStream; + +use remote_input::RemoteInputStreamInner; impl RemoteInput { /// Create a remote input from compute client and related info. Should provide the corresponding @@ -184,7 +208,7 @@ impl RemoteInput { Self { actor_id, - inner: Self::run( + inner: remote_input::run( local_barrier_manager, client_pool, upstream_addr, @@ -195,9 +219,48 @@ impl RemoteInput { ), } } +} + +mod remote_input { + use std::sync::Arc; + + use anyhow::Context; + use await_tree::InstrumentAwait; + use risingwave_common::util::addr::HostAddr; + use risingwave_pb::task_service::{permits, GetStreamResponse}; + use risingwave_rpc_client::ComputeClientPool; + + use crate::executor::exchange::error::ExchangeChannelClosed; + use crate::executor::exchange::input::process_msg; + use crate::executor::monitor::StreamingMetrics; + use crate::executor::prelude::{pin_mut, try_stream, StreamExt}; + use crate::executor::{DispatcherMessage, Message, StreamExecutorError}; + use crate::task::{LocalBarrierManager, UpDownActorIds, UpDownFragmentIds}; + + pub(super) type RemoteInputStreamInner = impl crate::executor::MessageStream; + + pub(super) fn run( + local_barrier_manager: LocalBarrierManager, + client_pool: ComputeClientPool, + upstream_addr: HostAddr, + up_down_ids: UpDownActorIds, + up_down_frag: UpDownFragmentIds, + metrics: Arc, + batched_permits_limit: usize, + ) -> RemoteInputStreamInner { + run_inner( + local_barrier_manager, + client_pool, + upstream_addr, + up_down_ids, + up_down_frag, + metrics, + batched_permits_limit, + ) + } #[try_stream(ok = Message, error = StreamExecutorError)] - async fn run( + async fn run_inner( local_barrier_manager: LocalBarrierManager, client_pool: ComputeClientPool, upstream_addr: HostAddr, diff --git a/src/stream/src/executor/integration_tests.rs b/src/stream/src/executor/integration_tests.rs index d65abc5a5ce53..13e9a67d1c525 100644 --- a/src/stream/src/executor/integration_tests.rs +++ b/src/stream/src/executor/integration_tests.rs @@ -14,6 +14,8 @@ use std::sync::Mutex; +use futures::future::BoxFuture; +use futures::FutureExt; use futures_async_stream::try_stream; use multimap::MultiMap; use risingwave_common::array::*; @@ -100,7 +102,7 @@ async fn test_merger_sum_aggr() { }; // join handles of all actors - let mut handles = vec![]; + let mut actor_futures: Vec> = vec![]; // input and output channels of the local aggregation actors let mut inputs = vec![]; @@ -113,7 +115,7 @@ async fn test_merger_sum_aggr() { let (tx, rx) = channel_for_test(); let (actor, channel) = make_actor(rx); outputs.push(channel); - handles.push(tokio::spawn(actor.run())); + actor_futures.push(actor.run().boxed()); inputs.push(Box::new(LocalOutput::new(233, tx)) as BoxedOutput); } @@ -154,7 +156,7 @@ async fn test_merger_sum_aggr() { .local_barrier_manager .clone(), ); - handles.push(tokio::spawn(actor.run())); + actor_futures.push(actor.run().boxed()); let actor_ctx = ActorContext::for_test(gen_next_actor_id()); @@ -225,11 +227,21 @@ async fn test_merger_sum_aggr() { .local_barrier_manager .clone(), ); - handles.push(tokio::spawn(actor.run())); + actor_futures.push(actor.run().boxed()); let mut epoch = test_epoch(1); let b1 = Barrier::new_test_barrier(epoch); 
barrier_test_env.inject_barrier(&b1, actors.clone()); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let handles = actor_futures + .into_iter() + .map(|actor_future| tokio::spawn(actor_future)) + .collect_vec(); + input .send(Message::Barrier(b1.into_dispatcher())) .await diff --git a/src/stream/src/executor/merge.rs b/src/stream/src/executor/merge.rs index 393b800895151..d45d75604fa57 100644 --- a/src/stream/src/executor/merge.rs +++ b/src/stream/src/executor/merge.rs @@ -531,6 +531,11 @@ mod tests { let b2 = Barrier::with_prev_epoch_for_test(test_epoch(1000), *prev_epoch) .with_mutation(Mutation::Stop(HashSet::default())); barrier_test_env.inject_barrier(&b2, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; for (tx_id, tx) in txs.into_iter().enumerate() { let epochs = epochs.clone(); @@ -634,6 +639,33 @@ mod tests { .try_collect() .unwrap(); + let merge_updates = maplit::hashmap! { + (actor_id, upstream_fragment_id) => MergeUpdate { + actor_id, + upstream_fragment_id, + new_upstream_fragment_id: None, + added_upstream_actor_id: vec![new], + removed_upstream_actor_id: vec![old], + } + }; + + let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( + UpdateMutation { + dispatchers: Default::default(), + merges: merge_updates, + vnode_bitmaps: Default::default(), + dropped_actors: Default::default(), + actor_splits: Default::default(), + actor_new_dispatchers: Default::default(), + }, + )); + barrier_test_env.inject_barrier(&b1, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let mut merge = MergeExecutor::new( ActorContext::for_test(actor_id), fragment_id, @@ -682,28 +714,6 @@ mod tests { recv!().unwrap().as_chunk().unwrap(); assert_recv_pending!(); - // 4. Send a configuration change barrier. - let merge_updates = maplit::hashmap! 
{ - (actor_id, upstream_fragment_id) => MergeUpdate { - actor_id, - upstream_fragment_id, - new_upstream_fragment_id: None, - added_upstream_actor_id: vec![new], - removed_upstream_actor_id: vec![old], - } - }; - - let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( - UpdateMutation { - dispatchers: Default::default(), - merges: merge_updates, - vnode_bitmaps: Default::default(), - dropped_actors: Default::default(), - actor_splits: Default::default(), - actor_new_dispatchers: Default::default(), - }, - )); - barrier_test_env.inject_barrier(&b1, [actor_id]); send!( [untouched, old], Message::Barrier(b1.clone().into_dispatcher()) diff --git a/src/stream/src/executor/nested_loop_temporal_join.rs b/src/stream/src/executor/nested_loop_temporal_join.rs index 0888d8981fc8c..55d21b468a777 100644 --- a/src/stream/src/executor/nested_loop_temporal_join.rs +++ b/src/stream/src/executor/nested_loop_temporal_join.rs @@ -98,8 +98,7 @@ async fn phase1_handle_chunk( } impl NestedLoopTemporalJoinExecutor { - #[allow(clippy::too_many_arguments)] - #[expect(dead_code)] + #[expect(clippy::too_many_arguments)] pub fn new( ctx: ActorContextRef, info: ExecutorInfo, diff --git a/src/stream/src/executor/rearranged_chain.rs b/src/stream/src/executor/rearranged_chain.rs index 37717d270d90e..d70d6c2955c3a 100644 --- a/src/stream/src/executor/rearranged_chain.rs +++ b/src/stream/src/executor/rearranged_chain.rs @@ -17,7 +17,7 @@ use futures::stream; use futures::stream::select_with_strategy; use crate::executor::prelude::*; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; /// `ChainExecutor` is an executor that enables synchronization between the existing stream and /// newly appended executors. Currently, `ChainExecutor` is mainly used to implement MV on MV @@ -31,7 +31,7 @@ pub struct RearrangedChainExecutor { upstream: Executor, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, actor_id: ActorId, } @@ -74,7 +74,11 @@ impl RearrangedMessage { } impl RearrangedChainExecutor { - pub fn new(snapshot: Executor, upstream: Executor, progress: CreateMviewProgress) -> Self { + pub fn new( + snapshot: Executor, + upstream: Executor, + progress: CreateMviewProgressReporter, + ) -> Self { Self { snapshot, upstream, diff --git a/src/stream/src/executor/receiver.rs b/src/stream/src/executor/receiver.rs index 6cabb79388333..9a99e59214bd5 100644 --- a/src/stream/src/executor/receiver.rs +++ b/src/stream/src/executor/receiver.rs @@ -231,6 +231,35 @@ mod tests { let (upstream_fragment_id, fragment_id) = (10, 18); + // 4. Send a configuration change barrier. + let merge_updates = maplit::hashmap! { + (actor_id, upstream_fragment_id) => MergeUpdate { + actor_id, + upstream_fragment_id, + new_upstream_fragment_id: None, + added_upstream_actor_id: vec![new], + removed_upstream_actor_id: vec![old], + } + }; + + let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( + UpdateMutation { + dispatchers: Default::default(), + merges: merge_updates, + vnode_bitmaps: Default::default(), + dropped_actors: Default::default(), + actor_splits: Default::default(), + actor_new_dispatchers: Default::default(), + }, + )); + + barrier_test_env.inject_barrier(&b1, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let input = new_input( &ctx, metrics.clone(), @@ -297,30 +326,6 @@ mod tests { recv!().unwrap().as_chunk().unwrap(); // We should be able to receive the chunk. 
assert_recv_pending!(); - // 4. Send a configuration change barrier. - let merge_updates = maplit::hashmap! { - (actor_id, upstream_fragment_id) => MergeUpdate { - actor_id, - upstream_fragment_id, - new_upstream_fragment_id: None, - added_upstream_actor_id: vec![new], - removed_upstream_actor_id: vec![old], - } - }; - - let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( - UpdateMutation { - dispatchers: Default::default(), - merges: merge_updates, - vnode_bitmaps: Default::default(), - dropped_actors: Default::default(), - actor_splits: Default::default(), - actor_new_dispatchers: Default::default(), - }, - )); - - barrier_test_env.inject_barrier(&b1, [actor_id]); - send!([new], Message::Barrier(b1.clone().into_dispatcher())); assert_recv_pending!(); // We should not receive the barrier, as new is not the upstream. diff --git a/src/stream/src/executor/row_id_gen.rs b/src/stream/src/executor/row_id_gen.rs index 1fcb85c26f88e..5465a1b54ec2e 100644 --- a/src/stream/src/executor/row_id_gen.rs +++ b/src/stream/src/executor/row_id_gen.rs @@ -134,13 +134,16 @@ mod tests { #[tokio::test] async fn test_row_id_gen_executor() { + // This test only works when vnode count is 256. + assert_eq!(VirtualNode::COUNT_FOR_TEST, 256); + let schema = Schema::new(vec![ Field::unnamed(DataType::Serial), Field::unnamed(DataType::Int64), ]); let pk_indices = vec![0]; let row_id_index = 0; - let row_id_generator = Bitmap::ones(VirtualNode::COUNT); + let row_id_generator = Bitmap::ones(VirtualNode::COUNT_FOR_TEST); let (mut tx, upstream) = MockSource::channel(); let upstream = upstream.into_executor(schema.clone(), pk_indices.clone()); diff --git a/src/stream/src/executor/source/source_backfill_executor.rs b/src/stream/src/executor/source/source_backfill_executor.rs index b28c707bdedd0..3f2cd83aca286 100644 --- a/src/stream/src/executor/source/source_backfill_executor.rs +++ b/src/stream/src/executor/source/source_backfill_executor.rs @@ -14,6 +14,7 @@ use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; +use std::sync::Once; use std::time::Instant; use anyhow::anyhow; @@ -30,6 +31,7 @@ use risingwave_connector::source::{ BackfillInfo, BoxChunkSourceStream, SourceContext, SourceCtrlOpts, SplitId, SplitImpl, SplitMetaData, }; +use risingwave_hummock_sdk::HummockReadEpoch; use serde::{Deserialize, Serialize}; use thiserror_ext::AsReport; @@ -40,6 +42,7 @@ use crate::common::rate_limit::limited_chunk_size; use crate::executor::prelude::*; use crate::executor::source::source_executor::WAIT_BARRIER_MULTIPLE_TIMES; use crate::executor::UpdateMutation; +use crate::task::CreateMviewProgressReporter; #[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] pub enum BackfillState { @@ -88,6 +91,8 @@ pub struct SourceBackfillExecutorInner { /// Rate limit in rows/s. rate_limit_rps: Option, + + progress: CreateMviewProgressReporter, } /// Local variables used in the backfill stage. 
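A minimal sketch of how the `CreateMviewProgressReporter` added above is wired and driven, assuming only the call sites that appear later in this patch (registration in `from_proto/source_backfill.rs`, and `finish` once every split has caught up); `consumed_rows` below is purely illustrative:

    // Obtained from the local barrier manager when the executor is built:
    let progress = params
        .local_barrier_manager
        .register_create_mview_progress(params.actor_context.id);

    // Inside the backfill loop, once all splits are Finished or SourceCachingUp,
    // report completion so a blocking CREATE ... DDL can return:
    self.progress.finish(barrier.epoch, consumed_rows);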
@@ -230,6 +235,7 @@ impl BackfillStage { } impl SourceBackfillExecutorInner { + #[expect(clippy::too_many_arguments)] pub fn new( actor_ctx: ActorContextRef, info: ExecutorInfo, @@ -238,6 +244,7 @@ impl SourceBackfillExecutorInner { system_params: SystemParamsReaderRef, backfill_state_store: BackfillStateTableHandler, rate_limit_rps: Option, + progress: CreateMviewProgressReporter, ) -> Self { let source_split_change_count = metrics .source_split_change_count @@ -247,6 +254,7 @@ impl SourceBackfillExecutorInner { &actor_ctx.id.to_string(), &actor_ctx.fragment_id.to_string(), ]); + Self { actor_ctx, info, @@ -256,6 +264,7 @@ impl SourceBackfillExecutorInner { source_split_change_count, system_params, rate_limit_rps, + progress, } } @@ -346,7 +355,6 @@ impl SourceBackfillExecutorInner { splits: owned_splits, }; backfill_stage.debug_assert_consistent(); - tracing::debug!(?backfill_stage, "source backfill started"); // Return the ownership of `stream_source_core` to the source executor. self.stream_source_core = core; @@ -370,6 +378,7 @@ impl SourceBackfillExecutorInner { } } } + tracing::debug!(?backfill_stage, "source backfill started"); fn select_strategy(_: &mut ()) -> PollNext { futures::stream::PollNext::Left @@ -407,9 +416,23 @@ impl SourceBackfillExecutorInner { pause_reader!(); } + let state_store = self.backfill_state_store.state_store.state_store().clone(); + static STATE_TABLE_INITIALIZED: Once = Once::new(); + tokio::spawn(async move { + // This is for self.backfill_finished() to be safe. + // We wait for 1st epoch's curr, i.e., the 2nd epoch's prev. + let epoch = barrier.epoch.curr; + tracing::info!("waiting for epoch: {}", epoch); + state_store + .try_wait_epoch(HummockReadEpoch::Committed(epoch)) + .await + .expect("failed to wait epoch"); + STATE_TABLE_INITIALIZED.call_once(|| ()); + tracing::info!("finished waiting for epoch: {}", epoch); + }); yield Message::Barrier(barrier); - if !self.backfill_finished(&backfill_stage.states).await? { + { let source_backfill_row_count = self .metrics .source_backfill_row_count @@ -552,10 +575,26 @@ impl SourceBackfillExecutorInner { .commit(barrier.epoch) .await?; - yield Message::Barrier(barrier); - - if self.backfill_finished(&backfill_stage.states).await? { - break 'backfill_loop; + if self.should_report_finished(&backfill_stage.states) { + // TODO: use a specialized progress for source + // Currently, `CreateMviewProgress` is designed for MV backfill, and rw_ddl_progress calculates + // progress based on the number of consumed rows and an estimated total number of rows from hummock. + // For now, we just rely on the same code path, and for source backfill, the progress will always be 99.99%. + tracing::info!("progress finish"); + let epoch = barrier.epoch; + self.progress.finish(epoch, 114514); + // yield barrier after reporting progress + yield Message::Barrier(barrier); + + // After we reported finished, we still don't exit the loop. + // Because we need to handle split migration. + if STATE_TABLE_INITIALIZED.is_completed() + && self.backfill_finished(&backfill_stage.states).await? + { + break 'backfill_loop; + } + } else { + yield Message::Barrier(barrier); } } Message::Chunk(chunk) => { @@ -665,7 +704,7 @@ impl SourceBackfillExecutorInner { self.apply_split_change_forward_stage( actor_splits, &mut splits, - true, + false, ) .await?; } @@ -688,11 +727,34 @@ impl SourceBackfillExecutorInner { } } - /// All splits finished backfilling. + /// When we should call `progress.finish()` to let blocking DDL return. 
+ /// We report as soon as `SourceCachingUp`. Otherwise the DDL might be blocked forever until upstream messages come. + /// + /// Note: split migration (online scaling) is related with progress tracking. + /// - For foreground DDL, scaling is not allowed before progress is finished. + /// - For background DDL, scaling is skipped when progress is not finished, and can be triggered by recreating actors during recovery. + /// + /// See for more details. + fn should_report_finished(&self, states: &BackfillStates) -> bool { + states.values().all(|state| { + matches!( + state, + BackfillState::Finished | BackfillState::SourceCachingUp(_) + ) + }) + } + + /// All splits entered `Finished` state. /// /// We check all splits for the source, including other actors' splits here, before going to the forward stage. - /// Otherwise if we break early, but after rescheduling, an unfinished split is migrated to + /// Otherwise if we `break` early, but after rescheduling, an unfinished split is migrated to /// this actor, we still need to backfill it. + /// + /// Note: at the beginning, the actor will only read the state written by itself. + /// It needs to _wait until it can read all actors' written data_. + /// i.e., wait for the first checkpoint has been available. + /// + /// See for more details. async fn backfill_finished(&self, states: &BackfillStates) -> StreamExecutorResult { Ok(states .values() @@ -761,7 +823,6 @@ impl SourceBackfillExecutorInner { } Some(backfill_state) => { // Migrated split. Backfill if unfinished. - // TODO: disallow online scaling during backfilling. target_state.insert(split_id, backfill_state); } } diff --git a/src/stream/src/executor/source/source_backfill_state_table.rs b/src/stream/src/executor/source/source_backfill_state_table.rs index be9abe8490e63..3579aff2ec4fb 100644 --- a/src/stream/src/executor/source/source_backfill_state_table.rs +++ b/src/stream/src/executor/source/source_backfill_state_table.rs @@ -76,6 +76,7 @@ impl BackfillStateTableHandler { }; ret.push(state); } + tracing::trace!("scan SourceBackfill state table: {:?}", ret); Ok(ret) } diff --git a/src/stream/src/executor/stream_reader.rs b/src/stream/src/executor/stream_reader.rs index 30de0804b0ac0..bd22e47c737ad 100644 --- a/src/stream/src/executor/stream_reader.rs +++ b/src/stream/src/executor/stream_reader.rs @@ -16,7 +16,7 @@ use std::pin::Pin; use std::task::Poll; use either::Either; -use futures::stream::{select_with_strategy, BoxStream, PollNext, SelectWithStrategy}; +use futures::stream::BoxStream; use futures::{Stream, StreamExt, TryStreamExt}; use crate::executor::error::StreamExecutorResult; @@ -25,8 +25,34 @@ use crate::executor::Message; type ExecutorMessageStream = BoxStream<'static, StreamExecutorResult>; type StreamReaderData = StreamExecutorResult>; type ReaderArm = BoxStream<'static, StreamReaderData>; -type StreamReaderWithPauseInner = - SelectWithStrategy, ReaderArm, impl FnMut(&mut PollNext) -> PollNext, PollNext>; + +mod stream_reader_with_pause { + use futures::stream::{select_with_strategy, PollNext, SelectWithStrategy}; + + use crate::executor::stream_reader::ReaderArm; + + pub(super) type StreamReaderWithPauseInner = SelectWithStrategy< + ReaderArm, + ReaderArm, + impl FnMut(&mut PollNext) -> PollNext, + PollNext, + >; + + pub(super) fn new_inner( + message_stream: ReaderArm, + data_stream: ReaderArm, + ) -> StreamReaderWithPauseInner { + let strategy = if BIASED { + |_: &mut PollNext| PollNext::Left + } else { + // The poll strategy is not biased: we poll the two streams in a 
round robin way. + |last: &mut PollNext| last.toggle() + }; + select_with_strategy(message_stream, data_stream, strategy) + } +} + +use stream_reader_with_pause::*; /// [`StreamReaderWithPause`] merges two streams, with one receiving barriers (and maybe other types /// of messages) and the other receiving data only (no barrier). The merged stream can be paused @@ -40,7 +66,7 @@ type StreamReaderWithPauseInner = /// priority over the right-hand one. Otherwise, the two streams will be polled in a round robin /// fashion. pub(super) struct StreamReaderWithPause { - inner: StreamReaderWithPauseInner, + inner: StreamReaderWithPauseInner, /// Whether the source stream is paused. paused: bool, } @@ -54,26 +80,13 @@ impl StreamReaderWithPause { ) -> Self { let message_stream_arm = message_stream.map_ok(Either::Left).boxed(); let data_stream_arm = data_stream.map_ok(Either::Right).boxed(); - let inner = Self::new_inner(message_stream_arm, data_stream_arm); + let inner = new_inner(message_stream_arm, data_stream_arm); Self { inner, paused: false, } } - fn new_inner( - message_stream: ReaderArm, - data_stream: ReaderArm, - ) -> StreamReaderWithPauseInner { - let strategy = if BIASED { - |_: &mut PollNext| PollNext::Left - } else { - // The poll strategy is not biased: we poll the two streams in a round robin way. - |last: &mut PollNext| last.toggle() - }; - select_with_strategy(message_stream, data_stream, strategy) - } - /// Replace the data stream with a new one for given `stream`. Used for split change. pub fn replace_data_stream( &mut self, @@ -87,7 +100,7 @@ impl StreamReaderWithPause { // Note: create a new `SelectWithStrategy` instead of replacing the source stream arm here, // to ensure the internal state of the `SelectWithStrategy` is reset. (#6300) - self.inner = Self::new_inner( + self.inner = new_inner( barrier_receiver_arm, data_stream.map_ok(Either::Right).boxed(), ); diff --git a/src/stream/src/executor/values.rs b/src/stream/src/executor/values.rs index 83da0ff68a7d5..89946d9dc94e6 100644 --- a/src/stream/src/executor/values.rs +++ b/src/stream/src/executor/values.rs @@ -21,7 +21,7 @@ use risingwave_expr::expr::NonStrictExpression; use tokio::sync::mpsc::UnboundedReceiver; use crate::executor::prelude::*; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; const DEFAULT_CHUNK_SIZE: usize = 1024; @@ -33,7 +33,7 @@ pub struct ValuesExecutor { schema: Schema, // Receiver of barrier channel. 
barrier_receiver: UnboundedReceiver, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, rows: vec::IntoIter>, } @@ -43,7 +43,7 @@ impl ValuesExecutor { pub fn new( ctx: ActorContextRef, schema: Schema, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, rows: Vec>, barrier_receiver: UnboundedReceiver, ) -> Self { @@ -150,12 +150,12 @@ mod tests { use super::ValuesExecutor; use crate::executor::test_utils::StreamExecutorTestExt; use crate::executor::{ActorContext, AddMutation, Barrier, Execute, Mutation}; - use crate::task::{CreateMviewProgress, LocalBarrierManager}; + use crate::task::{CreateMviewProgressReporter, LocalBarrierManager}; #[tokio::test] async fn test_values() { let barrier_manager = LocalBarrierManager::for_test(); - let progress = CreateMviewProgress::for_test(barrier_manager); + let progress = CreateMviewProgressReporter::for_test(barrier_manager); let actor_id = progress.actor_id(); let (tx, barrier_receiver) = unbounded_channel(); let value = StructValue::new(vec![Some(1.into()), Some(2.into()), Some(3.into())]); diff --git a/src/stream/src/executor/watermark_filter.rs b/src/stream/src/executor/watermark_filter.rs index 8f8b166626d21..01497c37fdab5 100644 --- a/src/stream/src/executor/watermark_filter.rs +++ b/src/stream/src/executor/watermark_filter.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::cmp; -use std::ops::Deref; use futures::future::{try_join, try_join_all}; use risingwave_common::hash::VnodeBitmapExt; @@ -27,7 +26,6 @@ use risingwave_expr::Result as ExprResult; use risingwave_hummock_sdk::HummockReadEpoch; use risingwave_pb::expr::expr_node::Type; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::TableDistribution; use super::filter::FilterExecutor; use crate::executor::prelude::*; @@ -219,10 +217,7 @@ impl WatermarkFilterExecutor { let mut need_update_global_max_watermark = false; // Update the vnode bitmap for state tables of all agg calls if asked. 
if let Some(vnode_bitmap) = barrier.as_update_vnode_bitmap(ctx.id) { - let other_vnodes_bitmap = Arc::new( - (!(*vnode_bitmap).clone()) - & TableDistribution::all_vnodes_ref().deref(), - ); + let other_vnodes_bitmap = Arc::new(!(*vnode_bitmap).clone()); let _ = global_watermark_table.update_vnode_bitmap(other_vnodes_bitmap); let (previous_vnode_bitmap, _cache_may_stale) = table.update_vnode_bitmap(vnode_bitmap.clone()); @@ -373,7 +368,9 @@ impl WatermarkFilterExecutor { #[cfg(test)] mod tests { use itertools::Itertools; + use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnDesc, ColumnId, Field, TableDesc}; + use risingwave_common::hash::VirtualNode; use risingwave_common::test_prelude::StreamChunkTestExt; use risingwave_common::types::Date; use risingwave_common::util::epoch::test_epoch; @@ -431,7 +428,7 @@ mod tests { let state_table = StateTable::from_table_catalog_inconsistent_op( &table, mem_state.clone(), - Some(TableDistribution::all_vnodes()), + Some(Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into()), ) .await; @@ -440,7 +437,7 @@ mod tests { let storage_table = StorageTable::new_partial( mem_state, val_indices.iter().map(|i| ColumnId::new(*i as _)).collect(), - Some(TableDistribution::all_vnodes()), + Some(Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into()), &desc, ); (storage_table, state_table) diff --git a/src/stream/src/from_proto/mview.rs b/src/stream/src/from_proto/mview.rs index 41fc47609fba7..43fc929edf455 100644 --- a/src/stream/src/from_proto/mview.rs +++ b/src/stream/src/from_proto/mview.rs @@ -100,7 +100,7 @@ impl ExecutorBuilder for ArrangeExecutorBuilder { let table = node.get_table()?; // FIXME: Lookup is now implemented without cell-based table API and relies on all vnodes - // being `DEFAULT_VNODE`, so we need to make the Arrange a singleton. + // being `SINGLETON_VNODE`, so we need to make the Arrange a singleton. let vnodes = params.vnode_bitmap.map(Arc::new); let conflict_behavior = ConflictBehavior::from_protobuf(&table.handle_pk_conflict_behavior()); diff --git a/src/stream/src/from_proto/source_backfill.rs b/src/stream/src/from_proto/source_backfill.rs index ba3ab599af700..65329a26bd40b 100644 --- a/src/stream/src/from_proto/source_backfill.rs +++ b/src/stream/src/from_proto/source_backfill.rs @@ -72,6 +72,9 @@ impl ExecutorBuilder for SourceBackfillExecutorBuilder { source_desc_builder, state_table_handler, ); + let progress = params + .local_barrier_manager + .register_create_mview_progress(params.actor_context.id); let exec = SourceBackfillExecutorInner::new( params.actor_context.clone(), @@ -81,6 +84,7 @@ impl ExecutorBuilder for SourceBackfillExecutorBuilder { params.env.system_params_manager_ref().get_params(), backfill_state_table, node.rate_limit, + progress, ); let [input]: [_; 1] = params.input.try_into().unwrap(); diff --git a/src/stream/src/from_proto/watermark_filter.rs b/src/stream/src/from_proto/watermark_filter.rs index 0081f00cc39e6..4e3147d10853e 100644 --- a/src/stream/src/from_proto/watermark_filter.rs +++ b/src/stream/src/from_proto/watermark_filter.rs @@ -12,14 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
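On the `TableDistribution::all_vnodes()` removals above and below: a plausible reading (not stated in the patch) is that negating an actor's vnode bitmap already yields a bitmap of the same length, so the extra intersection with a fixed-size all-ones bitmap becomes redundant once the vnode count is no longer a hard-coded constant. Roughly, using the names from these hunks:

    // before: complement masked with the full vnode set
    // (!(*vnode_bitmap).clone()) & TableDistribution::all_vnodes_ref().deref()
    // after: plain negation keeps the bitmap length, so the mask is unnecessary
    let other_vnodes_bitmap = Arc::new(!(*vnode_bitmap).clone());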
-use std::ops::Deref; use std::sync::Arc; use risingwave_common::catalog::{ColumnId, TableDesc}; use risingwave_expr::expr::build_non_strict_from_prost; use risingwave_pb::stream_plan::WatermarkFilterNode; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::TableDistribution; use super::*; use crate::common::table::state_table::StateTable; @@ -57,8 +55,7 @@ impl ExecutorBuilder for WatermarkFilterBuilder { .iter() .map(|i| ColumnId::new(*i as _)) .collect_vec(); - let other_vnodes = - Arc::new((!(*vnodes).clone()) & TableDistribution::all_vnodes_ref().deref()); + let other_vnodes = Arc::new(!(*vnodes).clone()); let global_watermark_table = StorageTable::new_partial(store.clone(), column_ids, Some(other_vnodes), &desc); diff --git a/src/stream/src/lib.rs b/src/stream/src/lib.rs index 876deabc80f98..577b829945620 100644 --- a/src/stream/src/lib.rs +++ b/src/stream/src/lib.rs @@ -17,7 +17,6 @@ #![feature(trait_alias)] #![feature(type_alias_impl_trait)] #![feature(more_qualified_paths)] -#![feature(lint_reasons)] #![feature(let_chains)] #![feature(hash_extract_if)] #![feature(extract_if)] diff --git a/src/stream/src/task/barrier_manager.rs b/src/stream/src/task/barrier_manager.rs index 88e86a5998758..406459e25c389 100644 --- a/src/stream/src/task/barrier_manager.rs +++ b/src/stream/src/task/barrier_manager.rs @@ -37,8 +37,7 @@ use tonic::{Code, Status}; use self::managed_state::ManagedBarrierState; use crate::error::{IntoUnexpectedExit, StreamError, StreamResult}; use crate::task::{ - ActorHandle, ActorId, AtomicU64Ref, PartialGraphId, SharedContext, StreamEnvironment, - UpDownActorIds, + ActorId, AtomicU64Ref, PartialGraphId, SharedContext, StreamEnvironment, UpDownActorIds, }; mod managed_state; @@ -46,7 +45,7 @@ mod progress; #[cfg(test)] mod tests; -pub use progress::CreateMviewProgress; +pub use progress::CreateMviewProgressReporter; use risingwave_common::catalog::TableId; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::runtime::BackgroundShutdownRuntime; @@ -210,10 +209,6 @@ pub(super) enum LocalActorOperation { handle: ControlStreamHandle, init_request: InitRequest, }, - DropActors { - actors: Vec, - result_sender: oneshot::Sender<()>, - }, TakeReceiver { ids: UpDownActorIds, result_sender: oneshot::Sender>, @@ -228,29 +223,6 @@ pub(super) enum LocalActorOperation { }, } -pub(crate) struct StreamActorManagerState { - /// Each processor runs in a future. Upon receiving a `Terminate` message, they will exit. - /// `handles` store join handles of these futures, and therefore we could wait their - /// termination. - pub(super) handles: HashMap, - - /// Stores all actor information, taken after actor built. - pub(super) actors: HashMap, - - /// Stores all actor tokio runtime monitoring tasks. - pub(super) actor_monitor_tasks: HashMap, -} - -impl StreamActorManagerState { - fn new() -> Self { - Self { - handles: HashMap::new(), - actors: HashMap::new(), - actor_monitor_tasks: HashMap::new(), - } - } -} - pub(crate) struct StreamActorManager { pub(super) env: StreamEnvironment, pub(super) streaming_metrics: Arc, @@ -294,7 +266,7 @@ impl Display for LocalBarrierWorkerDebugInfo<'_> { /// barriers to and collect them from all actors, and finally report the progress. pub(super) struct LocalBarrierWorker { /// Current barrier collection state. - state: ManagedBarrierState, + pub(super) state: ManagedBarrierState, /// Record all unexpected exited actors. 
failure_actors: HashMap, @@ -303,8 +275,6 @@ pub(super) struct LocalBarrierWorker { pub(super) actor_manager: Arc, - pub(super) actor_manager_state: StreamActorManagerState, - pub(super) current_shared_context: Arc, barrier_event_rx: UnboundedReceiver, @@ -328,14 +298,9 @@ impl LocalBarrierWorker { )); Self { failure_actors: HashMap::default(), - state: ManagedBarrierState::new( - actor_manager.env.state_store(), - actor_manager.streaming_metrics.clone(), - actor_manager.await_tree_reg.clone(), - ), + state: ManagedBarrierState::new(actor_manager.clone(), shared_context.clone()), control_stream_handle: ControlStreamHandle::empty(), actor_manager, - actor_manager_state: StreamActorManagerState::new(), current_shared_context: shared_context, barrier_event_rx: event_rx, actor_failure_rx: failure_rx, @@ -345,7 +310,7 @@ impl LocalBarrierWorker { fn to_debug_info(&self) -> LocalBarrierWorkerDebugInfo<'_> { LocalBarrierWorkerDebugInfo { - running_actors: self.actor_manager_state.handles.keys().cloned().collect(), + running_actors: self.state.actor_states.keys().cloned().collect(), managed_barrier_state: self.state.to_debug_info(), has_control_stream_connected: self.control_stream_handle.connected(), } @@ -384,7 +349,7 @@ impl LocalBarrierWorker { }); } LocalActorOperation::Shutdown { result_sender } => { - if !self.actor_manager_state.handles.is_empty() { + if !self.state.actor_states.is_empty() { tracing::warn!( "shutdown with running actors, scaling or migration will be triggered" ); @@ -419,15 +384,9 @@ impl LocalBarrierWorker { Request::InjectBarrier(req) => { let barrier = Barrier::from_protobuf(req.get_barrier().unwrap())?; self.update_actor_info(req.broadcast_info)?; - let actors = req - .actors_to_build - .iter() - .map(|actor| actor.actor.as_ref().unwrap().actor_id) - .collect_vec(); - self.update_actors(req.actors_to_build)?; - self.start_create_actors(&actors)?; self.send_barrier( &barrier, + req.actors_to_build, req.actor_ids_to_collect.into_iter().collect(), req.table_ids_to_sync .into_iter() @@ -484,7 +443,13 @@ impl LocalBarrierWorker { .map_err(|e| (actor_id, e))?; } #[cfg(test)] - LocalBarrierEvent::Flush(sender) => sender.send(()).unwrap(), + LocalBarrierEvent::Flush(sender) => { + use futures::FutureExt; + while let Some(request) = self.control_stream_handle.next_request().now_or_never() { + self.handle_streaming_control_request(request).unwrap(); + } + sender.send(()).unwrap() + } } Ok(()) } @@ -494,13 +459,6 @@ impl LocalBarrierWorker { LocalActorOperation::NewControlStream { .. } | LocalActorOperation::Shutdown { .. } => { unreachable!("event {actor_op} should be handled separately in async context") } - LocalActorOperation::DropActors { - actors, - result_sender, - } => { - self.drop_actors(&actors); - let _ = result_sender.send(()); - } LocalActorOperation::TakeReceiver { ids, result_sender } => { let _ = result_sender.send(self.current_shared_context.take_receiver(ids)); } @@ -596,30 +554,12 @@ impl LocalBarrierWorker { fn send_barrier( &mut self, barrier: &Barrier, + to_build: Vec, to_collect: HashSet, table_ids: HashSet, partial_graph_id: PartialGraphId, actor_ids_to_pre_sync_barrier: HashSet, ) -> StreamResult<()> { - if !cfg!(test) { - // The barrier might be outdated and been injected after recovery in some certain extreme - // scenarios. So some newly creating actors in the barrier are possibly not rebuilt during - // recovery. Check it here and return an error here if some actors are not found to - // avoid collection hang. 
We need some refine in meta side to remove this workaround since - // it will cause another round of unnecessary recovery. - let missing_actor_ids = to_collect - .iter() - .filter(|id| !self.actor_manager_state.handles.contains_key(id)) - .collect_vec(); - if !missing_actor_ids.is_empty() { - tracing::warn!( - "to collect actors not found, they should be cleaned when recovering: {:?}", - missing_actor_ids - ); - return Err(anyhow!("to collect actors not found: {:?}", to_collect).into()); - } - } - if barrier.kind == BarrierKind::Initial { self.actor_manager .watermark_epoch @@ -647,20 +587,12 @@ impl LocalBarrierWorker { self.state.transform_to_issued( barrier, + to_build, to_collect, table_ids, partial_graph_id, actor_ids_to_pre_sync_barrier, )?; - - // Actors to stop should still accept this barrier, but won't get sent to in next times. - if let Some(actors) = barrier.all_stop_actors() { - debug!( - target: "events::stream::barrier::manager", - "remove actors {:?} from senders", - actors - ); - } Ok(()) } diff --git a/src/stream/src/task/barrier_manager/managed_state.rs b/src/stream/src/task/barrier_manager/managed_state.rs index 5ccde5004801d..8f4ab2b49ea2e 100644 --- a/src/stream/src/task/barrier_manager/managed_state.rs +++ b/src/stream/src/task/barrier_manager/managed_state.rs @@ -15,7 +15,7 @@ use std::assert_matches::assert_matches; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::{Debug, Display, Formatter}; -use std::future::{poll_fn, Future}; +use std::future::{pending, poll_fn, Future}; use std::mem::replace; use std::sync::Arc; use std::task::{ready, Context, Poll}; @@ -31,17 +31,18 @@ use risingwave_common::must_match; use risingwave_common::util::epoch::EpochPair; use risingwave_hummock_sdk::SyncResult; use risingwave_pb::stream_plan::barrier::BarrierKind; -use risingwave_pb::stream_service::barrier_complete_response::CreateMviewProgress; +use risingwave_pb::stream_service::BuildActorInfo; use risingwave_storage::{dispatch_state_store, StateStore, StateStoreImpl}; use thiserror_ext::AsReport; use tokio::sync::mpsc; +use tokio::task::JoinHandle; use super::progress::BackfillState; use super::{BarrierCompleteResult, SubscribeMutationItem}; use crate::error::{StreamError, StreamResult}; use crate::executor::monitor::StreamingMetrics; use crate::executor::{Barrier, Mutation}; -use crate::task::{await_tree_key, ActorId, PartialGraphId}; +use crate::task::{ActorId, PartialGraphId, SharedContext, StreamActorManager}; struct IssuedState { pub mutation: Option>, @@ -83,12 +84,63 @@ enum ManagedBarrierStateInner { #[derive(Debug)] pub(super) struct BarrierState { - curr_epoch: u64, + barrier: Barrier, inner: ManagedBarrierStateInner, } -type AwaitEpochCompletedFuture = - impl Future)> + 'static; +mod await_epoch_completed_future { + use std::future::Future; + + use futures::future::BoxFuture; + use futures::FutureExt; + use risingwave_hummock_sdk::SyncResult; + use risingwave_pb::stream_service::barrier_complete_response::PbCreateMviewProgress; + + use crate::error::StreamResult; + use crate::executor::Barrier; + use crate::task::{await_tree_key, BarrierCompleteResult}; + + pub(super) type AwaitEpochCompletedFuture = + impl Future)> + 'static; + + pub(super) fn instrument_complete_barrier_future( + complete_barrier_future: Option>>, + barrier: Barrier, + barrier_await_tree_reg: Option<&await_tree::Registry>, + create_mview_progress: Vec, + ) -> AwaitEpochCompletedFuture { + let prev_epoch = barrier.epoch.prev; + let future = async move { + if let 
Some(future) = complete_barrier_future { + let result = future.await; + result.map(Some) + } else { + Ok(None) + } + } + .map(move |result| { + ( + barrier, + result.map(|sync_result| BarrierCompleteResult { + sync_result, + create_mview_progress, + }), + ) + }); + if let Some(reg) = barrier_await_tree_reg { + reg.register( + await_tree_key::BarrierAwait { prev_epoch }, + format!("SyncEpoch({})", prev_epoch), + ) + .instrument(future) + .left_future() + } else { + future.right_future() + } + } +} + +use await_epoch_completed_future::*; fn sync_epoch( state_store: &S, @@ -192,8 +244,6 @@ impl Display for &'_ PartialGraphManagedBarrierState { } enum InflightActorStatus { - /// The actor is just spawned and not issued any barrier yet - NotStarted, /// The actor has been issued some barriers, but has not collected the first barrier IssuedFirst(Vec), /// The actor has been issued some barriers, and has collected the first barrier @@ -201,12 +251,11 @@ enum InflightActorStatus { } impl InflightActorStatus { - fn max_issued_epoch(&self) -> Option { + fn max_issued_epoch(&self) -> u64 { match self { - InflightActorStatus::NotStarted => None, - InflightActorStatus::Running(epoch) => Some(*epoch), + InflightActorStatus::Running(epoch) => *epoch, InflightActorStatus::IssuedFirst(issued_barriers) => { - Some(issued_barriers.last().expect("non-empty").epoch.prev) + issued_barriers.last().expect("non-empty").epoch.prev } } } @@ -223,18 +272,35 @@ pub(crate) struct InflightActorState { status: InflightActorStatus, /// Whether the actor has been issued a stop barrier is_stopping: bool, + + join_handle: JoinHandle<()>, + monitor_task_handle: Option>, } impl InflightActorState { - pub(super) fn not_started(actor_id: ActorId) -> Self { + pub(super) fn start( + actor_id: ActorId, + initial_partial_graph_id: PartialGraphId, + initial_barrier: &Barrier, + join_handle: JoinHandle<()>, + monitor_task_handle: Option>, + ) -> Self { Self { actor_id, pending_subscribers: Default::default(), barrier_senders: vec![], - inflight_barriers: BTreeMap::default(), - barrier_mutations: Default::default(), - status: InflightActorStatus::NotStarted, + inflight_barriers: BTreeMap::from_iter([( + initial_barrier.epoch.prev, + initial_partial_graph_id, + )]), + barrier_mutations: BTreeMap::from_iter([( + initial_barrier.epoch.prev, + (initial_barrier.mutation.clone(), initial_barrier.epoch.curr), + )]), + status: InflightActorStatus::IssuedFirst(vec![initial_barrier.clone()]), is_stopping: false, + join_handle, + monitor_task_handle, } } @@ -263,9 +329,7 @@ impl InflightActorState { barrier: &Barrier, is_stop: bool, ) -> StreamResult<()> { - if let Some(max_issued_epoch) = self.status.max_issued_epoch() { - assert!(barrier.epoch.prev > max_issued_epoch); - } + assert!(barrier.epoch.prev > self.status.max_issued_epoch()); if let Some((first_epoch, _)) = self.pending_subscribers.first_key_value() { assert!( @@ -312,9 +376,6 @@ impl InflightActorState { } match &mut self.status { - InflightActorStatus::NotStarted => { - self.status = InflightActorStatus::IssuedFirst(vec![barrier.clone()]); - } InflightActorStatus::IssuedFirst(pending_barriers) => { pending_barriers.push(barrier.clone()); } @@ -338,9 +399,6 @@ impl InflightActorState { let (min_mutation_epoch, _) = self.barrier_mutations.pop_first().expect("should exist"); assert_eq!(min_mutation_epoch, epoch.prev); match &self.status { - InflightActorStatus::NotStarted => { - unreachable!("should have issued a barrier when collect") - } 
InflightActorStatus::IssuedFirst(pending_barriers) => { assert_eq!( prev_epoch, @@ -372,6 +430,9 @@ pub(super) struct PartialGraphManagedBarrierState { prev_barrier_table_ids: Option<(EpochPair, HashSet)>, /// Record the progress updates of creating mviews for each epoch of concurrent checkpoints. + /// + /// This is updated by [`super::CreateMviewProgressReporter::update`] and will be reported to meta + /// in [`BarrierCompleteResult`]. pub(super) create_mview_progress: HashMap>, pub(super) state_store: StateStoreImpl, @@ -416,32 +477,27 @@ impl PartialGraphManagedBarrierState { } } -pub(super) struct ManagedBarrierState { +pub(crate) struct ManagedBarrierState { pub(super) actor_states: HashMap, pub(super) graph_states: HashMap, - pub(super) state_store: StateStoreImpl, - - pub(super) streaming_metrics: Arc, + actor_manager: Arc, - /// Manages the await-trees of all barriers. - barrier_await_tree_reg: Option, + current_shared_context: Arc, } impl ManagedBarrierState { /// Create a barrier manager state. This will be called only once. pub(super) fn new( - state_store: StateStoreImpl, - streaming_metrics: Arc, - barrier_await_tree_reg: Option, + actor_manager: Arc, + current_shared_context: Arc, ) -> Self { Self { actor_states: Default::default(), graph_states: Default::default(), - state_store, - streaming_metrics, - barrier_await_tree_reg, + actor_manager, + current_shared_context, } } @@ -450,6 +506,21 @@ impl ManagedBarrierState { graph_states: &self.graph_states, } } + + pub(crate) async fn abort_actors(&mut self) { + for (actor_id, state) in &self.actor_states { + tracing::debug!("force stopping actor {}", actor_id); + state.join_handle.abort(); + if let Some(monitor_task_handle) = &state.monitor_task_handle { + monitor_task_handle.abort(); + } + } + for (actor_id, state) in self.actor_states.drain() { + tracing::debug!("join actor {}", actor_id); + let result = state.join_handle.await; + assert!(result.is_ok() || result.unwrap_err().is_cancelled()); + } + } } impl InflightActorState { @@ -485,17 +556,13 @@ impl InflightActorState { .push(tx); } } else { - // Barrier has not issued yet. Store the pending tx - if let Some(max_issued_epoch) = self.status.max_issued_epoch() { - assert!( - max_issued_epoch < start_prev_epoch, - "later barrier {} has been issued, but skip the start epoch {:?}", - max_issued_epoch, - start_prev_epoch - ); - } else { - assert!(!self.is_stopping, "actor has been stopped and has not inflight barrier. 
unlikely to get further barrier"); - } + let max_issued_epoch = self.status.max_issued_epoch(); + assert!( + max_issued_epoch < start_prev_epoch, + "later barrier {} has been issued, but skip the start epoch {:?}", + max_issued_epoch, + start_prev_epoch + ); self.pending_subscribers .entry(start_prev_epoch) .or_default() @@ -508,9 +575,6 @@ impl InflightActorState { tx: mpsc::UnboundedSender, ) -> StreamResult<()> { match &self.status { - InflightActorStatus::NotStarted => { - self.barrier_senders.push(tx); - } InflightActorStatus::IssuedFirst(pending_barriers) => { for barrier in pending_barriers { tx.send(barrier.clone()).map_err(|_| { @@ -539,8 +603,8 @@ impl ManagedBarrierState { tx: mpsc::UnboundedSender, ) { self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) + .get_mut(&actor_id) + .expect("should exist") .subscribe_actor_mutation(start_prev_epoch, tx); } @@ -550,53 +614,105 @@ impl ManagedBarrierState { tx: mpsc::UnboundedSender, ) -> StreamResult<()> { self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) + .get_mut(&actor_id) + .expect("should exist") .register_barrier_sender(tx) } pub(super) fn transform_to_issued( &mut self, barrier: &Barrier, + actors_to_build: Vec, actor_ids_to_collect: HashSet, table_ids: HashSet, partial_graph_id: PartialGraphId, actor_ids_to_pre_sync_barrier: HashSet, ) -> StreamResult<()> { let actor_to_stop = barrier.all_stop_actors(); + let is_stop_actor = |actor_id| { + actor_to_stop + .map(|actors| actors.contains(&actor_id)) + .unwrap_or(false) + }; let graph_state = self .graph_states .entry(partial_graph_id) .or_insert_with(|| { PartialGraphManagedBarrierState::new( - self.state_store.clone(), - self.streaming_metrics.clone(), - self.barrier_await_tree_reg.clone(), + self.actor_manager.env.state_store(), + self.actor_manager.streaming_metrics.clone(), + self.actor_manager.await_tree_reg.clone(), ) }); graph_state.transform_to_issued(barrier, actor_ids_to_collect.clone(), table_ids); + let mut new_actors = HashSet::new(); + for actor in actors_to_build { + let actor_id = actor.actor.as_ref().unwrap().actor_id; + assert!(!is_stop_actor(actor_id)); + assert!(new_actors.insert(actor_id)); + assert!(actor_ids_to_collect.contains(&actor_id)); + let (join_handle, monitor_join_handle) = self + .actor_manager + .spawn_actor(actor, self.current_shared_context.clone()); + assert!(self + .actor_states + .try_insert( + actor_id, + InflightActorState::start( + actor_id, + partial_graph_id, + barrier, + join_handle, + monitor_join_handle + ) + ) + .is_ok()); + } + + // Spawn a trivial join handle to be compatible with the unit test + if cfg!(test) { + for actor_id in &actor_ids_to_collect { + if !self.actor_states.contains_key(actor_id) { + let join_handle = self.actor_manager.runtime.spawn(async { pending().await }); + assert!(self + .actor_states + .try_insert( + *actor_id, + InflightActorState::start( + *actor_id, + partial_graph_id, + barrier, + join_handle, + None, + ) + ) + .is_ok()); + new_actors.insert(*actor_id); + } + } + } + // Note: it's important to issue barrier to actor after issuing to graph to ensure that // we call `start_epoch` on the graph before the actors receive the barrier - for actor_id in actor_ids_to_collect { + for actor_id in &actor_ids_to_collect { + if new_actors.contains(actor_id) { + continue; + } self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) - .issue_barrier( - partial_graph_id, - 
barrier, - actor_to_stop - .map(|actors| actors.contains(&actor_id)) - .unwrap_or(false), - )?; + .get_mut(actor_id) + .unwrap_or_else(|| { + panic!("should exist: {} {:?}", actor_id, actor_ids_to_collect); + }) + .issue_barrier(partial_graph_id, barrier, is_stop_actor(*actor_id))?; } if partial_graph_id.is_global_graph() { for actor_id in actor_ids_to_pre_sync_barrier { self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) + .get_mut(&actor_id) + .expect("should exist") .sync_barrier(barrier); } } else { @@ -610,9 +726,12 @@ impl ManagedBarrierState { ) -> impl Future + '_ { poll_fn(|cx| { for (partial_graph_id, graph_state) in &mut self.graph_states { - if let Poll::Ready(epoch) = graph_state.poll_next_completed_epoch(cx) { + if let Poll::Ready(barrier) = graph_state.poll_next_completed_barrier(cx) { + if let Some(actors_to_stop) = barrier.all_stop_actors() { + self.current_shared_context.drop_actors(actors_to_stop); + } let partial_graph_id = *partial_graph_id; - return Poll::Ready((partial_graph_id, epoch)); + return Poll::Ready((partial_graph_id, barrier.epoch.prev)); } } Poll::Pending @@ -626,7 +745,10 @@ impl ManagedBarrierState { .expect("should exist") .collect(epoch); if is_finished { - self.actor_states.remove(&actor_id); + let state = self.actor_states.remove(&actor_id).expect("should exist"); + if let Some(monitor_task_handle) = state.monitor_task_handle { + monitor_task_handle.abort(); + } } let prev_graph_state = self .graph_states @@ -677,21 +799,10 @@ impl PartialGraphManagedBarrierState { let create_mview_progress = self .create_mview_progress - .remove(&barrier_state.curr_epoch) + .remove(&barrier_state.barrier.epoch.curr) .unwrap_or_default() .into_iter() - .map(|(actor, state)| CreateMviewProgress { - backfill_actor_id: actor, - done: matches!(state, BackfillState::Done(_)), - consumed_epoch: match state { - BackfillState::ConsumingUpstream(consumed_epoch, _) => consumed_epoch, - BackfillState::Done(_) => barrier_state.curr_epoch, - }, - consumed_rows: match state { - BackfillState::ConsumingUpstream(_, consumed_rows) => consumed_rows, - BackfillState::Done(consumed_rows) => consumed_rows, - }, - }) + .map(|(actor, state)| state.to_pb(actor)) .collect(); let complete_barrier_future = match kind { @@ -724,34 +835,15 @@ impl PartialGraphManagedBarrierState { } }; + let barrier = barrier_state.barrier.clone(); + self.await_epoch_completed_futures.push_back({ - let future = async move { - if let Some(future) = complete_barrier_future { - let result = future.await; - result.map(Some) - } else { - Ok(None) - } - } - .map(move |result| { - ( - prev_epoch, - result.map(|sync_result| BarrierCompleteResult { - sync_result, - create_mview_progress, - }), - ) - }); - if let Some(reg) = &self.barrier_await_tree_reg { - reg.register( - await_tree_key::BarrierAwait { prev_epoch }, - format!("SyncEpoch({})", prev_epoch), - ) - .instrument(future) - .left_future() - } else { - future.right_future() - } + instrument_complete_barrier_future( + complete_barrier_future, + barrier, + self.barrier_await_tree_reg.as_ref(), + create_mview_progress, + ) }); } } @@ -775,7 +867,7 @@ impl PartialGraphManagedBarrierState { ) } Some(&mut BarrierState { - curr_epoch, + ref barrier, inner: ManagedBarrierStateInner::Issued(IssuedState { ref mut remaining_actors, @@ -789,7 +881,7 @@ impl PartialGraphManagedBarrierState { "the actor doesn't exist. 
actor_id: {:?}, curr_epoch: {:?}", actor_id, epoch.curr ); - assert_eq!(curr_epoch, epoch.curr); + assert_eq!(barrier.epoch.curr, epoch.curr); self.may_have_collected_all(epoch.prev); } Some(BarrierState { inner, .. }) => { @@ -871,7 +963,7 @@ impl PartialGraphManagedBarrierState { self.epoch_barrier_state_map.insert( barrier.epoch.prev, BarrierState { - curr_epoch: barrier.epoch.curr, + barrier: barrier.clone(), inner: ManagedBarrierStateInner::Issued(IssuedState { remaining_actors: BTreeSet::from_iter(actor_ids_to_collect), mutation: barrier.mutation.clone(), @@ -885,17 +977,17 @@ impl PartialGraphManagedBarrierState { } /// Return a future that yields the next completed epoch. The future is cancellation safe. - pub(crate) fn poll_next_completed_epoch(&mut self, cx: &mut Context<'_>) -> Poll { + pub(crate) fn poll_next_completed_barrier(&mut self, cx: &mut Context<'_>) -> Poll { ready!(self.await_epoch_completed_futures.next().poll_unpin(cx)) - .map(|(prev_epoch, result)| { + .map(|(barrier, result)| { let state = self .epoch_barrier_state_map - .get_mut(&prev_epoch) + .get_mut(&barrier.epoch.prev) .expect("should exist"); // sanity check on barrier state assert_matches!(&state.inner, ManagedBarrierStateInner::AllCollected); state.inner = ManagedBarrierStateInner::Completed(result); - prev_epoch + barrier }) .map(Poll::Ready) .unwrap_or(Poll::Pending) @@ -941,9 +1033,12 @@ impl PartialGraphManagedBarrierState { #[cfg(test)] async fn pop_next_completed_epoch(&mut self) -> u64 { - let epoch = poll_fn(|cx| self.poll_next_completed_epoch(cx)).await; - let _ = self.pop_completed_epoch(epoch).unwrap().unwrap(); - epoch + let barrier = poll_fn(|cx| self.poll_next_completed_barrier(cx)).await; + let _ = self + .pop_completed_epoch(barrier.epoch.prev) + .unwrap() + .unwrap(); + barrier.epoch.prev } } diff --git a/src/stream/src/task/barrier_manager/progress.rs b/src/stream/src/task/barrier_manager/progress.rs index 9a243c2e975d1..9b2820bb3bfed 100644 --- a/src/stream/src/task/barrier_manager/progress.rs +++ b/src/stream/src/task/barrier_manager/progress.rs @@ -15,6 +15,7 @@ use std::fmt::{Display, Formatter}; use risingwave_common::util::epoch::EpochPair; +use risingwave_pb::stream_service::barrier_complete_response::PbCreateMviewProgress; use super::LocalBarrierManager; use crate::task::barrier_manager::LocalBarrierEvent::ReportCreateProgress; @@ -30,6 +31,23 @@ pub(crate) enum BackfillState { Done(ConsumedRows), } +impl BackfillState { + pub fn to_pb(self, actor_id: ActorId) -> PbCreateMviewProgress { + PbCreateMviewProgress { + backfill_actor_id: actor_id, + done: matches!(self, BackfillState::Done(_)), + consumed_epoch: match self { + BackfillState::ConsumingUpstream(consumed_epoch, _) => consumed_epoch, + BackfillState::Done(_) => 0, // unused field for done + }, + consumed_rows: match self { + BackfillState::ConsumingUpstream(_, consumed_rows) => consumed_rows, + BackfillState::Done(consumed_rows) => consumed_rows, + }, + } + } +} + impl Display for BackfillState { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { @@ -103,7 +121,7 @@ impl LocalBarrierManager { /// TODO(kwannoel): Perhaps it is possible to get total key count of the replicated state table /// for arrangement backfill. We can use that to estimate the progress as well, and avoid recording /// `row_count` state for it. -pub struct CreateMviewProgress { +pub struct CreateMviewProgressReporter { barrier_manager: LocalBarrierManager, /// The id of the actor containing the backfill executors. 
@@ -112,7 +130,7 @@ pub struct CreateMviewProgress { state: Option, } -impl CreateMviewProgress { +impl CreateMviewProgressReporter { pub fn new(barrier_manager: LocalBarrierManager, backfill_actor_id: ActorId) -> Self { Self { barrier_manager, @@ -186,8 +204,8 @@ impl LocalBarrierManager { pub fn register_create_mview_progress( &self, backfill_actor_id: ActorId, - ) -> CreateMviewProgress { + ) -> CreateMviewProgressReporter { trace!("register create mview progress: {}", backfill_actor_id); - CreateMviewProgress::new(self.clone(), backfill_actor_id) + CreateMviewProgressReporter::new(self.clone(), backfill_actor_id) } } diff --git a/src/stream/src/task/barrier_manager/tests.rs b/src/stream/src/task/barrier_manager/tests.rs index d6a8256aebb61..112ee533d8e6d 100644 --- a/src/stream/src/task/barrier_manager/tests.rs +++ b/src/stream/src/task/barrier_manager/tests.rs @@ -40,19 +40,22 @@ async fn test_managed_barrier_collection() -> StreamResult<()> { // Register actors let actor_ids = vec![233, 234, 235]; - let count = actor_ids.len(); - let mut rxs = actor_ids - .clone() - .into_iter() - .map(register_sender) - .collect_vec(); // Send a barrier to all actors let curr_epoch = test_epoch(2); let barrier = Barrier::new_test_barrier(curr_epoch); let epoch = barrier.epoch.prev; - test_env.inject_barrier(&barrier, actor_ids); + test_env.inject_barrier(&barrier, actor_ids.clone()); + + manager.flush_all_events().await; + + let count = actor_ids.len(); + let mut rxs = actor_ids + .clone() + .into_iter() + .map(register_sender) + .collect_vec(); // Collect barriers from actors let collected_barriers = join_all(rxs.iter_mut().map(|(actor_id, rx)| async move { @@ -105,6 +108,14 @@ async fn test_managed_barrier_collection_separately() -> StreamResult<()> { .chain(once(extra_actor_id)) .collect_vec(); + // Prepare the barrier + let curr_epoch = test_epoch(2); + let barrier = Barrier::new_test_barrier(curr_epoch).with_stop(); + + test_env.inject_barrier(&barrier, actor_ids_to_collect.clone()); + + manager.flush_all_events().await; + // Register actors let count = actor_ids_to_send.len(); let mut rxs = actor_ids_to_send @@ -113,10 +124,6 @@ async fn test_managed_barrier_collection_separately() -> StreamResult<()> { .map(register_sender) .collect_vec(); - // Prepare the barrier - let curr_epoch = test_epoch(2); - let barrier = Barrier::new_test_barrier(curr_epoch).with_stop(); - let mut mutation_subscriber = manager.subscribe_barrier_mutation(extra_actor_id, &barrier.clone().into_dispatcher()); @@ -124,8 +131,6 @@ async fn test_managed_barrier_collection_separately() -> StreamResult<()> { let mut mutation_reader = pin!(mutation_subscriber.recv()); assert!(poll_fn(|cx| Poll::Ready(mutation_reader.as_mut().poll(cx).is_pending())).await); - test_env.inject_barrier(&barrier, actor_ids_to_collect); - let (epoch, mutation) = mutation_reader.await.unwrap(); assert_eq!((epoch, &mutation), (barrier.epoch.prev, &barrier.mutation)); @@ -196,6 +201,8 @@ async fn test_late_register_barrier_sender() -> StreamResult<()> { test_env.inject_barrier(&barrier1, actor_ids_to_collect.clone()); test_env.inject_barrier(&barrier2, actor_ids_to_collect.clone()); + manager.flush_all_events().await; + // register sender after inject barrier let mut rxs = actor_ids_to_send .clone() diff --git a/src/stream/src/task/mod.rs b/src/stream/src/task/mod.rs index b5382b3418052..59851fdf09ad8 100644 --- a/src/stream/src/task/mod.rs +++ b/src/stream/src/task/mod.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing 
permissions and // limitations under the License. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use anyhow::anyhow; use parking_lot::{MappedMutexGuard, Mutex, MutexGuard, RwLock}; @@ -194,7 +194,7 @@ impl SharedContext { &self.config } - pub fn drop_actors(&self, actors: &[ActorId]) { + pub(super) fn drop_actors(&self, actors: &HashSet) { self.channel_map .lock() .retain(|(up_id, _), _| !actors.contains(up_id)); diff --git a/src/stream/src/task/stream_manager.rs b/src/stream/src/task/stream_manager.rs index 60b7341371497..ba76e6fab791d 100644 --- a/src/stream/src/task/stream_manager.rs +++ b/src/stream/src/task/stream_manager.rs @@ -19,7 +19,6 @@ use std::sync::atomic::AtomicU64; use std::sync::Arc; use std::time::Instant; -use anyhow::anyhow; use async_recursion::async_recursion; use await_tree::InstrumentAwait; use futures::stream::BoxStream; @@ -59,8 +58,8 @@ use crate::task::barrier_manager::{ ControlStreamHandle, EventSender, LocalActorOperation, LocalBarrierWorker, }; use crate::task::{ - ActorId, FragmentId, LocalBarrierManager, SharedContext, StreamActorManager, - StreamActorManagerState, StreamEnvironment, UpDownActorIds, + ActorId, FragmentId, LocalBarrierManager, SharedContext, StreamActorManager, StreamEnvironment, + UpDownActorIds, }; #[cfg(test)] @@ -214,16 +213,6 @@ impl LocalStreamManager { }) } - /// Drop the resources of the given actors. - pub async fn drop_actors(&self, actors: Vec) -> StreamResult<()> { - self.actor_op_tx - .send_and_await(|result_sender| LocalActorOperation::DropActors { - actors, - result_sender, - }) - .await - } - pub async fn take_receiver(&self, ids: UpDownActorIds) -> StreamResult { self.actor_op_tx .send_and_await(|result_sender| LocalActorOperation::TakeReceiver { @@ -256,28 +245,9 @@ impl LocalStreamManager { } impl LocalBarrierWorker { - /// Drop the resources of the given actors. - pub(super) fn drop_actors(&mut self, actors: &[ActorId]) { - self.current_shared_context.drop_actors(actors); - for &id in actors { - self.actor_manager_state.drop_actor(id); - } - tracing::debug!(actors = ?actors, "drop actors"); - } - /// Force stop all actors on this worker, and then drop their resources. pub(super) async fn reset(&mut self, version_id: HummockVersionId) { - let actor_handles = self.actor_manager_state.drain_actor_handles(); - for (actor_id, handle) in &actor_handles { - tracing::debug!("force stopping actor {}", actor_id); - handle.abort(); - } - for (actor_id, handle) in actor_handles { - tracing::debug!("join actor {}", actor_id); - let result = handle.await; - assert!(result.is_ok() || result.unwrap_err().is_cancelled()); - } - self.actor_manager_state.clear_state(); + self.state.abort_actors().await; if let Some(m) = self.actor_manager.await_tree_reg.as_ref() { m.clear(); } @@ -291,26 +261,6 @@ impl LocalBarrierWorker { self.reset_state(); self.actor_manager.env.dml_manager_ref().clear(); } - - pub(super) fn update_actors(&mut self, actors: Vec) -> StreamResult<()> { - self.actor_manager_state.update_actors(actors) - } - - /// This function could only be called once during the lifecycle of `LocalStreamManager` for - /// now. 
- pub(super) fn start_create_actors(&mut self, actors: &[ActorId]) -> StreamResult<()> { - let actors: Vec<_> = actors - .iter() - .map(|actor_id| { - self.actor_manager_state - .actors - .remove(actor_id) - .ok_or_else(|| anyhow!("No such actor with actor id:{}", actor_id)) - }) - .try_collect()?; - self.spawn_actors(actors); - Ok(()) - } } impl StreamActorManager { @@ -559,18 +509,22 @@ impl StreamActorManager { } } -impl LocalBarrierWorker { - pub(super) fn spawn_actors(&mut self, actors: Vec) { - for actor in actors { +impl StreamActorManager { + pub(super) fn spawn_actor( + self: &Arc, + actor: BuildActorInfo, + current_shared_context: Arc, + ) -> (JoinHandle<()>, Option>) { + { let monitor = tokio_metrics::TaskMonitor::new(); let stream_actor_ref = actor.actor.as_ref().unwrap(); let actor_id = stream_actor_ref.actor_id; let handle = { let trace_span = format!("Actor {actor_id}: `{}`", stream_actor_ref.mview_definition); - let barrier_manager = self.current_shared_context.local_barrier_manager.clone(); + let barrier_manager = current_shared_context.local_barrier_manager.clone(); // wrap the future of `create_actor` with `boxed` to avoid stack overflow - let actor = self.actor_manager.clone().create_actor(actor, self.current_shared_context.clone()).boxed().and_then(|actor| actor.run()).map(move |result| { + let actor = self.clone().create_actor(actor, current_shared_context).boxed().and_then(|actor| actor.run()).map(move |result| { if let Err(err) = result { // TODO: check error type and panic if it's unexpected. // Intentionally use `?` on the report to also include the backtrace. @@ -578,7 +532,7 @@ impl LocalBarrierWorker { barrier_manager.notify_failure(actor_id, err); } }); - let traced = match &self.actor_manager.await_tree_reg { + let traced = match &self.await_tree_reg { Some(m) => m .register(await_tree_key::Actor(actor_id), trace_span) .instrument(actor) @@ -586,24 +540,17 @@ impl LocalBarrierWorker { None => actor.right_future(), }; let instrumented = monitor.instrument(traced); - let with_config = - crate::CONFIG.scope(self.actor_manager.env.config().clone(), instrumented); + let with_config = crate::CONFIG.scope(self.env.config().clone(), instrumented); - self.actor_manager.runtime.spawn(with_config) + self.runtime.spawn(with_config) }; - self.actor_manager_state.handles.insert(actor_id, handle); - - if self.actor_manager.streaming_metrics.level >= MetricLevel::Debug - || self - .actor_manager - .env - .config() - .developer - .enable_actor_tokio_metrics + + let monitor_handle = if self.streaming_metrics.level >= MetricLevel::Debug + || self.env.config().developer.enable_actor_tokio_metrics { tracing::info!("Tokio metrics are enabled."); - let streaming_metrics = self.actor_manager.streaming_metrics.clone(); - let actor_monitor_task = self.actor_manager.runtime.spawn(async move { + let streaming_metrics = self.streaming_metrics.clone(); + let actor_monitor_task = self.runtime.spawn(async move { let metrics = streaming_metrics.new_actor_metrics(actor_id); loop { let task_metrics = monitor.cumulative(); @@ -643,10 +590,11 @@ impl LocalBarrierWorker { tokio::time::sleep(Duration::from_secs(1)).await; } }); - self.actor_manager_state - .actor_monitor_tasks - .insert(actor_id, actor_monitor_task); - } + Some(actor_monitor_task) + } else { + None + }; + (handle, monitor_handle) } } } @@ -671,44 +619,6 @@ impl LocalBarrierWorker { } } -impl StreamActorManagerState { - /// `drop_actor` is invoked by meta node via RPC once the stop barrier arrives at the - /// sink. 
All the actors in the actors should stop themselves before this method is invoked. - fn drop_actor(&mut self, actor_id: ActorId) { - self.actor_monitor_tasks - .remove(&actor_id) - .inspect(|handle| handle.abort()); - self.actors.remove(&actor_id); - - // Task should have already stopped when this method is invoked. There might be some - // clean-up work left (like dropping in-memory data structures), but we don't have to wait - // for them to finish, in order to make this request non-blocking. - self.handles.remove(&actor_id); - } - - fn drain_actor_handles(&mut self) -> Vec<(ActorId, ActorHandle)> { - self.handles.drain().collect() - } - - /// `stop_all_actors` is invoked by meta node via RPC for recovery purpose. Different from the - /// `drop_actor`, the execution of the actors will be aborted. - fn clear_state(&mut self) { - self.actors.clear(); - self.actor_monitor_tasks.clear(); - } - - fn update_actors(&mut self, actors: Vec) -> StreamResult<()> { - for actor in actors { - let actor_id = actor.actor.as_ref().unwrap().get_actor_id(); - self.actors - .try_insert(actor_id, actor) - .map_err(|_| anyhow!("duplicated actor {}", actor_id))?; - } - - Ok(()) - } -} - #[cfg(test)] pub mod test_utils { use risingwave_pb::common::HostAddress; diff --git a/src/tests/simulation/Cargo.toml b/src/tests/simulation/Cargo.toml index 8729207c0d025..c82f2b7d5911e 100644 --- a/src/tests/simulation/Cargo.toml +++ b/src/tests/simulation/Cargo.toml @@ -25,6 +25,7 @@ glob = "0.3" itertools = { workspace = true } lru = { workspace = true } madsim = "0.2.30" +maplit = "1" paste = "1" pin-project = "1.1" pretty_assertions = "1" diff --git a/src/tests/simulation/src/cluster.rs b/src/tests/simulation/src/cluster.rs index 26fdc3a8757e1..a9ffba0063562 100644 --- a/src/tests/simulation/src/cluster.rs +++ b/src/tests/simulation/src/cluster.rs @@ -158,27 +158,16 @@ impl Configuration { /// Provides a configuration for scale test which ensures that the arrangement backfill is disabled, /// so table scan will use `no_shuffle`. pub fn for_scale_no_shuffle() -> Self { - // Embed the config file and create a temporary file at runtime. The file will be deleted - // automatically when it's dropped. - let config_path = { - let mut file = - tempfile::NamedTempFile::new().expect("failed to create temp config file"); - file.write_all(include_bytes!("risingwave-scale.toml")) - .expect("failed to write config file"); - file.into_temp_path() - }; + let mut conf = Self::for_scale(); + conf.per_session_queries = + vec!["SET STREAMING_USE_ARRANGEMENT_BACKFILL = false;".into()].into(); + conf + } - Configuration { - config_path: ConfigPath::Temp(config_path.into()), - frontend_nodes: 2, - compute_nodes: 3, - meta_nodes: 3, - compactor_nodes: 2, - compute_node_cores: 2, - per_session_queries: vec!["SET STREAMING_USE_ARRANGEMENT_BACKFILL = false;".into()] - .into(), - ..Default::default() - } + pub fn for_scale_shared_source() -> Self { + let mut conf = Self::for_scale(); + conf.per_session_queries = vec!["SET RW_ENABLE_SHARED_SOURCE = true;".into()].into(); + conf } pub fn for_auto_parallelism( diff --git a/src/tests/simulation/src/ctl_ext.rs b/src/tests/simulation/src/ctl_ext.rs index 9b57673e49c16..3986a826e21e7 100644 --- a/src/tests/simulation/src/ctl_ext.rs +++ b/src/tests/simulation/src/ctl_ext.rs @@ -12,9 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
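The `Configuration::for_scale_no_shuffle` change above swaps a duplicated config literal for a derive-from-base pattern, and the new `for_scale_shared_source` follows the same shape: start from `for_scale()` and override only `per_session_queries`. A rough sketch of the idea, assuming a simplified `Configuration` with just a few of the real fields:

```rust
// Hedged sketch: the real Configuration also carries the config file path,
// meta/compactor node counts, etc.; only the pattern is shown here.
#[derive(Clone, Debug, Default)]
struct Configuration {
    frontend_nodes: usize,
    compute_nodes: usize,
    per_session_queries: Vec<String>,
}

impl Configuration {
    // Base scale-test configuration.
    fn for_scale() -> Self {
        Self {
            frontend_nodes: 2,
            compute_nodes: 3,
            ..Default::default()
        }
    }

    // Variants differ only in the session settings they apply.
    fn for_scale_no_shuffle() -> Self {
        let mut conf = Self::for_scale();
        conf.per_session_queries =
            vec!["SET STREAMING_USE_ARRANGEMENT_BACKFILL = false;".into()];
        conf
    }

    fn for_scale_shared_source() -> Self {
        let mut conf = Self::for_scale();
        conf.per_session_queries = vec!["SET RW_ENABLE_SHARED_SOURCE = true;".into()];
        conf
    }
}

fn main() {
    println!("{:#?}", Configuration::for_scale_shared_source());
}
```

The upside is that the cluster shape stays defined once in `for_scale()`, so the scale-test variants differ only in their per-session settings.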
-#![cfg_attr(not(madsim), expect(unused_imports))] - -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::ffi::OsString; use std::fmt::Write; use std::sync::Arc; @@ -23,17 +21,17 @@ use anyhow::{anyhow, Result}; use cfg_or_panic::cfg_or_panic; use clap::Parser; use itertools::Itertools; -use rand::seq::{IteratorRandom, SliceRandom}; +use rand::seq::IteratorRandom; use rand::{thread_rng, Rng}; use risingwave_common::catalog::TableId; use risingwave_common::hash::WorkerSlotId; +use risingwave_connector::source::{SplitImpl, SplitMetaData}; use risingwave_hummock_sdk::{CompactionGroupId, HummockSstableId}; use risingwave_pb::meta::table_fragments::fragment::FragmentDistributionType; use risingwave_pb::meta::table_fragments::PbFragment; use risingwave_pb::meta::update_worker_node_schedulability_request::Schedulability; use risingwave_pb::meta::GetClusterInfoResponse; use risingwave_pb::stream_plan::StreamNode; -use serde::de::IntoDeserializer; use self::predicate::BoxedPredicate; use crate::cluster::Cluster; @@ -76,7 +74,7 @@ pub mod predicate { Box::new(p) } - /// There exists operators whose identity contains `s` in the fragment. + /// There exists operators whose identity contains `s` in the fragment (case insensitive). pub fn identity_contains(s: impl Into) -> BoxedPredicate { let s: String = s.into(); let p = move |f: &PbFragment| { @@ -363,6 +361,30 @@ impl Cluster { Ok(response) } + /// `table_id -> actor_id -> splits` + pub async fn list_source_splits(&self) -> Result>> { + let info = self.get_cluster_info().await?; + let mut res = BTreeMap::new(); + + for table in info.table_fragments { + let mut table_actor_splits = BTreeMap::new(); + + for (actor_id, splits) in table.actor_splits { + let splits = splits + .splits + .iter() + .map(|split| SplitImpl::try_from(split).unwrap()) + .map(|split| split.id()) + .collect_vec() + .join(","); + table_actor_splits.insert(actor_id, splits); + } + res.insert(table.table_id, table_actor_splits); + } + + Ok(res) + } + // update node schedulability #[cfg_or_panic(madsim)] async fn update_worker_node_schedulability( diff --git a/src/tests/simulation/src/lib.rs b/src/tests/simulation/src/lib.rs index aa6303b8e2f65..af9cf158a3350 100644 --- a/src/tests/simulation/src/lib.rs +++ b/src/tests/simulation/src/lib.rs @@ -13,7 +13,6 @@ // limitations under the License. #![feature(trait_alias)] -#![feature(lint_reasons)] #![feature(let_chains)] #![feature(try_blocks)] #![feature(register_tool)] diff --git a/src/tests/simulation/tests/integration_tests/scale/mod.rs b/src/tests/simulation/tests/integration_tests/scale/mod.rs index f6940f072409e..3c7a702dc6290 100644 --- a/src/tests/simulation/tests/integration_tests/scale/mod.rs +++ b/src/tests/simulation/tests/integration_tests/scale/mod.rs @@ -20,6 +20,7 @@ mod nexmark_q4; mod nexmark_source; mod no_shuffle; mod schedulability; +mod shared_source; mod singleton_migration; mod sink; mod streaming_parallelism; diff --git a/src/tests/simulation/tests/integration_tests/scale/shared_source.rs b/src/tests/simulation/tests/integration_tests/scale/shared_source.rs new file mode 100644 index 0000000000000..175b3a043100c --- /dev/null +++ b/src/tests/simulation/tests/integration_tests/scale/shared_source.rs @@ -0,0 +1,192 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::BTreeMap; + +use anyhow::Result; +use itertools::Itertools; +use maplit::{convert_args, hashmap}; +use risingwave_common::hash::WorkerSlotId; +use risingwave_pb::meta::table_fragments::Fragment; +use risingwave_simulation::cluster::{Cluster, Configuration}; +use risingwave_simulation::ctl_ext::predicate::{identity_contains, no_identity_contains}; + +const CREATE_SOURCE: &str = r#" +CREATE SOURCE s(v1 int, v2 varchar) WITH ( + connector='kafka', + properties.bootstrap.server='192.168.11.1:29092', + topic='shared_source' +) FORMAT PLAIN ENCODE JSON;"#; + +fn actor_upstream(fragment: &Fragment) -> Vec<(u32, Vec)> { + fragment + .actors + .iter() + .map(|actor| (actor.actor_id, actor.upstream_actor_id.clone())) + .collect_vec() +} + +async fn validate_splits_aligned(cluster: &mut Cluster) -> Result<()> { + let source_backfill_fragment = cluster + .locate_one_fragment([identity_contains("StreamSourceScan")]) + .await?; + // The result of scaling is non-deterministic. + // So we just print the result here, instead of asserting with a fixed value. + let actor_upstream = actor_upstream(&source_backfill_fragment.inner); + tracing::info!( + "{}", + actor_upstream + .iter() + .format_with("\n", |(actor_id, upstream), f| f(&format_args!( + "{} <- {:?}", + actor_id, upstream + ))) + ); + let splits = cluster.list_source_splits().await?; + tracing::info!("{:#?}", splits); + let actor_splits: BTreeMap = splits + .values() + .flat_map(|m| m.clone().into_iter()) + .collect(); + for (actor, upstream) in actor_upstream { + assert!(upstream.len() == 1, "invalid upstream: {:?}", upstream); + let upstream_actor = upstream[0]; + assert_eq!( + actor_splits.get(&actor).unwrap(), + actor_splits.get(&upstream_actor).unwrap() + ); + } + Ok(()) +} + +#[tokio::test] +async fn test_shared_source() -> Result<()> { + tracing_subscriber::fmt::Subscriber::builder() + .with_max_level(tracing::Level::ERROR) + .with_env_filter("risingwave_stream::executor::source::source_backfill_executor=DEBUG,integration_tests=DEBUG") + .init(); + + let mut cluster = Cluster::start(Configuration::for_scale_shared_source()).await?; + cluster.create_kafka_topics(convert_args!(hashmap!( + "shared_source" => 4, + ))); + let mut session = cluster.start_session(); + + session.run("set rw_implicit_flush = true;").await?; + + session.run(CREATE_SOURCE).await?; + session + .run("create materialized view mv as select count(*) from s group by v1;") + .await?; + let source_fragment = cluster + .locate_one_fragment([ + identity_contains("Source"), + no_identity_contains("StreamSourceScan"), + ]) + .await?; + let source_workers = source_fragment.all_worker_count().into_keys().collect_vec(); + let source_backfill_fragment = cluster + .locate_one_fragment([identity_contains("StreamSourceScan")]) + .await?; + let source_backfill_workers = source_backfill_fragment + .all_worker_count() + .into_keys() + .collect_vec(); + let hash_agg_fragment = cluster + .locate_one_fragment([identity_contains("hashagg")]) + .await?; + let hash_agg_workers = hash_agg_fragment + .all_worker_count() + .into_keys() + .collect_vec(); + 
validate_splits_aligned(&mut cluster).await?; + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 6 + 2 3 HASH {4,3} {3} {MVIEW} 6 + 3 3 HASH {5} {1} {SOURCE_SCAN} 6"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + expect_test::expect![[r#" + 1 CREATED ADAPTIVE + 3 CREATED ADAPTIVE"#]] + .assert_eq(&cluster.run("select * from rw_table_fragments;").await?); + + // SourceBackfill cannot be scaled because of NoShuffle. + assert!( + &cluster + .reschedule( + source_backfill_fragment + .reschedule([WorkerSlotId::new(source_backfill_workers[0], 0)], []), + ) + .await.unwrap_err().to_string().contains("rescheduling NoShuffle downstream fragment (maybe Chain fragment) is forbidden, please use NoShuffle upstream fragment (like Materialized fragment) to scale"), + ); + + // hash agg can be scaled independently + cluster + .reschedule(hash_agg_fragment.reschedule([WorkerSlotId::new(hash_agg_workers[0], 0)], [])) + .await + .unwrap(); + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 6 + 2 3 HASH {4,3} {3} {MVIEW} 5 + 3 3 HASH {5} {1} {SOURCE_SCAN} 6"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + + // source is the NoShuffle upstream. It can be scaled, and the downstream SourceBackfill will be scaled together. + cluster + .reschedule(source_fragment.reschedule( + [ + WorkerSlotId::new(source_workers[0], 0), + WorkerSlotId::new(source_workers[0], 1), + WorkerSlotId::new(source_workers[2], 0), + ], + [], + )) + .await + .unwrap(); + validate_splits_aligned(&mut cluster).await?; + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 3 + 2 3 HASH {4,3} {3} {MVIEW} 5 + 3 3 HASH {5} {1} {SOURCE_SCAN} 3"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + expect_test::expect![[r#" + 1 CREATED CUSTOM + 3 CREATED CUSTOM"#]] + .assert_eq(&cluster.run("select * from rw_table_fragments;").await?); + + // resolve_no_shuffle for backfill fragment is OK, which will scale the upstream together. + cluster + .reschedule_resolve_no_shuffle(source_backfill_fragment.reschedule( + [], + [ + WorkerSlotId::new(source_workers[0], 0), + WorkerSlotId::new(source_workers[0], 1), + WorkerSlotId::new(source_workers[2], 0), + WorkerSlotId::new(source_workers[2], 1), + ], + )) + .await + .unwrap(); + validate_splits_aligned(&mut cluster).await?; + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 7 + 2 3 HASH {4,3} {3} {MVIEW} 5 + 3 3 HASH {5} {1} {SOURCE_SCAN} 7"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + expect_test::expect![[r#" +1 CREATED CUSTOM +3 CREATED CUSTOM"#]] + .assert_eq(&cluster.run("select * from rw_table_fragments;").await?); + Ok(()) +} diff --git a/src/utils/futures_util/src/lib.rs b/src/utils/futures_util/src/lib.rs index 4d086951dbb5f..115da2e7676f9 100644 --- a/src/utils/futures_util/src/lib.rs +++ b/src/utils/futures_util/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(lint_reasons)] - use std::future::Future; use futures::stream::TryStream; diff --git a/src/utils/iter_util/src/lib.rs b/src/utils/iter_util/src/lib.rs index 58758c64a1ce5..92f19a0ee46fc 100644 --- a/src/utils/iter_util/src/lib.rs +++ b/src/utils/iter_util/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
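The property that `validate_splits_aligned` re-checks after each reschedule above is that every `SourceBackfill` actor reads exactly the splits assigned to its single NoShuffle upstream `Source` actor. A standalone sketch of that check, with made-up actor ids and split strings standing in for the cluster queries:

```rust
use std::collections::BTreeMap;

// Each backfill actor must have exactly one NoShuffle upstream and must see
// the same split assignment as that upstream source actor.
fn validate_splits_aligned(
    actor_upstream: &[(u32, Vec<u32>)],
    actor_splits: &BTreeMap<u32, String>,
) {
    for (actor, upstream) in actor_upstream {
        assert_eq!(upstream.len(), 1, "invalid upstream: {upstream:?}");
        let upstream_actor = upstream[0];
        assert_eq!(
            actor_splits.get(actor).unwrap(),
            actor_splits.get(&upstream_actor).unwrap(),
            "actor {actor} and its upstream {upstream_actor} read different splits",
        );
    }
}

fn main() {
    // Hypothetical layout: source actors 2 and 3 own the splits,
    // backfill actors 5 and 6 mirror them one-to-one.
    let actor_upstream = vec![(5, vec![2]), (6, vec![3])];
    let actor_splits = BTreeMap::from([
        (2, "split-0,split-1".to_string()),
        (5, "split-0,split-1".to_string()),
        (3, "split-2".to_string()),
        (6, "split-2".to_string()),
    ]);
    validate_splits_aligned(&actor_upstream, &actor_splits);
    println!("splits aligned");
}
```

Because the NoShuffle edge forces the backfill actors to move together with the source actors, this alignment has to hold after every reschedule the test performs.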
-#![feature(lint_reasons)]
-
 pub trait ZipEqFast<B: IntoIterator>: ExactSizeIterator + Sized
 where
     B::IntoIter: ExactSizeIterator,
diff --git a/src/utils/local_stats_alloc/src/lib.rs b/src/utils/local_stats_alloc/src/lib.rs
index 3950d0cb4931e..94265768815c2 100644
--- a/src/utils/local_stats_alloc/src/lib.rs
+++ b/src/utils/local_stats_alloc/src/lib.rs
@@ -13,7 +13,6 @@
 // limitations under the License.

 #![feature(allocator_api)]
-#![feature(lint_reasons)]

 use std::alloc::Allocator;
 use std::ops::Deref;
diff --git a/src/utils/pgwire/src/lib.rs b/src/utils/pgwire/src/lib.rs
index 8d1c00541bb95..fae5489e81097 100644
--- a/src/utils/pgwire/src/lib.rs
+++ b/src/utils/pgwire/src/lib.rs
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#![feature(lint_reasons)]
 #![feature(trait_alias)]
 #![feature(iterator_try_collect)]
 #![feature(trusted_len)]
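Finally, the `#![feature(lint_reasons)]` removals across `futures_util`, `iter_util`, `local_stats_alloc`, and `pgwire` line up with the toolchain bump to `nightly-2024-07-19`: once `lint_reasons` is available without a gate, the crate-level attribute is redundant while `reason`-annotated lints keep working. A small illustrative example with hypothetical functions (not taken from the codebase):

```rust
// No `#![feature(lint_reasons)]` needed on a toolchain where the feature is
// available by default; these attributes still compile and behave the same.

#[expect(dead_code, reason = "kept around for a planned follow-up")]
fn unused_helper() {}

#[allow(clippy::needless_return, reason = "explicit return reads clearer here")]
fn answer() -> i32 {
    return 42;
}

fn main() {
    println!("{}", answer());
}
```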