From f804ed09529340f5131484956a6db328ff5c5113 Mon Sep 17 00:00:00 2001 From: Noel Kwan <47273164+kwannoel@users.noreply.github.com> Date: Tue, 23 Jan 2024 19:09:11 +0800 Subject: [PATCH] feat(object_store): add retry for `Minio` `SlowDown` and `TooManyRequests` errors (#14739) --- Makefile.toml | 5 +++ ci/scripts/backfill-test.sh | 2 +- ci/scripts/run-backfill-tests.sh | 55 +++++++++++++++++++------------ ci/workflows/main-cron.yml | 2 +- ci/workflows/pull-request.yml | 2 +- risedev.yml | 50 ++++++++++++++++++++++++++++ src/common/src/config.rs | 34 ++++++++++++++++--- src/config/ci.toml | 2 +- src/config/example.toml | 4 +++ src/object_store/src/object/s3.rs | 30 +++++++++++++---- 10 files changed, 151 insertions(+), 35 deletions(-) diff --git a/Makefile.toml b/Makefile.toml index 790564671f358..983b304d74e51 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -1305,6 +1305,11 @@ command = "target/${BUILD_MODE_DIR}/risedev-dev" args = ["${@}"] description = "Clean data and start a full RisingWave dev cluster using risedev-dev" +[tasks.ci-kill-no-dump-logs] +category = "RiseDev - CI" +dependencies = ["k", "check-logs", "wait-processes-exit"] +description = "Kill cluster and check logs, do not dump logs" + [tasks.ci-kill] category = "RiseDev - CI" dependencies = ["k", "l", "check-logs", "wait-processes-exit"] diff --git a/ci/scripts/backfill-test.sh b/ci/scripts/backfill-test.sh index 056db77c842a4..4769d7c5d229d 100755 --- a/ci/scripts/backfill-test.sh +++ b/ci/scripts/backfill-test.sh @@ -32,4 +32,4 @@ download_and_prepare_rw "$profile" source ################ TESTS -profile=$profile ./ci/scripts/run-backfill-tests.sh +BUILDKITE=${BUILDKITE:-} profile=$profile ./ci/scripts/run-backfill-tests.sh diff --git a/ci/scripts/run-backfill-tests.sh b/ci/scripts/run-backfill-tests.sh index 0fb97b978e67c..e809053f9f376 100755 --- a/ci/scripts/run-backfill-tests.sh +++ b/ci/scripts/run-backfill-tests.sh @@ -23,12 +23,17 @@ BACKGROUND_DDL_DIR=$TEST_DIR/background_ddl COMMON_DIR=$BACKGROUND_DDL_DIR/common CLUSTER_PROFILE='ci-1cn-1fe-kafka-with-recovery' +echo "--- Configuring cluster profiles" if [[ -n "${BUILDKITE:-}" ]]; then - RUNTIME_CLUSTER_PROFILE='ci-3cn-1fe-with-monitoring' -else + echo "Running in buildkite" RUNTIME_CLUSTER_PROFILE='ci-3cn-1fe' + MINIO_RATE_LIMIT_CLUSTER_PROFILE='ci-3cn-1fe-with-minio-rate-limit' +else + echo "Running locally" + RUNTIME_CLUSTER_PROFILE='ci-3cn-1fe-with-monitoring' + MINIO_RATE_LIMIT_CLUSTER_PROFILE='ci-3cn-1fe-with-monitoring-and-minio-rate-limit' fi -export RUST_LOG="info,risingwave_meta::barrier::progress=debug,risingwave_meta::rpc::ddl_controller=debug" +export RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \ run_sql_file() { psql -h localhost -p 4566 -d dev -U root -f "$@" @@ -60,8 +65,8 @@ rename_logs_with_prefix() { } kill_cluster() { - cargo make kill - cargo make wait-processes-exit + cargo make ci-kill-no-dump-logs + wait } restart_cluster() { @@ -150,7 +155,6 @@ test_backfill_tombstone() { ./risedev psql -c "CREATE MATERIALIZED VIEW m1 as select * from tomb;" echo "--- Kill cluster" kill_cluster - cargo make wait-processes-exit wait } @@ -171,9 +175,7 @@ test_replication_with_column_pruning() { run_sql_file "$PARENT_PATH"/sql/backfill/replication_with_column_pruning/select.sql , } impl SystemConfig { @@ -1526,8 +1546,14 @@ pub mod default { DEFAULT_RETRY_MAX_ATTEMPTS } - pub fn retry_unknown_service_error() -> bool { - false + pub mod developer { + pub fn object_store_retry_unknown_service_error() -> bool { + false + } + + pub fn object_store_retryable_service_error_codes() -> Vec { + vec!["SlowDown".into(), "TooManyRequests".into()] + } } } } diff --git a/src/config/ci.toml b/src/config/ci.toml index 51793d3dba698..db207ebf44412 100644 --- a/src/config/ci.toml +++ b/src/config/ci.toml @@ -19,4 +19,4 @@ imm_merge_threshold = 2 [system] barrier_interval_ms = 250 checkpoint_frequency = 5 -max_concurrent_creating_streaming_jobs = 0 +max_concurrent_creating_streaming_jobs = 0 \ No newline at end of file diff --git a/src/config/example.toml b/src/config/example.toml index 413321d6ff3ec..914cfe63889d4 100644 --- a/src/config/example.toml +++ b/src/config/example.toml @@ -181,6 +181,10 @@ object_store_req_retry_max_delay_ms = 10000 object_store_req_retry_max_attempts = 8 retry_unknown_service_error = false +[storage.object_store.s3.developer] +object_store_retry_unknown_service_error = false +object_store_retryable_service_error_codes = ["SlowDown", "TooManyRequests"] + [system] barrier_interval_ms = 1000 checkpoint_frequency = 1 diff --git a/src/object_store/src/object/s3.rs b/src/object_store/src/object/s3.rs index cddb9c0c75e33..349d3b7142322 100644 --- a/src/object_store/src/object/s3.rs +++ b/src/object_store/src/object/s3.rs @@ -938,12 +938,18 @@ impl From for ObjectError { struct RetryCondition { retry_unknown_service_error: bool, + retryable_service_error_codes: Vec, } impl RetryCondition { fn new(config: &S3ObjectStoreConfig) -> Self { Self { - retry_unknown_service_error: config.retry_unknown_service_error, + retry_unknown_service_error: config.developer.object_store_retry_unknown_service_error + || config.retry_unknown_service_error, + retryable_service_error_codes: config + .developer + .object_store_retryable_service_error_codes + .clone(), } } } @@ -958,12 +964,24 @@ impl tokio_retry::Condition for RetryCondition { return true; } } - SdkError::ServiceError(e) => { - if self.retry_unknown_service_error && e.err().code().is_none() { - tracing::warn!(target: "unknown_service_error", "{e:?} occurs, retry S3 get_object request."); - return true; + SdkError::ServiceError(e) => match e.err().code() { + None => { + if self.retry_unknown_service_error { + tracing::warn!(target: "unknown_service_error", "{e:?} occurs, retry S3 get_object request."); + return true; + } } - } + Some(code) => { + if self + .retryable_service_error_codes + .iter() + .any(|s| s.as_str().eq_ignore_ascii_case(code)) + { + tracing::warn!(target: "retryable_service_error", "{e:?} occurs, retry S3 get_object request."); + return true; + } + } + }, _ => {} }, Either::Right(_) => {