From dc3807ce840ed43a81a6f965ef7d2f84d1cb56b0 Mon Sep 17 00:00:00 2001 From: Wen Xu Date: Sun, 1 Oct 2023 22:19:11 +0000 Subject: [PATCH] =?UTF-8?q?add=20context=20timeout=20for=20waitInstanceSta?= =?UTF-8?q?te=20call=20for=20alertmanager=20and=20s=E2=80=A6=20(#5581)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + docs/blocks-storage/store-gateway.md | 4 ++++ docs/configuration/config-file-reference.md | 8 ++++++++ pkg/alertmanager/alertmanager_ring.go | 6 +++++- pkg/alertmanager/multitenant.go | 10 ++++++++-- pkg/storegateway/gateway.go | 10 ++++++++-- pkg/storegateway/gateway_ring.go | 4 ++++ 7 files changed, 38 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbdc6f3865..ed4bc24eca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,6 +87,7 @@ * [BUGFIX] DDBKV: When no change detected in ring, retry the CAS until there is change. #5502 * [BUGFIX] Fix bug on objstore when configured to use S3 fips endpoints. #5540 * [BUGFIX] Ruler: Fix bug on ruler where a failure to load a single RuleGroup would prevent rulers to sync all RuleGroup. #5563 +* [BUGFIX] Store-Gateway and AlertManager: Add a `wait_instance_state_timeout` to WaitInstanceState context to avoid waiting forever. #5581 ## 1.15.1 2023-04-26 diff --git a/docs/blocks-storage/store-gateway.md b/docs/blocks-storage/store-gateway.md index d407806542..77f7546761 100644 --- a/docs/blocks-storage/store-gateway.md +++ b/docs/blocks-storage/store-gateway.md @@ -309,6 +309,10 @@ store_gateway: # CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration [wait_stability_max_duration: | default = 5m] + # Timeout for waiting on store-gateway to become desired state in the ring. + # CLI flag: -store-gateway.sharding-ring.wait-instance-state-timeout + [wait_instance_state_timeout: | default = 10m] + # The sleep seconds when store-gateway is shutting down. 
Need to be close to # or larger than KV Store information propagation delay # CLI flag: -store-gateway.sharding-ring.final-sleep diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 0fb7777c30..6bb6d06ca7 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -378,6 +378,10 @@ sharding_ring: # CLI flag: -alertmanager.sharding-ring.final-sleep [final_sleep: | default = 0s] + # Timeout for waiting on alertmanager to become desired state in the ring. + # CLI flag: -alertmanager.sharding-ring.wait-instance-state-timeout + [wait_instance_state_timeout: | default = 10m] + # Name of network interface to read address from. # CLI flag: -alertmanager.sharding-ring.instance-interface-names [instance_interface_names: | default = [eth0 en0]] @@ -4867,6 +4871,10 @@ sharding_ring: # CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration [wait_stability_max_duration: | default = 5m] + # Timeout for waiting on store-gateway to become desired state in the ring. + # CLI flag: -store-gateway.sharding-ring.wait-instance-state-timeout + [wait_instance_state_timeout: | default = 10m] + # The sleep seconds when store-gateway is shutting down. 
Need to be close to # or larger than KV Store information propagation delay # CLI flag: -store-gateway.sharding-ring.final-sleep diff --git a/pkg/alertmanager/alertmanager_ring.go b/pkg/alertmanager/alertmanager_ring.go index dc26f6a4db..cdb52b5ae8 100644 --- a/pkg/alertmanager/alertmanager_ring.go +++ b/pkg/alertmanager/alertmanager_ring.go @@ -49,7 +49,8 @@ type RingConfig struct { ReplicationFactor int `yaml:"replication_factor"` ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"` - FinalSleep time.Duration `yaml:"final_sleep"` + FinalSleep time.Duration `yaml:"final_sleep"` + WaitInstanceStateTimeout time.Duration `yaml:"wait_instance_state_timeout"` // Instance details InstanceID string `yaml:"instance_id" doc:"hidden"` @@ -94,6 +95,9 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { f.StringVar(&cfg.InstanceZone, rfprefix+"instance-availability-zone", "", "The availability zone where this instance is running. Required if zone-awareness is enabled.") cfg.RingCheckPeriod = 5 * time.Second + + // Timeout durations + f.DurationVar(&cfg.WaitInstanceStateTimeout, rfprefix+"wait-instance-state-timeout", 10*time.Minute, "Timeout for waiting on alertmanager to become desired state in the ring.") } // ToLifecyclerConfig returns a LifecyclerConfig based on the alertmanager diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 8ce007abc1..49a3e401ad 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -486,7 +486,10 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) { // We wait until the instance is in the JOINING state, once it does we know that tokens are assigned to this instance and we'll be ready to perform an initial sync of configs. 
level.Info(am.logger).Log("msg", "waiting until alertmanager is JOINING in the ring") - if err = ring.WaitInstanceState(ctx, am.ring, am.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil { + ctxWithTimeout, cancel := context.WithTimeout(ctx, am.cfg.ShardingRing.WaitInstanceStateTimeout) + defer cancel() + if err = ring.WaitInstanceState(ctxWithTimeout, am.ring, am.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil { + level.Error(am.logger).Log("msg", "alertmanager failed to become JOINING in the ring", "err", err) return err } level.Info(am.logger).Log("msg", "alertmanager is JOINING in the ring") @@ -519,7 +522,10 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) { // Wait until the ring client detected this instance in the ACTIVE state. level.Info(am.logger).Log("msg", "waiting until alertmanager is ACTIVE in the ring") - if err := ring.WaitInstanceState(ctx, am.ring, am.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil { + ctxWithTimeout, cancel := context.WithTimeout(ctx, am.cfg.ShardingRing.WaitInstanceStateTimeout) + defer cancel() + if err := ring.WaitInstanceState(ctxWithTimeout, am.ring, am.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil { + level.Error(am.logger).Log("msg", "alertmanager failed to become ACTIVE in the ring", "err", err) return err } level.Info(am.logger).Log("msg", "alertmanager is ACTIVE in the ring") diff --git a/pkg/storegateway/gateway.go b/pkg/storegateway/gateway.go index fe99a32fa1..536a7f2556 100644 --- a/pkg/storegateway/gateway.go +++ b/pkg/storegateway/gateway.go @@ -244,7 +244,10 @@ func (g *StoreGateway) starting(ctx context.Context) (err error) { // make sure that when we'll run the initial sync we already know the tokens // assigned to this instance. 
level.Info(g.logger).Log("msg", "waiting until store-gateway is JOINING in the ring") - if err := ring.WaitInstanceState(ctx, g.ring, g.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil { + ctxWithTimeout, cancel := context.WithTimeout(ctx, g.gatewayCfg.ShardingRing.WaitInstanceStateTimeout) + defer cancel() + if err := ring.WaitInstanceState(ctxWithTimeout, g.ring, g.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil { + level.Error(g.logger).Log("msg", "store-gateway failed to become JOINING in the ring", "err", err) return err } level.Info(g.logger).Log("msg", "store-gateway is JOINING in the ring") @@ -285,7 +288,10 @@ func (g *StoreGateway) starting(ctx context.Context) (err error) { // make sure that when we'll run the loop it won't be detected as a ring // topology change. level.Info(g.logger).Log("msg", "waiting until store-gateway is ACTIVE in the ring") - if err := ring.WaitInstanceState(ctx, g.ring, g.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil { + ctxWithTimeout, cancel := context.WithTimeout(ctx, g.gatewayCfg.ShardingRing.WaitInstanceStateTimeout) + defer cancel() + if err := ring.WaitInstanceState(ctxWithTimeout, g.ring, g.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil { + level.Error(g.logger).Log("msg", "store-gateway failed to become ACTIVE in the ring", "err", err) return err } level.Info(g.logger).Log("msg", "store-gateway is ACTIVE in the ring") diff --git a/pkg/storegateway/gateway_ring.go b/pkg/storegateway/gateway_ring.go index 06d2836835..8965c32f95 100644 --- a/pkg/storegateway/gateway_ring.go +++ b/pkg/storegateway/gateway_ring.go @@ -72,6 +72,7 @@ type RingConfig struct { // Wait ring stability. 
WaitStabilityMinDuration time.Duration `yaml:"wait_stability_min_duration"` WaitStabilityMaxDuration time.Duration `yaml:"wait_stability_max_duration"` + WaitInstanceStateTimeout time.Duration `yaml:"wait_instance_state_timeout"` FinalSleep time.Duration `yaml:"final_sleep"` @@ -123,6 +124,9 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { // Defaults for internal settings. cfg.RingCheckPeriod = 5 * time.Second + + // Timeout durations + f.DurationVar(&cfg.WaitInstanceStateTimeout, ringFlagsPrefix+"wait-instance-state-timeout", 10*time.Minute, "Timeout for waiting on store-gateway to become desired state in the ring.") } func (cfg *RingConfig) ToRingConfig() ring.Config {