Skip to content

Commit

Permalink
add context timeout for waitInstanceState call for alertmanager and s… (
Browse files Browse the repository at this point in the history
  • Loading branch information
wenxu1024 authored Oct 1, 2023
1 parent cbcf039 commit dc3807c
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
* [BUGFIX] DDBKV: When no change detected in ring, retry the CAS until there is change. #5502
* [BUGFIX] Fix bug on objstore when configured to use S3 fips endpoints. #5540
* [BUGFIX] Ruler: Fix bug on ruler where a failure to load a single RuleGroup would prevent rulers from syncing all RuleGroups. #5563
* [BUGFIX] Store-Gateway and AlertManager: Add a `wait_instance_state_timeout` to WaitInstanceState context to avoid waiting forever. #5581

## 1.15.1 2023-04-26

Expand Down
4 changes: 4 additions & 0 deletions docs/blocks-storage/store-gateway.md
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,10 @@ store_gateway:
# CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration
[wait_stability_max_duration: <duration> | default = 5m]

# Timeout for waiting on store-gateway to become desired state in the ring.
# CLI flag: -store-gateway.sharding-ring.wait-instance-state-timeout
[wait_instance_state_timeout: <duration> | default = 10m]

# The sleep seconds when store-gateway is shutting down. Need to be close to
# or larger than KV Store information propagation delay
# CLI flag: -store-gateway.sharding-ring.final-sleep
Expand Down
8 changes: 8 additions & 0 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,10 @@ sharding_ring:
# CLI flag: -alertmanager.sharding-ring.final-sleep
[final_sleep: <duration> | default = 0s]
# Timeout for waiting on alertmanager to become desired state in the ring.
# CLI flag: -alertmanager.sharding-ring.wait-instance-state-timeout
[wait_instance_state_timeout: <duration> | default = 10m]
# Name of network interface to read address from.
# CLI flag: -alertmanager.sharding-ring.instance-interface-names
[instance_interface_names: <list of string> | default = [eth0 en0]]
Expand Down Expand Up @@ -4867,6 +4871,10 @@ sharding_ring:
# CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration
[wait_stability_max_duration: <duration> | default = 5m]
# Timeout for waiting on store-gateway to become desired state in the ring.
# CLI flag: -store-gateway.sharding-ring.wait-instance-state-timeout
[wait_instance_state_timeout: <duration> | default = 10m]
# The sleep seconds when store-gateway is shutting down. Need to be close to
# or larger than KV Store information propagation delay
# CLI flag: -store-gateway.sharding-ring.final-sleep
Expand Down
6 changes: 5 additions & 1 deletion pkg/alertmanager/alertmanager_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ type RingConfig struct {
ReplicationFactor int `yaml:"replication_factor"`
ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"`

FinalSleep time.Duration `yaml:"final_sleep"`
FinalSleep time.Duration `yaml:"final_sleep"`
WaitInstanceStateTimeout time.Duration `yaml:"wait_instance_state_timeout"`

// Instance details
InstanceID string `yaml:"instance_id" doc:"hidden"`
Expand Down Expand Up @@ -94,6 +95,9 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
f.StringVar(&cfg.InstanceZone, rfprefix+"instance-availability-zone", "", "The availability zone where this instance is running. Required if zone-awareness is enabled.")

cfg.RingCheckPeriod = 5 * time.Second

// Timeout durations
f.DurationVar(&cfg.WaitInstanceStateTimeout, rfprefix+"wait-instance-state-timeout", 10*time.Minute, "Timeout for waiting on alertmanager to become desired state in the ring.")
}

// ToLifecyclerConfig returns a LifecyclerConfig based on the alertmanager
Expand Down
10 changes: 8 additions & 2 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,10 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) {

// We wait until the instance is in the JOINING state; once it is, we know that tokens are assigned to this instance and we'll be ready to perform an initial sync of configs.
level.Info(am.logger).Log("msg", "waiting until alertmanager is JOINING in the ring")
if err = ring.WaitInstanceState(ctx, am.ring, am.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
ctxWithTimeout, cancel := context.WithTimeout(ctx, am.cfg.ShardingRing.WaitInstanceStateTimeout)
defer cancel()
if err = ring.WaitInstanceState(ctxWithTimeout, am.ring, am.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
level.Error(am.logger).Log("msg", "alertmanager failed to become JOINING in the ring", "err", err)
return err
}
level.Info(am.logger).Log("msg", "alertmanager is JOINING in the ring")
Expand Down Expand Up @@ -519,7 +522,10 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) {

// Wait until the ring client detected this instance in the ACTIVE state.
level.Info(am.logger).Log("msg", "waiting until alertmanager is ACTIVE in the ring")
if err := ring.WaitInstanceState(ctx, am.ring, am.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
ctxWithTimeout, cancel := context.WithTimeout(ctx, am.cfg.ShardingRing.WaitInstanceStateTimeout)
defer cancel()
if err := ring.WaitInstanceState(ctxWithTimeout, am.ring, am.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
level.Error(am.logger).Log("msg", "alertmanager failed to become ACTIVE in the ring", "err", err)
return err
}
level.Info(am.logger).Log("msg", "alertmanager is ACTIVE in the ring")
Expand Down
10 changes: 8 additions & 2 deletions pkg/storegateway/gateway.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,10 @@ func (g *StoreGateway) starting(ctx context.Context) (err error) {
// make sure that when we'll run the initial sync we already know the tokens
// assigned to this instance.
level.Info(g.logger).Log("msg", "waiting until store-gateway is JOINING in the ring")
if err := ring.WaitInstanceState(ctx, g.ring, g.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
ctxWithTimeout, cancel := context.WithTimeout(ctx, g.gatewayCfg.ShardingRing.WaitInstanceStateTimeout)
defer cancel()
if err := ring.WaitInstanceState(ctxWithTimeout, g.ring, g.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
level.Error(g.logger).Log("msg", "store-gateway failed to become JOINING in the ring", "err", err)
return err
}
level.Info(g.logger).Log("msg", "store-gateway is JOINING in the ring")
Expand Down Expand Up @@ -285,7 +288,10 @@ func (g *StoreGateway) starting(ctx context.Context) (err error) {
// make sure that when we'll run the loop it won't be detected as a ring
// topology change.
level.Info(g.logger).Log("msg", "waiting until store-gateway is ACTIVE in the ring")
if err := ring.WaitInstanceState(ctx, g.ring, g.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
ctxWithTimeout, cancel := context.WithTimeout(ctx, g.gatewayCfg.ShardingRing.WaitInstanceStateTimeout)
defer cancel()
if err := ring.WaitInstanceState(ctxWithTimeout, g.ring, g.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
level.Error(g.logger).Log("msg", "store-gateway failed to become ACTIVE in the ring", "err", err)
return err
}
level.Info(g.logger).Log("msg", "store-gateway is ACTIVE in the ring")
Expand Down
4 changes: 4 additions & 0 deletions pkg/storegateway/gateway_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ type RingConfig struct {
// Wait ring stability.
WaitStabilityMinDuration time.Duration `yaml:"wait_stability_min_duration"`
WaitStabilityMaxDuration time.Duration `yaml:"wait_stability_max_duration"`
WaitInstanceStateTimeout time.Duration `yaml:"wait_instance_state_timeout"`

FinalSleep time.Duration `yaml:"final_sleep"`

Expand Down Expand Up @@ -123,6 +124,9 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {

// Defaults for internal settings.
cfg.RingCheckPeriod = 5 * time.Second

// Timeout durations
f.DurationVar(&cfg.WaitInstanceStateTimeout, ringFlagsPrefix+"wait-instance-state-timeout", 10*time.Minute, "Timeout for waiting on store-gateway to become desired state in the ring.")
}

func (cfg *RingConfig) ToRingConfig() ring.Config {
Expand Down

0 comments on commit dc3807c

Please sign in to comment.