diff --git a/docs/monitoring/README.md b/docs/monitoring/README.md index fab5de0a..7323afe8 100644 --- a/docs/monitoring/README.md +++ b/docs/monitoring/README.md @@ -2,4 +2,4 @@ The Observability Operator is in charge of configuring the Prometheus Agent instances running in workload clusters like remote write configuration, [sharding](sharding.md) and so on. -TODO(atlas): Add documentation here. +TODO(atlas): Add operator specific documentation here ("sequence diagrams", list of created and managed resources) diff --git a/main.go b/main.go index 98dc9a28..b46647cf 100644 --- a/main.go +++ b/main.go @@ -45,6 +45,7 @@ import ( "github.com/giantswarm/observability-operator/pkg/monitoring/heartbeat" "github.com/giantswarm/observability-operator/pkg/monitoring/mimir" "github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent" + "github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/sharding" //+kubebuilder:scaffold:imports ) @@ -197,10 +198,12 @@ func main() { organizationRepository := organization.NewNamespaceRepository(mgr.GetClient()) monitoringConfig := monitoring.Config{ - Enabled: monitoringEnabled, - ShardingScaleUpSeriesCount: monitoringShardingScaleUpSeriesCount, - ShardingScaleDownPercentage: monitoringShardingScaleDownPercentage, - PrometheusVersion: prometheusVersion, + Enabled: monitoringEnabled, + DefaultShardingStrategy: sharding.Strategy{ + ScaleUpSeriesCount: monitoringShardingScaleUpSeriesCount, + ScaleDownPercentage: monitoringShardingScaleDownPercentage, + }, + PrometheusVersion: prometheusVersion, } prometheusAgentService := prometheusagent.PrometheusAgentService{ @@ -223,7 +226,7 @@ func main() { HeartbeatRepository: heartbeatRepository, PrometheusAgentService: prometheusAgentService, MimirService: mimirService, - MonitoringConfig: monitoringConfig, + MonitoringConfig: monitoringConfig, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Cluster") os.Exit(1) diff --git a/pkg/monitoring/config.go b/pkg/monitoring/config.go index e1427c17..77ad1477 100644 --- a/pkg/monitoring/config.go +++ b/pkg/monitoring/config.go @@ -1,10 +1,11 @@ package monitoring +import "github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/sharding" + // Config represents the configuration used by the monitoring package. type Config struct { - Enabled bool - ShardingScaleUpSeriesCount float64 - ShardingScaleDownPercentage float64 - // TODO(atlas): validate prometheus version + Enabled bool + DefaultShardingStrategy sharding.Strategy + // TODO(atlas): validate prometheus version using SemVer PrometheusVersion string } diff --git a/pkg/monitoring/prometheusagent/configmap.go b/pkg/monitoring/prometheusagent/configmap.go index 8543b9cc..a03b139c 100644 --- a/pkg/monitoring/prometheusagent/configmap.go +++ b/pkg/monitoring/prometheusagent/configmap.go @@ -96,29 +96,26 @@ func getServicePriority(cluster *clusterv1.Cluster) string { func (pas PrometheusAgentService) getShardsCountForCluster( ctx context.Context, cluster *clusterv1.Cluster, currentShardCount int) (int, error) { - clusterShardingStrategy, err := getClusterShardingStrategy(cluster) + clusterStrategy, err := getClusterStrategy(cluster) if err != nil { return 0, errors.WithStack(err) } - shardingStrategy := sharding.ShardingStrategy{ - ScaleUpSeriesCount: pas.MonitoringConfig.ShardingScaleUpSeriesCount, - ScaleDownPercentage: pas.MonitoringConfig.ShardingScaleDownPercentage, - }.Merge(clusterShardingStrategy) + Strategy := pas.MonitoringConfig.DefaultShardingStrategy.Merge(clusterStrategy) headSeries, err := querier.QueryTSDBHeadSeries(ctx, cluster.Name) if err != nil { // If Prometheus is not accessible (DNSError), or if we don't have any data yet (ErrNoTimeSeries) // Then, return the default number of shards. var dnsError *net.DNSError if errors.As(err, &dnsError) || errors.Is(err, querier.ErrorNoTimeSeries) { - return shardingStrategy.ComputeShards(currentShardCount, defaultShards), nil + return Strategy.ComputeShards(currentShardCount, defaultShards), nil } return 0, errors.WithStack(err) } - return shardingStrategy.ComputeShards(currentShardCount, headSeries), nil + return Strategy.ComputeShards(currentShardCount, headSeries), nil } -func getClusterShardingStrategy(cluster metav1.Object) (*sharding.ShardingStrategy, error) { +func getClusterStrategy(cluster metav1.Object) (*sharding.Strategy, error) { var err error var scaleUpSeriesCount, scaleDownPercentage float64 if value, ok := cluster.GetAnnotations()["monitoring.giantswarm.io/prometheus-agent-scale-up-series-count"]; ok { @@ -131,7 +128,7 @@ func getClusterShardingStrategy(cluster metav1.Object) (*sharding.ShardingStrate return nil, err } } - return &sharding.ShardingStrategy{ + return &sharding.Strategy{ ScaleUpSeriesCount: scaleUpSeriesCount, ScaleDownPercentage: scaleDownPercentage, }, nil diff --git a/pkg/monitoring/prometheusagent/sharding/sharding.go b/pkg/monitoring/prometheusagent/sharding/sharding.go index f462bfb9..ffaa0845 100644 --- a/pkg/monitoring/prometheusagent/sharding/sharding.go +++ b/pkg/monitoring/prometheusagent/sharding/sharding.go @@ -2,31 +2,31 @@ package sharding import "math" -type ShardingStrategy struct { +type Strategy struct { // Configures the number of series needed to add a new shard. Computation is number of series / ScaleUpSeriesCount ScaleUpSeriesCount float64 // Percentage of needed series based on ScaleUpSeriesCount to scale down agents ScaleDownPercentage float64 } -func (pass1 ShardingStrategy) Merge(pass2 *ShardingStrategy) ShardingStrategy { - strategy := ShardingStrategy{ - pass1.ScaleUpSeriesCount, - pass1.ScaleDownPercentage, +func (s Strategy) Merge(newStrategy *Strategy) Strategy { + strategy := Strategy{ + s.ScaleUpSeriesCount, + s.ScaleDownPercentage, } - if pass2 != nil { - if pass2.ScaleUpSeriesCount > 0 { - strategy.ScaleUpSeriesCount = pass2.ScaleUpSeriesCount + if newStrategy != nil { + if newStrategy.ScaleUpSeriesCount > 0 { + strategy.ScaleUpSeriesCount = newStrategy.ScaleUpSeriesCount } - if pass2.ScaleDownPercentage > 0 { - strategy.ScaleDownPercentage = pass2.ScaleDownPercentage + if newStrategy.ScaleDownPercentage > 0 { + strategy.ScaleDownPercentage = newStrategy.ScaleDownPercentage } } return strategy } // We want to start with 1 prometheus-agent for each 1M time series with a scale down 20% threshold. -func (pass ShardingStrategy) ComputeShards(currentShardCount int, timeSeries float64) int { +func (pass Strategy) ComputeShards(currentShardCount int, timeSeries float64) int { shardScaleDownThreshold := pass.ScaleDownPercentage * pass.ScaleUpSeriesCount desiredShardCount := int(math.Ceil(timeSeries / pass.ScaleUpSeriesCount)) diff --git a/pkg/monitoring/prometheusagent/sharding/sharding_test.go b/pkg/monitoring/prometheusagent/sharding/sharding_test.go index 2d3baa25..479aa8bc 100644 --- a/pkg/monitoring/prometheusagent/sharding/sharding_test.go +++ b/pkg/monitoring/prometheusagent/sharding/sharding_test.go @@ -5,7 +5,7 @@ import ( ) func TestShardComputationScaleUp(t *testing.T) { - pass := ShardingStrategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)} + pass := Strategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)} expected := 1 result := pass.ComputeShards(0, float64(1_000_000)) @@ -27,7 +27,7 @@ func TestShardComputationScaleUp(t *testing.T) { } func TestShardComputationReturnsAtLeast1Shart(t *testing.T) { - pass := ShardingStrategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)} + pass := Strategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)} expected := 1 result := pass.ComputeShards(0, 0) @@ -43,7 +43,7 @@ func TestShardComputationReturnsAtLeast1Shart(t *testing.T) { } func TestShardComputationScaleDown(t *testing.T) { - pass := ShardingStrategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)} + pass := Strategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)} expected := 2 result := pass.ComputeShards(1, 1_000_001) if result != expected {