diff --git a/CHANGELOG.md b/CHANGELOG.md index 960ef7b2..eed4563a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Disable prometheus agents if monitoring is disabled at the installation or cluster level. + ## [0.2.0] - 2024-06-25 ### Added diff --git a/go.mod b/go.mod index b3f3f977..7dbe35e3 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/giantswarm/observability-operator go 1.22.0 require ( + github.com/giantswarm/apiextensions-application v0.6.2 github.com/go-logr/logr v1.4.2 github.com/onsi/ginkgo/v2 v2.19.0 github.com/onsi/gomega v1.33.1 @@ -20,6 +21,8 @@ require ( sigs.k8s.io/yaml v1.4.0 ) +require github.com/giantswarm/k8smetadata v0.24.0 // indirect + require ( github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect @@ -68,7 +71,7 @@ require ( gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/protobuf v1.34.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect + gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.29.5 // indirect k8s.io/component-base v0.29.5 // indirect diff --git a/go.sum b/go.sum index cb2fd1ac..da7c0726 100644 --- a/go.sum +++ b/go.sum @@ -34,6 +34,10 @@ github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0 github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/giantswarm/apiextensions-application v0.6.2 h1:XL86OrpprWl5Wp38EUvUXt3ztTo25+V63oDVlFwDpNg= +github.com/giantswarm/apiextensions-application v0.6.2/go.mod h1:8ylqSmDSzFblCppRQTFo8v9s/F6MX6RTusVVoDDfWso= +github.com/giantswarm/k8smetadata v0.24.0 h1:mAIgH4W06qx8X5rV9QEtJhCJLn8DMXfTfNVZi5ROp4c= +github.com/giantswarm/k8smetadata v0.24.0/go.mod h1:QiQAyaZnwco1U0lENLF0Kp4bSN4dIPwIlHWEvUo3ES8= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= diff --git a/internal/controller/cluster_monitoring_controller.go b/internal/controller/cluster_monitoring_controller.go index c0873033..48e9b0dd 100644 --- a/internal/controller/cluster_monitoring_controller.go +++ b/internal/controller/cluster_monitoring_controller.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/giantswarm/observability-operator/pkg/bundle" "github.com/giantswarm/observability-operator/pkg/common" "github.com/giantswarm/observability-operator/pkg/monitoring" "github.com/giantswarm/observability-operator/pkg/monitoring/heartbeat" @@ -48,6 +49,8 @@ type ClusterMonitoringReconciler struct { heartbeat.HeartbeatRepository // MimirService is the service for managing mimir configuration. mimir.MimirService + // BundleConfigurationService is the service for configuring the observability bundle. + *bundle.BundleConfigurationService // MonitoringConfig is the configuration for the monitoring package. 
MonitoringConfig monitoring.Config } @@ -88,23 +91,27 @@ func (r *ClusterMonitoringReconciler) Reconcile(ctx context.Context, req ctrl.Re ctx = log.IntoContext(ctx, logger) if !r.MonitoringConfig.Enabled { - logger.Info("Monitoring is disabled at the installation level") - return ctrl.Result{}, nil + logger.Info("Monitoring is disabled at the installation level.") + } + + if !r.MonitoringConfig.IsMonitored(cluster) { + logger.Info("Monitoring is disabled for this cluster.") } // Handle deletion reconciliation loop. if !cluster.ObjectMeta.DeletionTimestamp.IsZero() { - logger.Info("Handling deletion for Cluster") + logger.Info("Handling deletion for cluster") return r.reconcileDelete(ctx, cluster) } - logger.Info("Reconciling Cluster") + logger.Info("Reconciling cluster") // Handle normal reconciliation loop. return r.reconcile(ctx, cluster) } // reconcile handles cluster reconciliation. func (r *ClusterMonitoringReconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster) (ctrl.Result, error) { + var err error logger := log.FromContext(ctx) // Add finalizer first if not set to avoid the race condition between init and delete. @@ -126,24 +133,49 @@ func (r *ClusterMonitoringReconciler) reconcile(ctx context.Context, cluster *cl return ctrl.Result{}, nil } + // Management cluster specific configuration if cluster.Name == r.ManagementCluster.Name { - err := r.HeartbeatRepository.CreateOrUpdate(ctx) + // If monitoring is enabled at the installation level, configure the monitoring stack; otherwise, tear it down. + if r.MonitoringConfig.Enabled { + err = r.HeartbeatRepository.CreateOrUpdate(ctx) + if err != nil { + logger.Error(err, "failed to create or update heartbeat") + return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err) + } + + err = r.MimirService.ConfigureMimir(ctx) + if err != nil { + logger.Error(err, "failed to configure mimir") + return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err) + } + } else { + err = r.tearDown(ctx) + if err != nil { + return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err) + } + } + } + + // Cluster specific configuration + if r.MonitoringConfig.IsMonitored(cluster) { + // Create or update PrometheusAgent remote write configuration. + err = r.PrometheusAgentService.ReconcileRemoteWriteConfiguration(ctx, cluster) if err != nil { - logger.Error(err, "failed to create or update heartbeat") + logger.Error(err, "failed to create or update prometheus agent remote write config") return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err) } - - err = r.MimirService.ConfigureMimir(ctx) + } else { + err := r.PrometheusAgentService.DeleteRemoteWriteConfiguration(ctx, cluster) if err != nil { - logger.Error(err, "failed to configure mimir") + logger.Error(err, "failed to delete prometheus agent remote write config") return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err) } } - // Create or update PrometheusAgent remote write configuration. - err := r.PrometheusAgentService.ReconcileRemoteWriteConfiguration(ctx, cluster) + // We always configure the bundle, even if monitoring is disabled for the cluster.
+ err = r.BundleConfigurationService.Configure(ctx, cluster) if err != nil { - logger.Error(err, "failed to create or update prometheus agent remote write config") + logger.Error(err, "failed to configure the observability-bundle") return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err) } @@ -153,27 +185,33 @@ func (r *ClusterMonitoringReconciler) reconcile(ctx context.Context, cluster *cl // reconcileDelete handles cluster deletion. func (r *ClusterMonitoringReconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Cluster) (reconcile.Result, error) { logger := log.FromContext(ctx) + + // We do not need to delete anything if there is no finalizer on the cluster if controllerutil.ContainsFinalizer(cluster, monitoring.MonitoringFinalizer) { - if cluster.Name == r.ManagementCluster.Name { - err := r.HeartbeatRepository.Delete(ctx) + // We always remove the bundle configuration, even if monitoring is disabled for the cluster. + err := r.BundleConfigurationService.RemoveConfiguration(ctx, cluster) + if err != nil { + logger.Error(err, "failed to remove the observability-bundle configuration") + return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err) + } + + // Cluster specific configuration + if r.MonitoringConfig.IsMonitored(cluster) { + err := r.PrometheusAgentService.DeleteRemoteWriteConfiguration(ctx, cluster) if err != nil { - logger.Error(err, "failed to delete heartbeat") + logger.Error(err, "failed to delete prometheus agent remote write config") return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err) } + } - err = r.MimirService.DeleteMimirSecrets(ctx) + // Management cluster specific configuration + if cluster.Name == r.ManagementCluster.Name { + err := r.tearDown(ctx) if err != nil { - logger.Error(err, "failed to delete mimir ingress secret") return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err) } } - err := r.PrometheusAgentService.DeleteRemoteWriteConfiguration(ctx, cluster) - if err != nil { - logger.Error(err, "failed to delete prometheus agent remote write config") - return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err) - } - // We get the latest state of the object to avoid race conditions. // Finalizer handling needs to come last. // We use a patch rather than an update to avoid conflicts when multiple controllers are removing their finalizer from the ClusterCR @@ -183,6 +221,7 @@ func (r *ClusterMonitoringReconciler) reconcileDelete(ctx context.Context, clust if err != nil { return ctrl.Result{}, errors.WithStack(err) } + controllerutil.RemoveFinalizer(cluster, monitoring.MonitoringFinalizer) if err := patchHelper.Patch(ctx, cluster); err != nil { logger.Error(err, "failed to remove finalizer, requeuing", "finalizer", monitoring.MonitoringFinalizer) @@ -192,3 +231,22 @@ func (r *ClusterMonitoringReconciler) reconcileDelete(ctx context.Context, clust } return ctrl.Result{}, nil } + +// tearDown tears down the management cluster specific components of the monitoring stack, such as the heartbeat, mimir secrets and so on.
+func (r *ClusterMonitoringReconciler) tearDown(ctx context.Context) error { + logger := log.FromContext(ctx) + + err := r.HeartbeatRepository.Delete(ctx) + if err != nil { + logger.Error(err, "failed to delete heartbeat") + return err + } + + err = r.MimirService.DeleteMimirSecrets(ctx) + if err != nil { + logger.Error(err, "failed to delete mimir ingress secret") + return err + } + + return nil +} diff --git a/main.go b/main.go index b46647cf..4834daef 100644 --- a/main.go +++ b/main.go @@ -26,6 +26,7 @@ import ( // to ensure that exec-entrypoint and run can make use of them. _ "k8s.io/client-go/plugin/pkg/client/auth" + appv1 "github.com/giantswarm/apiextensions-application/api/v1alpha1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -38,6 +39,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook" "github.com/giantswarm/observability-operator/internal/controller" + "github.com/giantswarm/observability-operator/pkg/bundle" "github.com/giantswarm/observability-operator/pkg/common" "github.com/giantswarm/observability-operator/pkg/common/organization" "github.com/giantswarm/observability-operator/pkg/common/password" @@ -79,6 +81,7 @@ const ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(clusterv1.AddToScheme(scheme)) + utilruntime.Must(appv1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -221,12 +224,13 @@ func main() { } if err = (&controller.ClusterMonitoringReconciler{ - Client: mgr.GetClient(), - ManagementCluster: managementCluster, - HeartbeatRepository: heartbeatRepository, - PrometheusAgentService: prometheusAgentService, - MimirService: mimirService, - MonitoringConfig: monitoringConfig, + Client: mgr.GetClient(), + ManagementCluster: managementCluster, + HeartbeatRepository: heartbeatRepository, + PrometheusAgentService: prometheusAgentService, + MimirService: mimirService, + MonitoringConfig: monitoringConfig, + BundleConfigurationService: bundle.NewBundleConfigurationService(mgr.GetClient(), monitoringConfig), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Cluster") os.Exit(1) diff --git a/pkg/bundle/service.go b/pkg/bundle/service.go new file mode 100644 index 00000000..fae271a8 --- /dev/null +++ b/pkg/bundle/service.go @@ -0,0 +1,192 @@ +package bundle + +import ( + "context" + "fmt" + "reflect" + "slices" + + appv1 "github.com/giantswarm/apiextensions-application/api/v1alpha1" + "github.com/pkg/errors" + "gopkg.in/yaml.v2" + v1 "k8s.io/api/core/v1" + apimachineryerrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + "github.com/giantswarm/observability-operator/pkg/monitoring" +) + +type BundleConfigurationService struct { + client client.Client + config monitoring.Config +} + +func NewBundleConfigurationService(client client.Client, config monitoring.Config) *BundleConfigurationService { + return &BundleConfigurationService{ + client: client, + config: config, + } +} + +func getConfigMapObjectKey(cluster *clusterv1.Cluster) types.NamespacedName { + return types.NamespacedName{ + Name: fmt.Sprintf("%s-observability-platform-configuration", cluster.Name), + Namespace: cluster.Namespace, + } +} + +// Configure configures the observability-bundle application. 
+// It enables or disables the monitoring agents on the cluster depending on whether the cluster is monitored. +func (s BundleConfigurationService) Configure(ctx context.Context, cluster *clusterv1.Cluster) error { + logger := log.FromContext(ctx) + logger.Info("configuring observability-bundle") + + bundleConfiguration := bundleConfiguration{ + Apps: map[string]app{ + "prometheusAgent": { + Enabled: s.config.IsMonitored(cluster), + }, + }, + } + + logger.Info("creating or updating observability-bundle configmap") + err := s.createOrUpdateObservabilityBundleConfigMap(ctx, cluster, bundleConfiguration) + if err != nil { + return errors.WithStack(err) + } + + logger.Info("configure observability-bundle app") + err = s.configureObservabilityBundleApp(ctx, cluster) + if err != nil { + return errors.WithStack(err) + } + + logger.Info("observability-bundle is configured successfully") + + return nil +} + +func (s BundleConfigurationService) createOrUpdateObservabilityBundleConfigMap( + ctx context.Context, cluster *clusterv1.Cluster, configuration bundleConfiguration) error { + + logger := log.FromContext(ctx) + + values, err := yaml.Marshal(configuration) + if err != nil { + return errors.WithStack(err) + } + + configMapObjectKey := getConfigMapObjectKey(cluster) + desired := v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapObjectKey.Name, + Namespace: configMapObjectKey.Namespace, + Labels: map[string]string{ + "app.kubernetes.io/name": "observability-bundle", + "app.kubernetes.io/managed-by": "observability-operator", + "app.kubernetes.io/part-of": "observability-platform", + }, + }, + Data: map[string]string{"values": string(values)}, + } + + var current v1.ConfigMap + err = s.client.Get(ctx, configMapObjectKey, &current) + if err != nil { + if apimachineryerrors.IsNotFound(err) { + err = s.client.Create(ctx, &desired) + if err != nil { + return errors.WithStack(err) + } + logger.Info("observability-bundle configuration created") + } else { + return errors.WithStack(err) + } + } + + if !reflect.DeepEqual(current.Data, desired.Data) || + !reflect.DeepEqual(current.ObjectMeta.Labels, desired.ObjectMeta.Labels) { + err := s.client.Update(ctx, &desired) + if err != nil { + return errors.WithStack(err) + } + logger.Info("observability-bundle configuration updated") + } + + logger.Info("observability-bundle configuration up to date") + return nil +} + +func (s BundleConfigurationService) configureObservabilityBundleApp( + ctx context.Context, cluster *clusterv1.Cluster) error { + + configMapObjectKey := getConfigMapObjectKey(cluster) + + // Get observability bundle app metadata.
+ appObjectKey := types.NamespacedName{ + Name: fmt.Sprintf("%s-observability-bundle", cluster.Name), + Namespace: cluster.Namespace, + } + + var current appv1.App + err := s.client.Get(ctx, appObjectKey, &current) + if err != nil { + return errors.WithStack(err) + } + + desired := current.DeepCopy() + + desiredExtraConfig := appv1.AppExtraConfig{ + Kind: "configMap", + Name: configMapObjectKey.Name, + Namespace: configMapObjectKey.Namespace, + Priority: 25, + } + + foundIndex := slices.IndexFunc(current.Spec.ExtraConfigs, func(extraConfig appv1.AppExtraConfig) bool { + // We skip priority in case we want to change it + return extraConfig.Kind == desiredExtraConfig.Kind && + extraConfig.Name == desiredExtraConfig.Name && + extraConfig.Namespace == desiredExtraConfig.Namespace + }) + + if foundIndex == -1 { + desired.Spec.ExtraConfigs = append(desired.Spec.ExtraConfigs, desiredExtraConfig) + } else { + desired.Spec.ExtraConfigs[foundIndex] = desiredExtraConfig + } + + if !reflect.DeepEqual(current, *desired) { + err := s.client.Update(ctx, desired) + if err != nil { + return errors.WithStack(err) + } + } + + return nil +} + +func (s BundleConfigurationService) RemoveConfiguration(ctx context.Context, cluster *clusterv1.Cluster) error { + logger := log.FromContext(ctx) + + logger.Info("deleting observability-bundle configuration") + + configMapObjectKey := getConfigMapObjectKey(cluster) + var current = v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapObjectKey.Name, + Namespace: configMapObjectKey.Namespace, + }, + } + if err := s.client.Delete(ctx, &current); client.IgnoreNotFound(err) != nil { + return errors.WithStack(err) + } + + logger.Info("observability-bundle configuration has been deleted successfully") + + return nil +} diff --git a/pkg/bundle/types.go b/pkg/bundle/types.go new file mode 100644 index 00000000..c6125ce4 --- /dev/null +++ b/pkg/bundle/types.go @@ -0,0 +1,9 @@ +package bundle + +type bundleConfiguration struct { + Apps map[string]app `yaml:"apps" json:"apps"` +} + +type app struct { + Enabled bool `yaml:"enabled" json:"enabled"` +} diff --git a/pkg/monitoring/config.go b/pkg/monitoring/config.go index 77ad1477..69c0e803 100644 --- a/pkg/monitoring/config.go +++ b/pkg/monitoring/config.go @@ -1,6 +1,14 @@ package monitoring -import "github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/sharding" +import ( + "strconv" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + + "github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/sharding" +) + +const MonitoringLabel = "giantswarm.io/monitoring" // Config represents the configuration used by the monitoring package.
type Config struct { @@ -9,3 +17,26 @@ type Config struct { // TODO(atlas): validate prometheus version using SemVer PrometheusVersion string } + +// Monitoring should be enabled when all conditions are met: +// - global monitoring flag is enabled +// - monitoring label is not set or is set to true on the cluster object +func (c Config) IsMonitored(cluster *clusterv1.Cluster) bool { + if !c.Enabled { + return false + } + + // Check if label is set on the cluster object + labels := cluster.GetLabels() + monitoringLabelValue, ok := labels[MonitoringLabel] + if !ok { + // If it's not set, monitoring is enabled by default + return true + } + + monitoringEnabled, err := strconv.ParseBool(monitoringLabelValue) + if err != nil { + return true + } + return monitoringEnabled +} diff --git a/pkg/monitoring/prometheusagent/sharding/sharding.go b/pkg/monitoring/prometheusagent/sharding/sharding.go index 9b0eae3a..ab3e623c 100644 --- a/pkg/monitoring/prometheusagent/sharding/sharding.go +++ b/pkg/monitoring/prometheusagent/sharding/sharding.go @@ -26,14 +26,14 @@ func (s Strategy) Merge(newStrategy *Strategy) Strategy { } // We want to start with 1 prometheus-agent for each 1M time series with a scale down 20% threshold. -func (pass Strategy) ComputeShards(currentShardCount int, timeSeries float64) int { - shardScaleDownThreshold := pass.ScaleDownPercentage * pass.ScaleUpSeriesCount - desiredShardCount := int(math.Ceil(timeSeries / pass.ScaleUpSeriesCount)) +func (s Strategy) ComputeShards(currentShardCount int, timeSeries float64) int { + shardScaleDownThreshold := s.ScaleDownPercentage * s.ScaleUpSeriesCount + desiredShardCount := int(math.Ceil(timeSeries / s.ScaleUpSeriesCount)) // Compute Scale Down if currentShardCount > desiredShardCount { // Check if the remainder of (timeSeries mod ScaleupSeriesCount) is bigger than the scale down threshold. - if math.Mod(timeSeries, pass.ScaleUpSeriesCount) > pass.ScaleUpSeriesCount-shardScaleDownThreshold { + if math.Mod(timeSeries, s.ScaleUpSeriesCount) > s.ScaleUpSeriesCount-shardScaleDownThreshold { desiredShardCount = currentShardCount } }
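
A minimal sketch (separate from the diff above) of how the new `giantswarm.io/monitoring` label interacts with the installation-level flag, assuming `monitoring.Config` exposes the `Enabled` field referenced in the controller wiring: monitoring stays on by default and is only turned off when the flag is disabled or the label parses to `false`; unparseable label values fall back to enabled.

```go
package monitoring_test

import (
	"testing"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"

	"github.com/giantswarm/observability-operator/pkg/monitoring"
)

// TestIsMonitored exercises the label-driven opt-out added in pkg/monitoring/config.go.
func TestIsMonitored(t *testing.T) {
	config := monitoring.Config{Enabled: true}

	// A cluster explicitly labelled giantswarm.io/monitoring=false is not monitored.
	optedOut := &clusterv1.Cluster{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "demo",
			Labels: map[string]string{monitoring.MonitoringLabel: "false"},
		},
	}
	if config.IsMonitored(optedOut) {
		t.Error("expected monitoring to be disabled for an opted-out cluster")
	}

	// Without the label, monitoring defaults to enabled.
	unlabelled := &clusterv1.Cluster{
		ObjectMeta: metav1.ObjectMeta{Name: "demo"},
	}
	if !config.IsMonitored(unlabelled) {
		t.Error("expected monitoring to be enabled by default")
	}

	// When monitoring is disabled at the installation level, the label is irrelevant.
	if (monitoring.Config{Enabled: false}).IsMonitored(unlabelled) {
		t.Error("expected monitoring to be disabled installation-wide")
	}
}
```

For an opted-out cluster, `Configure` still renders the per-cluster ConfigMap, but with `apps.prometheusAgent.enabled: false` in its values, while the reconciler deletes the Prometheus agent remote write configuration instead of reconciling it.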