diff --git a/go.mod b/go.mod index 64dabf07..33b057c9 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/giantswarm/observability-operator -go 1.21 +go 1.22.0 require ( github.com/go-logr/logr v1.4.1 @@ -11,7 +11,7 @@ require ( github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.72.0 github.com/prometheus/client_golang v1.19.0 github.com/prometheus/common v0.51.1 - github.com/sirupsen/logrus v1.9.0 + github.com/sirupsen/logrus v1.9.3 gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.29.3 k8s.io/apimachinery v0.29.3 diff --git a/helm/observability-operator/templates/deployment.yaml b/helm/observability-operator/templates/deployment.yaml index 1bea8099..e2aec3dc 100644 --- a/helm/observability-operator/templates/deployment.yaml +++ b/helm/observability-operator/templates/deployment.yaml @@ -24,7 +24,9 @@ spec: image: "{{ .Values.image.registry }}/{{ .Values.image.name }}:{{ default .Chart.Version .Values.image.tag }}" args: - --leader-elect + - --management-cluster-base-domain={{ $.Values.managementCluster.baseDomain }} - --management-cluster-customer={{ $.Values.managementCluster.customer }} + - --management-cluster-insecure-ca={{ $.Values.managementCluster.insecureCA }} - --management-cluster-name={{ $.Values.managementCluster.name }} - --management-cluster-pipeline={{ $.Values.managementCluster.pipeline }} - --management-cluster-region={{ $.Values.managementCluster.region }} diff --git a/helm/observability-operator/values.schema.json b/helm/observability-operator/values.schema.json index 4be449fe..fa4f3a8d 100644 --- a/helm/observability-operator/values.schema.json +++ b/helm/observability-operator/values.schema.json @@ -32,9 +32,15 @@ "managementCluster": { "type": "object", "properties": { + "baseDomain": { + "type": "string" + }, "customer": { "type": "string" }, + "insecureCA": { + "type": "boolean" + }, "name": { "type": "string" }, @@ -54,6 +60,9 @@ }, "opsgenieApiKey": { "type": "string" + }, + "prometheusVersion": { + "type": 
"string" } } }, diff --git a/helm/observability-operator/values.yaml b/helm/observability-operator/values.yaml index 8e02a133..2d4231e0 100644 --- a/helm/observability-operator/values.yaml +++ b/helm/observability-operator/values.yaml @@ -8,7 +8,9 @@ image: tag: "" managementCluster: + baseDomain: domain customer: customer + insecureCA: false name: name pipeline: pipeline region: region diff --git a/internal/controller/cluster_monitoring_controller.go b/internal/controller/cluster_monitoring_controller.go index e80d490e..6ae9bccb 100644 --- a/internal/controller/cluster_monitoring_controller.go +++ b/internal/controller/cluster_monitoring_controller.go @@ -117,7 +117,7 @@ func (r *ClusterMonitoringReconciler) reconcile(ctx context.Context, cluster *cl } // Create or update PrometheusAgent remote write configuration. - err := r.PrometheusAgentService.ReconcilePrometheusAgentRemoteWriteConfig(ctx, cluster) + err := r.PrometheusAgentService.ReconcileRemoteWriteConfig(ctx, cluster) if err != nil { logger.Error(err, "failed to create or update prometheus agent remote write config") return ctrl.Result{Requeue: true}, errors.WithStack(err) @@ -138,7 +138,7 @@ func (r *ClusterMonitoringReconciler) reconcileDelete(ctx context.Context, clust } } - err := r.PrometheusAgentService.DeletePrometheusAgentRemoteWriteConfig(ctx, cluster) + err := r.PrometheusAgentService.DeleteRemoteWriteConfig(ctx, cluster) if err != nil { logger.Error(err, "failed to delete prometheus agent remote write config") return ctrl.Result{Requeue: true}, errors.WithStack(err) diff --git a/main.go b/main.go index 1a90c946..1d40ecd8 100644 --- a/main.go +++ b/main.go @@ -40,6 +40,7 @@ import ( "github.com/giantswarm/observability-operator/internal/controller" "github.com/giantswarm/observability-operator/pkg/common" "github.com/giantswarm/observability-operator/pkg/common/organization" + "github.com/giantswarm/observability-operator/pkg/common/password" 
"github.com/giantswarm/observability-operator/pkg/monitoring/heartbeat" "github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent" //+kubebuilder:scaffold:imports @@ -49,17 +50,19 @@ var ( scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") - metricsAddr string - enableLeaderElection bool - probeAddr string - secureMetrics bool - enableHTTP2 bool - managementClusterCustomer string - managementClusterName string - managementClusterPipeline string - managementClusterRegion string - monitoringEnabled bool - prometheusVersion string + metricsAddr string + enableLeaderElection bool + probeAddr string + secureMetrics bool + enableHTTP2 bool + managementClusterBaseDomain string + managementClusterCustomer string + managementClusterInsecureCA bool + managementClusterName string + managementClusterPipeline string + managementClusterRegion string + monitoringEnabled bool + prometheusVersion string ) const ( @@ -85,8 +88,12 @@ func main() { "If set the metrics endpoint is served securely") flag.BoolVar(&enableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers") + flag.StringVar(&managementClusterBaseDomain, "management-cluster-base-domain", "", + "The base domain of the management cluster.") flag.StringVar(&managementClusterCustomer, "management-cluster-customer", "", "The customer of the management cluster.") + flag.BoolVar(&managementClusterInsecureCA, "management-cluster-insecure-ca", false, + "Flag to indicate if the management cluster has an insecure CA that should be trusted") flag.StringVar(&managementClusterName, "management-cluster-name", "", "The name of the management cluster.") flag.StringVar(&managementClusterPipeline, "management-cluster-pipeline", "", @@ -157,10 +164,12 @@ func main() { record.InitFromRecorder(mgr.GetEventRecorderFor("observability-operator")) var managementCluster common.ManagementCluster = common.ManagementCluster{ - Customer: managementClusterCustomer, - Name: 
managementClusterName, - Pipeline: managementClusterPipeline, - Region: managementClusterRegion, + BaseDomain: managementClusterBaseDomain, + Customer: managementClusterCustomer, + InsecureCA: managementClusterInsecureCA, + Name: managementClusterName, + Pipeline: managementClusterPipeline, + Region: managementClusterRegion, } var opsgenieApiKey = os.Getenv(OpsgenieApiKey) @@ -181,11 +190,12 @@ func main() { prometheusAgentService := prometheusagent.PrometheusAgentService{ Client: mgr.GetClient(), OrganizationRepository: organizationRepository, + PasswordManager: password.SimpleManager{}, ManagementCluster: managementCluster, PrometheusVersion: prometheusVersion, } - if err = (&controller.ClusterMonitoringReconciler{ + if err = (&controller.ClusterMonitoringReconciler{ Client: mgr.GetClient(), ManagementCluster: managementCluster, HeartbeatRepository: heartbeatRepository, diff --git a/pkg/common/password/manager.go b/pkg/common/password/manager.go new file mode 100644 index 00000000..80c2726b --- /dev/null +++ b/pkg/common/password/manager.go @@ -0,0 +1,21 @@ +package password + +import ( + "crypto/rand" + "encoding/hex" +) + +type Manager interface { + GeneratePassword(length int) (string, error) +} + +type SimpleManager struct { +} + +func (m SimpleManager) GeneratePassword(length int) (string, error) { + bytes := make([]byte, length) + if _, err := rand.Read(bytes); err != nil { + return "", err + } + return hex.EncodeToString(bytes), nil +} diff --git a/pkg/common/types.go b/pkg/common/types.go index f2b5c382..96fd8a95 100644 --- a/pkg/common/types.go +++ b/pkg/common/types.go @@ -33,8 +33,12 @@ const ( ) type ManagementCluster struct { + // BaseDomain is the base domain of the management cluster. + BaseDomain string // Customer is the customer name of the management cluster. Customer string + // InsecureCA is a flag to indicate if the management cluster has an insecure CA that should be trusted + InsecureCA bool // Name is the name of the management cluster. 
Name string // Pipeline is the pipeline name of the management cluster. diff --git a/pkg/monitoring/mimir/querier/querier.go b/pkg/monitoring/mimir/querier/querier.go index 83c32e27..8abc1d50 100644 --- a/pkg/monitoring/mimir/querier/querier.go +++ b/pkg/monitoring/mimir/querier/querier.go @@ -51,7 +51,8 @@ func QueryTSDBHeadSeries(ctx context.Context, clusterName string) (float64, erro api := v1.NewAPI(c) queryContext, cancel := context.WithTimeout(ctx, 2*time.Minute) - val, _, err := api.Query(queryContext, fmt.Sprintf("max_over_time(count({cluster_id=\"%s\"})[6h])", clusterName), time.Now()) + query := fmt.Sprintf("max_over_time(count({cluster_id=\"%s\"})[6h])", clusterName) + val, _, err := api.Query(queryContext, query, time.Now()) cancel() if err != nil { return 0, err diff --git a/pkg/monitoring/prometheusagent/config.go b/pkg/monitoring/prometheusagent/config.go new file mode 100644 index 00000000..36b0b249 --- /dev/null +++ b/pkg/monitoring/prometheusagent/config.go @@ -0,0 +1,38 @@ +package prometheusagent + +import ( + "context" + "fmt" + "net" + + "github.com/pkg/errors" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + + "github.com/giantswarm/observability-operator/pkg/monitoring/mimir/querier" + "github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/shards" +) + +func getPrometheusAgentRemoteWriteConfigName(cluster *clusterv1.Cluster) string { + return fmt.Sprintf("%s-remote-write-config", cluster.Name) +} + +func getServicePriority(cluster *clusterv1.Cluster) string { + if servicePriority, ok := cluster.GetLabels()[servicePriorityLabel]; ok && servicePriority != "" { + return servicePriority + } + return defaultServicePriority +} + +// We want to compute the number of shards based on the number of nodes. 
+func getShardsCountForCluster(ctx context.Context, cluster *clusterv1.Cluster, currentShardCount int) (int, error) { + headSeries, err := querier.QueryTSDBHeadSeries(ctx, cluster.Name) + if err != nil { + // Verify that Prometheus is accessible. If not, return the default number of shards. + var dnsError *net.DNSError + if errors.As(err, &dnsError) { + return shards.ComputeShards(currentShardCount, defaultShards), nil + } + return 0, errors.WithStack(err) + } + return shards.ComputeShards(currentShardCount, headSeries), nil +} diff --git a/pkg/monitoring/prometheusagent/parsing.go b/pkg/monitoring/prometheusagent/parsing.go new file mode 100644 index 00000000..38217cdb --- /dev/null +++ b/pkg/monitoring/prometheusagent/parsing.go @@ -0,0 +1,36 @@ +package prometheusagent + +import ( + "gopkg.in/yaml.v2" + + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + + "github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/remotewrite" +) + +func readCurrentShardsFromConfig(configMap corev1.ConfigMap) (int, error) { + remoteWriteConfig := remotewrite.RemoteWriteConfig{} + err := yaml.Unmarshal([]byte(configMap.Data["values"]), &remoteWriteConfig) + if err != nil { + return 0, errors.WithStack(err) + } + + return remoteWriteConfig.PrometheusAgentConfig.Shards, nil +} + +func readRemoteWritePasswordFromSecret(secret corev1.Secret) (string, error) { + remoteWriteConfig := remotewrite.RemoteWriteConfig{} + err := yaml.Unmarshal(secret.Data["values"], &remoteWriteConfig) + if err != nil { + return "", errors.WithStack(err) + } + + for _, rw := range remoteWriteConfig.PrometheusAgentConfig.RemoteWrite { + if rw.Name == "prometheus-meta-operator" { + return rw.Password, nil + } + } + + return "", errors.New("remote write password not found in secret") +} diff --git a/pkg/monitoring/prometheusagent/secret.go b/pkg/monitoring/prometheusagent/secret.go new file mode 100644 index 00000000..b279bc88 --- /dev/null +++ b/pkg/monitoring/prometheusagent/secret.go 
@@ -0,0 +1,11 @@ +package prometheusagent + +import ( + "fmt" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" +) + +func getPrometheusAgentRemoteWriteSecretName(cluster *clusterv1.Cluster) string { + return fmt.Sprintf("%s-remote-write-secret", cluster.Name) +} diff --git a/pkg/monitoring/prometheusagent/service.go b/pkg/monitoring/prometheusagent/service.go index 29d8d18c..37a6cb88 100644 --- a/pkg/monitoring/prometheusagent/service.go +++ b/pkg/monitoring/prometheusagent/service.go @@ -3,11 +3,11 @@ package prometheusagent import ( "context" "fmt" - "net" "reflect" "github.com/go-logr/logr" "github.com/pkg/errors" + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "gopkg.in/yaml.v2" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -19,10 +19,9 @@ import ( "github.com/giantswarm/observability-operator/pkg/common" "github.com/giantswarm/observability-operator/pkg/common/organization" + "github.com/giantswarm/observability-operator/pkg/common/password" "github.com/giantswarm/observability-operator/pkg/monitoring" - "github.com/giantswarm/observability-operator/pkg/monitoring/mimir/querier" "github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/remotewrite" - "github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/shards" ) const ( @@ -33,19 +32,44 @@ const ( // servicePriorityLabel is the label used to determine the priority of a service. servicePriorityLabel string = "giantswarm.io/service-priority" + + remoteWriteEndpointTemplateURL = "https://%s/%s/api/v1/write" ) type PrometheusAgentService struct { client.Client organization.OrganizationRepository + PasswordManager password.Manager common.ManagementCluster PrometheusVersion string } // ensurePrometheusAgentRemoteWriteConfig ensures that the prometheus remote write config is present in the cluster. 
-func (pas *PrometheusAgentService) ReconcilePrometheusAgentRemoteWriteConfig(ctx context.Context, cluster *clusterv1.Cluster) error { +func (pas *PrometheusAgentService) ReconcileRemoteWriteConfig( + ctx context.Context, cluster *clusterv1.Cluster) error { + logger := log.FromContext(ctx).WithValues("cluster", cluster.Name) - logger.Info("ensuring prometheus remote write config") + logger.Info("ensuring prometheus agent remote write configuration") + + err := pas.createOrUpdateConfig(ctx, cluster, logger) + if err != nil { + logger.Error(err, "failed to create or update prometheus agent remote write config") + return errors.WithStack(err) + } + + err = pas.createOrUpdateSecret(ctx, cluster, logger) + if err != nil { + logger.Error(err, "failed to create or update prometheus agent remote write secret") + return errors.WithStack(err) + } + + logger.Info("ensured prometheus agent remote write configuration") + + return nil +} + +func (pas PrometheusAgentService) createOrUpdateConfig( + ctx context.Context, cluster *clusterv1.Cluster, logger logr.Logger) error { objectKey := client.ObjectKey{ Name: getPrometheusAgentRemoteWriteConfigName(cluster), @@ -83,35 +107,98 @@ func (pas *PrometheusAgentService) ReconcilePrometheusAgentRemoteWriteConfig(ctx return errors.WithStack(err) } } + return nil +} + +func (pas PrometheusAgentService) createOrUpdateSecret(ctx context.Context, cluster *clusterv1.Cluster, logger logr.Logger) error { + objectKey := client.ObjectKey{ + Name: getPrometheusAgentRemoteWriteSecretName(cluster), + Namespace: cluster.GetNamespace(), + } + + current := &corev1.Secret{} + // Get the current secret if it exists. 
+ err := pas.Client.Get(ctx, objectKey, current) + if apierrors.IsNotFound(err) { + logger.Info("generating password for the prometheus agent") + password, err := pas.PasswordManager.GeneratePassword(32) + if err != nil { + logger.Error(err, "failed to generate the prometheus agent password") + return errors.WithStack(err) + } + logger.Info("generated password for the prometheus agent") + + secret, err := pas.buildRemoteWriteSecret(ctx, cluster, logger, password) + if err != nil { + return errors.WithStack(err) + } + err = pas.Client.Create(ctx, secret) + return errors.WithStack(err) + } else if err != nil { + return errors.WithStack(err) + } - logger.Info("ensured prometheus remote write config") + // As it takes a long time to apply the new password to the agent due to a built-in delay in the app-platform, + // we keep the already generated remote write password. + password, err := readRemoteWritePasswordFromSecret(*current) + if err != nil { + return errors.WithStack(err) + } + + desired, err := pas.buildRemoteWriteSecret(ctx, cluster, logger, password) + if err != nil { + return errors.WithStack(err) + } + if !reflect.DeepEqual(current.Data, desired.Data) { + err = pas.Client.Patch(ctx, desired, client.MergeFrom(current)) + if err != nil { + return errors.WithStack(err) + } + } return nil } -func (pas *PrometheusAgentService) DeletePrometheusAgentRemoteWriteConfig(ctx context.Context, cluster *clusterv1.Cluster) error { +func (pas *PrometheusAgentService) DeleteRemoteWriteConfig(ctx context.Context, cluster *clusterv1.Cluster) error { logger := log.FromContext(ctx).WithValues("cluster", cluster.Name) - logger.Info("deleting prometheus remote write config") + logger.Info("deleting prometheus agent remote write configuration") + err := pas.deleteConfigMap(ctx, cluster) + if err != nil { + logger.Error(err, "failed to delete prometheus agent remote write config") + return errors.WithStack(err) + } + + err = pas.deleteSecret(ctx, cluster) + if err != nil { + 
logger.Error(err, "failed to delete prometheus agent remote write secret") + return errors.WithStack(err) + } + + logger.Info("deleted prometheus agent remote write configuration") + + return nil +} + +func (pas PrometheusAgentService) deleteConfigMap(ctx context.Context, cluster *clusterv1.Cluster) error { objectKey := client.ObjectKey{ Name: getPrometheusAgentRemoteWriteConfigName(cluster), Namespace: cluster.GetNamespace(), } - current := &corev1.ConfigMap{} // Get the current configmap if it exists. err := pas.Client.Get(ctx, objectKey, current) if apierrors.IsNotFound(err) { - // We ignore cases where the configmap is not found (it it was manually deleted for instance) + // Ignore cases where the configmap is not found (if it was manually deleted, for instance). return nil } else if err != nil { return errors.WithStack(err) } - desired := current.DeepCopy() // Delete the finalizer + desired := current.DeepCopy() controllerutil.RemoveFinalizer(desired, monitoring.MonitoringFinalizer) - err = pas.Client.Patch(ctx, current, client.MergeFrom(desired)) + err = pas.Client.Patch(ctx, desired, client.MergeFrom(current)) if err != nil { return errors.WithStack(err) } @@ -120,13 +207,42 @@ func (pas *PrometheusAgentService) DeletePrometheusAgentRemoteWriteConfig(ctx co if err != nil { return errors.WithStack(err) } + return nil +} - logger.Info("deleted prometheus remote write config") +func (pas PrometheusAgentService) deleteSecret(ctx context.Context, cluster *clusterv1.Cluster) error { + objectKey := client.ObjectKey{ + Name: getPrometheusAgentRemoteWriteSecretName(cluster), + Namespace: cluster.GetNamespace(), + } + current := &corev1.Secret{} + // Get the current secret if it exists. + err := pas.Client.Get(ctx, objectKey, current) + if apierrors.IsNotFound(err) { + // Ignore cases where the secret is not found (if it was manually deleted, for instance). 
+ return nil + } else if err != nil { + return errors.WithStack(err) + } + // Delete the finalizer + desired := current.DeepCopy() + controllerutil.RemoveFinalizer(desired, monitoring.MonitoringFinalizer) + err = pas.Client.Patch(ctx, desired, client.MergeFrom(current)) + if err != nil { + return errors.WithStack(err) + } + + err = pas.Client.Delete(ctx, desired) + if err != nil { + return errors.WithStack(err) + } return nil } -func (pas PrometheusAgentService) buildRemoteWriteConfig(ctx context.Context, cluster *clusterv1.Cluster, logger logr.Logger, currentShards int) (*corev1.ConfigMap, error) { +func (pas PrometheusAgentService) buildRemoteWriteConfig(ctx context.Context, + cluster *clusterv1.Cluster, logger logr.Logger, currentShards int) (*corev1.ConfigMap, error) { + organization, err := pas.OrganizationRepository.Read(ctx, cluster) if err != nil { logger.Error(err, "failed to get cluster organization") @@ -189,37 +305,52 @@ func (pas PrometheusAgentService) buildRemoteWriteConfig(ctx context.Context, cl }, nil } -func getPrometheusAgentRemoteWriteConfigName(cluster *clusterv1.Cluster) string { - return fmt.Sprintf("%s-remote-write-config", cluster.Name) -} +func (pas PrometheusAgentService) buildRemoteWriteSecret(ctx context.Context, + cluster *clusterv1.Cluster, logger logr.Logger, password string) (*corev1.Secret, error) { -func readCurrentShardsFromConfig(configMap corev1.ConfigMap) (int, error) { - remoteWriteConfig := remotewrite.RemoteWriteConfig{} - err := yaml.Unmarshal([]byte(configMap.Data["values"]), &remoteWriteConfig) - if err != nil { - return 0, errors.WithStack(err) + url := fmt.Sprintf(remoteWriteEndpointTemplateURL, pas.ManagementCluster.BaseDomain, cluster.Name) + remoteWriteConfig := remotewrite.RemoteWriteConfig{ + PrometheusAgentConfig: remotewrite.PrometheusAgentConfig{ + RemoteWrite: []remotewrite.RemoteWrite{ + { + RemoteWriteSpec: promv1.RemoteWriteSpec{ + URL: url, + Name: "prometheus-meta-operator", + RemoteTimeout: "60s", + 
QueueConfig: &promv1.QueueConfig{ + Capacity: 30000, + MaxSamplesPerSend: 150000, + MaxShards: 10, + }, + TLSConfig: &promv1.TLSConfig{ + SafeTLSConfig: promv1.SafeTLSConfig{ + InsecureSkipVerify: pas.ManagementCluster.InsecureCA, + }, + }, + }, + Username: cluster.Name, + Password: password, + }, + }, + }, } - return remoteWriteConfig.PrometheusAgentConfig.Shards, nil -} - -// We want to compute the number of shards based on the number of nodes. -func getShardsCountForCluster(ctx context.Context, cluster *clusterv1.Cluster, currentShardCount int) (int, error) { - headSeries, err := querier.QueryTSDBHeadSeries(ctx, cluster.Name) + marshalledValues, err := yaml.Marshal(remoteWriteConfig) if err != nil { - // If prometheus is not accessible (for instance, not running because this is a new cluster, we check if prometheus is accessible) - var dnsError *net.DNSError - if errors.As(err, &dnsError) { - return shards.ComputeShards(currentShardCount, defaultShardCount), nil - } - return 0, errors.WithStack(err) + return nil, errors.WithStack(err) } - return shards.ComputeShards(currentShardCount, headSeries), nil -} -func getServicePriority(cluster *clusterv1.Cluster) string { - if servicePriority, ok := cluster.GetLabels()[servicePriorityLabel]; ok && servicePriority != "" { - return servicePriority - } - return defaultServicePriority + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: getPrometheusAgentRemoteWriteSecretName(cluster), + Namespace: cluster.Namespace, + Finalizers: []string{ + monitoring.MonitoringFinalizer, + }, + }, + Data: map[string][]byte{ + "values": marshalledValues, + }, + Type: "Opaque", + }, nil }