diff --git a/go.mod b/go.mod
index d5602a74..33b057c9 100644
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module github.com/giantswarm/observability-operator
 
-go 1.21
+go 1.22.0
 
 require (
 	github.com/go-logr/logr v1.4.1
diff --git a/helm/observability-operator/templates/deployment.yaml b/helm/observability-operator/templates/deployment.yaml
index 1bea8099..d21c7ab8 100644
--- a/helm/observability-operator/templates/deployment.yaml
+++ b/helm/observability-operator/templates/deployment.yaml
@@ -24,13 +24,15 @@ spec:
         image: "{{ .Values.image.registry }}/{{ .Values.image.name }}:{{ default .Chart.Version .Values.image.tag }}"
         args:
           - --leader-elect
+          - --management-cluster-base-domain={{ $.Values.managementCluster.baseDomain }}
           - --management-cluster-customer={{ $.Values.managementCluster.customer }}
+          - --management-cluster-insecure-ca={{ $.Values.managementCluster.insecureCA }}
           - --management-cluster-name={{ $.Values.managementCluster.name }}
           - --management-cluster-pipeline={{ $.Values.managementCluster.pipeline }}
           - --management-cluster-region={{ $.Values.managementCluster.region }}
           - --monitoring-enabled={{ $.Values.monitoring.enabled }}
         {{- if .Values.monitoring.prometheusVersion }}
-          - --prometheus-version={{ $.Values.monitoring.prometheusVersion | quote }}
+          - --prometheus-version={{ $.Values.monitoring.prometheusVersion }}
         {{- end }}
         env:
           - name: OPSGENIE_API_KEY
diff --git a/helm/observability-operator/templates/rbac.yaml b/helm/observability-operator/templates/rbac.yaml
index 20bce7f6..b6d04c5f 100644
--- a/helm/observability-operator/templates/rbac.yaml
+++ b/helm/observability-operator/templates/rbac.yaml
@@ -8,16 +8,16 @@ rules:
   - apiGroups:
       - ""
     resources:
-      - secrets
-      - configmaps
+      - namespaces
     verbs:
-      - create
-      - update
-      - delete
-      - deletecollection
-      - get
       - list
       - watch
+  - apiGroups:
+      - ""
+    resources:
+      - secrets
+      - configmaps
+    verbs: ["*"]
   - apiGroups:
       - cluster.x-k8s.io
     resources:
diff --git a/helm/observability-operator/values.schema.json b/helm/observability-operator/values.schema.json
index 4be449fe..fa4f3a8d 100644
--- a/helm/observability-operator/values.schema.json
+++ b/helm/observability-operator/values.schema.json
@@ -32,9 +32,15 @@
         "managementCluster": {
             "type": "object",
             "properties": {
+                "baseDomain": {
+                    "type": "string"
+                },
                 "customer": {
                     "type": "string"
                 },
+                "insecureCA": {
+                    "type": "boolean"
+                },
                 "name": {
                     "type": "string"
                 },
@@ -54,6 +60,9 @@
         },
         "opsgenieApiKey": {
             "type": "string"
+        },
+        "prometheusVersion": {
+            "type": "string"
         }
     }
 },
diff --git a/helm/observability-operator/values.yaml b/helm/observability-operator/values.yaml
index 8e02a133..2d4231e0 100644
--- a/helm/observability-operator/values.yaml
+++ b/helm/observability-operator/values.yaml
@@ -8,7 +8,9 @@ image:
   tag: ""
 
 managementCluster:
+  baseDomain: domain
   customer: customer
+  insecureCA: false
   name: name
   pipeline: pipeline
   region: region
diff --git a/internal/controller/cluster_monitoring_controller.go b/internal/controller/cluster_monitoring_controller.go
index e80d490e..4bbd449c 100644
--- a/internal/controller/cluster_monitoring_controller.go
+++ b/internal/controller/cluster_monitoring_controller.go
@@ -18,6 +18,7 @@ package controller
 
 import (
 	"context"
+	"time"
 
 	"github.com/pkg/errors"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -112,15 +113,15 @@ func (r *ClusterMonitoringReconciler) reconcile(ctx context.Context, cluster *cl
 		err := r.HeartbeatRepository.CreateOrUpdate(ctx)
 		if err != nil {
 			logger.Error(err, "failed to create or update heartbeat")
-			return ctrl.Result{Requeue: true}, errors.WithStack(err)
+			return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err)
 		}
 	}
 
 	// Create or update PrometheusAgent remote write configuration.
-	err := r.PrometheusAgentService.ReconcilePrometheusAgentRemoteWriteConfig(ctx, cluster)
+	err := r.PrometheusAgentService.ReconcileRemoteWriteConfig(ctx, cluster)
 	if err != nil {
 		logger.Error(err, "failed to create or update prometheus agent remote write config")
-		return ctrl.Result{Requeue: true}, errors.WithStack(err)
+		return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err)
 	}
 
 	return ctrl.Result{}, nil
@@ -134,14 +135,14 @@ func (r *ClusterMonitoringReconciler) reconcileDelete(ctx context.Context, clust
 		err := r.HeartbeatRepository.Delete(ctx)
 		if err != nil {
 			logger.Error(err, "failed to delete heartbeat")
-			return ctrl.Result{Requeue: true}, errors.WithStack(err)
+			return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err)
 		}
 	}
 
-	err := r.PrometheusAgentService.DeletePrometheusAgentRemoteWriteConfig(ctx, cluster)
+	err := r.PrometheusAgentService.DeleteRemoteWriteConfig(ctx, cluster)
 	if err != nil {
 		logger.Error(err, "failed to delete prometheus agent remote write config")
-		return ctrl.Result{Requeue: true}, errors.WithStack(err)
+		return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err)
 	}
 
 	// We get the latest state of the object to avoid race conditions.
@@ -153,7 +154,7 @@ func (r *ClusterMonitoringReconciler) reconcileDelete(ctx context.Context, clust
 			// We need to requeue if we fail to remove the finalizer because of race conditions between multiple operators.
 			// This will be eventually consistent.
 			logger.Error(err, "failed to remove finalizer, requeuing", "finalizer", monitoring.MonitoringFinalizer)
-			return ctrl.Result{Requeue: true}, nil
+			return ctrl.Result{RequeueAfter: 5 * time.Minute}, nil
 		}
 		logger.Info("removed finalizer", "finalizer", monitoring.MonitoringFinalizer)
 	}
diff --git a/main.go b/main.go
index 1a90c946..1d40ecd8 100644
--- a/main.go
+++ b/main.go
@@ -40,6 +40,7 @@
 	"github.com/giantswarm/observability-operator/internal/controller"
 	"github.com/giantswarm/observability-operator/pkg/common"
 	"github.com/giantswarm/observability-operator/pkg/common/organization"
+	"github.com/giantswarm/observability-operator/pkg/common/password"
 	"github.com/giantswarm/observability-operator/pkg/monitoring/heartbeat"
 	"github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent"
 	//+kubebuilder:scaffold:imports
@@ -49,17 +50,19 @@ var (
 	scheme   = runtime.NewScheme()
 	setupLog = ctrl.Log.WithName("setup")
 
-	metricsAddr               string
-	enableLeaderElection      bool
-	probeAddr                 string
-	secureMetrics             bool
-	enableHTTP2               bool
-	managementClusterCustomer string
-	managementClusterName     string
-	managementClusterPipeline string
-	managementClusterRegion   string
-	monitoringEnabled         bool
-	prometheusVersion         string
+	metricsAddr                 string
+	enableLeaderElection        bool
+	probeAddr                   string
+	secureMetrics               bool
+	enableHTTP2                 bool
+	managementClusterBaseDomain string
+	managementClusterCustomer   string
+	managementClusterInsecureCA bool
+	managementClusterName       string
+	managementClusterPipeline   string
+	managementClusterRegion     string
+	monitoringEnabled           bool
+	prometheusVersion           string
 )
 
 const (
@@ -85,8 +88,12 @@ func main() {
 		"If set the metrics endpoint is served securely")
 	flag.BoolVar(&enableHTTP2, "enable-http2", false,
 		"If set, HTTP/2 will be enabled for the metrics and webhook servers")
+	flag.StringVar(&managementClusterBaseDomain, "management-cluster-base-domain", "",
"management-cluster-base-domain", "", + "The base domain of the management cluster.") flag.StringVar(&managementClusterCustomer, "management-cluster-customer", "", "The customer of the management cluster.") + flag.BoolVar(&managementClusterInsecureCA, "management-cluster-insecure-ca", false, + "Flag to indicate if the management cluster has an insecure CA that should be trusted") flag.StringVar(&managementClusterName, "management-cluster-name", "", "The name of the management cluster.") flag.StringVar(&managementClusterPipeline, "management-cluster-pipeline", "", @@ -157,10 +164,12 @@ func main() { record.InitFromRecorder(mgr.GetEventRecorderFor("observability-operator")) var managementCluster common.ManagementCluster = common.ManagementCluster{ - Customer: managementClusterCustomer, - Name: managementClusterName, - Pipeline: managementClusterPipeline, - Region: managementClusterRegion, + BaseDomain: managementClusterBaseDomain, + Customer: managementClusterCustomer, + InsecureCA: managementClusterInsecureCA, + Name: managementClusterName, + Pipeline: managementClusterPipeline, + Region: managementClusterRegion, } var opsgenieApiKey = os.Getenv(OpsgenieApiKey) @@ -181,11 +190,12 @@ func main() { prometheusAgentService := prometheusagent.PrometheusAgentService{ Client: mgr.GetClient(), OrganizationRepository: organizationRepository, + PasswordManager: password.SimpleManager{}, ManagementCluster: managementCluster, PrometheusVersion: prometheusVersion, } - if err = (&controller.ClusterMonitoringReconciler{ + if err = (&controller.ClusterMonitoringReconciler{ Client: mgr.GetClient(), ManagementCluster: managementCluster, HeartbeatRepository: heartbeatRepository, diff --git a/pkg/common/password/manager.go b/pkg/common/password/manager.go new file mode 100644 index 00000000..80c2726b --- /dev/null +++ b/pkg/common/password/manager.go @@ -0,0 +1,21 @@ +package password + +import ( + "crypto/rand" + "encoding/hex" +) + +type Manager interface { + GeneratePassword(length int) (string, error) +} + +type SimpleManager struct { +} + +func (m SimpleManager) GeneratePassword(length int) (string, error) { + bytes := make([]byte, length) + if _, err := rand.Read(bytes); err != nil { + return "", err + } + return hex.EncodeToString(bytes), nil +} diff --git a/pkg/common/types.go b/pkg/common/types.go index f2b5c382..96fd8a95 100644 --- a/pkg/common/types.go +++ b/pkg/common/types.go @@ -33,8 +33,12 @@ const ( ) type ManagementCluster struct { + // BaseDomain is the base domain of the management cluster. + BaseDomain string // Customer is the customer name of the management cluster. Customer string + // InsecureCA is a flag to indicate if the management cluster has an insecure CA that should be truster + InsecureCA bool // Name is the name of the management cluster. Name string // Pipeline is the pipeline name of the management cluster. diff --git a/pkg/monitoring/finalizers.go b/pkg/monitoring/finalizers.go index d03d5298..39adfdc8 100644 --- a/pkg/monitoring/finalizers.go +++ b/pkg/monitoring/finalizers.go @@ -1,4 +1,4 @@ package monitoring // MonitoringFinalizer is the finalizer for monitoring resources. 
-const MonitoringFinalizer = "monitoring.giantswarm.io"
+const MonitoringFinalizer = "observability.giantswarm.io/monitoring"
diff --git a/pkg/monitoring/mimir/querier/querier.go b/pkg/monitoring/mimir/querier/querier.go
index 83c32e27..276279af 100644
--- a/pkg/monitoring/mimir/querier/querier.go
+++ b/pkg/monitoring/mimir/querier/querier.go
@@ -4,7 +4,6 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"net/http"
 	"time"
 
 	"github.com/prometheus/client_golang/api"
@@ -12,33 +11,10 @@ import (
 	"github.com/prometheus/common/model"
 )
 
-// headerAdder is an http.RoundTripper that adds additional headers to the request
-type headerAdder struct {
-	headers map[string][]string
-
-	rt http.RoundTripper
-}
-
-func (h *headerAdder) RoundTrip(req *http.Request) (*http.Response, error) {
-	for k, vv := range h.headers {
-		for _, v := range vv {
-			req.Header.Add(k, v)
-		}
-	}
-	return h.rt.RoundTrip(req)
-}
-
 // QueryTSDBHeadSeries performs an instant query against Mimir.
 func QueryTSDBHeadSeries(ctx context.Context, clusterName string) (float64, error) {
-	headerAdder := &headerAdder{
-		headers: map[string][]string{
-			"X-Org-Id": {"anonynous"},
-		},
-		rt: http.DefaultTransport,
-	}
 	config := api.Config{
-		Address:      "http://mimir-gateway.mimir.svc/prometheus",
-		RoundTripper: headerAdder,
+		Address: "http://mimir-gateway.mimir.svc/prometheus",
 	}
 
 	// Create new client.
@@ -51,7 +27,8 @@ func QueryTSDBHeadSeries(ctx context.Context, clusterName string) (float64, erro
 	api := v1.NewAPI(c)
 
 	queryContext, cancel := context.WithTimeout(ctx, 2*time.Minute)
-	val, _, err := api.Query(queryContext, fmt.Sprintf("max_over_time(count({cluster_id=\"%s\"})[6h])", clusterName), time.Now())
+	query := fmt.Sprintf("sum(max_over_time(prometheus_agent_active_series{cluster_id=\"%s\"}[6h]))", clusterName)
+	val, _, err := api.Query(queryContext, query, time.Now())
 	cancel()
 	if err != nil {
 		return 0, err
@@ -59,7 +36,16 @@ func QueryTSDBHeadSeries(ctx context.Context, clusterName string) (float64, erro
 
 	switch val.Type() {
 	case model.ValVector:
-		vector := val.(model.Vector)
+		vector, ok := val.(model.Vector)
+		if !ok {
+			return 0, errors.New("failed to convert value to vector")
+		}
+		if len(vector) == 0 {
+			return 0, errors.New("no time series found")
+		}
+		if len(vector) > 1 {
+			return 0, errors.New("more than one time series found")
+		}
 		return float64(vector[0].Value), nil
 	default:
 		return 0, errors.New("failed to get current number of time series")
diff --git a/pkg/monitoring/prometheusagent/config.go b/pkg/monitoring/prometheusagent/config.go
new file mode 100644
index 00000000..f0195fcd
--- /dev/null
+++ b/pkg/monitoring/prometheusagent/config.go
@@ -0,0 +1,119 @@
+package prometheusagent
+
+import (
+	"context"
+	"fmt"
+	"net"
+
+	"github.com/go-logr/logr"
+	"github.com/pkg/errors"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/yaml"
+
+	"github.com/giantswarm/observability-operator/pkg/common"
+	"github.com/giantswarm/observability-operator/pkg/monitoring"
+	"github.com/giantswarm/observability-operator/pkg/monitoring/mimir/querier"
+	"github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/shards"
+)
+
+func (pas PrometheusAgentService) buildRemoteWriteConfig(ctx context.Context,
+	cluster *clusterv1.Cluster, logger logr.Logger, currentShards int) (*corev1.ConfigMap, error) {
+
+	organization, err := pas.OrganizationRepository.Read(ctx, cluster)
+	if err != nil {
+		logger.Error(err, "failed to get cluster organization")
+		return nil, errors.WithStack(err)
+	}
+
+	provider, err := common.GetClusterProvider(cluster)
+	if err != nil {
+		logger.Error(err, "failed to get cluster provider")
+		return nil, errors.WithStack(err)
+	}
+
+	clusterType := "workload_cluster"
+	if val, ok := cluster.Labels["cluster.x-k8s.io/cluster-name"]; ok && val == pas.ManagementCluster.Name {
+		clusterType = "management_cluster"
+	}
+
+	externalLabels := map[string]string{
+		"cluster_id":       cluster.Name,
+		"cluster_type":     clusterType,
+		"customer":         pas.ManagementCluster.Customer,
+		"installation":     pas.ManagementCluster.Name,
+		"organization":     organization,
+		"pipeline":         pas.ManagementCluster.Pipeline,
+		"provider":         provider,
+		"region":           pas.ManagementCluster.Region,
+		"service_priority": getServicePriority(cluster),
+	}
+
+	shards, err := getShardsCountForCluster(ctx, cluster, currentShards)
+	if err != nil {
+		return nil, errors.WithStack(err)
+	}
+
+	config, err := yaml.Marshal(RemoteWriteConfig{
+		PrometheusAgentConfig: PrometheusAgentConfig{
+			ExternalLabels: externalLabels,
+			Image: PrometheusAgentImage{
+				Tag: pas.PrometheusVersion,
+			},
+			Shards:  shards,
+			Version: pas.PrometheusVersion,
+		},
+	})
+	if err != nil {
+		return nil, errors.WithStack(err)
+	}
+
+	return &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      getPrometheusAgentRemoteWriteConfigName(cluster),
+			Namespace: cluster.Namespace,
+			Finalizers: []string{
+				monitoring.MonitoringFinalizer,
+			},
+		},
+		Data: map[string]string{
+			"values": string(config),
+		},
+	}, nil
+}
+
+func getPrometheusAgentRemoteWriteConfigName(cluster *clusterv1.Cluster) string {
+	return fmt.Sprintf("%s-remote-write-config", cluster.Name)
+}
+
+func getServicePriority(cluster *clusterv1.Cluster) string {
+	if servicePriority, ok := cluster.GetLabels()[servicePriorityLabel]; ok && servicePriority != "" {
+		return servicePriority
+	}
+	return defaultServicePriority
+}
+
+// We want to compute the number of shards based on the number of time series currently stored for the cluster.
+func getShardsCountForCluster(ctx context.Context, cluster *clusterv1.Cluster, currentShardCount int) (int, error) {
+	headSeries, err := querier.QueryTSDBHeadSeries(ctx, cluster.Name)
+	if err != nil {
+		// If Mimir is not accessible (e.g. not deployed yet, so its DNS name does not resolve), fall back to the default number of shards.
+		var dnsError *net.DNSError
+		if errors.As(err, &dnsError) {
+			return shards.ComputeShards(currentShardCount, defaultShards), nil
+		}
+		return 0, errors.WithStack(err)
+	}
+	return shards.ComputeShards(currentShardCount, headSeries), nil
+}
+
+func readCurrentShardsFromConfig(configMap corev1.ConfigMap) (int, error) {
+	remoteWriteConfig := RemoteWriteConfig{}
+	err := yaml.Unmarshal([]byte(configMap.Data["values"]), &remoteWriteConfig)
+	if err != nil {
+		return 0, errors.WithStack(err)
+	}
+
+	return remoteWriteConfig.PrometheusAgentConfig.Shards, nil
+}
diff --git a/pkg/monitoring/prometheusagent/remotewrite/types.go b/pkg/monitoring/prometheusagent/remotewrite/types.go
deleted file mode 100644
index 309cf566..00000000
--- a/pkg/monitoring/prometheusagent/remotewrite/types.go
+++ /dev/null
@@ -1,27 +0,0 @@
-package remotewrite
-
-import (
-	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
-)
-
-type RemoteWriteConfig struct {
-	PrometheusAgentConfig PrometheusAgentConfig `yaml:"prometheus-agent,omitempty" json:"prometheus-agent,omitempty"`
-}
-
-type PrometheusAgentConfig struct {
-	ExternalLabels map[string]string    `yaml:"externalLabels,omitempty" json:"externalLabels,omitempty"`
-	Image          PrometheusAgentImage `yaml:"image,omitempty" json:"image,omitempty"`
-	RemoteWrite    []RemoteWrite        `yaml:"remoteWrite,omitempty" json:"remoteWrite,omitempty"`
-	Shards         int                  `yaml:"shards,omitempty" json:"shards,omitempty"`
-	Version        string               `yaml:"version,omitempty" json:"version,omitempty"`
-}
-
-type PrometheusAgentImage struct {
-	Tag string `yaml:"tag" json:"tag"`
-}
-
-type RemoteWrite struct {
-	promv1.RemoteWriteSpec `yaml:",inline" json:",inline"`
-	Password               string `yaml:"password" json:"password"`
-	Username               string `yaml:"username" json:"username"`
-}
diff --git a/pkg/monitoring/prometheusagent/secret.go b/pkg/monitoring/prometheusagent/secret.go
new file mode 100644
index 00000000..42d89d3d
--- /dev/null
+++ b/pkg/monitoring/prometheusagent/secret.go
@@ -0,0 +1,83 @@
+package prometheusagent
+
+import (
+	"fmt"
+
+	"github.com/pkg/errors"
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/yaml"
+
+	"github.com/giantswarm/observability-operator/pkg/monitoring"
+)
+
+func getPrometheusAgentRemoteWriteSecretName(cluster *clusterv1.Cluster) string {
+	return fmt.Sprintf("%s-remote-write-secret", cluster.Name)
+}
+
+// buildRemoteWriteSecret builds the secret that contains the remote write configuration for the Prometheus agent.
+func (pas PrometheusAgentService) buildRemoteWriteSecret(
+	cluster *clusterv1.Cluster, password string) (*corev1.Secret, error) {
+
+	url := fmt.Sprintf(remoteWriteEndpointTemplateURL, pas.ManagementCluster.BaseDomain, cluster.Name)
+	config := RemoteWriteConfig{
+		PrometheusAgentConfig: PrometheusAgentConfig{
+			RemoteWrite: []RemoteWrite{
+				{
+					URL:           url,
+					Name:          remoteWriteName,
+					RemoteTimeout: "60s",
+					QueueConfig: promv1.QueueConfig{
+						Capacity:          30000,
+						MaxSamplesPerSend: 150000,
+						MaxShards:         10,
+					},
+					TLSConfig: promv1.TLSConfig{
+						SafeTLSConfig: promv1.SafeTLSConfig{
+							InsecureSkipVerify: pas.ManagementCluster.InsecureCA,
+						},
+					},
+					Username: cluster.Name,
+					Password: password,
+				},
+			},
+		},
+	}
+
+	marshalledValues, err := yaml.Marshal(config)
+	if err != nil {
+		return nil, errors.WithStack(err)
+	}
+
+	return &corev1.Secret{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      getPrometheusAgentRemoteWriteSecretName(cluster),
+			Namespace: cluster.Namespace,
+			Finalizers: []string{
+				monitoring.MonitoringFinalizer,
+			},
+		},
+		Data: map[string][]byte{
+			"values": marshalledValues,
+		},
+		Type: "Opaque",
+	}, nil
+}
+
+func readRemoteWritePasswordFromSecret(secret corev1.Secret) (string, error) {
+	remoteWriteConfig := RemoteWriteConfig{}
+	err := yaml.Unmarshal(secret.Data["values"], &remoteWriteConfig)
+	if err != nil {
+		return "", errors.WithStack(err)
+	}
+
+	for _, rw := range remoteWriteConfig.PrometheusAgentConfig.RemoteWrite {
+		if rw.Name == remoteWriteName {
+			return rw.Password, nil
+		}
+	}
+
+	return "", errors.New("remote write password not found in secret")
+}
diff --git a/pkg/monitoring/prometheusagent/service.go b/pkg/monitoring/prometheusagent/service.go
index 29d8d18c..027ad661 100644
--- a/pkg/monitoring/prometheusagent/service.go
+++ b/pkg/monitoring/prometheusagent/service.go
@@ -2,16 +2,12 @@ package prometheusagent
 
 import (
 	"context"
-	"fmt"
-	"net"
 	"reflect"
 
 	"github.com/go-logr/logr"
 	"github.com/pkg/errors"
-	"gopkg.in/yaml.v2"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
@@ -19,33 +15,44 @@
 
 	"github.com/giantswarm/observability-operator/pkg/common"
 	"github.com/giantswarm/observability-operator/pkg/common/organization"
+	"github.com/giantswarm/observability-operator/pkg/common/password"
 	"github.com/giantswarm/observability-operator/pkg/monitoring"
-	"github.com/giantswarm/observability-operator/pkg/monitoring/mimir/querier"
-	"github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/remotewrite"
-	"github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/shards"
-)
-
-const (
-	// defaultServicePriority is the default service priority if not set.
-	defaultServicePriority string = "highest"
-	// defaultShards is the default number of shards to use.
-	defaultShards = 1
-
-	// servicePriorityLabel is the label used to determine the priority of a service.
-	servicePriorityLabel string = "giantswarm.io/service-priority"
 )
 
 type PrometheusAgentService struct {
 	client.Client
 	organization.OrganizationRepository
+	PasswordManager password.Manager
 	common.ManagementCluster
 	PrometheusVersion string
 }
 
-// ensurePrometheusAgentRemoteWriteConfig ensures that the prometheus remote write config is present in the cluster.
-func (pas *PrometheusAgentService) ReconcilePrometheusAgentRemoteWriteConfig(ctx context.Context, cluster *clusterv1.Cluster) error {
+// ReconcileRemoteWriteConfig ensures that the prometheus agent remote write configuration is present in the cluster.
+func (pas *PrometheusAgentService) ReconcileRemoteWriteConfig(
+	ctx context.Context, cluster *clusterv1.Cluster) error {
+
 	logger := log.FromContext(ctx).WithValues("cluster", cluster.Name)
-	logger.Info("ensuring prometheus remote write config")
+	logger.Info("ensuring prometheus agent remote write configuration")
+
+	err := pas.createOrUpdateConfig(ctx, cluster, logger)
+	if err != nil {
+		logger.Error(err, "failed to create or update prometheus agent remote write config")
+		return errors.WithStack(err)
+	}
+
+	err = pas.createOrUpdateSecret(ctx, cluster, logger)
+	if err != nil {
+		logger.Error(err, "failed to create or update prometheus agent remote write secret")
+		return errors.WithStack(err)
+	}
+
+	logger.Info("ensured prometheus agent remote write configuration")
+
+	return nil
+}
+
+func (pas PrometheusAgentService) createOrUpdateConfig(ctx context.Context,
+	cluster *clusterv1.Cluster, logger logr.Logger) error {
 
 	objectKey := client.ObjectKey{
 		Name:      getPrometheusAgentRemoteWriteConfigName(cluster),
@@ -62,7 +69,10 @@ func (pas *PrometheusAgentService) ReconcilePrometheusAgentRemoteWriteConfig(ctx
 		}
 
 		err = pas.Client.Create(ctx, configMap)
-		return errors.WithStack(err)
+		if err != nil {
+			return errors.WithStack(err)
+		}
+		return nil
 	} else if err != nil {
 		return errors.WithStack(err)
 	}
@@ -77,149 +87,146 @@ func (pas *PrometheusAgentService) ReconcilePrometheusAgentRemoteWriteConfig(ctx
 		return errors.WithStack(err)
 	}
 
-	if !reflect.DeepEqual(current.Data, desired.Data) {
-		err = pas.Client.Patch(ctx, current, client.MergeFrom(desired))
+	if !reflect.DeepEqual(current.Data, desired.Data) || !reflect.DeepEqual(current.Finalizers, desired.Finalizers) {
+		err = pas.Client.Update(ctx, desired)
 		if err != nil {
+			logger.Info("could not update prometheus agent remote write configuration")
 			return errors.WithStack(err)
 		}
 	}
-
-	logger.Info("ensured prometheus remote write config")
-
 	return nil
 }
 
-func (pas *PrometheusAgentService) DeletePrometheusAgentRemoteWriteConfig(ctx context.Context, cluster *clusterv1.Cluster) error {
-	logger := log.FromContext(ctx).WithValues("cluster", cluster.Name)
-	logger.Info("deleting prometheus remote write config")
-
+func (pas PrometheusAgentService) createOrUpdateSecret(ctx context.Context,
+	cluster *clusterv1.Cluster, logger logr.Logger) error {
 	objectKey := client.ObjectKey{
-		Name:      getPrometheusAgentRemoteWriteConfigName(cluster),
+		Name:      getPrometheusAgentRemoteWriteSecretName(cluster),
 		Namespace: cluster.GetNamespace(),
 	}
 
-	current := &corev1.ConfigMap{}
-	// Get the current configmap if it exists.
+	current := &corev1.Secret{}
+	// Get the current secret if it exists.
 	err := pas.Client.Get(ctx, objectKey, current)
 	if apierrors.IsNotFound(err) {
-		// We ignore cases where the configmap is not found (it it was manually deleted for instance)
+		logger.Info("generating password for the prometheus agent")
+		password, err := pas.PasswordManager.GeneratePassword(32)
+		if err != nil {
+			logger.Error(err, "failed to generate the prometheus agent password")
+			return errors.WithStack(err)
+		}
+		logger.Info("generated password for the prometheus agent")
+
+		secret, err := pas.buildRemoteWriteSecret(cluster, password)
+		if err != nil {
+			return errors.WithStack(err)
+		}
+		err = pas.Client.Create(ctx, secret)
+		if err != nil {
+			return errors.WithStack(err)
+		}
 		return nil
 	} else if err != nil {
 		return errors.WithStack(err)
 	}
 
-	desired := current.DeepCopy()
-	// Delete the finalizer
-	controllerutil.RemoveFinalizer(desired, monitoring.MonitoringFinalizer)
-	err = pas.Client.Patch(ctx, current, client.MergeFrom(desired))
+	// As it takes a long time to apply the new password to the agent due to a built-in delay in the app-platform,
+	// we keep the already generated remote write password.
+	password, err := readRemoteWritePasswordFromSecret(*current)
 	if err != nil {
 		return errors.WithStack(err)
 	}
 
-	err = pas.Client.Delete(ctx, desired)
+	desired, err := pas.buildRemoteWriteSecret(cluster, password)
 	if err != nil {
 		return errors.WithStack(err)
 	}
-
-	logger.Info("deleted prometheus remote write config")
+	if !reflect.DeepEqual(current.Data, desired.Data) || !reflect.DeepEqual(current.Finalizers, desired.Finalizers) {
+		err = pas.Client.Update(ctx, desired)
+		if err != nil {
+			return errors.WithStack(err)
+		}
+	}
 
 	return nil
 }
 
-func (pas PrometheusAgentService) buildRemoteWriteConfig(ctx context.Context, cluster *clusterv1.Cluster, logger logr.Logger, currentShards int) (*corev1.ConfigMap, error) {
-	organization, err := pas.OrganizationRepository.Read(ctx, cluster)
-	if err != nil {
-		logger.Error(err, "failed to get cluster organization")
-		return nil, errors.WithStack(err)
-	}
+func (pas *PrometheusAgentService) DeleteRemoteWriteConfig(ctx context.Context, cluster *clusterv1.Cluster) error {
+	logger := log.FromContext(ctx).WithValues("cluster", cluster.Name)
+	logger.Info("deleting prometheus agent remote write configuration")
 
-	provider, err := common.GetClusterProvider(cluster)
+	err := pas.deleteConfigMap(ctx, cluster)
 	if err != nil {
-		logger.Error(err, "failed to get cluster provider")
-		return nil, errors.WithStack(err)
+		logger.Error(err, "failed to delete prometheus agent remote write config")
+		return errors.WithStack(err)
 	}
 
-	clusterType := "workload_cluster"
-	if val, ok := cluster.Labels["cluster.x-k8s.io/cluster-name"]; ok && val == pas.ManagementCluster.Name {
-		clusterType = "management_cluster"
+	err = pas.deleteSecret(ctx, cluster)
+	if err != nil {
+		logger.Error(err, "failed to delete prometheus agent remote write secret")
+		return errors.WithStack(err)
 	}
 
-	externalLabels := map[string]string{
-		"cluster_id":       cluster.Name,
-		"cluster_type":     clusterType,
-		"customer":         pas.ManagementCluster.Customer,
-		"installation":     pas.ManagementCluster.Name,
-		"organization":     organization,
-		"pipeline":         pas.ManagementCluster.Pipeline,
-		"provider":         provider,
-		"region":           pas.ManagementCluster.Region,
-		"service_priority": getServicePriority(cluster),
-	}
+	logger.Info("deleted prometheus agent remote write configuration")
 
-	shards, err := getShardsCountForCluster(ctx, cluster, currentShards)
-	if err != nil {
-		return nil, errors.WithStack(err)
-	}
-
-	config, err := yaml.Marshal(remotewrite.RemoteWriteConfig{
-		PrometheusAgentConfig: remotewrite.PrometheusAgentConfig{
-			ExternalLabels: externalLabels,
-			Image: remotewrite.PrometheusAgentImage{
-				Tag: pas.PrometheusVersion,
-			},
-			Shards:  shards,
-			Version: pas.PrometheusVersion,
-		},
-	})
-	if err != nil {
-		return nil, errors.WithStack(err)
-	}
-
-	return &corev1.ConfigMap{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      getPrometheusAgentRemoteWriteConfigName(cluster),
-			Namespace: cluster.Namespace,
-			Finalizers: []string{
-				monitoring.MonitoringFinalizer,
-			},
-		},
-		Data: map[string]string{
-			"values": string(config),
-		},
-	}, nil
+	return nil
 }
 
-func getPrometheusAgentRemoteWriteConfigName(cluster *clusterv1.Cluster) string {
-	return fmt.Sprintf("%s-remote-write-config", cluster.Name)
-}
+func (pas PrometheusAgentService) deleteConfigMap(ctx context.Context, cluster *clusterv1.Cluster) error {
+	objectKey := client.ObjectKey{
+		Name:      getPrometheusAgentRemoteWriteConfigName(cluster),
+		Namespace: cluster.GetNamespace(),
+	}
+	current := &corev1.ConfigMap{}
+	// Get the current configmap if it exists.
+	err := pas.Client.Get(ctx, objectKey, current)
+	if apierrors.IsNotFound(err) {
+		// Ignore cases where the configmap is not found (if it was manually deleted, for instance).
+		return nil
+	} else if err != nil {
+		return errors.WithStack(err)
+	}
 
-func readCurrentShardsFromConfig(configMap corev1.ConfigMap) (int, error) {
-	remoteWriteConfig := remotewrite.RemoteWriteConfig{}
-	err := yaml.Unmarshal([]byte(configMap.Data["values"]), &remoteWriteConfig)
+	// Delete the finalizer
+	desired := current.DeepCopy()
+	controllerutil.RemoveFinalizer(desired, monitoring.MonitoringFinalizer)
+	err = pas.Client.Patch(ctx, desired, client.MergeFrom(current))
 	if err != nil {
-		return 0, errors.WithStack(err)
+		return errors.WithStack(err)
 	}
 
-	return remoteWriteConfig.PrometheusAgentConfig.Shards, nil
+	err = pas.Client.Delete(ctx, desired)
+	if err != nil {
+		return errors.WithStack(err)
+	}
+	return nil
 }
 
-// We want to compute the number of shards based on the number of nodes.
-func getShardsCountForCluster(ctx context.Context, cluster *clusterv1.Cluster, currentShardCount int) (int, error) {
-	headSeries, err := querier.QueryTSDBHeadSeries(ctx, cluster.Name)
+func (pas PrometheusAgentService) deleteSecret(ctx context.Context, cluster *clusterv1.Cluster) error {
+	objectKey := client.ObjectKey{
+		Name:      getPrometheusAgentRemoteWriteSecretName(cluster),
+		Namespace: cluster.GetNamespace(),
+	}
+	current := &corev1.Secret{}
+	// Get the current secret if it exists.
+	err := pas.Client.Get(ctx, objectKey, current)
+	if apierrors.IsNotFound(err) {
+		// Ignore cases where the secret is not found (if it was manually deleted, for instance).
+		return nil
+	} else if err != nil {
+		return errors.WithStack(err)
+	}
+
+	// Delete the finalizer
+	desired := current.DeepCopy()
+	controllerutil.RemoveFinalizer(desired, monitoring.MonitoringFinalizer)
+	err = pas.Client.Patch(ctx, desired, client.MergeFrom(current))
 	if err != nil {
-		// If prometheus is not accessible (for instance, not running because this is a new cluster, we check if prometheus is accessible)
-		var dnsError *net.DNSError
-		if errors.As(err, &dnsError) {
-			return shards.ComputeShards(currentShardCount, defaultShardCount), nil
-		}
-		return 0, errors.WithStack(err)
+		return errors.WithStack(err)
 	}
-	return shards.ComputeShards(currentShardCount, headSeries), nil
-}
 
-func getServicePriority(cluster *clusterv1.Cluster) string {
-	if servicePriority, ok := cluster.GetLabels()[servicePriorityLabel]; ok && servicePriority != "" {
-		return servicePriority
+	err = pas.Client.Delete(ctx, desired)
+	if err != nil {
+		return errors.WithStack(err)
 	}
-	return defaultServicePriority
+	return nil
 }
diff --git a/pkg/monitoring/prometheusagent/types.go b/pkg/monitoring/prometheusagent/types.go
new file mode 100644
index 00000000..bb125ef5
--- /dev/null
+++ b/pkg/monitoring/prometheusagent/types.go
@@ -0,0 +1,44 @@
+package prometheusagent
+
+import (
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+)
+
+const (
+	// defaultServicePriority is the default service priority if not set.
+	defaultServicePriority string = "highest"
+	// defaultShards is the default number of shards to use.
+	defaultShards = 1
+
+	// servicePriorityLabel is the label used to determine the priority of a service.
+	servicePriorityLabel string = "giantswarm.io/service-priority"
+
+	remoteWriteEndpointTemplateURL = "https://prometheus.%s/%s/api/v1/write"
+	remoteWriteName                = "prometheus-meta-operator"
+)
+
+type RemoteWriteConfig struct {
+	PrometheusAgentConfig PrometheusAgentConfig `yaml:"prometheus-agent,omitempty" json:"prometheus-agent,omitempty"`
+}
+
+type PrometheusAgentConfig struct {
+	ExternalLabels map[string]string    `yaml:"externalLabels,omitempty" json:"externalLabels,omitempty"`
+	Image          PrometheusAgentImage `yaml:"image,omitempty" json:"image,omitempty"`
+	RemoteWrite    []RemoteWrite        `yaml:"remoteWrite,omitempty" json:"remoteWrite,omitempty"`
+	Shards         int                  `yaml:"shards,omitempty" json:"shards,omitempty"`
+	Version        string               `yaml:"version,omitempty" json:"version,omitempty"`
+}
+
+type PrometheusAgentImage struct {
+	Tag string `yaml:"tag" json:"tag"`
+}
+
+type RemoteWrite struct {
+	Name          string             `yaml:"name" json:"name"`
+	Password      string             `yaml:"password" json:"password"`
+	Username      string             `yaml:"username" json:"username"`
+	URL           string             `yaml:"url" json:"url"`
+	RemoteTimeout string             `yaml:"remoteTimeout" json:"remoteTimeout"`
+	QueueConfig   promv1.QueueConfig `yaml:"queueConfig" json:"queueConfig"`
+	TLSConfig     promv1.TLSConfig   `yaml:"tlsConfig" json:"tlsConfig"`
+}
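
Reviewer note on pkg/common/password/manager.go: GeneratePassword(length) hex-encodes length random bytes, so callers get back a string of 2*length characters; the 32 passed in createOrUpdateSecret yields a 64-character password. A minimal usage sketch — the main wrapper is illustrative only, not part of this change:

```go
package main

import (
	"fmt"

	"github.com/giantswarm/observability-operator/pkg/common/password"
)

func main() {
	// SimpleManager satisfies the password.Manager interface that
	// PrometheusAgentService.PasswordManager expects.
	var m password.Manager = password.SimpleManager{}

	// 32 bytes are read from crypto/rand and hex-encoded, so the
	// resulting password is 64 characters long, not 32.
	p, err := m.GeneratePassword(32)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(p), p)
}
```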
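Reviewer note on secret.go: the remote write password survives reconciliations because createOrUpdateSecret reads it back out of the existing secret (readRemoteWritePasswordFromSecret matches on remoteWriteName) instead of regenerating it. A self-contained sketch of that round-trip, using trimmed-down local copies of the types from types.go and made-up values; note that sigs.k8s.io/yaml marshals through the json tags:

```go
package main

import (
	"fmt"

	"sigs.k8s.io/yaml"
)

// Trimmed-down local copies of RemoteWriteConfig/RemoteWrite from
// pkg/monitoring/prometheusagent/types.go.
type remoteWrite struct {
	Name     string `json:"name"`
	Username string `json:"username"`
	Password string `json:"password"`
	URL      string `json:"url"`
}

type prometheusAgentConfig struct {
	RemoteWrite []remoteWrite `json:"remoteWrite,omitempty"`
}

type remoteWriteConfig struct {
	PrometheusAgentConfig prometheusAgentConfig `json:"prometheus-agent,omitempty"`
}

const remoteWriteName = "prometheus-meta-operator"

func main() {
	// Roughly what buildRemoteWriteSecret stores under the secret's "values" key.
	values, err := yaml.Marshal(remoteWriteConfig{
		PrometheusAgentConfig: prometheusAgentConfig{
			RemoteWrite: []remoteWrite{{
				Name:     remoteWriteName,
				Username: "my-cluster",
				Password: "example-generated-password",
				URL:      "https://prometheus.example.base.domain/my-cluster/api/v1/write",
			}},
		},
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(values))

	// Roughly what readRemoteWritePasswordFromSecret does on the next
	// reconciliation: unmarshal and pick the entry matching remoteWriteName.
	var cfg remoteWriteConfig
	if err := yaml.Unmarshal(values, &cfg); err != nil {
		panic(err)
	}
	for _, rw := range cfg.PrometheusAgentConfig.RemoteWrite {
		if rw.Name == remoteWriteName {
			fmt.Println("recovered password:", rw.Password)
		}
	}
}
```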
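Reviewer note on config.go: getShardsCountForCluster only falls back to defaultShards when the failure is a DNS error, which relies on errors.As unwrapping down to *net.DNSError even through wrapping. A sketch of that mechanism — the hand-built DNS error stands in for what a failed lookup of mimir-gateway.mimir.svc would actually produce:

```go
package main

import (
	"fmt"
	"net"

	"github.com/pkg/errors"
)

func main() {
	// Stand-in for the error QueryTSDBHeadSeries surfaces when the Mimir
	// gateway's service name does not resolve (e.g. Mimir not deployed yet).
	var queryErr error = &net.DNSError{
		Err:        "no such host",
		Name:       "mimir-gateway.mimir.svc",
		IsNotFound: true,
	}

	// Even after wrapping (pkg/errors' WithStack and %w both support
	// unwrapping), errors.As still finds the *net.DNSError.
	wrapped := errors.WithStack(fmt.Errorf("querying mimir: %w", queryErr))

	var dnsError *net.DNSError
	if errors.As(wrapped, &dnsError) {
		fmt.Println("falling back to default shard count, lookup failed for:", dnsError.Name)
	}
}
```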