Skip to content

Commit

Permalink
Take over prometheus agent remote write config from PMO.
Browse files Browse the repository at this point in the history
  • Loading branch information
QuentinBisson committed Apr 8, 2024
1 parent 2f6d694 commit fb97467
Show file tree
Hide file tree
Showing 14 changed files with 410 additions and 137 deletions.
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/giantswarm/observability-operator

go 1.21
go 1.22.0

require (
github.com/go-logr/logr v1.4.1
Expand All @@ -11,7 +11,7 @@ require (
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.72.0
github.com/prometheus/client_golang v1.19.0
github.com/prometheus/common v0.51.1
github.com/sirupsen/logrus v1.9.0
github.com/sirupsen/logrus v1.9.3
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.29.3
k8s.io/apimachinery v0.29.3
Expand Down
2 changes: 2 additions & 0 deletions helm/observability-operator/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ spec:
image: "{{ .Values.image.registry }}/{{ .Values.image.name }}:{{ default .Chart.Version .Values.image.tag }}"
args:
- --leader-elect
- --management-cluster-base-domain={{ $.Values.managementCluster.baseDomain }}
- --management-cluster-customer={{ $.Values.managementCluster.customer }}
- --management-cluster-insecure-ca={{ $.Values.managementCluster.insecureCA }}
- --management-cluster-name={{ $.Values.managementCluster.name }}
- --management-cluster-pipeline={{ $.Values.managementCluster.pipeline }}
- --management-cluster-region={{ $.Values.managementCluster.region }}
Expand Down
7 changes: 7 additions & 0 deletions helm/observability-operator/templates/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ metadata:
{{- include "labels.common" . | nindent 4 }}
name: {{ include "resource.default.name" . }}
rules:
- apiGroups:
- ""
resources:
- namespaces
verbs:
- list
- watch
- apiGroups:
- ""
resources:
Expand Down
9 changes: 9 additions & 0 deletions helm/observability-operator/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,15 @@
"managementCluster": {
"type": "object",
"properties": {
"baseDomain": {
"type": "string"
},
"customer": {
"type": "string"
},
"insecureCA": {
"type": "boolean"
},
"name": {
"type": "string"
},
Expand All @@ -54,6 +60,9 @@
},
"opsgenieApiKey": {
"type": "string"
},
"prometheusVersion": {
"type": "string"
}
}
},
Expand Down
2 changes: 2 additions & 0 deletions helm/observability-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ image:
tag: ""

managementCluster:
baseDomain: domain
customer: customer
insecureCA: false
name: name
pipeline: pipeline
region: region
Expand Down
4 changes: 2 additions & 2 deletions internal/controller/cluster_monitoring_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ func (r *ClusterMonitoringReconciler) reconcile(ctx context.Context, cluster *cl
}

// Create or update PrometheusAgent remote write configuration.
err := r.PrometheusAgentService.ReconcilePrometheusAgentRemoteWriteConfig(ctx, cluster)
err := r.PrometheusAgentService.ReconcileRemoteWriteConfig(ctx, cluster)
if err != nil {
logger.Error(err, "failed to create or update prometheus agent remote write config")
return ctrl.Result{Requeue: true}, errors.WithStack(err)
Expand All @@ -138,7 +138,7 @@ func (r *ClusterMonitoringReconciler) reconcileDelete(ctx context.Context, clust
}
}

err := r.PrometheusAgentService.DeletePrometheusAgentRemoteWriteConfig(ctx, cluster)
err := r.PrometheusAgentService.DeleteRemoteWriteConfig(ctx, cluster)
if err != nil {
logger.Error(err, "failed to delete prometheus agent remote write config")
return ctrl.Result{Requeue: true}, errors.WithStack(err)
Expand Down
42 changes: 26 additions & 16 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/giantswarm/observability-operator/internal/controller"
"github.com/giantswarm/observability-operator/pkg/common"
"github.com/giantswarm/observability-operator/pkg/common/organization"
"github.com/giantswarm/observability-operator/pkg/common/password"
"github.com/giantswarm/observability-operator/pkg/monitoring/heartbeat"
"github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent"
//+kubebuilder:scaffold:imports
Expand All @@ -49,17 +50,19 @@ var (
scheme = runtime.NewScheme()
setupLog = ctrl.Log.WithName("setup")

metricsAddr string
enableLeaderElection bool
probeAddr string
secureMetrics bool
enableHTTP2 bool
managementClusterCustomer string
managementClusterName string
managementClusterPipeline string
managementClusterRegion string
monitoringEnabled bool
prometheusVersion string
metricsAddr string
enableLeaderElection bool
probeAddr string
secureMetrics bool
enableHTTP2 bool
managementClusterBaseDomain string
managementClusterCustomer string
managementClusterInsecureCA bool
managementClusterName string
managementClusterPipeline string
managementClusterRegion string
monitoringEnabled bool
prometheusVersion string
)

const (
Expand All @@ -85,8 +88,12 @@ func main() {
"If set the metrics endpoint is served securely")
flag.BoolVar(&enableHTTP2, "enable-http2", false,
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
flag.StringVar(&managementClusterBaseDomain, "management-cluster-base-domain", "",
"The base domain of the management cluster.")
flag.StringVar(&managementClusterCustomer, "management-cluster-customer", "",
"The customer of the management cluster.")
flag.BoolVar(&managementClusterInsecureCA, "management-cluster-insecure-ca", false,
"Flag to indicate if the management cluster has an insecure CA that should be trusted")
flag.StringVar(&managementClusterName, "management-cluster-name", "",
"The name of the management cluster.")
flag.StringVar(&managementClusterPipeline, "management-cluster-pipeline", "",
Expand Down Expand Up @@ -157,10 +164,12 @@ func main() {
record.InitFromRecorder(mgr.GetEventRecorderFor("observability-operator"))

var managementCluster common.ManagementCluster = common.ManagementCluster{
Customer: managementClusterCustomer,
Name: managementClusterName,
Pipeline: managementClusterPipeline,
Region: managementClusterRegion,
BaseDomain: managementClusterBaseDomain,
Customer: managementClusterCustomer,
InsecureCA: managementClusterInsecureCA,
Name: managementClusterName,
Pipeline: managementClusterPipeline,
Region: managementClusterRegion,
}

var opsgenieApiKey = os.Getenv(OpsgenieApiKey)
Expand All @@ -181,11 +190,12 @@ func main() {
prometheusAgentService := prometheusagent.PrometheusAgentService{
Client: mgr.GetClient(),
OrganizationRepository: organizationRepository,
PasswordManager: password.SimpleManager{},
ManagementCluster: managementCluster,
PrometheusVersion: prometheusVersion,
}

if err = (&controller.ClusterMonitoringReconciler{
if err = (&controller.ClusterMonitoringReconciler{
Client: mgr.GetClient(),
ManagementCluster: managementCluster,
HeartbeatRepository: heartbeatRepository,
Expand Down
21 changes: 21 additions & 0 deletions pkg/common/password/manager.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package password

import (
"crypto/rand"
"encoding/hex"
)

type Manager interface {
GeneratePassword(length int) (string, error)
}

type SimpleManager struct {
}

func (m SimpleManager) GeneratePassword(length int) (string, error) {
bytes := make([]byte, length)
if _, err := rand.Read(bytes); err != nil {
return "", err
}
return hex.EncodeToString(bytes), nil
}
4 changes: 4 additions & 0 deletions pkg/common/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@ const (
)

type ManagementCluster struct {
// BaseDomain is the base domain of the management cluster.
BaseDomain string
// Customer is the customer name of the management cluster.
Customer string
// InsecureCA is a flag to indicate if the management cluster has an insecure CA that should be truster
InsecureCA bool
// Name is the name of the management cluster.
Name string
// Pipeline is the pipeline name of the management cluster.
Expand Down
3 changes: 2 additions & 1 deletion pkg/monitoring/mimir/querier/querier.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ func QueryTSDBHeadSeries(ctx context.Context, clusterName string) (float64, erro
api := v1.NewAPI(c)

queryContext, cancel := context.WithTimeout(ctx, 2*time.Minute)
val, _, err := api.Query(queryContext, fmt.Sprintf("max_over_time(count({cluster_id=\"%s\"})[6h])", clusterName), time.Now())
query := fmt.Sprintf("max_over_time(prometheus_tsdb_head_series{cluster_id=\"%s\"}[6h])", clusterName)
val, _, err := api.Query(queryContext, query, time.Now())
cancel()
if err != nil {
return 0, err
Expand Down
119 changes: 119 additions & 0 deletions pkg/monitoring/prometheusagent/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package prometheusagent

import (
"context"
"fmt"
"net"

"github.com/go-logr/logr"
"github.com/pkg/errors"
"gopkg.in/yaml.v2"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"

"github.com/giantswarm/observability-operator/pkg/common"
"github.com/giantswarm/observability-operator/pkg/monitoring"
"github.com/giantswarm/observability-operator/pkg/monitoring/mimir/querier"
"github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent/shards"
)

func (pas PrometheusAgentService) buildRemoteWriteConfig(ctx context.Context,
cluster *clusterv1.Cluster, logger logr.Logger, currentShards int) (*corev1.ConfigMap, error) {

organization, err := pas.OrganizationRepository.Read(ctx, cluster)
if err != nil {
logger.Error(err, "failed to get cluster organization")
return nil, errors.WithStack(err)
}

provider, err := common.GetClusterProvider(cluster)
if err != nil {
logger.Error(err, "failed to get cluster provider")
return nil, errors.WithStack(err)
}

clusterType := "workload_cluster"
if val, ok := cluster.Labels["cluster.x-k8s.io/cluster-name"]; ok && val == pas.ManagementCluster.Name {
clusterType = "management_cluster"
}

externalLabels := map[string]string{
"cluster_id": cluster.Name,
"cluster_type": clusterType,
"customer": pas.ManagementCluster.Customer,
"installation": pas.ManagementCluster.Name,
"organization": organization,
"pipeline": pas.ManagementCluster.Pipeline,
"provider": provider,
"region": pas.ManagementCluster.Region,
"service_priority": getServicePriority(cluster),
}

shards, err := getShardsCountForCluster(ctx, cluster, currentShards)
if err != nil {
return nil, errors.WithStack(err)
}

config, err := yaml.Marshal(RemoteWriteConfig{
PrometheusAgentConfig: PrometheusAgentConfig{
ExternalLabels: externalLabels,
Image: PrometheusAgentImage{
Tag: pas.PrometheusVersion,
},
Shards: shards,
Version: pas.PrometheusVersion,
},
})
if err != nil {
return nil, errors.WithStack(err)
}

return &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: getPrometheusAgentRemoteWriteConfigName(cluster),
Namespace: cluster.Namespace,
Finalizers: []string{
monitoring.MonitoringFinalizer,
},
},
Data: map[string]string{
"values": string(config),
},
}, nil
}

func getPrometheusAgentRemoteWriteConfigName(cluster *clusterv1.Cluster) string {
return fmt.Sprintf("%s-remote-write-config", cluster.Name)
}

func getServicePriority(cluster *clusterv1.Cluster) string {
if servicePriority, ok := cluster.GetLabels()[servicePriorityLabel]; ok && servicePriority != "" {
return servicePriority
}
return defaultServicePriority
}

// We want to compute the number of shards based on the number of nodes.
func getShardsCountForCluster(ctx context.Context, cluster *clusterv1.Cluster, currentShardCount int) (int, error) {
headSeries, err := querier.QueryTSDBHeadSeries(ctx, cluster.Name)
if err != nil {
// Verify that Prometheus is accessible. If not, return the default number of shards.
var dnsError *net.DNSError
if errors.As(err, &dnsError) {
return shards.ComputeShards(currentShardCount, defaultShards), nil
}
return 0, errors.WithStack(err)
}
return shards.ComputeShards(currentShardCount, headSeries), nil
}

func readCurrentShardsFromConfig(configMap corev1.ConfigMap) (int, error) {
remoteWriteConfig := RemoteWriteConfig{}
err := yaml.Unmarshal([]byte(configMap.Data["values"]), &remoteWriteConfig)
if err != nil {
return 0, errors.WithStack(err)
}

return remoteWriteConfig.PrometheusAgentConfig.Shards, nil
}
Loading

0 comments on commit fb97467

Please sign in to comment.