Skip to content

Commit

Permalink
Take over prometheus agent remote write config from PMO.
Browse files Browse the repository at this point in the history
Signed-off-by: QuentinBisson <[email protected]>
  • Loading branch information
QuentinBisson committed Apr 9, 2024
1 parent 945e6dd commit 05595cc
Show file tree
Hide file tree
Showing 16 changed files with 467 additions and 203 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/giantswarm/observability-operator

go 1.21
go 1.22.0

require (
github.com/go-logr/logr v1.4.1
Expand Down
4 changes: 3 additions & 1 deletion helm/observability-operator/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@ spec:
image: "{{ .Values.image.registry }}/{{ .Values.image.name }}:{{ default .Chart.Version .Values.image.tag }}"
args:
- --leader-elect
- --management-cluster-base-domain={{ $.Values.managementCluster.baseDomain }}
- --management-cluster-customer={{ $.Values.managementCluster.customer }}
- --management-cluster-insecure-ca={{ $.Values.managementCluster.insecureCA }}
- --management-cluster-name={{ $.Values.managementCluster.name }}
- --management-cluster-pipeline={{ $.Values.managementCluster.pipeline }}
- --management-cluster-region={{ $.Values.managementCluster.region }}
- --monitoring-enabled={{ $.Values.monitoring.enabled }}
{{- if .Values.monitoring.prometheusVersion }}
- --prometheus-version={{ $.Values.monitoring.prometheusVersion | quote }}
- --prometheus-version={{ $.Values.monitoring.prometheusVersion }}
{{- end }}
env:
- name: OPSGENIE_API_KEY
Expand Down
14 changes: 7 additions & 7 deletions helm/observability-operator/templates/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ rules:
- apiGroups:
- ""
resources:
- secrets
- configmaps
- namespaces
verbs:
- create
- update
- delete
- deletecollection
- get
- list
- watch
- apiGroups:
- ""
resources:
- secrets
- configmaps
verbs: ["*"]
- apiGroups:
- cluster.x-k8s.io
resources:
Expand Down
9 changes: 9 additions & 0 deletions helm/observability-operator/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,15 @@
"managementCluster": {
"type": "object",
"properties": {
"baseDomain": {
"type": "string"
},
"customer": {
"type": "string"
},
"insecureCA": {
"type": "boolean"
},
"name": {
"type": "string"
},
Expand All @@ -54,6 +60,9 @@
},
"opsgenieApiKey": {
"type": "string"
},
"prometheusVersion": {
"type": "string"
}
}
},
Expand Down
2 changes: 2 additions & 0 deletions helm/observability-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ image:
tag: ""

managementCluster:
baseDomain: domain
customer: customer
insecureCA: false
name: name
pipeline: pipeline
region: region
Expand Down
15 changes: 8 additions & 7 deletions internal/controller/cluster_monitoring_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package controller

import (
"context"
"time"

"github.com/pkg/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -112,15 +113,15 @@ func (r *ClusterMonitoringReconciler) reconcile(ctx context.Context, cluster *cl
err := r.HeartbeatRepository.CreateOrUpdate(ctx)
if err != nil {
logger.Error(err, "failed to create or update heartbeat")
return ctrl.Result{Requeue: true}, errors.WithStack(err)
return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err)
}
}

// Create or update PrometheusAgent remote write configuration.
err := r.PrometheusAgentService.ReconcilePrometheusAgentRemoteWriteConfig(ctx, cluster)
err := r.PrometheusAgentService.ReconcileRemoteWriteConfig(ctx, cluster)
if err != nil {
logger.Error(err, "failed to create or update prometheus agent remote write config")
return ctrl.Result{Requeue: true}, errors.WithStack(err)
return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err)
}

return ctrl.Result{}, nil
Expand All @@ -134,14 +135,14 @@ func (r *ClusterMonitoringReconciler) reconcileDelete(ctx context.Context, clust
err := r.HeartbeatRepository.Delete(ctx)
if err != nil {
logger.Error(err, "failed to delete heartbeat")
return ctrl.Result{Requeue: true}, errors.WithStack(err)
return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err)
}
}

err := r.PrometheusAgentService.DeletePrometheusAgentRemoteWriteConfig(ctx, cluster)
err := r.PrometheusAgentService.DeleteRemoteWriteConfig(ctx, cluster)
if err != nil {
logger.Error(err, "failed to delete prometheus agent remote write config")
return ctrl.Result{Requeue: true}, errors.WithStack(err)
return ctrl.Result{RequeueAfter: 5 * time.Minute}, errors.WithStack(err)
}

// We get the latest state of the object to avoid race conditions.
Expand All @@ -153,7 +154,7 @@ func (r *ClusterMonitoringReconciler) reconcileDelete(ctx context.Context, clust
// We need to requeue if we fail to remove the finalizer because of race conditions between multiple operators.
// This will be eventually consistent.
logger.Error(err, "failed to remove finalizer, requeuing", "finalizer", monitoring.MonitoringFinalizer)
return ctrl.Result{Requeue: true}, nil
return ctrl.Result{RequeueAfter: 5 * time.Minute}, nil
}
logger.Info("removed finalizer", "finalizer", monitoring.MonitoringFinalizer)
}
Expand Down
42 changes: 26 additions & 16 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/giantswarm/observability-operator/internal/controller"
"github.com/giantswarm/observability-operator/pkg/common"
"github.com/giantswarm/observability-operator/pkg/common/organization"
"github.com/giantswarm/observability-operator/pkg/common/password"
"github.com/giantswarm/observability-operator/pkg/monitoring/heartbeat"
"github.com/giantswarm/observability-operator/pkg/monitoring/prometheusagent"
//+kubebuilder:scaffold:imports
Expand All @@ -49,17 +50,19 @@ var (
scheme = runtime.NewScheme()
setupLog = ctrl.Log.WithName("setup")

metricsAddr string
enableLeaderElection bool
probeAddr string
secureMetrics bool
enableHTTP2 bool
managementClusterCustomer string
managementClusterName string
managementClusterPipeline string
managementClusterRegion string
monitoringEnabled bool
prometheusVersion string
metricsAddr string
enableLeaderElection bool
probeAddr string
secureMetrics bool
enableHTTP2 bool
managementClusterBaseDomain string
managementClusterCustomer string
managementClusterInsecureCA bool
managementClusterName string
managementClusterPipeline string
managementClusterRegion string
monitoringEnabled bool
prometheusVersion string
)

const (
Expand All @@ -85,8 +88,12 @@ func main() {
"If set the metrics endpoint is served securely")
flag.BoolVar(&enableHTTP2, "enable-http2", false,
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
flag.StringVar(&managementClusterBaseDomain, "management-cluster-base-domain", "",
"The base domain of the management cluster.")
flag.StringVar(&managementClusterCustomer, "management-cluster-customer", "",
"The customer of the management cluster.")
flag.BoolVar(&managementClusterInsecureCA, "management-cluster-insecure-ca", false,
"Flag to indicate if the management cluster has an insecure CA that should be trusted")
flag.StringVar(&managementClusterName, "management-cluster-name", "",
"The name of the management cluster.")
flag.StringVar(&managementClusterPipeline, "management-cluster-pipeline", "",
Expand Down Expand Up @@ -157,10 +164,12 @@ func main() {
record.InitFromRecorder(mgr.GetEventRecorderFor("observability-operator"))

var managementCluster common.ManagementCluster = common.ManagementCluster{
Customer: managementClusterCustomer,
Name: managementClusterName,
Pipeline: managementClusterPipeline,
Region: managementClusterRegion,
BaseDomain: managementClusterBaseDomain,
Customer: managementClusterCustomer,
InsecureCA: managementClusterInsecureCA,
Name: managementClusterName,
Pipeline: managementClusterPipeline,
Region: managementClusterRegion,
}

var opsgenieApiKey = os.Getenv(OpsgenieApiKey)
Expand All @@ -181,11 +190,12 @@ func main() {
prometheusAgentService := prometheusagent.PrometheusAgentService{
Client: mgr.GetClient(),
OrganizationRepository: organizationRepository,
PasswordManager: password.SimpleManager{},
ManagementCluster: managementCluster,
PrometheusVersion: prometheusVersion,
}

if err = (&controller.ClusterMonitoringReconciler{
if err = (&controller.ClusterMonitoringReconciler{
Client: mgr.GetClient(),
ManagementCluster: managementCluster,
HeartbeatRepository: heartbeatRepository,
Expand Down
21 changes: 21 additions & 0 deletions pkg/common/password/manager.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package password

import (
"crypto/rand"
"encoding/hex"
)

type Manager interface {
GeneratePassword(length int) (string, error)
}

type SimpleManager struct {
}

func (m SimpleManager) GeneratePassword(length int) (string, error) {
bytes := make([]byte, length)
if _, err := rand.Read(bytes); err != nil {
return "", err
}
return hex.EncodeToString(bytes), nil
}
4 changes: 4 additions & 0 deletions pkg/common/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@ const (
)

type ManagementCluster struct {
// BaseDomain is the base domain of the management cluster.
BaseDomain string
// Customer is the customer name of the management cluster.
Customer string
// InsecureCA is a flag to indicate if the management cluster has an insecure CA that should be truster
InsecureCA bool
// Name is the name of the management cluster.
Name string
// Pipeline is the pipeline name of the management cluster.
Expand Down
2 changes: 1 addition & 1 deletion pkg/monitoring/finalizers.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package monitoring

// MonitoringFinalizer is the finalizer for monitoring resources.
const MonitoringFinalizer = "monitoring.giantswarm.io"
const MonitoringFinalizer = "observability.giantswarm.io/monitoring"
40 changes: 13 additions & 27 deletions pkg/monitoring/mimir/querier/querier.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,17 @@ import (
"context"
"errors"
"fmt"
"net/http"
"time"

"github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
)

// headerAdder is an http.RoundTripper that adds additional headers to the request
type headerAdder struct {
headers map[string][]string

rt http.RoundTripper
}

func (h *headerAdder) RoundTrip(req *http.Request) (*http.Response, error) {
for k, vv := range h.headers {
for _, v := range vv {
req.Header.Add(k, v)
}
}
return h.rt.RoundTrip(req)
}

// QueryTSDBHeadSeries performs an instant query against Mimir.
func QueryTSDBHeadSeries(ctx context.Context, clusterName string) (float64, error) {
headerAdder := &headerAdder{
headers: map[string][]string{
"X-Org-Id": {"anonynous"},
},
rt: http.DefaultTransport,
}
config := api.Config{
Address: "http://mimir-gateway.mimir.svc/prometheus",
RoundTripper: headerAdder,
Address: "http://mimir-gateway.mimir.svc/prometheus",
}

// Create new client.
Expand All @@ -51,15 +27,25 @@ func QueryTSDBHeadSeries(ctx context.Context, clusterName string) (float64, erro
api := v1.NewAPI(c)

queryContext, cancel := context.WithTimeout(ctx, 2*time.Minute)
val, _, err := api.Query(queryContext, fmt.Sprintf("max_over_time(count({cluster_id=\"%s\"})[6h])", clusterName), time.Now())
query := fmt.Sprintf("sum(max_over_time(prometheus_agent_active_series{cluster_id=\"%s\"}[6h]))", clusterName)
val, _, err := api.Query(queryContext, query, time.Now())
cancel()
if err != nil {
return 0, err
}

switch val.Type() {
case model.ValVector:
vector := val.(model.Vector)
vector, ok := val.(model.Vector)
if !ok {
return 0, errors.New("failed to convert value to vector")
}
if len(vector) == 0 {
return 0, errors.New("no time series found")
}
if len(vector) > 1 {
return 0, errors.New("more than one time series found")
}
return float64(vector[0].Value), nil
default:
return 0, errors.New("failed to get current number of time series")
Expand Down
Loading

0 comments on commit 05595cc

Please sign in to comment.