From bf391a62c6ced1840768b4d92ecbcd3a63a8a816 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Tue, 2 Apr 2024 23:28:45 +0200 Subject: [PATCH] add-mimir-heartbeat --- .gitignore | 2 +- CHANGELOG.md | 4 + go.mod | 9 +- go.sum | 26 ++- .../templates/deployment.yaml | 10 +- .../templates/rbac.yaml | 11 ++ .../templates/secret.yaml | 10 + .../observability-operator/values.schema.json | 6 + helm/observability-operator/values.yaml | 2 + internal/controller/cluster_controller.go | 61 ------ .../cluster_monitoring_controller.go | 147 ++++++++++++++ ... => cluster_monitoring_controller_test.go} | 0 main.go | 65 +++++-- pkg/common/types.go | 8 + pkg/monitoring/finalizers.go | 4 + pkg/monitoring/heartbeat/doc.go | 2 + pkg/monitoring/heartbeat/opsgenie.go | 184 ++++++++++++++++++ pkg/monitoring/heartbeat/types.go | 17 ++ 18 files changed, 485 insertions(+), 83 deletions(-) create mode 100644 helm/observability-operator/templates/secret.yaml delete mode 100644 internal/controller/cluster_controller.go create mode 100644 internal/controller/cluster_monitoring_controller.go rename internal/controller/{cluster_controller_test.go => cluster_monitoring_controller_test.go} (100%) create mode 100644 pkg/common/types.go create mode 100644 pkg/monitoring/finalizers.go create mode 100644 pkg/monitoring/heartbeat/doc.go create mode 100644 pkg/monitoring/heartbeat/opsgenie.go create mode 100644 pkg/monitoring/heartbeat/types.go diff --git a/.gitignore b/.gitignore index 84e99dde..cbf27225 100644 --- a/.gitignore +++ b/.gitignore @@ -25,4 +25,4 @@ Dockerfile.cross *.swo *~ -observability-operator* +/observability-operator* diff --git a/CHANGELOG.md b/CHANGELOG.md index d96e0049..d69bd4fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,4 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Initialize project and create heartbeat for the installation. 
+ [Unreleased]: https://github.com/giantswarm/observability-operator/tree/master diff --git a/go.mod b/go.mod index 74cad4d5..a79cc519 100644 --- a/go.mod +++ b/go.mod @@ -5,15 +5,19 @@ go 1.21 require ( github.com/onsi/ginkgo/v2 v2.17.1 github.com/onsi/gomega v1.32.0 + github.com/opsgenie/opsgenie-go-sdk-v2 v1.2.22 + github.com/pkg/errors v0.9.1 + github.com/sirupsen/logrus v1.9.0 k8s.io/apimachinery v0.29.3 k8s.io/client-go v0.29.3 + sigs.k8s.io/cluster-api v1.6.3 sigs.k8s.io/controller-runtime v0.17.2 ) require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.12.0 // indirect github.com/evanphx/json-patch/v5 v5.9.0 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect @@ -31,6 +35,8 @@ require ( github.com/google/gofuzz v1.2.0 // indirect github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect github.com/google/uuid v1.6.0 // indirect + github.com/hashicorp/go-cleanhttp v0.5.0 // indirect + github.com/hashicorp/go-retryablehttp v0.5.1 // indirect github.com/imdario/mergo v0.3.16 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect @@ -38,7 +44,6 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_golang v1.19.0 // indirect github.com/prometheus/client_model v0.6.0 // indirect github.com/prometheus/common v0.51.1 // indirect diff --git a/go.sum b/go.sum index 4d56a4b1..56fe11da 100644 --- a/go.sum +++ b/go.sum @@ -6,12 +6,13 @@ github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWR github.com/chzyer/readline 
v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk= github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84= -github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= +github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= @@ -49,6 +50,10 @@ github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJY github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/go-cleanhttp v0.5.0 
h1:wvCrVc9TjDls6+YGAF2hAifE1E5U1+b4tH6KdvN3Gig= +github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= +github.com/hashicorp/go-retryablehttp v0.5.1 h1:Vsx5XKPqPs3M6sM4U4GWyUqFS8aBiL9U5gkgvpkg4SE= +github.com/hashicorp/go-retryablehttp v0.5.1/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= @@ -58,6 +63,7 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -75,6 +81,9 @@ github.com/onsi/ginkgo/v2 v2.17.1 h1:V++EzdbhI4ZV4ev0UTIj0PzhzOcReJFyJaLjtSF55M8 github.com/onsi/ginkgo/v2 v2.17.1/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs= github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk= github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg= +github.com/opsgenie/opsgenie-go-sdk-v2 v1.2.22 h1:0h+YoXSyipf6XQGyIaDg6z5jwRik1JSm+sQetnD7vGY= +github.com/opsgenie/opsgenie-go-sdk-v2 v1.2.22/go.mod h1:4OjcxgwdXzezqytxN534MooNmrxRD50geWZxTD7845s= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -89,11 +98,17 @@ github.com/prometheus/procfs v0.13.0 h1:GqzLlQyfsPbaEHaQkO7tbDlriv/4o5Hudv6OXHGK github.com/prometheus/procfs v0.13.0/go.mod h1:cd4PFCR54QLnGKPaKGA6l+cfuNXtht43ZKY6tow0Y1g= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= +github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= +github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -118,6 +133,7 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net 
v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.22.0 h1:9sGLhx7iRIHEiX0oAJ3MRZMUCElJgy7Br1nO+AMN3Tc= @@ -130,11 +146,13 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys 
v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= @@ -196,6 +214,8 @@ k8s.io/kube-openapi v0.0.0-20240322212309-b815d8309940 h1:qVoMaQV5t62UUvHe16Q3eb k8s.io/kube-openapi v0.0.0-20240322212309-b815d8309940/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= k8s.io/utils v0.0.0-20240310230437-4693a0247e57 h1:gbqbevonBh57eILzModw6mrkbwM0gQBEuevE/AaBsHY= k8s.io/utils v0.0.0-20240310230437-4693a0247e57/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/cluster-api v1.6.3 h1:VOlPNg92PQLlhBVLc5pg+cbAuPvGOOBujeFLk9zgnoo= +sigs.k8s.io/cluster-api v1.6.3/go.mod h1:4FzfgPPiYaFq8X9F9j2SvmggH/4OOLEDgVJuWDqKLig= sigs.k8s.io/controller-runtime v0.17.2 h1:FwHwD1CTUemg0pW2otk7/U5/i5m2ymzvOXdbeGOUvw0= sigs.k8s.io/controller-runtime v0.17.2/go.mod h1:+MngTvIQQQhfXtwfdGw/UOQ/aIaqsYywfCINOtwMO/s= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= diff --git a/helm/observability-operator/templates/deployment.yaml b/helm/observability-operator/templates/deployment.yaml index a5d99091..81d313cb 100644 --- a/helm/observability-operator/templates/deployment.yaml +++ b/helm/observability-operator/templates/deployment.yaml @@ -25,7 +25,14 @@ spec: args: - --leader-elect - --management-cluster-name={{ $.Values.managementCluster.name }} + - --management-cluster-pipeline={{ $.Values.managementCluster.pipeline }} - --monitoring-enabled={{ $.Values.monitoring.enabled }} + env: + - name: OPSGENIE_API_KEY + valueFrom: + secretKeyRef: + name: {{ include "resource.default.name" . }}-credentials + key: opsgenieApiKey livenessProbe: httpGet: path: /healthz @@ -41,9 +48,6 @@ spec: name: http protocol: TCP resources: {{ toYaml .Values.operator.resources | nindent 10 }} - volumeMounts: - - name: {{ include "name" . }}-configmap - mountPath: /var/run/{{ include "name" . }}/configmap/ serviceAccountName: {{ include "resource.default.name" . 
}} securityContext: {{- with .Values.operator.podSecurityContext }} diff --git a/helm/observability-operator/templates/rbac.yaml b/helm/observability-operator/templates/rbac.yaml index fb07b550..20bce7f6 100644 --- a/helm/observability-operator/templates/rbac.yaml +++ b/helm/observability-operator/templates/rbac.yaml @@ -28,6 +28,17 @@ rules: - list - update - patch + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: ["*"] + - apiGroups: + - "" + resources: + - events + verbs: + - create --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/helm/observability-operator/templates/secret.yaml b/helm/observability-operator/templates/secret.yaml new file mode 100644 index 00000000..94b59c5b --- /dev/null +++ b/helm/observability-operator/templates/secret.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Secret +metadata: + labels: + {{- include "labels.common" . | nindent 4 }} + name: {{ include "resource.default.name" . }}-credentials + namespace: {{ include "resource.default.namespace" . 
}} +data: + opsgenieApiKey: {{ .Values.monitoring.opsgenieApiKey | b64enc | quote }} +type: Opaque diff --git a/helm/observability-operator/values.schema.json b/helm/observability-operator/values.schema.json index 76aed072..3c4503cd 100644 --- a/helm/observability-operator/values.schema.json +++ b/helm/observability-operator/values.schema.json @@ -34,6 +34,9 @@ "properties": { "name": { "type": "string" + }, + "pipeline": { + "type": "string" } } }, @@ -42,6 +45,9 @@ "properties": { "enabled": { "type": "boolean" + }, + "opsgenieApiKey": { + "type": "string" } } }, diff --git a/helm/observability-operator/values.yaml b/helm/observability-operator/values.yaml index ffbcbe89..00a4e1db 100644 --- a/helm/observability-operator/values.yaml +++ b/helm/observability-operator/values.yaml @@ -9,9 +9,11 @@ image: managementCluster: name: unknown + pipeline: unknown monitoring: enabled: false + opsgenieApiKey: "" operator: # -- Configures the resources for the operator deployment diff --git a/internal/controller/cluster_controller.go b/internal/controller/cluster_controller.go deleted file mode 100644 index 0b73baff..00000000 --- a/internal/controller/cluster_controller.go +++ /dev/null @@ -1,61 +0,0 @@ -/* -Copyright 2024. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package controller - -import ( - "context" - - "k8s.io/apimachinery/pkg/runtime" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" -) - -// ClusterReconciler reconciles a Cluster object -type ClusterReconciler struct { - client.Client - Scheme *runtime.Scheme -} - -//+kubebuilder:rbac:groups=objectstorage.giantswarm.io,resources=Clusters,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=objectstorage.giantswarm.io,resources=Clusters/status,verbs=get;update;patch -//+kubebuilder:rbac:groups=objectstorage.giantswarm.io,resources=Clusters/finalizers,verbs=update - -// Reconcile is part of the main kubernetes reconciliation loop which aims to -// move the current state of the cluster closer to the desired state. -// TODO(user): Modify the Reconcile function to compare the state specified by -// the Cluster object against the actual cluster state, and then -// perform operations to make the cluster state reflect the state specified by -// the user. -// -// For more details, check Reconcile and its Result here: -// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.17.0/pkg/reconcile -func (r *ClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - _ = log.FromContext(ctx) - - // TODO(user): your logic here - - return ctrl.Result{}, nil -} - -// SetupWithManager sets up the controller with the Manager. -func (r *ClusterReconciler) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). - // TODO(user): Uncomment the following line adding a pointer to an instance of the controlled resource as an argument - // For(). 
- Complete(r) -} diff --git a/internal/controller/cluster_monitoring_controller.go b/internal/controller/cluster_monitoring_controller.go new file mode 100644 index 00000000..b1766029 --- /dev/null +++ b/internal/controller/cluster_monitoring_controller.go @@ -0,0 +1,147 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + + "github.com/pkg/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/giantswarm/observability-operator/pkg/common" + "github.com/giantswarm/observability-operator/pkg/monitoring" + "github.com/giantswarm/observability-operator/pkg/monitoring/heartbeat" +) + +// ClusterMonitoringReconciler reconciles a Cluster object +type ClusterMonitoringReconciler struct { + // Client is the controller client. + client.Client + common.ManagementCluster + // HeartbeatRepository is the repository for managing heartbeats. + heartbeat.HeartbeatRepository + // MonitoringEnabled defines whether monitoring is enabled at the installation level. + MonitoringEnabled bool +} + +// SetupWithManager sets up the controller with the Manager. 
+func (r *ClusterMonitoringReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&clusterv1.Cluster{}).
+		Complete(r)
+}
+
+//+kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters,verbs=get;list;watch;update;patch
+//+kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters/status,verbs=get
+//+kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters/finalizers,verbs=update
+
+// Reconcile fetches the Cluster named in the request and dispatches it: a
+// cluster that is being deleted, or any cluster while monitoring is disabled,
+// goes to reconcileDelete; every other cluster goes to reconcile.
+//
+// See https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.17.2/pkg/reconcile
+func (r *ClusterMonitoringReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	// Fetch the Cluster instance.
+	cluster := &clusterv1.Cluster{}
+	if err := r.Client.Get(ctx, req.NamespacedName, cluster); err != nil {
+		if apierrors.IsNotFound(err) {
+			// The Cluster no longer exists, so there is nothing left to reconcile.
+			// Cleanup that must run before removal is handled through the finalizer.
+			return ctrl.Result{}, nil
+		}
+
+		// Error reading the object - requeue the request.
+		return ctrl.Result{}, errors.WithStack(err)
+	}
+
+	// Deleted clusters, and every cluster while monitoring is disabled, take the cleanup path.
+	if !cluster.ObjectMeta.DeletionTimestamp.IsZero() || !r.MonitoringEnabled {
+		logger.Info("Handling deletion for Cluster", "cluster", cluster.Name)
+		return r.reconcileDelete(ctx, cluster)
+	}
+
+	logger.Info("Reconciling Cluster", "cluster", cluster.Name)
+	// Handle normal reconciliation loop.
+	return r.reconcile(ctx, cluster)
+}
+
+// reconcile adds the monitoring finalizer and, for the management cluster, creates or updates the heartbeat.
+func (r *ClusterMonitoringReconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster) (ctrl.Result, error) {
+	logger := log.FromContext(ctx).WithValues("cluster", cluster.Name)
+
+	// Add finalizer first if not set to avoid the race condition between init and delete.
+	// Note: Finalizers in general can only be added when the deletionTimestamp is not set.
+	if !controllerutil.ContainsFinalizer(cluster, monitoring.MonitoringFinalizer) {
+		logger.Info("adding finalizer", "finalizer", monitoring.MonitoringFinalizer)
+		controllerutil.AddFinalizer(cluster, monitoring.MonitoringFinalizer)
+		if err := r.Client.Update(ctx, cluster); err != nil {
+			logger.Error(err, "failed to add finalizer", "finalizer", monitoring.MonitoringFinalizer)
+			return ctrl.Result{}, errors.WithStack(err)
+		}
+		logger.Info("added finalizer", "finalizer", monitoring.MonitoringFinalizer)
+		return ctrl.Result{}, nil
+	}
+
+	// Only the management cluster carries the installation heartbeat.
+	if cluster.Name == r.ManagementCluster.Name {
+		if err := r.HeartbeatRepository.CreateOrUpdate(ctx); err != nil {
+			logger.Error(err, "failed to create or update heartbeat")
+			// Returning the error already requeues; a Result next to it is ignored.
+			return ctrl.Result{}, errors.WithStack(err)
+		}
+	}
+
+	return ctrl.Result{}, nil
+}
+
+// reconcileDelete deletes the heartbeat when the cluster is the management
+// cluster and then removes the monitoring finalizer. Reconcile also routes every
+// cluster here while monitoring is disabled.
+func (r *ClusterMonitoringReconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Cluster) (reconcile.Result, error) {
+	logger := log.FromContext(ctx).WithValues("cluster", cluster.Name)
+	if controllerutil.ContainsFinalizer(cluster, monitoring.MonitoringFinalizer) {
+		if cluster.Name == r.ManagementCluster.Name {
+			if err := r.HeartbeatRepository.Delete(ctx); err != nil {
+				logger.Error(err, "failed to delete heartbeat")
+				// The finalizer stays in place so the deletion is retried.
+				return ctrl.Result{}, errors.WithStack(err)
+			}
+		}
+
+		// Finalizer handling needs to come last: once it is removed the Cluster
+		// can go away and no further cleanup would run for it.
+		logger.Info("removing finalizer", "finalizer", monitoring.MonitoringFinalizer)
+		controllerutil.RemoveFinalizer(cluster, monitoring.MonitoringFinalizer)
+		if err := r.Client.Update(ctx, cluster); err != nil {
+			// We need to requeue if we fail to remove the finalizer because of race conditions between multiple operators.
+			// This will be eventually consistent.
+			logger.Error(err, "failed to remove finalizer, requeuing", "finalizer", monitoring.MonitoringFinalizer)
+			return ctrl.Result{Requeue: true}, nil
+		}
+		logger.Info("removed finalizer", "finalizer", monitoring.MonitoringFinalizer)
+	}
+
+	return ctrl.Result{}, nil
+}
diff --git a/internal/controller/cluster_controller_test.go b/internal/controller/cluster_monitoring_controller_test.go
similarity index 100%
rename from internal/controller/cluster_controller_test.go
rename to internal/controller/cluster_monitoring_controller_test.go
diff --git a/main.go b/main.go
index 3460315b..240ef326 100644
--- a/main.go
+++ b/main.go
@@ -19,6 +19,7 @@ package main
 import (
 	"crypto/tls"
 	"flag"
+	"fmt"
 	"os"
 
 	// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -28,6 +29,8 @@ import ( "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" @@ -35,30 +38,41 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook" "github.com/giantswarm/observability-operator/internal/controller" + "github.com/giantswarm/observability-operator/pkg/common" + "github.com/giantswarm/observability-operator/pkg/monitoring/heartbeat" //+kubebuilder:scaffold:imports ) var ( scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") + + metricsAddr string + enableLeaderElection bool + probeAddr string + secureMetrics bool + enableHTTP2 bool + managementClusterName string + managementClusterPipeline string + monitoringEnabled bool +) + +const ( + OpsgenieApiKey = "OPSGENIE_API_KEY" // #nosec G101 ) func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(clusterv1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } func main() { - var metricsAddr string - var enableLeaderElection bool - var probeAddr string - var secureMetrics bool - var enableHTTP2 bool - var managementClusterName string - var monitoringEnabled bool - flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") - flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", + "The address the metric endpoint binds to.") + flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", + "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. 
"+ "Enabling this will ensure there is only one active controller manager.") @@ -66,7 +80,10 @@ func main() { "If set the metrics endpoint is served securely") flag.BoolVar(&enableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers") - flag.StringVar(&managementClusterName, "management-cluster-name", "", "The name of the management cluster.") + flag.StringVar(&managementClusterName, "management-cluster-name", "", + "The name of the management cluster.") + flag.StringVar(&managementClusterPipeline, "management-cluster-pipeline", "", + "The pipeline of the management cluster.") flag.BoolVar(&monitoringEnabled, "monitoring-enabled", false, "Enable monitoring at the management cluster level.") opts := zap.Options{ @@ -125,9 +142,31 @@ func main() { os.Exit(1) } - if err = (&controller.ClusterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + // Initialize event recorder. + record.InitFromRecorder(mgr.GetEventRecorderFor("observability-operator")) + + var managementCluster common.ManagementCluster = common.ManagementCluster{ + Name: managementClusterName, + Pipeline: managementClusterPipeline, + } + + var opsgenieApiKey = os.Getenv(OpsgenieApiKey) + if opsgenieApiKey == "" { + setupLog.Error(nil, fmt.Sprintf("environment variable %s not set", OpsgenieApiKey)) + os.Exit(1) + } + + heartbeatRepository, err := heartbeat.NewOpsgenieHeartbeatRepository(opsgenieApiKey, managementCluster) + if err != nil { + setupLog.Error(err, "unable to create heartbeat repository") + os.Exit(1) + } + + if err = (&controller.ClusterMonitoringReconciler{ + Client: mgr.GetClient(), + ManagementCluster: managementCluster, + HeartbeatRepository: heartbeatRepository, + MonitoringEnabled: monitoringEnabled, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Cluster") os.Exit(1) diff --git a/pkg/common/types.go b/pkg/common/types.go new file mode 100644 index 00000000..117dc719 --- 
/dev/null +++ b/pkg/common/types.go @@ -0,0 +1,8 @@ +package common + +type ManagementCluster struct { + // Name is the name of the management cluster. + Name string + // Pipeline is the pipeline name of the management cluster. + Pipeline string +} diff --git a/pkg/monitoring/finalizers.go b/pkg/monitoring/finalizers.go new file mode 100644 index 00000000..d03d5298 --- /dev/null +++ b/pkg/monitoring/finalizers.go @@ -0,0 +1,4 @@ +package monitoring + +// MonitoringFinalizer is the finalizer for monitoring resources. +const MonitoringFinalizer = "monitoring.giantswarm.io" diff --git a/pkg/monitoring/heartbeat/doc.go b/pkg/monitoring/heartbeat/doc.go new file mode 100644 index 00000000..ed4b1714 --- /dev/null +++ b/pkg/monitoring/heartbeat/doc.go @@ -0,0 +1,2 @@ +// Package heartbeat provides functionality for managing heartbeats in Opsgenie. +package heartbeat diff --git a/pkg/monitoring/heartbeat/opsgenie.go b/pkg/monitoring/heartbeat/opsgenie.go new file mode 100644 index 00000000..70856646 --- /dev/null +++ b/pkg/monitoring/heartbeat/opsgenie.go @@ -0,0 +1,184 @@ +package heartbeat + +import ( + "context" + "fmt" + "net/http" + "reflect" + "sort" + + "github.com/opsgenie/opsgenie-go-sdk-v2/client" + "github.com/opsgenie/opsgenie-go-sdk-v2/heartbeat" + "github.com/opsgenie/opsgenie-go-sdk-v2/og" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "sigs.k8s.io/controller-runtime/pkg/log" + + "github.com/giantswarm/observability-operator/pkg/common" +) + +// OpsgenieHeartbeatRepository is a repository for managing heartbeats in Opsgenie. +type OpsgenieHeartbeatRepository struct { + *heartbeat.Client + common.ManagementCluster +} + +// NewOpsgenieHeartbeatRepository creates a new OpsgenieHeartbeatRepository. 
+func NewOpsgenieHeartbeatRepository(apiKey string, mc common.ManagementCluster) (HeartbeatRepository, error) {
+	hbClient, err := heartbeat.NewClient(&client.Config{
+		ApiKey:         apiKey,
+		OpsGenieAPIURL: client.API_URL,
+		RetryCount:     1,
+		LogLevel:       logrus.FatalLevel,
+	})
+	if err != nil {
+		return nil, errors.WithStack(err)
+	}
+	return &OpsgenieHeartbeatRepository{Client: hbClient, ManagementCluster: mc}, nil
+}
+
+// makeHeartbeat creates a new heartbeat for the management cluster.
+func (r *OpsgenieHeartbeatRepository) makeHeartbeat() *heartbeat.Heartbeat {
+	tags := []string{
+		"team: atlas",
+		fmt.Sprintf("installation: %s", r.ManagementCluster.Name),
+		"managed-by: observability-operator",
+		fmt.Sprintf("pipeline: %s", r.ManagementCluster.Pipeline),
+	}
+	// Tags need to be sorted alphabetically to avoid unnecessary heartbeat updates
+	sort.Strings(tags)
+
+	return &heartbeat.Heartbeat{
+		Name:         r.ManagementCluster.Name,
+		Description:  "📗 Runbook: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/heartbeat-expired/",
+		Interval:     60,
+		IntervalUnit: string(heartbeat.Minutes),
+		Enabled:      true,
+		Expired:      false,
+		OwnerTeam: og.OwnerTeam{
+			Name: "alerts_router_team",
+		},
+		AlertTags:     tags,
+		AlertPriority: "P1",
+		AlertMessage:  fmt.Sprintf("Heartbeat [%s] is expired.", r.ManagementCluster.Name),
+	}
+}
+
+// CreateOrUpdate makes the Opsgenie heartbeat match the one built by makeHeartbeat.
+// A heartbeat that exists but differs is deleted and recreated rather than patched.
+func (r *OpsgenieHeartbeatRepository) CreateOrUpdate(ctx context.Context) error {
+	logger := log.FromContext(ctx)
+
+	hb := r.makeHeartbeat()
+
+	// By default, we consider the heartbeat exists
+	var heartbeatExists = true
+	logger.Info("checking if heartbeat is already configured")
+	getResult, err := r.Client.Get(ctx, hb.Name)
+	if err != nil {
+		apiErr, ok := err.(*client.ApiError)
+		// If the error is not a 404, we return it
+		if !ok || apiErr.StatusCode != http.StatusNotFound {
+			return errors.WithStack(err)
+		}
+		// If the heartbeat does not exist, we set the heartbeatExists to false
+		heartbeatExists = false
+	}
+
+	if heartbeatExists {
+		// If the heartbeat does not need to be updated, we leave early
+		if !hasChanged(getResult.Heartbeat, *hb) {
+			logger.Info("heartbeat is up to date")
+			return nil
+		}
+
+		// We need to delete and recreate it because the update is a PATCH (so existing alert tags are kept)
+		// This caused issue when installation pipeline was switched from testing to stable.
+		logger.Info("heartbeat has changed and needs to be reconfigured")
+
+		logger.Info("deleting heartbeat")
+		if _, err := r.Client.Delete(ctx, hb.Name); err != nil {
+			return errors.WithStack(err)
+		}
+
+		logger.Info("deleted heartbeat")
+	}
+
+	logger.Info("creating heartbeat")
+	if err := r.createHeartbeat(ctx, hb); err != nil {
+		return errors.WithStack(err)
+	}
+	logger.Info("created heartbeat")
+
+	return nil
+}
+
+// Delete pings the heartbeat one last time and removes it; a heartbeat that is already absent is a no-op.
+func (r *OpsgenieHeartbeatRepository) Delete(ctx context.Context) error {
+	logger := log.FromContext(ctx)
+
+	logger.Info("checking if heartbeat exists")
+	if _, err := r.Client.Get(ctx, r.ManagementCluster.Name); err != nil {
+		apiErr, ok := err.(*client.ApiError)
+		if !ok || apiErr.StatusCode != http.StatusNotFound {
+			return errors.WithStack(err)
+		}
+		// Return here: there is no heartbeat left to ping or delete.
+		logger.Info("heartbeat does not exist, skipping")
+		return nil
+	}
+
+	// The final ping to the heartbeat cleans up any opened heartbeat alerts for the cluster being deleted.
+	logger.Info("triggering final heartbeat ping")
+	if _, err := r.Client.Ping(ctx, r.ManagementCluster.Name); err != nil {
+		return errors.WithStack(err)
+	}
+	logger.Info("triggered final heartbeat ping")
+
+	logger.Info("deleting heartbeat")
+	if _, err := r.Client.Delete(ctx, r.ManagementCluster.Name); err != nil {
+		return errors.WithStack(err)
+	}
+	logger.Info("deleted heartbeat")
+	return nil
+}
+
+// createHeartbeat creates a new heartbeat in Opsgenie.
+func (r *OpsgenieHeartbeatRepository) createHeartbeat(ctx context.Context, h *heartbeat.Heartbeat) error {
+	// Translate the desired heartbeat into the request expected by the Opsgenie API.
+	addReq := &heartbeat.AddRequest{
+		Name:          h.Name,
+		Description:   h.Description,
+		Interval:      h.Interval,
+		IntervalUnit:  heartbeat.Unit(h.IntervalUnit),
+		Enabled:       &h.Enabled,
+		OwnerTeam:     h.OwnerTeam,
+		AlertMessage:  h.AlertMessage,
+		AlertTag:      h.AlertTags,
+		AlertPriority: h.AlertPriority,
+	}
+	if _, err := r.Client.Add(ctx, addReq); err != nil {
+		return errors.WithStack(err)
+	}
+
+	// We ping the heartbeat to activate it and make sure it pages.
+	if _, err := r.Client.Ping(ctx, h.Name); err != nil {
+		return errors.WithStack(err)
+	}
+
+	return nil
+}
+
+// hasChanged reports whether the current heartbeat differs from the desired one.
+// Both arguments are passed by value, so the adjustments below never reach the caller.
+func hasChanged(current, desired heartbeat.Heartbeat) bool {
+	// Enabled and Expired are excluded from the comparison by forcing the same value on both sides.
+	current.Enabled, desired.Enabled = true, true
+	current.Expired, desired.Expired = true, true
+	// The owner team ID is only known once Opsgenie returns it, so it is copied onto the desired state.
+	desired.OwnerTeam.Id = current.OwnerTeam.Id
+
+	identical := reflect.DeepEqual(current, desired)
+	return !identical
+}
diff --git a/pkg/monitoring/heartbeat/types.go b/pkg/monitoring/heartbeat/types.go
new file mode 100644
index 00000000..486cb49a
--- /dev/null
+++ b/pkg/monitoring/heartbeat/types.go
@@ -0,0 +1,17 @@
+package heartbeat
+
+import (
+	"context"
+)
+
+// HeartbeatRepository is the interface for the heartbeat repository.
+// It provides methods to create or update and delete a heartbeat.
+// The heartbeat is used by the monitoring system to detect if the management cluster is alive.
+// The current implementation relies on OpsGenie but other implementations can be added in the future.
+type HeartbeatRepository interface {
+	// CreateOrUpdate creates or updates the heartbeat for the management cluster.
+	CreateOrUpdate(ctx context.Context) error
+
+	// Delete deletes the heartbeat for the management cluster.
+	Delete(ctx context.Context) error
+}