diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 2dcbeea554..28c53008cf 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -174,6 +174,19 @@ rules: - get - list - watch +- apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - prometheuses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - monitoring.coreos.com resources: @@ -186,6 +199,18 @@ rules: - list - update - watch +- apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - noobaa.io resources: @@ -266,8 +291,11 @@ rules: resources: - clusterserviceversions verbs: + - delete - get - list + - patch + - update - watch - apiGroups: - operators.coreos.com diff --git a/controllers/defaults/resources.go b/controllers/defaults/resources.go index a9031ebe96..1f4226378f 100644 --- a/controllers/defaults/resources.go +++ b/controllers/defaults/resources.go @@ -224,4 +224,37 @@ var ( }, }, } + + MonitoringResources = map[string]corev1.ResourceRequirements{ + "kube-rbac-proxy": { + Requests: corev1.ResourceList{ + "memory": resource.MustParse("30Mi"), + "cpu": resource.MustParse("50m"), + }, + Limits: corev1.ResourceList{ + "memory": resource.MustParse("30Mi"), + "cpu": resource.MustParse("50m"), + }, + }, + "alertmanager": { + Requests: corev1.ResourceList{ + "cpu": resource.MustParse("100m"), + "memory": resource.MustParse("200Mi"), + }, + Limits: corev1.ResourceList{ + "cpu": resource.MustParse("100m"), + "memory": resource.MustParse("200Mi"), + }, + }, + "prometheus": { + Requests: corev1.ResourceList{ + "cpu": resource.MustParse("400m"), + "memory": resource.MustParse("250Mi"), + }, + Limits: corev1.ResourceList{ + "cpu": resource.MustParse("400m"), + "memory": resource.MustParse("250Mi"), + }, + }, + } ) diff --git a/controllers/ocsinitialization/ocsinitialization_controller.go b/controllers/ocsinitialization/ocsinitialization_controller.go index 587ef63b6f..243044a763 100644 --- a/controllers/ocsinitialization/ocsinitialization_controller.go +++ b/controllers/ocsinitialization/ocsinitialization_controller.go @@ -6,12 +6,17 @@ import ( "reflect" "slices" "strconv" + "strings" "github.com/go-logr/logr" secv1client "github.com/openshift/client-go/security/clientset/versioned/typed/security/v1" + opv1a1 "github.com/operator-framework/api/pkg/operators/v1alpha1" + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" ocsv1 "github.com/red-hat-storage/ocs-operator/api/v4/v1" "github.com/red-hat-storage/ocs-operator/v4/controllers/defaults" + "github.com/red-hat-storage/ocs-operator/v4/controllers/platform" "github.com/red-hat-storage/ocs-operator/v4/controllers/util" + "github.com/red-hat-storage/ocs-operator/v4/templates" rookCephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1" "gopkg.in/yaml.v2" corev1 "k8s.io/api/core/v1" @@ -22,10 +27,13 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/klog/v2" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" ) @@ -33,8 +41,10 @@ import ( var operatorNamespace string const ( - wrongNamespacedName = "Ignoring this resource. Only one should exist, and this one has the wrong name and/or namespace." - random30CharacterString = "KP7TThmSTZegSGmHuPKLnSaaAHSG3RSgqw6akBj0oVk" + wrongNamespacedName = "Ignoring this resource. Only one should exist, and this one has the wrong name and/or namespace." + random30CharacterString = "KP7TThmSTZegSGmHuPKLnSaaAHSG3RSgqw6akBj0oVk" + PrometheusOperatorDeploymentName = "prometheus-operator" + PrometheusOperatorCSVNamePrefix = "odf-prometheus-operator" ) // InitNamespacedName returns a NamespacedName for the singleton instance that @@ -61,6 +71,9 @@ type OCSInitializationReconciler struct { // +kubebuilder:rbac:groups=ocs.openshift.io,resources=*,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=get;create;update // +kubebuilder:rbac:groups=security.openshift.io,resourceNames=privileged,resources=securitycontextconstraints,verbs=get;create;update +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources={alertmanagers,prometheuses},verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups="monitoring.coreos.com",resources=servicemonitors,verbs=get;list;watch;update;patch;create;delete +// +kubebuilder:rbac:groups=operators.coreos.com,resources=clusterserviceversions,verbs=get;list;watch;delete;update;patch // Reconcile reads that state of the cluster for a OCSInitialization object and makes changes based on the state read // and what is in the OCSInitialization.Spec @@ -179,6 +192,47 @@ func (r *OCSInitializationReconciler) Reconcile(ctx context.Context, request rec r.Log.Error(err, "Failed to ensure uxbackend service") return reconcile.Result{}, err } + if isROSAHCP, err := platform.IsPlatformROSAHCP(); err != nil { + r.Log.Error(err, "Failed to determine if ROSA HCP cluster") + return reconcile.Result{}, err + } else if isROSAHCP { + r.Log.Info("Setting up monitoring resources for ROSA HCP platform") + err = r.reconcilePrometheusOperatorCSV(instance) + if err != nil { + r.Log.Error(err, "Failed to ensure prometheus operator deployment") + return reconcile.Result{}, err + } + + err = r.reconcilePrometheusKubeRBACConfigMap(instance) + if err != nil { + r.Log.Error(err, "Failed to ensure kubeRBACConfig config map") + return reconcile.Result{}, err + } + + err = r.reconcilePrometheusService(instance) + if err != nil { + r.Log.Error(err, "Failed to ensure prometheus service") + return reconcile.Result{}, err + } + + err = r.reconcilePrometheus(instance) + if err != nil { + r.Log.Error(err, "Failed to ensure prometheus instance") + return reconcile.Result{}, err + } + + err = r.reconcileAlertManager(instance) + if err != nil { + r.Log.Error(err, "Failed to ensure alertmanager instance") + return reconcile.Result{}, err + } + + err = r.reconcileK8sMetricsServiceMonitor(instance) + if err != nil { + r.Log.Error(err, "Failed to ensure k8sMetricsService Monitor") + return reconcile.Result{}, err + } + } reason := ocsv1.ReconcileCompleted message := ocsv1.ReconcileCompletedMessage @@ -193,11 +247,19 @@ func (r *OCSInitializationReconciler) Reconcile(ctx context.Context, request rec // SetupWithManager sets up a controller with a manager func (r *OCSInitializationReconciler) SetupWithManager(mgr ctrl.Manager) error { operatorNamespace = r.OperatorNamespace - + prometheusPredicate := predicate.NewPredicateFuncs( + func(client client.Object) bool { + return strings.HasPrefix(client.GetName(), PrometheusOperatorCSVNamePrefix) + }, + ) return ctrl.NewControllerManagedBy(mgr). For(&ocsv1.OCSInitialization{}). Owns(&corev1.Service{}). Owns(&corev1.Secret{}). + Owns(&promv1.Prometheus{}). + Owns(&corev1.ConfigMap{}). + Owns(&promv1.Alertmanager{}). + Owns(&promv1.ServiceMonitor{}). // Watcher for storagecluster required to update // ocs-operator-config configmap if storagecluster spec changes Watches( @@ -259,6 +321,18 @@ func (r *OCSInitializationReconciler) SetupWithManager(mgr ctrl.Manager) error { }, ), ). + // Watcher for prometheus operator csv + Watches( + &opv1a1.ClusterServiceVersion{}, + handler.EnqueueRequestsFromMapFunc( + func(context context.Context, obj client.Object) []reconcile.Request { + return []reconcile.Request{{ + NamespacedName: InitNamespacedName(), + }} + }, + ), + builder.WithPredicates(prometheusPredicate), + ). Complete(r) } @@ -562,3 +636,169 @@ func (r *OCSInitializationReconciler) reconcileUXBackendService(initialData *ocs return nil } + +func (r *OCSInitializationReconciler) reconcilePrometheusKubeRBACConfigMap(initialData *ocsv1.OCSInitialization) error { + prometheusKubeRBACConfigMap := &corev1.ConfigMap{} + prometheusKubeRBACConfigMap.Name = templates.PrometheusKubeRBACProxyConfigMapName + prometheusKubeRBACConfigMap.Namespace = initialData.Namespace + + _, err := ctrl.CreateOrUpdate(r.ctx, r.Client, prometheusKubeRBACConfigMap, func() error { + if err := ctrl.SetControllerReference(initialData, prometheusKubeRBACConfigMap, r.Scheme); err != nil { + return err + } + prometheusKubeRBACConfigMap.Data = templates.KubeRBACProxyConfigMap.Data + return nil + }) + + if err != nil { + r.Log.Error(err, "Failed to create/update prometheus kube-rbac-proxy config map") + return err + } + r.Log.Info("Prometheus kube-rbac-proxy config map creation succeeded", "Name", prometheusKubeRBACConfigMap.Name) + return nil +} + +func (r *OCSInitializationReconciler) reconcilePrometheusService(initialData *ocsv1.OCSInitialization) error { + prometheusService := &corev1.Service{} + prometheusService.Name = "prometheus" + prometheusService.Namespace = initialData.Namespace + + _, err := ctrl.CreateOrUpdate(r.ctx, r.Client, prometheusService, func() error { + if err := ctrl.SetControllerReference(initialData, prometheusService, r.Scheme); err != nil { + return err + } + util.AddAnnotation( + prometheusService, + "service.beta.openshift.io/serving-cert-secret-name", + "prometheus-serving-cert-secret", + ) + util.AddLabel(prometheusService, "prometheus", "odf-prometheus") + prometheusService.Spec.Selector = map[string]string{ + "app.kubernetes.io/name": prometheusService.Name, + } + prometheusService.Spec.Ports = []corev1.ServicePort{ + { + Name: "https", + Protocol: corev1.ProtocolTCP, + Port: int32(templates.KubeRBACProxyPortNumber), + TargetPort: intstr.FromString("https"), + }, + } + return nil + }) + if err != nil { + r.Log.Error(err, "Failed to create/update prometheus service") + return err + } + r.Log.Info("Service creation succeeded", "Name", prometheusService.Name) + return nil +} + +func (r *OCSInitializationReconciler) reconcilePrometheus(initialData *ocsv1.OCSInitialization) error { + prometheus := &promv1.Prometheus{} + prometheus.Name = "odf-prometheus" + prometheus.Namespace = initialData.Namespace + + _, err := ctrl.CreateOrUpdate(r.ctx, r.Client, prometheus, func() error { + if err := ctrl.SetControllerReference(initialData, prometheus, r.Scheme); err != nil { + return err + } + templates.PrometheusSpecTemplate.DeepCopyInto(&prometheus.Spec) + alertManagerEndpoint := util.Find( + prometheus.Spec.Alerting.Alertmanagers, + func(candidate *promv1.AlertmanagerEndpoints) bool { + return candidate.Name == templates.AlertManagerEndpointName + }, + ) + if alertManagerEndpoint == nil { + return fmt.Errorf("unable to find AlertManagerEndpoint") + } + alertManagerEndpoint.Namespace = initialData.Namespace + return nil + }) + + if err != nil { + r.Log.Error(err, "Failed to create/update prometheus instance") + return err + } + r.Log.Info("Prometheus instance creation succeeded", "Name", prometheus.Name) + + return nil +} + +func (r *OCSInitializationReconciler) reconcileAlertManager(initialData *ocsv1.OCSInitialization) error { + alertManager := &promv1.Alertmanager{} + alertManager.Name = "odf-alertmanager" + alertManager.Namespace = initialData.Namespace + + _, err := ctrl.CreateOrUpdate(r.ctx, r.Client, alertManager, func() error { + if err := ctrl.SetControllerReference(initialData, alertManager, r.Scheme); err != nil { + return err + } + util.AddAnnotation(alertManager, "prometheus", "odf-prometheus") + templates.AlertmanagerSpecTemplate.DeepCopyInto(&alertManager.Spec) + return nil + }) + if err != nil { + r.Log.Error(err, "Failed to create/update alertManager instance") + return err + } + r.Log.Info("AlertManager instance creation succeeded", "Name", alertManager.Name) + return nil +} + +func (r *OCSInitializationReconciler) reconcileK8sMetricsServiceMonitor(initialData *ocsv1.OCSInitialization) error { + k8sMetricsServiceMonitor := &promv1.ServiceMonitor{} + k8sMetricsServiceMonitor.Name = "k8s-metrics-service-monitor" + k8sMetricsServiceMonitor.Namespace = initialData.Namespace + + _, err := ctrl.CreateOrUpdate(r.ctx, r.Client, k8sMetricsServiceMonitor, func() error { + if err := ctrl.SetControllerReference(initialData, k8sMetricsServiceMonitor, r.Scheme); err != nil { + return err + } + util.AddLabel(k8sMetricsServiceMonitor, "app", "odf-prometheus") + templates.K8sMetricsServiceMonitorSpecTemplate.DeepCopyInto(&k8sMetricsServiceMonitor.Spec) + return nil + }) + if err != nil { + r.Log.Error(err, "Failed to create/update K8s Metrics Service Monitor") + return err + } + r.Log.Info("K8s Metrics Service Monitor creation succeeded", "Name", k8sMetricsServiceMonitor.Name) + return nil + +} + +func (r *OCSInitializationReconciler) reconcilePrometheusOperatorCSV(initialData *ocsv1.OCSInitialization) error { + csvList := &opv1a1.ClusterServiceVersionList{} + if err := r.Client.List(r.ctx, csvList, client.InNamespace(initialData.Namespace)); err != nil { + return fmt.Errorf("failed to list csvs in namespace %s,%v", initialData.Namespace, err) + } + csv := util.Find( + csvList.Items, + func(csv *opv1a1.ClusterServiceVersion) bool { + return strings.HasPrefix(csv.Name, PrometheusOperatorCSVNamePrefix) + }, + ) + if csv == nil { + return fmt.Errorf("prometheus csv does not exist in namespace :%s", initialData.Namespace) + } + deploymentSpec := util.Find( + csv.Spec.InstallStrategy.StrategySpec.DeploymentSpecs, + func(deploymentSpec *opv1a1.StrategyDeploymentSpec) bool { + return deploymentSpec.Name == PrometheusOperatorDeploymentName + }, + ) + if deploymentSpec == nil { + return fmt.Errorf("unable to find prometheus operator deployment spec") + } + currentDeploymentSpec := deploymentSpec.DeepCopy() + deploymentSpec.Spec.Replicas = ptr.To(int32(1)) + if !reflect.DeepEqual(currentDeploymentSpec, deploymentSpec) { + if err := r.Client.Update(r.ctx, csv); err != nil { + r.Log.Error(err, "Failed to update Prometheus csv") + return err + } + } + return nil +} diff --git a/controllers/ocsinitialization/ocsinitialization_controller_test.go b/controllers/ocsinitialization/ocsinitialization_controller_test.go index abd17c116a..ca83968f23 100644 --- a/controllers/ocsinitialization/ocsinitialization_controller_test.go +++ b/controllers/ocsinitialization/ocsinitialization_controller_test.go @@ -5,11 +5,14 @@ import ( "fmt" "testing" + configv1 "github.com/openshift/api/config/v1" secv1 "github.com/openshift/api/security/v1" fakeSecClient "github.com/openshift/client-go/security/clientset/versioned/typed/security/v1/fake" conditionsv1 "github.com/openshift/custom-resource-status/conditions/v1" + opv1a1 "github.com/operator-framework/api/pkg/operators/v1alpha1" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" v1 "github.com/red-hat-storage/ocs-operator/api/v4/v1" + "github.com/red-hat-storage/ocs-operator/v4/controllers/platform" statusutil "github.com/red-hat-storage/ocs-operator/v4/controllers/util" "github.com/stretchr/testify/assert" appsv1 "k8s.io/api/apps/v1" @@ -106,6 +109,10 @@ func createFakeScheme(t *testing.T) *runtime.Scheme { assert.Fail(t, "failed to add securityv1 scheme") } + err = opv1a1.AddToScheme(scheme) + if err != nil { + assert.Fail(t, "failed to add v1alpha1 scheme") + } return scheme } @@ -191,6 +198,7 @@ func TestCreateWatchedResource(t *testing.T) { testcases := []struct { label string alreadyCreated bool + platform configv1.PlatformType }{ { label: "Case 1", // ocsInit resource not created already before reconcile @@ -203,6 +211,7 @@ func TestCreateWatchedResource(t *testing.T) { } for _, tc := range testcases { + platform.SetFakePlatformInstanceForTesting(true, tc.platform) ctx := context.TODO() ocs, request, reconciler := getTestParams(false, t) if !tc.alreadyCreated { @@ -218,6 +227,7 @@ func TestCreateWatchedResource(t *testing.T) { _ = reconciler.Client.Get(ctx, request.NamespacedName, &obj) assert.Equalf(t, obj.Name, request.Name, "[%s]: failed to create ocsInit resource with correct name", tc.label) assert.Equalf(t, obj.Namespace, request.Namespace, "[%s]: failed to create ocsInit resource with correct namespace", tc.label) + platform.UnsetFakePlatformInstanceForTesting() } } diff --git a/controllers/platform/platform_detection.go b/controllers/platform/platform_detection.go index 79dc1ee823..5a0e7d443d 100644 --- a/controllers/platform/platform_detection.go +++ b/controllers/platform/platform_detection.go @@ -35,6 +35,9 @@ var ( type platform struct { isOpenShift bool platform configv1.PlatformType + // isROSAHCP flag is temporary and needs to be rethought + // open issue: https://github.com/red-hat-storage/ocs-operator/issues/2521 + isROSAHCP bool } // SetFakePlatformInstanceForTesting can be used to fake a Platform while testing. @@ -43,6 +46,7 @@ func SetFakePlatformInstanceForTesting(isOpenShift bool, platformType configv1.P platformInstance = &platform{ isOpenShift: isOpenShift, platform: platformType, + isROSAHCP: false, } } @@ -88,10 +92,20 @@ func Detect() { } } } - if platformInstance.isOpenShift { - if infrastructure, err := configv1client(cfg).Infrastructures().Get(context.TODO(), "cluster", metav1.GetOptions{}); err != nil { - platformInstance.platform = infrastructure.Status.PlatformStatus.Type + var infrastructure *configv1.Infrastructure + if infrastructure, err = configv1client(cfg).Infrastructures().Get(context.TODO(), "cluster", metav1.GetOptions{}); err != nil { + log.Fatal(err) + } + platformInstance.platform = infrastructure.Status.PlatformStatus.Type + if platformInstance.platform == configv1.AWSPlatformType { + if infrastructure.Status.ControlPlaneTopology == configv1.ExternalTopologyMode { + for _, resourceTags := range infrastructure.Status.PlatformStatus.AWS.ResourceTags { + if resourceTags.Key == "red-hat-clustertype" && resourceTags.Value == "rosa" { + platformInstance.isROSAHCP = true + } + } + } } } }) @@ -160,3 +174,10 @@ func SkipObjectStore(p configv1.PlatformType) bool { } return false } + +func IsPlatformROSAHCP() (bool, error) { + if platformInstance == nil { + return false, ErrorPlatformNotDetected + } + return platformInstance.isROSAHCP, nil +} diff --git a/controllers/util/util.go b/controllers/util/util.go index 278ecb8a7d..8e12772056 100644 --- a/controllers/util/util.go +++ b/controllers/util/util.go @@ -2,6 +2,7 @@ package util import ( ocsv1 "github.com/red-hat-storage/ocs-operator/api/v4/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func RemoveDuplicatesFromStringSlice(slice []string) []string { @@ -54,3 +55,29 @@ func Find[T any](list []T, f func(item *T) bool) *T { } return nil } + +func AddAnnotation(obj metav1.Object, key string, value string) bool { + annotations := obj.GetAnnotations() + if annotations == nil { + annotations = map[string]string{} + obj.SetAnnotations(annotations) + } + if oldValue, exist := annotations[key]; !exist || oldValue != value { + annotations[key] = value + return true + } + return false +} + +func AddLabel(obj metav1.Object, key string, value string) bool { + labels := obj.GetLabels() + if labels == nil { + labels = map[string]string{} + obj.SetLabels(labels) + } + if oldValue, exist := labels[key]; !exist || oldValue != value { + labels[key] = value + return true + } + return false +} diff --git a/deploy/csv-templates/ocs-operator.csv.yaml.in b/deploy/csv-templates/ocs-operator.csv.yaml.in index a2d6855897..6c86bcedad 100644 --- a/deploy/csv-templates/ocs-operator.csv.yaml.in +++ b/deploy/csv-templates/ocs-operator.csv.yaml.in @@ -328,6 +328,19 @@ spec: - get - list - watch + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - prometheuses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - monitoring.coreos.com resources: @@ -340,6 +353,18 @@ spec: - list - update - watch + - apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - noobaa.io resources: @@ -420,8 +445,11 @@ spec: resources: - clusterserviceversions verbs: + - delete - get - list + - patch + - update - watch - apiGroups: - operators.coreos.com diff --git a/deploy/ocs-operator/manifests/k8s-metrics-servicemonitor-role-binding.yaml b/deploy/ocs-operator/manifests/k8s-metrics-servicemonitor-role-binding.yaml new file mode 100644 index 0000000000..de28ba69f9 --- /dev/null +++ b/deploy/ocs-operator/manifests/k8s-metrics-servicemonitor-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: k8s-metrics-sm-prometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: k8s-metrics-sm-prometheus-k8s +subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: odf-storage diff --git a/deploy/ocs-operator/manifests/k8s-metrics-servicemonitor-role.yaml b/deploy/ocs-operator/manifests/k8s-metrics-servicemonitor-role.yaml new file mode 100644 index 0000000000..050f85a3e1 --- /dev/null +++ b/deploy/ocs-operator/manifests/k8s-metrics-servicemonitor-role.yaml @@ -0,0 +1,51 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: k8s-metrics-sm-prometheus-k8s +rules: + - verbs: + - get + apiGroups: + - '' + resources: + - nodes/metrics + - verbs: + - get + nonResourceURLs: + - /metrics + - verbs: + - create + apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + - verbs: + - create + apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + - verbs: + - get + apiGroups: + - '' + resources: + - namespaces + - verbs: + - use + apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + resourceNames: + - nonroot + - verbs: + - list + - watch + - get + apiGroups: + - '' + resources: + - pods + - endpoints + - services diff --git a/deploy/ocs-operator/manifests/ocs-operator.clusterserviceversion.yaml b/deploy/ocs-operator/manifests/ocs-operator.clusterserviceversion.yaml index 6214055f4c..68ee699aed 100644 --- a/deploy/ocs-operator/manifests/ocs-operator.clusterserviceversion.yaml +++ b/deploy/ocs-operator/manifests/ocs-operator.clusterserviceversion.yaml @@ -349,6 +349,19 @@ spec: - get - list - watch + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - prometheuses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - monitoring.coreos.com resources: @@ -361,6 +374,18 @@ spec: - list - update - watch + - apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - noobaa.io resources: @@ -441,8 +466,11 @@ spec: resources: - clusterserviceversions verbs: + - delete - get - list + - patch + - update - watch - apiGroups: - operators.coreos.com diff --git a/deploy/ocs-operator/manifests/odf-prometheus-role-binding.yaml b/deploy/ocs-operator/manifests/odf-prometheus-role-binding.yaml new file mode 100644 index 0000000000..ab0ce55096 --- /dev/null +++ b/deploy/ocs-operator/manifests/odf-prometheus-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: odf-prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: odf-prometheus +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: odf-storage diff --git a/deploy/ocs-operator/manifests/odf-prometheus-role.yaml b/deploy/ocs-operator/manifests/odf-prometheus-role.yaml new file mode 100644 index 0000000000..58f679e113 --- /dev/null +++ b/deploy/ocs-operator/manifests/odf-prometheus-role.yaml @@ -0,0 +1,24 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: odf-prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] diff --git a/rbac/k8s-metrics-servicemonitor-role-binding.yaml b/rbac/k8s-metrics-servicemonitor-role-binding.yaml new file mode 100644 index 0000000000..de28ba69f9 --- /dev/null +++ b/rbac/k8s-metrics-servicemonitor-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: k8s-metrics-sm-prometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: k8s-metrics-sm-prometheus-k8s +subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: odf-storage diff --git a/rbac/k8s-metrics-servicemonitor-role.yaml b/rbac/k8s-metrics-servicemonitor-role.yaml new file mode 100644 index 0000000000..050f85a3e1 --- /dev/null +++ b/rbac/k8s-metrics-servicemonitor-role.yaml @@ -0,0 +1,51 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: k8s-metrics-sm-prometheus-k8s +rules: + - verbs: + - get + apiGroups: + - '' + resources: + - nodes/metrics + - verbs: + - get + nonResourceURLs: + - /metrics + - verbs: + - create + apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + - verbs: + - create + apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + - verbs: + - get + apiGroups: + - '' + resources: + - namespaces + - verbs: + - use + apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + resourceNames: + - nonroot + - verbs: + - list + - watch + - get + apiGroups: + - '' + resources: + - pods + - endpoints + - services diff --git a/rbac/odf-prometheus-role-binding.yaml b/rbac/odf-prometheus-role-binding.yaml new file mode 100644 index 0000000000..ab0ce55096 --- /dev/null +++ b/rbac/odf-prometheus-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: odf-prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: odf-prometheus +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: odf-storage diff --git a/rbac/odf-prometheus-role.yaml b/rbac/odf-prometheus-role.yaml new file mode 100644 index 0000000000..58f679e113 --- /dev/null +++ b/rbac/odf-prometheus-role.yaml @@ -0,0 +1,24 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: odf-prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] diff --git a/templates/alertmanager.go b/templates/alertmanager.go new file mode 100644 index 0000000000..d5ba8f8b70 --- /dev/null +++ b/templates/alertmanager.go @@ -0,0 +1,12 @@ +package templates + +import ( + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/red-hat-storage/ocs-operator/v4/controllers/defaults" + "k8s.io/utils/ptr" +) + +var AlertmanagerSpecTemplate = promv1.AlertmanagerSpec{ + Replicas: ptr.To(int32(1)), + Resources: defaults.MonitoringResources["alertmanager"], +} diff --git a/templates/k8smetricsservicemonitor.go b/templates/k8smetricsservicemonitor.go new file mode 100644 index 0000000000..8614ffd646 --- /dev/null +++ b/templates/k8smetricsservicemonitor.go @@ -0,0 +1,69 @@ +package templates + +import ( + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var params = map[string][]string{ + "match[]": { + "{__name__='kube_node_status_condition'}", + "{__name__='kube_persistentvolume_info'}", + "{__name__='kube_storageclass_info'}", + "{__name__='kube_persistentvolumeclaim_info'}", + "{__name__='kube_deployment_spec_replicas'}", + "{__name__='kube_pod_status_phase'}", + "{__name__='kubelet_volume_stats_capacity_bytes'}", + "{__name__='kubelet_volume_stats_used_bytes'}", + "{__name__='node_disk_read_time_seconds_total'}", + "{__name__='node_disk_write_time_seconds_total'}", + "{__name__='node_disk_reads_completed_total'}", + "{__name__='node_disk_writes_completed_total'}", + }, +} + +var K8sMetricsServiceMonitorSpecTemplate = promv1.ServiceMonitorSpec{ + + Endpoints: []promv1.Endpoint{ + { + Port: "web", + Path: "/federate", + Scheme: "https", + ScrapeTimeout: "1m", + Interval: "2m", + HonorLabels: true, + MetricRelabelConfigs: []*promv1.RelabelConfig{ + { + Action: "labeldrop", + Regex: "prometheus_replica", + }, + }, + RelabelConfigs: []*promv1.RelabelConfig{ + { + Action: "replace", + Regex: "prometheus-k8s-.*", + Replacement: "", + SourceLabels: []promv1.LabelName{ + "pod", + }, + TargetLabel: "pod", + }, + }, + TLSConfig: &promv1.TLSConfig{ + SafeTLSConfig: promv1.SafeTLSConfig{ + InsecureSkipVerify: true, + }, + }, + Params: params, + BearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token", + }, + }, + NamespaceSelector: promv1.NamespaceSelector{ + MatchNames: []string{"openshift-monitoring"}, + }, + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app.kubernetes.io/component": "prometheus", + }, + }, +} diff --git a/templates/prometheus.go b/templates/prometheus.go new file mode 100644 index 0000000000..d732e382bd --- /dev/null +++ b/templates/prometheus.go @@ -0,0 +1,96 @@ +package templates + +import ( + "fmt" + + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/red-hat-storage/ocs-operator/v4/controllers/defaults" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" +) + +var serviceMonitorSelector = metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "odf-prometheus", + }, +} + +var ruleSelector = metav1.LabelSelector{ + MatchLabels: map[string]string{ + "prometheus": "rook-prometheus", + }, +} + +var ( + KubeRBACProxyPortNumber = 9339 + PrometheusKubeRBACProxyConfigMapName = "prometheus-kube-rbac-proxy-config" + AlertManagerEndpointName = "alertmanager-operated" +) + +var PrometheusSpecTemplate = promv1.PrometheusSpec{ + CommonPrometheusFields: promv1.CommonPrometheusFields{ + ServiceAccountName: "prometheus-k8s", + ServiceMonitorSelector: &serviceMonitorSelector, + ListenLocal: true, + Resources: defaults.MonitoringResources["prometheus"], + Containers: []corev1.Container{{ + Name: "kube-rbac-proxy", + // Tech-debt to include kube-rbac-proxy image from environment + Image: "gcr.io/kubebuilder/kube-rbac-proxy:v0.13.0", + Args: []string{ + fmt.Sprintf("--secure-listen-address=0.0.0.0:%d", KubeRBACProxyPortNumber), + "--upstream=http://127.0.0.1:9090/", + "--logtostderr=true", + "--v=10", + "--tls-cert-file=/etc/tls-secret/tls.crt", + "--tls-private-key-file=/etc/tls-secret/tls.key", + "--client-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt", + "--config-file=/etc/kube-rbac-config/config-file.json", + }, + Ports: []corev1.ContainerPort{{ + Name: "https", + ContainerPort: int32(KubeRBACProxyPortNumber), + }}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "serving-cert", + MountPath: "/etc/tls-secret", + }, + { + Name: "kube-rbac-config", + MountPath: "/etc/kube-rbac-config", + }, + }, + Resources: defaults.MonitoringResources["kube-rbac-proxy"], + }}, + Volumes: []corev1.Volume{ + { + Name: "serving-cert", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: "prometheus-serving-cert-secret", + }, + }, + }, + { + Name: "kube-rbac-config", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: PrometheusKubeRBACProxyConfigMapName, + }, + }, + }, + }, + }, + }, + RuleSelector: &ruleSelector, + EnableAdminAPI: false, + Alerting: &promv1.AlertingSpec{ + Alertmanagers: []promv1.AlertmanagerEndpoints{{ + Name: AlertManagerEndpointName, + Port: intstr.FromString("web"), + }}, + }, +} diff --git a/templates/prometheuskuberbacproxyconfig.go b/templates/prometheuskuberbacproxyconfig.go new file mode 100644 index 0000000000..c1c4775981 --- /dev/null +++ b/templates/prometheuskuberbacproxyconfig.go @@ -0,0 +1,36 @@ +package templates + +import ( + "encoding/json" + + corev1 "k8s.io/api/core/v1" +) + +var KubeRBACProxyConfigMap = corev1.ConfigMap{ + Data: map[string]string{ + "config-file.json": (func() string { + config := struct { + Authorization struct { + Static [2]struct { + Path string `json:"path"` + ResourceRequest bool `json:"resourceRequest"` + Verb string `json:"verb"` + } `json:"static"` + } `json:"authorization"` + }{} + + item := &config.Authorization.Static[0] + item.Verb = "get" + item.Path = "/metrics" + item.ResourceRequest = false + + item = &config.Authorization.Static[1] + item.Verb = "get" + item.Path = "/federate" + item.ResourceRequest = false + + raw, _ := json.Marshal(config) + return string(raw) + })(), + }, +}