diff --git a/internal/operands/metrics/resources.go b/internal/operands/metrics/resources.go index 4b22d62cb..6fb8c1555 100644 --- a/internal/operands/metrics/resources.go +++ b/internal/operands/metrics/resources.go @@ -2,15 +2,14 @@ package metrics import ( "errors" - "fmt" "os" "strings" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" rbac "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/utils/ptr" + + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) const ( @@ -18,12 +17,6 @@ const ( MonitorNamespace = "openshift-monitoring" defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s" runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE" - severityAlertLabelKey = "severity" - healthImpactAlertLabelKey = "operator_health_impact" - partOfAlertLabelKey = "kubernetes_operator_part_of" - partOfAlertLabelValue = "kubevirt" - componentAlertLabelKey = "kubernetes_operator_component" - componentAlertLabelValue = "ssp-operator" PrometheusLabelKey = "prometheus.ssp.kubevirt.io" PrometheusLabelValue = "true" PrometheusClusterRoleName = "prometheus-k8s-ssp" @@ -31,167 +24,6 @@ const ( MetricsPortName = "metrics" ) -const ( - CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))" - TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))" -) - -// RecordRulesDesc represent SSP Operator Prometheus Record Rules -type RecordRulesDesc struct { - Name string - Expr intstr.IntOrString - Description string - Type string -} - -// RecordRulesDescList lists all SSP Operator Prometheus Record Rules -var RecordRulesDescList = []RecordRulesDesc{ - { - Name: "kubevirt_ssp_operator_up", - Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"), - Description: "The total number of running ssp-operator pods", 
- Type: "Gauge", - }, - { - Name: "kubevirt_ssp_template_validator_up", - Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"), - Description: "The total number of running virt-template-validator pods", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated", - Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"), - Description: "The total number of ssp-operator pods reconciling with no errors", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_template_validator_rejected_increase", - Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"), - Description: "The increase in the number of rejected template validators, over the last hour", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_common_templates_restored_increase", - Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"), - Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour", - Type: "Gauge", - }, -} - -func getAlertRules() ([]promv1.Rule, error) { - runbookURLTemplate, err := getRunbookURLTemplate() - if err != nil { - return nil, err - } - - return []promv1.Rule{ - { - Expr: intstr.FromString("sum(kubevirt_vmi_phase_count{phase=\"running\"}) by (node,os,workload,flavor,instance_type,preference)"), - Record: "cnv:vmi_status_running:count", - }, - { - Alert: "SSPDown", - Expr: intstr.FromString("kubevirt_ssp_operator_up == 0"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "All SSP operator pods are down.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPDown"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPTemplateValidatorDown", - Expr: 
intstr.FromString("kubevirt_ssp_template_validator_up == 0"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "All Template Validator pods are down.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPTemplateValidatorDown"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPFailingToReconcile", - Expr: intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "The ssp-operator pod is up but failing to reconcile", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPFailingToReconcile"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPHighRateRejectedVms", - Expr: intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "High rate of rejected Vms", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPHighRateRejectedVms"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "warning", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPCommonTemplatesModificationReverted", - Expr: intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"), - For: ptr.To[promv1.Duration]("0m"), - Annotations: map[string]string{ - "summary": "Common Templates manual modifications were reverted by the operator", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPCommonTemplatesModificationReverted"), 
- }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "none", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "VirtualMachineCRCErrors", - Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"), - Annotations: map[string]string{ - "description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages", - "summary": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "VirtualMachineCRCErrors"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "none", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - }, nil -} - -func getRecordRules() []promv1.Rule { - var recordRules []promv1.Rule - - for _, rrd := range RecordRulesDescList { - recordRules = append(recordRules, promv1.Rule{Record: rrd.Name, Expr: rrd.Expr}) - } - - return recordRules -} - func newMonitoringClusterRole() *rbac.ClusterRole { return &rbac.ClusterRole{ ObjectMeta: metav1.ObjectMeta{ @@ -266,7 +98,7 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor { } func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) { - alertRules, err := getAlertRules() + runbookURLTemplate, err := getRunbookURLTemplate() if err != nil { return nil, err } @@ -286,7 +118,7 @@ func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) { Groups: []promv1.RuleGroup{ { Name: "cnv.rules", - Rules: append(alertRules, getRecordRules()...), + Rules: append(rules.RecordRules(), rules.AlertRules(runbookURLTemplate)...), }, }, }, diff --git a/pkg/monitoring/rules/rules.go 
b/pkg/monitoring/rules/rules.go
new file mode 100644
index 000000000..8622931b7
--- /dev/null
+++ b/pkg/monitoring/rules/rules.go
@@ -0,0 +1,171 @@
+// Package rules defines the Prometheus recording and alerting rules of the
+// SSP operator.
+package rules
+
+import (
+	"fmt"
+
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/utils/ptr"
+)
+
+// Label keys, and the fixed label values, set on every alert built by
+// AlertRules.
+const (
+	severityAlertLabelKey     = "severity"
+	healthImpactAlertLabelKey = "operator_health_impact"
+	partOfAlertLabelKey       = "kubernetes_operator_part_of"
+	partOfAlertLabelValue     = "kubevirt"
+	componentAlertLabelKey    = "kubernetes_operator_component"
+	componentAlertLabelValue  = "ssp-operator"
+)
+
+// PromQL expressions behind the "*_increase" recording rules in
+// RecordRulesDescList.
+const (
+	CommonTemplatesRestoredIncreaseQuery   = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))"
+	TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))"
+)
+
+// RecordRulesDesc describes a single SSP Operator Prometheus recording rule.
+type RecordRulesDesc struct {
+	Name        string
+	Expr        intstr.IntOrString
+	Description string
+	Type        string
+}
+
+// RecordRulesDescList lists all SSP Operator Prometheus recording rules.
+var RecordRulesDescList = []RecordRulesDesc{
+	{
+		Name:        "kubevirt_ssp_operator_up",
+		Expr:        intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"),
+		Description: "The total number of running ssp-operator pods",
+		Type:        "Gauge",
+	},
+	{
+		Name:        "kubevirt_ssp_template_validator_up",
+		Expr:        intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"),
+		Description: "The total number of running virt-template-validator pods",
+		Type:        "Gauge",
+	},
+	{
+		Name:        "kubevirt_ssp_operator_reconcile_succeeded_aggregated",
+		Expr:        intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"),
+		Description: "The total number of ssp-operator pods reconciling with no errors",
+		Type:        "Gauge",
+	},
+	{
+		Name:        "kubevirt_ssp_template_validator_rejected_increase",
+		Expr:        intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"),
+		Description: "The increase in the number of rejected template validators, over the last hour",
+		Type:        "Gauge",
+	},
+	{
+		Name:        "kubevirt_ssp_common_templates_restored_increase",
+		Expr:        intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"),
+		Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour",
+		Type:        "Gauge",
+	},
+}
+
+// RecordRules converts RecordRulesDescList into Prometheus recording rules,
+// keeping the order of the list. Only Name and Expr are carried over;
+// Description and Type are not part of the returned rules.
+func RecordRules() []promv1.Rule {
+	// The final length is known, so allocate the slice once.
+	recordRules := make([]promv1.Rule, 0, len(RecordRulesDescList))
+	for _, rrd := range RecordRulesDescList {
+		recordRules = append(recordRules, promv1.Rule{Record: rrd.Name, Expr: rrd.Expr})
+	}
+
+	return recordRules
+}
+
+// alertLabels returns a new label set for an alert with the given severity and
+// operator health impact, plus the part-of and component labels shared by all
+// SSP alerts.
+func alertLabels(severity, healthImpact string) map[string]string {
+	return map[string]string{
+		severityAlertLabelKey:     severity,
+		healthImpactAlertLabelKey: healthImpact,
+		partOfAlertLabelKey:       partOfAlertLabelValue,
+		componentAlertLabelKey:    componentAlertLabelValue,
+	}
+}
+
+// AlertRules returns the SSP operator alerting rules, preceded by the
+// cnv:vmi_status_running:count recording rule.
+//
+// runbookURLTemplate is used as a fmt format string: it is expected to contain
+// a single %s verb, which is replaced with the alert name to build each
+// alert's runbook_url annotation. The template is not validated here.
+func AlertRules(runbookURLTemplate string) []promv1.Rule {
+	return []promv1.Rule{
+		{
+			Expr:   intstr.FromString("sum(kubevirt_vmi_phase_count{phase=\"running\"}) by (node,os,workload,flavor,instance_type,preference)"),
+			Record: "cnv:vmi_status_running:count",
+		},
+		{
+			Alert: "SSPDown",
+			Expr:  intstr.FromString("kubevirt_ssp_operator_up == 0"),
+			For:   ptr.To[promv1.Duration]("5m"),
+			Annotations: map[string]string{
+				"summary":     "All SSP operator pods are down.",
+				"runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPDown"),
+			},
+			Labels: alertLabels("critical", "critical"),
+		},
+		{
+			Alert: "SSPTemplateValidatorDown",
+			Expr:  intstr.FromString("kubevirt_ssp_template_validator_up == 0"),
+			For:   ptr.To[promv1.Duration]("5m"),
+			Annotations: map[string]string{
+				"summary":     "All Template Validator pods are down.",
+				"runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPTemplateValidatorDown"),
+			},
+			Labels: alertLabels("critical", "critical"),
+		},
+		{
+			Alert: "SSPFailingToReconcile",
+			Expr:  intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"),
+			For:   ptr.To[promv1.Duration]("5m"),
+			Annotations: map[string]string{
+				"summary":     "The ssp-operator pod is up but failing to reconcile",
+				"runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPFailingToReconcile"),
+			},
+			Labels: alertLabels("critical", "critical"),
+		},
+		{
+			Alert: "SSPHighRateRejectedVms",
+			Expr:  intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"),
+			For:   ptr.To[promv1.Duration]("5m"),
+			Annotations: map[string]string{
+				"summary":     "High rate of rejected Vms",
+				"runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPHighRateRejectedVms"),
+			},
+			Labels: alertLabels("warning", "warning"),
+		},
+		{
+			Alert: "SSPCommonTemplatesModificationReverted",
+			Expr:  intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"),
+			For:   ptr.To[promv1.Duration]("0m"),
+			Annotations: map[string]string{
+				"summary":     "Common Templates manual modifications were reverted by the operator",
+				"runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPCommonTemplatesModificationReverted"),
+			},
+			Labels: alertLabels("warning", "none"),
+		},
+		{
+			Alert: "VirtualMachineCRCErrors",
+			Expr:  intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
+			Annotations: map[string]string{
+				"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages",
+				"summary":     "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
+				"runbook_url": fmt.Sprintf(runbookURLTemplate, "VirtualMachineCRCErrors"),
+			},
+			Labels: alertLabels("warning", "none"),
+		},
+	}
+}
diff --git a/tests/monitoring_test.go b/tests/monitoring_test.go index ad834e169..538f08ff9 100644 --- a/tests/monitoring_test.go +++ b/tests/monitoring_test.go @@ -31,6 +31,7 @@ import ( ssp "kubevirt.io/ssp-operator/api/v1beta2" "kubevirt.io/ssp-operator/internal/operands/metrics" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" "kubevirt.io/ssp-operator/tests/env" ) @@ -49,7 +50,7 @@ var _ = Describe("Prometheus Alerts", func() { }) It("[test_id:8363] Should fire SSPCommonTemplatesModificationReverted", func() { // we have to wait for prometheus to pick up the series before we increase it. 
- waitForSeriesToBeDetected(metrics.CommonTemplatesRestoredIncreaseQuery) + waitForSeriesToBeDetected(rules.CommonTemplatesRestoredIncreaseQuery) expectTemplateUpdateToIncreaseTotalRestoredTemplatesCount(testTemplate) waitForAlertToActivate("SSPCommonTemplatesModificationReverted") }) @@ -114,7 +115,7 @@ var _ = Describe("Prometheus Alerts", func() { }) It("[test_id:8377] Should fire SSPHighRateRejectedVms", func() { - waitForSeriesToBeDetected(metrics.TemplateValidatorRejectedIncreaseQuery) + waitForSeriesToBeDetected(rules.TemplateValidatorRejectedIncreaseQuery) Expect(apiClient.Create(ctx, template)).ToNot(HaveOccurred(), "Failed to create template: %s", template.Name) for range [6]int{} { time.Sleep(time.Second * 5) diff --git a/tools/metricsdocs/metricsdocs.go b/tools/metricsdocs/metricsdocs.go index 76a3bd4fa..cf3735afc 100644 --- a/tools/metricsdocs/metricsdocs.go +++ b/tools/metricsdocs/metricsdocs.go @@ -7,9 +7,9 @@ import ( "github.com/machadovilaca/operator-observability/pkg/operatormetrics" - "kubevirt.io/ssp-operator/internal/operands/metrics" sspMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/ssp-operator" validatorMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/template-validator" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) const ( @@ -30,7 +30,7 @@ const ( ) func main() { - metricsList := recordRulesDescToMetricList(metrics.RecordRulesDescList) + metricsList := recordRulesDescToMetricList(rules.RecordRulesDescList) sspMetrics.SetupMetrics() validatorMetrics.SetupMetrics() @@ -59,7 +59,7 @@ type metric struct { mtype string } -func recordRulesDescToMetricList(mdl []metrics.RecordRulesDesc) metricList { +func recordRulesDescToMetricList(mdl []rules.RecordRulesDesc) metricList { res := make([]metric, len(mdl)) for i, md := range mdl { res[i] = metricDescriptionToMetric(md) @@ -68,7 +68,7 @@ func recordRulesDescToMetricList(mdl []metrics.RecordRulesDesc) metricList { return res } -func metricDescriptionToMetric(rrd 
metrics.RecordRulesDesc) metric { +func metricDescriptionToMetric(rrd rules.RecordRulesDesc) metric { return metric{ name: rrd.Name, description: rrd.Description, diff --git a/tools/prom-metrics-collector/metrics_collector.go b/tools/prom-metrics-collector/metrics_collector.go index 522246750..b4e3c38e3 100644 --- a/tools/prom-metrics-collector/metrics_collector.go +++ b/tools/prom-metrics-collector/metrics_collector.go @@ -2,9 +2,9 @@ package main import ( parser "github.com/kubevirt/monitoring/pkg/metrics/parser" - "kubevirt.io/ssp-operator/internal/operands/metrics" - dto "github.com/prometheus/client_model/go" + + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) // This should be used only for very rare cases where the naming conventions that are explained in the best practices: @@ -14,7 +14,7 @@ var excludedMetrics = map[string]struct{}{} func readMetrics() []*dto.MetricFamily { var metricFamilies []*dto.MetricFamily - sspMetrics := metrics.RecordRulesDescList + sspMetrics := rules.RecordRulesDescList for _, metric := range sspMetrics { if _, isExcludedMetric := excludedMetrics[metric.Name]; !isExcludedMetric {