Skip to content

Commit

Permalink
feat: Move metrics rules and alerts to separate package
Browse files Browse the repository at this point in the history
Moved the definitions of alerts and rules
to pkg/monitoring/rules package.

Signed-off-by: Andrej Krejcir <[email protected]>
  • Loading branch information
akrejcir committed Jan 23, 2024
1 parent 3c7c91d commit aa9a6fb
Show file tree
Hide file tree
Showing 5 changed files with 188 additions and 181 deletions.
176 changes: 4 additions & 172 deletions internal/operands/metrics/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,196 +2,28 @@ package metrics

import (
"errors"
"fmt"
"os"
"strings"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
rbac "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"

"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

// Names of the Prometheus-related objects and of the metrics port.
const (
	PrometheusRuleName           = "prometheus-k8s-rules-cnv"
	PrometheusClusterRoleName    = "prometheus-k8s-ssp"
	PrometheusServiceAccountName = "prometheus-k8s"
	MonitorNamespace             = "openshift-monitoring"
	MetricsPortName              = "metrics"
)

// Prometheus label key/value pair.
// NOTE(review): presumably used to select the operator's metrics endpoints —
// confirm against the ServiceMonitor definition.
const (
	PrometheusLabelKey   = "prometheus.ssp.kubevirt.io"
	PrometheusLabelValue = "true"
)

// Runbook link settings. The default template takes the alert name as its
// single %s argument.
// NOTE(review): runbookURLTemplateEnv looks like the name of an environment
// variable that overrides the default — confirm in getRunbookURLTemplate.
const (
	defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
	runbookURLTemplateEnv     = "RUNBOOK_URL_TEMPLATE"
)

// Label keys, and the fixed label values, attached to every alert rule.
const (
	severityAlertLabelKey     = "severity"
	healthImpactAlertLabelKey = "operator_health_impact"
	partOfAlertLabelKey       = "kubernetes_operator_part_of"
	partOfAlertLabelValue     = "kubevirt"
	componentAlertLabelKey    = "kubernetes_operator_component"
	componentAlertLabelValue  = "ssp-operator"
)

// CommonTemplatesRestoredIncreaseQuery is the PromQL expression that sums the
// one-hour increase of kubevirt_ssp_common_templates_restored_total over the
// pods whose name matches 'ssp-operator.*'.
const CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))"

// TemplateValidatorRejectedIncreaseQuery is the PromQL expression that sums
// the one-hour increase of kubevirt_ssp_template_validator_rejected_total over
// the pods whose name matches 'virt-template-validator.*'.
const TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))"

// RecordRulesDesc describes a single SSP Operator Prometheus recording rule.
type RecordRulesDesc struct {
// Name is used as the Record name of the generated Prometheus rule.
Name string
// Expr is the expression evaluated by the generated Prometheus rule.
Expr intstr.IntOrString
// Description is a human-readable explanation of the recorded metric.
Description string
// Type is the metric type; every entry in RecordRulesDescList uses "Gauge".
Type string
}

// RecordRulesDescList lists all SSP Operator Prometheus recording rules.
//
// Most expressions end in "OR on() vector(0)", so that the rule yields 0
// instead of no sample when the aggregated query returns an empty result.
var RecordRulesDescList = []RecordRulesDesc{
{
// Number of ssp-operator pods reporting up.
Name: "kubevirt_ssp_operator_up",
Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"),
Description: "The total number of running ssp-operator pods",
Type: "Gauge",
},
{
// Number of virt-template-validator pods reporting up.
Name: "kubevirt_ssp_template_validator_up",
Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"),
Description: "The total number of running virt-template-validator pods",
Type: "Gauge",
},
{
// Plain sum, without the vector(0) fallback used by the other rules.
Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated",
Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"),
Description: "The total number of ssp-operator pods reconciling with no errors",
Type: "Gauge",
},
{
// One-hour increase of the template validator rejection counter.
Name: "kubevirt_ssp_template_validator_rejected_increase",
Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"),
Description: "The increase in the number of rejected template validators, over the last hour",
Type: "Gauge",
},
{
// One-hour increase of the restored common templates counter.
Name: "kubevirt_ssp_common_templates_restored_increase",
Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"),
Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour",
Type: "Gauge",
},
}

// getAlertRules returns the operator's alert rules, preceded by the
// "cnv:vmi_status_running:count" recording rule.
//
// The runbook URL template comes from getRunbookURLTemplate; its error, if
// any, is returned unchanged together with a nil slice. The template is used
// as a fmt format string that receives the alert name as its only argument.
func getAlertRules() ([]promv1.Rule, error) {
	runbookURLTemplate, err := getRunbookURLTemplate()
	if err != nil {
		return nil, err
	}

	// runbookURL renders the runbook link of the named alert.
	runbookURL := func(alertName string) string {
		return fmt.Sprintf(runbookURLTemplate, alertName)
	}

	// alertLabels builds a fresh label map per alert; only the severity and
	// the operator health impact vary between alerts.
	alertLabels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:     severity,
			healthImpactAlertLabelKey: healthImpact,
			partOfAlertLabelKey:       partOfAlertLabelValue,
			componentAlertLabelKey:    componentAlertLabelValue,
		}
	}

	// pendingFor returns a new pointer on every call, so rules never share
	// the same Duration value.
	pendingFor := func(d promv1.Duration) *promv1.Duration {
		return ptr.To(d)
	}

	alertRules := []promv1.Rule{
		{
			Record: "cnv:vmi_status_running:count",
			Expr:   intstr.FromString(`sum(kubevirt_vmi_phase_count{phase="running"}) by (node,os,workload,flavor,instance_type,preference)`),
		},
		{
			Alert: "SSPDown",
			Expr:  intstr.FromString("kubevirt_ssp_operator_up == 0"),
			For:   pendingFor("5m"),
			Annotations: map[string]string{
				"summary":     "All SSP operator pods are down.",
				"runbook_url": runbookURL("SSPDown"),
			},
			Labels: alertLabels("critical", "critical"),
		},
		{
			Alert: "SSPTemplateValidatorDown",
			Expr:  intstr.FromString("kubevirt_ssp_template_validator_up == 0"),
			For:   pendingFor("5m"),
			Annotations: map[string]string{
				"summary":     "All Template Validator pods are down.",
				"runbook_url": runbookURL("SSPTemplateValidatorDown"),
			},
			Labels: alertLabels("critical", "critical"),
		},
		{
			Alert: "SSPFailingToReconcile",
			Expr:  intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"),
			For:   pendingFor("5m"),
			Annotations: map[string]string{
				"summary":     "The ssp-operator pod is up but failing to reconcile",
				"runbook_url": runbookURL("SSPFailingToReconcile"),
			},
			Labels: alertLabels("critical", "critical"),
		},
		{
			Alert: "SSPHighRateRejectedVms",
			Expr:  intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"),
			For:   pendingFor("5m"),
			Annotations: map[string]string{
				"summary":     "High rate of rejected Vms",
				"runbook_url": runbookURL("SSPHighRateRejectedVms"),
			},
			Labels: alertLabels("warning", "warning"),
		},
		{
			Alert: "SSPCommonTemplatesModificationReverted",
			Expr:  intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"),
			For:   pendingFor("0m"),
			Annotations: map[string]string{
				"summary":     "Common Templates manual modifications were reverted by the operator",
				"runbook_url": runbookURL("SSPCommonTemplatesModificationReverted"),
			},
			Labels: alertLabels("warning", "none"),
		},
		{
			// No For duration is set on this alert.
			Alert: "VirtualMachineCRCErrors",
			Expr:  intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
			Annotations: map[string]string{
				"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages",
				"summary":     "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
				"runbook_url": runbookURL("VirtualMachineCRCErrors"),
			},
			Labels: alertLabels("warning", "none"),
		},
	}

	return alertRules, nil
}

// getRecordRules converts every entry of RecordRulesDescList into a
// Prometheus recording rule, preserving the list order.
//
// Only Name (as the rule's Record) and Expr are carried over; Description and
// Type are not part of the generated rule.
func getRecordRules() []promv1.Rule {
	// The final length is known up front, so allocate once instead of growing
	// the slice on every append.
	recordRules := make([]promv1.Rule, 0, len(RecordRulesDescList))

	// Index into the list to avoid copying each descriptor per iteration.
	for i := range RecordRulesDescList {
		desc := &RecordRulesDescList[i]
		recordRules = append(recordRules, promv1.Rule{Record: desc.Name, Expr: desc.Expr})
	}

	return recordRules
}

func newMonitoringClusterRole() *rbac.ClusterRole {
return &rbac.ClusterRole{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -266,7 +98,7 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor {
}

func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
alertRules, err := getAlertRules()
runbookURLTemplate, err := getRunbookURLTemplate()
if err != nil {
return nil, err
}
Expand All @@ -286,7 +118,7 @@ func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
Groups: []promv1.RuleGroup{
{
Name: "cnv.rules",
Rules: append(alertRules, getRecordRules()...),
Rules: append(rules.RecordRules(), rules.AlertRules(runbookURLTemplate)...),
},
},
},
Expand Down
174 changes: 174 additions & 0 deletions pkg/monitoring/rules/rules.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
package rules

import (
"fmt"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

// Label keys set on every alert rule returned by AlertRules.
const (
	severityAlertLabelKey     = "severity"
	healthImpactAlertLabelKey = "operator_health_impact"
	partOfAlertLabelKey       = "kubernetes_operator_part_of"
	componentAlertLabelKey    = "kubernetes_operator_component"
)

// Label values that are the same for every alert rule.
const (
	partOfAlertLabelValue    = "kubevirt"
	componentAlertLabelValue = "ssp-operator"
)

// CommonTemplatesRestoredIncreaseQuery is the PromQL expression that sums the
// one-hour increase of kubevirt_ssp_common_templates_restored_total over the
// pods whose name matches 'ssp-operator.*'.
const CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))"

// TemplateValidatorRejectedIncreaseQuery is the PromQL expression that sums
// the one-hour increase of kubevirt_ssp_template_validator_rejected_total over
// the pods whose name matches 'virt-template-validator.*'.
const TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))"

// RecordRulesDesc describes a single SSP Operator Prometheus recording rule.
type RecordRulesDesc struct {
// Name is used as the Record name of the generated Prometheus rule.
Name string
// Expr is the expression evaluated by the generated Prometheus rule.
Expr intstr.IntOrString
// Description is a human-readable explanation of the recorded metric.
Description string
// Type is the metric type; every entry in RecordRulesDescList uses "Gauge".
Type string
}

// RecordRulesDescList lists all SSP Operator Prometheus recording rules.
//
// Most expressions end in "OR on() vector(0)", so that the rule yields 0
// instead of no sample when the aggregated query returns an empty result.
var RecordRulesDescList = []RecordRulesDesc{
{
// Number of ssp-operator pods reporting up.
Name: "kubevirt_ssp_operator_up",
Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"),
Description: "The total number of running ssp-operator pods",
Type: "Gauge",
},
{
// Number of virt-template-validator pods reporting up.
Name: "kubevirt_ssp_template_validator_up",
Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"),
Description: "The total number of running virt-template-validator pods",
Type: "Gauge",
},
{
// Plain sum, without the vector(0) fallback used by the other rules.
Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated",
Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"),
Description: "The total number of ssp-operator pods reconciling with no errors",
Type: "Gauge",
},
{
// One-hour increase of the template validator rejection counter.
Name: "kubevirt_ssp_template_validator_rejected_increase",
Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"),
Description: "The increase in the number of rejected template validators, over the last hour",
Type: "Gauge",
},
{
// One-hour increase of the restored common templates counter.
Name: "kubevirt_ssp_common_templates_restored_increase",
Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"),
Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour",
Type: "Gauge",
},
}

// RecordRules converts every entry of RecordRulesDescList into a Prometheus
// recording rule, preserving the list order.
//
// Only Name (as the rule's Record) and Expr are carried over; Description and
// Type are not part of the generated rule.
func RecordRules() []promv1.Rule {
	// The final length is known up front, so allocate once instead of growing
	// the slice on every append.
	recordRules := make([]promv1.Rule, 0, len(RecordRulesDescList))

	// Index into the list to avoid copying each descriptor per iteration.
	for i := range RecordRulesDescList {
		desc := &RecordRulesDescList[i]
		recordRules = append(recordRules, promv1.Rule{Record: desc.Name, Expr: desc.Expr})
	}

	return recordRules
}

// AlertRules returns the SSP operator alert rules, preceded by the
// "cnv:vmi_status_running:count" recording rule.
//
// runbookURLTemplate is used as a fmt format string; it receives the alert
// name as its only argument and the result becomes the alert's "runbook_url"
// annotation.
func AlertRules(runbookURLTemplate string) []promv1.Rule {
	// runbookURL renders the runbook link of the named alert.
	runbookURL := func(alertName string) string {
		return fmt.Sprintf(runbookURLTemplate, alertName)
	}

	// alertLabels builds a fresh label map per alert; only the severity and
	// the operator health impact vary between alerts.
	alertLabels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:     severity,
			healthImpactAlertLabelKey: healthImpact,
			partOfAlertLabelKey:       partOfAlertLabelValue,
			componentAlertLabelKey:    componentAlertLabelValue,
		}
	}

	// pendingFor returns a new pointer on every call, so rules never share
	// the same Duration value.
	pendingFor := func(d promv1.Duration) *promv1.Duration {
		return ptr.To(d)
	}

	vmiRunningCount := promv1.Rule{
		Record: "cnv:vmi_status_running:count",
		Expr:   intstr.FromString(`sum(kubevirt_vmi_phase_count{phase="running"}) by (node,os,workload,flavor,instance_type,preference)`),
	}

	sspDown := promv1.Rule{
		Alert: "SSPDown",
		Expr:  intstr.FromString("kubevirt_ssp_operator_up == 0"),
		For:   pendingFor("5m"),
		Annotations: map[string]string{
			"summary":     "All SSP operator pods are down.",
			"runbook_url": runbookURL("SSPDown"),
		},
		Labels: alertLabels("critical", "critical"),
	}

	templateValidatorDown := promv1.Rule{
		Alert: "SSPTemplateValidatorDown",
		Expr:  intstr.FromString("kubevirt_ssp_template_validator_up == 0"),
		For:   pendingFor("5m"),
		Annotations: map[string]string{
			"summary":     "All Template Validator pods are down.",
			"runbook_url": runbookURL("SSPTemplateValidatorDown"),
		},
		Labels: alertLabels("critical", "critical"),
	}

	failingToReconcile := promv1.Rule{
		Alert: "SSPFailingToReconcile",
		Expr:  intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"),
		For:   pendingFor("5m"),
		Annotations: map[string]string{
			"summary":     "The ssp-operator pod is up but failing to reconcile",
			"runbook_url": runbookURL("SSPFailingToReconcile"),
		},
		Labels: alertLabels("critical", "critical"),
	}

	highRateRejectedVms := promv1.Rule{
		Alert: "SSPHighRateRejectedVms",
		Expr:  intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"),
		For:   pendingFor("5m"),
		Annotations: map[string]string{
			"summary":     "High rate of rejected Vms",
			"runbook_url": runbookURL("SSPHighRateRejectedVms"),
		},
		Labels: alertLabels("warning", "warning"),
	}

	templatesModificationReverted := promv1.Rule{
		Alert: "SSPCommonTemplatesModificationReverted",
		Expr:  intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"),
		For:   pendingFor("0m"),
		Annotations: map[string]string{
			"summary":     "Common Templates manual modifications were reverted by the operator",
			"runbook_url": runbookURL("SSPCommonTemplatesModificationReverted"),
		},
		Labels: alertLabels("warning", "none"),
	}

	// No For duration is set on this alert.
	vmCRCErrors := promv1.Rule{
		Alert: "VirtualMachineCRCErrors",
		Expr:  intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
		Annotations: map[string]string{
			"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages",
			"summary":     "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
			"runbook_url": runbookURL("VirtualMachineCRCErrors"),
		},
		Labels: alertLabels("warning", "none"),
	}

	return []promv1.Rule{
		vmiRunningCount,
		sspDown,
		templateValidatorDown,
		failingToReconcile,
		highRateRejectedVms,
		templatesModificationReverted,
		vmCRCErrors,
	}
}
Loading

0 comments on commit aa9a6fb

Please sign in to comment.