Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: clean up of deleted rule group #223

Merged
merged 1 commit into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 9 additions & 22 deletions controllers/absence_prometheusrule.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"reflect"
"sort"
"strings"
"time"

monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
Expand Down Expand Up @@ -302,8 +303,8 @@ func (r *PrometheusRuleReconciler) updateAbsenceAlertRules(ctx context.Context,

// Step 5: if it's an existing AbsencePrometheusRule then update otherwise create a new resource.
if existingAbsencePrometheusRule {
existingRuleGroups := absencePromRule.Spec.Groups
result := mergeAbsenceRuleGroups(existingRuleGroups, absenceRuleGroups)
existingRuleGroups := unmodifiedAbsencePromRule.Spec.Groups
result := mergeAbsenceRuleGroups(promRuleName, existingRuleGroups, absenceRuleGroups)
if reflect.DeepEqual(unmodifiedAbsencePromRule.GetLabels(), absencePromRule.GetLabels()) &&
reflect.DeepEqual(existingRuleGroups, result) {
return nil
Expand All @@ -318,27 +319,13 @@ func (r *PrometheusRuleReconciler) updateAbsenceAlertRules(ctx context.Context,
// mergeAbsenceRuleGroups merges existing and newly generated AbsenceRuleGroups. If the
// same AbsenceRuleGroup exists in both 'existing' and 'new' then the newer one will be
// used.
func mergeAbsenceRuleGroups(existingRuleGroups, newRuleGroups []monitoringv1.RuleGroup) []monitoringv1.RuleGroup {
func mergeAbsenceRuleGroups(promRuleName string, existingRuleGroups, newRuleGroups []monitoringv1.RuleGroup) []monitoringv1.RuleGroup {
var result []monitoringv1.RuleGroup
added := make(map[string]bool)

OuterLoop:
for _, oldG := range existingRuleGroups {
for _, newG := range newRuleGroups {
if oldG.Name == newG.Name {
// Add the new updated RuleGroup.
result = append(result, newG)
added[newG.Name] = true
continue OuterLoop
}
}
// This RuleGroup should be carried over as is.
result = append(result, oldG)
}

// Add the pending rule groups.
for _, g := range newRuleGroups {
if !added[g.Name] {
// Add the absence rule groups for the PrometheusRule that we are currently dealing with.
result = append(result, newRuleGroups...)
// Carry over the absence rule groups for other PrometheusRule(s) as is.
for _, g := range existingRuleGroups {
if !strings.HasPrefix(g.Name, promRuleName) {
result = append(result, g)
}
}
Expand Down
6 changes: 3 additions & 3 deletions controllers/alert_rule.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,9 @@ func (mex *metricNameExtractor) Visit(node parser.Node, path []parser.Node) (par
return mex, nil
}

// AbsenceRuleGroupName returns the name of the RuleGroup that holds absence alert rules
// for a specific RuleGroup in a specific PrometheusRule.
//
// The name has the form "<promRule>/<ruleGroup>" (e.g.
// "openstack-limes-api.alerts/api.alerts"), which lets callers identify all
// absence rule groups belonging to a given PrometheusRule by prefix-matching
// on the PrometheusRule name.
func AbsenceRuleGroupName(promRule, ruleGroup string) string {
	return fmt.Sprintf("%s/%s", promRule, ruleGroup)
}

Expand Down Expand Up @@ -161,7 +161,7 @@ func ParseRuleGroups(logger logr.Logger, in []monitoringv1.RuleGroup, promRuleNa
})

out = append(out, monitoringv1.RuleGroup{
Name: absenceRuleGroupName(promRuleName, g.Name),
Name: AbsenceRuleGroupName(promRuleName, g.Name),
Rules: absenceAlertRules,
})
}
Expand Down
2 changes: 2 additions & 0 deletions controllers/alert_rule_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ var _ = Describe("Alert Rule", func() {
Expect(err).ToNot(HaveOccurred())
Expect(actual).To(HaveLen(len(expected)))

// We only check the alert name, expression, and labels. Annotations are hard-coded and
// don't need to be checked here in unit tests; they are already checked in e2e tests.
for i, wanted := range expected {
got := actual[i]
Expect(got.Alert).To(Equal(wanted.Alert))
Expand Down
22 changes: 20 additions & 2 deletions e2e/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,25 @@ var _ = Describe("Controller", Ordered, func() {
})
})

// Regression test for cleanup of absence rule groups: when a rule group is
// removed from a source PrometheusRule, the operator must drop the
// corresponding generated group from the aggregate AbsencePrometheusRule
// instead of carrying it over forever.
Context("when a rule group is deleted from a PrometheusRule", func() {
It("should delete the corresponding rule group from the AbsencePromRule "+osAbsencePRName+" in "+resmgmtNs+" namespace", func() {
// We will remove one of the two rule groups.
prName := "openstack-limes-api.alerts"
pr := getPromRule(newObjKey(resmgmtNs, prName))
// Remember the name of the group we are about to delete so we can
// assert on its generated absence-group counterpart below.
ruleGroupName := pr.Spec.Groups[0].Name
// Drop the first group and persist the change; the controller should
// react by regenerating the AbsencePrometheusRule without it.
pr.Spec.Groups = pr.Spec.Groups[1:]
Expect(k8sClient.Update(ctx, &pr)).To(Succeed())

waitForControllerToProcess()
actual := getPromRule(newObjKey(resmgmtNs, osAbsencePRName))
// Collect just the group names; only membership matters for this check.
groups := make([]string, 0, len(actual.Spec.Groups))
for _, v := range actual.Spec.Groups {
groups = append(groups, v.Name)
}
// The generated absence group is named "<promRule>/<ruleGroup>"; it must
// be gone now that its source group was deleted.
Expect(groups).ToNot(ContainElement(controllers.AbsenceRuleGroupName(prName, ruleGroupName)))
})
})

Context("when a PrometheusRule has no alert rules", func() {
It("should delete "+osAbsencePRName+" in "+resmgmtNs+" namespace", func() {
// We will remove all alert rules from a PromRule which should result in
Expand All @@ -204,8 +223,7 @@ var _ = Describe("Controller", Ordered, func() {
Expect(k8sClient.Update(ctx, &pr)).To(Succeed())

waitForControllerToProcess()
expectToNotFindPromRule(newObjKey("resmgmt", osAbsencePRName))

expectToNotFindPromRule(newObjKey(resmgmtNs, osAbsencePRName))
})
})
})
Expand Down
17 changes: 17 additions & 0 deletions e2e/fixtures/resmgmt_openstack_absent_metrics_alert_rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,23 @@ spec:
operator playbook>.
summary: missing limes_foo

- name: openstack-limes-api.alerts/api2.alerts
rules:
- alert: AbsentContainersLimesBar
expr: absent(limes_bar)
for: 10m
labels:
context: absent-metrics
support_group: containers
service: limes
severity: info
annotations:
description:
The metric 'limes_bar' is missing. 'OpenstackLimesBar'
alert using it may not fire as intended. See <https://github.com/sapcc/absent-metrics-operator/blob/master/docs/playbook.md|the
operator playbook>.
summary: missing limes_bar

- name: openstack-limes-roleassign.alerts/roleassignment.alerts
rules:
- alert: AbsentContainersLimesOpenstackAssignmentsPerRole
Expand Down
14 changes: 14 additions & 0 deletions e2e/fixtures/start-data/resmgmt_openstack_limes_api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,17 @@ spec:
annotations:
summary: "Server errors on {{ $labels.kubernetes_name }}"
description: "{{ $labels.kubernetes_name }} is producing HTTP responses with 5xx status codes."

- name: api2.alerts
rules:
- alert: OpenstackLimesBar
expr: limes_bar > 0
for: 5m
labels:
context: api
severity: info
support_group: containers
service: limes
annotations:
summary: "Server errors on {{ $labels.kubernetes_name }}"
description: "{{ $labels.kubernetes_name }} is producing HTTP responses with 5xx status codes."
Loading