From 209954c490a9f0932b6dbca952c14627062b09aa Mon Sep 17 00:00:00 2001 From: JustHumanz Date: Thu, 12 Dec 2024 02:50:57 +0700 Subject: [PATCH 1/5] Enhance `MySQLDown` alert --- .../files/jsonnet/mixins.libsonnet | 27 +++++++++++++++++++ .../files/jsonnet/tests.yml | 15 +++++++++++ 2 files changed, 42 insertions(+) diff --git a/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet index 2b876b754..4997edb89 100644 --- a/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet +++ b/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet @@ -16,6 +16,10 @@ local disabledAlerts = [ // * Dropped `CephPGImbalance` // the balancer module takes care of this 'CephPGImbalance', + + // * Dropped `MySQLDown` due to noisy alerts even + // the replication still more than minimum + 'MySQLDown', ]; // NOTE(mnaser): This is the default mapping for severities: @@ -141,6 +145,29 @@ local mixins = { severity: 'warning', }, }, + { + alert: 'MysqlClusterDown', + 'for': '1m', + expr: 'count(mysql_up==0) != count(mysql_up)', + labels: { + severity: 'info', + }, + annotations: { + summary: '{{ $value }} percona-xtradb replication down', + }, + }, + { + alert: 'MysqlClusterDown', + 'for': '1m', + expr: 'count(mysql_up==1) < 3', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Only {{ $value }} percona-xtradb cluster are online', + description: "percona-xtradb cluster less than 3 replication, please check with kubectl get pods -n openstack -l app.kubernetes.io/component=pxc", + }, + }, ], }, ], diff --git a/roles/kube_prometheus_stack/files/jsonnet/tests.yml b/roles/kube_prometheus_stack/files/jsonnet/tests.yml index 4775bb2b8..216dc24b5 100644 --- a/roles/kube_prometheus_stack/files/jsonnet/tests.yml +++ b/roles/kube_prometheus_stack/files/jsonnet/tests.yml @@ -105,3 +105,18 @@ tests: - eval_time: 5m alertname: NodeTimeSkewDetected exp_alerts: [] + + - interval: 1m + input_series: + - series: 'mysql_up' + values: '1' + input_series: + - series: 'mysql_up' + values: '1' + input_series: + - series: 'mysql_up' + values: '0' + alert_rule_test: + - eval_time: 5m + alertname: MysqlClusterDown + exp_alerts: [] \ No newline at end of file From dc869295708859b6de65ba03abb04f9093fb2e94 Mon Sep 17 00:00:00 2001 From: JustHumanz Date: Fri, 13 Dec 2024 22:54:34 +0700 Subject: [PATCH 2/5] add more labels in MysqlClusterDown test --- roles/kube_prometheus_stack/files/jsonnet/tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/roles/kube_prometheus_stack/files/jsonnet/tests.yml b/roles/kube_prometheus_stack/files/jsonnet/tests.yml index 216dc24b5..a212515c7 100644 --- a/roles/kube_prometheus_stack/files/jsonnet/tests.yml +++ b/roles/kube_prometheus_stack/files/jsonnet/tests.yml @@ -108,13 +108,13 @@ tests: - interval: 1m input_series: - - series: 'mysql_up' + - series: 'mysql_up{instance="percona-xtradb-pxc-0", job="pxc"}' values: '1' input_series: - - series: 'mysql_up' + - series: 'mysql_up{instance="percona-xtradb-pxc-1", job="pxc"}' values: '1' input_series: - - series: 'mysql_up' + - series: 'mysql_up{instance="percona-xtradb-pxc-3", job="pxc"}' values: '0' alert_rule_test: - eval_time: 5m From 7610c8c29d8c904068f77e4621ec3a6d290fad68 Mon Sep 17 00:00:00 2001 From: JustHumanz Date: Sat, 14 Dec 2024 04:30:35 +0700 Subject: [PATCH 3/5] Replace hard code with percentage --- .../kube_prometheus_stack/files/jsonnet/mixins.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet index 4997edb89..64e83b343 100644 --- a/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet +++ b/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet @@ -159,14 +159,14 @@ local mixins = { { alert: 'MysqlClusterDown', 'for': '1m', - expr: 'count(mysql_up==1) < 3', + expr: 'round(count(mysql_up==1)/count(mysql_up) * 100) <= 50', labels: { severity: 'warning', }, annotations: { - summary: 'Only {{ $value }} percona-xtradb cluster are online', - description: "percona-xtradb cluster less than 3 replication, please check with kubectl get pods -n openstack -l app.kubernetes.io/component=pxc", - }, + summary: 'Only {{ $value }}% percona-xtradb cluster are online', + description: "percona-xtradb cluster less than minimum replication, please check with kubectl get pods -n openstack -l app.kubernetes.io/component=pxc", + }, }, ], }, From 90cb5c53a71f16bd74ade69ebeda6474acd2ead3 Mon Sep 17 00:00:00 2001 From: JustHumanz Date: Sat, 14 Dec 2024 04:31:45 +0700 Subject: [PATCH 4/5] Fix unit tests&add tests for no/low/high alerts --- .../files/jsonnet/tests.yml | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/roles/kube_prometheus_stack/files/jsonnet/tests.yml b/roles/kube_prometheus_stack/files/jsonnet/tests.yml index a212515c7..3faf14a6b 100644 --- a/roles/kube_prometheus_stack/files/jsonnet/tests.yml +++ b/roles/kube_prometheus_stack/files/jsonnet/tests.yml @@ -110,13 +110,50 @@ tests: input_series: - series: 'mysql_up{instance="percona-xtradb-pxc-0", job="pxc"}' values: '1' - input_series: - series: 'mysql_up{instance="percona-xtradb-pxc-1", job="pxc"}' values: '1' + - series: 'mysql_up{instance="percona-xtradb-pxc-3", job="pxc"}' + values: '0' + alert_rule_test: + - eval_time: 1m + alertname: MysqlClusterDown + exp_alerts: + - exp_labels: + severity: P5 + exp_annotations: + summary: "1 percona-xtradb replication down" + + - interval: 1m input_series: + - series: 'mysql_up{instance="percona-xtradb-pxc-0", job="pxc"}' + values: '1' + - series: 'mysql_up{instance="percona-xtradb-pxc-1", job="pxc"}' + values: '0' - series: 'mysql_up{instance="percona-xtradb-pxc-3", job="pxc"}' values: '0' alert_rule_test: - - eval_time: 5m + - eval_time: 1m + alertname: MysqlClusterDown + exp_alerts: + - exp_labels: + severity: P3 + exp_annotations: + summary: 'Only 33% percona-xtradb cluster are online' + description: "percona-xtradb cluster less than minimum replication, please check with kubectl get pods -n openstack -l app.kubernetes.io/component=pxc" + - exp_labels: + severity: P5 + exp_annotations: + summary: "2 percona-xtradb replication down" + + - interval: 1m + input_series: + - series: 'mysql_up{instance="percona-xtradb-pxc-0", job="pxc"}' + values: '1' + - series: 'mysql_up{instance="percona-xtradb-pxc-1", job="pxc"}' + values: '1' + - series: 'mysql_up{instance="percona-xtradb-pxc-3", job="pxc"}' + values: '1' + alert_rule_test: + - eval_time: 1m alertname: MysqlClusterDown - exp_alerts: [] \ No newline at end of file + exp_alerts: [] \ No newline at end of file From 38eb94e36e7ba101293dae12112f65a27620ed44 Mon Sep 17 00:00:00 2001 From: Mohammed Naser Date: Mon, 16 Dec 2024 22:10:15 -0500 Subject: [PATCH 5/5] Tune and make alerts more verbose --- .../files/jsonnet/mixins.libsonnet | 29 ++++++---- .../files/jsonnet/tests.yml | 57 ++++++++++++++----- 2 files changed, 62 insertions(+), 24 deletions(-) diff --git a/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet index 64e83b343..3a31daa94 100644 --- a/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet +++ b/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet @@ -147,25 +147,32 @@ local mixins = { }, { alert: 'MysqlClusterDown', - 'for': '1m', - expr: 'count(mysql_up==0) != count(mysql_up)', - labels: { - severity: 'info', + 'for': '5m', + expr: 'mysql_up == 0', + labels: { severity: 'info' }, + annotations: { + summary: 'Percona XtraDB Cluster replica is down', + description: "{{ $labels.instance }} replica is down.", }, + }, + { + alert: 'MysqlClusterDown', + 'for': '5m', + expr: 'round(count(mysql_up==1) / count(mysql_up) * 100) <= 50', + labels: { severity: 'warning' }, annotations: { - summary: '{{ $value }} percona-xtradb replication down', + summary: 'Percona XtraDB Cluster replicas are down', + description: "{{ $value }}% of replicas are online.", }, }, { alert: 'MysqlClusterDown', 'for': '1m', - expr: 'round(count(mysql_up==1)/count(mysql_up) * 100) <= 50', - labels: { - severity: 'warning', - }, + expr: 'count(mysql_up==0) == count(mysql_up)', + labels: { severity: 'critical' }, annotations: { - summary: 'Only {{ $value }}% percona-xtradb cluster are online', - description: "percona-xtradb cluster less than minimum replication, please check with kubectl get pods -n openstack -l app.kubernetes.io/component=pxc", + summary: 'Percona XtraDB Cluster is down', + description: "All replicas are down.", }, }, ], diff --git a/roles/kube_prometheus_stack/files/jsonnet/tests.yml b/roles/kube_prometheus_stack/files/jsonnet/tests.yml index 3faf14a6b..6ed198da2 100644 --- a/roles/kube_prometheus_stack/files/jsonnet/tests.yml +++ b/roles/kube_prometheus_stack/files/jsonnet/tests.yml @@ -112,16 +112,32 @@ tests: values: '1' - series: 'mysql_up{instance="percona-xtradb-pxc-1", job="pxc"}' values: '1' - - series: 'mysql_up{instance="percona-xtradb-pxc-3", job="pxc"}' - values: '0' + - series: 'mysql_up{instance="percona-xtradb-pxc-2", job="pxc"}' + values: '1' alert_rule_test: - eval_time: 1m + alertname: MysqlClusterDown + exp_alerts: [] + + - interval: 1m + input_series: + - series: 'mysql_up{instance="percona-xtradb-pxc-0", job="pxc"}' + values: '1' + - series: 'mysql_up{instance="percona-xtradb-pxc-1", job="pxc"}' + values: '1' + - series: 'mysql_up{instance="percona-xtradb-pxc-2", job="pxc"}' + values: '0' + alert_rule_test: + - eval_time: 5m alertname: MysqlClusterDown exp_alerts: - exp_labels: severity: P5 + instance: percona-xtradb-pxc-2 + job: pxc exp_annotations: - summary: "1 percona-xtradb replication down" + summary: Percona XtraDB Cluster replica is down + description: percona-xtradb-pxc-2 replica is down. - interval: 1m input_series: @@ -129,31 +145,46 @@ tests: values: '1' - series: 'mysql_up{instance="percona-xtradb-pxc-1", job="pxc"}' values: '0' - - series: 'mysql_up{instance="percona-xtradb-pxc-3", job="pxc"}' - values: '0' + - series: 'mysql_up{instance="percona-xtradb-pxc-2", job="pxc"}' + values: '0' alert_rule_test: - - eval_time: 1m + - eval_time: 5m alertname: MysqlClusterDown exp_alerts: - exp_labels: severity: P3 exp_annotations: - summary: 'Only 33% percona-xtradb cluster are online' - description: "percona-xtradb cluster less than minimum replication, please check with kubectl get pods -n openstack -l app.kubernetes.io/component=pxc" + summary: Percona XtraDB Cluster replicas are down + description: 33% of replicas are online. - exp_labels: severity: P5 + instance: percona-xtradb-pxc-1 + job: pxc exp_annotations: - summary: "2 percona-xtradb replication down" + summary: Percona XtraDB Cluster replica is down + description: percona-xtradb-pxc-1 replica is down. + - exp_labels: + severity: P5 + instance: percona-xtradb-pxc-2 + job: pxc + exp_annotations: + summary: Percona XtraDB Cluster replica is down + description: percona-xtradb-pxc-2 replica is down. - interval: 1m input_series: - series: 'mysql_up{instance="percona-xtradb-pxc-0", job="pxc"}' - values: '1' + values: '0' - series: 'mysql_up{instance="percona-xtradb-pxc-1", job="pxc"}' - values: '1' + values: '0' - series: 'mysql_up{instance="percona-xtradb-pxc-3", job="pxc"}' - values: '1' + values: '0' alert_rule_test: - eval_time: 1m alertname: MysqlClusterDown - exp_alerts: [] \ No newline at end of file + exp_alerts: + - exp_labels: + severity: P1 + exp_annotations: + summary: Percona XtraDB Cluster is down + description: All replicas are down.