Skip to content

Commit

Permalink
change based on ops-recipes
Browse files Browse the repository at this point in the history
  • Loading branch information
QuentinBisson committed Nov 4, 2024
1 parent 40452a5 commit fbc9c8d
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ spec:
annotations:
dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.`}}'
opsrecipe: alloy-components/
opsrecipe: alloy/
summary: Component evaluations are taking too long.
expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
for: 15m
Expand All @@ -35,7 +35,7 @@ spec:
annotations:
dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}'
opsrecipe: alloy-components/
opsrecipe: alloy/
summary: Unhealthy components detected.
expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0
for: 15m
Expand All @@ -48,3 +48,29 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
- name: logging-agent
rules:
# This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready)
# and joins them with the containers that are not running.
- alert: LoggingAgentDown
  annotations:
    dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
    # Wording must stay identical to the exp_annotations of the
    # LoggingAgentDown unit tests, which expect "alloy pods".
    # The {{` ... `}} wrapper keeps Helm from interpreting the text.
    description: '{{`Scraping of all alloy pods to check if one failed every 30 minutes.`}}'
    opsrecipe: alloy/
  # kube_pod_info is joined with the `up` series of the alloy container on
  # (cluster_id, pod). In PromQL `*` binds tighter than `==`, so the `== 0`
  # filter applies to the joined result: only pods whose scrape target is
  # down remain.
  # NOTE(review): presumably kube_pod_info is the left-hand side so that its
  # node label survives the join for the node-based inhibitions below; confirm.
  expr: |-
    kube_pod_info{pod=~"alloy-logs.*"}
    * on(cluster_id, pod)
    group_left ()
    up{job="alloy-logs", container="alloy"} == 0
  # The target has to stay down for 30 minutes before the alert fires.
  for: 30m
  labels:
    area: platform
    severity: page
    team: atlas
    topic: observability
    # Inhibition labels are quoted so they stay strings, not booleans.
    cancel_if_outside_working_hours: "true"
    cancel_if_cluster_status_creating: "true"
    cancel_if_cluster_status_deleting: "true"
    cancel_if_cluster_status_updating: "true"
    cancel_if_node_unschedulable: "true"
    cancel_if_node_not_ready: "true"
Original file line number Diff line number Diff line change
Expand Up @@ -3,45 +3,19 @@ kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
name: logging.rules
name: logging-pipeline.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: logging-agent
rules:
# This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready)
# and join the pods with the not running containers
- alert: LoggingAgentDown
annotations:
dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}'
opsrecipe: logging-agent/
expr: |-
kube_pod_info{pod=~"alloy-logs.*"}
* on(cluster_id, pod)
group_left ()
up{job="alloy-logs", container="alloy"} == 0
for: 30m
labels:
area: platform
severity: page
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
- name: log-ingestion
- name: logging-pipeline
rules:
# Any alloy component that uses the loki.write component can throw such errors.
# This includes alloy-logs and the observability-gateway
- alert: LogForwardingErrors
annotations:
dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
description: '{{`More that 10% of the requests to Loki are failing.`}}'
opsrecipe: log-shipping-errors/
opsrecipe: logging-pipeline/
expr: |-
(
100
Expand Down Expand Up @@ -79,7 +53,7 @@ spec:
annotations:
dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
description: '{{`More that 10% of the loki requests to the observability gateway are failing.`}}'
opsrecipe: log-shipping-errors/
opsrecipe: logging-pipeline/
expr: |-
(
100
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ tests:
exp_annotations:
dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
description: "Component evaluations are taking too long under job alloy-controller, component_path path1, component_id comp1."
opsrecipe: "alloy-components/"
opsrecipe: "alloy/"
summary: "Component evaluations are taking too long."
- alertname: AlloySlowComponentEvaluations
eval_time: 80m
Expand Down Expand Up @@ -68,7 +68,7 @@ tests:
exp_annotations:
dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
description: "Unhealthy components detected under job alloy-controller"
opsrecipe: "alloy-components/"
opsrecipe: "alloy/"
summary: "Unhealthy components detected."
- alertname: AlloyUnhealthyComponents
eval_time: 80m
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
rule_files:
- logging.rules.yml
- logging-pipeline.rules.yml

tests:
# Test LoggingAgentDown
Expand Down Expand Up @@ -47,8 +47,8 @@ tests:
team: atlas
topic: observability
exp_annotations:
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "logging-agent/"
description: "Scraping of all alloy pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
# Tests with 2 pods
- alertname: LoggingAgentDown
eval_time: 111m
Expand All @@ -72,8 +72,8 @@ tests:
team: atlas
topic: observability
exp_annotations:
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "logging-agent/"
description: "Scraping of all alloy pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
- alertname: LoggingAgentDown
eval_time: 121m
- alertname: LoggingAgentDown
Expand All @@ -98,8 +98,8 @@ tests:
team: atlas
topic: observability
exp_annotations:
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "logging-agent/"
description: "Scraping of all alloy pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
Expand All @@ -119,8 +119,8 @@ tests:
team: atlas
topic: observability
exp_annotations:
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "logging-agent/"
description: "Scraping of all alloy pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
# Test LogForwardingErrors
- interval: 1m
input_series:
Expand Down Expand Up @@ -155,7 +155,7 @@ tests:
topic: observability
exp_annotations:
description: "More that 10% of the requests to Loki are failing."
opsrecipe: "log-shipping-errors/"
opsrecipe: "logging-pipeline/"
- alertname: LogForwardingErrors
eval_time: 330m
exp_alerts:
Expand All @@ -173,7 +173,7 @@ tests:
topic: observability
exp_annotations:
description: "More that 10% of the requests to Loki are failing."
opsrecipe: "log-shipping-errors/"
opsrecipe: "logging-pipeline/"
# Test LogReceivingErrors
- interval: 1m
input_series:
Expand Down Expand Up @@ -208,7 +208,7 @@ tests:
topic: observability
exp_annotations:
description: "More that 10% of the loki requests to the observability gateway are failing."
opsrecipe: "log-shipping-errors/"
opsrecipe: "logging-pipeline/"
- alertname: LogReceivingErrors
eval_time: 330m
exp_alerts:
Expand All @@ -226,4 +226,4 @@ tests:
topic: observability
exp_annotations:
description: "More that 10% of the loki requests to the observability gateway are failing."
opsrecipe: "log-shipping-errors/"
opsrecipe: "logging-pipeline/"

0 comments on commit fbc9c8d

Please sign in to comment.