From ad8d3d2e0bbe033f61710fdfedda5b610f68d320 Mon Sep 17 00:00:00 2001 From: EliseCastle23 <109446148+EliseCastle23@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:54:14 -0600 Subject: [PATCH] fixing linting --- helm/alloy/README.md | 2 +- helm/alloy/values.yaml | 2 +- helm/faro-collector/README.md | 2 +- helm/faro-collector/values.yaml | 2 +- helm/observability/Chart.yaml | 2 +- helm/observability/README.md | 4 ++-- helm/observability/values.yaml | 2 +- helm/test.yaml | 1 - 8 files changed, 8 insertions(+), 9 deletions(-) delete mode 100644 helm/test.yaml diff --git a/helm/alloy/README.md b/helm/alloy/README.md index 5c6a6b75..5ff407d1 100644 --- a/helm/alloy/README.md +++ b/helm/alloy/README.md @@ -22,7 +22,7 @@ A Helm chart for deploying Grafana Alloy | alloy.alloy.resources.requests.memory | string | `"1Gi"` | | | alloy.alloy.stabilityLevel | string | `"public-preview"` | | | alloy.alloy.uiPathPrefix | string | `"/alloy"` | | -| alloy.alloyConfigmapData | string | `"logging {\n level = \"info\"\n format = \"json\"\n write_to = [loki.write.endpoint.receiver]\n}\n\n/////////////////////// OTLP START ///////////////////////\n\notelcol.receiver.otlp \"default\" {\n grpc {}\n http {}\n\n output {\n metrics = [otelcol.processor.batch.default.input]\n traces = [otelcol.processor.batch.default.input]\n }\n}\n\notelcol.processor.batch \"default\" {\n output {\n metrics = [otelcol.exporter.prometheus.default.input]\n traces = [otelcol.exporter.otlp.tempo.input]\n }\n}\n\notelcol.exporter.prometheus \"default\" {\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\notelcol.exporter.otlp \"tempo\" {\n client {\n endpoint = \"http://monitoring-tempo-distributor.monitoring:4317\"\n // Configure TLS settings for communicating with the endpoint.\n tls {\n // The connection is insecure.\n insecure = true\n // Do not verify TLS certificates when connecting.\n insecure_skip_verify = true\n }\n }\n}\n\n\n/////////////////////// OTLP END ///////////////////////\n\n// discover all pods, to be used later in this config\ndiscovery.kubernetes \"pods\" {\n role = \"pod\"\n}\n\n// discover all services, to be used later in this config\ndiscovery.kubernetes \"services\" {\n role = \"service\"\n}\n\n// discover all nodes, to be used later in this config\ndiscovery.kubernetes \"nodes\" {\n role = \"node\"\n}\n\n// Generic scrape of any pod with Annotation \"prometheus.io/scrape: true\"\ndiscovery.relabel \"annotation_autodiscovery_pods\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scrape\"]\n regex = \"true\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_job\"]\n action = \"replace\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_instance\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_path\"]\n action = \"replace\"\n target_label = \"__metrics_path__\"\n }\n\n // Choose the pod port\n // The discovery generates a target for each declared container port of the pod.\n // If the metricsPortName annotation has value, keep only the target where the port name matches the one of the annotation.\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_portName\"]\n regex = \"(.+)\"\n target_label = 
\"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n // If the metrics port number annotation has a value, override the target address to use it, regardless whether it is\n // one of the declared ports on that Pod.\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n regex = \"(\\\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})\"\n replacement = \"[$2]:$1\" // IPv6\n target_label = \"__address__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n regex = \"(\\\\d+);((([0-9]+?)(\\\\.|$)){4})\" // IPv4, takes priority over IPv6 when both exists\n replacement = \"$2:$1\"\n target_label = \"__address__\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n target_label = \"__scheme__\"\n }\n\n\n // add labels\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_controller_name\"]\n target_label = \"controller\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app\"]\n target_label = \"app\"\n }\n\n // map all labels\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_label_(.+)\"\n }\n}\n\n// Generic scrape of any service with\n// Annotation Autodiscovery\ndiscovery.relabel \"annotation_autodiscovery_services\" {\n targets = discovery.kubernetes.services.targets\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_scrape\"]\n regex = \"true\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_job\"]\n action = \"replace\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_instance\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_path\"]\n action = \"replace\"\n target_label = \"__metrics_path__\"\n }\n\n // Choose the service port\n rule {\n source_labels = [\"__meta_kubernetes_service_port_name\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_portName\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_port_name\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_service_port_number\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_port\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_port_number\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n target_label = \"__scheme__\"\n }\n}\n\nprometheus.scrape \"metrics\" {\n job_name = \"integrations/autodiscovery_metrics\"\n targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, 
discovery.relabel.annotation_autodiscovery_services.output)\n honor_labels = true\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n\n// Node Exporter\n// TODO: replace with https://grafana.com/docs/alloy/latest/reference/components/prometheus.exporter.unix/\ndiscovery.relabel \"node_exporter\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_instance\"]\n regex = \"monitoring-extras\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_name\"]\n regex = \"node-exporter\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n}\n\nprometheus.scrape \"node_exporter\" {\n job_name = \"integrations/node_exporter\"\n targets = discovery.relabel.node_exporter.output\n scrape_interval = \"60s\"\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.node_exporter.receiver]\n}\n\nprometheus.relabel \"node_exporter\" {\n rule {\n source_labels = [\"__name__\"]\n regex = \"up|node_cpu.*|node_network.*|node_exporter_build_info|node_filesystem.*|node_memory.*|process_cpu_seconds_total|process_resident_memory_bytes\"\n action = \"keep\"\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n// Logs from all pods\ndiscovery.relabel \"all_pods\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_controller_name\"]\n target_label = \"controller\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app\"]\n target_label = \"app\"\n }\n\n // map all labels\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_label_(.+)\"\n }\n\n}\n\nloki.source.kubernetes \"pods\" {\n targets = discovery.relabel.all_pods.output\n forward_to = [loki.write.endpoint.receiver]\n}\n\n// kube-state-metrics\ndiscovery.relabel \"relabel_kube_state_metrics\" {\n targets = discovery.kubernetes.services.targets\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n regex = \"monitoring\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_name\"]\n regex = \"monitoring-extras-kube-state-metrics\"\n action = \"keep\"\n }\n}\n\nprometheus.scrape \"kube_state_metrics\" {\n targets = discovery.relabel.relabel_kube_state_metrics.output\n job_name = \"kube-state-metrics\"\n metrics_path = \"/metrics\"\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\n// Kubelet\ndiscovery.relabel \"kubelet\" {\n targets = discovery.kubernetes.nodes.targets\n rule {\n target_label = \"__address__\"\n replacement = \"kubernetes.default.svc.cluster.local:443\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_node_name\"]\n regex = \"(.+)\"\n replacement = \"/api/v1/nodes/${1}/proxy/metrics\"\n target_label = \"__metrics_path__\"\n }\n}\n\nprometheus.scrape \"kubelet\" {\n job_name = \"integrations/kubernetes/kubelet\"\n targets = discovery.relabel.kubelet.output\n scheme = \"https\"\n scrape_interval = \"60s\"\n bearer_token_file = \"/var/run/secrets/kubernetes.io/serviceaccount/token\"\n tls_config {\n insecure_skip_verify = true\n }\n clustering {\n 
enabled = true\n }\n forward_to = [prometheus.relabel.kubelet.receiver]\n}\n\nprometheus.relabel \"kubelet\" {\n rule {\n source_labels = [\"__name__\"]\n regex = \"up|container_cpu_usage_seconds_total|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_certificate_manager_client_ttl_seconds|kubelet_certificate_manager_server_ttl_seconds|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_cgroup_manager_duration_seconds_count|kubelet_node_config_error|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_count|kubelet_pleg_relist_interval_seconds_bucket|kubelet_pod_start_duration_seconds_bucket|kubelet_pod_start_duration_seconds_count|kubelet_pod_worker_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_count|kubelet_running_container_count|kubelet_running_containers|kubelet_running_pod_count|kubelet_running_pods|kubelet_runtime_operations_errors_total|kubelet_runtime_operations_total|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubernetes_build_info|namespace_workload_pod|rest_client_requests_total|storage_operation_duration_seconds_count|storage_operation_errors_total|volume_manager_total_volumes\"\n action = \"keep\"\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n// Cluster Events\nloki.source.kubernetes_events \"cluster_events\" {\n job_name = \"integrations/kubernetes/eventhandler\"\n log_format = \"logfmt\"\n forward_to = [loki.write.endpoint.receiver]\n}\n\nprometheus.relabel \"metrics_service\" {\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\n\n// Write Endpoints\n// prometheus write endpoint\nprometheus.remote_write \"default\" {\n external_labels = {\n cluster = \"{{ .Values.cluster }}\",\n project = \"{{ .Values.project }}\",\n }\n endpoint {\n url = \"https://mimir.example.com/api/v1/push\"\n\n headers = {\n \"X-Scope-OrgID\" = \"anonymous\",\n }\n\n }\n}\n\n// loki write endpoint\nloki.write \"endpoint\" {\n external_labels = {\n cluster = \"{{ .Values.cluster }}\",\n project = \"{{ .Values.project }}\",\n }\n endpoint {\n url = \"https://loki.example.com/loki/api/v1/push\"\n }\n}"` | | +| alloy.alloyConfigmapData | string | `"logging {\n level = \"info\"\n format = \"json\"\n write_to = [loki.write.endpoint.receiver]\n}\n\n/////////////////////// OTLP START ///////////////////////\n\notelcol.receiver.otlp \"default\" {\n grpc {}\n http {}\n\n output {\n metrics = [otelcol.processor.batch.default.input]\n traces = [otelcol.processor.batch.default.input]\n }\n}\n\notelcol.processor.batch \"default\" {\n output {\n metrics = [otelcol.exporter.prometheus.default.input]\n traces = [otelcol.exporter.otlp.tempo.input]\n }\n}\n\notelcol.exporter.prometheus \"default\" {\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\notelcol.exporter.otlp \"tempo\" {\n client {\n endpoint = \"http://monitoring-tempo-distributor.monitoring:4317\"\n // Configure TLS settings for communicating with the endpoint.\n tls {\n // The connection is insecure.\n insecure = true\n // Do not verify TLS certificates when connecting.\n insecure_skip_verify = true\n }\n }\n}\n\n\n/////////////////////// OTLP END ///////////////////////\n\n// discover all pods, to be used later in this config\ndiscovery.kubernetes \"pods\" {\n role = \"pod\"\n}\n\n// discover all services, to be used later in this config\ndiscovery.kubernetes \"services\" {\n role = 
\"service\"\n}\n\n// discover all nodes, to be used later in this config\ndiscovery.kubernetes \"nodes\" {\n role = \"node\"\n}\n\n// Generic scrape of any pod with Annotation \"prometheus.io/scrape: true\"\ndiscovery.relabel \"annotation_autodiscovery_pods\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scrape\"]\n regex = \"true\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_job\"]\n action = \"replace\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_instance\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_path\"]\n action = \"replace\"\n target_label = \"__metrics_path__\"\n }\n\n // Choose the pod port\n // The discovery generates a target for each declared container port of the pod.\n // If the metricsPortName annotation has value, keep only the target where the port name matches the one of the annotation.\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_portName\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n // If the metrics port number annotation has a value, override the target address to use it, regardless whether it is\n // one of the declared ports on that Pod.\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n regex = \"(\\\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})\"\n replacement = \"[$2]:$1\" // IPv6\n target_label = \"__address__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n regex = \"(\\\\d+);((([0-9]+?)(\\\\.|$)){4})\" // IPv4, takes priority over IPv6 when both exists\n replacement = \"$2:$1\"\n target_label = \"__address__\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n target_label = \"__scheme__\"\n }\n\n\n // add labels\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_controller_name\"]\n target_label = \"controller\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app\"]\n target_label = \"app\"\n }\n\n // map all labels\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_label_(.+)\"\n }\n}\n\n// Generic scrape of any service with\n// Annotation Autodiscovery\ndiscovery.relabel \"annotation_autodiscovery_services\" {\n targets = discovery.kubernetes.services.targets\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_scrape\"]\n regex = \"true\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_job\"]\n action = \"replace\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_instance\"]\n action = \"replace\"\n target_label 
= \"instance\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_path\"]\n action = \"replace\"\n target_label = \"__metrics_path__\"\n }\n\n // Choose the service port\n rule {\n source_labels = [\"__meta_kubernetes_service_port_name\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_portName\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_port_name\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_service_port_number\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_port\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_port_number\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n target_label = \"__scheme__\"\n }\n}\n\nprometheus.scrape \"metrics\" {\n job_name = \"integrations/autodiscovery_metrics\"\n targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output)\n honor_labels = true\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n\n// Node Exporter\n// TODO: replace with https://grafana.com/docs/alloy/latest/reference/components/prometheus.exporter.unix/\ndiscovery.relabel \"node_exporter\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_instance\"]\n regex = \"monitoring-extras\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_name\"]\n regex = \"node-exporter\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n}\n\nprometheus.scrape \"node_exporter\" {\n job_name = \"integrations/node_exporter\"\n targets = discovery.relabel.node_exporter.output\n scrape_interval = \"60s\"\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.node_exporter.receiver]\n}\n\nprometheus.relabel \"node_exporter\" {\n rule {\n source_labels = [\"__name__\"]\n regex = \"up|node_cpu.*|node_network.*|node_exporter_build_info|node_filesystem.*|node_memory.*|process_cpu_seconds_total|process_resident_memory_bytes\"\n action = \"keep\"\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n// Logs from all pods\ndiscovery.relabel \"all_pods\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_controller_name\"]\n target_label = \"controller\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app\"]\n target_label = \"app\"\n }\n\n // map all labels\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_label_(.+)\"\n }\n\n}\n\nloki.source.kubernetes \"pods\" {\n targets = discovery.relabel.all_pods.output\n forward_to = [loki.write.endpoint.receiver]\n}\n\n// 
kube-state-metrics\ndiscovery.relabel \"relabel_kube_state_metrics\" {\n targets = discovery.kubernetes.services.targets\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n regex = \"monitoring\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_name\"]\n regex = \"monitoring-extras-kube-state-metrics\"\n action = \"keep\"\n }\n}\n\nprometheus.scrape \"kube_state_metrics\" {\n targets = discovery.relabel.relabel_kube_state_metrics.output\n job_name = \"kube-state-metrics\"\n metrics_path = \"/metrics\"\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\n// Kubelet\ndiscovery.relabel \"kubelet\" {\n targets = discovery.kubernetes.nodes.targets\n rule {\n target_label = \"__address__\"\n replacement = \"kubernetes.default.svc.cluster.local:443\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_node_name\"]\n regex = \"(.+)\"\n replacement = \"/api/v1/nodes/${1}/proxy/metrics\"\n target_label = \"__metrics_path__\"\n }\n}\n\nprometheus.scrape \"kubelet\" {\n job_name = \"integrations/kubernetes/kubelet\"\n targets = discovery.relabel.kubelet.output\n scheme = \"https\"\n scrape_interval = \"60s\"\n bearer_token_file = \"/var/run/secrets/kubernetes.io/serviceaccount/token\"\n tls_config {\n insecure_skip_verify = true\n }\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.kubelet.receiver]\n}\n\nprometheus.relabel \"kubelet\" {\n rule {\n source_labels = [\"__name__\"]\n regex = \"up|container_cpu_usage_seconds_total|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_certificate_manager_client_ttl_seconds|kubelet_certificate_manager_server_ttl_seconds|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_cgroup_manager_duration_seconds_count|kubelet_node_config_error|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_count|kubelet_pleg_relist_interval_seconds_bucket|kubelet_pod_start_duration_seconds_bucket|kubelet_pod_start_duration_seconds_count|kubelet_pod_worker_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_count|kubelet_running_container_count|kubelet_running_containers|kubelet_running_pod_count|kubelet_running_pods|kubelet_runtime_operations_errors_total|kubelet_runtime_operations_total|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubernetes_build_info|namespace_workload_pod|rest_client_requests_total|storage_operation_duration_seconds_count|storage_operation_errors_total|volume_manager_total_volumes\"\n action = \"keep\"\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n// Cluster Events\nloki.source.kubernetes_events \"cluster_events\" {\n job_name = \"integrations/kubernetes/eventhandler\"\n log_format = \"logfmt\"\n forward_to = [loki.write.endpoint.receiver]\n}\n\nprometheus.relabel \"metrics_service\" {\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\n\n// Write Endpoints\n// prometheus write endpoint\nprometheus.remote_write \"default\" {\n external_labels = {\n cluster = \"{{ .Values.cluster }}\",\n project = \"{{ .Values.project }}\",\n }\n endpoint {\n url = \"https://mimir.example.com/api/v1/push\"\n\n headers = {\n \"X-Scope-OrgID\" = \"anonymous\",\n }\n\n }\n}\n\n// loki write endpoint\nloki.write \"endpoint\" {\n external_labels = {\n cluster = \"{{ .Values.cluster }}\",\n project = \"{{ .Values.project }}\",\n }\n endpoint {\n url = 
\"https://loki.example.com/loki/api/v1/push\"\n }\n}\n"` | | | alloy.controller.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key | string | `"topology.kubernetes.io/zone"` | | | alloy.controller.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator | string | `"In"` | | | alloy.controller.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0] | string | `"us-east-1a"` | | diff --git a/helm/alloy/values.yaml b/helm/alloy/values.yaml index 146cb8ea..27232540 100644 --- a/helm/alloy/values.yaml +++ b/helm/alloy/values.yaml @@ -442,4 +442,4 @@ alloy: endpoint { url = "https://loki.example.com/loki/api/v1/push" } - } \ No newline at end of file + } diff --git a/helm/faro-collector/README.md b/helm/faro-collector/README.md index 92ef2253..a01378de 100644 --- a/helm/faro-collector/README.md +++ b/helm/faro-collector/README.md @@ -21,7 +21,7 @@ A Helm chart for deploying Grafana Alloy | alloy.alloy.extraPorts[0].port | int | `12347` | | | alloy.alloy.extraPorts[0].protocol | string | `"TCP"` | | | alloy.alloy.extraPorts[0].targetPort | int | `12347` | | -| alloy.alloyConfigmapData | string | `"logging {\n level = \"info\"\n format = \"json\"\n}\n\notelcol.exporter.otlp \"tempo\" {\n client {\n endpoint = \"http://grafana-tempo-distributor.monitoring:4317\"\n tls {\n insecure = true\n insecure_skip_verify = true\n }\n }\n}\n\n// loki write endpoint\nloki.write \"endpoint\" {\n endpoint {\n url = \"http://grafana-loki-gateway.monitoring:80/loki/api/v1/push\"\n }\n}\n\nfaro.receiver \"default\" {\n server {\n listen_address = \"0.0.0.0\"\n listen_port = 12347\n cors_allowed_origins = [\"*\"]\n }\n\n extra_log_labels = {\n service = \"frontend-app\",\n app_name = \"\",\n app_environment = \"\",\n app_namespace = \"\",\n app_version = \"\",\n }\n output {\n logs = [loki.write.endpoint.receiver]\n traces = [otelcol.exporter.otlp.tempo.input]\n }\n}"` | | +| alloy.alloyConfigmapData | string | `"logging {\n level = \"info\"\n format = \"json\"\n}\n\notelcol.exporter.otlp \"tempo\" {\n client {\n endpoint = \"http://grafana-tempo-distributor.monitoring:4317\"\n tls {\n insecure = true\n insecure_skip_verify = true\n }\n }\n}\n\n// loki write endpoint\nloki.write \"endpoint\" {\n endpoint {\n url = \"http://grafana-loki-gateway.monitoring:80/loki/api/v1/push\"\n }\n}\n\nfaro.receiver \"default\" {\n server {\n listen_address = \"0.0.0.0\"\n listen_port = 12347\n cors_allowed_origins = [\"*\"]\n }\n\n extra_log_labels = {\n service = \"frontend-app\",\n app_name = \"\",\n app_environment = \"\",\n app_namespace = \"\",\n app_version = \"\",\n }\n output {\n logs = [loki.write.endpoint.receiver]\n traces = [otelcol.exporter.otlp.tempo.input]\n }\n}\n"` | | | alloy.ingress.annotations | object | `{}` | | | alloy.ingress.enabled | bool | `true` | Enables ingress for Alloy (Faro port) | | alloy.ingress.faroPort | int | `12347` | | diff --git a/helm/faro-collector/values.yaml b/helm/faro-collector/values.yaml index 90326bc9..4770d4c1 100644 --- a/helm/faro-collector/values.yaml +++ b/helm/faro-collector/values.yaml @@ -74,4 +74,4 @@ alloy: logs = [loki.write.endpoint.receiver] traces = [otelcol.exporter.otlp.tempo.input] } - } \ No newline at end of file + } diff --git a/helm/observability/Chart.yaml b/helm/observability/Chart.yaml index 67ac2013..f3b07d82 100644 --- a/helm/observability/Chart.yaml +++ 
b/helm/observability/Chart.yaml @@ -28,4 +28,4 @@ dependencies: - name: lgtm-distributed version: "2.1.0" alias: lgtm - repository: "https://grafana.github.io/helm-charts" \ No newline at end of file + repository: "https://grafana.github.io/helm-charts" diff --git a/helm/observability/README.md b/helm/observability/README.md index 1fde09c2..21ad3855 100644 --- a/helm/observability/README.md +++ b/helm/observability/README.md @@ -14,7 +14,7 @@ A Helm chart for deploying the LGTM stack with additional resources | Key | Type | Default | Description | |-----|------|---------|-------------| -| lgtm.grafana | map | `{"affinity":{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"topology.kubernetes.io/zone","operator":"In","values":["us-east-1a"]}]}]}}},"alerting":{"contactpoints.yaml":{"secret":{"apiVersion":1,"contactPoints":[{"name":"slack","orgId":1,"receivers":[{"settings":{"group":"slack","summary":"{{ `{{ include \"default.message\" . }}` }}","url":"https://hooks.slack.com/services/XXXXXXXXXX"},"type":"Slack","uid":"first_uid"}]}]}},"rules.yaml":{"apiVersion":1,"groups":[{"folder":"Alerts","interval":"5m","name":"Alerts","orgId":1,"rules":[{"annotations":{"summary":"Alert: HTTP 500 errors detected in the environment: {{`{{ $labels.clusters }}`}}"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\"} | json | http_status_code=\"500\" [1h])) > 0","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"HTTP 500 errors detected","uid":"edwb8zgcvq96oc"},{"annotations":{"description":"Error in usersync job detected in cluster {{`{{ $labels.clusters }}`}}, namespace {{`{{ $labels.namespace }}`}}.","summary":"Error Logs Detected in Usersync Job"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster, namespace) (count_over_time({ app=\"gen3job\", job_name=~\"usersync-.*\"} |= \"ERROR - could not revoke policies from user `N/A`\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Error Logs Detected in Usersync Job","uid":"adwb9vhb7irr4b"},{"annotations":{"description":"Panic detected in app {{`{{ $labels.app }}`}} within cluster {{`{{ $labels.clusters }}`}}.","summary":"Hatchery panic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({app=\"hatchery\"} |= \"panic\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Hatchery panic in {{`{{ env.name }}`}}","uid":"ddwbc12l6wc8wf"},{"annotations":{"description":"Detected 431 HTTP status codes in the logs 
within the last 5 minutes.","summary":"Http status code 431"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum(count_over_time({cluster=~\".+\"} | json | http_status_code=\"431\" [5m])) >= 2","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Http status code 431","uid":"cdwbcbphz1zb4a"},{"annotations":{"description":"High number of info status logs detected in the indexd service in cluster {{`{{ $labels.clusters }}`}}.","summary":"Indexd is getting an excessive amount of traffic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\", app=\"indexd\", status=\"info\"} [5m])) > 50000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Indexd is getting an excessive amount of traffic","uid":"bdwbck1lgwdfka"},{"annotations":{"description":"More than 10 errors detected in the karpenter namespace in cluster {{`{{ $labels.clusters }}`}} related to providerRef not found.","summary":"Karpenter Resource Mismatch"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({namespace=\"karpenter\", cluster=~\".+\"} |= \"ERROR\" |= \"not found\" |= \"getting providerRef\" [5m])) > 10\n","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Karpenter Resource Mismatch","uid":"fdwbe5t439zpcd"},{"annotations":{"description":"More than 1000 \"limiting requests, excess\" errors detected in service {{`{{ $labels.app }}`}} (cluster: {{`{{ $labels.clusters }}`}}) within the last 5 minutes.","summary":"Nginx is logging excessive \" limiting requests, excess:\""},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (app, cluster) (count_over_time({app=~\".+\", cluster=~\".+\"} |= \"status:error\" |= \"limiting requests, excess:\" [5m])) > 1000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Nginx is logging excessive \" limiting requests, 
excess:\"","uid":"fdwbeuftc7400c"}]}]}},"dashboardProviders":{"dashboardproviders.yaml":{"apiVersion":1,"providers":[{"disableDeletion":true,"editable":true,"folder":"Kubernetes","name":"grafana-dashboards-kubernetes","options":{"path":"/var/lib/grafana/dashboards/grafana-dashboards-kubernetes"},"orgId":1,"type":"file"}]}},"dashboards":{"grafana-dashboards-kubernetes":{"k8s-system-api-server":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-api-server.json"},"k8s-system-coredns":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-coredns.json"},"k8s-views-global":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-global.json"},"k8s-views-namespaces":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json"},"k8s-views-nodes":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-nodes.json"},"k8s-views-pods":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json"}}},"downloadDashboardsImage":{"registry":"quay.io/curl","repository":"curl","tag":"8.8.0"},"enabled":true,"env":{"GF_SERVER_ROOT_URL":"https://grafana.example.com"},"envFromSecret":null,"grafana.ini":{"auth.okta":{"allow_sign_up":true,"auto_login":true,"enabled":true,"icon":"okta"},"feature_toggles":{"enable":"ssoSettingsAPI transformationsVariableSupport","ssoSettingsApi":true,"transformationsVariableSupport":true},"log":{"level":"debug"},"server":{"domain":"grafana.example.com","root_url":"https://%(domain)s/"},"users":{"auto_assign_org_role":"Editor"}},"image":{"pullPolicy":"Always","registry":"quay.io/cdis","repository":"grafana","tag":"master"},"ingress":{"annotations":{},"enabled":true,"hosts":["grafana.example.com"],"ingressClassName":"alb","tls":[{"secretName":null}]},"initChownData":{"image":{"registry":"quay.io/cdis","repository":"busybox","tag":"1.32.0"}},"persistence":{"enabled":true}}` | Grafana configuration. | +| lgtm.grafana | map | `{"affinity":{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"topology.kubernetes.io/zone","operator":"In","values":["us-east-1a"]}]}]}}},"alerting":{"contactpoints.yaml":{"secret":{"apiVersion":1,"contactPoints":[{"name":"slack","orgId":1,"receivers":[{"settings":{"group":"slack","summary":"{{ `{{ include \"default.message\" . 
}}` }}\n","url":"https://hooks.slack.com/services/XXXXXXXXXX"},"type":"Slack","uid":"first_uid"}]}]}},"rules.yaml":{"apiVersion":1,"groups":[{"folder":"Alerts","interval":"5m","name":"Alerts","orgId":1,"rules":[{"annotations":{"summary":"Alert: HTTP 500 errors detected in the environment: {{`{{ $labels.clusters }}`}}"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\"} | json | http_status_code=\"500\" [1h])) > 0","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"HTTP 500 errors detected","uid":"edwb8zgcvq96oc"},{"annotations":{"description":"Error in usersync job detected in cluster {{`{{ $labels.clusters }}`}}, namespace {{`{{ $labels.namespace }}`}}.","summary":"Error Logs Detected in Usersync Job"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster, namespace) (count_over_time({ app=\"gen3job\", job_name=~\"usersync-.*\"} |= \"ERROR - could not revoke policies from user `N/A`\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Error Logs Detected in Usersync Job","uid":"adwb9vhb7irr4b"},{"annotations":{"description":"Panic detected in app {{`{{ $labels.app }}`}} within cluster {{`{{ $labels.clusters }}`}}.","summary":"Hatchery panic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({app=\"hatchery\"} |= \"panic\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Hatchery panic in {{`{{ env.name }}`}}","uid":"ddwbc12l6wc8wf"},{"annotations":{"description":"Detected 431 HTTP status codes in the logs within the last 5 minutes.","summary":"Http status code 431"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum(count_over_time({cluster=~\".+\"} | json | http_status_code=\"431\" [5m])) >= 2","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Http status code 431","uid":"cdwbcbphz1zb4a"},{"annotations":{"description":"High number of info status logs detected in the indexd service in cluster {{`{{ $labels.clusters }}`}}.","summary":"Indexd is getting an excessive amount of traffic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\", 
app=\"indexd\", status=\"info\"} [5m])) > 50000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Indexd is getting an excessive amount of traffic","uid":"bdwbck1lgwdfka"},{"annotations":{"description":"More than 10 errors detected in the karpenter namespace in cluster {{`{{ $labels.clusters }}`}} related to providerRef not found.","summary":"Karpenter Resource Mismatch"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({namespace=\"karpenter\", cluster=~\".+\"} |= \"ERROR\" |= \"not found\" |= \"getting providerRef\" [5m])) > 10\n","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Karpenter Resource Mismatch","uid":"fdwbe5t439zpcd"},{"annotations":{"description":"More than 1000 \"limiting requests, excess\" errors detected in service {{`{{ $labels.app }}`}} (cluster: {{`{{ $labels.clusters }}`}}) within the last 5 minutes.","summary":"Nginx is logging excessive \" limiting requests, excess:\""},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (app, cluster) (count_over_time({app=~\".+\", cluster=~\".+\"} |= \"status:error\" |= \"limiting requests, excess:\" [5m])) > 1000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Nginx is logging excessive \" limiting requests, 
excess:\"","uid":"fdwbeuftc7400c"}]}]}},"dashboardProviders":{"dashboardproviders.yaml":{"apiVersion":1,"providers":[{"disableDeletion":true,"editable":true,"folder":"Kubernetes","name":"grafana-dashboards-kubernetes","options":{"path":"/var/lib/grafana/dashboards/grafana-dashboards-kubernetes"},"orgId":1,"type":"file"}]}},"dashboards":{"grafana-dashboards-kubernetes":{"k8s-system-api-server":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-api-server.json"},"k8s-system-coredns":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-system-coredns.json"},"k8s-views-global":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-global.json"},"k8s-views-namespaces":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json"},"k8s-views-nodes":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-nodes.json"},"k8s-views-pods":{"token":"","url":"https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json"}}},"downloadDashboardsImage":{"registry":"quay.io/curl","repository":"curl","tag":"8.8.0"},"enabled":true,"env":{"GF_SERVER_ROOT_URL":"https://grafana.example.com"},"envFromSecret":null,"grafana.ini":{"auth.okta":{"allow_sign_up":true,"auto_login":true,"enabled":true,"icon":"okta"},"feature_toggles":{"enable":"ssoSettingsAPI transformationsVariableSupport","ssoSettingsApi":true,"transformationsVariableSupport":true},"log":{"level":"debug"},"server":{"domain":"grafana.example.com","root_url":"https://%(domain)s/"},"users":{"auto_assign_org_role":"Editor"}},"image":{"pullPolicy":"Always","registry":"quay.io/cdis","repository":"grafana","tag":"master"},"ingress":{"annotations":{},"enabled":true,"hosts":["grafana.example.com"],"ingressClassName":"alb","tls":[{"secretName":null}]},"initChownData":{"image":{"registry":"quay.io/cdis","repository":"busybox","tag":"1.32.0"}},"persistence":{"enabled":true}}` | Grafana configuration. | | lgtm.grafana."grafana.ini"."auth.okta" | map | `{"allow_sign_up":true,"auto_login":true,"enabled":true,"icon":"okta"}` | Okta authentication settings in Grafana. | | lgtm.grafana."grafana.ini"."auth.okta".allow_sign_up | bool | `true` | Allow users to sign up automatically using Okta. | | lgtm.grafana."grafana.ini"."auth.okta".auto_login | bool | `true` | Automatically log in users using Okta when visiting Grafana. | @@ -35,7 +35,7 @@ A Helm chart for deploying the LGTM stack with additional resources | lgtm.grafana.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0] | string | `{"key":"topology.kubernetes.io/zone","operator":"In","values":["us-east-1a"]}` | Node label key for affinity. Ensures pods are scheduled on nodes in the specified zone. | | lgtm.grafana.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator | string | `"In"` | Operator to apply to the node selector. 'In' means the node must match one of the values. | | lgtm.grafana.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values | list | `["us-east-1a"]` | List of values for the node selector, representing allowed zones. 
| -| lgtm.grafana.alerting | map | `{"contactpoints.yaml":{"secret":{"apiVersion":1,"contactPoints":[{"name":"slack","orgId":1,"receivers":[{"settings":{"group":"slack","summary":"{{ `{{ include \"default.message\" . }}` }}","url":"https://hooks.slack.com/services/XXXXXXXXXX"},"type":"Slack","uid":"first_uid"}]}]}},"rules.yaml":{"apiVersion":1,"groups":[{"folder":"Alerts","interval":"5m","name":"Alerts","orgId":1,"rules":[{"annotations":{"summary":"Alert: HTTP 500 errors detected in the environment: {{`{{ $labels.clusters }}`}}"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\"} | json | http_status_code=\"500\" [1h])) > 0","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"HTTP 500 errors detected","uid":"edwb8zgcvq96oc"},{"annotations":{"description":"Error in usersync job detected in cluster {{`{{ $labels.clusters }}`}}, namespace {{`{{ $labels.namespace }}`}}.","summary":"Error Logs Detected in Usersync Job"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster, namespace) (count_over_time({ app=\"gen3job\", job_name=~\"usersync-.*\"} |= \"ERROR - could not revoke policies from user `N/A`\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Error Logs Detected in Usersync Job","uid":"adwb9vhb7irr4b"},{"annotations":{"description":"Panic detected in app {{`{{ $labels.app }}`}} within cluster {{`{{ $labels.clusters }}`}}.","summary":"Hatchery panic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({app=\"hatchery\"} |= \"panic\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Hatchery panic in {{`{{ env.name }}`}}","uid":"ddwbc12l6wc8wf"},{"annotations":{"description":"Detected 431 HTTP status codes in the logs within the last 5 minutes.","summary":"Http status code 431"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum(count_over_time({cluster=~\".+\"} | json | http_status_code=\"431\" [5m])) >= 2","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Http status code 431","uid":"cdwbcbphz1zb4a"},{"annotations":{"description":"High number of info status logs detected in the indexd service in cluster {{`{{ $labels.clusters }}`}}.","summary":"Indexd is getting an 
excessive amount of traffic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\", app=\"indexd\", status=\"info\"} [5m])) > 50000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Indexd is getting an excessive amount of traffic","uid":"bdwbck1lgwdfka"},{"annotations":{"description":"More than 10 errors detected in the karpenter namespace in cluster {{`{{ $labels.clusters }}`}} related to providerRef not found.","summary":"Karpenter Resource Mismatch"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({namespace=\"karpenter\", cluster=~\".+\"} |= \"ERROR\" |= \"not found\" |= \"getting providerRef\" [5m])) > 10\n","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Karpenter Resource Mismatch","uid":"fdwbe5t439zpcd"},{"annotations":{"description":"More than 1000 \"limiting requests, excess\" errors detected in service {{`{{ $labels.app }}`}} (cluster: {{`{{ $labels.clusters }}`}}) within the last 5 minutes.","summary":"Nginx is logging excessive \" limiting requests, excess:\""},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (app, cluster) (count_over_time({app=~\".+\", cluster=~\".+\"} |= \"status:error\" |= \"limiting requests, excess:\" [5m])) > 1000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Nginx is logging excessive \" limiting requests, excess:\"","uid":"fdwbeuftc7400c"}]}]}}` | Gen3 built-in alerting configuration in Grafana. | +| lgtm.grafana.alerting | map | `{"contactpoints.yaml":{"secret":{"apiVersion":1,"contactPoints":[{"name":"slack","orgId":1,"receivers":[{"settings":{"group":"slack","summary":"{{ `{{ include \"default.message\" . 
}}` }}\n","url":"https://hooks.slack.com/services/XXXXXXXXXX"},"type":"Slack","uid":"first_uid"}]}]}},"rules.yaml":{"apiVersion":1,"groups":[{"folder":"Alerts","interval":"5m","name":"Alerts","orgId":1,"rules":[{"annotations":{"summary":"Alert: HTTP 500 errors detected in the environment: {{`{{ $labels.clusters }}`}}"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\"} | json | http_status_code=\"500\" [1h])) > 0","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"HTTP 500 errors detected","uid":"edwb8zgcvq96oc"},{"annotations":{"description":"Error in usersync job detected in cluster {{`{{ $labels.clusters }}`}}, namespace {{`{{ $labels.namespace }}`}}.","summary":"Error Logs Detected in Usersync Job"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster, namespace) (count_over_time({ app=\"gen3job\", job_name=~\"usersync-.*\"} |= \"ERROR - could not revoke policies from user `N/A`\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Error Logs Detected in Usersync Job","uid":"adwb9vhb7irr4b"},{"annotations":{"description":"Panic detected in app {{`{{ $labels.app }}`}} within cluster {{`{{ $labels.clusters }}`}}.","summary":"Hatchery panic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({app=\"hatchery\"} |= \"panic\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Hatchery panic in {{`{{ env.name }}`}}","uid":"ddwbc12l6wc8wf"},{"annotations":{"description":"Detected 431 HTTP status codes in the logs within the last 5 minutes.","summary":"Http status code 431"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum(count_over_time({cluster=~\".+\"} | json | http_status_code=\"431\" [5m])) >= 2","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Http status code 431","uid":"cdwbcbphz1zb4a"},{"annotations":{"description":"High number of info status logs detected in the indexd service in cluster {{`{{ $labels.clusters }}`}}.","summary":"Indexd is getting an excessive amount of traffic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\", 
app=\"indexd\", status=\"info\"} [5m])) > 50000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Indexd is getting an excessive amount of traffic","uid":"bdwbck1lgwdfka"},{"annotations":{"description":"More than 10 errors detected in the karpenter namespace in cluster {{`{{ $labels.clusters }}`}} related to providerRef not found.","summary":"Karpenter Resource Mismatch"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({namespace=\"karpenter\", cluster=~\".+\"} |= \"ERROR\" |= \"not found\" |= \"getting providerRef\" [5m])) > 10\n","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Karpenter Resource Mismatch","uid":"fdwbe5t439zpcd"},{"annotations":{"description":"More than 1000 \"limiting requests, excess\" errors detected in service {{`{{ $labels.app }}`}} (cluster: {{`{{ $labels.clusters }}`}}) within the last 5 minutes.","summary":"Nginx is logging excessive \" limiting requests, excess:\""},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (app, cluster) (count_over_time({app=~\".+\", cluster=~\".+\"} |= \"status:error\" |= \"limiting requests, excess:\" [5m])) > 1000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Nginx is logging excessive \" limiting requests, excess:\"","uid":"fdwbeuftc7400c"}]}]}}` | Gen3 built-in alerting configuration in Grafana. 
| | lgtm.grafana.alerting."rules.yaml" | string | `{"apiVersion":1,"groups":[{"folder":"Alerts","interval":"5m","name":"Alerts","orgId":1,"rules":[{"annotations":{"summary":"Alert: HTTP 500 errors detected in the environment: {{`{{ $labels.clusters }}`}}"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\"} | json | http_status_code=\"500\" [1h])) > 0","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"HTTP 500 errors detected","uid":"edwb8zgcvq96oc"},{"annotations":{"description":"Error in usersync job detected in cluster {{`{{ $labels.clusters }}`}}, namespace {{`{{ $labels.namespace }}`}}.","summary":"Error Logs Detected in Usersync Job"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster, namespace) (count_over_time({ app=\"gen3job\", job_name=~\"usersync-.*\"} |= \"ERROR - could not revoke policies from user `N/A`\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Error Logs Detected in Usersync Job","uid":"adwb9vhb7irr4b"},{"annotations":{"description":"Panic detected in app {{`{{ $labels.app }}`}} within cluster {{`{{ $labels.clusters }}`}}.","summary":"Hatchery panic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({app=\"hatchery\"} |= \"panic\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Hatchery panic in {{`{{ env.name }}`}}","uid":"ddwbc12l6wc8wf"},{"annotations":{"description":"Detected 431 HTTP status codes in the logs within the last 5 minutes.","summary":"Http status code 431"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum(count_over_time({cluster=~\".+\"} | json | http_status_code=\"431\" [5m])) >= 2","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Http status code 431","uid":"cdwbcbphz1zb4a"},{"annotations":{"description":"High number of info status logs detected in the indexd service in cluster {{`{{ $labels.clusters }}`}}.","summary":"Indexd is getting an excessive amount of traffic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\", app=\"indexd\", status=\"info\"} [5m])) > 
50000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Indexd is getting an excessive amount of traffic","uid":"bdwbck1lgwdfka"},{"annotations":{"description":"More than 10 errors detected in the karpenter namespace in cluster {{`{{ $labels.clusters }}`}} related to providerRef not found.","summary":"Karpenter Resource Mismatch"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({namespace=\"karpenter\", cluster=~\".+\"} |= \"ERROR\" |= \"not found\" |= \"getting providerRef\" [5m])) > 10\n","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Karpenter Resource Mismatch","uid":"fdwbe5t439zpcd"},{"annotations":{"description":"More than 1000 \"limiting requests, excess\" errors detected in service {{`{{ $labels.app }}`}} (cluster: {{`{{ $labels.clusters }}`}}) within the last 5 minutes.","summary":"Nginx is logging excessive \" limiting requests, excess:\""},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (app, cluster) (count_over_time({app=~\".+\", cluster=~\".+\"} |= \"status:error\" |= \"limiting requests, excess:\" [5m])) > 1000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Nginx is logging excessive \" limiting requests, excess:\"","uid":"fdwbeuftc7400c"}]}]}` | Alerting rules configuration file. | | lgtm.grafana.alerting."rules.yaml".apiVersion | int | `1` | API version for the alerting rules configuration. 
| | lgtm.grafana.alerting."rules.yaml".groups | list | `[{"folder":"Alerts","interval":"5m","name":"Alerts","orgId":1,"rules":[{"annotations":{"summary":"Alert: HTTP 500 errors detected in the environment: {{`{{ $labels.clusters }}`}}"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\"} | json | http_status_code=\"500\" [1h])) > 0","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"HTTP 500 errors detected","uid":"edwb8zgcvq96oc"},{"annotations":{"description":"Error in usersync job detected in cluster {{`{{ $labels.clusters }}`}}, namespace {{`{{ $labels.namespace }}`}}.","summary":"Error Logs Detected in Usersync Job"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster, namespace) (count_over_time({ app=\"gen3job\", job_name=~\"usersync-.*\"} |= \"ERROR - could not revoke policies from user `N/A`\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Error Logs Detected in Usersync Job","uid":"adwb9vhb7irr4b"},{"annotations":{"description":"Panic detected in app {{`{{ $labels.app }}`}} within cluster {{`{{ $labels.clusters }}`}}.","summary":"Hatchery panic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({app=\"hatchery\"} |= \"panic\" [5m])) > 1","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Hatchery panic in {{`{{ env.name }}`}}","uid":"ddwbc12l6wc8wf"},{"annotations":{"description":"Detected 431 HTTP status codes in the logs within the last 5 minutes.","summary":"Http status code 431"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum(count_over_time({cluster=~\".+\"} | json | http_status_code=\"431\" [5m])) >= 2","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Http status code 431","uid":"cdwbcbphz1zb4a"},{"annotations":{"description":"High number of info status logs detected in the indexd service in cluster {{`{{ $labels.clusters }}`}}.","summary":"Indexd is getting an excessive amount of traffic"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({cluster=~\".+\", app=\"indexd\", status=\"info\"} [5m])) > 
50000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Indexd is getting an excessive amount of traffic","uid":"bdwbck1lgwdfka"},{"annotations":{"description":"More than 10 errors detected in the karpenter namespace in cluster {{`{{ $labels.clusters }}`}} related to providerRef not found.","summary":"Karpenter Resource Mismatch"},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (cluster) (count_over_time({namespace=\"karpenter\", cluster=~\".+\"} |= \"ERROR\" |= \"not found\" |= \"getting providerRef\" [5m])) > 10\n","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Karpenter Resource Mismatch","uid":"fdwbe5t439zpcd"},{"annotations":{"description":"More than 1000 \"limiting requests, excess\" errors detected in service {{`{{ $labels.app }}`}} (cluster: {{`{{ $labels.clusters }}`}}) within the last 5 minutes.","summary":"Nginx is logging excessive \" limiting requests, excess:\""},"condition":"A","data":[{"datasourceUid":"loki","model":{"datasource":{"type":"loki","uid":"loki"},"editorMode":"code","expr":"sum by (app, cluster) (count_over_time({app=~\".+\", cluster=~\".+\"} |= \"status:error\" |= \"limiting requests, excess:\" [5m])) > 1000","hide":false,"intervalMs":1000,"maxDataPoints":43200,"queryType":"instant","refId":"A"},"queryType":"instant","refId":"A","relativeTimeRange":{"from":600,"to":0}}],"execErrState":"KeepLast","for":"5m","isPaused":false,"labels":{},"noDataState":"OK","notification_settings":{"receiver":"Slack"},"title":"Nginx is logging excessive \" limiting requests, excess:\"","uid":"fdwbeuftc7400c"}]}]` | Groups of alerting rules. | diff --git a/helm/observability/values.yaml b/helm/observability/values.yaml index 075874ff..64da0f61 100644 --- a/helm/observability/values.yaml +++ b/helm/observability/values.yaml @@ -1105,4 +1105,4 @@ lgtm: url: https://hooks.slack.com/services/XXXXXXXXXX group: slack summary: | - {{ `{{ include "default.message" . }}` }} \ No newline at end of file + {{ `{{ include "default.message" . }}` }} diff --git a/helm/test.yaml b/helm/test.yaml deleted file mode 100644 index 8b137891..00000000 --- a/helm/test.yaml +++ /dev/null @@ -1 +0,0 @@ -