diff --git a/kubernetes_cluster/02_monitor_aks.tf b/kubernetes_cluster/02_monitor_aks.tf index 84011271..91a8eea2 100644 --- a/kubernetes_cluster/02_monitor_aks.tf +++ b/kubernetes_cluster/02_monitor_aks.tf @@ -10,6 +10,7 @@ resource "azurerm_monitor_metric_alert" "this" { frequency = each.value.frequency window_size = each.value.window_size enabled = var.alerts_enabled + severity = lookup(each.value, "severity", 3) dynamic "action" { for_each = var.action @@ -44,6 +45,64 @@ resource "azurerm_monitor_metric_alert" "this" { ] } +resource "azurerm_monitor_scheduled_query_rules_alert_v2" "this" { + for_each = local.log_alerts + + name = "${azurerm_kubernetes_cluster.this.name}-${upper(each.key)}" + description = each.value.description + display_name = each.value.display_name + enabled = var.alerts_enabled + + resource_group_name = var.resource_group_name + scopes = [azurerm_kubernetes_cluster.this.id] + location = var.location + evaluation_frequency = each.value.evaluation_frequency + window_duration = each.value.window_duration + + # Assuming each.value includes this attribute + severity = each.value.severity + + criteria { + query = each.value.query + operator = each.value.operator + threshold = each.value.threshold + time_aggregation_method = lookup(each.value, "time_aggregation_method", "Average") + + resource_id_column = each.value.resource_id_column + metric_measure_column = lookup(each.value, "metric_measure_column", null) + + dynamic "dimension" { + for_each = each.value.dimension + content { + name = dimension.value.name + operator = dimension.value.operator + values = dimension.value.values + } + } + + failing_periods { + minimum_failing_periods_to_trigger_alert = lookup(each.value, "minimum_failing_periods_to_trigger_alert", 1) + number_of_evaluation_periods = lookup(each.value, "number_of_evaluation_periods", 1) + } + } + + auto_mitigation_enabled = lookup(each.value, "auto_mitigation_enabled", true) + workspace_alerts_storage_enabled = lookup(each.value, 
"workspace_alerts_storage_enabled", false) + skip_query_validation = lookup(each.value, "skip_query_validation", true) + + action { + // Concatenazione di tutti gli ID dei gruppi d'azione in un singolo set di stringhe + action_groups = [for g in var.action : g.action_group_id] + custom_properties = {} + } + + tags = var.tags + + depends_on = [ + azurerm_kubernetes_cluster.this + ] +} + resource "azurerm_monitor_diagnostic_setting" "aks" { count = var.sec_log_analytics_workspace_id != null ? 1 : 0 name = "LogSecurity" diff --git a/kubernetes_cluster/99_variables_monitoring_alerts.tf b/kubernetes_cluster/99_variables_monitoring_alerts.tf index 6b44fc91..59aa8b20 100644 --- a/kubernetes_cluster/99_variables_monitoring_alerts.tf +++ b/kubernetes_cluster/99_variables_monitoring_alerts.tf @@ -15,6 +15,8 @@ variable "default_metric_alerts" { # criteria.0.operator to be one of [Equals NotEquals GreaterThan GreaterThanOrEqual LessThan LessThanOrEqual] operator = string threshold = number + # Possible values are 0, 1, 2, 3 and 4. Defaults to 3. + severity = optional(number) # Possible values are PT1M, PT5M, PT15M, PT30M and PT1H frequency = string # Possible values are PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H and P1D. 
@@ -39,6 +41,7 @@ variable "default_metric_alerts" { metric_name = "node_cpu_usage_percentage" operator = "GreaterThan" threshold = 80 + severity = 2 frequency = "PT15M" window_size = "PT1H" dimension = [ @@ -56,6 +59,7 @@ variable "default_metric_alerts" { metric_name = "node_memory_working_set_percentage" operator = "GreaterThan" threshold = 80 + severity = 2 frequency = "PT15M" window_size = "PT1H" dimension = [ @@ -66,49 +70,13 @@ variable "default_metric_alerts" { } ], } - node_disk = { - aggregation = "Average" - metric_namespace = "Microsoft.ContainerService/managedClusters" - metric_name = "node_disk_usage_percentage" - operator = "GreaterThan" - threshold = 80 - frequency = "PT15M" - window_size = "PT1H" - dimension = [ - { - name = "node" - operator = "Include" - values = ["*"] - }, - { - name = "device" - operator = "Include" - values = ["*"] - } - ] - } - node_not_ready = { - aggregation = "Average" - metric_namespace = "Microsoft.ContainerService/managedClusters" - metric_name = "kube_node_status_condition" - operator = "GreaterThan" - threshold = 0 - frequency = "PT15M" - window_size = "PT1H" - dimension = [ - { - name = "status2" - operator = "Include" - values = ["NotReady"] - } - ], - } pods_failed = { aggregation = "Average" metric_namespace = "Microsoft.ContainerService/managedClusters" metric_name = "kube_pod_status_phase" operator = "GreaterThan" threshold = 0 + severity = 1 frequency = "PT15M" window_size = "PT1H" dimension = [ @@ -160,6 +128,160 @@ variable "custom_metric_alerts" { })) } +# Setting locals logs alerts, because i need interpolation to set query correctly +locals { + default_logs_alerts = { + ### NODE NOT READY ALERT + node_not_ready = { + display_name = "${azurerm_kubernetes_cluster.this.name}-NODE-NOT-READY" + description = "Detect nodes that is not ready on AKS cluster" + query = <<-KQL + KubeNodeInventory + | where ClusterId == "${azurerm_kubernetes_cluster.this.id}" + | where TimeGenerated > ago(15m) + | where Status == 
"NotReady" + | summarize count() by Computer, Status + KQL + severity = 1 + window_duration = "PT30M" + evaluation_frequency = "PT10M" + operator = "GreaterThan" + threshold = 1 + time_aggregation_method = "Average" + resource_id_column = "Status" + metric_measure_column = "count_" + dimension = [ + { + name = "Computer" + operator = "Include" + values = ["*"] + } + ] + minimum_failing_periods_to_trigger_alert = 1 + number_of_evaluation_periods = 1 + auto_mitigation_enabled = true + workspace_alerts_storage_enabled = false + skip_query_validation = true + } + ### NODE DISK ALERT + node_disk_usage = { + display_name = "${azurerm_kubernetes_cluster.this.name}-NODE-DISK-USAGE" + description = "Detect nodes disk is going to run out of space" + query = <<-KQL + InsightsMetrics + | where _ResourceId == "${lower(azurerm_kubernetes_cluster.this.id)}" + | where TimeGenerated > ago(15m) + | where Namespace == "container.azm.ms/disk" + | where Name == "used_percent" + | project TimeGenerated, Computer, Val, Origin + | summarize AvgDiskUsage = avg(Val) by Computer + KQL + severity = 2 + window_duration = "PT30M" + evaluation_frequency = "PT10M" + operator = "GreaterThan" + threshold = 90 + time_aggregation_method = "Average" + resource_id_column = "AvgDiskUsage" + metric_measure_column = "AvgDiskUsage" + dimension = [ + { + name = "Computer" + operator = "Include" + values = ["*"] + } + ] + minimum_failing_periods_to_trigger_alert = 1 + number_of_evaluation_periods = 1 + auto_mitigation_enabled = true + workspace_alerts_storage_enabled = false + skip_query_validation = true + } + } +} + +variable "custom_logs_alerts" { + description = < [addon\_azure\_policy\_enabled](#input\_addon\_azure\_policy\_enabled) | Should the Azure Policy addon be enabled for this Node Pool? | `bool` | `false` | no | | [alerts\_enabled](#input\_alerts\_enabled) | Should Metrics Alert be enabled? 
| `bool` | `true` | no | | [automatic\_channel\_upgrade](#input\_automatic\_channel\_upgrade) | (Optional) The upgrade channel for this Kubernetes Cluster. Possible values are patch, rapid, node-image and stable. Omitting this field sets this value to none. | `string` | `null` | no | +| [custom\_logs\_alerts](#input\_custom\_logs\_alerts) | Map of name = criteria objects |
map(object({
# (Optional) Specifies the display name of the alert rule.
display_name = string
# (Optional) Specifies the description of the scheduled query rule.
description = string
# (Required) The log query to evaluate, written in Kusto Query Language (KQL).
query = string
# (Required) Severity of the alert. Should be an integer between 0 and 4.
# A value of 0 is the most severe.
severity = number
# (Required) Specifies the period of time in ISO 8601 duration format on
# which the Scheduled Query Rule will be executed (bin size).
# If evaluation_frequency is PT1M, possible values are PT1M, PT5M, PT10M,
# PT15M, PT30M, PT45M, PT1H, PT2H, PT3H, PT4H, PT5H, and PT6H. Otherwise,
# possible values are PT5M, PT10M, PT15M, PT30M, PT45M, PT1H, PT2H, PT3H,
# PT4H, PT5H, PT6H, P1D, and P2D.
window_duration = optional(string)
# (Optional) How often the scheduled query rule is evaluated, represented
# in ISO 8601 duration format. Possible values are PT1M, PT5M, PT10M, PT15M,
# PT30M, PT45M, PT1H, PT2H, PT3H, PT4H, PT5H, PT6H, P1D.
evaluation_frequency = string
# Evaluation operation for rule - 'GreaterThan', 'GreaterThanOrEqual',
# 'LessThan', or 'LessThanOrEqual'.
operator = string
# Result or count threshold based on which rule should be triggered.
# Values must be between 0 and 10000 inclusive.
threshold = number
# (Required) The type of aggregation to apply to the data points in
# aggregation granularity. Possible values are Average, Count, Maximum,
# Minimum, and Total.
time_aggregation_method = string
# (Optional) Specifies the column containing the resource ID. The content
# of the column must be an uri formatted as resource ID.
resource_id_column = optional(string)

# (Optional) Specifies the column containing the metric measure number.
metric_measure_column = optional(string)

dimension = list(object(
{
# (Required) Name of the dimension.
name = string
# (Required) Operator for dimension values. Possible values are
# Exclude and Include.
operator = string
# (Required) List of dimension values. Use a wildcard * to collect all.
values = list(string)
}
))

# (Required) Specifies the number of violations to trigger an alert.
# Must be less than or equal to number_of_evaluation_periods.
# Possible value is integer between 1 and 6.
minimum_failing_periods_to_trigger_alert = number
# (Required) Specifies the number of aggregated look-back points.
# The look-back time window is calculated based on the aggregation
# granularity window_duration and the selected number of aggregated points.
# Possible value is integer between 1 and 6.
number_of_evaluation_periods = number

# (Optional) Specifies the flag that indicates whether the alert should
# be automatically resolved or not. Value should be true or false.
# The default is false.
auto_mitigation_enabled = optional(bool)
# (Optional) Specifies the flag which indicates whether this scheduled
# query rule check if storage is configured. Value should be true or false.
# The default is false.
workspace_alerts_storage_enabled = optional(bool)
# (Optional) Specifies the flag which indicates whether the provided
# query should be validated or not. The default is false.
skip_query_validation = optional(bool)
}))
| `{}` | no | | [custom\_metric\_alerts](#input\_custom\_metric\_alerts) | Map of name = criteria objects |
map(object({
# criteria.*.aggregation to be one of [Average Count Minimum Maximum Total]
aggregation = string
# "Insights.Container/pods" "Insights.Container/nodes"
metric_namespace = string
metric_name = string
# criteria.0.operator to be one of [Equals NotEquals GreaterThan GreaterThanOrEqual LessThan LessThanOrEqual]
operator = string
threshold = number
# Possible values are PT1M, PT5M, PT15M, PT30M and PT1H
frequency = string
# Possible values are PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H and P1D.
window_size = string
# Skip metrics validation
skip_metric_validation = optional(bool, false)

dimension = list(object(
{
name = string
operator = string
values = list(string)
}
))
}))
| `{}` | no | -| [default\_metric\_alerts](#input\_default\_metric\_alerts) | Map of name = criteria objects |
map(object({
# criteria.*.aggregation to be one of [Average Count Minimum Maximum Total]
aggregation = string
# "Insights.Container/pods" "Insights.Container/nodes"
metric_namespace = string
metric_name = string
# criteria.0.operator to be one of [Equals NotEquals GreaterThan GreaterThanOrEqual LessThan LessThanOrEqual]
operator = string
threshold = number
# Possible values are PT1M, PT5M, PT15M, PT30M and PT1H
frequency = string
# Possible values are PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H and P1D.
window_size = string
# Skip metrics validation
skip_metric_validation = optional(bool, false)


dimension = list(object(
{
name = string
operator = string
values = list(string)
}
))
}))
|
{
"node_cpu_usage_percentage": {
"aggregation": "Average",
"dimension": [
{
"name": "node",
"operator": "Include",
"values": [
"*"
]
}
],
"frequency": "PT15M",
"metric_name": "node_cpu_usage_percentage",
"metric_namespace": "Microsoft.ContainerService/managedClusters",
"operator": "GreaterThan",
"threshold": 80,
"window_size": "PT1H"
},
"node_disk": {
"aggregation": "Average",
"dimension": [
{
"name": "node",
"operator": "Include",
"values": [
"*"
]
},
{
"name": "device",
"operator": "Include",
"values": [
"*"
]
}
],
"frequency": "PT15M",
"metric_name": "node_disk_usage_percentage",
"metric_namespace": "Microsoft.ContainerService/managedClusters",
"operator": "GreaterThan",
"threshold": 80,
"window_size": "PT1H"
},
"node_memory_working_set_percentage": {
"aggregation": "Average",
"dimension": [
{
"name": "node",
"operator": "Include",
"values": [
"*"
]
}
],
"frequency": "PT15M",
"metric_name": "node_memory_working_set_percentage",
"metric_namespace": "Microsoft.ContainerService/managedClusters",
"operator": "GreaterThan",
"threshold": 80,
"window_size": "PT1H"
},
"node_not_ready": {
"aggregation": "Average",
"dimension": [
{
"name": "status2",
"operator": "Include",
"values": [
"NotReady"
]
}
],
"frequency": "PT15M",
"metric_name": "kube_node_status_condition",
"metric_namespace": "Microsoft.ContainerService/managedClusters",
"operator": "GreaterThan",
"threshold": 0,
"window_size": "PT1H"
},
"pods_failed": {
"aggregation": "Average",
"dimension": [
{
"name": "phase",
"operator": "Include",
"values": [
"Failed"
]
},
{
"name": "namespace",
"operator": "Include",
"values": [
"*"
]
}
],
"frequency": "PT15M",
"metric_name": "kube_pod_status_phase",
"metric_namespace": "Microsoft.ContainerService/managedClusters",
"operator": "GreaterThan",
"threshold": 0,
"window_size": "PT1H"
}
}
| no | +| [default\_metric\_alerts](#input\_default\_metric\_alerts) | Map of name = criteria objects |
map(object({
# criteria.*.aggregation to be one of [Average Count Minimum Maximum Total]
aggregation = string
# "Insights.Container/pods" "Insights.Container/nodes"
metric_namespace = string
metric_name = string
# criteria.0.operator to be one of [Equals NotEquals GreaterThan GreaterThanOrEqual LessThan LessThanOrEqual]
operator = string
threshold = number
# Possible values are 0, 1, 2, 3 and 4. Defaults to 3.
severity = optional(number)
# Possible values are PT1M, PT5M, PT15M, PT30M and PT1H
frequency = string
# Possible values are PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H and P1D.
window_size = string
# Skip metrics validation
skip_metric_validation = optional(bool, false)


dimension = list(object(
{
name = string
operator = string
values = list(string)
}
))
}))
|
{
"node_cpu_usage_percentage": {
"aggregation": "Average",
"dimension": [
{
"name": "node",
"operator": "Include",
"values": [
"*"
]
}
],
"frequency": "PT15M",
"metric_name": "node_cpu_usage_percentage",
"metric_namespace": "Microsoft.ContainerService/managedClusters",
"operator": "GreaterThan",
"severity": 2,
"threshold": 80,
"window_size": "PT1H"
},
"node_memory_working_set_percentage": {
"aggregation": "Average",
"dimension": [
{
"name": "node",
"operator": "Include",
"values": [
"*"
]
}
],
"frequency": "PT15M",
"metric_name": "node_memory_working_set_percentage",
"metric_namespace": "Microsoft.ContainerService/managedClusters",
"operator": "GreaterThan",
"severity": 2,
"threshold": 80,
"window_size": "PT1H"
},
"pods_failed": {
"aggregation": "Average",
"dimension": [
{
"name": "phase",
"operator": "Include",
"values": [
"Failed"
]
},
{
"name": "namespace",
"operator": "Include",
"values": [
"*"
]
}
],
"frequency": "PT15M",
"metric_name": "kube_pod_status_phase",
"metric_namespace": "Microsoft.ContainerService/managedClusters",
"operator": "GreaterThan",
"severity": 1,
"threshold": 0,
"window_size": "PT1H"
}
}
| no | | [dns\_prefix](#input\_dns\_prefix) | (Required) DNS prefix specified when creating the managed cluster. Changing this forces a new resource to be created. | `string` | n/a | yes | | [kubernetes\_version](#input\_kubernetes\_version) | (Required) Version of Kubernetes specified when creating the AKS managed cluster. | `string` | n/a | yes | | [location](#input\_location) | n/a | `string` | n/a | yes |