From f8f6012a482cfb2f5612253367ff887d1a24bdbd Mon Sep 17 00:00:00 2001 From: scaminati-bv Date: Thu, 17 Oct 2024 11:34:27 +0200 Subject: [PATCH 1/9] feat: add alert for user-stats-service apis --- src/domains/ecommerce-common/00_alerts.tf | 45 +++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/domains/ecommerce-common/00_alerts.tf b/src/domains/ecommerce-common/00_alerts.tf index d91ceeddc7..76a3e8e76f 100644 --- a/src/domains/ecommerce-common/00_alerts.tf +++ b/src/domains/ecommerce-common/00_alerts.tf @@ -244,6 +244,51 @@ AzureDiagnostics | extend deltaRatio = todouble(todouble(trafficUp)/todouble(thresholdDelta)) | extend expectedAvailability = iff(Total >= thresholdTrafficLinear, toreal(highTrafficAvailability), iff(Total <= thresholdTrafficMin, toreal(lowTrafficAvailability), (deltaRatio*(availabilityDelta))+lowTrafficAvailability)) | extend Availability=((Success * 1.0) / Total) * 100 +| where Availability < expectedAvailability + QUERY + ) + severity = 1 + frequency = 30 + time_window = 30 + trigger { + operator = "GreaterThanOrEqual" + threshold = 2 + } +} + +# eCommerce user stats service availability +resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_user_stats_service_availability_alert" { + count = var.env_short == "p" ? 1 : 0 + + name = "ecommerce-user-stats-service-availability-alert" + resource_group_name = azurerm_resource_group.rg_ecommerce_alerts[0].name + location = var.location + + action { + action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id] + email_subject = "[eCommerce] User stats service availability less that 99%" + custom_webhook_payload = "{}" + } + data_source_id = data.azurerm_api_management.apim.id + description = "eCommerce user stats service availability less than 99% in the last 30 minutes detected" + enabled = true + query = (<<-QUERY +let thresholdTrafficMin = 200; +let thresholdTrafficLinear = 500; +let lowTrafficAvailability = 94; +let highTrafficAvailability = 98; +let thresholdDelta = thresholdTrafficLinear - thresholdTrafficMin; +let availabilityDelta = highTrafficAvailability - lowTrafficAvailability; +AzureDiagnostics +| where url_s startswith 'https://api.dev.platform.pagopa.it/ecommerce/user-stats-service/v1' +| summarize + Total=count(), + Success=countif(responseCode_d < 500 and DurationMs < 500) + by Time = bin(TimeGenerated, 15m) +| extend trafficUp = Total-thresholdTrafficMin +| extend deltaRatio = todouble(todouble(trafficUp)/todouble(thresholdDelta)) +| extend expectedAvailability = iff(Total >= thresholdTrafficLinear, toreal(highTrafficAvailability), iff(Total <= thresholdTrafficMin, toreal(lowTrafficAvailability), (deltaRatio*(availabilityDelta))+lowTrafficAvailability)) +| extend Availability=((Success * 1.0) / Total) * 100 | where Availability < expectedAvailability QUERY ) From c76287258a2f5c4a62e6f35f5773b3f3afab0b71 Mon Sep 17 00:00:00 2001 From: scaminati-bv Date: Thu, 17 Oct 2024 11:36:08 +0200 Subject: [PATCH 2/9] fix: increase threshold --- src/domains/ecommerce-common/00_alerts.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/domains/ecommerce-common/00_alerts.tf b/src/domains/ecommerce-common/00_alerts.tf index 76a3e8e76f..7fd25d99c0 100644 --- a/src/domains/ecommerce-common/00_alerts.tf +++ b/src/domains/ecommerce-common/00_alerts.tf @@ -270,7 +270,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_user_stats_ser custom_webhook_payload = "{}" } data_source_id = data.azurerm_api_management.apim.id - description = "eCommerce user stats service availability less than 99% in the last 30 minutes detected" + description = "eCommerce User stats service availability less than 99% in the last 30 minutes detected" enabled = true query = (<<-QUERY let thresholdTrafficMin = 200; @@ -297,6 +297,6 @@ AzureDiagnostics time_window = 30 trigger { operator = "GreaterThanOrEqual" - threshold = 2 + threshold = 10 } } \ No newline at end of file From 5d7becc8b5ab4f45f635cb6410a3b6964f287d44 Mon Sep 17 00:00:00 2001 From: scaminati-bv Date: Thu, 17 Oct 2024 11:52:10 +0200 Subject: [PATCH 3/9] fix: alert label --- src/domains/ecommerce-common/00_alerts.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/domains/ecommerce-common/00_alerts.tf b/src/domains/ecommerce-common/00_alerts.tf index 7fd25d99c0..868117c7f7 100644 --- a/src/domains/ecommerce-common/00_alerts.tf +++ b/src/domains/ecommerce-common/00_alerts.tf @@ -270,7 +270,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_user_stats_ser custom_webhook_payload = "{}" } data_source_id = data.azurerm_api_management.apim.id - description = "eCommerce User stats service availability less than 99% in the last 30 minutes detected" + description = "eCommerce User stats service availability less than 98% in the last 30 minutes detected" enabled = true query = (<<-QUERY let thresholdTrafficMin = 200; From 9df12cc7b3b9b878873537a0b9c20d18a91b79de Mon Sep 17 00:00:00 2001 From: Simone Caminati Date: Fri, 18 Oct 2024 09:36:21 +0200 Subject: [PATCH 4/9] Update src/domains/ecommerce-common/00_alerts.tf Co-authored-by: ciuffagianluca <113357981+ciuffagianluca@users.noreply.github.com> --- src/domains/ecommerce-common/00_alerts.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/domains/ecommerce-common/00_alerts.tf b/src/domains/ecommerce-common/00_alerts.tf index 868117c7f7..7d236633ec 100644 --- a/src/domains/ecommerce-common/00_alerts.tf +++ b/src/domains/ecommerce-common/00_alerts.tf @@ -280,7 +280,7 @@ let highTrafficAvailability = 98; let thresholdDelta = thresholdTrafficLinear - thresholdTrafficMin; let availabilityDelta = highTrafficAvailability - lowTrafficAvailability; AzureDiagnostics -| where url_s startswith 'https://api.dev.platform.pagopa.it/ecommerce/user-stats-service/v1' +| where url_s startswith 'https://api.platform.pagopa.it/ecommerce/user-stats-service/v1' | summarize Total=count(), Success=countif(responseCode_d < 500 and DurationMs < 500) From 5544ec956736bec97bd63d4112ae48e0eed9c123 Mon Sep 17 00:00:00 2001 From: scaminati-bv Date: Fri, 18 Oct 2024 12:05:30 +0200 Subject: [PATCH 5/9] fix: use simple query for user-stats and add webhook --- src/domains/ecommerce-common/00_alerts.tf | 27 ++++++++++------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/src/domains/ecommerce-common/00_alerts.tf b/src/domains/ecommerce-common/00_alerts.tf index 7d236633ec..c304ab2d0a 100644 --- a/src/domains/ecommerce-common/00_alerts.tf +++ b/src/domains/ecommerce-common/00_alerts.tf @@ -267,29 +267,26 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_user_stats_ser action { action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id] email_subject = "[eCommerce] User stats service availability less that 99%" - custom_webhook_payload = "{}" + custom_webhook_payload = jsonencode({ + //alert properties https://docs.opsgenie.com/docs/alert-api + "message" = "[eCommerce] User stats service availability less that 99%" + "alias" = "user-stats-service-availability" + "tags" = "availability" + "entity" = "eCommerce" + "priority" = "P3" + }) } data_source_id = data.azurerm_api_management.apim.id - description = "eCommerce User stats service availability less than 98% in the last 30 minutes detected" + description = "eCommerce User stats service availability less that 99%" enabled = true query = (<<-QUERY -let thresholdTrafficMin = 200; -let thresholdTrafficLinear = 500; -let lowTrafficAvailability = 94; -let highTrafficAvailability = 98; -let thresholdDelta = thresholdTrafficLinear - thresholdTrafficMin; -let availabilityDelta = highTrafficAvailability - lowTrafficAvailability; AzureDiagnostics | where url_s startswith 'https://api.platform.pagopa.it/ecommerce/user-stats-service/v1' | summarize Total=count(), - Success=countif(responseCode_d < 500 and DurationMs < 500) + Success=countif(responseCode_d < 400 and DurationMs < 250) by Time = bin(TimeGenerated, 15m) -| extend trafficUp = Total-thresholdTrafficMin -| extend deltaRatio = todouble(todouble(trafficUp)/todouble(thresholdDelta)) -| extend expectedAvailability = iff(Total >= thresholdTrafficLinear, toreal(highTrafficAvailability), iff(Total <= thresholdTrafficMin, toreal(lowTrafficAvailability), (deltaRatio*(availabilityDelta))+lowTrafficAvailability)) -| extend Availability=((Success * 1.0) / Total) * 100 -| where Availability < expectedAvailability +| where toint(Availability) < 99 QUERY ) severity = 1 @@ -297,6 +294,6 @@ AzureDiagnostics time_window = 30 trigger { operator = "GreaterThanOrEqual" - threshold = 10 + threshold = 2 } } \ No newline at end of file From b9d16f7fbb2a276e50db5ed3c27bcce355af8162 Mon Sep 17 00:00:00 2001 From: scaminati-bv Date: Fri, 18 Oct 2024 12:17:24 +0200 Subject: [PATCH 6/9] fix: alias name --- src/domains/ecommerce-common/00_alerts.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/domains/ecommerce-common/00_alerts.tf b/src/domains/ecommerce-common/00_alerts.tf index c304ab2d0a..f929875109 100644 --- a/src/domains/ecommerce-common/00_alerts.tf +++ b/src/domains/ecommerce-common/00_alerts.tf @@ -270,7 +270,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_user_stats_ser custom_webhook_payload = jsonencode({ //alert properties https://docs.opsgenie.com/docs/alert-api "message" = "[eCommerce] User stats service availability less that 99%" - "alias" = "user-stats-service-availability" + "alias" = "ecommerce-user-stats-service-availability-alert" "tags" = "availability" "entity" = "eCommerce" "priority" = "P3" From 1ea2363a442065057f721759d9ad082b8adb20c7 Mon Sep 17 00:00:00 2001 From: scaminati-bv Date: Fri, 25 Oct 2024 14:31:53 +0200 Subject: [PATCH 7/9] fix: change user stats service alert with lastPaymentMethodUsed PUT api --- src/domains/ecommerce-common/00_alerts.tf | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/domains/ecommerce-common/00_alerts.tf b/src/domains/ecommerce-common/00_alerts.tf index f929875109..728c757450 100644 --- a/src/domains/ecommerce-common/00_alerts.tf +++ b/src/domains/ecommerce-common/00_alerts.tf @@ -256,32 +256,33 @@ AzureDiagnostics } } -# eCommerce user stats service availability -resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_user_stats_service_availability_alert" { +# eCommerce user stats last payment method put availability +resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_user_stats_last_payment_method_put_availability_alert" { count = var.env_short == "p" ? 1 : 0 - name = "ecommerce-user-stats-service-availability-alert" + name = "ecommerce-user-stats-last-payment-method-put-availability-alert" resource_group_name = azurerm_resource_group.rg_ecommerce_alerts[0].name location = var.location action { action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id] - email_subject = "[eCommerce] User stats service availability less that 99%" + email_subject = "[eCommerce] User stats service - PUT lastPaymentMethodUsed availability less that 99%" custom_webhook_payload = jsonencode({ //alert properties https://docs.opsgenie.com/docs/alert-api - "message" = "[eCommerce] User stats service availability less that 99%" - "alias" = "ecommerce-user-stats-service-availability-alert" + "message" = "[eCommerce] User stats service - PUT lastPaymentMethodUsed availability less that 99%" + "alias" = "ecommerce-user-stats-last-payment-method-put-availability-alert" "tags" = "availability" "entity" = "eCommerce" "priority" = "P3" }) } data_source_id = data.azurerm_api_management.apim.id - description = "eCommerce User stats service availability less that 99%" + description = "eCommerce User stats service PUT lastPaymentMethodUsed availability less that 99%" enabled = true query = (<<-QUERY AzureDiagnostics -| where url_s startswith 'https://api.platform.pagopa.it/ecommerce/user-stats-service/v1' +| where url_s startswith 'https://api.platform.pagopa.it/ecommerce/user-stats-service/v1/user/lastPaymentMethodUsed' +| where method_s == "PUT" | summarize Total=count(), Success=countif(responseCode_d < 400 and DurationMs < 250) From e97ab3388839ef4bfa525d39a167002b28be09bb Mon Sep 17 00:00:00 2001 From: Simone Infante Date: Fri, 25 Oct 2024 15:52:39 +0200 Subject: [PATCH 8/9] chore: docs update --- src/domains/ecommerce-common/00_alerts.tf | 4 ++-- src/domains/ecommerce-common/README.md | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/domains/ecommerce-common/00_alerts.tf b/src/domains/ecommerce-common/00_alerts.tf index 728c757450..4d37265367 100644 --- a/src/domains/ecommerce-common/00_alerts.tf +++ b/src/domains/ecommerce-common/00_alerts.tf @@ -265,8 +265,8 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_user_stats_las location = var.location action { - action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id] - email_subject = "[eCommerce] User stats service - PUT lastPaymentMethodUsed availability less that 99%" + action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id] + email_subject = "[eCommerce] User stats service - PUT lastPaymentMethodUsed availability less that 99%" custom_webhook_payload = jsonencode({ //alert properties https://docs.opsgenie.com/docs/alert-api "message" = "[eCommerce] User stats service - PUT lastPaymentMethodUsed availability less that 99%" diff --git a/src/domains/ecommerce-common/README.md b/src/domains/ecommerce-common/README.md index 9db290738c..bf5c3d5a8f 100644 --- a/src/domains/ecommerce-common/README.md +++ b/src/domains/ecommerce-common/README.md @@ -90,6 +90,7 @@ | [azurerm_monitor_scheduled_query_rules_alert.ecommerce_transactions_service_auth_request_ko](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert) | resource | | [azurerm_monitor_scheduled_query_rules_alert.ecommerce_transactions_service_user_receipts_ko](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert) | resource | | [azurerm_monitor_scheduled_query_rules_alert.ecommerce_transient_enqueue_rate_alert](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert) | resource | +| [azurerm_monitor_scheduled_query_rules_alert.ecommerce_user_stats_last_payment_method_put_availability_alert](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert) | resource | | [azurerm_private_dns_a_record.ingress](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/private_dns_a_record) | resource | | [azurerm_private_endpoint.storage_deadletter_private_endpoint](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/private_endpoint) | resource | | [azurerm_private_endpoint.storage_transient_private_endpoint](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/private_endpoint) | resource | From e358744709a2d04a015326777c2289f26b5c38bd Mon Sep 17 00:00:00 2001 From: Simone Infante Date: Fri, 25 Oct 2024 15:57:42 +0200 Subject: [PATCH 9/9] fix: availability rule --- src/domains/ecommerce-common/00_alerts.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/src/domains/ecommerce-common/00_alerts.tf b/src/domains/ecommerce-common/00_alerts.tf index 4d37265367..ffbc3b6075 100644 --- a/src/domains/ecommerce-common/00_alerts.tf +++ b/src/domains/ecommerce-common/00_alerts.tf @@ -287,6 +287,7 @@ AzureDiagnostics Total=count(), Success=countif(responseCode_d < 400 and DurationMs < 250) by Time = bin(TimeGenerated, 15m) +| extend Availability=((Success * 1.0) / Total) * 100 | where toint(Availability) < 99 QUERY )