From 47953660088469dd72c380c3393227f747d2cc83 Mon Sep 17 00:00:00 2001
From: Pat Heard
Date: Thu, 20 Jun 2024 13:00:53 -0400
Subject: [PATCH] feat: health check alarm for submission lambda invocations (#703)

Add alarms that will trigger if no submission lambda function
invocations have been seen in an expected period.

1. `SubmissionLambdaNoInvocationsCoreHours`: triggers if there has not
   been at least one invocation between 9am UTC and 6am UTC the next day.
   Between 6am and 9am, the alarm will always be in a non-breaching state.
2. `SubmissionLambdaInvocationsAnomaly`: uses anomaly detection and will
   trigger if the invocations are low for a given period.

Neither alarm has an SNS action defined. This will allow us to observe
and adjust their behaviour before they begin posting to Slack for a
breaching state.
---
 aws/alarms/cloudwatch.tf        | 74 +++++++++++++++++++++++++++++++++++++
 aws/alarms/inputs.tf            | 10 +++++
 aws/alarms/locals.tf            |  3 ++
 env/cloud/alarms/terragrunt.hcl |  3 ++
 4 files changed, 90 insertions(+)
 create mode 100644 aws/alarms/locals.tf

diff --git a/aws/alarms/cloudwatch.tf b/aws/alarms/cloudwatch.tf
index b99643a5a..44a750910 100644
--- a/aws/alarms/cloudwatch.tf
+++ b/aws/alarms/cloudwatch.tf
@@ -516,3 +516,77 @@ resource "aws_cloudwatch_metric_alarm" "cognito_login_outside_canada_warn" {
 
   alarm_description = "Forms: A sign-in by a forms owner has been detected from outside of Canada."
 }
+
+#
+# Service health: these will trigger if expected metrics thresholds are not met
+#
+
+# Submission lambda: no invocations in a given period during core hours.
+# Core hours run from 09:00 UTC to 06:00 UTC the next day; datapoints stamped
+# 06:00-08:59 UTC are replaced with `1` so they cannot fall below the threshold.
+# NOTE(review): if an idle period produces no datapoint at all, the
+# `treat_missing_data = "breaching"` setting may still apply inside the exempt
+# window - confirm against observed alarm behaviour.
+resource "aws_cloudwatch_metric_alarm" "healthcheck_lambda_submission_invocations_core_hours" {
+  alarm_name          = "SubmissionLambdaNoInvocationsCoreHours"
+  alarm_description   = "HealthCheck - no `submission` invocations in ${local.lambda_submission_expect_invocation_in_period} minutes."
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = 1
+  threshold           = 1
+  treat_missing_data  = "breaching"
+
+  metric_query {
+    id          = "invocations_core_hours"
+    label       = "Invocations (core hours)"
+    expression  = "IF(((HOUR(invocations)>=9 OR HOUR(invocations)<6)),invocations,1)" # From 9am until 6am (UTC) use metric, otherwise (6am up to 9am) return `1`
+    return_data = true
+  }
+
+  metric_query {
+    id = "invocations"
+    metric {
+      metric_name = "Invocations"
+      namespace   = "AWS/Lambda"
+      period      = local.lambda_submission_expect_invocation_in_period * 60
+      stat        = "Sum"
+      unit        = "Count"
+      dimensions = {
+        FunctionName = var.lambda_submission_function_name
+      }
+    }
+  }
+}
+
+# Submission lambda: anomaly detection, trigger when invocations are below lower threshold.
+# The band computed by `invocations_expected` is the threshold (`threshold_metric_id`);
+# missing data is treated as not breaching.
+resource "aws_cloudwatch_metric_alarm" "healthcheck_lambda_submission_invocations_anomaly" {
+  alarm_name          = "SubmissionLambdaInvocationsAnomaly"
+  alarm_description   = "HealthCheck - `submission` invocations in ${local.lambda_submission_expect_invocation_in_period} minutes is low."
+  comparison_operator = "LessThanLowerThreshold"
+  evaluation_periods  = 1
+  threshold_metric_id = "invocations_expected"
+  treat_missing_data  = "notBreaching"
+
+  metric_query {
+    id          = "invocations_expected"
+    expression  = "ANOMALY_DETECTION_BAND(invocations)"
+    label       = "Invocations (expected)"
+    return_data = "true"
+  }
+
+  metric_query {
+    id          = "invocations"
+    return_data = "true"
+    metric {
+      metric_name = "Invocations"
+      namespace   = "AWS/Lambda"
+      period      = local.lambda_submission_expect_invocation_in_period * 60
+      stat        = "Sum"
+      unit        = "Count"
+      dimensions = {
+        FunctionName = var.lambda_submission_function_name
+      }
+    }
+  }
+}
diff --git a/aws/alarms/inputs.tf b/aws/alarms/inputs.tf
index 38c1e7f4f..4439af1ae 100644
--- a/aws/alarms/inputs.tf
+++ b/aws/alarms/inputs.tf
@@ -43,6 +43,16 @@ variable "lambda_response_archiver_log_group_name" {
   type        = string
 }
 
+variable "lambda_submission_expect_invocation_in_period" {
+  description = "Submission Lambda period (minutes) during which it is expected at least one function invocation will occur. This is used for the healthcheck alarms."
+  type        = number
+}
+
+variable "lambda_submission_function_name" {
+  description = "Submission Lambda function name"
+  type        = string
+}
+
 variable "lambda_submission_log_group_name" {
   description = "Submission Lambda CloudWatch log group name"
   type        = string
diff --git a/aws/alarms/locals.tf b/aws/alarms/locals.tf
new file mode 100644
index 000000000..905467958
--- /dev/null
+++ b/aws/alarms/locals.tf
@@ -0,0 +1,3 @@
+locals {
+  lambda_submission_expect_invocation_in_period = var.env == "production" ? var.lambda_submission_expect_invocation_in_period : 60 * 24 # expect once a day in non-prod envs
+}
diff --git a/env/cloud/alarms/terragrunt.hcl b/env/cloud/alarms/terragrunt.hcl
index 69de8f78d..90a131417 100644
--- a/env/cloud/alarms/terragrunt.hcl
+++ b/env/cloud/alarms/terragrunt.hcl
@@ -88,6 +88,7 @@ dependency "lambdas" {
     lambda_reliability_log_group_name              = "/aws/lambda/Reliability"
     lambda_reliability_dlq_consumer_log_group_name = "/aws/lambda/Reliability_DLQ_Consumer"
     lambda_response_archiver_log_group_name        = "/aws/lambda/Response_Archiver"
+    lambda_submission_function_name                = "Submission"
     lambda_submission_log_group_name               = "/aws/lambda/Submission"
     lambda_vault_integrity_log_group_name          = "/aws/lambda/Vault_Data_Integrity_Check"
     lambda_vault_integrity_function_name           = "vault-integrity"
@@ -146,6 +147,8 @@ inputs = {
   lambda_reliability_log_group_name              = dependency.lambdas.outputs.lambda_reliability_log_group_name
   lambda_reliability_dlq_consumer_log_group_name = dependency.lambdas.outputs.lambda_reliability_dlq_consumer_log_group_name
   lambda_response_archiver_log_group_name        = dependency.lambdas.outputs.lambda_response_archiver_log_group_name
+  lambda_submission_expect_invocation_in_period  = 30
+  lambda_submission_function_name                = dependency.lambdas.outputs.lambda_submission_function_name
   lambda_submission_log_group_name               = dependency.lambdas.outputs.lambda_submission_log_group_name
   lambda_vault_integrity_log_group_name          = dependency.lambdas.outputs.lambda_vault_integrity_log_group_name
   lambda_vault_integrity_function_name           = dependency.lambdas.outputs.lambda_vault_integrity_function_name