Skip to content

Commit

Permalink
feat: health check alarm for submission lambda invocations (#703)
Browse files Browse the repository at this point in the history
Add alarms that will trigger if no submission lambda function invocations have
been seen in an expected period.

1. `SubmissionLambdaNoInvocationsCoreHours`: triggers if an expected period passes without at
least one invocation between 9am UTC and 6am UTC the next day. Between 6am and 9am UTC, the
alarm will always be in a non-breaching state.
2. `SubmissionLambdaInvocationsAnomaly`: uses anomaly detection and will trigger if the
invocations are low for a given period.

Neither alarm has an SNS action defined. This will allow us to observe and adjust their
behaviour before they begin posting to Slack for a breaching state.
  • Loading branch information
patheard authored Jun 20, 2024
1 parent 4ea0a7f commit 4795366
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 0 deletions.
67 changes: 67 additions & 0 deletions aws/alarms/cloudwatch.tf
Original file line number Diff line number Diff line change
Expand Up @@ -516,3 +516,70 @@ resource "aws_cloudwatch_metric_alarm" "cognito_login_outside_canada_warn" {

alarm_description = "Forms: A sign-in by a forms owner has been detected from outside of Canada."
}

#
# Service health: these alarms trigger when expected metric thresholds are not met
#

# Submission lambda: no invocations in a given period during core hours.
#
# The alarm evaluates a metric math expression rather than the raw metric so that
# it can be forced into a non-breaching state during the quiet window
# (06:00-08:59 UTC), when no submissions are expected.
resource "aws_cloudwatch_metric_alarm" "healthcheck_lambda_submission_invocations_core_hours" {
  alarm_name          = "SubmissionLambdaNoInvocationsCoreHours"
  alarm_description   = "HealthCheck - no `submission` invocations in ${local.lambda_submission_expect_invocation_in_period} minutes."
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 1
  threshold           = 1 # breach when the evaluated value is below one invocation

  # NOTE(review): confirm how a period with no `Invocations` datapoint is evaluated
  # inside the 06:00-08:59 UTC window. If the expression yields no datapoint there,
  # this setting would apply and the alarm could still breach.
  treat_missing_data = "breaching"

  # Evaluated series: the only query with `return_data = true`.
  # HOUR() is the UTC hour (0-23) of each datapoint:
  #   - hour >= 9 or hour < 6  -> use the real invocation count
  #   - hour 6, 7 or 8         -> return the constant `1`, which never breaches
  # `< 6` (not `<= 6`) so that datapoints in the 06:00-06:59 hour are part of the
  # quiet window, matching the documented "6am to 9am" behaviour.
  metric_query {
    id          = "invocations_core_hours"
    label       = "Invocations (core hours)"
    expression  = "IF(((HOUR(invocations)>=9 OR HOUR(invocations)<6)),invocations,1)"
    return_data = true
  }

  # Source series: total submission lambda invocations per period.
  metric_query {
    id = "invocations"
    metric {
      metric_name = "Invocations"
      namespace   = "AWS/Lambda"
      period      = local.lambda_submission_expect_invocation_in_period * 60 # minutes -> seconds
      stat        = "Sum"
      unit        = "Count"
      dimensions = {
        FunctionName = var.lambda_submission_function_name
      }
    }
  }
}

# Submission lambda: anomaly detection.
# Breaches when the invocation count for a period drops below the lower edge of
# the band produced by ANOMALY_DETECTION_BAND.
resource "aws_cloudwatch_metric_alarm" "healthcheck_lambda_submission_invocations_anomaly" {
  alarm_name        = "SubmissionLambdaInvocationsAnomaly"
  alarm_description = "HealthCheck - `submission` invocations in ${local.lambda_submission_expect_invocation_in_period} minutes is low."

  # Compare the metric against the band's lower bound instead of a fixed threshold.
  comparison_operator = "LessThanLowerThreshold"
  threshold_metric_id = "invocations_expected"
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"

  # Observed series: total submission lambda invocations per period.
  metric_query {
    id          = "invocations"
    return_data = true

    metric {
      namespace   = "AWS/Lambda"
      metric_name = "Invocations"
      dimensions = {
        FunctionName = var.lambda_submission_function_name
      }
      stat   = "Sum"
      unit   = "Count"
      period = local.lambda_submission_expect_invocation_in_period * 60 # minutes -> seconds
    }
  }

  # Expected band derived from the observed series; referenced by `threshold_metric_id`.
  metric_query {
    id          = "invocations_expected"
    label       = "Invocations (expected)"
    expression  = "ANOMALY_DETECTION_BAND(invocations)"
    return_data = true
  }
}
10 changes: 10 additions & 0 deletions aws/alarms/inputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,16 @@ variable "lambda_response_archiver_log_group_name" {
type = string
}

# Length, in minutes, of the window used by the submission healthcheck alarms.
variable "lambda_submission_expect_invocation_in_period" {
  type        = number
  description = "Submission Lambda period (minutes) during which it is expected at least one function invocation will occur. This is used for the healthcheck alarms."
}

# Used as the `FunctionName` dimension of the submission healthcheck alarms.
variable "lambda_submission_function_name" {
  type        = string
  description = "Submission Lambda function name"
}

variable "lambda_submission_log_group_name" {
description = "Submission Lambda CloudWatch log group name"
type = string
Expand Down
3 changes: 3 additions & 0 deletions aws/alarms/locals.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
locals {
  # Healthcheck window in minutes: outside production expect only one invocation
  # per day (60 * 24 minutes); in production use the configured period.
  lambda_submission_expect_invocation_in_period = var.env != "production" ? 60 * 24 : var.lambda_submission_expect_invocation_in_period
}
3 changes: 3 additions & 0 deletions env/cloud/alarms/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ dependency "lambdas" {
lambda_reliability_log_group_name = "/aws/lambda/Reliability"
lambda_reliability_dlq_consumer_log_group_name = "/aws/lambda/Reliability_DLQ_Consumer"
lambda_response_archiver_log_group_name = "/aws/lambda/Response_Archiver"
lambda_submission_function_name = "Submission"
lambda_submission_log_group_name = "/aws/lambda/Submission"
lambda_vault_integrity_log_group_name = "/aws/lambda/Vault_Data_Integrity_Check"
lambda_vault_integrity_function_name = "vault-integrity"
Expand Down Expand Up @@ -146,6 +147,8 @@ inputs = {
lambda_reliability_log_group_name = dependency.lambdas.outputs.lambda_reliability_log_group_name
lambda_reliability_dlq_consumer_log_group_name = dependency.lambdas.outputs.lambda_reliability_dlq_consumer_log_group_name
lambda_response_archiver_log_group_name = dependency.lambdas.outputs.lambda_response_archiver_log_group_name
lambda_submission_expect_invocation_in_period = 30
lambda_submission_function_name = dependency.lambdas.outputs.lambda_submission_function_name
lambda_submission_log_group_name = dependency.lambdas.outputs.lambda_submission_log_group_name
lambda_vault_integrity_log_group_name = dependency.lambdas.outputs.lambda_vault_integrity_log_group_name
lambda_vault_integrity_function_name = dependency.lambdas.outputs.lambda_vault_integrity_function_name
Expand Down

0 comments on commit 4795366

Please sign in to comment.