Skip to content

Commit

Permalink
Merge pull request #108 from dxw/conditionally-create-ecs-cluster-pen…
Browse files Browse the repository at this point in the history
…ding-task-alert

Conditionally create ECS Cluster Pending Task Alert
  • Loading branch information
Stretch96 authored Jun 25, 2024
2 parents 858547b + 542a819 commit 15b6fa4
Show file tree
Hide file tree
Showing 9 changed files with 350 additions and 12 deletions.
23 changes: 23 additions & 0 deletions README.md

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions ecs-cluster-infrastructure-alert-pending-tasks.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# CloudWatch alarm on the custom "PendingTasksCount" metric (namespace "ECS")
# for the infrastructure ECS cluster. Only created when the pending task alert
# is enabled.
resource "aws_cloudwatch_metric_alarm" "infrastructure_ecs_cluster_pending_task" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  alarm_name        = "${local.resource_prefix}-infrastructure-ecs-cluster-infrastructure-pending-task"
  alarm_description = "Pending Tasks for ${aws_ecs_cluster.infrastructure[0].name} Cluster"

  # Metric being watched.
  # NOTE(review): the metric Lambda publishes this metric with ClusterName set
  # to local.infrastructure_ecs_cluster_name - confirm that this always equals
  # aws_ecs_cluster.infrastructure[0].name, otherwise the alarm sees no data.
  namespace   = "ECS"
  metric_name = "PendingTasksCount"
  dimensions = {
    ClusterName = aws_ecs_cluster.infrastructure[0].name
  }

  # Alarm when the maximum value within a period is >= the threshold for the
  # configured number of evaluation periods.
  statistic           = "Maximum"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = local.infrastructure_ecs_cluster_pending_task_alert_threshold
  period              = local.infrastructure_ecs_cluster_pending_task_alert_period
  evaluation_periods  = local.infrastructure_ecs_cluster_pending_task_alert_evaluation_periods

  # Boolean literal rather than the string "true": the argument is a bool.
  actions_enabled = true

  # Notify the Slack and/or Opsgenie SNS topics, depending on which are
  # enabled, both when the alarm fires and when it returns to OK.
  alarm_actions = concat(
    local.infrastructure_ecs_cluster_pending_task_alert_slack ? [data.aws_sns_topic.infrastructure_slack_sns_topic[0].arn] : [],
    local.infrastructure_ecs_cluster_pending_task_alert_opsgenie ? [data.aws_sns_topic.infrastructure_opsgenie_sns_topic[0].arn] : []
  )
  ok_actions = concat(
    local.infrastructure_ecs_cluster_pending_task_alert_slack ? [data.aws_sns_topic.infrastructure_slack_sns_topic[0].arn] : [],
    local.infrastructure_ecs_cluster_pending_task_alert_opsgenie ? [data.aws_sns_topic.infrastructure_opsgenie_sns_topic[0].arn] : []
  )
}
164 changes: 164 additions & 0 deletions ecs-cluster-infrastructure-pending-task-metric-lambda.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Log group for the pending task metric Lambda. Its name is "/aws/lambda/"
# followed by the same function name the aws_lambda_function in this file uses.
resource "aws_cloudwatch_log_group" "ecs_cluster_infrastructure_pending_task_metric_lambda_log_group" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  name              = "/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric"
  retention_in_days = local.infrastructure_ecs_cluster_pending_task_metric_lambda_log_retention

  # Encrypt with the infrastructure KMS key only when KMS encryption is on.
  kms_key_id = local.infrastructure_kms_encryption ? aws_kms_key.infrastructure[0].arn : null
}

# Execution role for the pending task metric Lambda, assumable by the Lambda
# service. The role name ends in the first 6 characters of a SHA-512 hash of
# the descriptive suffix; the full descriptive name is kept in `description`.
resource "aws_iam_role" "ecs_cluster_infrastructure_pending_task_metric_lambda" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  name        = "${local.resource_prefix}-${substr(sha512("ecs-cluster-infrastructure-pending-task-metric"), 0, 6)}"
  description = "${local.resource_prefix}-ecs-cluster-infrastructure-pending-task-metric"

  assume_role_policy = templatefile(
    "${path.root}/policies/assume-roles/service-principle-standard.json.tpl",
    {
      services = jsonencode(["lambda.amazonaws.com"])
    }
  )
}

# IAM policy rendered from the shared "lambda-default" template for the
# pending task metric Lambda's function name.
resource "aws_iam_policy" "ecs_cluster_infrastructure_pending_task_metric_lambda" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  name = "${local.resource_prefix}-ecs-cluster-infrastructure-pending-task-metric"

  policy = templatefile(
    "${path.root}/policies/lambda-default.json.tpl",
    {
      function_name = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric"
      account_id    = local.aws_account_id
      region        = local.aws_region
    }
  )
}

# Attaches the lambda-default policy to the metric Lambda's execution role.
resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_pending_task_metric_lambda" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  policy_arn = aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_lambda[0].arn
  role       = aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda[0].name
}

# IAM policy rendered from the cloudwatch-put-metric-data template, passing
# the "ECS" namespace - the namespace the metric Lambda publishes to.
resource "aws_iam_policy" "ecs_cluster_infrastructure_pending_task_metric_cloudwatch_put_metric_data_lambda" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  name = "${local.resource_prefix}-ecs-cluster-infrastructure-pending-task-metric-cloudwatch-put-metric-data"

  policy = templatefile(
    "${path.root}/policies/cloudwatch-put-metric-data.json.tpl",
    {
      namespaces = ["ECS"]
      account_id = local.aws_account_id
      region     = local.aws_region
    }
  )
}

# Attaches the put-metric-data policy to the metric Lambda's execution role.
resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_pending_task_cloudwatch_metric_put_metric_data_lambda" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  policy_arn = aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_cloudwatch_put_metric_data_lambda[0].arn
  role       = aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda[0].name
}

# IAM policy rendered from the ecs-describe-cluster template for the
# infrastructure ECS cluster name.
resource "aws_iam_policy" "ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  name = "${local.resource_prefix}-ecs-cluster-infrastructure-pending-task-metric-ecs-describe-cluster"

  policy = templatefile(
    "${path.root}/policies/ecs-describe-cluster.json.tpl",
    {
      cluster_names = [local.infrastructure_ecs_cluster_name]
      account_id    = local.aws_account_id
      region        = local.aws_region
    }
  )
}

# Attaches the describe-cluster policy to the metric Lambda's execution role.
resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  policy_arn = aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda[0].arn
  role       = aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda[0].name
}

# IAM policy rendered from the kms-encrypt template for the infrastructure KMS
# key. Only created when both the pending task alert and KMS encryption are
# enabled.
resource "aws_iam_policy" "ecs_cluster_infrastructure_pending_task_metric_kms_encrypt" {
  count = (local.enable_infrastructure_ecs_cluster_pending_task_alert && local.infrastructure_kms_encryption) ? 1 : 0

  name = "${local.resource_prefix}-ecs-cluster-infrastructure-pending-task-metric-kms-encrypt"

  policy = templatefile(
    "${path.root}/policies/kms-encrypt.json.tpl",
    {
      kms_key_arn = aws_kms_key.infrastructure[0].arn
    }
  )
}

# Attaches the kms-encrypt policy to the metric Lambda's execution role. Uses
# the same count condition as the policy it attaches.
resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_pending_task_kms_encrypt" {
  count = (local.enable_infrastructure_ecs_cluster_pending_task_alert && local.infrastructure_kms_encryption) ? 1 : 0

  policy_arn = aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_kms_encrypt[0].arn
  role       = aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda[0].name
}

# Zips the Lambda source directory into the local zip cache. The resulting
# path and base64 SHA-256 are consumed by the aws_lambda_function below.
data "archive_file" "ecs_cluster_infrastructure_pending_task_metric_lambda" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  type        = "zip"
  output_path = "lambdas/.zip-cache/ecs-pending-task-metric.zip"
  source_dir  = "lambdas/ecs-pending-task-metric"
}

# Lambda that publishes the ECS cluster's pending task count as a custom
# CloudWatch metric. It is invoked by the 1-minute EventBridge rule defined
# later in this file.
resource "aws_lambda_function" "ecs_cluster_infrastructure_pending_task_metric" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  function_name = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric"
  description   = "${local.resource_prefix} ECS Cluster Infrastructure Pending Task Metric"
  role          = aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda[0].arn

  # Deployment package: the zip produced by the archive_file data source. The
  # hash makes Terraform redeploy when the zipped source changes.
  package_type     = "Zip"
  filename         = data.archive_file.ecs_cluster_infrastructure_pending_task_metric_lambda[0].output_path
  source_code_hash = data.archive_file.ecs_cluster_infrastructure_pending_task_metric_lambda[0].output_base64sha256
  handler          = "function.lambda_handler"
  runtime          = "python3.11"

  memory_size = 128
  # NOTE(review): 900s is far longer than the 1-minute schedule that invokes
  # this function; consider lowering it so a hung run cannot overlap later ones.
  timeout = 900

  environment {
    variables = {
      # Read by function.py as os.environ['ecsClusterName'].
      ecsClusterName = local.infrastructure_ecs_cluster_name
    }
  }

  tracing_config {
    mode = "Active"
  }

  depends_on = [
    # Create the log group first. Otherwise the Lambda service can create
    # "/aws/lambda/<function name>" itself on first invocation, and the
    # Terraform-managed log group then fails because it already exists.
    aws_cloudwatch_log_group.ecs_cluster_infrastructure_pending_task_metric_lambda_log_group,
    # Make sure every policy is attached to the role before the function exists.
    aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_metric_lambda,
    aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_cloudwatch_metric_put_metric_data_lambda,
    aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda,
    aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_kms_encrypt
  ]
}

# Scheduled rule that fires once a minute; its target is the metric Lambda.
resource "aws_cloudwatch_event_rule" "ecs_cluster_infrastructure_pending_task_metric_1_min_cron" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  name                = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric-1-min"
  schedule_expression = "rate(1 minute)"
  description         = "Triggers the ${aws_lambda_function.ecs_cluster_infrastructure_pending_task_metric[0].function_name} Lambda every 1 minute"
}

# Makes the metric Lambda the target of the 1-minute scheduled rule.
resource "aws_cloudwatch_event_target" "ecs_cluster_infrastructure_pending_task_metric_1_min_cron" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  target_id = "lambda"
  rule      = aws_cloudwatch_event_rule.ecs_cluster_infrastructure_pending_task_metric_1_min_cron[0].name
  arn       = aws_lambda_function.ecs_cluster_infrastructure_pending_task_metric[0].arn
}

# Allows events.amazonaws.com to invoke the metric Lambda, restricted to
# invocations that originate from the 1-minute scheduled rule.
resource "aws_lambda_permission" "ecs_cluster_infrastructure_pending_task_metric_allow_cloudwatch_execution" {
  count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0

  statement_id  = "AllowExecutionFromCloudWatch"
  principal     = "events.amazonaws.com"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.ecs_cluster_infrastructure_pending_task_metric[0].function_name
  source_arn    = aws_cloudwatch_event_rule.ecs_cluster_infrastructure_pending_task_metric_1_min_cron[0].arn
}
5 changes: 5 additions & 0 deletions kms-infrastructure.tf
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ resource "aws_kms_key" "infrastructure" {
{
log_group_arn = local.infrastructure_ecs_cluster_draining_lambda_enabled && local.infrastructure_kms_encryption ? "arn:aws:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-draining" : ""
}
)}${local.enable_infrastructure_ecs_cluster_pending_task_alert && local.infrastructure_kms_encryption ? "," : ""}
${templatefile("${path.root}/policies/kms-key-policy-statements/cloudwatch-logs-allow.json.tpl",
{
log_group_arn = local.enable_infrastructure_ecs_cluster_pending_task_alert && local.infrastructure_kms_encryption ? "arn:aws:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric" : ""
}
)}${length(local.infrastructure_ecs_cluster_services) > 0 && local.infrastructure_kms_encryption ? "," : ""}
${templatefile("${path.root}/policies/kms-key-policy-statements/cloudwatch-logs-allow.json.tpl",
{
Expand Down
36 changes: 36 additions & 0 deletions lambdas/ecs-pending-task-metric/function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import boto3
import os

CLUSTER_NAME = os.environ['ecsClusterName']

def lambda_handler(event, context):
    """Publish the ECS cluster's pending task count as a CloudWatch metric.

    Looks up the cluster named by the ``ecsClusterName`` environment variable
    (``CLUSTER_NAME``), reads its ``pendingTasksCount`` and writes it to the
    ``PendingTasksCount`` metric in the ``ECS`` namespace, with a
    ``ClusterName`` dimension.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda context object (unused).

    Returns:
        dict: ``statusCode`` 200 and a human-readable ``body`` message.

    Raises:
        RuntimeError: if ``describe_clusters`` returns no entry for the
            cluster (e.g. it does not exist); any reported ``failures`` are
            included in the message.
    """
    ecs_client = boto3.client('ecs')
    cloudwatch_client = boto3.client('cloudwatch')

    response = ecs_client.describe_clusters(
        clusters=[CLUSTER_NAME]
    )

    # A cluster that cannot be described is reported under 'failures' and
    # 'clusters' comes back empty. Fail with a clear message instead of an
    # opaque IndexError.
    clusters = response.get('clusters') or []
    if not clusters:
        failures = response.get('failures', [])
        raise RuntimeError(
            f'ECS cluster {CLUSTER_NAME} was not returned by describe_clusters '
            f'(failures: {failures})'
        )

    pending_tasks = clusters[0]['pendingTasksCount']

    cloudwatch_client.put_metric_data(
        Namespace='ECS',
        MetricData=[
            {
                'MetricName': 'PendingTasksCount',
                'Dimensions': [
                    {
                        'Name': 'ClusterName',
                        'Value': CLUSTER_NAME
                    },
                ],
                'Value': pending_tasks,
                'Unit': 'Count'
            },
        ]
    )

    return {
        'statusCode': 200,
        'body': f'Successfully created custom metric for {CLUSTER_NAME} with {pending_tasks} pending tasks'
    }
37 changes: 25 additions & 12 deletions locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,16 @@ locals {

infrastructure_logging_bucket_retention = var.infrastructure_logging_bucket_retention

infrastructure_slack_sns_topic_name = "${local.project_name}-cloudwatch-slack-alerts"
infrastructure_opsgenie_sns_topic_name = "${local.project_name}-cloudwatch-opsgenie-alerts"
infrastructure_slack_sns_topic_in_use = local.infrastructure_ecs_cluster_asg_cpu_alert_slack
infrastructure_opsgenie_sns_topic_in_use = local.infrastructure_ecs_cluster_asg_cpu_alert_opsgenie
infrastructure_slack_sns_topic_name = "${local.project_name}-cloudwatch-slack-alerts"
infrastructure_opsgenie_sns_topic_name = "${local.project_name}-cloudwatch-opsgenie-alerts"
infrastructure_slack_sns_topic_in_use = (
local.infrastructure_ecs_cluster_asg_cpu_alert_slack ||
local.infrastructure_ecs_cluster_pending_task_alert_slack
)
infrastructure_opsgenie_sns_topic_in_use = (
local.infrastructure_ecs_cluster_asg_cpu_alert_opsgenie ||
local.infrastructure_ecs_cluster_pending_task_alert_opsgenie
)

enable_infrastructure_logs_bucket = (
local.infrastructure_vpc_flow_logs_s3_with_athena ||
Expand Down Expand Up @@ -137,14 +143,21 @@ locals {
infrastructure_ecs_cluster_autoscaling_time_based_custom = {
for custom in toset(var.infrastructure_ecs_cluster_autoscaling_time_based_custom) : "${custom["min"]}-${custom["max"]} ${custom["cron"]}" => custom
}
enable_infrastructure_ecs_cluster_asg_cpu_alert = var.enable_infrastructure_ecs_cluster_asg_cpu_alert && local.enable_infrastructure_ecs_cluster
infrastructure_ecs_cluster_asg_cpu_alert_evaluation_periods = var.infrastructure_ecs_cluster_asg_cpu_alert_evaluation_periods
infrastructure_ecs_cluster_asg_cpu_alert_period = var.infrastructure_ecs_cluster_asg_cpu_alert_period
infrastructure_ecs_cluster_asg_cpu_alert_threshold = var.infrastructure_ecs_cluster_asg_cpu_alert_threshold
infrastructure_ecs_cluster_asg_cpu_alert_slack = var.infrastructure_ecs_cluster_asg_cpu_alert_slack && local.enable_infrastructure_ecs_cluster_asg_cpu_alert
infrastructure_ecs_cluster_asg_cpu_alert_opsgenie = var.infrastructure_ecs_cluster_asg_cpu_alert_opsgenie && local.enable_infrastructure_ecs_cluster_asg_cpu_alert
infrastructure_ecs_cluster_wafs = var.infrastructure_ecs_cluster_wafs
infrastructure_ecs_cluster_enable_ssm_dhmc = local.enable_infrastructure_ecs_cluster ? data.external.ssm_dhmc_setting[0].result.setting_value != "$None" : false
enable_infrastructure_ecs_cluster_asg_cpu_alert = var.enable_infrastructure_ecs_cluster_asg_cpu_alert && local.enable_infrastructure_ecs_cluster
infrastructure_ecs_cluster_asg_cpu_alert_evaluation_periods = var.infrastructure_ecs_cluster_asg_cpu_alert_evaluation_periods
infrastructure_ecs_cluster_asg_cpu_alert_period = var.infrastructure_ecs_cluster_asg_cpu_alert_period
infrastructure_ecs_cluster_asg_cpu_alert_threshold = var.infrastructure_ecs_cluster_asg_cpu_alert_threshold
infrastructure_ecs_cluster_asg_cpu_alert_slack = var.infrastructure_ecs_cluster_asg_cpu_alert_slack && local.enable_infrastructure_ecs_cluster_asg_cpu_alert
infrastructure_ecs_cluster_asg_cpu_alert_opsgenie = var.infrastructure_ecs_cluster_asg_cpu_alert_opsgenie && local.enable_infrastructure_ecs_cluster_asg_cpu_alert
enable_infrastructure_ecs_cluster_pending_task_alert = var.enable_infrastructure_ecs_cluster_pending_task_alert && local.enable_infrastructure_ecs_cluster
infrastructure_ecs_cluster_pending_task_metric_lambda_log_retention = var.infrastructure_ecs_cluster_pending_task_metric_lambda_log_retention
infrastructure_ecs_cluster_pending_task_alert_evaluation_periods = var.infrastructure_ecs_cluster_pending_task_alert_evaluation_periods
infrastructure_ecs_cluster_pending_task_alert_period = var.infrastructure_ecs_cluster_pending_task_alert_period
infrastructure_ecs_cluster_pending_task_alert_threshold = var.infrastructure_ecs_cluster_pending_task_alert_threshold
# Gate the notification toggles on the alert being enabled, as the ASG CPU
# alert toggles are. Otherwise a true variable marks the Slack/Opsgenie SNS
# topic as "in use" even though the pending task alert is disabled.
infrastructure_ecs_cluster_pending_task_alert_slack = var.infrastructure_ecs_cluster_pending_task_alert_slack && local.enable_infrastructure_ecs_cluster_pending_task_alert
infrastructure_ecs_cluster_pending_task_alert_opsgenie = var.infrastructure_ecs_cluster_pending_task_alert_opsgenie && local.enable_infrastructure_ecs_cluster_pending_task_alert
infrastructure_ecs_cluster_wafs = var.infrastructure_ecs_cluster_wafs
infrastructure_ecs_cluster_enable_ssm_dhmc = local.enable_infrastructure_ecs_cluster ? data.external.ssm_dhmc_setting[0].result.setting_value != "$None" : false
infrastructure_ecs_cluster_user_data = base64encode(
templatefile("ec2-userdata/ecs-instance.tpl", {
docker_storage_volume_device_name = local.infrastructure_ecs_cluster_ebs_docker_storage_volume_device_name,
Expand Down
21 changes: 21 additions & 0 deletions policies/cloudwatch-put-metric-data.json.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": [
        "cloudwatch:PutMetricData"
      ],
      "Effect": "Allow",
      "Resource": "*",
      "Condition": {
        "StringEquals": {
          "cloudwatch:namespace": ${jsonencode(namespaces)}
        }
      }
    }
  ]
}
16 changes: 16 additions & 0 deletions policies/ecs-describe-cluster.json.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": [
        "ecs:DescribeClusters"
      ],
      "Effect": "Allow",
      "Resource": ${jsonencode([for name in cluster_names : "arn:aws:ecs:${region}:${account_id}:cluster/${name}"])}
    }
  ]
}
35 changes: 35 additions & 0 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,41 @@ variable "infrastructure_ecs_cluster_asg_cpu_alert_opsgenie" {
type = bool
}

# Master switch for the pending task alert. locals.tf combines it with
# local.enable_infrastructure_ecs_cluster before any resource uses it.
variable "enable_infrastructure_ecs_cluster_pending_task_alert" {
  type        = bool
  description = "Enable the ECS Cluster pending task alert"
}

# Passed (via locals) to retention_in_days on the metric Lambda's log group.
variable "infrastructure_ecs_cluster_pending_task_metric_lambda_log_retention" {
  type        = number
  description = "Log retention for the ECS cluster pending task metric Lambda"
}

# Passed (via locals) to evaluation_periods on the pending task alarm.
variable "infrastructure_ecs_cluster_pending_task_alert_evaluation_periods" {
  type        = number
  description = "Evaluation periods for the ECS cluster's Pending Task alert"
}

# Passed (via locals) to period on the pending task alarm.
variable "infrastructure_ecs_cluster_pending_task_alert_period" {
  description = "Period (in seconds) for the ECS cluster's Pending Task alert"
  type        = number
}

# Passed (via locals) to threshold on the pending task alarm, which compares
# with GreaterThanOrEqualToThreshold.
variable "infrastructure_ecs_cluster_pending_task_alert_threshold" {
  type        = number
  description = "Threshold (Number of pending tasks) for the ECS cluster's Pending Task alert"
}

# When true, the Slack SNS topic is added to the pending task alarm's
# alarm_actions and ok_actions.
variable "infrastructure_ecs_cluster_pending_task_alert_slack" {
  type        = bool
  description = "Enable Slack alerts for the ECS cluster's Pending Task alert"
}

# When true, the Opsgenie SNS topic is added to the pending task alarm's
# alarm_actions and ok_actions.
variable "infrastructure_ecs_cluster_pending_task_alert_opsgenie" {
  type        = bool
  description = "Enable Opsgenie alerts for the ECS cluster's Pending Task alert"
}

variable "infrastructure_ecs_cluster_wafs" {
description = "Map of WAF ACLs to craete, which can be used with service CloudFront distributions"
type = map(object({
Expand Down

0 comments on commit 15b6fa4

Please sign in to comment.