Skip to content

Commit

Permalink
Merge pull request #109 from dxw/conditionally-create-ecs-container-i…
Browse files Browse the repository at this point in the history
…nstance-asg-instance-diff-alert

Conditionally create ECS Container Instance / ASG Instance diff alert
  • Loading branch information
Stretch96 authored Jun 26, 2024
2 parents 15b6fa4 + 8a1a2b4 commit 4630eb5
Show file tree
Hide file tree
Showing 8 changed files with 345 additions and 2 deletions.
25 changes: 25 additions & 0 deletions README.md

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions ecs-cluster-infrastructure-alert-ecs-asg-diff.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Alarms when the custom "ECS" / "ContainerInstanceAsgInstanceDiff" metric for
# the infrastructure cluster reaches the configured threshold. Notifications
# are sent to the Slack and/or Opsgenie SNS topics (per the alert's flags) both
# when the alarm fires and when it returns to OK.
resource "aws_cloudwatch_metric_alarm" "infrastructure_ecs_cluster_ecs_asg_diff" {
  # Only created when the diff alert is enabled.
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  alarm_name        = "${local.resource_prefix}-infrastructure-ecs-cluster-infrastructure-ecs-asg-diff"
  alarm_description = "Container Instance / ASG Instance Difference for ${aws_ecs_cluster.infrastructure[0].name} Cluster"

  # Metric being watched.
  namespace   = "ECS"
  metric_name = "ContainerInstanceAsgInstanceDiff"
  dimensions = {
    ClusterName = aws_ecs_cluster.infrastructure[0].name
  }

  # Evaluation settings.
  statistic           = "Maximum"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = local.infrastructure_ecs_cluster_ecs_asg_diff_alert_threshold
  period              = local.infrastructure_ecs_cluster_ecs_asg_diff_alert_period
  evaluation_periods  = local.infrastructure_ecs_cluster_ecs_asg_diff_alert_evaluation_periods

  # Boolean literal rather than the string "true"; the argument is a bool.
  actions_enabled = true

  alarm_actions = concat(
    local.infrastructure_ecs_cluster_ecs_asg_diff_alert_slack ? [data.aws_sns_topic.infrastructure_slack_sns_topic[0].arn] : [],
    local.infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie ? [data.aws_sns_topic.infrastructure_opsgenie_sns_topic[0].arn] : []
  )
  ok_actions = concat(
    local.infrastructure_ecs_cluster_ecs_asg_diff_alert_slack ? [data.aws_sns_topic.infrastructure_slack_sns_topic[0].arn] : [],
    local.infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie ? [data.aws_sns_topic.infrastructure_opsgenie_sns_topic[0].arn] : []
  )
}
181 changes: 181 additions & 0 deletions ecs-cluster-infrastructure-ecs-asg-diff-lambda.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# CloudWatch log group under /aws/lambda/ for the ECS/ASG diff metric Lambda.
# Encrypted with the infrastructure KMS key when KMS encryption is enabled.
resource "aws_cloudwatch_log_group" "ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda_log_group" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  name              = "/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-ecs-asg-diff-metric"
  retention_in_days = local.infrastructure_ecs_cluster_ecs_asg_diff_metric_lambda_log_retention
  kms_key_id        = local.infrastructure_kms_encryption ? aws_kms_key.infrastructure[0].arn : null
}

# Execution role for the ECS/ASG diff metric Lambda, assumable by
# lambda.amazonaws.com. The role name uses a 6-character hash suffix, while the
# description carries the full human-readable name.
resource "aws_iam_role" "ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  name        = "${local.resource_prefix}-${substr(sha512("ecs-cluster-infrastructure-ecs-asg-diff-metric"), 0, 6)}"
  description = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric"

  assume_role_policy = templatefile(
    "${path.root}/policies/assume-roles/service-principle-standard.json.tpl",
    {
      services = jsonencode(["lambda.amazonaws.com"])
    }
  )
}

# IAM policy rendered from the lambda-default template, scoped by region,
# account and the diff metric Lambda's function name.
resource "aws_iam_policy" "ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  name = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric"

  policy = templatefile("${path.root}/policies/lambda-default.json.tpl", {
    region        = local.aws_region
    account_id    = local.aws_account_id
    function_name = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-ecs-asg-diff-metric"
  })
}

# Attaches the lambda-default policy to the diff metric Lambda's role.
resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  policy_arn = aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].arn
  role       = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].name
}

# IAM policy rendered from the cloudwatch-put-metric-data template, passing the
# "ECS" namespace the Lambda publishes its metric under.
resource "aws_iam_policy" "ecs_cluster_infrastructure_ecs_asg_diff_metric_cloudwatch_put_metric_data_lambda" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  name = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric-cloudwatch-put-metric-data"

  policy = templatefile("${path.root}/policies/cloudwatch-put-metric-data.json.tpl", {
    region     = local.aws_region
    account_id = local.aws_account_id
    namespaces = ["ECS"]
  })
}

# Attaches the put-metric-data policy to the diff metric Lambda's role.
resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_ecs_asg_diff_cloudwatch_metric_put_metric_data_lambda" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  policy_arn = aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_cloudwatch_put_metric_data_lambda[0].arn
  role       = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].name
}

# IAM policy rendered from the ecs-describe-cluster template, passing only the
# infrastructure ECS cluster's name.
resource "aws_iam_policy" "ecs_cluster_infrastructure_ecs_asg_diff_metric_ecs_describe_cluster_lambda" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  name = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric-ecs-describe-cluster"

  policy = templatefile("${path.root}/policies/ecs-describe-cluster.json.tpl", {
    region        = local.aws_region
    account_id    = local.aws_account_id
    cluster_names = [local.infrastructure_ecs_cluster_name]
  })
}

# Attaches the ECS describe-cluster policy to the diff metric Lambda's role.
resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_ecs_asg_diff_metric_ecs_describe_cluster_lambda" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  policy_arn = aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_ecs_describe_cluster_lambda[0].arn
  role       = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].name
}

# IAM policy rendered from the asg-describe-asg template; the template takes
# no variables.
resource "aws_iam_policy" "ecs_cluster_infrastructure_ecs_asg_diff_metric_asg_describe_asg_lambda" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  name   = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric-asg-describe-asg"
  policy = templatefile("${path.root}/policies/asg-describe-asg.json.tpl", {})
}

# Attaches the ASG describe policy to the diff metric Lambda's role.
resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_ecs_asg_diff_metric_asg_describe_asg_lambda" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  policy_arn = aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_asg_describe_asg_lambda[0].arn
  role       = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].name
}

# IAM policy rendered from the kms-encrypt template for the infrastructure KMS
# key. Created only when both the diff alert and KMS encryption are enabled.
resource "aws_iam_policy" "ecs_cluster_infrastructure_ecs_asg_diff_metric_kms_encrypt" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert && local.infrastructure_kms_encryption ? 1 : 0

  name = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric-kms-encrypt"

  policy = templatefile("${path.root}/policies/kms-encrypt.json.tpl", {
    kms_key_arn = aws_kms_key.infrastructure[0].arn
  })
}

# Attaches the KMS encrypt policy to the diff metric Lambda's role. Created
# only when both the diff alert and KMS encryption are enabled.
resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_ecs_asg_diff_kms_encrypt" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert && local.infrastructure_kms_encryption ? 1 : 0

  policy_arn = aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_kms_encrypt[0].arn
  role       = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].name
}

# Zips the lambdas/ecs-asg-diff-metric directory into the local zip cache so
# it can be uploaded as the Lambda's deployment package.
data "archive_file" "ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  type        = "zip"
  output_path = "lambdas/.zip-cache/ecs-asg-diff-metric.zip"
  source_dir  = "lambdas/ecs-asg-diff-metric"
}

# Lambda that publishes the Container Instance / ASG Instance difference
# metric for the infrastructure ECS cluster. It is packaged from the
# archive_file zip and receives the cluster and ASG names as environment
# variables.
resource "aws_lambda_function" "ecs_cluster_infrastructure_ecs_asg_diff_metric" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  filename         = data.archive_file.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].output_path
  function_name    = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-ecs-asg-diff-metric"
  description      = "${local.resource_prefix} ECS Cluster Infrastructure Container Instance / ASG Instance Difference Metric"
  handler          = "function.lambda_handler"
  runtime          = "python3.11"
  role             = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].arn
  source_code_hash = data.archive_file.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].output_base64sha256
  memory_size      = 128
  package_type     = "Zip"
  # NOTE(review): 900s timeout while the schedule fires every minute; confirm
  # that overlapping invocations are acceptable or lower the timeout.
  timeout = 900

  environment {
    variables = {
      ecsClusterName = local.infrastructure_ecs_cluster_name
      asgName        = aws_autoscaling_group.infrastructure_ecs_cluster[0].name
    }
  }

  tracing_config {
    mode = "Active"
  }

  # Wait for every policy attachment on the execution role, so the function is
  # not created (and scheduled) before its permissions are in place. The ASG
  # describe attachment is included because the handler calls
  # describe_auto_scaling_groups.
  depends_on = [
    aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda,
    aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_cloudwatch_metric_put_metric_data_lambda,
    aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_metric_ecs_describe_cluster_lambda,
    aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_metric_asg_describe_asg_lambda,
    aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_kms_encrypt
  ]
}

# Scheduled event rule that fires once a minute, used to trigger the diff
# metric Lambda.
resource "aws_cloudwatch_event_rule" "ecs_cluster_infrastructure_ecs_asg_diff_metric_1_min_cron" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  name                = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-ecs-asg-diff-metric-1-min"
  schedule_expression = "rate(1 minute)"
  description         = "Triggers the ${aws_lambda_function.ecs_cluster_infrastructure_ecs_asg_diff_metric[0].function_name} Lambda every 1 minute"
}

# Points the 1-minute event rule at the diff metric Lambda.
resource "aws_cloudwatch_event_target" "ecs_cluster_infrastructure_ecs_asg_diff_metric_1_min_cron" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  target_id = "lambda"
  rule      = aws_cloudwatch_event_rule.ecs_cluster_infrastructure_ecs_asg_diff_metric_1_min_cron[0].name
  arn       = aws_lambda_function.ecs_cluster_infrastructure_ecs_asg_diff_metric[0].arn
}

# Allows events.amazonaws.com to invoke the diff metric Lambda, restricted to
# invocations originating from the 1-minute event rule.
resource "aws_lambda_permission" "ecs_cluster_infrastructure_ecs_asg_diff_metric_allow_cloudwatch_execution" {
  count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0

  statement_id  = "AllowExecutionFromCloudWatch"
  principal     = "events.amazonaws.com"
  action        = "lambda:InvokeFunction"
  source_arn    = aws_cloudwatch_event_rule.ecs_cluster_infrastructure_ecs_asg_diff_metric_1_min_cron[0].arn
  function_name = aws_lambda_function.ecs_cluster_infrastructure_ecs_asg_diff_metric[0].function_name
}
5 changes: 5 additions & 0 deletions kms-infrastructure.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ resource "aws_kms_key" "infrastructure" {
{
log_group_arn = local.enable_infrastructure_ecs_cluster_pending_task_alert && local.infrastructure_kms_encryption ? "arn:aws:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric" : ""
}
)}${local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert && local.infrastructure_kms_encryption ? "," : ""}
${templatefile("${path.root}/policies/kms-key-policy-statements/cloudwatch-logs-allow.json.tpl",
{
log_group_arn = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert && local.infrastructure_kms_encryption ? "arn:aws:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-ecs-asg-diff-metric" : ""
}
)}${length(local.infrastructure_ecs_cluster_services) > 0 && local.infrastructure_kms_encryption ? "," : ""}
${templatefile("${path.root}/policies/kms-key-policy-statements/cloudwatch-logs-allow.json.tpl",
{
Expand Down
51 changes: 51 additions & 0 deletions lambdas/ecs-asg-diff-metric/function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import boto3
import os

CLUSTER_NAME = os.environ['ecsClusterName']
ASG_NAME = os.environ['asgName']

ecs = boto3.client('ecs')
autoscaling = boto3.client('autoscaling')
cloudwatch = boto3.client('cloudwatch')

def lambda_handler(event, context):
    """Publish the ASG-vs-ECS instance count difference as a CloudWatch metric.

    Looks up the registered container instance count of CLUSTER_NAME and the
    number of instances in ASG_NAME, then publishes
    (ASG instances - registered container instances) to the "ECS" namespace as
    "ContainerInstanceAsgInstanceDiff", dimensioned by ClusterName.

    Args:
        event: Lambda invocation event (not used).
        context: Lambda context object (not used).

    Returns:
        A dict with 'statusCode' 200 and a 'body' message. If the cluster or
        the Auto Scaling Group is not found, it returns early and publishes no
        metric.
    """
    clusters = ecs.describe_clusters(clusters=[CLUSTER_NAME])['clusters']
    if not clusters:
        return {'statusCode': 200, 'body': 'No ECS cluster found with the given name.'}
    registered_count = clusters[0]['registeredContainerInstancesCount']

    groups = autoscaling.describe_auto_scaling_groups(
        AutoScalingGroupNames=[ASG_NAME],
    )['AutoScalingGroups']
    if not groups:
        return {'statusCode': 200, 'body': 'No Auto Scaling Group found with the given name.'}

    # Positive when the ASG has more instances than the cluster has registered.
    instance_diff = len(groups[0]['Instances']) - registered_count

    datum = {
        'MetricName': "ContainerInstanceAsgInstanceDiff",
        'Dimensions': [{'Name': 'ClusterName', 'Value': CLUSTER_NAME}],
        'Value': instance_diff,
        'Unit': 'Count',
    }
    cloudwatch.put_metric_data(Namespace="ECS", MetricData=[datum])

    message = f'Container Instance / ASG Instance difference ({instance_diff}) calculated and published successfully.'
    return {'statusCode': 200, 'body': message}
13 changes: 11 additions & 2 deletions locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@ locals {
infrastructure_opsgenie_sns_topic_name = "${local.project_name}-cloudwatch-opsgenie-alerts"
infrastructure_slack_sns_topic_in_use = (
local.infrastructure_ecs_cluster_asg_cpu_alert_slack ||
local.infrastructure_ecs_cluster_pending_task_alert_slack
local.infrastructure_ecs_cluster_pending_task_alert_slack ||
local.infrastructure_ecs_cluster_ecs_asg_diff_alert_slack
)
infrastructure_opsgenie_sns_topic_in_use = (
local.infrastructure_ecs_cluster_asg_cpu_alert_opsgenie ||
local.infrastructure_ecs_cluster_pending_task_alert_opsgenie
local.infrastructure_ecs_cluster_pending_task_alert_opsgenie ||
local.infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie
)

enable_infrastructure_logs_bucket = (
Expand Down Expand Up @@ -156,6 +158,13 @@ locals {
infrastructure_ecs_cluster_pending_task_alert_threshold = var.infrastructure_ecs_cluster_pending_task_alert_threshold
infrastructure_ecs_cluster_pending_task_alert_slack = var.infrastructure_ecs_cluster_pending_task_alert_slack
infrastructure_ecs_cluster_pending_task_alert_opsgenie = var.infrastructure_ecs_cluster_pending_task_alert_opsgenie
enable_infrastructure_ecs_cluster_ecs_asg_diff_alert = var.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert && local.enable_infrastructure_ecs_cluster
infrastructure_ecs_cluster_ecs_asg_diff_metric_lambda_log_retention = var.infrastructure_ecs_cluster_ecs_asg_diff_metric_lambda_log_retention
infrastructure_ecs_cluster_ecs_asg_diff_alert_evaluation_periods = var.infrastructure_ecs_cluster_ecs_asg_diff_alert_evaluation_periods
infrastructure_ecs_cluster_ecs_asg_diff_alert_period = var.infrastructure_ecs_cluster_ecs_asg_diff_alert_period
infrastructure_ecs_cluster_ecs_asg_diff_alert_threshold = var.infrastructure_ecs_cluster_ecs_asg_diff_alert_threshold
infrastructure_ecs_cluster_ecs_asg_diff_alert_slack = var.infrastructure_ecs_cluster_ecs_asg_diff_alert_slack
infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie = var.infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie
infrastructure_ecs_cluster_wafs = var.infrastructure_ecs_cluster_wafs
infrastructure_ecs_cluster_enable_ssm_dhmc = local.enable_infrastructure_ecs_cluster ? data.external.ssm_dhmc_setting[0].result.setting_value != "$None" : false
infrastructure_ecs_cluster_user_data = base64encode(
Expand Down
12 changes: 12 additions & 0 deletions policies/asg-describe-asg.json.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"Version": "2012-10-17",
"Statement": [
{
"Action": [
"autoscaling:DescribeAutoScalingGroups"
],
"Effect": "Allow",
"Resource": "*"
}
]
}
35 changes: 35 additions & 0 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,41 @@ variable "infrastructure_ecs_cluster_pending_task_alert_opsgenie" {
type = bool
}

# Toggle for the diff alert and its supporting metric Lambda resources.
variable "enable_infrastructure_ecs_cluster_ecs_asg_diff_alert" {
  type        = bool
  description = "Enable the ECS Cluster Container Instance / ASG instance diff alert"
}

# Passed (via locals) to retention_in_days on the Lambda's log group.
variable "infrastructure_ecs_cluster_ecs_asg_diff_metric_lambda_log_retention" {
  type        = number
  description = "Log retention for the ECS cluster Container Instance / ASG instance diff metric Lambda"
}

# Passed (via locals) to evaluation_periods on the diff alarm.
variable "infrastructure_ecs_cluster_ecs_asg_diff_alert_evaluation_periods" {
  type        = number
  description = "Evaluation periods for the ECS cluster's Container Instance / ASG instance diff alert"
}

# Passed (via locals) to period on the diff alarm.
variable "infrastructure_ecs_cluster_ecs_asg_diff_alert_period" {
  description = "Period (in seconds) for the ECS cluster's Container Instance / ASG instance diff alert"
  type        = number
}

# Passed (via locals) to threshold on the diff alarm. The metric it is compared
# against is the ASG instance count minus the cluster's registered container
# instance count.
variable "infrastructure_ecs_cluster_ecs_asg_diff_alert_threshold" {
  description = "Threshold (difference between the number of ASG instances and registered Container Instances) for the ECS cluster's Container Instance / ASG instance diff alert"
  type        = number
}

# When true, the Slack SNS topic is added to the diff alarm's alarm/ok actions.
variable "infrastructure_ecs_cluster_ecs_asg_diff_alert_slack" {
  type        = bool
  description = "Enable Slack alerts for the ECS cluster's Container Instance / ASG instance diff alert"
}

# When true, the Opsgenie SNS topic is added to the diff alarm's alarm/ok actions.
variable "infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie" {
  type        = bool
  description = "Enable Opsgenie alerts for the ECS cluster's Container Instance / ASG instance diff alert"
}

variable "infrastructure_ecs_cluster_wafs" {
description = "Map of WAF ACLs to craete, which can be used with service CloudFront distributions"
type = map(object({
Expand Down

0 comments on commit 4630eb5

Please sign in to comment.