From 8a1a2b4088b9b0c89dc2a07cf095f214004b66b5 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Wed, 26 Jun 2024 15:55:19 +0100 Subject: [PATCH] Conditionally create ECS Container Instance / ASG Instance diff alert * Creates a Lambda which gets the number of instances within the infrastructure ECS Cluster, and the number of instances within its related Autoscaling group, and puts the difference to a CloudWatch metric in the 'ECS' namespace with the metric name 'ContainerInstanceAsgInstanceDiff' * Creates CloudWatch alarm, which can be configured to alert to Slack and/or Opsgenie, based on EvaluationPeriod, Period and Threshold (Number of instances that are in the ASG, but not in the ECS Cluster) --- README.md | 25 +++ ...uster-infrastructure-alert-ecs-asg-diff.tf | 25 +++ ...ster-infrastructure-ecs-asg-diff-lambda.tf | 181 ++++++++++++++++++ kms-infrastructure.tf | 5 + lambdas/ecs-asg-diff-metric/function.py | 51 +++++ locals.tf | 13 +- policies/asg-describe-asg.json.tpl | 12 ++ variables.tf | 35 ++++ 8 files changed, 345 insertions(+), 2 deletions(-) create mode 100644 ecs-cluster-infrastructure-alert-ecs-asg-diff.tf create mode 100644 ecs-cluster-infrastructure-ecs-asg-diff-lambda.tf create mode 100644 lambdas/ecs-asg-diff-metric/function.py create mode 100644 policies/asg-describe-asg.json.tpl diff --git a/README.md b/README.md index 3602eb0..1e62c80 100644 --- a/README.md +++ b/README.md @@ -59,18 +59,22 @@ This project creates and manages resources within an AWS account for infrastruct | [aws_cloudfront_distribution.infrastructure_ecs_cluster_service_cloudfront](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudfront_distribution) | resource | | [aws_cloudfront_function.custom_s3_buckets_viewer_request](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudfront_function) | resource | |
[aws_cloudfront_origin_access_control.custom_s3_buckets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudfront_origin_access_control) | resource | +| [aws_cloudwatch_event_rule.ecs_cluster_infrastructure_ecs_asg_diff_metric_1_min_cron](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.ecs_cluster_infrastructure_pending_task_metric_1_min_cron](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.infrastructure_ecs_cluster_service_ecr_scan](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.infrastructure_ecs_cluster_service_scheduled_task](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_target.ecr_scan_event_target](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_event_target.ecs_cluster_infrastructure_ecs_asg_diff_metric_1_min_cron](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_event_target.ecs_cluster_infrastructure_pending_task_metric_1_min_cron](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_event_target.infrastructure_ecs_cluster_service_scheduled_task](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_log_group.ecs_cluster_infrastructure_draining_lambda_log_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| 
[aws_cloudwatch_log_group.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda_log_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_log_group.ecs_cluster_infrastructure_pending_task_metric_lambda_log_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_log_group.infrastructure_ecs_cluster_service](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_log_group.infrastructure_rds_exports](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_log_group.infrastructure_vpc_flow_logs](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_metric_alarm.infrastructure_ecs_cluster_asg_cpu](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | +| [aws_cloudwatch_metric_alarm.infrastructure_ecs_cluster_ecs_asg_diff](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_cloudwatch_metric_alarm.infrastructure_ecs_cluster_pending_task](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_codebuild_project.infrastructure_ecs_cluster_service_build](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/codebuild_project) | resource | | [aws_codedeploy_app.infrastructure_ecs_cluster_service_blue_green](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/codedeploy_app) | resource | @@ -112,6 +116,11 @@ This project creates and manages resources within an AWS account for infrastruct | 
[aws_iam_policy.ecs_cluster_infrastructure_draining_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.ecs_cluster_infrastructure_draining_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.ecs_cluster_infrastructure_draining_sns_publish_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_asg_describe_asg_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_cloudwatch_put_metric_data_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_ecs_describe_cluster_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_cloudwatch_put_metric_data_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | 
[aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | @@ -140,6 +149,7 @@ This project creates and manages resources within an AWS account for infrastruct | [aws_iam_policy.infrastructure_ecs_cluster_ssm_service_setting_rw](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.infrastructure_rds_monitoring](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_role.ecs_cluster_infrastructure_draining_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.infrastructure_ecs_cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.infrastructure_ecs_cluster_autoscaling_lifecycle_termination](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | @@ -156,6 +166,11 @@ This project creates and manages resources within an AWS account for infrastruct | [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_draining_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_draining_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | 
[aws_iam_role_policy_attachment.ecs_cluster_infrastructure_draining_sns_publish_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_cloudwatch_metric_put_metric_data_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_metric_asg_describe_asg_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_metric_ecs_describe_cluster_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_cloudwatch_metric_put_metric_data_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | @@ -190,8 +205,10 @@ This 
project creates and manages resources within an AWS account for infrastruct | [aws_kms_key.custom_s3_buckets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_key) | resource | | [aws_kms_key.infrastructure](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_key) | resource | | [aws_lambda_function.ecs_cluster_infrastructure_draining](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_lambda_function.ecs_cluster_infrastructure_ecs_asg_diff_metric](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | | [aws_lambda_function.ecs_cluster_infrastructure_pending_task_metric](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | | [aws_lambda_permission.ecs_cluster_infrastructure_draining_allow_sns_execution](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [aws_lambda_permission.ecs_cluster_infrastructure_ecs_asg_diff_metric_allow_cloudwatch_execution](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_lambda_permission.ecs_cluster_infrastructure_pending_task_metric_allow_cloudwatch_execution](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_launch_template.infrastructure_ecs_cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/launch_template) | resource | | [aws_lb_listener_certificate.service_shared_alb_certificate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lb_listener_certificate) | resource | @@ -321,6 +338,7 @@ This project creates and manages resources within an AWS account for infrastruct | 
[random_password.infrastructure_ecs_cluster_service_cloudfront_bypass_protection_secret](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | | [random_password.infrastructure_rds_root](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | | [archive_file.ecs_cluster_infrastructure_draining_lambda](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | +| [archive_file.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | | [archive_file.ecs_cluster_infrastructure_pending_task_metric_lambda](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | | [aws_ami.ecs_cluster_ami](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | | [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | @@ -351,6 +369,7 @@ This project creates and manages resources within an AWS account for infrastruct | [enable\_cloudformatian\_s3\_template\_store](#input\_enable\_cloudformatian\_s3\_template\_store) | Creates an S3 bucket to store custom CloudFormation templates, which can then be referenced in `custom_cloudformation_stacks`. A user with RW access to the bucket is also created. 
| `bool` | n/a | yes | | [enable\_infrastructure\_ecs\_cluster](#input\_enable\_infrastructure\_ecs\_cluster) | Enable creation of infrastructure ECS cluster, to place ECS services | `bool` | n/a | yes | | [enable\_infrastructure\_ecs\_cluster\_asg\_cpu\_alert](#input\_enable\_infrastructure\_ecs\_cluster\_asg\_cpu\_alert) | Enable a CPU alert for the ECS cluster's Autoscaling Group | `bool` | n/a | yes | +| [enable\_infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert](#input\_enable\_infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert) | Enable the ECS Cluster Container Instance / ASG instance diff alert | `bool` | n/a | yes | | [enable\_infrastructure\_ecs\_cluster\_efs](#input\_enable\_infrastructure\_ecs\_cluster\_efs) | Conditionally create and mount EFS to the ECS cluster instances | `bool` | n/a | yes | | [enable\_infrastructure\_ecs\_cluster\_pending\_task\_alert](#input\_enable\_infrastructure\_ecs\_cluster\_pending\_task\_alert) | Enable the ECS Cluster pending task alert | `bool` | n/a | yes | | [enable\_infrastructure\_ecs\_cluster\_services\_alb\_logs](#input\_enable\_infrastructure\_ecs\_cluster\_services\_alb\_logs) | Enable Infrastructure ECS cluster services ALB logs | `bool` | n/a | yes | @@ -374,6 +393,12 @@ This project creates and manages resources within an AWS account for infrastruct | [infrastructure\_ecs\_cluster\_draining\_lambda\_log\_retention](#input\_infrastructure\_ecs\_cluster\_draining\_lambda\_log\_retention) | Log retention for the ECS cluster draining Lambda | `number` | n/a | yes | | [infrastructure\_ecs\_cluster\_ebs\_docker\_storage\_volume\_size](#input\_infrastructure\_ecs\_cluster\_ebs\_docker\_storage\_volume\_size) | Size of EBS volume for Docker storage on the infrastructure ECS instances | `number` | n/a | yes | | [infrastructure\_ecs\_cluster\_ebs\_docker\_storage\_volume\_type](#input\_infrastructure\_ecs\_cluster\_ebs\_docker\_storage\_volume\_type) | Type of EBS volume for Docker storage on the infrastructure ECS 
instances (eg. gp3) | `string` | n/a | yes | +| [infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert\_evaluation\_periods](#input\_infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert\_evaluation\_periods) | Evaluation periods for the ECS cluster's Container Instance / ASG instance diff alert | `number` | n/a | yes | +| [infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert\_opsgenie](#input\_infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert\_opsgenie) | Enable Opsgenie alerts for the ECS cluster's Container Instance / ASG instance diff alert | `bool` | n/a | yes | +| [infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert\_period](#input\_infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert\_period) | Period (in seconds) for the ECS cluster's Container Instance / ASG instance diff alert | `number` | n/a | yes | +| [infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert\_slack](#input\_infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert\_slack) | Enable Slack alerts for the ECS cluster's Container Instance / ASG instance diff alert | `bool` | n/a | yes | +| [infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert\_threshold](#input\_infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_alert\_threshold) | Threshold (Number of instances that are in the ASG, but not in the ECS Cluster) for the ECS cluster's Container Instance / ASG instance diff alert | `number` | n/a | yes | +| [infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_metric\_lambda\_log\_retention](#input\_infrastructure\_ecs\_cluster\_ecs\_asg\_diff\_metric\_lambda\_log\_retention) | Log retention for the ECS cluster Container Instance / ASG instance diff metric Lambda | `number` | n/a | yes | | [infrastructure\_ecs\_cluster\_instance\_type](#input\_infrastructure\_ecs\_cluster\_instance\_type) | The instance type for EC2 instances launched in the ECS cluster | `string` | n/a | yes | | [infrastructure\_ecs\_cluster\_max\_instance\_lifetime](#input\_infrastructure\_ecs\_cluster\_max\_instance\_lifetime) | Maximum lifetime in seconds of an instance within the ECS cluster |
`number` | n/a | yes | | [infrastructure\_ecs\_cluster\_max\_size](#input\_infrastructure\_ecs\_cluster\_max\_size) | Maximum number of instances for the ECS cluster | `number` | n/a | yes | diff --git a/ecs-cluster-infrastructure-alert-ecs-asg-diff.tf b/ecs-cluster-infrastructure-alert-ecs-asg-diff.tf new file mode 100644 index 0000000..63f275a --- /dev/null +++ b/ecs-cluster-infrastructure-alert-ecs-asg-diff.tf @@ -0,0 +1,25 @@ +resource "aws_cloudwatch_metric_alarm" "infrastructure_ecs_cluster_ecs_asg_diff" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + alarm_name = "${local.resource_prefix}-infrastructure-ecs-cluster-infrastructure-ecs-asg-diff" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = local.infrastructure_ecs_cluster_ecs_asg_diff_alert_evaluation_periods + metric_name = "ContainerInstanceAsgInstanceDiff" + namespace = "ECS" + period = local.infrastructure_ecs_cluster_ecs_asg_diff_alert_period + statistic = "Maximum" + threshold = local.infrastructure_ecs_cluster_ecs_asg_diff_alert_threshold + alarm_description = "Container Instance / ASG Instance Difference for ${aws_ecs_cluster.infrastructure[0].name} Cluster" + actions_enabled = "true" + alarm_actions = concat( + local.infrastructure_ecs_cluster_ecs_asg_diff_alert_slack ? [data.aws_sns_topic.infrastructure_slack_sns_topic[0].arn] : [], + local.infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie ? [data.aws_sns_topic.infrastructure_opsgenie_sns_topic[0].arn] : [] + ) + ok_actions = concat( + local.infrastructure_ecs_cluster_ecs_asg_diff_alert_slack ? [data.aws_sns_topic.infrastructure_slack_sns_topic[0].arn] : [], + local.infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie ? 
[data.aws_sns_topic.infrastructure_opsgenie_sns_topic[0].arn] : [] + ) + dimensions = { + ClusterName = aws_ecs_cluster.infrastructure[0].name + } +} diff --git a/ecs-cluster-infrastructure-ecs-asg-diff-lambda.tf b/ecs-cluster-infrastructure-ecs-asg-diff-lambda.tf new file mode 100644 index 0000000..0772bc1 --- /dev/null +++ b/ecs-cluster-infrastructure-ecs-asg-diff-lambda.tf @@ -0,0 +1,181 @@ +resource "aws_cloudwatch_log_group" "ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda_log_group" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + name = "/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-ecs-asg-diff-metric" + kms_key_id = local.infrastructure_kms_encryption ? aws_kms_key.infrastructure[0].arn : null + retention_in_days = local.infrastructure_ecs_cluster_ecs_asg_diff_metric_lambda_log_retention +} + +resource "aws_iam_role" "ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + name = "${local.resource_prefix}-${substr(sha512("ecs-cluster-infrastructure-ecs-asg-diff-metric"), 0, 6)}" + description = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric" + assume_role_policy = templatefile( + "${path.root}/policies/assume-roles/service-principle-standard.json.tpl", + { services = jsonencode(["lambda.amazonaws.com"]) } + ) +} + +resource "aws_iam_policy" "ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 
1 : 0 + + name = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric" + policy = templatefile( + "${path.root}/policies/lambda-default.json.tpl", + { + region = local.aws_region + account_id = local.aws_account_id + function_name = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-ecs-asg-diff-metric" + } + ) +} + +resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + role = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].name + policy_arn = aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].arn +} + +resource "aws_iam_policy" "ecs_cluster_infrastructure_ecs_asg_diff_metric_cloudwatch_put_metric_data_lambda" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + name = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric-cloudwatch-put-metric-data" + policy = templatefile( + "${path.root}/policies/cloudwatch-put-metric-data.json.tpl", + { + region = local.aws_region + account_id = local.aws_account_id + namespaces = ["ECS"] + } + ) +} + +resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_ecs_asg_diff_cloudwatch_metric_put_metric_data_lambda" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + role = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].name + policy_arn = aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_cloudwatch_put_metric_data_lambda[0].arn +} + +resource "aws_iam_policy" "ecs_cluster_infrastructure_ecs_asg_diff_metric_ecs_describe_cluster_lambda" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 
1 : 0 + + name = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric-ecs-describe-cluster" + policy = templatefile( + "${path.root}/policies/ecs-describe-cluster.json.tpl", + { + region = local.aws_region + account_id = local.aws_account_id + cluster_names = [local.infrastructure_ecs_cluster_name] + } + ) +} + +resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_ecs_asg_diff_metric_ecs_describe_cluster_lambda" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + role = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].name + policy_arn = aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_ecs_describe_cluster_lambda[0].arn +} + +resource "aws_iam_policy" "ecs_cluster_infrastructure_ecs_asg_diff_metric_asg_describe_asg_lambda" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + name = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric-asg-describe-asg" + policy = templatefile( + "${path.root}/policies/asg-describe-asg.json.tpl", {} + ) +} + +resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_ecs_asg_diff_metric_asg_describe_asg_lambda" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + role = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].name + policy_arn = aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_asg_describe_asg_lambda[0].arn +} + +resource "aws_iam_policy" "ecs_cluster_infrastructure_ecs_asg_diff_metric_kms_encrypt" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert && local.infrastructure_kms_encryption ? 
1 : 0 + + name = "${local.resource_prefix}-ecs-cluster-infrastructure-ecs-asg-diff-metric-kms-encrypt" + policy = templatefile( + "${path.root}/policies/kms-encrypt.json.tpl", + { kms_key_arn = aws_kms_key.infrastructure[0].arn } + ) +} + +resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_ecs_asg_diff_kms_encrypt" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert && local.infrastructure_kms_encryption ? 1 : 0 + + role = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].name + policy_arn = aws_iam_policy.ecs_cluster_infrastructure_ecs_asg_diff_metric_kms_encrypt[0].arn +} + +data "archive_file" "ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + type = "zip" + source_dir = "lambdas/ecs-asg-diff-metric" + output_path = "lambdas/.zip-cache/ecs-asg-diff-metric.zip" +} + +resource "aws_lambda_function" "ecs_cluster_infrastructure_ecs_asg_diff_metric" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ?
1 : 0 + + filename = data.archive_file.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].output_path + function_name = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-ecs-asg-diff-metric" + description = "${local.resource_prefix} ECS Cluster Infrastructure Container Instance / ASG Instance Difference Metric" + handler = "function.lambda_handler" + runtime = "python3.11" + role = aws_iam_role.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].arn + source_code_hash = data.archive_file.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda[0].output_base64sha256 + memory_size = 128 + package_type = "Zip" + timeout = 900 + + environment { + variables = { + ecsClusterName = local.infrastructure_ecs_cluster_name + asgName = aws_autoscaling_group.infrastructure_ecs_cluster[0].name + } + } + + tracing_config { + mode = "Active" + } + + depends_on = [ + aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_metric_lambda, + aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_cloudwatch_metric_put_metric_data_lambda, + aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_metric_ecs_describe_cluster_lambda, aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_metric_asg_describe_asg_lambda, + aws_iam_role_policy_attachment.ecs_cluster_infrastructure_ecs_asg_diff_kms_encrypt + ] +} + +resource "aws_cloudwatch_event_rule" "ecs_cluster_infrastructure_ecs_asg_diff_metric_1_min_cron" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + name = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-ecs-asg-diff-metric-1-min" + description = "Triggers the ${aws_lambda_function.ecs_cluster_infrastructure_ecs_asg_diff_metric[0].function_name} Lambda every 1 minute" + schedule_expression = "rate(1 minute)" +} + +resource "aws_cloudwatch_event_target" "ecs_cluster_infrastructure_ecs_asg_diff_metric_1_min_cron" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ?
1 : 0 + + rule = aws_cloudwatch_event_rule.ecs_cluster_infrastructure_ecs_asg_diff_metric_1_min_cron[0].name + target_id = "lambda" + arn = aws_lambda_function.ecs_cluster_infrastructure_ecs_asg_diff_metric[0].arn +} + +resource "aws_lambda_permission" "ecs_cluster_infrastructure_ecs_asg_diff_metric_allow_cloudwatch_execution" { + count = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert ? 1 : 0 + + statement_id = "AllowExecutionFromCloudWatch" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.ecs_cluster_infrastructure_ecs_asg_diff_metric[0].function_name + principal = "events.amazonaws.com" + source_arn = aws_cloudwatch_event_rule.ecs_cluster_infrastructure_ecs_asg_diff_metric_1_min_cron[0].arn +} diff --git a/kms-infrastructure.tf b/kms-infrastructure.tf index 83c4cfb..67ddcb3 100644 --- a/kms-infrastructure.tf +++ b/kms-infrastructure.tf @@ -29,6 +29,11 @@ resource "aws_kms_key" "infrastructure" { { log_group_arn = local.enable_infrastructure_ecs_cluster_pending_task_alert && local.infrastructure_kms_encryption ? "arn:aws:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric" : "" } + )}${local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert && local.infrastructure_kms_encryption ? "," : ""} + ${templatefile("${path.root}/policies/kms-key-policy-statements/cloudwatch-logs-allow.json.tpl", + { + log_group_arn = local.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert && local.infrastructure_kms_encryption ? "arn:aws:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-ecs-asg-diff-metric" : "" + } )}${length(local.infrastructure_ecs_cluster_services) > 0 && local.infrastructure_kms_encryption ? 
"," : ""} ${templatefile("${path.root}/policies/kms-key-policy-statements/cloudwatch-logs-allow.json.tpl", { diff --git a/lambdas/ecs-asg-diff-metric/function.py b/lambdas/ecs-asg-diff-metric/function.py new file mode 100644 index 0000000..24ac07c --- /dev/null +++ b/lambdas/ecs-asg-diff-metric/function.py @@ -0,0 +1,51 @@ +import boto3 +import os + +CLUSTER_NAME = os.environ['ecsClusterName'] +ASG_NAME = os.environ['asgName'] + +ecs = boto3.client('ecs') +autoscaling = boto3.client('autoscaling') +cloudwatch = boto3.client('cloudwatch') + +def lambda_handler(event, context): + ecs_response = ecs.describe_clusters( + clusters=[CLUSTER_NAME], + ) + + if not ecs_response['clusters']: + return {'statusCode': 200, 'body': 'No ECS cluster found with the given name.'} + + ecs_instance_count = ecs_response['clusters'][0]['registeredContainerInstancesCount'] + + asg_response = autoscaling.describe_auto_scaling_groups( + AutoScalingGroupNames=[ASG_NAME], + ) + + if not asg_response['AutoScalingGroups']: + return {'statusCode': 200, 'body': 'No Auto Scaling Group found with the given name.'} + + asg_instance_count = len(asg_response['AutoScalingGroups'][0]['Instances']) + + instance_diff = asg_instance_count - ecs_instance_count + + cloudwatch.put_metric_data( + Namespace="ECS", + MetricData=[ + { + 'MetricName': "ContainerInstanceAsgInstanceDiff", + 'Dimensions': [ + { + 'Name': 'ClusterName', + 'Value': CLUSTER_NAME + }, + ], + 'Value': instance_diff, + 'Unit': 'Count' + }, + ] + ) + + return { + 'statusCode': 200, + 'body': f'Container Instance / ASG Instance difference ({instance_diff}) calculated and published successfully.'} diff --git a/locals.tf b/locals.tf index 7bf6e4c..2ff71bc 100644 --- a/locals.tf +++ b/locals.tf @@ -15,11 +15,13 @@ locals { infrastructure_opsgenie_sns_topic_name = "${local.project_name}-cloudwatch-opsgenie-alerts" infrastructure_slack_sns_topic_in_use = ( local.infrastructure_ecs_cluster_asg_cpu_alert_slack || - 
local.infrastructure_ecs_cluster_pending_task_alert_slack + local.infrastructure_ecs_cluster_pending_task_alert_slack || + local.infrastructure_ecs_cluster_ecs_asg_diff_alert_slack ) infrastructure_opsgenie_sns_topic_in_use = ( local.infrastructure_ecs_cluster_asg_cpu_alert_opsgenie || - local.infrastructure_ecs_cluster_pending_task_alert_opsgenie + local.infrastructure_ecs_cluster_pending_task_alert_opsgenie || + local.infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie ) enable_infrastructure_logs_bucket = ( @@ -156,6 +158,13 @@ locals { infrastructure_ecs_cluster_pending_task_alert_threshold = var.infrastructure_ecs_cluster_pending_task_alert_threshold infrastructure_ecs_cluster_pending_task_alert_slack = var.infrastructure_ecs_cluster_pending_task_alert_slack infrastructure_ecs_cluster_pending_task_alert_opsgenie = var.infrastructure_ecs_cluster_pending_task_alert_opsgenie + enable_infrastructure_ecs_cluster_ecs_asg_diff_alert = var.enable_infrastructure_ecs_cluster_ecs_asg_diff_alert && local.enable_infrastructure_ecs_cluster + infrastructure_ecs_cluster_ecs_asg_diff_metric_lambda_log_retention = var.infrastructure_ecs_cluster_ecs_asg_diff_metric_lambda_log_retention + infrastructure_ecs_cluster_ecs_asg_diff_alert_evaluation_periods = var.infrastructure_ecs_cluster_ecs_asg_diff_alert_evaluation_periods + infrastructure_ecs_cluster_ecs_asg_diff_alert_period = var.infrastructure_ecs_cluster_ecs_asg_diff_alert_period + infrastructure_ecs_cluster_ecs_asg_diff_alert_threshold = var.infrastructure_ecs_cluster_ecs_asg_diff_alert_threshold + infrastructure_ecs_cluster_ecs_asg_diff_alert_slack = var.infrastructure_ecs_cluster_ecs_asg_diff_alert_slack + infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie = var.infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie infrastructure_ecs_cluster_wafs = var.infrastructure_ecs_cluster_wafs infrastructure_ecs_cluster_enable_ssm_dhmc = local.enable_infrastructure_ecs_cluster ? 
data.external.ssm_dhmc_setting[0].result.setting_value != "$None" : false infrastructure_ecs_cluster_user_data = base64encode( diff --git a/policies/asg-describe-asg.json.tpl b/policies/asg-describe-asg.json.tpl new file mode 100644 index 0000000..441f778 --- /dev/null +++ b/policies/asg-describe-asg.json.tpl @@ -0,0 +1,12 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "autoscaling:DescribeAutoScalingGroups" + ], + "Effect": "Allow", + "Resource": "*" + } + ] +} diff --git a/variables.tf b/variables.tf index 8a94f8a..25c343d 100644 --- a/variables.tf +++ b/variables.tf @@ -378,6 +378,41 @@ variable "infrastructure_ecs_cluster_pending_task_alert_opsgenie" { type = bool } +variable "enable_infrastructure_ecs_cluster_ecs_asg_diff_alert" { + description = "Enable the ECS Cluster Container Instance / ASG instance diff alert" + type = bool +} + +variable "infrastructure_ecs_cluster_ecs_asg_diff_metric_lambda_log_retention" { + description = "Log retention for the ECS cluster Container Instance / ASG instance diff metric Lambda" + type = number +} + +variable "infrastructure_ecs_cluster_ecs_asg_diff_alert_evaluation_periods" { + description = "Evaluation periods for the ECS cluster's Container Instance / ASG instance diff alert" + type = number +} + +variable "infrastructure_ecs_cluster_ecs_asg_diff_alert_period" { + description = "Period (in seconds) for the ECS cluster's Container Instance / ASG instance diff alert" + type = number +} + +variable "infrastructure_ecs_cluster_ecs_asg_diff_alert_threshold" { + description = "Threshold (Number of instances that are in the ASG, but not in the ECS Cluster) for the ECS cluster's Container Instance / ASG instance diff alert" + type = number +} + +variable "infrastructure_ecs_cluster_ecs_asg_diff_alert_slack" { + description = "Enable Slack alerts for the ECS cluster's Container Instance / ASG instance diff alert" + type = bool +} + +variable "infrastructure_ecs_cluster_ecs_asg_diff_alert_opsgenie" { + description = "Enable Opsgenie alerts for the
ECS cluster's Container Instance / ASG instance diff alert" + type = bool +} + variable "infrastructure_ecs_cluster_wafs" { description = "Map of WAF ACLs to craete, which can be used with service CloudFront distributions" type = map(object({