From 542a81932c5a004baac8f0c60c6a64a6b5431bee Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Tue, 25 Jun 2024 16:08:55 +0100 Subject: [PATCH] Conditionally create ECS Cluster Pending Task Alert * Creates a lambda function, triggered every minute, which puts metric data to 'ECS/PendingTasksCount' - The number of pending tasks on the infrastructure ECS Cluster * Creates CloudWatch alarm, which can be configured to alert to Slack and/or Opsgenie, based on EvaluationPeriod, Period and Threshold (Number of pending tasks) --- README.md | 23 +++ ...ster-infrastructure-alert-pending-tasks.tf | 25 +++ ...frastructure-pending-task-metric-lambda.tf | 164 ++++++++++++++++++ kms-infrastructure.tf | 5 + lambdas/ecs-pending-task-metric/function.py | 36 ++++ locals.tf | 37 ++-- policies/cloudwatch-put-metric-data.json.tpl | 21 +++ policies/ecs-describe-cluster.json.tpl | 16 ++ variables.tf | 35 ++++ 9 files changed, 350 insertions(+), 12 deletions(-) create mode 100644 ecs-cluster-infrastructure-alert-pending-tasks.tf create mode 100644 ecs-cluster-infrastructure-pending-task-metric-lambda.tf create mode 100644 lambdas/ecs-pending-task-metric/function.py create mode 100644 policies/cloudwatch-put-metric-data.json.tpl create mode 100644 policies/ecs-describe-cluster.json.tpl diff --git a/README.md b/README.md index 93bdba4..3602eb0 100644 --- a/README.md +++ b/README.md @@ -59,15 +59,19 @@ This project creates and manages resources within an AWS account for infrastruct | [aws_cloudfront_distribution.infrastructure_ecs_cluster_service_cloudfront](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudfront_distribution) | resource | | [aws_cloudfront_function.custom_s3_buckets_viewer_request](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudfront_function) | resource | | 
[aws_cloudfront_origin_access_control.custom_s3_buckets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudfront_origin_access_control) | resource | +| [aws_cloudwatch_event_rule.ecs_cluster_infrastructure_pending_task_metric_1_min_cron](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.infrastructure_ecs_cluster_service_ecr_scan](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.infrastructure_ecs_cluster_service_scheduled_task](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_target.ecr_scan_event_target](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_event_target.ecs_cluster_infrastructure_pending_task_metric_1_min_cron](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_event_target.infrastructure_ecs_cluster_service_scheduled_task](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_log_group.ecs_cluster_infrastructure_draining_lambda_log_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| [aws_cloudwatch_log_group.ecs_cluster_infrastructure_pending_task_metric_lambda_log_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_log_group.infrastructure_ecs_cluster_service](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | 
[aws_cloudwatch_log_group.infrastructure_rds_exports](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_log_group.infrastructure_vpc_flow_logs](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_metric_alarm.infrastructure_ecs_cluster_asg_cpu](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | +| [aws_cloudwatch_metric_alarm.infrastructure_ecs_cluster_pending_task](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_codebuild_project.infrastructure_ecs_cluster_service_build](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/codebuild_project) | resource | | [aws_codedeploy_app.infrastructure_ecs_cluster_service_blue_green](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/codedeploy_app) | resource | | [aws_codedeploy_deployment_config.infrastructure_ecs_cluster_service_blue_green](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/codedeploy_deployment_config) | resource | @@ -108,6 +112,10 @@ This project creates and manages resources within an AWS account for infrastruct | [aws_iam_policy.ecs_cluster_infrastructure_draining_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.ecs_cluster_infrastructure_draining_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.ecs_cluster_infrastructure_draining_sns_publish_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| 
[aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_cloudwatch_put_metric_data_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.infrastructure_ecs_cluster_autoscaling_lifecycle_termination_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.infrastructure_ecs_cluster_autoscaling_lifecycle_termination_sns_publish](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.infrastructure_ecs_cluster_ec2_ecs](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | @@ -132,6 +140,7 @@ This project creates and manages resources within an AWS account for infrastruct | [aws_iam_policy.infrastructure_ecs_cluster_ssm_service_setting_rw](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.infrastructure_rds_monitoring](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_role.ecs_cluster_infrastructure_draining_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| 
[aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.infrastructure_ecs_cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.infrastructure_ecs_cluster_autoscaling_lifecycle_termination](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.infrastructure_ecs_cluster_service_blue_green_codedeploy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | @@ -147,6 +156,10 @@ This project creates and manages resources within an AWS account for infrastruct | [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_draining_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_draining_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_draining_sns_publish_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_cloudwatch_metric_put_metric_data_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| 
[aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_metric_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.infrastructure_ecs_cluster_autoscaling_lifecycle_termination_kms_encrypt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.infrastructure_ecs_cluster_autoscaling_lifecycle_termination_sns_publish](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.infrastructure_ecs_cluster_ec2_ecs](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | @@ -177,7 +190,9 @@ This project creates and manages resources within an AWS account for infrastruct | [aws_kms_key.custom_s3_buckets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_key) | resource | | [aws_kms_key.infrastructure](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_key) | resource | | [aws_lambda_function.ecs_cluster_infrastructure_draining](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_lambda_function.ecs_cluster_infrastructure_pending_task_metric](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | | [aws_lambda_permission.ecs_cluster_infrastructure_draining_allow_sns_execution](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| 
[aws_lambda_permission.ecs_cluster_infrastructure_pending_task_metric_allow_cloudwatch_execution](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_launch_template.infrastructure_ecs_cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/launch_template) | resource | | [aws_lb_listener_certificate.service_shared_alb_certificate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lb_listener_certificate) | resource | | [aws_nat_gateway.infrastructure](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/nat_gateway) | resource | @@ -306,6 +321,7 @@ This project creates and manages resources within an AWS account for infrastruct | [random_password.infrastructure_ecs_cluster_service_cloudfront_bypass_protection_secret](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | | [random_password.infrastructure_rds_root](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | | [archive_file.ecs_cluster_infrastructure_draining_lambda](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | +| [archive_file.ecs_cluster_infrastructure_pending_task_metric_lambda](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | | [aws_ami.ecs_cluster_ami](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | | [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_cloudfront_cache_policy.managed_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/cloudfront_cache_policy) | data source | @@ -336,6 +352,7 @@ This project creates and manages resources within an AWS account for infrastruct 
| [enable\_infrastructure\_ecs\_cluster](#input\_enable\_infrastructure\_ecs\_cluster) | Enable creation of infrastructure ECS cluster, to place ECS services | `bool` | n/a | yes | | [enable\_infrastructure\_ecs\_cluster\_asg\_cpu\_alert](#input\_enable\_infrastructure\_ecs\_cluster\_asg\_cpu\_alert) | Enable a CPU alert for the ECS cluster's Autoscaling Group | `bool` | n/a | yes | | [enable\_infrastructure\_ecs\_cluster\_efs](#input\_enable\_infrastructure\_ecs\_cluster\_efs) | Conditionally create and mount EFS to the ECS cluster instances | `bool` | n/a | yes | +| [enable\_infrastructure\_ecs\_cluster\_pending\_task\_alert](#input\_enable\_infrastructure\_ecs\_cluster\_pending\_task\_alert) | Enable the ECS Cluster pending task alert | `bool` | n/a | yes | | [enable\_infrastructure\_ecs\_cluster\_services\_alb\_logs](#input\_enable\_infrastructure\_ecs\_cluster\_services\_alb\_logs) | Enable Infrastructure ECS cluster services ALB logs | `bool` | n/a | yes | | [enable\_infrastructure\_route53\_hosted\_zone](#input\_enable\_infrastructure\_route53\_hosted\_zone) | Creates a Route53 hosted zone, where DNS records will be created for resources launched within this module. | `bool` | n/a | yes | | [enable\_infrastructure\_vpc\_transfer\_s3\_bucket](#input\_enable\_infrastructure\_vpc\_transfer\_s3\_bucket) | Enable VPC transfer S3 bucket. 
This allows uploading/downloading files from resources within the infrastructure VPC | `bool` | n/a | yes | @@ -361,6 +378,12 @@ This project creates and manages resources within an AWS account for infrastruct | [infrastructure\_ecs\_cluster\_max\_instance\_lifetime](#input\_infrastructure\_ecs\_cluster\_max\_instance\_lifetime) | Maximum lifetime in seconds of an instance within the ECS cluster | `number` | n/a | yes | | [infrastructure\_ecs\_cluster\_max\_size](#input\_infrastructure\_ecs\_cluster\_max\_size) | Maximum number of instances for the ECS cluster | `number` | n/a | yes | | [infrastructure\_ecs\_cluster\_min\_size](#input\_infrastructure\_ecs\_cluster\_min\_size) | Minimum number of instances for the ECS cluster | `number` | n/a | yes | +| [infrastructure\_ecs\_cluster\_pending\_task\_alert\_evaluation\_periods](#input\_infrastructure\_ecs\_cluster\_pending\_task\_alert\_evaluation\_periods) | Evaluation periods for the ECS cluster's Pending Task alert | `number` | n/a | yes | +| [infrastructure\_ecs\_cluster\_pending\_task\_alert\_opsgenie](#input\_infrastructure\_ecs\_cluster\_pending\_task\_alert\_opsgenie) | Enable Opsgenie alerts for the ECS cluster's Pending Task alert | `bool` | n/a | yes | +| [infrastructure\_ecs\_cluster\_pending\_task\_alert\_period](#input\_infrastructure\_ecs\_cluster\_pending\_task\_alert\_period) | Period (in seconds) for the ECS cluster's Pending Task alert | `number` | n/a | yes | +| [infrastructure\_ecs\_cluster\_pending\_task\_alert\_slack](#input\_infrastructure\_ecs\_cluster\_pending\_task\_alert\_slack) | Enable Slack alerts for the ECS cluster's Pending Task alert | `bool` | n/a | yes | +| [infrastructure\_ecs\_cluster\_pending\_task\_alert\_threshold](#input\_infrastructure\_ecs\_cluster\_pending\_task\_alert\_threshold) | Threshold (Number of pending tasks) for the ECS cluster's Pending Task alert | `number` | n/a | yes | +| 
[infrastructure\_ecs\_cluster\_pending\_task\_metric\_lambda\_log\_retention](#input\_infrastructure\_ecs\_cluster\_pending\_task\_metric\_lambda\_log\_retention) | Log retention for the ECS cluster pending task metric Lambda | `number` | n/a | yes | | [infrastructure\_ecs\_cluster\_publicly\_avaialble](#input\_infrastructure\_ecs\_cluster\_publicly\_avaialble) | Conditionally launch the ECS cluster EC2 instances into the Public subnet | `bool` | n/a | yes | | [infrastructure\_ecs\_cluster\_service\_defaults](#input\_infrastructure\_ecs\_cluster\_service\_defaults) | Default values for ECS Cluster Services |
object({
github_v1_source = optional(bool, null)
github_v1_oauth_token = optional(string, null)
codestar_connection_arn = optional(string, null)
github_owner = optional(string, null)
github_repo = optional(string, null)
github_track_revision = optional(string, null)
buildspec = optional(string, null)
buildspec_from_github_repo = optional(bool, null)
ecr_scan_target_sns_topic_arn = optional(string, null)
deployment_type = optional(string, null)
enable_cloudwatch_logs = optional(bool, null)
cloudwatch_logs_retention = optional(number, null)
enable_execute_command = optional(bool, null)
deregistration_delay = optional(number, null)
custom_policies = optional(map(object({
description = string
policy = object({
Version = string
Statement = list(object({
Action = list(string)
Effect = string
Resource = list(string)
}))
})
})), {})
container_entrypoint = optional(list(string), null)
container_port = optional(number, null)
container_volumes = optional(list(map(string)), null)
container_extra_hosts = optional(list(map(string)), null)
container_count = optional(number, null)
container_heath_check_path = optional(string, null)
container_heath_grace_period = optional(number, null)
scheduled_tasks = optional(map(object({
entrypoint = optional(list(string), null)
schedule_expression = string
})), {})
domain_names = optional(list(string), null)
enable_cloudfront = optional(bool, null)
cloudfront_tls_certificate_arn = optional(string, null)
cloudfront_access_logging_enabled = optional(bool, null)
cloudfront_bypass_protection_enabled = optional(bool, null)
cloudfront_bypass_protection_excluded_domains = optional(list(string), null)
cloudfront_origin_shield_enabled = optional(bool, null)
cloudfront_managed_cache_policy = optional(string, null)
cloudfront_managed_origin_request_policy = optional(string, null)
cloudfront_managed_response_headers_policy = optional(string, null)
cloudfront_waf_association = optional(string, null)
alb_tls_certificate_arn = optional(string, null)
})
| n/a | yes | | [infrastructure\_ecs\_cluster\_services](#input\_infrastructure\_ecs\_cluster\_services) | Map of ECS Cluster Services (The key will be the service name). Values in here will override `infrastructure_ecs_cluster_service_defaults` values if set."
{
service-name = {
github\_v1\_source: Conditionally use GitHubV1 for the CodePipeline source (CodeStar will be used by default)
github\_v1\_oauth\_token: If `github_v1_source` is set to true, provide the GitHub OAuthToken here
codestar\_connection\_arn: The CodeStar Connection ARN to use in the CodePipeline source
github\_owner: The GitHub Owner of the repository to be pulled by the CodePipeline source
github\_repo: The GitHub repo name to be pulled by the CodePipeline source
github\_track\_revision: The branch/revision of the GitHub repository to be pulled by the CodePipeline source
buildspec: The filename of the buildspec to use for the CodePipeline build phase, stored within the 'codepipeline buildspec store' S3 bucket
buildspec\_from\_github\_repo: Conditionally use the 'buildspec' filename stored within the GitHub repo as the buildspec
ecr\_scan\_target\_sns\_topic\_arn: An SNS topic ARN to publish ECR scan results to
deployment\_type: The service deployment type - Can be one of 'rolling' or 'blue-green'
enable\_cloudwatch\_logs: Conditionally enable cloudwatch logs for the service
cloudwatch\_logs\_retention: CloudWatch log retention in days
enable\_execute\_command: Enable Amazon ECS Exec to directly interact with containers
deregistration\_delay: Amount time for Elastic Load Balancing to wait before changing the state of a deregistering target from draining to unused
custom\_policies: Map of custom policies to attach to the service task role (eg. { policy-name = { description = \"my custom policy\", policy = { Version = \"2012-10-17\", Statement = [] } } })
container\_entrypoint: The container entrypoint
container\_port: The service container port
container\_volumes: List of maps containing volume mappings eg. [ { "name" = "my-volume", "host\_path" = "/mnt/efs/my-dir", "container\_path" = "/mnt/my-dir" } ]
container\_extra\_hosts: List of maps containing extra hosts eg. [ { "hostname" = "my.host", "ip\_address" = "10.1.2.3" } ]
container\_count: Number of containers to launch for the service
container\_heath\_check\_path: Destination for the health check request
container\_heath\_grace\_period: Seconds to ignore failing load balancer health checks on newly instantiated tasks to prevent premature shutdown
scheduled\_tasks: A map of scheduled tasks that use the same image as the service defined eg. { "name" => { "entrypoint" = ["bundle", "exec", "run\_jobs"], "schedule\_expression" = "cron(* * * * ? *)" } }
domain\_names: Domain names to assign to CloudFront aliases, and the Application Load Balancer's `host_header` condition
enable\_cloudfront: Enable cloadfront for the service
cloudfront\_tls\_certificate\_arn: Certificate ARN to attach to CloudFront - must contain the names provided in `domain_names`
cloudfront\_access\_logging\_enabled: Enable access logging for the distribution to the infrastructure S3 logs bucket
cloudfront\_bypass\_protection\_enabled: This adds a secret header at the CloudFront level, which is then checked by the ALB listener rules. Requests are only forwarded if the header matches, preventing requests going directly to the ALB.
cloudfront\_bypass\_protection\_excluded\_domains: A list of domains to exclude from the bypass protection
cloudfront\_origin\_shield\_enabled: Enable CloudFront Origin Shield
cloudfront\_managed\_cache\_policy: Conditionally specify a CloudFront Managed Cache Policy for the distribution
cloudfront\_managed\_origin\_request\_policy: Conditionally specify a CloudFront Managed Origin Request Policy for the distribution
cloudfront\_managed\_response\_headers\_policy: Conditionally specify a CloudFront Managed Response Headers Policy for the distribution
cloudfront\_waf\_association: Conditionally associate WAF created via `infrastructure_ecs_cluster_wafs` using the key of the waf configuration
alb\_tls\_certificate\_arn: Certificate ARN to attach to the Application Load Balancer - must contain the names provided in `domain_names`
}
} |
map(object({
github_v1_source = optional(bool, null)
github_v1_oauth_token = optional(string, null)
codestar_connection_arn = optional(string, null)
github_owner = optional(string, null)
github_repo = optional(string, null)
github_track_revision = optional(string, null)
buildspec = optional(string, null)
buildspec_from_github_repo = optional(bool, null)
ecr_scan_target_sns_topic_arn = optional(string, null)
deployment_type = optional(string, null)
enable_cloudwatch_logs = optional(bool, null)
cloudwatch_logs_retention = optional(number, null)
enable_execute_command = optional(bool, null)
deregistration_delay = optional(number, null)
custom_policies = optional(map(object({
description = string
policy = object({
Version = string
Statement = list(object({
Action = list(string)
Effect = string
Resource = list(string)
}))
})
})), {})
container_entrypoint = optional(list(string), null)
container_port = optional(number, null)
container_volumes = optional(list(map(string)), null)
container_extra_hosts = optional(list(map(string)), null)
container_count = optional(number, null)
container_heath_check_path = optional(string, null)
container_heath_grace_period = optional(number, null)
scheduled_tasks = optional(map(object({
entrypoint = list(string)
schedule_expression = string
})), null)
domain_names = optional(list(string), null)
enable_cloudfront = optional(bool, null)
cloudfront_tls_certificate_arn = optional(string, null)
cloudfront_access_logging_enabled = optional(bool, null)
cloudfront_bypass_protection_enabled = optional(bool, null)
cloudfront_bypass_protection_excluded_domains = optional(list(string), null)
cloudfront_origin_shield_enabled = optional(bool, null)
cloudfront_managed_cache_policy = optional(string, null)
cloudfront_managed_origin_request_policy = optional(string, null)
cloudfront_managed_response_headers_policy = optional(string, null)
cloudfront_waf_association = optional(string, null)
alb_tls_certificate_arn = optional(string, null)
}))
| n/a | yes | diff --git a/ecs-cluster-infrastructure-alert-pending-tasks.tf b/ecs-cluster-infrastructure-alert-pending-tasks.tf new file mode 100644 index 0000000..1d898a3 --- /dev/null +++ b/ecs-cluster-infrastructure-alert-pending-tasks.tf @@ -0,0 +1,25 @@ +resource "aws_cloudwatch_metric_alarm" "infrastructure_ecs_cluster_pending_task" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0 + + alarm_name = "${local.resource_prefix}-infrastructure-ecs-cluster-infrastructure-pending-task" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = local.infrastructure_ecs_cluster_pending_task_alert_evaluation_periods + metric_name = "PendingTasksCount" + namespace = "ECS" + period = local.infrastructure_ecs_cluster_pending_task_alert_period + statistic = "Maximum" + threshold = local.infrastructure_ecs_cluster_pending_task_alert_threshold + alarm_description = "Pending Tasks for ${aws_ecs_cluster.infrastructure[0].name} Cluster" + actions_enabled = "true" + alarm_actions = concat( + local.infrastructure_ecs_cluster_pending_task_alert_slack ? [data.aws_sns_topic.infrastructure_slack_sns_topic[0].arn] : [], + local.infrastructure_ecs_cluster_pending_task_alert_opsgenie ? [data.aws_sns_topic.infrastructure_opsgenie_sns_topic[0].arn] : [] + ) + ok_actions = concat( + local.infrastructure_ecs_cluster_pending_task_alert_slack ? [data.aws_sns_topic.infrastructure_slack_sns_topic[0].arn] : [], + local.infrastructure_ecs_cluster_pending_task_alert_opsgenie ? 
[data.aws_sns_topic.infrastructure_opsgenie_sns_topic[0].arn] : [] + ) + dimensions = { + ClusterName = aws_ecs_cluster.infrastructure[0].name + } +} diff --git a/ecs-cluster-infrastructure-pending-task-metric-lambda.tf b/ecs-cluster-infrastructure-pending-task-metric-lambda.tf new file mode 100644 index 0000000..3a7bdf5 --- /dev/null +++ b/ecs-cluster-infrastructure-pending-task-metric-lambda.tf @@ -0,0 +1,164 @@ +resource "aws_cloudwatch_log_group" "ecs_cluster_infrastructure_pending_task_metric_lambda_log_group" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0 + + name = "/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric" + kms_key_id = local.infrastructure_kms_encryption ? aws_kms_key.infrastructure[0].arn : null + retention_in_days = local.infrastructure_ecs_cluster_pending_task_metric_lambda_log_retention +} + +resource "aws_iam_role" "ecs_cluster_infrastructure_pending_task_metric_lambda" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0 + + name = "${local.resource_prefix}-${substr(sha512("ecs-cluster-infrastructure-pending-task-metric"), 0, 6)}" + description = "${local.resource_prefix}-ecs-cluster-infrastructure-pending-task-metric" + assume_role_policy = templatefile( + "${path.root}/policies/assume-roles/service-principle-standard.json.tpl", + { services = jsonencode(["lambda.amazonaws.com"]) } + ) +} + +resource "aws_iam_policy" "ecs_cluster_infrastructure_pending_task_metric_lambda" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 
1 : 0 + + name = "${local.resource_prefix}-ecs-cluster-infrastructure-pending-task-metric" + policy = templatefile( + "${path.root}/policies/lambda-default.json.tpl", + { + region = local.aws_region + account_id = local.aws_account_id + function_name = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric" + } + ) +} + +resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_pending_task_metric_lambda" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0 + + role = aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda[0].name + policy_arn = aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_lambda[0].arn +} + +resource "aws_iam_policy" "ecs_cluster_infrastructure_pending_task_metric_cloudwatch_put_metric_data_lambda" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0 + + name = "${local.resource_prefix}-ecs-cluster-infrastructure-pending-task-metric-cloudwatch-put-metric-data" + policy = templatefile( + "${path.root}/policies/cloudwatch-put-metric-data.json.tpl", + { + region = local.aws_region + account_id = local.aws_account_id + namespaces = ["ECS"] + } + ) +} + +resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_pending_task_cloudwatch_metric_put_metric_data_lambda" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0 + + role = aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda[0].name + policy_arn = aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_cloudwatch_put_metric_data_lambda[0].arn +} + +resource "aws_iam_policy" "ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 
1 : 0 + + name = "${local.resource_prefix}-ecs-cluster-infrastructure-pending-task-metric-ecs-describe-cluster" + policy = templatefile( + "${path.root}/policies/ecs-describe-cluster.json.tpl", + { + region = local.aws_region + account_id = local.aws_account_id + cluster_names = [local.infrastructure_ecs_cluster_name] + } + ) +} + +resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0 + + role = aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda[0].name + policy_arn = aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda[0].arn +} + +resource "aws_iam_policy" "ecs_cluster_infrastructure_pending_task_metric_kms_encrypt" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert && local.infrastructure_kms_encryption ? 1 : 0 + + name = "${local.resource_prefix}-ecs-cluster-infrastructure-pending-task-metric-kms-encrypt" + policy = templatefile( + "${path.root}/policies/kms-encrypt.json.tpl", + { kms_key_arn = aws_kms_key.infrastructure[0].arn } + ) +} + +resource "aws_iam_role_policy_attachment" "ecs_cluster_infrastructure_pending_task_kms_encrypt" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert && local.infrastructure_kms_encryption ? 1 : 0 + + role = aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda[0].name + policy_arn = aws_iam_policy.ecs_cluster_infrastructure_pending_task_metric_kms_encrypt[0].arn +} + +data "archive_file" "ecs_cluster_infrastructure_pending_task_metric_lambda" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 
1 : 0 + + type = "zip" + source_dir = "lambdas/ecs-pending-task-metric" + output_path = "lambdas/.zip-cache/ecs-pending-task-metric.zip" +} + +resource "aws_lambda_function" "ecs_cluster_infrastructure_pending_task_metric" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0 + + filename = data.archive_file.ecs_cluster_infrastructure_pending_task_metric_lambda[0].output_path + function_name = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric" + description = "${local.resource_prefix} ECS Cluster Infrastructure Pending Task Metric" + handler = "function.lambda_handler" + runtime = "python3.11" + role = aws_iam_role.ecs_cluster_infrastructure_pending_task_metric_lambda[0].arn + source_code_hash = data.archive_file.ecs_cluster_infrastructure_pending_task_metric_lambda[0].output_base64sha256 + memory_size = 128 + package_type = "Zip" + timeout = 900 + + environment { + variables = { + ecsClusterName = local.infrastructure_ecs_cluster_name + } + } + + tracing_config { + mode = "Active" + } + + depends_on = [ + aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_metric_lambda, + aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_cloudwatch_metric_put_metric_data_lambda, + aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_metric_ecs_describe_cluster_lambda, + aws_iam_role_policy_attachment.ecs_cluster_infrastructure_pending_task_kms_encrypt + ] +} + +resource "aws_cloudwatch_event_rule" "ecs_cluster_infrastructure_pending_task_metric_1_min_cron" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 
1 : 0 + + name = "${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric-1-min" + description = "Triggers the ${aws_lambda_function.ecs_cluster_infrastructure_pending_task_metric[0].function_name} Lambda every 1 minute" + schedule_expression = "rate(1 minute)" +} + +resource "aws_cloudwatch_event_target" "ecs_cluster_infrastructure_pending_task_metric_1_min_cron" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0 + + rule = aws_cloudwatch_event_rule.ecs_cluster_infrastructure_pending_task_metric_1_min_cron[0].name + target_id = "lambda" + arn = aws_lambda_function.ecs_cluster_infrastructure_pending_task_metric[0].arn +} + +resource "aws_lambda_permission" "ecs_cluster_infrastructure_pending_task_metric_allow_cloudwatch_execution" { + count = local.enable_infrastructure_ecs_cluster_pending_task_alert ? 1 : 0 + + statement_id = "AllowExecutionFromCloudWatch" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.ecs_cluster_infrastructure_pending_task_metric[0].function_name + principal = "events.amazonaws.com" + source_arn = aws_cloudwatch_event_rule.ecs_cluster_infrastructure_pending_task_metric_1_min_cron[0].arn +} diff --git a/kms-infrastructure.tf b/kms-infrastructure.tf index ad03c4b..83c4cfb 100644 --- a/kms-infrastructure.tf +++ b/kms-infrastructure.tf @@ -24,6 +24,11 @@ resource "aws_kms_key" "infrastructure" { { log_group_arn = local.infrastructure_ecs_cluster_draining_lambda_enabled && local.infrastructure_kms_encryption ? "arn:aws:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-draining" : "" } + )}${local.enable_infrastructure_ecs_cluster_pending_task_alert && local.infrastructure_kms_encryption ? 
"," : ""} + ${templatefile("${path.root}/policies/kms-key-policy-statements/cloudwatch-logs-allow.json.tpl", + { + log_group_arn = local.enable_infrastructure_ecs_cluster_pending_task_alert && local.infrastructure_kms_encryption ? "arn:aws:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/lambda/${local.resource_prefix_hash}-ecs-cluster-infrastructure-pending-task-metric" : "" + } )}${length(local.infrastructure_ecs_cluster_services) > 0 && local.infrastructure_kms_encryption ? "," : ""} ${templatefile("${path.root}/policies/kms-key-policy-statements/cloudwatch-logs-allow.json.tpl", { diff --git a/lambdas/ecs-pending-task-metric/function.py b/lambdas/ecs-pending-task-metric/function.py new file mode 100644 index 0000000..4540565 --- /dev/null +++ b/lambdas/ecs-pending-task-metric/function.py @@ -0,0 +1,36 @@ +import boto3 +import os + +CLUSTER_NAME = os.environ['ecsClusterName'] + +def lambda_handler(event, context): + ecs_client = boto3.client('ecs') + cloudwatch_client = boto3.client('cloudwatch') + + response = ecs_client.describe_clusters( + clusters=[CLUSTER_NAME] + ) + + pending_tasks = response['clusters'][0]['pendingTasksCount'] + + response = cloudwatch_client.put_metric_data( + Namespace='ECS', + MetricData=[ + { + 'MetricName': 'PendingTasksCount', + 'Dimensions': [ + { + 'Name': 'ClusterName', + 'Value': CLUSTER_NAME + }, + ], + 'Value': pending_tasks, + 'Unit': 'Count' + }, + ] + ) + + return { + 'statusCode': 200, + 'body': f'Successfully created custom metric for {CLUSTER_NAME} with {pending_tasks} pending tasks' + } diff --git a/locals.tf b/locals.tf index 8af3322..7bf6e4c 100644 --- a/locals.tf +++ b/locals.tf @@ -11,10 +11,16 @@ locals { infrastructure_logging_bucket_retention = var.infrastructure_logging_bucket_retention - infrastructure_slack_sns_topic_name = "${local.project_name}-cloudwatch-slack-alerts" - infrastructure_opsgenie_sns_topic_name = "${local.project_name}-cloudwatch-opsgenie-alerts" - 
infrastructure_slack_sns_topic_in_use = local.infrastructure_ecs_cluster_asg_cpu_alert_slack - infrastructure_opsgenie_sns_topic_in_use = local.infrastructure_ecs_cluster_asg_cpu_alert_opsgenie + infrastructure_slack_sns_topic_name = "${local.project_name}-cloudwatch-slack-alerts" + infrastructure_opsgenie_sns_topic_name = "${local.project_name}-cloudwatch-opsgenie-alerts" + infrastructure_slack_sns_topic_in_use = ( + local.infrastructure_ecs_cluster_asg_cpu_alert_slack || + local.infrastructure_ecs_cluster_pending_task_alert_slack + ) + infrastructure_opsgenie_sns_topic_in_use = ( + local.infrastructure_ecs_cluster_asg_cpu_alert_opsgenie || + local.infrastructure_ecs_cluster_pending_task_alert_opsgenie + ) enable_infrastructure_logs_bucket = ( local.infrastructure_vpc_flow_logs_s3_with_athena || @@ -137,14 +143,21 @@ locals { infrastructure_ecs_cluster_autoscaling_time_based_custom = { for custom in toset(var.infrastructure_ecs_cluster_autoscaling_time_based_custom) : "${custom["min"]}-${custom["max"]} ${custom["cron"]}" => custom } - enable_infrastructure_ecs_cluster_asg_cpu_alert = var.enable_infrastructure_ecs_cluster_asg_cpu_alert && local.enable_infrastructure_ecs_cluster - infrastructure_ecs_cluster_asg_cpu_alert_evaluation_periods = var.infrastructure_ecs_cluster_asg_cpu_alert_evaluation_periods - infrastructure_ecs_cluster_asg_cpu_alert_period = var.infrastructure_ecs_cluster_asg_cpu_alert_period - infrastructure_ecs_cluster_asg_cpu_alert_threshold = var.infrastructure_ecs_cluster_asg_cpu_alert_threshold - infrastructure_ecs_cluster_asg_cpu_alert_slack = var.infrastructure_ecs_cluster_asg_cpu_alert_slack && local.enable_infrastructure_ecs_cluster_asg_cpu_alert - infrastructure_ecs_cluster_asg_cpu_alert_opsgenie = var.infrastructure_ecs_cluster_asg_cpu_alert_opsgenie && local.enable_infrastructure_ecs_cluster_asg_cpu_alert - infrastructure_ecs_cluster_wafs = var.infrastructure_ecs_cluster_wafs - infrastructure_ecs_cluster_enable_ssm_dhmc = 
local.enable_infrastructure_ecs_cluster ? data.external.ssm_dhmc_setting[0].result.setting_value != "$None" : false + enable_infrastructure_ecs_cluster_asg_cpu_alert = var.enable_infrastructure_ecs_cluster_asg_cpu_alert && local.enable_infrastructure_ecs_cluster + infrastructure_ecs_cluster_asg_cpu_alert_evaluation_periods = var.infrastructure_ecs_cluster_asg_cpu_alert_evaluation_periods + infrastructure_ecs_cluster_asg_cpu_alert_period = var.infrastructure_ecs_cluster_asg_cpu_alert_period + infrastructure_ecs_cluster_asg_cpu_alert_threshold = var.infrastructure_ecs_cluster_asg_cpu_alert_threshold + infrastructure_ecs_cluster_asg_cpu_alert_slack = var.infrastructure_ecs_cluster_asg_cpu_alert_slack && local.enable_infrastructure_ecs_cluster_asg_cpu_alert + infrastructure_ecs_cluster_asg_cpu_alert_opsgenie = var.infrastructure_ecs_cluster_asg_cpu_alert_opsgenie && local.enable_infrastructure_ecs_cluster_asg_cpu_alert + enable_infrastructure_ecs_cluster_pending_task_alert = var.enable_infrastructure_ecs_cluster_pending_task_alert && local.enable_infrastructure_ecs_cluster + infrastructure_ecs_cluster_pending_task_metric_lambda_log_retention = var.infrastructure_ecs_cluster_pending_task_metric_lambda_log_retention + infrastructure_ecs_cluster_pending_task_alert_evaluation_periods = var.infrastructure_ecs_cluster_pending_task_alert_evaluation_periods + infrastructure_ecs_cluster_pending_task_alert_period = var.infrastructure_ecs_cluster_pending_task_alert_period + infrastructure_ecs_cluster_pending_task_alert_threshold = var.infrastructure_ecs_cluster_pending_task_alert_threshold + infrastructure_ecs_cluster_pending_task_alert_slack = var.infrastructure_ecs_cluster_pending_task_alert_slack + infrastructure_ecs_cluster_pending_task_alert_opsgenie = var.infrastructure_ecs_cluster_pending_task_alert_opsgenie + infrastructure_ecs_cluster_wafs = var.infrastructure_ecs_cluster_wafs + infrastructure_ecs_cluster_enable_ssm_dhmc = local.enable_infrastructure_ecs_cluster ? 
data.external.ssm_dhmc_setting[0].result.setting_value != "$None" : false infrastructure_ecs_cluster_user_data = base64encode( templatefile("ec2-userdata/ecs-instance.tpl", { docker_storage_volume_device_name = local.infrastructure_ecs_cluster_ebs_docker_storage_volume_device_name, diff --git a/policies/cloudwatch-put-metric-data.json.tpl b/policies/cloudwatch-put-metric-data.json.tpl new file mode 100644 index 0000000..d5515e4 --- /dev/null +++ b/policies/cloudwatch-put-metric-data.json.tpl @@ -0,0 +1,21 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "cloudwatch:PutMetricData" + ], + "Effect": "Allow", + "Resource": "*", + "Condition": { + "ForAnyValue:StringEquals": { + "cloudwatch:namespace": [ + %{for k, v in namespaces} + "${v}"%{if k+1 != length(namespaces)},%{endif} + %{endfor} + ] + } + } + } + ] +} diff --git a/policies/ecs-describe-cluster.json.tpl b/policies/ecs-describe-cluster.json.tpl new file mode 100644 index 0000000..081b435 --- /dev/null +++ b/policies/ecs-describe-cluster.json.tpl @@ -0,0 +1,16 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "ecs:DescribeClusters" + ], + "Effect": "Allow", + "Resource": [ + %{for k, v in cluster_names} + "arn:aws:ecs:${region}:${account_id}:cluster/${v}"%{if k+1 != length(cluster_names)},%{endif} + %{endfor} + ] + } + ] +} diff --git a/variables.tf b/variables.tf index beba023..8a94f8a 100644 --- a/variables.tf +++ b/variables.tf @@ -343,6 +343,41 @@ variable "infrastructure_ecs_cluster_asg_cpu_alert_opsgenie" { type = bool } +variable "enable_infrastructure_ecs_cluster_pending_task_alert" { + description = "Enable the ECS Cluster pending task alert" + type = bool +} + +variable "infrastructure_ecs_cluster_pending_task_metric_lambda_log_retention" { + description = "Log retention for the ECS cluster pending task metric Lambda" + type = number +} + +variable "infrastructure_ecs_cluster_pending_task_alert_evaluation_periods" { + description = "Evaluation periods for the 
ECS cluster's Pending Task alert" + type = number +} + +variable "infrastructure_ecs_cluster_pending_task_alert_period" { + description = "Period (in seconds) for the ECS cluster's Pending Task alert" + type = number +} + +variable "infrastructure_ecs_cluster_pending_task_alert_threshold" { + description = "Threshold (Number of pending tasks) for the ECS cluster's Pending Task alert" + type = number +} + +variable "infrastructure_ecs_cluster_pending_task_alert_slack" { + description = "Enable Slack alerts for the ECS cluster's Pending Task alert" + type = bool +} + +variable "infrastructure_ecs_cluster_pending_task_alert_opsgenie" { + description = "Enable Opsgenie alerts for the ECS cluster's Pending Task alert" + type = bool +} + +variable "infrastructure_ecs_cluster_wafs" { description = "Map of WAF ACLs to craete, which can be used with service CloudFront distributions" type = map(object({