-
Notifications
You must be signed in to change notification settings - Fork 23
/
alarm.tf
191 lines (168 loc) · 8.29 KB
/
alarm.tf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# Alarm on the red cluster status metric.
# Created only when var.cluster_status_red_enable is true (count is 1, otherwise 0).
resource "aws_cloudwatch_metric_alarm" "cluster_status_red" {
  count = var.cluster_status_red_enable ? 1 : 0

  # Identity
  alarm_name        = var.cluster_status_red_alarm_name
  alarm_description = "At least one primary shard and its replicas are not allocated to a node"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "ClusterStatus.red"
  statistic   = "Maximum"

  # Condition: the Maximum over each period is >= threshold for the
  # configured number of evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.cluster_status_red_threshold
  period              = var.cluster_status_red_period
  evaluation_periods  = var.cluster_status_red_evaluation_periods

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on the yellow cluster status metric.
# Created only when var.cluster_status_yellow_enable is true.
resource "aws_cloudwatch_metric_alarm" "cluster_status_yellow" {
  count = var.cluster_status_yellow_enable ? 1 : 0

  # Identity
  alarm_name        = var.cluster_status_yellow_alarm_name
  alarm_description = "At least one replica shard is not allocated to a node"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "ClusterStatus.yellow"
  statistic   = "Maximum"

  # Condition: Maximum >= threshold across the configured evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.cluster_status_yellow_threshold
  period              = var.cluster_status_yellow_period
  evaluation_periods  = var.cluster_status_yellow_evaluation_periods

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on low free storage space.
# Created only when var.low_storage_space_enable is true.
resource "aws_cloudwatch_metric_alarm" "low_storage_space" {
  count = var.low_storage_space_enable ? 1 : 0

  # Identity
  alarm_name        = var.low_storage_space_name
  alarm_description = "Less than 25% of ${var.es_ebs_volume_size} storage space available"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "FreeStorageSpace"
  statistic   = "Minimum"

  # Condition: Minimum <= threshold for a single 60-second period.
  # Unlike the sibling alarms, period and evaluation_periods are fixed here
  # rather than variable-driven; they are written as numbers to match the
  # attribute types instead of relying on string-to-number conversion.
  comparison_operator = "LessThanOrEqualToThreshold"
  period              = 60
  evaluation_periods  = 1

  # The volume size is multiplied by 256. Presumably the size is in GiB and the
  # metric in MiB, so 1024 * 0.25 = 256 yields 25% of the volume — TODO confirm
  # the unit of var.es_ebs_volume_size.
  threshold = var.es_ebs_volume_size * 256

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on the cluster blocking incoming write requests.
# Created only when var.cluster_index_writes_blocked_enable is true.
resource "aws_cloudwatch_metric_alarm" "cluster_index_writes_blocked" {
  count = var.cluster_index_writes_blocked_enable ? 1 : 0

  # Identity
  alarm_name        = var.cluster_index_writes_blocked_alarm_name
  alarm_description = "Cluster is blocking write request due to lack of available storage space or memory"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "ClusterIndexWritesBlocked"

  # ClusterIndexWritesBlocked is a 0/1 indicator. SampleCount only counts how
  # many datapoints were reported and ignores their value, so it cannot tell
  # "blocked" from "not blocked". Maximum surfaces a 1 whenever any datapoint
  # in the period reported blocking.
  statistic = "Maximum"

  # Condition: Maximum >= threshold across the configured evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.cluster_index_writes_blocked_threshold
  period              = var.cluster_index_writes_blocked_period
  evaluation_periods  = var.cluster_index_writes_blocked_evaluation_periods

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on the node count dropping below the expected instance count.
# Created only when var.node_unreachable_enable is true.
resource "aws_cloudwatch_metric_alarm" "node_unreachable" {
  count = var.node_unreachable_enable ? 1 : 0

  # Identity
  alarm_name        = var.node_unreachable_alarm_name
  alarm_description = "Node in your cluster has been unreachable for one day."

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "Nodes"
  statistic   = "Minimum"

  # Condition: the Minimum node count is strictly below var.es_instance_count.
  # The actual window is period * evaluation_periods, both variable-driven;
  # the "one day" wording in the description is not enforced by this block.
  comparison_operator = "LessThanThreshold"
  threshold           = var.es_instance_count
  period              = var.node_unreachable_period
  evaluation_periods  = var.node_unreachable_evaluation_periods

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on automated snapshot failure.
# Created only when var.snapshot_failed_enable is true.
resource "aws_cloudwatch_metric_alarm" "snapshot_failed" {
  count = var.snapshot_failed_enable ? 1 : 0

  # Identity
  alarm_name        = var.snapshot_failed_alarm_name
  alarm_description = "An automated snapshot failed"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "AutomatedSnapshotFailure"
  statistic   = "Maximum"

  # Condition: Maximum >= threshold across the configured evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.snapshot_failed_threshold
  period              = var.snapshot_failed_period
  evaluation_periods  = var.snapshot_failed_evaluation_periods

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on high CPU utilization of data nodes.
# Created only when var.high_cpu_utilization_data_node_enable is true.
resource "aws_cloudwatch_metric_alarm" "high_cpu_utilization_data_node" {
  count = var.high_cpu_utilization_data_node_enable ? 1 : 0

  # Identity
  alarm_name        = var.high_cpu_utilization_data_node_alarm_name
  alarm_description = "High cpu utilization for 15mins"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "CPUUtilization"
  statistic   = "Average"

  # Condition: Average >= threshold across the configured evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  period              = var.high_cpu_utilization_data_node_period
  evaluation_periods  = var.high_cpu_utilization_data_node_evaluation_periods

  # Use the data-node threshold. This block previously read the master-node
  # threshold variable, which tied the two alarms to one setting.
  # NOTE(review): assumes var.high_cpu_utilization_data_node_threshold is
  # declared alongside the other data-node variables — confirm in variables.tf.
  threshold = var.high_cpu_utilization_data_node_threshold

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on high JVM memory pressure of data nodes.
# Created only when var.high_jvm_memory_utilization_data_node_enable is true.
resource "aws_cloudwatch_metric_alarm" "high_jvm_memory_utilization_data_node" {
  count = var.high_jvm_memory_utilization_data_node_enable ? 1 : 0

  # Identity
  alarm_name        = var.high_jvm_memory_utilization_data_node_alarm_name
  alarm_description = "High JVM memory utilization for 15mins"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "JVMMemoryPressure"
  statistic   = "Maximum"

  # Condition: Maximum >= threshold across the configured evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.high_jvm_memory_utilization_data_node_threshold
  period              = var.high_jvm_memory_utilization_data_node_period
  evaluation_periods  = var.high_jvm_memory_utilization_data_node_evaluation_periods

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on high CPU utilization of master nodes.
# Created only when var.high_cpu_utilization_master_node_enable is true.
resource "aws_cloudwatch_metric_alarm" "high_cpu_utilization_master_node" {
  count = var.high_cpu_utilization_master_node_enable ? 1 : 0

  # Identity
  alarm_name        = var.high_cpu_utilization_master_node_alarm_name
  alarm_description = "High cpu utilization for master node"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "MasterCPUUtilization"
  statistic   = "Average"

  # Condition: Average >= threshold across the configured evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.high_cpu_utilization_master_node_threshold
  period              = var.high_cpu_utilization_master_node_period
  evaluation_periods  = var.high_cpu_utilization_master_node_evaluation_periods

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on high JVM memory pressure of master nodes.
# Created only when var.high_jvm_memory_utilization_master_node_enable is true.
resource "aws_cloudwatch_metric_alarm" "high_jvm_memory_utilization_master_node" {
  count = var.high_jvm_memory_utilization_master_node_enable ? 1 : 0

  # Identity
  alarm_name        = var.high_jvm_memory_utilization_master_node_alarm_name
  alarm_description = "High JVM memory utilization for 15mins"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "MasterJVMMemoryPressure"
  statistic   = "Maximum"

  # Condition: Maximum >= threshold across the configured evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.high_jvm_memory_utilization_master_node_threshold
  period              = var.high_jvm_memory_utilization_master_node_period
  evaluation_periods  = var.high_jvm_memory_utilization_master_node_evaluation_periods

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on the KMS key error metric.
# Created only when var.kms_key_error_enable is true.
resource "aws_cloudwatch_metric_alarm" "kms_key_error" {
  count = var.kms_key_error_enable ? 1 : 0

  # Identity
  alarm_name        = var.kms_key_error_alarm_name
  alarm_description = "The KMS encryption key that is used to encrypt data at rest in your domain is disabled"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "KMSKeyError"

  # KMSKeyError is a 0/1 indicator. SampleCount counts reported datapoints and
  # ignores their value, so it cannot detect the error state. Maximum surfaces
  # a 1 whenever any datapoint in the period reported the error.
  statistic = "Maximum"

  # Condition: Maximum >= threshold across the configured evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.kms_key_error_threshold
  period              = var.kms_key_error_period
  evaluation_periods  = var.kms_key_error_evaluation_periods

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}
# Alarm on the KMS key inaccessible metric.
# Created only when var.kms_key_inaccessible_enable is true.
resource "aws_cloudwatch_metric_alarm" "kms_key_inaccessible" {
  count = var.kms_key_inaccessible_enable ? 1 : 0

  # Identity
  alarm_name        = var.kms_key_inaccessible_alarm_name
  alarm_description = "The KMS encryption key has been deleted or has revoked its grants to Amazon ES"

  # Metric under observation.
  # NOTE(review): no dimensions are set on this alarm; confirm the metric
  # resolves to the intended domain.
  namespace   = "AWS/ES"
  metric_name = "KMSKeyInaccessible"

  # KMSKeyInaccessible is a 0/1 indicator. SampleCount counts reported
  # datapoints and ignores their value, so it cannot detect the inaccessible
  # state. Maximum surfaces a 1 whenever any datapoint in the period reported it.
  statistic = "Maximum"

  # Condition: Maximum >= threshold across the configured evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.kms_key_inaccessible_threshold
  period              = var.kms_key_inaccessible_period
  evaluation_periods  = var.kms_key_inaccessible_evaluation_periods

  # Actions run on transition to ALARM and back to OK.
  alarm_actions = var.alarm_actions
  ok_actions    = var.ok_actions
}