-
Notifications
You must be signed in to change notification settings - Fork 18
/
lambda.py
239 lines (179 loc) · 7.17 KB
/
lambda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
import boto3
import decimal
import json
import os
# Prefix shared by every alarm name this module creates, and by the ASG tag
# keys that select which alarm templates apply to a group.
ALARM_NAME_PREFIX = 'InstanceAlarm:'
# Name of the S3 bucket that alarm templates are read from. Taken from the
# environment at import time; a missing variable raises KeyError on import.
ALARM_TEMPLATES_BUCKET = os.environ['ALARM_TEMPLATES_BUCKET']
# In-memory cache of template bodies, keyed by alarm key (the S3 object key).
ALARM_TEMPLATES_CACHE = {}
# Maximum number of alarms to delete per API call.
DELETE_ALARMS_MAX_NAMES = 100
# boto3 clients are created once at import time and shared by all functions.
autoscaling = boto3.client('autoscaling')
cloudwatch = boto3.client('cloudwatch')
s3 = boto3.client('s3')
def create_instance_alarms(asg_name, instance_id):
    """
    Creates the alarms configured on an Auto Scaling Group for one of its
    EC2 instances.

    The group is looked up by name. For every group returned, each alarm
    definition produced by get_alarms_to_create() is logged and then
    created through put_metric_alarm().
    """
    matching_groups = describe_auto_scaling_groups(
        AutoScalingGroupNames=[asg_name],
    )
    for group in matching_groups:
        for definition in get_alarms_to_create(group, instance_id):
            print('Creating alarm: {}'.format(definition['AlarmName']))
            put_metric_alarm(**definition)
def delete_alarms(alarm_names):
    """
    Deletes the specified alarms.

    alarm_names may be any iterable of alarm names. The names are sent to
    CloudWatch in batches of at most DELETE_ALARMS_MAX_NAMES per call.
    Raises Exception if a call reports an HTTP status other than 200.
    """
    # Materialise the iterable so it can be sliced into fixed-size batches.
    pending = list(alarm_names)
    for start in range(0, len(pending), DELETE_ALARMS_MAX_NAMES):
        batch = pending[start:start + DELETE_ALARMS_MAX_NAMES]
        response = cloudwatch.delete_alarms(AlarmNames=batch)
        status_code = response['ResponseMetadata']['HTTPStatusCode']
        if status_code != 200:
            raise Exception('ERROR: {}'.format(response))
def delete_instance_alarms(instance_id):
    """
    Deletes all alarms that exist for the specified EC2 instance.

    Alarms are found by name prefix, their names are logged, and they are
    then removed through delete_alarms().
    """
    # This Lambda function always creates alarms named
    # ALARM_NAME_PREFIX + instance id + ':' + alarm key. The ':' separator
    # is part of the search prefix so that only this exact instance id can
    # match; without it, a lookup for 'i-1234abcd' would also match (and
    # delete) the alarms of an instance such as 'i-1234abcd5678ef012'.
    alarms = describe_alarms(
        AlarmNamePrefix=ALARM_NAME_PREFIX + instance_id + ':',
    )
    alarm_names = [alarm['AlarmName'] for alarm in alarms]
    print('Deleting alarms: {}'.format(alarm_names))
    delete_alarms(alarm_names)
def describe_alarms(**kwargs):
    """
    Yields CloudWatch Metric Alarms, one dictionary at a time.

    Keyword arguments are passed unchanged to the 'describe_alarms'
    paginator. Raises Exception if a page reports an HTTP status other
    than 200.
    """
    page_iterator = cloudwatch.get_paginator('describe_alarms').paginate(
        **kwargs
    )
    for response in page_iterator:
        status_code = response['ResponseMetadata']['HTTPStatusCode']
        if status_code != 200:
            raise Exception('ERROR: {}'.format(response))
        for metric_alarm in response['MetricAlarms']:
            yield metric_alarm
def describe_auto_scaling_groups(**kwargs):
    """
    Yields Auto Scaling Groups, one dictionary at a time.

    Keyword arguments are passed unchanged to the
    'describe_auto_scaling_groups' paginator. Raises Exception if a page
    reports an HTTP status other than 200.
    """
    page_iterator = autoscaling.get_paginator(
        'describe_auto_scaling_groups'
    ).paginate(**kwargs)
    for response in page_iterator:
        status_code = response['ResponseMetadata']['HTTPStatusCode']
        if status_code != 200:
            raise Exception('ERROR: {}'.format(response))
        for group in response['AutoScalingGroups']:
            yield group
def full_sweep():
    """
    Reconciles instance alarms with the instances that are in service.

    Creates any instance alarms that should exist but don't, and deletes
    any instance alarms that shouldn't exist but do.
    """
    # Names of every instance alarm that currently exists.
    existing_names = {
        alarm['AlarmName']
        for alarm in describe_alarms(AlarmNamePrefix=ALARM_NAME_PREFIX)
    }

    # Walk every ASG and its in-service instances, recording the alarm names
    # that should exist and creating the ones that are missing.
    wanted_names = set()
    for group in describe_auto_scaling_groups():
        for member in group['Instances']:
            if member['LifecycleState'] == 'InService':
                definitions = get_alarms_to_create(
                    group,
                    member['InstanceId'],
                )
                for definition in definitions:
                    name = definition['AlarmName']
                    wanted_names.add(name)
                    if name in existing_names:
                        continue
                    print('Creating missing alarm: {}'.format(name))
                    put_metric_alarm(**definition)

    # Whatever exists but is no longer wanted is an orphan; remove it.
    orphan_names = existing_names - wanted_names
    if orphan_names:
        print('Deleting orphan alarms: {}'.format(orphan_names))
        delete_alarms(orphan_names)
def get_alarm_keys(asg):
    """
    Yields the alarm keys defined by the ASG's tags.

    A tag contributes an alarm key when its key starts with
    ALARM_NAME_PREFIX; the alarm key is the remainder of the tag key after
    that prefix.
    """
    prefix_length = len(ALARM_NAME_PREFIX)
    for tag in asg['Tags']:
        key = tag['Key']
        if not key.startswith(ALARM_NAME_PREFIX):
            continue
        yield key[prefix_length:]
def get_alarms_to_create(asg, instance_id):
    """
    Yields the alarm dictionaries that should be created for an EC2
    instance.

    One alarm is produced per alarm key found in the ASG's tags. Each
    key's template is read from S3 (and cached), its '{{name}}'
    placeholders are substituted, and the result is parsed as JSON.
    """
    for alarm_key in get_alarm_keys(asg):
        # Fetch each template from S3 only once; later uses hit the cache.
        rendered = ALARM_TEMPLATES_CACHE.get(alarm_key)
        if rendered is None:
            rendered = get_s3_object_body(
                Bucket=ALARM_TEMPLATES_BUCKET,
                Key=alarm_key,
            )
            ALARM_TEMPLATES_CACHE[alarm_key] = rendered

        # Placeholder values come from the ASG name, the instance id and
        # every tag on the ASG (exposed as 'asg.Tags.<tag key>').
        substitutions = {
            'asg.AutoScalingGroupName': asg['AutoScalingGroupName'],
            'instance.InstanceId': instance_id,
        }
        substitutions.update(
            ('asg.Tags.' + tag['Key'], tag['Value']) for tag in asg['Tags']
        )
        for name, replacement in substitutions.items():
            rendered = rendered.replace('{{' + name + '}}', replacement)

        # After substitution the template is expected to be valid JSON.
        definition = json.loads(rendered)

        # The name is set here, not in the template, so the alarm can be
        # found and deleted after the instance has been terminated.
        definition['AlarmName'] = '{}{}:{}'.format(
            ALARM_NAME_PREFIX,
            instance_id,
            alarm_key,
        )
        yield definition
def get_s3_object_body(**kwargs):
    """
    Returns the content of an object in S3 as text.

    Keyword arguments are passed unchanged to s3.get_object. The body is
    read in full and decoded as UTF-8. Raises Exception if the response
    reports an HTTP status other than 200.
    """
    s3_object = s3.get_object(**kwargs)
    status_code = s3_object['ResponseMetadata']['HTTPStatusCode']
    if status_code == 200:
        raw_bytes = s3_object['Body'].read()
        return raw_bytes.decode('utf-8')
    raise Exception('ERROR: {}'.format(s3_object))
def put_metric_alarm(**alarm):
    """
    Creates a CloudWatch Metric Alarm.

    Keyword arguments are the alarm definition and are passed to
    cloudwatch.put_metric_alarm after the numeric fields have been
    converted to the appropriate types. Raises Exception if the response
    reports an HTTP status other than 200.
    """
    # Convert numeric fields into appropriate types. Only fields that are
    # present are converted: 'Period' and 'Threshold' are not required by
    # the API for every alarm type (e.g. metric-math or anomaly-detection
    # alarms), so their absence must not raise KeyError here.
    for integer_field in ('EvaluationPeriods', 'Period', 'DatapointsToAlarm'):
        if integer_field in alarm:
            alarm[integer_field] = int(alarm[integer_field])
    if 'Threshold' in alarm:
        alarm['Threshold'] = decimal.Decimal(alarm['Threshold'])

    # Create the alarm.
    response = cloudwatch.put_metric_alarm(**alarm)
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        raise Exception('ERROR: {}'.format(response))
def lambda_handler(event, context):
    """
    Lambda entry point; dispatches on the event's 'detail-type'.

    - 'EC2 Instance Launch Successful': create alarms for the new instance.
    - 'EC2 Instance State-change Notification': delete the instance's
      alarms unless its state is 'pending' or 'running'.
    - Anything else: run a full sweep.
    """
    print('Received event: {}'.format(event))
    detail_type = event['detail-type']

    if detail_type == 'EC2 Instance Launch Successful':
        detail = event['detail']
        create_instance_alarms(
            detail['AutoScalingGroupName'],
            detail['EC2InstanceId'],
        )
        return

    if detail_type == 'EC2 Instance State-change Notification':
        detail = event['detail']
        if detail['state'] in ('pending', 'running'):
            # The instance is still starting or running; keep its alarms.
            return
        delete_instance_alarms(detail['instance-id'])
        return

    full_sweep()