From e4e3e5d50feaf99fb3f6b410281a403cd0923b17 Mon Sep 17 00:00:00 2001 From: Andrei Burd Date: Mon, 23 Feb 2015 17:13:36 +0200 Subject: [PATCH 1/2] Added Cloudwatch reporting for replicaLag, LockPercentage and current_lock --- check_mongodb.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/check_mongodb.py b/check_mongodb.py index bc6278a..72fbf45 100755 --- a/check_mongodb.py +++ b/check_mongodb.py @@ -18,18 +18,23 @@ # - @Andor on github # - Steven Richards - Captainkrtek on github # - Max Vernimmen +# - @burdandrei Added CloudWatch monitoring for Mongodb # # USAGE # # See the README.md # + import sys import time +import datetime import optparse import textwrap import re import os +import commands +from boto.ec2.cloudwatch import CloudWatchConnection try: import pymongo @@ -44,6 +49,7 @@ else: import pymongo.son as son +cloudwatch_report = False # # thanks to http://stackoverflow.com/a/1229667/72987 @@ -82,6 +88,17 @@ def numeric_type(param): return True return False +#Get the instanceId for our machine. This is important later for +#autoscaling. The dimensions we select here when publishing +#must be matched later by our autoscale policy +def get_instance_id(): + ret, instanceId = commands.getstatusoutput("wget -q -O - http://169.254.169.254/latest/meta-data/instance-id") + return instanceId + +def put_data(namespace, name, value, unit, dimensions): + c = CloudWatchConnection() + now = datetime.datetime.now() + c.put_metric_data(namespace, name, value, now, unit, dimensions) def check_levels(param, warning, critical, message, ok=[]): if (numeric_type(critical) and numeric_type(warning)): @@ -145,6 +162,7 @@ def main(argv): p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second') p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check') p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults') + p.add_option('--cloudwatch-report', action='store_true', dest='cloudwatch_report', default=False,help='Report sampled data to cloudwatch') options, arguments = p.parse_args() host = options.host @@ -167,6 +185,8 @@ def main(argv): database = options.database ssl = options.ssl replicaset = options.replicaset + global cloudwatch_report + cloudwatch_report = options.cloudwatch_report if action == 'replica_primary' and replicaset is None: return "replicaset must be passed in when using replica_primary check" @@ -438,6 +458,12 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la except: lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600) + if cloudwatch_report: + replicaset = rs_status["set"] + instanceId = get_instance_id() + put_data('Mongo', 'replicationLag', lag, 'Seconds',{'replicaSet': replicaset}) + put_data('Mongo', 'replicationLag', lag, 'Seconds',{'InstanceId': instanceId, 'replicaSet': replicaset}) + if percent: err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) if err != 0: @@ -617,6 +643,11 @@ def check_lock(con, warning, critical, perf_data): lock_percentage = float(lockTime) / float(totalTime) * 100 message = "Lock Percentage: %.2f%%" % lock_percentage message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)]) + if cloudwatch_report: + instanceId = get_instance_id() + replicaset = data['repl']['setName'] + put_data('Mongo', 'LockPercentage', lock_percentage, 'Percent',{'replicaSet': replicaset}) + put_data('Mongo', 'LockPercentage', lock_percentage, 'Percent',{'InstanceId': instanceId, 'replicaSet': replicaset}) return check_levels(lock_percentage, warning, critical, message) except Exception, e: @@ -1130,6 +1161,11 @@ def check_current_lock(con, host, warning, critical, perf_data): lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100 message = "Current Lock Percentage: %.2f%%" % lock_percentage message += performance_data(perf_data, [("%.2f" % lock_percentage, "current_lock_percentage", warning, critical)]) + if cloudwatch_report: + instanceId = get_instance_id() + replicaset = data['repl']['setName'] + put_data('Mongo', 'CurrentLockPercentage', lock_percentage, 'Percent',{'replicaSet': replicaset}) + put_data('Mongo', 'CurrentLockPercentage', lock_percentage, 'Percent',{'InstanceId': instanceId, 'replicaSet': replicaset}) return check_levels(lock_percentage, warning, critical, message) else: return exit_with_general_warning("problem reading data from temp file") From 4379f6f5e33bc0512413168fc1e00654423e47f5 Mon Sep 17 00:00:00 2001 From: Andrei Burd Date: Tue, 24 Feb 2015 16:21:49 +0200 Subject: [PATCH 2/2] Fix boto dependency issues Moved all the imports inside cloudwatch-related definitions to avoid dependency issues --- check_mongodb.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/check_mongodb.py b/check_mongodb.py index 72fbf45..450f00c 100755 --- a/check_mongodb.py +++ b/check_mongodb.py @@ -25,16 +25,12 @@ # See the README.md # - import sys import time -import datetime import optparse import textwrap import re import os -import commands -from boto.ec2.cloudwatch import CloudWatchConnection try: import pymongo @@ -92,10 +88,13 @@ def numeric_type(param): #autoscaling. The dimensions we select here when publishing #must be matched later by our autoscale policy def get_instance_id(): + import commands ret, instanceId = commands.getstatusoutput("wget -q -O - http://169.254.169.254/latest/meta-data/instance-id") return instanceId def put_data(namespace, name, value, unit, dimensions): + import datetime + from boto.ec2.cloudwatch import CloudWatchConnection c = CloudWatchConnection() now = datetime.datetime.now() c.put_metric_data(namespace, name, value, now, unit, dimensions)