Skip to content

Commit

Permalink
Merge pull request #1836 from zli/xenserver/master/CP-9091
Browse files Browse the repository at this point in the history
Add Dom0 mem_usage alert
  • Loading branch information
thomassa committed Jul 22, 2014
2 parents dc32739 + b10dbb8 commit 6cba889
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 9 deletions.
26 changes: 26 additions & 0 deletions scripts/mail-alarm
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,30 @@ class Dom0FSUsageAlarmETG(EmailTextGenerator):
self.value * 100.0,
self.alarm_trigger_level * 100.0)

class Dom0MemUsageAlarmETG(EmailTextGenerator):
    """Email text generator for the 'mem_usage' alarm.

    Produces the subject and body of the alert mail reporting that the
    memory demand of a VM (expected to be the control domain) is high.
    """

    def __init__(self, cls, obj_uuid, value, alarm_trigger_level):
        """Record the alarm details and look up the VM's parameters.

        cls                 -- class of the object the alarm refers to; must be 'VM'
        obj_uuid            -- uuid handed to get_VM_params()
        value               -- measured memory demand as a fraction (1.0 == 100%)
        alarm_trigger_level -- trigger threshold as a fraction; any false value
                               (None, 0, '') is replaced by 0.95

        Raises Exception when cls is not 'VM'.
        """
        if not alarm_trigger_level:
            alarm_trigger_level = 0.95
        if cls != 'VM':
            # Call form of raise: same behaviour as the old "raise E, msg"
            # statement, but it also parses on Python 3.
            raise Exception("programmer error - this alarm should only be available for control domain VM")
        self.params = get_VM_params(obj_uuid)
        self.cls = cls
        self.value = value
        self.alarm_trigger_level = alarm_trigger_level

    def generate_subject(self):
        """Return the mail subject, naming the pool and the VM's name_label."""
        pool_name = get_pool_name()
        return '[%s] XenServer Alarm: Dom0 memory demand is high on "%s"' % (pool_name, self.params['name_label'])

    def generate_body(self):
        """Return the mail body.

        Both the measured value and the trigger level are stored as
        fractions and are rendered as percentages with one decimal place.
        """
        return (
            'The memory demand on "%s" is about %.1f%% of the physical memory of the domain. '
            'Occasional performance degradation can be expected when memory swapping is forced to happen.\n'
            'This alarm is set to be triggered when the ratio of the memory demand to the physical memory is beyond %.1f%%.\n'
            '\n'
        ) % (self.params['name_label'],
             self.value * 100.0,
             self.alarm_trigger_level * 100.0)

class WlbConsultationFailure(EmailTextGenerator):
def __init__(self, cls, obj_uuid):
self.cls = cls
Expand Down Expand Up @@ -336,6 +360,8 @@ class XapiMessage:
etg = DiskUsageAlarmETG(self.cls, self.obj_uuid, value, alarm_trigger_period, alarm_trigger_level)
elif name == 'fs_usage':
etg = Dom0FSUsageAlarmETG(self.cls, self.obj_uuid, value, alarm_trigger_level)
elif name == 'mem_usage':
etg = Dom0MemUsageAlarmETG(self.cls, self.obj_uuid, value, alarm_trigger_level)
else:
etg = None
elif self.name == 'HA_HOST_FAILED':
Expand Down
40 changes: 31 additions & 9 deletions scripts/perfmon
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ class RRDUpdates:


# Consolidation functions:
supported_consolidation_functions = [ 'sum', 'average', 'max', 'get_percent_fs_usage' ]
supported_consolidation_functions = [ 'sum', 'average', 'max', 'get_percent_fs_usage', 'get_percent_mem_usage' ]

def average(mylist):
    """Return the arithmetic mean of the values in mylist as a float.

    An empty list raises ZeroDivisionError.
    """
    total = sum(mylist)
    count = len(mylist)
    return total / float(count)
Expand All @@ -359,6 +359,27 @@ def get_percent_fs_usage(ignored):
# strip off the trailing % character and convert to float
return float(percentage[0:-1])/100.0

def get_percent_mem_usage(ignored, meminfo_path='/proc/meminfo'):
    """Get the percent usage of Dom0 memory/swap.

    ignored      -- input list; it is not used and should be empty
    meminfo_path -- file to read, made of "Key:   <number> ..." lines
                    (defaults to /proc/meminfo)

    Returns (res_mem + swap_in_use) / phy_mem as a float, which may exceed
    1.0. Any error is reported through log_err() and 0.0 is returned instead.
    """
    # Local import: only needed to fetch the active exception in the handler.
    import sys
    try:
        memfd = open(meminfo_path, 'r')
        try:
            memlist = memfd.readlines()
        finally:
            # Release the descriptor even when the read itself fails.
            memfd.close()
        # Map each key to the first run of digits found in its value.
        memdict = {}
        for line in memlist:
            key, val = line.split(':', 1)
            memdict[key.strip()] = float(re.search(r'\d+', val.strip()).group(0))
        # We consider the sum of res memory and swap in use as the hard demand
        # of mem usage, it is bad if this number is beyond the physical mem, as
        # in such case swapping is obligatory rather than voluntary, hence
        # degrading the performance. We define the percentage metrics as
        # (res_mem + swap_in_use) / phy_mem, which could potentially go beyond
        # 100% (but is considered bad when it does)
        mem_in_use = memdict['MemTotal'] - memdict['MemFree'] - memdict['Buffers'] - memdict['Cached']
        swap_in_use = memdict['SwapTotal'] - memdict['SwapFree']
        return float(mem_in_use + swap_in_use) / memdict['MemTotal']
    except Exception:
        # sys.exc_info() instead of "except ..., e" / "except ... as e":
        # this form parses on every Python 2 and Python 3 release.
        err = sys.exc_info()[1]
        log_err("Error %s in get_percent_mem_usage, return 0.0 instead" % err)
        return 0.0

class VariableConfig:
"""Object storing the configuration of a Variable
Expand Down Expand Up @@ -616,13 +637,13 @@ class VMMonitor(ObjectMonitor):
- Multiple <variable> nodes allowed
- full list of child nodes is
* name: what to call the variable (no default)
* alarm_priority: the priority of the messages generated (default '5')
* alarm_priority: the priority of the messages generated (default '3')
* alarm_trigger_level: level of value that triggers an alarm (no default)
* alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high')
* alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60')
* alarm_auto_inhibit_period: num seconds this alarm disabled after an alarm is sent (default '3600')
* consolidation_fn: how to combine variables from rrd_updates into one value
(default is 'average' for 'cpu_usage', 'get_percent_fs_usage' for 'fs_usage', & 'sum' for everything else)
(default is 'average' for 'cpu_usage', 'get_percent_fs_usage' for 'fs_usage', 'get_percent_mem_usage' for 'mem_usage', & 'sum' for everything else)
* rrd_regex matches the names of variables from (xe vm-data-sources-list uuid=$vmuuid) used to compute value
(only has defaults for "cpu_usage", "network_usage", and "disk_usage")
"""
Expand All @@ -636,19 +657,20 @@ class VMMonitor(ObjectMonitor):
if config_tag == 'consolidation_fn':
if variable_name == "cpu_usage": return 'average'
elif variable_name == "fs_usage": return 'get_percent_fs_usage'
elif variable_name == "mem_usage": return 'get_percent_mem_usage'
else: return 'sum'
elif config_tag == 'rrd_regex':
if variable_name == "cpu_usage": return "cpu[0-9]+"
elif variable_name == "network_usage": return "vif_[0-9]+_[rt]x"
elif variable_name == "disk_usage": return "vbd_(xvd|hd)[a-z]+_(read|write)"
elif variable_name == "fs_usage": return "_$_DUMMY__" # match nothing
elif variable_name == "mem_usage": return "_$_DUMMY__" # match nothing
else: raise XmlConfigException, "variable %s: no default rrd_regex - please specify one" % variable_name
elif config_tag == 'alarm_trigger_period': return '60' # 1 minute
elif config_tag == 'alarm_auto_inhibit_period':
if variable_name == "fs_usage": return '604800' # 1 week
else: return '3600' # 1 hour
elif config_tag == 'alarm_auto_inhibit_period': return '3600' # 1 hour
elif config_tag == 'alarm_trigger_level':
if variable_name == "fs_usage": return '0.9' # trigger when 90% full
if variable_name == "fs_usage": return '0.9' # trigger when 90% full
elif variable_name == "mem_usage": return '0.95' # trigger when mem demanded is close to phy_mem
else: raise XmlConfigException, "variable %s: no default alarm_trigger_level - please specify one" % variable_name
elif config_tag == 'alarm_trigger_sense': return 'high' # trigger if *above*
elif config_tag == 'alarm_priority': return '3' # Service degradation level defined in PR-1455
Expand All @@ -665,7 +687,7 @@ class HOSTMonitor(ObjectMonitor):
- Multiple <variable> nodes allowed
- full list of child nodes is
* name: what to call the variable (no default)
* alarm_priority: the priority of the messages generated (default '5')
* alarm_priority: the priority of the messages generated (default '3')
* alarm_trigger_level: level of value that triggers an alarm (no default)
* alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high')
* alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60')
Expand Down Expand Up @@ -723,7 +745,7 @@ class HOSTMonitor(ObjectMonitor):
# possible to set up an alarm on each host that uses an SR by setting
# appropriate configuration in the SR's other-config.
if self.uuid not in sruuids_by_hostuuid:
print_debug("%s not in sruuids_by_hostuuid")
print_debug("%s not in sruuids_by_hostuuid" % self.uuid)
self.secondary_variables.clear()
self.secondary_xmlconfigs.clear()
return
Expand Down

0 comments on commit 6cba889

Please sign in to comment.