diff --git a/scripts/mail-alarm b/scripts/mail-alarm index 3e8d42b7bba..addeff4972c 100755 --- a/scripts/mail-alarm +++ b/scripts/mail-alarm @@ -221,6 +221,30 @@ class Dom0FSUsageAlarmETG(EmailTextGenerator): self.value * 100.0, self.alarm_trigger_level * 100.0) +class Dom0MemUsageAlarmETG(EmailTextGenerator): + def __init__(self, cls, obj_uuid, value, alarm_trigger_level): + if not alarm_trigger_level: alarm_trigger_level = 0.95 + if cls != 'VM': + raise Exception, "programmer error - this alarm should only be available for control domain VM" + self.params = get_VM_params(obj_uuid) + self.cls = cls + self.value = value + self.alarm_trigger_level = alarm_trigger_level + + def generate_subject(self): + pool_name = get_pool_name() + return '[%s] XenServer Alarm: Dom0 memory demand is high on "%s"' % (pool_name, self.params['name_label']) + + def generate_body(self): + return \ + 'The memory demand on "%s" is about %.1f%% of the physical memory of the domain. ' \ + 'Occasional performance degradation can be expected when memory swapping is forced to happen.\n' \ + 'This alarm is set to be triggered when the ratio of the memory demand to the physical memory is beyond %.1f%%.\n' \ + '\n' % \ + (self.params['name_label'], + self.value * 100.0, + self.alarm_trigger_level * 100.0) + class WlbConsultationFailure(EmailTextGenerator): def __init__(self, cls, obj_uuid): self.cls = cls @@ -336,6 +360,8 @@ class XapiMessage: etg = DiskUsageAlarmETG(self.cls, self.obj_uuid, value, alarm_trigger_period, alarm_trigger_level) elif name == 'fs_usage': etg = Dom0FSUsageAlarmETG(self.cls, self.obj_uuid, value, alarm_trigger_level) + elif name == 'mem_usage': + etg = Dom0MemUsageAlarmETG(self.cls, self.obj_uuid, value, alarm_trigger_level) else: etg = None elif self.name == 'HA_HOST_FAILED': diff --git a/scripts/perfmon b/scripts/perfmon index d8ee612c93b..e8f28e22348 100644 --- a/scripts/perfmon +++ b/scripts/perfmon @@ -345,7 +345,7 @@ class RRDUpdates: # Consolidation functions: 
-supported_consolidation_functions = [ 'sum', 'average', 'max', 'get_percent_fs_usage' ] +supported_consolidation_functions = [ 'sum', 'average', 'max', 'get_percent_fs_usage', 'get_percent_mem_usage' ] def average(mylist): return sum(mylist)/float(len(mylist)) @@ -359,6 +359,27 @@ def get_percent_fs_usage(ignored): # strip of % character and convert to float return float(percentage[0:-1])/100.0 +def get_percent_mem_usage(ignored): + "Get the percent usage of Dom0 memory/swap. Input list is ignored and should be empty" + try: + memfd = open('/proc/meminfo', 'r') + memlist = memfd.readlines() + memfd.close() + memdict = [ m.split(':', 1) for m in memlist ] + memdict = dict([(k.strip(), float(re.search('\d+', v.strip()).group(0))) for (k,v) in memdict]) + # We consider the sum of res memory and swap in use as the hard demand + # of mem usage, it is bad if this number is beyond the physical mem, as + # in such case swapping is obligatory rather than voluntary, hence + # degrading the performance. 
We define the percentage metrics as + # (res_mem + swap_in_use) / phy_mem, which could potentially go beyond + # 100% (but is considered bad when it does) + mem_in_use = memdict['MemTotal'] - memdict['MemFree'] - memdict['Buffers'] - memdict['Cached'] + swap_in_use = memdict['SwapTotal'] - memdict['SwapFree'] + return float(mem_in_use + swap_in_use) / memdict['MemTotal'] + except Exception, e: + log_err("Error %s in get_percent_mem_usage, return 0.0 instead" % e) + return 0.0 + class VariableConfig: """Object storing the configuration of a Variable @@ -616,13 +637,13 @@ class VMMonitor(ObjectMonitor): - Multiple nodes allowed - full list of child nodes is * name: what to call the variable (no default) - * alarm_priority: the priority of the messages generated (default '5') + * alarm_priority: the priority of the messages generated (default '3') * alarm_trigger_level: level of value that triggers an alarm (no default) * alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. 
(default 'high') * alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60') * alarm_auto_inhibit_period: num seconds this alarm disabled after an alarm is sent (default '3600') * consolidation_fn: how to combine variables from rrd_updates into one value - (default is 'average' for 'cpu_usage', 'get_percent_fs_usage' for 'fs_usage', & 'sum' for everything else) + (default is 'average' for 'cpu_usage', 'get_percent_fs_usage' for 'fs_usage', 'get_percent_mem_usage' for 'mem_usage', & 'sum' for everything else) * rrd_regex matches the names of variables from (xe vm-data-sources-list uuid=$vmuuid) used to compute value (only has defaults for "cpu_usage", "network_usage", and "disk_usage") """ @@ -636,19 +657,20 @@ class VMMonitor(ObjectMonitor): if config_tag == 'consolidation_fn': if variable_name == "cpu_usage": return 'average' elif variable_name == "fs_usage": return 'get_percent_fs_usage' + elif variable_name == "mem_usage": return 'get_percent_mem_usage' else: return 'sum' elif config_tag == 'rrd_regex': if variable_name == "cpu_usage": return "cpu[0-9]+" elif variable_name == "network_usage": return "vif_[0-9]+_[rt]x" elif variable_name == "disk_usage": return "vbd_(xvd|hd)[a-z]+_(read|write)" elif variable_name == "fs_usage": return "_$_DUMMY__" # match nothing + elif variable_name == "mem_usage": return "_$_DUMMY__" # match nothing else: raise XmlConfigException, "variable %s: no default rrd_regex - please specify one" % variable_name elif config_tag == 'alarm_trigger_period': return '60' # 1 minute - elif config_tag == 'alarm_auto_inhibit_period': - if variable_name == "fs_usage": return '604800' # 1 week - else: return '3600' # 1 hour + elif config_tag == 'alarm_auto_inhibit_period': return '3600' # 1 hour elif config_tag == 'alarm_trigger_level': - if variable_name == "fs_usage": return '0.9' # trigger when 90% full + if variable_name == "fs_usage": return '0.9' # trigger when 90% full + elif variable_name == "mem_usage": 
return '0.95' # trigger when mem demanded is close to phy_mem else: raise XmlConfigException, "variable %s: no default alarm_trigger_level - please specify one" % variable_name elif config_tag == 'alarm_trigger_sense': return 'high' # trigger if *above* elif config_tag == 'alarm_priority': return '3' # Service degradation level defined in PR-1455 @@ -665,7 +687,7 @@ class HOSTMonitor(ObjectMonitor): - Multiple nodes allowed - full list of child nodes is * name: what to call the variable (no default) - * alarm_priority: the priority of the messages generated (default '5') + * alarm_priority: the priority of the messages generated (default '3') * alarm_trigger_level: level of value that triggers an alarm (no default) * alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high') * alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60') @@ -723,7 +745,7 @@ class HOSTMonitor(ObjectMonitor): # possible to set up an alarm on each host that uses an SR by setting # appropriate configuration in the SR's other-config. if self.uuid not in sruuids_by_hostuuid: - print_debug("%s not in sruuids_by_hostuuid") + print_debug("%s not in sruuids_by_hostuuid" % self.uuid) self.secondary_variables.clear() self.secondary_xmlconfigs.clear() return