diff --git a/collector/fixtures/e2e-test-cgroupsv1-output.txt b/collector/fixtures/e2e-test-cgroupsv1-output.txt index 02c3733b..6f523ccd 100644 --- a/collector/fixtures/e2e-test-cgroupsv1-output.txt +++ b/collector/fixtures/e2e-test-cgroupsv1-output.txt @@ -1,15 +1,15 @@ # HELP batchjob_cpu_system_seconds Cumulative CPU system seconds # TYPE batchjob_cpu_system_seconds gauge -batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0.45 +batchjob_cpu_system_seconds{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 0.45 # HELP batchjob_cpu_total_seconds Cumulative CPU total seconds # TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 1.012410966 +batchjob_cpu_total_seconds{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 1.012410966 # HELP batchjob_cpu_user_seconds Cumulative CPU user seconds # TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0.39 +batchjob_cpu_user_seconds{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 0.39 # HELP batchjob_cpus Number of CPUs # TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 +batchjob_cpus{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 0 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. 
# TYPE batchjob_exporter_build_info gauge # HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts @@ -17,28 +17,28 @@ batchjob_cpus{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bf batchjob_ipmi_dcmi_watts_total 332 # HELP batchjob_memory_cache_bytes Memory cache used in bytes # TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 2.1086208e+07 +batchjob_memory_cache_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 2.1086208e+07 # HELP batchjob_memory_fail_count Memory fail count # TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 +batchjob_memory_fail_count{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 0 # HELP batchjob_memory_rss_bytes Memory RSS used in bytes # TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 1.0407936e+07 +batchjob_memory_rss_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 1.0407936e+07 # HELP batchjob_memory_total_bytes Memory total in bytes # TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 2.01362030592e+11 +batchjob_memory_total_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 2.01362030592e+11 # HELP batchjob_memory_used_bytes Memory used in bytes # TYPE batchjob_memory_used_bytes gauge 
-batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.0194048e+07 +batchjob_memory_used_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 4.0194048e+07 # HELP batchjob_memsw_fail_count Swap fail count # TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 +batchjob_memsw_fail_count{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 0 # HELP batchjob_memsw_total_bytes Swap total in bytes # TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 9.223372036854772e+18 +batchjob_memsw_total_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 9.223372036854772e+18 # HELP batchjob_memsw_used_bytes Swap used in bytes # TYPE batchjob_memsw_used_bytes gauge -batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.032512e+07 +batchjob_memsw_used_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 4.032512e+07 # HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU # TYPE batchjob_nvidia_gpu_jobid gauge batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 diff --git a/collector/fixtures/e2e-test-cgroupsv2-output.txt b/collector/fixtures/e2e-test-cgroupsv2-output.txt index a46ba88e..07399ca0 100644 --- a/collector/fixtures/e2e-test-cgroupsv2-output.txt +++ b/collector/fixtures/e2e-test-cgroupsv2-output.txt @@ -1,15 +1,15 @@ # HELP batchjob_cpu_system_seconds Cumulative CPU system seconds # TYPE batchjob_cpu_system_seconds 
gauge -batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 115.777502 +batchjob_cpu_system_seconds{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 115.777502 # HELP batchjob_cpu_total_seconds Cumulative CPU total seconds # TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 60491.070351 +batchjob_cpu_total_seconds{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 60491.070351 # HELP batchjob_cpu_user_seconds Cumulative CPU user seconds # TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 60375.292848 +batchjob_cpu_user_seconds{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 60375.292848 # HELP batchjob_cpus Number of CPUs # TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 2 +batchjob_cpus{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 2 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. 
# TYPE batchjob_exporter_build_info gauge # HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts @@ -17,28 +17,28 @@ batchjob_cpus{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bf batchjob_ipmi_dcmi_watts_total 332 # HELP batchjob_memory_cache_bytes Memory cache used in bytes # TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 +batchjob_memory_cache_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 0 # HELP batchjob_memory_fail_count Memory fail count # TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 +batchjob_memory_fail_count{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 0 # HELP batchjob_memory_rss_bytes Memory RSS used in bytes # TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.098592768e+09 +batchjob_memory_rss_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 4.098592768e+09 # HELP batchjob_memory_total_bytes Memory total in bytes # TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 4.294967296e+09 +batchjob_memory_total_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 4.294967296e+09 # HELP batchjob_memory_used_bytes Memory used in bytes # TYPE batchjob_memory_used_bytes gauge -batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 
4.111491072e+09 +batchjob_memory_used_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 4.111491072e+09 # HELP batchjob_memsw_fail_count Swap fail count # TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 +batchjob_memsw_fail_count{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 0 # HELP batchjob_memsw_total_bytes Swap total in bytes # TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 +batchjob_memsw_total_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 0 # HELP batchjob_memsw_used_bytes Swap used in bytes # TYPE batchjob_memsw_used_bytes gauge -batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task="",ujobid="557d568b88bfc87489dbea91de6da689"} 0 +batchjob_memsw_used_bytes{batch="slurm",jobgid="1000",jobid="1009248",jobuid="1000",jobuuid="8d4fad6d-c5e3-775b-8a8c-707e319114ec",step="",task=""} 0 # HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU # TYPE batchjob_nvidia_gpu_jobid gauge batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 diff --git a/collector/slurm.go b/collector/slurm.go index 6f341779..0a8a4a54 100644 --- a/collector/slurm.go +++ b/collector/slurm.go @@ -45,9 +45,10 @@ type CgroupMetric struct { memswTotal float64 memswFailCount float64 userslice bool - uid int + jobuid string + jobgid string jobid string - ujobid string + jobuuid string step string task string batch string @@ -87,31 +88,31 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { return &slurmCollector{ cgroupV2: cgroupV2, cpuUser: 
prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "user_seconds"), - "Cumulative CPU user seconds", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Cumulative CPU user seconds", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), cpuSystem: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "system_seconds"), - "Cumulative CPU system seconds", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Cumulative CPU system seconds", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), cpuTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "total_seconds"), - "Cumulative CPU total seconds", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Cumulative CPU total seconds", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), cpus: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "cpus"), - "Number of CPUs", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Number of CPUs", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), memoryRSS: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "rss_bytes"), - "Memory RSS used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Memory RSS used in bytes", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), memoryCache: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "cache_bytes"), - "Memory cache used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Memory cache used in bytes", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), memoryUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "used_bytes"), - "Memory used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Memory used in bytes", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, 
nil), memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"), - "Memory total in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Memory total in bytes", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), memoryFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "fail_count"), - "Memory fail count", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Memory fail count", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), memswUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "used_bytes"), - "Swap used in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Swap used in bytes", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), memswTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "total_bytes"), - "Swap total in bytes", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Swap total in bytes", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), memswFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "fail_count"), - "Swap fail count", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Swap fail count", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), collectError: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "collect_error"), - "Indicates collection error, 0=no error, 1=error", []string{"batch", "jobid", "ujobid", "step", "task"}, nil), + "Indicates collection error, 0=no error, 1=error", []string{"batch", "jobuid", "jobgid", "jobid", "jobuuid", "step", "task"}, nil), logger: logger, }, nil } @@ -135,9 +136,9 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { if m.err { ch <- prometheus.MustNewConstMetric(c.collectError, prometheus.GaugeValue, 1, m.name) } - ch <- 
prometheus.MustNewConstMetric(c.cpuUser, prometheus.GaugeValue, m.cpuUser, m.batch, m.jobid, m.ujobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.batch, m.jobid, m.ujobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpuUser, prometheus.GaugeValue, m.cpuUser, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) cpus := m.cpus if cpus == 0 { dir := filepath.Dir(n) @@ -146,15 +147,15 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { cpus = metrics[filepath.Dir(dir)].cpus } } - ch <- prometheus.MustNewConstMetric(c.cpus, prometheus.GaugeValue, float64(cpus), m.batch, m.jobid, m.ujobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.batch, m.jobid, m.ujobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryCache, prometheus.GaugeValue, m.memoryCache, m.batch, m.jobid, m.ujobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.batch, m.jobid, m.ujobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.batch, m.jobid, m.ujobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.batch, m.jobid, m.ujobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswUsed, prometheus.GaugeValue, m.memswUsed, m.batch, m.jobid, m.ujobid, m.step, m.task) - ch <- 
prometheus.MustNewConstMetric(c.memswTotal, prometheus.GaugeValue, m.memswTotal, m.batch, m.jobid, m.ujobid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.batch, m.jobid, m.ujobid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpus, prometheus.GaugeValue, float64(cpus), m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryCache, prometheus.GaugeValue, m.memoryCache, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswUsed, prometheus.GaugeValue, m.memswUsed, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswTotal, prometheus.GaugeValue, m.memswTotal, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.batch, m.jobuid, m.jobgid, m.jobid, m.jobuuid, m.step, m.task) } return nil } @@ -295,11 +296,11 @@ func (c *slurmCollector) getCPUs(name string) ([]string, error) { return cpus, nil } -// Get job unique identifier from job metadata -func (c *slurmCollector) getJobUniqueId(jobid string) string { - var uniqueJobId string - var jobUid = "" - var jobGid = "" +// Get different labels of Job +func 
(c *slurmCollector) getJobLabels(jobid string) (string, string, string) { + var jobUuid string + var jobUid string = "" + var jobGid string = "" var jobNodes = "" var jobWorkDir = "" var slurmJobInfo = fmt.Sprintf("%s/%s", *jobStatPath, jobid) @@ -310,33 +311,37 @@ func (c *slurmCollector) getJobUniqueId(jobid string) string { } else { fmt.Sscanf(string(content), "%s %s %s %s", &jobUid, &jobGid, &jobNodes, &jobWorkDir) } - uniqueJobId = GetMD5CheckSum([]string{jobid, jobUid, jobGid, jobNodes, jobWorkDir}) + jobUuid, err = GetUuidFromString([]string{jobid, jobUid, jobGid, jobNodes, jobWorkDir}) + if err != nil { + level.Error(c.logger).Log("msg", "Failed to generate UUID for job", "jobid", jobid, "err", err) + jobUuid = jobid + } } - return uniqueJobId + return jobUuid, jobUid, jobGid } // Get job details from cgroups v1 func (c *slurmCollector) getInfoV1(name string, metric *CgroupMetric) { - var err error + // var err error pathBase := filepath.Base(name) userSlicePattern := regexp.MustCompile("^user-([0-9]+).slice$") userSliceMatch := userSlicePattern.FindStringSubmatch(pathBase) if len(userSliceMatch) == 2 { metric.userslice = true - metric.uid, err = strconv.Atoi(userSliceMatch[1]) - if err != nil { - level.Error(c.logger).Log("msg", "Error getting slurm job's uid number", "uid", pathBase, "err", err) - } - return + // metric.jobuid, err = userSliceMatch[1] + // if err != nil { + // level.Error(c.logger).Log("msg", "Error getting slurm job's uid number", "uid", pathBase, "err", err) + // } + // return } slurmPattern := regexp.MustCompile("^/slurm/uid_([0-9]+)/job_([0-9]+)(/step_([^/]+)(/task_([[0-9]+))?)?$") slurmMatch := slurmPattern.FindStringSubmatch(name) level.Debug(c.logger).Log("msg", "Got for match", "name", name, "len(slurmMatch)", len(slurmMatch), "slurmMatch", fmt.Sprintf("%v", slurmMatch)) if len(slurmMatch) >= 3 { - metric.uid, err = strconv.Atoi(slurmMatch[1]) - if err != nil { - level.Error(c.logger).Log("msg", "Error getting slurm job's uid 
number", "uid", name, "err", err) - } + // metric.jobuid, err = slurmMatch[1] + // if err != nil { + // level.Error(c.logger).Log("msg", "Error getting slurm job's uid number", "uid", name, "err", err) + // } metric.jobid = slurmMatch[2] metric.step = slurmMatch[4] metric.task = slurmMatch[6] @@ -389,7 +394,7 @@ func (c *slurmCollector) getCgroupsV1Metrics(name string) (CgroupMetric, error) metric.cpus = len(cpus) } c.getInfoV1(name, &metric) - metric.ujobid = c.getJobUniqueId(metric.jobid) + metric.jobuuid, metric.jobuid, metric.jobgid = c.getJobLabels(metric.jobid) return metric, nil } @@ -409,9 +414,7 @@ func (c *slurmCollector) getInfoV2(name string, metric *CgroupMetric) { // possibilities are /system.slice/slurmstepd.scope/job_211 // /system.slice/slurmstepd.scope/job_211/step_interactive // /system.slice/slurmstepd.scope/job_211/step_extern/user/task_0 - // we never ever get the uid - metric.uid = -1 - // nor is there a userslice + // we don't get a userslice here metric.userslice = false slurmPattern := regexp.MustCompile("^/system.slice/slurmstepd.scope/job_([0-9]+)(/step_([^/]+)(/user/task_([[0-9]+))?)?$") slurmMatch := slurmPattern.FindStringSubmatch(name) @@ -455,6 +458,6 @@ func (c *slurmCollector) getCgroupsV2Metrics(name string) (CgroupMetric, error) metric.cpus = len(cpus) } c.getInfoV2(name, &metric) - metric.ujobid = c.getJobUniqueId(metric.jobid) + metric.jobuuid, metric.jobuid, metric.jobgid = c.getJobLabels(metric.jobid) return metric, nil } diff --git a/collector/slurm_test.go b/collector/slurm_test.go index db43d970..9533f89b 100644 --- a/collector/slurm_test.go +++ b/collector/slurm_test.go @@ -14,7 +14,7 @@ import ( var expectedSlurmMetrics CgroupMetric func TestCgroupsV2SlurmJobMetrics(t *testing.T) { - if _, err := kingpin.CommandLine.Parse([]string{"--path.cgroupfs", "fixtures/sys/fs/cgroup"}); err != nil { + if _, err := kingpin.CommandLine.Parse([]string{"--path.cgroupfs", "fixtures/sys/fs/cgroup", "--collector.slurm.unique.jobid", 
"--collector.slurm.job.stat.path", "fixtures/slurmjobstat"}); err != nil { t.Fatal(err) } c := slurmCollector{cgroupV2: true, logger: log.NewNopLogger()} @@ -34,8 +34,12 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) { memswTotal: 0, memswFailCount: 0, userslice: false, - uid: -1, + jobuid: "1000", + jobgid: "1000", jobid: "1009248", + jobuuid: "8d4fad6d-c5e3-775b-8a8c-707e319114ec", + step: "", + task: "", batch: "slurm", err: false} if err != nil { @@ -47,7 +51,7 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) { } func TestCgroupsV1SlurmJobMetrics(t *testing.T) { - if _, err := kingpin.CommandLine.Parse([]string{"--path.cgroupfs", "fixtures/sys/fs/cgroup"}); err != nil { + if _, err := kingpin.CommandLine.Parse([]string{"--path.cgroupfs", "fixtures/sys/fs/cgroup", "--collector.slurm.unique.jobid", "--collector.slurm.job.stat.path", "fixtures/slurmjobstat"}); err != nil { t.Fatal(err) } c := slurmCollector{cgroupV2: false, logger: log.NewNopLogger()} @@ -67,8 +71,10 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) { memswTotal: 9.223372036854772e+18, memswFailCount: 0, userslice: false, - uid: 1000, + jobuid: "1000", + jobgid: "1000", jobid: "1009248", + jobuuid: "8d4fad6d-c5e3-775b-8a8c-707e319114ec", step: "", task: "", batch: "slurm", diff --git a/collector/utils.go b/collector/utils.go index 60165139..37849817 100644 --- a/collector/utils.go +++ b/collector/utils.go @@ -1,8 +1,6 @@ package collector import ( - "crypto/md5" - "encoding/hex" "encoding/json" "fmt" "io" @@ -13,6 +11,9 @@ import ( "strconv" "strings" "time" + + "github.com/google/uuid" + "github.com/zeebo/xxh3" ) const ( @@ -90,9 +91,15 @@ func GetRteEnergyMixData() (float64, error) { return float64(fields[0].TauxCo2), nil } -// Get md5 checksum for given slice of strings -func GetMD5CheckSum(stringSlice []string) string { +// Get a UUID-formatted string built from the 128-bit xxh3 hash of the comma-joined slice of strings (not an RFC 4122 version-5 UUID) +func GetUuidFromString(stringSlice []string) (string, error) { s := strings.Join(stringSlice[:], ",") - hash := 
md5.Sum([]byte(s)) - return hex.EncodeToString(hash[:]) + h := xxh3.HashString128(s).Bytes() + uuid, err := uuid.FromBytes(h[:]) + // hash := md5.Sum([]byte(s)) + // md5string := hex.EncodeToString(hash[:]) + // // generate the UUID from the + // // first 16 bytes of the MD5 hash + // uuid, err := uuid.FromBytes([]byte(md5string[0:16])) + return uuid.String(), err } diff --git a/go.mod b/go.mod index 7276e492..32b5af40 100644 --- a/go.mod +++ b/go.mod @@ -6,10 +6,12 @@ require ( github.com/alecthomas/kingpin/v2 v2.3.2 github.com/containerd/cgroups/v3 v3.0.2 github.com/go-kit/log v0.2.1 + github.com/google/uuid v1.4.0 github.com/prometheus/client_golang v1.17.0 github.com/prometheus/common v0.45.0 github.com/prometheus/exporter-toolkit v0.10.0 github.com/prometheus/procfs v0.12.0 + github.com/zeebo/xxh3 v1.0.2 ) require ( @@ -22,6 +24,7 @@ require ( github.com/godbus/dbus/v5 v5.1.0 // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/jpillora/backoff v1.0.0 // indirect + github.com/klauspost/cpuid/v2 v2.0.9 // indirect github.com/kr/text v0.2.0 // indirect github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect diff --git a/go.sum b/go.sum index c01fd6ea..7e784980 100644 --- a/go.sum +++ b/go.sum @@ -30,8 +30,12 @@ github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiu github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4= +github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= github.com/jpillora/backoff v1.0.0/go.mod 
h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= +github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= +github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -62,6 +66,10 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc= github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= +github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= +github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= +github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= +github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc= golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=