From 7901aa0a6db08051e10b656040705c96c2664bed Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Wed, 13 Dec 2023 15:50:55 +0100 Subject: [PATCH 1/8] feat: Get env vars from procfs * If prolog scripts are not executed, we attempt to get env vars from procs * Remove work dir from job id calculation as the env var is not available inside process * This needs CAP_SYS_PTRACE capability on binary to work Signed-off-by: Mahendra Paipuri --- pkg/collector/slurm.go | 86 ++++++++++++++++++++++++++----------- pkg/collector/slurm_test.go | 8 ++-- pkg/jobstats/slurm.go | 11 +++-- pkg/jobstats/slurm_test.go | 4 +- 4 files changed, 72 insertions(+), 37 deletions(-) diff --git a/pkg/collector/slurm.go b/pkg/collector/slurm.go index 69d60c40..0f915dc1 100644 --- a/pkg/collector/slurm.go +++ b/pkg/collector/slurm.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" "regexp" + "slices" "strconv" "strings" "sync" @@ -19,12 +20,13 @@ import ( "github.com/go-kit/log/level" "github.com/mahendrapaipuri/batchjob_monitoring/internal/helpers" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/procfs" ) const slurmCollectorSubsystem = "slurm_job" var ( - cgroupV2 = false + cgroupsV2 = false metricLock = sync.RWMutex{} collectJobSteps = kingpin.Flag( "collector.slurm.jobsteps.metrics", @@ -33,12 +35,12 @@ var ( ).Default("false").Bool() // useJobIdHash = kingpin.Flag( // "collector.slurm.unique.jobid", - // "Whether to calculate a hash based on job SLURM_JOBID, SLURM_JOB_UID, SLURM_JOB_ACCOUNT, SLURM_JOB_NODELIST, SLURM_JOB_WORKDIR to get unique job identifier.", + // "Whether to calculate a hash based on job SLURM_JOBID, SLURM_JOB_UID, SLURM_JOB_ACCOUNT, SLURM_JOB_NODELIST to get unique job identifier.", // ).Default("false").Bool() jobStatPath = kingpin.Flag( "collector.slurm.job.stat.path", `Path to jobstat files that contains a file for each job with line -\"$SLURM_JOB_UID $SLURM_JOB_ACCOUNT $SLURM_JOB_NODELIST $SLURM_JOB_WORKDIR\". +\"$SLURM_JOB_UID $SLURM_JOB_ACCOUNT $SLURM_JOB_NODELIST\". An deterministic UUID is computed on the variables in this file and job ID to get an unique job identifier.`, ).Default("/run/slurmjobstat").String() @@ -70,7 +72,7 @@ type CgroupMetric struct { } type slurmCollector struct { - cgroupV2 bool + cgroupsV2 bool cpuUser *prometheus.Desc cpuSystem *prometheus.Desc cpuTotal *prometheus.Desc @@ -94,13 +96,13 @@ func init() { // NewSlurmCollector returns a new Collector exposing a summary of cgroups. 
func NewSlurmCollector(logger log.Logger) (Collector, error) { if cgroups.Mode() == cgroups.Unified { - cgroupV2 = true + cgroupsV2 = true level.Info(logger).Log("msg", "Cgroup version v2 detected", "mount", *cgroupfsPath) } else { level.Info(logger).Log("msg", "Cgroup version v2 not detected, will proceed with v1.") } return &slurmCollector{ - cgroupV2: cgroupV2, + cgroupsV2: cgroupsV2, cpuUser: prometheus.NewDesc( prometheus.BuildFQName(namespace, "cpu", "user_seconds"), "Cumulative CPU user seconds", @@ -232,7 +234,7 @@ func (c *slurmCollector) getJobsMetrics() (map[string]CgroupMetric, error) { var metrics = make(map[string]CgroupMetric) var topPath string var fullPath string - if c.cgroupV2 { + if c.cgroupsV2 { topPath = *cgroupfsPath fullPath = topPath + "/system.slice/slurmstepd.scope" } else { @@ -278,7 +280,7 @@ func (c *slurmCollector) getJobsMetrics() (map[string]CgroupMetric, error) { // fix it by looking at the parent // we loop through names once as it was the result of Walk so top paths are seen first // also some cgroups we ignore, like path=/system.slice/slurmstepd.scope/job_216/step_interactive/user, hence the need to loop through multiple parents - if c.cgroupV2 { + if c.cgroupsV2 { for _, name := range names { metric, ok := metrics[name] if ok && metric.memoryTotal < 0 { @@ -298,7 +300,7 @@ func (c *slurmCollector) getJobsMetrics() (map[string]CgroupMetric, error) { // Get metrics of a given SLURM cgroups path func (c *slurmCollector) getMetrics(name string) (CgroupMetric, error) { - if c.cgroupV2 { + if c.cgroupsV2 { return c.getCgroupsV2Metrics(name) } else { return c.getCgroupsV1Metrics(name) @@ -343,7 +345,7 @@ func (c *slurmCollector) parseCpuSet(cpuset string) ([]string, error) { // Get list of CPUs in the cgroup func (c *slurmCollector) getCPUs(name string) ([]string, error) { var cpusPath string - if c.cgroupV2 { + if c.cgroupsV2 { cpusPath = fmt.Sprintf("%s%s/cpuset.cpus.effective", *cgroupfsPath, name) } else { cpusPath = fmt.Sprintf("%s/cpuset%s/cpuset.cpus", *cgroupfsPath, name) @@ -369,8 +371,7 @@ func (c *slurmCollector) getJobLabels(jobid string) (string, string, string) { var jobUuid string var jobUid string = "" var jobAccount string = "" - var jobNodelist = "" - var jobWorkDir = "" + var jobNodelist string = "" var slurmJobInfo = fmt.Sprintf("%s/%s", *jobStatPath, jobid) if _, err := os.Stat(slurmJobInfo); err == nil { content, err := os.ReadFile(slurmJobInfo) @@ -378,22 +379,57 @@ func (c *slurmCollector) getJobLabels(jobid string) (string, string, string) { level.Error(c.logger). Log("msg", "Failed to get metadata for job", "jobid", jobid, "err", err) } else { - fmt.Sscanf(string(content), "%s %s %s %s", &jobUid, &jobAccount, &jobNodelist, &jobWorkDir) + fmt.Sscanf(string(content), "%s %s %s", &jobUid, &jobAccount, &jobNodelist) } - jobUuid, err = helpers.GetUuidFromString( - []string{ - jobid, - jobUid, - strings.ToLower(jobAccount), - strings.ToLower(jobNodelist), - strings.ToLower(jobWorkDir), - }, - ) + } else { + // Attempt to get UID, Account, Nodelist from /proc file system by looking into + // environ for the process that has same SLURM_JOB_ID + allProcs, err := procfs.AllProcs() if err != nil { - level.Error(c.logger). 
- Log("msg", "Failed to generate UUID for job", "jobid", jobid, "err", err) - jobUuid = jobid + level.Error(c.logger).Log("msg", "Failed to read /proc", "err", err) + goto outside } + jobIDEnv := fmt.Sprintf("SLURM_JOB_ID=%s", jobid) + + // Iterate through all procs and look for SLURM_JOB_ID env entry + for _, proc := range allProcs { + environments, err := proc.Environ() + if err != nil { + continue + } + + // When env var entry found, get all necessary env vars + if slices.Contains(environments, jobIDEnv) { + for _, env := range environments { + if strings.Contains(env, "SLURM_JOB_UID") { + jobUid = strings.Split(env, "=")[1] + } + if strings.Contains(env, "SLURM_JOB_ACCOUNT") { + jobAccount = strings.Split(env, "=")[1] + } + if strings.Contains(env, "SLURM_JOB_NODELIST") { + jobNodelist = strings.Split(env, "=")[1] + } + } + + // Break loop once all env vars are read + goto outside + } + } + } +outside: + jobUuid, err := helpers.GetUuidFromString( + []string{ + strings.TrimSpace(jobid), + strings.TrimSpace(jobUid), + strings.ToLower(strings.TrimSpace(jobAccount)), + strings.ToLower(strings.TrimSpace(jobNodelist)), + }, + ) + if err != nil { + level.Error(c.logger). + Log("msg", "Failed to generate UUID for job", "jobid", jobid, "err", err) + jobUuid = jobid } return jobUuid, jobUid, jobAccount } diff --git a/pkg/collector/slurm_test.go b/pkg/collector/slurm_test.go index 24a5c5e6..8553d235 100644 --- a/pkg/collector/slurm_test.go +++ b/pkg/collector/slurm_test.go @@ -22,7 +22,7 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) { ); err != nil { t.Fatal(err) } - c := slurmCollector{cgroupV2: true, logger: log.NewNopLogger()} + c := slurmCollector{cgroupsV2: true, logger: log.NewNopLogger()} metrics, err := c.getJobsMetrics() expectedSlurmMetrics = CgroupMetric{ name: "/system.slice/slurmstepd.scope/job_1009248", @@ -42,7 +42,7 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) { jobuid: "1000", jobaccount: "testacc", jobid: "1009248", - jobuuid: "7f6c39b7-2740-fc1f-32c2-8fc28880829c", + jobuuid: "ac28caf5-ce6c-35f6-73fb-47d9d43f7780", step: "", task: "", batch: "slurm", @@ -64,7 +64,7 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) { ); err != nil { t.Fatal(err) } - c := slurmCollector{cgroupV2: false, logger: log.NewNopLogger()} + c := slurmCollector{cgroupsV2: false, logger: log.NewNopLogger()} metrics, err := c.getJobsMetrics() expectedSlurmMetrics = CgroupMetric{ name: "/slurm/uid_1000/job_1009248", @@ -84,7 +84,7 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) { jobuid: "1000", jobaccount: "testacc", jobid: "1009248", - jobuuid: "7f6c39b7-2740-fc1f-32c2-8fc28880829c", + jobuuid: "ac28caf5-ce6c-35f6-73fb-47d9d43f7780", step: "", task: "", batch: "slurm", diff --git a/pkg/jobstats/slurm.go b/pkg/jobstats/slurm.go index f41b35b2..56f0e2e9 100644 --- a/pkg/jobstats/slurm.go +++ b/pkg/jobstats/slurm.go @@ -58,14 +58,13 @@ func parseSacctCmdOutput(sacctOutput string, logger log.Logger) ([]BatchJob, int wg.Done() return } - // Generate UUID from jobID, uid, account, nodelist(lowercase), workdir(lowercase) + // Generate UUID from jobID, uid, account, nodelist(lowercase) jobUuid, err := helpers.GetUuidFromString( []string{ - components[0], - components[6], - strings.ToLower(components[2]), - strings.ToLower(components[14]), - strings.ToLower(components[16]), + strings.TrimSpace(components[0]), + strings.TrimSpace(components[6]), + strings.ToLower(strings.TrimSpace(components[2])), + strings.ToLower(strings.TrimSpace(components[14])), }, ) if err != nil { diff --git 
a/pkg/jobstats/slurm_test.go b/pkg/jobstats/slurm_test.go index eb63ea2c..1a3a70f6 100644 --- a/pkg/jobstats/slurm_test.go +++ b/pkg/jobstats/slurm_test.go @@ -15,7 +15,7 @@ var ( expectedBatchJobs = []BatchJob{ { Jobid: "1479763", - Jobuuid: "667127ae-68d0-47aa-78e3-76ae76e4aba7", + Jobuuid: "a3bd0ca1-5021-7e4d-943e-9529f8390f05", Partition: "part1", Account: "acc1", Grp: "grp", @@ -34,7 +34,7 @@ var ( }, { Jobid: "1481508", - Jobuuid: "6bfbe049-b5ee-ea02-6cd8-3dd95acdd0e4", + Jobuuid: "759a4e2e-1e47-c58b-3d1b-885a03ca323b", Partition: "part1", Account: "acc1", Grp: "grp", From 608b39797e03e2dc6ad6b1015db05c036736effc Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Wed, 13 Dec 2023 15:51:12 +0100 Subject: [PATCH 2/8] style: Minor improvement for readability Signed-off-by: Mahendra Paipuri --- pkg/collector/nvidia_gpus.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/collector/nvidia_gpus.go b/pkg/collector/nvidia_gpus.go index 65d80daf..702ad376 100644 --- a/pkg/collector/nvidia_gpus.go +++ b/pkg/collector/nvidia_gpus.go @@ -64,26 +64,34 @@ func getAllDevices(logger log.Logger) ([]Device, error) { Log("msg", "nvidia-smi command to get list of devices failed", "err", err) return nil, err } + + // Get all devices allDevices := []Device{} for _, line := range strings.Split(string(nvidiaSmiOutput), "\n") { // Header line if strings.HasPrefix(line, "name") { continue } + devDetails := strings.Split(line, ",") if len(devDetails) < 2 { level.Error(logger). Log("msg", "Cannot parse output from nvidia-smi command", "output", line) continue } + + // Get device name and UUID devName := strings.TrimSpace(devDetails[0]) devUuid := strings.TrimSpace(devDetails[1]) + + // Check if device is in MiG mode isMig := false if strings.HasPrefix(devUuid, "MIG") { isMig = true } level.Debug(logger). 
Log("msg", "Found nVIDIA GPU", "name", devName, "UUID", devUuid, "isMig:", isMig) + allDevices = append(allDevices, Device{name: devName, uuid: devUuid, isMig: isMig}) } return allDevices, nil From 607c953877cfd477666ca52d7c80f076a0419348 Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Wed, 13 Dec 2023 15:51:23 +0100 Subject: [PATCH 3/8] test: Update test fixtures Signed-off-by: Mahendra Paipuri --- .../fixtures/e2e-test-cgroupsv1-output.txt | 24 +++++++++---------- .../fixtures/e2e-test-cgroupsv2-output.txt | 24 +++++++++---------- .../fixtures/e2e-test-stats-server-output.txt | 2 +- pkg/jobstats/fixtures/jobstats.dump | 4 ++-- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt b/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt index 6b87f4c1..4b52ddde 100644 --- a/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt +++ b/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt @@ -1,15 +1,15 @@ # HELP batchjob_cpu_system_seconds Cumulative CPU system seconds # TYPE batchjob_cpu_system_seconds gauge -batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0.45 +batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.45 # HELP batchjob_cpu_total_seconds Cumulative CPU total seconds # TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 1.012410966 +batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.012410966 # HELP batchjob_cpu_user_seconds Cumulative CPU user seconds # TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0.39 +batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.39 # HELP batchjob_cpus Number of CPUs # TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0 +batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. 
# TYPE batchjob_exporter_build_info gauge # HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts @@ -17,28 +17,28 @@ batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39 batchjob_ipmi_dcmi_watts_total 332 # HELP batchjob_memory_cache_bytes Memory cache used in bytes # TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 2.1086208e+07 +batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.1086208e+07 # HELP batchjob_memory_fail_count Memory fail count # TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0 +batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memory_rss_bytes Memory RSS used in bytes # TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 1.0407936e+07 +batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.0407936e+07 # HELP batchjob_memory_total_bytes Memory total in bytes # TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 2.01362030592e+11 +batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.01362030592e+11 # HELP batchjob_memory_used_bytes Memory used in bytes # TYPE batchjob_memory_used_bytes gauge -batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 4.0194048e+07 +batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.0194048e+07 # HELP batchjob_memsw_fail_count Swap fail count # TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0 +batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memsw_total_bytes Swap total in bytes # TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 9.223372036854772e+18 +batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 9.223372036854772e+18 # HELP batchjob_memsw_used_bytes Swap used in bytes # TYPE batchjob_memsw_used_bytes gauge -batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 4.032512e+07 +batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.032512e+07 # HELP batchjob_nvidia_gpu_jobid Batch Job ID of current 
nVIDIA GPU # TYPE batchjob_nvidia_gpu_jobid gauge batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 diff --git a/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt b/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt index df5cded4..56b3eb88 100644 --- a/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt +++ b/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt @@ -1,15 +1,15 @@ # HELP batchjob_cpu_system_seconds Cumulative CPU system seconds # TYPE batchjob_cpu_system_seconds gauge -batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 115.777502 +batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502 # HELP batchjob_cpu_total_seconds Cumulative CPU total seconds # TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 60491.070351 +batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351 # HELP batchjob_cpu_user_seconds Cumulative CPU user seconds # TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 60375.292848 +batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848 # HELP batchjob_cpus Number of CPUs # TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 2 +batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. 
# TYPE batchjob_exporter_build_info gauge # HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts @@ -17,28 +17,28 @@ batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39 batchjob_ipmi_dcmi_watts_total 332 # HELP batchjob_memory_cache_bytes Memory cache used in bytes # TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0 +batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memory_fail_count Memory fail count # TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0 +batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memory_rss_bytes Memory RSS used in bytes # TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 4.098592768e+09 +batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09 # HELP batchjob_memory_total_bytes Memory total in bytes # TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 4.294967296e+09 +batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09 # HELP batchjob_memory_used_bytes Memory used in bytes # TYPE batchjob_memory_used_bytes gauge -batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 4.111491072e+09 +batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09 # HELP batchjob_memsw_fail_count Swap fail count # TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0 +batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memsw_total_bytes Swap total in bytes # TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0 +batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memsw_used_bytes Swap used in bytes # TYPE batchjob_memsw_used_bytes gauge -batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0 +batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU # TYPE batchjob_nvidia_gpu_jobid gauge 
batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 diff --git a/pkg/jobstats/fixtures/e2e-test-stats-server-output.txt b/pkg/jobstats/fixtures/e2e-test-stats-server-output.txt index 1cbb435f..a791db98 100644 --- a/pkg/jobstats/fixtures/e2e-test-stats-server-output.txt +++ b/pkg/jobstats/fixtures/e2e-test-stats-server-output.txt @@ -1 +1 @@ -{"status":"success","errorType":"","error":"","warnings":null,"data":[{"jobid":"1479763","id":"667127ae-68d0-47aa-78e3-76ae76e4aba7","partition":"part1","account":"acc1","group":"grp","gid":"1000","user":"usr","uid":"1000","submit":"2023-02-21T14:37:02","start":"2023-02-21T14:37:07","end":"2023-02-21T15:26:29","elapsed":"00:49:22","exitcode":"0:0","state":"CANCELLED by 1000","nnodes":"1","nodelist":"compute-0","nodelistexp":"compute-0"},{"jobid":"1481508","id":"6bfbe049-b5ee-ea02-6cd8-3dd95acdd0e4","partition":"part1","account":"acc1","group":"grp","gid":"1000","user":"usr","uid":"1000","submit":"2023-02-21T15:48:20","start":"2023-02-21T15:49:06","end":"2023-02-21T15:57:23","elapsed":"00:08:17","exitcode":"0:0","state":"CANCELLED by 1000","nnodes":"2","nodelist":"compute-[0-2]","nodelistexp":"compute-0|compute-1|compute-2"}]} +{"status":"success","errorType":"","error":"","warnings":null,"data":[{"jobid":"1479763","id":"a3bd0ca1-5021-7e4d-943e-9529f8390f05","partition":"part1","account":"acc1","group":"grp","gid":"1000","user":"usr","uid":"1000","submit":"2023-02-21T14:37:02","start":"2023-02-21T14:37:07","end":"2023-02-21T15:26:29","elapsed":"00:49:22","exitcode":"0:0","state":"CANCELLED by 1000","nnodes":"1","nodelist":"compute-0","nodelistexp":"compute-0"},{"jobid":"1481508","id":"759a4e2e-1e47-c58b-3d1b-885a03ca323b","partition":"part1","account":"acc1","group":"grp","gid":"1000","user":"usr","uid":"1000","submit":"2023-02-21T15:48:20","start":"2023-02-21T15:49:06","end":"2023-02-21T15:57:23","elapsed":"00:08:17","exitcode":"0:0","state":"CANCELLED by 1000","nnodes":"2","nodelist":"compute-[0-2]","nodelistexp":"compute-0|compute-1|compute-2"}]} diff --git a/pkg/jobstats/fixtures/jobstats.dump b/pkg/jobstats/fixtures/jobstats.dump index 66fd5ebc..cfae7695 100644 --- a/pkg/jobstats/fixtures/jobstats.dump +++ b/pkg/jobstats/fixtures/jobstats.dump @@ -20,8 +20,8 @@ CREATE TABLE jobs ( "Nodelist" TEXT, "NodelistExp" TEXT ); -INSERT INTO jobs VALUES(1,'1479763','667127ae-68d0-47aa-78e3-76ae76e4aba7','part1','acc1','grp','1000','usr','1000','2023-02-21T14:37:02','2023-02-21T14:37:07','2023-02-21T15:26:29','00:49:22','0:0','CANCELLED by 1000','1','compute-0','compute-0'); -INSERT INTO jobs VALUES(2,'1481508','6bfbe049-b5ee-ea02-6cd8-3dd95acdd0e4','part1','acc1','grp','1000','usr','1000','2023-02-21T15:48:20','2023-02-21T15:49:06','2023-02-21T15:57:23','00:08:17','0:0','CANCELLED by 1000','2','compute-[0-2]','compute-0|compute-1|compute-2'); +INSERT INTO jobs VALUES(1,'1479763','a3bd0ca1-5021-7e4d-943e-9529f8390f05','part1','acc1','grp','1000','usr','1000','2023-02-21T14:37:02','2023-02-21T14:37:07','2023-02-21T15:26:29','00:49:22','0:0','CANCELLED by 1000','1','compute-0','compute-0'); +INSERT INTO jobs VALUES(2,'1481508','759a4e2e-1e47-c58b-3d1b-885a03ca323b','part1','acc1','grp','1000','usr','1000','2023-02-21T15:48:20','2023-02-21T15:49:06','2023-02-21T15:57:23','00:08:17','0:0','CANCELLED by 1000','2','compute-[0-2]','compute-0|compute-1|compute-2'); DELETE FROM sqlite_sequence; INSERT INTO sqlite_sequence VALUES('jobs',2); CREATE INDEX i1 ON jobs (Usr,Account,Start); From 19933871a7448847874808e9a292e4ae8e8fefbf Mon Sep 17 
00:00:00 2001
From: Mahendra Paipuri
Date: Wed, 13 Dec 2023 16:35:58 +0100
Subject: [PATCH 4/8] refactor: Create file with ordinal number

* Support only ordinal numbers for the moment
* The current implementation might not work for MIG devices
* We need to have a fallback to check for a file name with the UUID

Signed-off-by: Mahendra Paipuri
---
 pkg/collector/nvidia_gpus.go      | 27 +++++++++++++++++----------
 pkg/collector/nvidia_gpus_test.go | 18 +++++++++++++-----
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/pkg/collector/nvidia_gpus.go b/pkg/collector/nvidia_gpus.go
index 702ad376..204f6d07 100644
--- a/pkg/collector/nvidia_gpus.go
+++ b/pkg/collector/nvidia_gpus.go
@@ -25,6 +25,7 @@ var (
 )

 type Device struct {
+	index string
 	name  string
 	uuid  string
 	isMig bool
@@ -57,7 +58,7 @@ func init() {
 //
 // NOTE: Hoping this command returns MIG devices too
 func getAllDevices(logger log.Logger) ([]Device, error) {
-	args := []string{"--query-gpu=name,uuid", "--format=csv"}
+	args := []string{"--query-gpu=index,name,uuid", "--format=csv"}
 	nvidiaSmiOutput, err := helpers.Execute("nvidia-smi", args, logger)
 	if err != nil {
 		level.Error(logger).
@@ -69,20 +70,21 @@ func getAllDevices(logger log.Logger) ([]Device, error) {
 	allDevices := []Device{}
 	for _, line := range strings.Split(string(nvidiaSmiOutput), "\n") {
 		// Header line
-		if strings.HasPrefix(line, "name") {
+		if strings.HasPrefix(line, "index") {
 			continue
 		}

 		devDetails := strings.Split(line, ",")
-		if len(devDetails) < 2 {
+		if len(devDetails) < 3 {
 			level.Error(logger).
 				Log("msg", "Cannot parse output from nvidia-smi command", "output", line)
 			continue
 		}

-		// Get device name and UUID
-		devName := strings.TrimSpace(devDetails[0])
-		devUuid := strings.TrimSpace(devDetails[1])
+		// Get device index, name and UUID
+		devIndx := strings.TrimSpace(devDetails[0])
+		devName := strings.TrimSpace(devDetails[1])
+		devUuid := strings.TrimSpace(devDetails[2])

 		// Check if device is in MiG mode
 		isMig := false
@@ -92,7 +94,7 @@ func getAllDevices(logger log.Logger) ([]Device, error) {
 		level.Debug(logger).
 			Log("msg", "Found nVIDIA GPU", "name", devName, "UUID", devUuid, "isMig:", isMig)

-		allDevices = append(allDevices, Device{name: devName, uuid: devUuid, isMig: isMig})
+		allDevices = append(allDevices, Device{index: devIndx, name: devName, uuid: devUuid, isMig: isMig})
 	}
 	return allDevices, nil
 }
@@ -128,13 +130,18 @@ func (c *nvidiaGpuJobMapCollector) getJobId() (map[string]float64, error) {
 	gpuJobMapper := make(map[string]float64)
 	for _, dev := range c.devices {
 		var jobId int64 = 0
-		var slurmInfo string = fmt.Sprintf("%s/%s", *gpuStatPath, dev.uuid)
+		var slurmInfo string = fmt.Sprintf("%s/%s", *gpuStatPath, dev.index)
+
+		// NOTE: Look for file name with UUID as it will be more appropriate with
+		// MIG instances.
+		// If /run/gpustat/0 file is not found, check for the file with UUID as name?
 		if _, err := os.Stat(slurmInfo); err == nil {
 			content, err := os.ReadFile(slurmInfo)
 			if err != nil {
-				level.Error(c.logger).
- Log("msg", "Failed to get job ID for GPU", "name", dev.uuid, "err", err) + level.Error(c.logger).Log( + "msg", "Failed to get job ID for GPU", + "index", dev.index, "uuid", dev.uuid, "err", err, + ) gpuJobMapper[dev.uuid] = float64(0) } fmt.Sscanf(string(content), "%d", &jobId) diff --git a/pkg/collector/nvidia_gpus_test.go b/pkg/collector/nvidia_gpus_test.go index 23e58f68..323adc19 100644 --- a/pkg/collector/nvidia_gpus_test.go +++ b/pkg/collector/nvidia_gpus_test.go @@ -11,11 +11,19 @@ import ( ) var ( - devices = []Device{{name: "fakeGpu1", - uuid: "GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e", - isMig: false}, {name: "fakeGpu2", - uuid: "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3", - isMig: false}} + devices = []Device{ + { + index: "0", + name: "fakeGpu1", + uuid: "GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e", + isMig: false, + }, { + index: "1", + name: "fakeGpu2", + uuid: "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3", + isMig: false, + }, + } ) func TestNvidiaJobGpuMap(t *testing.T) { From 405f5f2e1c055da99c4c0c84883ad625a31d0a5a Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Wed, 13 Dec 2023 16:36:09 +0100 Subject: [PATCH 5/8] test: Update test fixtures Signed-off-by: Mahendra Paipuri --- .../gpustat/{GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e => 0} | 0 .../gpustat/{GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 => 1} | 0 pkg/collector/fixtures/nvidia-smi | 6 +++--- 3 files changed, 3 insertions(+), 3 deletions(-) rename pkg/collector/fixtures/gpustat/{GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e => 0} (100%) rename pkg/collector/fixtures/gpustat/{GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 => 1} (100%) diff --git a/pkg/collector/fixtures/gpustat/GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e b/pkg/collector/fixtures/gpustat/0 similarity index 100% rename from pkg/collector/fixtures/gpustat/GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e rename to pkg/collector/fixtures/gpustat/0 diff --git a/pkg/collector/fixtures/gpustat/GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 b/pkg/collector/fixtures/gpustat/1 similarity index 100% rename from pkg/collector/fixtures/gpustat/GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 rename to pkg/collector/fixtures/gpustat/1 diff --git a/pkg/collector/fixtures/nvidia-smi b/pkg/collector/fixtures/nvidia-smi index 59b383fb..9e030652 100755 --- a/pkg/collector/fixtures/nvidia-smi +++ b/pkg/collector/fixtures/nvidia-smi @@ -1,5 +1,5 @@ #!/bin/bash -echo """name, uuid -Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e -Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3""" +echo """index, name, uuid +0, Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e +1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3""" From 667231a7a69fdb51eb433033c0b826558c6281ee Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Wed, 13 Dec 2023 16:53:12 +0100 Subject: [PATCH 6/8] ci: Bump go version in lint workflow Signed-off-by: Mahendra Paipuri --- .github/workflows/golangci-lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index c545dc7d..ee436398 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -22,7 +22,7 @@ jobs: - name: Install Go uses: actions/setup-go@v3 with: - go-version: 1.20.x + go-version: 1.21.x - name: Lint uses: golangci/golangci-lint-action@v3 From a41800d5eb9c4fb50be5b95d660cf6ffbcea7431 Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Thu, 14 Dec 2023 11:30:31 +0100 Subject: [PATCH 7/8] feat: Support 
running ext cmds as different users

* Now ipmi can be run as root directly in the exporter
* Users can still use the wrapper approach if they want

Signed-off-by: Mahendra Paipuri
---
 internal/helpers/helpers.go | 16 +++++++++++++++-
 pkg/collector/ipmi.go       | 26 +++++++++++++++++++++-----
 scripts/e2e-test.sh         |  2 +-
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/internal/helpers/helpers.go b/internal/helpers/helpers.go
index 47e3b48a..32813308 100644
--- a/internal/helpers/helpers.go
+++ b/internal/helpers/helpers.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"os/exec"
 	"strings"
+	"syscall"

 	"github.com/go-kit/log"
 	"github.com/go-kit/log/level"
@@ -24,7 +25,20 @@ func Execute(cmd string, args []string, logger log.Logger) ([]byte, error) {
 	level.Debug(logger).Log("msg", "Executing", "command", cmd, "args", fmt.Sprintf("%+v", args))
 	out, err := exec.Command(cmd, args...).CombinedOutput()
 	if err != nil {
-		err = fmt.Errorf("error running %s: %s", cmd, err)
+		level.Error(logger).Log("msg", "Error executing command", "command", cmd, "args", fmt.Sprintf("%+v", args), "err", err)
+	}
+	return out, err
+}
+
+// Execute command as a given UID and GID and return stdout/stderr
+func ExecuteAs(cmd string, args []string, uid int, gid int, logger log.Logger) ([]byte, error) {
+	level.Debug(logger).Log("msg", "Executing as user", "command", cmd, "args", fmt.Sprintf("%+v", args), "uid", uid, "gid", gid)
+	execCmd := exec.Command(cmd, args...)
+	execCmd.SysProcAttr = &syscall.SysProcAttr{}
+	execCmd.SysProcAttr.Credential = &syscall.Credential{Uid: uint32(uid), Gid: uint32(gid)}
+	out, err := execCmd.CombinedOutput()
+	if err != nil {
+		level.Error(logger).Log("msg", "Error executing command as user", "command", cmd, "args", fmt.Sprintf("%+v", args), "uid", uid, "gid", gid, "err", err)
 	}
 	return out, err
 }
diff --git a/pkg/collector/ipmi.go b/pkg/collector/ipmi.go
index 88cbc64b..031fa7d1 100644
--- a/pkg/collector/ipmi.go
+++ b/pkg/collector/ipmi.go
@@ -28,10 +28,14 @@ type impiCollector struct {
 }

 var (
-	ipmiDcmiWrapperExec = kingpin.Flag(
-		"collector.ipmi.dcmi.wrapper.path",
-		"Path to IPMI DCMI executable wrapper.",
+	ipmiDcmiExec = kingpin.Flag(
+		"collector.ipmi.dcmi.exec.path",
+		"Path to IPMI DCMI executable.",
 	).Default("ipmi-dcmi-wrapper").String()
+	ipmiDcmiExecAsRoot = kingpin.Flag(
+		"collector.ipmi.dcmi.exec.run.as.root",
+		"Execute IPMI DCMI command as root. This requires batchjob_exporter to run as root or to have appropriate capabilities (cap_setuid).",
+	).Default("false").Bool()
 	ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(
 		`^Power Measurement\s*:\s*(?PActive|Not\sAvailable).*`,
 	)
@@ -78,16 +82,27 @@ func getValue(ipmiOutput []byte, regex *regexp.Regexp) (string, error) {

 // Update implements Collector and exposes IPMI DCMI power related metrics.
 func (c *impiCollector) Update(ch chan<- prometheus.Metric) error {
-	args := []string{""}
-	stdOut, err := helpers.Execute(*ipmiDcmiWrapperExec, args, c.logger)
+	args := []string{"--get-system-power-statistics"}
+	var stdOut []byte
+	var err error
+
+	// Execute ipmi-dcmi command
+	if *ipmiDcmiExecAsRoot {
+		stdOut, err = helpers.ExecuteAs(*ipmiDcmiExec, args, 0, 0, c.logger)
+	} else {
+		stdOut, err = helpers.Execute(*ipmiDcmiExec, args, c.logger)
+	}
 	if err != nil {
 		return err
 	}
+
+	// Parse power consumption from output
 	currentPowerConsumption, err := c.getCurrentPowerConsumption(stdOut)
 	if err != nil {
 		level.Error(c.logger).Log("msg", "Failed to collect IPMI DCMI data", "error", err)
 		return err
 	}
+
 	// Returned value negative == Power Measurement is not avail
 	if currentPowerConsumption > -1 {
 		ch <- prometheus.MustNewConstMetric(c.wattsMetricDesc, prometheus.CounterValue, float64(currentPowerConsumption))
@@ -102,6 +117,7 @@ func (c *impiCollector) getCurrentPowerConsumption(ipmiOutput []byte) (float64,
 	if err != nil {
 		return -1, err
 	}
+
 	// When Power Measurement in 'Active' state - we can get watts
 	if value == "Active" {
 		value, err := getValue(ipmiOutput, ipmiDCMICurrentPowerRegex)
diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh
index 1d1e73a2..c3cc16e6 100755
--- a/scripts/e2e-test.sh
+++ b/scripts/e2e-test.sh
@@ -118,7 +118,7 @@ then
     --path.sysfs="pkg/collector/fixtures/sys" \
     --path.cgroupfs="pkg/collector/fixtures/sys/fs/cgroup" \
     --collector.slurm.job.stat.path="pkg/collector/fixtures/slurmjobstat" \
-    --collector.ipmi.dcmi.wrapper.path="pkg/collector/fixtures/ipmi-dcmi-wrapper.sh" \
+    --collector.ipmi.dcmi.exec.path="pkg/collector/fixtures/ipmi-dcmi-wrapper.sh" \
     --collector.nvidia_gpu \
     --collector.nvidia.gpu.stat.path="pkg/collector/fixtures/gpustat" \
     --web.listen-address "127.0.0.1:${port}" \

From db8aaf4b380ece3fa85af56619a538ce35d80872 Mon Sep 17 00:00:00 2001
From: Mahendra Paipuri
Date: Thu, 14 Dec 2023 11:31:16 +0100
Subject: [PATCH 8/8] style: Fix gofmt errors

Signed-off-by: Mahendra Paipuri
---
 pkg/collector/ipmi.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/collector/ipmi.go b/pkg/collector/ipmi.go
index 031fa7d1..b0530228 100644
--- a/pkg/collector/ipmi.go
+++ b/pkg/collector/ipmi.go
@@ -102,7 +102,7 @@ func (c *impiCollector) Update(ch chan<- prometheus.Metric) error {
 		level.Error(c.logger).Log("msg", "Failed to collect IPMI DCMI data", "error", err)
 		return err
 	}
-
+
 	// Returned value negative == Power Measurement is not avail
 	if currentPowerConsumption > -1 {
 		ch <- prometheus.MustNewConstMetric(c.wattsMetricDesc, prometheus.CounterValue, float64(currentPowerConsumption))
@@ -117,7 +117,7 @@ func (c *impiCollector) getCurrentPowerConsumption(ipmiOutput []byte) (float64,
 	if err != nil {
 		return -1, err
 	}
-
+
 	// When Power Measurement in 'Active' state - we can get watts
 	if value == "Active" {
 		value, err := getValue(ipmiOutput, ipmiDCMICurrentPowerRegex)
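
For reference, the /proc-based lookup that PATCH 1/8 adds to slurmCollector.getJobLabels boils down to the following minimal, hypothetical standalone sketch (it is not part of the patches; the function name lookupSlurmJobEnv and the hard-coded job ID are illustrative only). It scans every process environment via github.com/prometheus/procfs for a matching SLURM_JOB_ID and pulls the job's UID, account and nodelist out of it; as the commit message notes, reading other users' environments requires the exporter to run as root or to carry CAP_SYS_PTRACE.

package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/procfs"
)

// lookupSlurmJobEnv returns selected SLURM_* variables of the first process
// whose environment contains SLURM_JOB_ID=<jobid>.
func lookupSlurmJobEnv(jobid string) (map[string]string, error) {
	allProcs, err := procfs.AllProcs()
	if err != nil {
		return nil, err
	}
	want := "SLURM_JOB_ID=" + jobid
	keys := []string{"SLURM_JOB_UID", "SLURM_JOB_ACCOUNT", "SLURM_JOB_NODELIST"}
	for _, proc := range allProcs {
		// Reading /proc/<pid>/environ of other users needs root or CAP_SYS_PTRACE;
		// processes we cannot read are simply skipped.
		environ, err := proc.Environ()
		if err != nil {
			continue
		}
		found := false
		vars := make(map[string]string)
		for _, env := range environ {
			if env == want {
				found = true
			}
			for _, key := range keys {
				if strings.HasPrefix(env, key+"=") {
					vars[key] = strings.TrimPrefix(env, key+"=")
				}
			}
		}
		if found {
			return vars, nil
		}
	}
	return nil, fmt.Errorf("no process found for SLURM job %s", jobid)
}

func main() {
	// The job ID here is only an example value.
	vars, err := lookupSlurmJobEnv("1009248")
	fmt.Println(vars, err)
}

In the patch itself this logic only runs as a fallback when the jobstat file written by the prolog script is missing, and the recovered values are fed into helpers.GetUuidFromString together with the job ID to build the job UUID.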