From 6cf9264caf0e02f03694ccb8fef26f023b1b0e7b Mon Sep 17 00:00:00 2001
From: Mahendra Paipuri
Date: Thu, 28 Dec 2023 12:21:30 +0100
Subject: [PATCH 1/2] fix: Correct env var name for getting gpu index

* Add new CLI arg to set nvidia-smi path

Signed-off-by: Mahendra Paipuri
---
 pkg/collector/nvidia_gpus.go | 51 +++++++++++-------------------
 scripts/e2e-test.sh          |  3 ++-
 2 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/pkg/collector/nvidia_gpus.go b/pkg/collector/nvidia_gpus.go
index 68ac26a5..76ee6a30 100644
--- a/pkg/collector/nvidia_gpus.go
+++ b/pkg/collector/nvidia_gpus.go
@@ -22,6 +22,10 @@ const nvidiaGpuJobMapCollectorSubsystem = "nvidia_gpu"
 
 var (
 	jobMapLock = sync.RWMutex{}
+	nvidiaSmiPath = BatchJobExporterApp.Flag(
+		"collector.nvidia.smi.path",
+		"Absolute path to nvidia-smi executable.",
+	).Default("/usr/bin/nvidia-smi").String()
 	gpuStatPath = BatchJobExporterApp.Flag(
 		"collector.nvidia.gpu.job.map.path",
 		"Path to file that maps GPU ordinals to job IDs.",
@@ -62,8 +66,15 @@ func init() {
 //
 // NOTE: Hoping this command returns MIG devices too
 func getAllDevices(logger log.Logger) ([]Device, error) {
+	// Check if nvidia-smi binary exists
+	if _, err := os.Stat(*nvidiaSmiPath); err != nil {
+		level.Error(logger).Log("msg", "Failed to open nvidia-smi executable", "path", *nvidiaSmiPath, "err", err)
+		return nil, err
+	}
+
+	// Execute nvidia-smi command to get available GPUs
 	args := []string{"--query-gpu=index,name,uuid", "--format=csv"}
-	nvidiaSmiOutput, err := helpers.Execute("nvidia-smi", args, logger)
+	nvidiaSmiOutput, err := helpers.Execute(*nvidiaSmiPath, args, logger)
 	if err != nil {
 		level.Error(logger).
 			Log("msg", "nvidia-smi command to get list of devices failed", "err", err)
@@ -73,8 +84,8 @@ func getAllDevices(logger log.Logger) ([]Device, error) {
 	// Get all devices
 	allDevices := []Device{}
 	for _, line := range strings.Split(string(nvidiaSmiOutput), "\n") {
-		// Header line
-		if strings.HasPrefix(line, "index") {
+		// Header line, empty line and newlines are ignored
+		if line == "" || line == "\n" || strings.HasPrefix(line, "index") {
 			continue
 		}
 
@@ -196,7 +207,7 @@ func (c *nvidiaGpuJobMapCollector) getJobId() (map[string]float64, error) {
 			// and SLURM_JOB_ID
 			for _, env := range environments {
 				// Check both SLURM_SETP_GPUS and SLURM_JOB_GPUS vars
-				if strings.Contains(env, "SLURM_STEP_GPUS") || strings.Contains(env, "SLURM_JOBS_GPUS") {
+				if strings.Contains(env, "SLURM_STEP_GPUS") || strings.Contains(env, "SLURM_JOB_GPUS") {
 					gpuIndices = strings.Split(strings.Split(env, "=")[1], ",")
 				}
 				if strings.Contains(env, "SLURM_JOB_ID") {
@@ -219,38 +230,6 @@ func (c *nvidiaGpuJobMapCollector) getJobId() (map[string]float64, error) {
 			wg.Done()
 		}(proc)
 
-		// environments, err := proc.Environ()
-		// if err != nil {
-		// 	continue
-		// }
-
-		// var gpuIndices []string
-		// var slurmJobId string = ""
-
-		// // Loop through all env vars and get SLURM_SETP_GPUS/SLURM_JOB_GPUS
-		// // and SLURM_JOB_ID
-		// for _, env := range environments {
-		// 	// Check both SLURM_SETP_GPUS and SLURM_JOB_GPUS vars and only when
-		// 	// gpuIndices is empty.
-		// 	// We dont want an empty env var to override already populated
-		// 	// gpuIndices slice
-		// 	if (strings.Contains(env, "SLURM_STEP_GPUS") || strings.Contains(env, "SLURM_JOBS_GPUS")) && len(gpuIndices) == 0 {
-		// 		gpuIndices = strings.Split(strings.Split(env, "=")[1], ",")
-		// 	}
-		// 	if strings.Contains(env, "SLURM_JOB_ID") {
-		// 		slurmJobId = strings.Split(env, "=")[1]
-		// 	}
-		// }
-
-		// // If gpuIndices has current GPU index, assign the jobID and break loop
-		// if slices.Contains(gpuIndices, dev.index) {
-		// 	jid, err := strconv.Atoi(slurmJobId)
-		// 	if err != nil {
-		// 		gpuJobMapper[dev.uuid] = float64(0)
-		// 	}
-		// 	gpuJobMapper[dev.uuid] = float64(jid)
-		// 	goto outside
-		// }
 	}
 
 	// Wait for all go routines
diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh
index b05c356a..bfab4eae 100755
--- a/scripts/e2e-test.sh
+++ b/scripts/e2e-test.sh
@@ -109,13 +109,14 @@ then
     exit 1
 fi
 
-    PATH=$PWD/pkg/collector/fixtures:$PATH ./bin/batchjob_exporter \
+    ./bin/batchjob_exporter \
         --path.sysfs="pkg/collector/fixtures/sys" \
         --path.cgroupfs="pkg/collector/fixtures/sys/fs/cgroup" \
         --collector.slurm.create.unique.jobids \
         --collector.slurm.job.props.path="pkg/collector/fixtures/slurmjobprops" \
        --collector.ipmi.dcmi.cmd="pkg/collector/fixtures/ipmi-dcmi-wrapper.sh" \
         --collector.nvidia_gpu \
+        --collector.nvidia.smi.path="pkg/collector/fixtures/nvidia-smi" \
         --collector.nvidia.gpu.job.map.path="pkg/collector/fixtures/gpujobmap" \
         --web.listen-address "127.0.0.1:${port}" \
         --log.level="debug" > "${logfile}" 2>&1 &

From 9af6cbb623a422e6ba098b2e1b82a16fd072d386 Mon Sep 17 00:00:00 2001
From: Mahendra Paipuri
Date: Thu, 28 Dec 2023 12:24:16 +0100
Subject: [PATCH 2/2] style: Fix formatting

Signed-off-by: Mahendra Paipuri
---
 pkg/collector/nvidia_gpus.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/collector/nvidia_gpus.go b/pkg/collector/nvidia_gpus.go
index 76ee6a30..a4a49f06 100644
--- a/pkg/collector/nvidia_gpus.go
+++ b/pkg/collector/nvidia_gpus.go
@@ -21,7 +21,7 @@ import (
 const nvidiaGpuJobMapCollectorSubsystem = "nvidia_gpu"
 
 var (
-	jobMapLock = sync.RWMutex{}
+	jobMapLock    = sync.RWMutex{}
 	nvidiaSmiPath = BatchJobExporterApp.Flag(
 		"collector.nvidia.smi.path",
 		"Absolute path to nvidia-smi executable.",
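For reference, the check-then-execute flow that the first patch adds to getAllDevices can be sketched in isolation. The snippet below is only an illustration, not code from the patch: it shells out with the standard os/exec package instead of the repository's helpers.Execute wrapper, and queryGPUs is a hypothetical name.

package main

import (
	"fmt"
	"os"
	"os/exec"
	"strings"
)

// queryGPUs mirrors the flow the patch adds to getAllDevices: verify that the
// configured nvidia-smi binary exists, run the CSV query, and skip the header
// and any empty lines while parsing the output.
func queryGPUs(nvidiaSmiPath string) ([][]string, error) {
	// Check if the nvidia-smi binary exists before trying to execute it.
	if _, err := os.Stat(nvidiaSmiPath); err != nil {
		return nil, fmt.Errorf("nvidia-smi not found at %s: %w", nvidiaSmiPath, err)
	}

	// Execute nvidia-smi to list available GPUs in CSV form.
	out, err := exec.Command(nvidiaSmiPath, "--query-gpu=index,name,uuid", "--format=csv").Output()
	if err != nil {
		return nil, fmt.Errorf("nvidia-smi query failed: %w", err)
	}

	var devices [][]string
	for _, line := range strings.Split(string(out), "\n") {
		// Header line and empty lines are ignored.
		if line == "" || strings.HasPrefix(line, "index") {
			continue
		}
		fields := strings.Split(line, ",")
		for i := range fields {
			fields[i] = strings.TrimSpace(fields[i])
		}
		devices = append(devices, fields)
	}
	return devices, nil
}

func main() {
	// The path would normally come from the new --collector.nvidia.smi.path flag.
	devices, err := queryGPUs("/usr/bin/nvidia-smi")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	for _, dev := range devices {
		fmt.Println(dev)
	}
}

Making the nvidia-smi location a flag, rather than relying on a PATH lookup, is also what lets the e2e test point the collector at the pkg/collector/fixtures/nvidia-smi stub, as shown in the updated scripts/e2e-test.sh above.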