Skip to content

Commit

Permalink
Merge pull request #9 from mahendrapaipuri/env_vars_from_procfs
Browse files Browse the repository at this point in the history
Misc features
  • Loading branch information
mahendrapaipuri authored Dec 14, 2023
2 parents 49286e5 + db8aaf4 commit 759cbad
Show file tree
Hide file tree
Showing 17 changed files with 177 additions and 89 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/golangci-lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
- name: Install Go
uses: actions/setup-go@v3
with:
go-version: 1.20.x
go-version: 1.21.x

- name: Lint
uses: golangci/golangci-lint-action@v3
Expand Down
16 changes: 15 additions & 1 deletion internal/helpers/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"os/exec"
"strings"
"syscall"

"github.com/go-kit/log"
"github.com/go-kit/log/level"
Expand All @@ -24,7 +25,20 @@ func Execute(cmd string, args []string, logger log.Logger) ([]byte, error) {
level.Debug(logger).Log("msg", "Executing", "command", cmd, "args", fmt.Sprintf("%+v", args))
out, err := exec.Command(cmd, args...).CombinedOutput()
if err != nil {
err = fmt.Errorf("error running %s: %s", cmd, err)
level.Error(logger).Log("msg", "Error executing command", "command", cmd, "args", fmt.Sprintf("%+v", args), "err", err)
}
return out, err
}

// ExecuteAs runs cmd with args under the given UID and GID and returns the
// combined stdout/stderr of the command.
//
// uid and gid must each be in the range [0, 4294967295] because the kernel
// credential fields are unsigned 32-bit integers. Values outside that range
// are rejected with an error instead of being silently wrapped by the
// integer conversion (e.g. -1 would otherwise become 4294967295).
//
// The returned error is the one reported by the command execution (or the
// validation error described above); it is also logged at error level.
//
// NOTE: switching credentials generally requires the calling process to run
// as root or to hold the appropriate capabilities (e.g. cap_setuid).
func ExecuteAs(cmd string, args []string, uid int, gid int, logger log.Logger) ([]byte, error) {
	// Largest ID representable in syscall.Credential's uint32 fields.
	const maxID int64 = 1<<32 - 1

	level.Debug(logger).Log("msg", "Executing as user", "command", cmd, "args", fmt.Sprintf("%+v", args), "uid", uid, "gid", gid)

	// Refuse IDs that would change value when converted to uint32: running the
	// command under an unintended identity is worse than not running it.
	if uid < 0 || int64(uid) > maxID || gid < 0 || int64(gid) > maxID {
		err := fmt.Errorf("invalid uid/gid %d/%d: must be in range [0, %d]", uid, gid, maxID)
		level.Error(logger).Log("msg", "Error executing command as user", "command", cmd, "args", fmt.Sprintf("%+v", args), "uid", uid, "gid", gid, "err", err)
		return nil, err
	}

	execCmd := exec.Command(cmd, args...)
	execCmd.SysProcAttr = &syscall.SysProcAttr{
		Credential: &syscall.Credential{Uid: uint32(uid), Gid: uint32(gid)},
	}

	out, err := execCmd.CombinedOutput()
	if err != nil {
		level.Error(logger).Log("msg", "Error executing command as user", "command", cmd, "args", fmt.Sprintf("%+v", args), "uid", uid, "gid", gid, "err", err)
	}
	return out, err
}
24 changes: 12 additions & 12 deletions pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt
Original file line number Diff line number Diff line change
@@ -1,44 +1,44 @@
# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds
# TYPE batchjob_cpu_system_seconds gauge
batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0.45
batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.45
# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds
# TYPE batchjob_cpu_total_seconds gauge
batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 1.012410966
batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.012410966
# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds
# TYPE batchjob_cpu_user_seconds gauge
batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0.39
batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.39
# HELP batchjob_cpus Number of CPUs
# TYPE batchjob_cpus gauge
batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0
batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build.
# TYPE batchjob_exporter_build_info gauge
# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_watts_total counter
batchjob_ipmi_dcmi_watts_total 332
# HELP batchjob_memory_cache_bytes Memory cache used in bytes
# TYPE batchjob_memory_cache_bytes gauge
batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 2.1086208e+07
batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.1086208e+07
# HELP batchjob_memory_fail_count Memory fail count
# TYPE batchjob_memory_fail_count gauge
batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0
batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
# HELP batchjob_memory_rss_bytes Memory RSS used in bytes
# TYPE batchjob_memory_rss_bytes gauge
batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 1.0407936e+07
batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.0407936e+07
# HELP batchjob_memory_total_bytes Memory total in bytes
# TYPE batchjob_memory_total_bytes gauge
batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 2.01362030592e+11
batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.01362030592e+11
# HELP batchjob_memory_used_bytes Memory used in bytes
# TYPE batchjob_memory_used_bytes gauge
batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 4.0194048e+07
batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.0194048e+07
# HELP batchjob_memsw_fail_count Swap fail count
# TYPE batchjob_memsw_fail_count gauge
batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0
batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
# HELP batchjob_memsw_total_bytes Swap total in bytes
# TYPE batchjob_memsw_total_bytes gauge
batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 9.223372036854772e+18
batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 9.223372036854772e+18
# HELP batchjob_memsw_used_bytes Swap used in bytes
# TYPE batchjob_memsw_used_bytes gauge
batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 4.032512e+07
batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.032512e+07
# HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU
# TYPE batchjob_nvidia_gpu_jobid gauge
batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000
Expand Down
24 changes: 12 additions & 12 deletions pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt
Original file line number Diff line number Diff line change
@@ -1,44 +1,44 @@
# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds
# TYPE batchjob_cpu_system_seconds gauge
batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 115.777502
batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502
# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds
# TYPE batchjob_cpu_total_seconds gauge
batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 60491.070351
batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351
# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds
# TYPE batchjob_cpu_user_seconds gauge
batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 60375.292848
batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848
# HELP batchjob_cpus Number of CPUs
# TYPE batchjob_cpus gauge
batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 2
batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2
# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build.
# TYPE batchjob_exporter_build_info gauge
# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_watts_total counter
batchjob_ipmi_dcmi_watts_total 332
# HELP batchjob_memory_cache_bytes Memory cache used in bytes
# TYPE batchjob_memory_cache_bytes gauge
batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0
batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
# HELP batchjob_memory_fail_count Memory fail count
# TYPE batchjob_memory_fail_count gauge
batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0
batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
# HELP batchjob_memory_rss_bytes Memory RSS used in bytes
# TYPE batchjob_memory_rss_bytes gauge
batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 4.098592768e+09
batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09
# HELP batchjob_memory_total_bytes Memory total in bytes
# TYPE batchjob_memory_total_bytes gauge
batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 4.294967296e+09
batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09
# HELP batchjob_memory_used_bytes Memory used in bytes
# TYPE batchjob_memory_used_bytes gauge
batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 4.111491072e+09
batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09
# HELP batchjob_memsw_fail_count Swap fail count
# TYPE batchjob_memsw_fail_count gauge
batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0
batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
# HELP batchjob_memsw_total_bytes Swap total in bytes
# TYPE batchjob_memsw_total_bytes gauge
batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0
batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
# HELP batchjob_memsw_used_bytes Swap used in bytes
# TYPE batchjob_memsw_used_bytes gauge
batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="7f6c39b7-2740-fc1f-32c2-8fc28880829c",step="",task=""} 0
batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
# HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU
# TYPE batchjob_nvidia_gpu_jobid gauge
batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000
Expand Down
File renamed without changes.
File renamed without changes.
6 changes: 3 additions & 3 deletions pkg/collector/fixtures/nvidia-smi
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash

echo """name, uuid
Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e
Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"""
echo """index, name, uuid
0, Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e
1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"""
26 changes: 21 additions & 5 deletions pkg/collector/ipmi.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,14 @@ type impiCollector struct {
}

var (
ipmiDcmiWrapperExec = kingpin.Flag(
"collector.ipmi.dcmi.wrapper.path",
"Path to IPMI DCMI executable wrapper.",
ipmiDcmiExec = kingpin.Flag(
"collector.ipmi.dcmi.exec.path",
"Path to IPMI DCMI executable.",
).Default("ipmi-dcmi-wrapper").String()
ipmiDcmiExecAsRoot = kingpin.Flag(
"collector.ipmi.dcmi.exec.run.as.root",
"Execute IPMI DCMI command as root. This requires batchjob_exporter to run as root or to have appropriate capabilities (cap_setuid).",
).Default("false").Bool()
ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(
`^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`,
)
Expand Down Expand Up @@ -78,16 +82,27 @@ func getValue(ipmiOutput []byte, regex *regexp.Regexp) (string, error) {

// Update implements Collector and exposes IPMI DCMI power related metrics.
func (c *impiCollector) Update(ch chan<- prometheus.Metric) error {
args := []string{""}
stdOut, err := helpers.Execute(*ipmiDcmiWrapperExec, args, c.logger)
args := []string{"--get-system-power-statistics"}
var stdOut []byte
var err error

// Execute ipmi-dcmi command
if *ipmiDcmiExecAsRoot {
stdOut, err = helpers.ExecuteAs(*ipmiDcmiExec, args, 0, 0, c.logger)
} else {
stdOut, err = helpers.Execute(*ipmiDcmiExec, args, c.logger)
}
if err != nil {
return err
}

// Parse power consumption from output
currentPowerConsumption, err := c.getCurrentPowerConsumption(stdOut)
if err != nil {
level.Error(c.logger).Log("msg", "Failed to collect IPMI DCMI data", "error", err)
return err
}

// A negative return value means power measurement is not available
if currentPowerConsumption > -1 {
ch <- prometheus.MustNewConstMetric(c.wattsMetricDesc, prometheus.CounterValue, float64(currentPowerConsumption))
Expand All @@ -102,6 +117,7 @@ func (c *impiCollector) getCurrentPowerConsumption(ipmiOutput []byte) (float64,
if err != nil {
return -1, err
}

// When Power Measurement in 'Active' state - we can get watts
if value == "Active" {
value, err := getValue(ipmiOutput, ipmiDCMICurrentPowerRegex)
Expand Down
33 changes: 24 additions & 9 deletions pkg/collector/nvidia_gpus.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ var (
)

type Device struct {
index string
name string
uuid string
isMig bool
Expand Down Expand Up @@ -57,34 +58,43 @@ func init() {
//
// NOTE: Hoping this command returns MIG devices too
func getAllDevices(logger log.Logger) ([]Device, error) {
args := []string{"--query-gpu=name,uuid", "--format=csv"}
args := []string{"--query-gpu=index,name,uuid", "--format=csv"}
nvidiaSmiOutput, err := helpers.Execute("nvidia-smi", args, logger)
if err != nil {
level.Error(logger).
Log("msg", "nvidia-smi command to get list of devices failed", "err", err)
return nil, err
}

// Get all devices
allDevices := []Device{}
for _, line := range strings.Split(string(nvidiaSmiOutput), "\n") {
// Header line
if strings.HasPrefix(line, "name") {
if strings.HasPrefix(line, "index") {
continue
}

devDetails := strings.Split(line, ",")
if len(devDetails) < 2 {
if len(devDetails) < 3 {
level.Error(logger).
Log("msg", "Cannot parse output from nvidia-smi command", "output", line)
continue
}
devName := strings.TrimSpace(devDetails[0])
devUuid := strings.TrimSpace(devDetails[1])

// Get device index, name and UUID
devIndx := strings.TrimSpace(devDetails[0])
devName := strings.TrimSpace(devDetails[1])
devUuid := strings.TrimSpace(devDetails[2])

// Check if device is in MiG mode
isMig := false
if strings.HasPrefix(devUuid, "MIG") {
isMig = true
}
level.Debug(logger).
Log("msg", "Found nVIDIA GPU", "name", devName, "UUID", devUuid, "isMig:", isMig)
allDevices = append(allDevices, Device{name: devName, uuid: devUuid, isMig: isMig})

allDevices = append(allDevices, Device{index: devIndx, name: devName, uuid: devUuid, isMig: isMig})
}
return allDevices, nil
}
Expand Down Expand Up @@ -120,13 +130,18 @@ func (c *nvidiaGpuJobMapCollector) getJobId() (map[string]float64, error) {
gpuJobMapper := make(map[string]float64)
for _, dev := range c.devices {
var jobId int64 = 0
var slurmInfo string = fmt.Sprintf("%s/%s", *gpuStatPath, dev.uuid)
var slurmInfo string = fmt.Sprintf("%s/%s", *gpuStatPath, dev.index)

// NOTE: Look for file name with UUID as it will be more appropriate with
// MIG instances.
// If /run/gpustat/0 file is not found, check for the file with UUID as name?
if _, err := os.Stat(slurmInfo); err == nil {
content, err := os.ReadFile(slurmInfo)
if err != nil {
level.Error(c.logger).
Log("msg", "Failed to get job ID for GPU", "name", dev.uuid, "err", err)
level.Error(c.logger).Log(
"msg", "Failed to get job ID for GPU",
"index", dev.index, "uuid", dev.uuid, "err", err,
)
gpuJobMapper[dev.uuid] = float64(0)
}
fmt.Sscanf(string(content), "%d", &jobId)
Expand Down
18 changes: 13 additions & 5 deletions pkg/collector/nvidia_gpus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,19 @@ import (
)

var (
devices = []Device{{name: "fakeGpu1",
uuid: "GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",
isMig: false}, {name: "fakeGpu2",
uuid: "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",
isMig: false}}
devices = []Device{
{
index: "0",
name: "fakeGpu1",
uuid: "GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",
isMig: false,
}, {
index: "1",
name: "fakeGpu2",
uuid: "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",
isMig: false,
},
}
)

func TestNvidiaJobGpuMap(t *testing.T) {
Expand Down
Loading

0 comments on commit 759cbad

Please sign in to comment.