diff --git a/gpus.go b/gpus.go
index fdcd96a..6ba3959 100644
--- a/gpus.go
+++ b/gpus.go
@@ -1,311 +1,308 @@
-/* Copyright 2022 Joeri Hermans, Victor Penso, Matteo Dessalvi, Iztok Lebar Bajec
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see . */
-
-package main
-
-import (
- "os/exec"
- "regexp"
- "strconv"
- "strings"
-
- "github.com/prometheus/client_golang/prometheus"
- "github.com/prometheus/common/log"
-)
-
-type GPUsMetrics struct {
- alloc float64
- idle float64
- other float64
- total float64
- utilization float64
-}
- UserGPUsDCGM map[string]float64 `json:"user_gpus_dcgm"`
- UserGPUsSLURM map[string]float64 `json:"user_gpus_slurm"`
-
- alloc float64
- idle float64
- other float64
- total float64
- utilization float64
-}
-
-func GPUsGetMetrics() *GPUsMetrics {
- return ParseGPUsMetrics()
-}
-
-/* TODO:
- sinfo has gresUSED since slurm>=19.05.0rc01 https://github.com/SchedMD/slurm/blob/master/NEWS
- revert to old process on slurm<19.05.0rc01
- --format=AllocGRES will return gres/gpu=8
- --format=AllocTRES will return billing=16,cpu=16,gres/gpu=8,mem=256G,node=1
-func ParseAllocatedGPUs() float64 {
- var num_gpus = 0.0
-
- args := []string{"-a", "-X", "--format=Allocgres", "--state=RUNNING", "--noheader", "--parsable2"}
- output := string(Execute("sacct", args))
- if len(output) > 0 {
- for _, line := range strings.Split(output, "\n") {
- if len(line) > 0 {
- line = strings.Trim(line, "\"")
- descriptor := strings.TrimPrefix(line, "gpu:")
- job_gpus, _ := strconv.ParseFloat(descriptor, 64)
- num_gpus += job_gpus
- }
- }
- }
-
- return num_gpus
-}
-*/
-
-func ParseAllocatedGPUs(data []byte) float64 {
- var num_gpus = 0.0
- // sinfo -a -h --Format="Nodes: ,GresUsed:" --state=allocated
- // 3 gpu:2 # slurm>=20.11.8
- // 1 gpu:(null):3(IDX:0-7) # slurm 21.08.5
- // 13 gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3) # slurm 21.08.5
-
- sinfo_lines := string(data)
- re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
- if len(sinfo_lines) > 0 {
- for _, line := range strings.Split(sinfo_lines, "\n") {
- // log.info(line)
- if len(line) > 0 && strings.Contains(line, "gpu:") {
- nodes := strings.Fields(line)[0]
- num_nodes, _ := strconv.ParseFloat(nodes, 64)
- node_active_gpus := strings.Fields(line)[1]
- num_node_active_gpus := 0.0
- for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") {
- if strings.Contains(node_active_gpus_type, "gpu:") {
- node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2]
- num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64)
- num_node_active_gpus += num_node_active_gpus_type
- }
- }
- num_gpus += num_nodes * num_node_active_gpus
- }
- }
- }
-
- return num_gpus
-}
-
-func ParseIdleGPUs(data []byte) float64 {
- var num_gpus = 0.0
- // sinfo -a -h --Format="Nodes: ,Gres: ,GresUsed:" --state=idle,allocated
- // 3 gpu:4 gpu:2 # slurm 20.11.8
- // 1 gpu:8(S:0-1) gpu:(null):3(IDX:0-7) # slurm 21.08.5
- // 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3) # slurm 21.08.5
-
- sinfo_lines := string(data)
- re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
- if len(sinfo_lines) > 0 {
- for _, line := range strings.Split(sinfo_lines, "\n") {
- // log.info(line)
- if len(line) > 0 && strings.Contains(line, "gpu:") {
- nodes := strings.Fields(line)[0]
- num_nodes, _ := strconv.ParseFloat(nodes, 64)
- node_gpus := strings.Fields(line)[1]
- num_node_gpus := 0.0
- for _, node_gpus_type := range strings.Split(node_gpus, ",") {
- if strings.Contains(node_gpus_type, "gpu:") {
- node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2]
- num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64)
- num_node_gpus += num_node_gpus_type
- }
- }
- num_node_active_gpus := 0.0
- node_active_gpus := strings.Fields(line)[2]
- for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") {
- if strings.Contains(node_active_gpus_type, "gpu:") {
- node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2]
- num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64)
- num_node_active_gpus += num_node_active_gpus_type
- }
- }
- num_gpus += num_nodes * (num_node_gpus - num_node_active_gpus)
- }
- }
- }
-
- return num_gpus
-}
-
-func ParseTotalGPUs(data []byte) float64 {
- var num_gpus = 0.0
- // sinfo -a -h --Format="Nodes: ,Gres:"
- // 3 gpu:4 # slurm 20.11.8
- // 1 gpu:8(S:0-1) # slurm 21.08.5
- // 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) # slurm 21.08.5
-
- sinfo_lines := string(data)
- re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
- if len(sinfo_lines) > 0 {
- for _, line := range strings.Split(sinfo_lines, "\n") {
- // log.Info(line)
- if len(line) > 0 && strings.Contains(line, "gpu:") {
- nodes := strings.Fields(line)[0]
- num_nodes, _ := strconv.ParseFloat(nodes, 64)
- node_gpus := strings.Fields(line)[1]
- num_node_gpus := 0.0
- for _, node_gpus_type := range strings.Split(node_gpus, ",") {
- if strings.Contains(node_gpus_type, "gpu:") {
- node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2]
- num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64)
- num_node_gpus += num_node_gpus_type
- }
- }
- num_gpus += num_nodes * num_node_gpus
- }
- }
- }
-
- return num_gpus
-}
-
-
-func ParseGPUsMetrics() *GPUsMetrics {
- var gm GPUsMetrics
- total_gpus := ParseTotalGPUs(TotalGPUsData())
- allocated_gpus := ParseAllocatedGPUs(AllocatedGPUsData())
- idle_gpus := ParseIdleGPUs(IdleGPUsData())
- other_gpus := total_gpus - allocated_gpus - idle_gpus
- gm.alloc = allocated_gpus
- gm.idle = idle_gpus
- gm.other = other_gpus
- gm.total = total_gpus
- gm.utilization = allocated_gpus / total_gpus
- gm.UserGPUsDCGM = ParseUserGPUsDCGM()
- gm.UserGPUsSLURM = ParseUserGPUsSLURM()
- return &gm
-}
-
-func AllocatedGPUsData() []byte {
- args := []string{"-a", "-h", "--Format=Nodes: ,GresUsed:", "--state=allocated"}
- return Execute("sinfo", args)
-}
-
-func IdleGPUsData() []byte {
- args := []string{"-a", "-h", "--Format=Nodes: ,Gres: ,GresUsed:", "--state=idle,allocated"}
- return Execute("sinfo", args)
-}
-
-func TotalGPUsData() []byte {
- args := []string{"-a", "-h", "--Format=Nodes: ,Gres:"}
- return Execute("sinfo", args)
-}
-
-// Execute the sinfo command and return its output
-func Execute(command string, arguments []string) []byte {
- cmd := exec.Command(command, arguments...)
- out, err := cmd.CombinedOutput()
- if err != nil {
- log.Fatal(err)
- }
- return out
-}
-
-/*
- * Implement the Prometheus Collector interface and feed the
- * Slurm scheduler metrics into it.
- * https://godoc.org/github.com/prometheus/client_golang/prometheus#Collector
- */
-
-func NewGPUsCollector() *GPUsCollector {
- return &GPUsCollector{
- alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
- idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
- other: prometheus.NewDesc("slurm_gpus_other", "Other GPUs", nil, nil),
- total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
- utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil),
- }
-}
-
-type GPUsCollector struct {
- alloc *prometheus.Desc
- idle *prometheus.Desc
- other *prometheus.Desc
- total *prometheus.Desc
- utilization *prometheus.Desc
-}
-
-// Send all metric descriptions
-func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) {
- ch <- cc.alloc
- ch <- cc.idle
- ch <- cc.other
- ch <- cc.total
- ch <- cc.utilization
-}
-func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) {
- cm := GPUsGetMetrics()
- ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc)
- ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle)
- ch <- prometheus.MustNewConstMetric(cc.other, prometheus.GaugeValue, cm.other)
- ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total)
- ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm.utilization)
-}
-
-// ParseUserGPUsDCGM retrieves and parses GPU usage per user using DCGM and Linux tools
-func ParseUserGPUsDCGM() map[string]float64 {
- userGPUs := make(map[string]float64)
- // Implement data retrieval and parsing logic here for DCGM and Linux tools
- return userGPUs
-}
-
-// ParseUserGPUsSLURM retrieves and parses GPU usage per user using SLURM commands
-func ParseUserGPUsSLURM() map[string]float64 {
- userGPUs := make(map[string]float64)
- // Implement data retrieval and parsing logic here for SLURM commands
- return userGPUs
-}
-
-type GPUsCollector struct {
- alloc *prometheus.Desc
- idle *prometheus.Desc
- other *prometheus.Desc
- total *prometheus.Desc
- utilization *prometheus.Desc
- userGPUsDCGM *prometheus.Desc
- userGPUsSLURM *prometheus.Desc
-}
-
-func NewGPUsCollector() *GPUsCollector {
- return &GPUsCollector{
- alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
- idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
- other: prometheus.NewDesc("slurm_gpus_other", "Other GPUs", nil, nil),
- total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
- utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil),
- userGPUsDCGM: prometheus.NewDesc("slurm_user_gpus_dcgm", "Number of GPUs used per user over time, obtained using DCGM and Linux tools", []string{"user"}, nil),
- userGPUsSLURM: prometheus.NewDesc("slurm_user_gpus_slurm", "Number of GPUs used per user over time, obtained using SLURM commands", []string{"user"}, nil),
- }
-}
-
-func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) {
- cm := GPUsGetMetrics()
- ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc)
- ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle)
- ch <- prometheus.MustNewConstMetric(cc.other, prometheus.GaugeValue, cm.other)
- ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total)
- ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm.utilization)
- for user, gpus := range cm.UserGPUsDCGM {
- ch <- prometheus.MustNewConstMetric(cc.userGPUsDCGM, prometheus.GaugeValue, gpus, user)
- }
- for user, gpus := range cm.UserGPUsSLURM {
- ch <- prometheus.MustNewConstMetric(cc.userGPUsSLURM, prometheus.GaugeValue, gpus, user)
- }
+/* Copyright 2022 Iztok Lebar Bajec
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see . */
+
+package main
+
+import (
+ "os/exec"
+ "regexp"
+ "strconv"
+ "strings"
+
+ "github.com/prometheus/client_golang/prometheus"
+ "github.com/prometheus/common/log"
+)
+
+type GPUsMetrics struct {
+ alloc float64
+ idle float64
+ other float64
+ total float64
+ utilization float64
+ UserGPUsDCGM map[string]float64
+ UserGPUsSLURM map[string]float64
+}
+
+func GPUsGetMetrics() *GPUsMetrics {
+ return ParseGPUsMetrics()
+}
+
+/* TODO:
+ sinfo has gresUSED since slurm>=19.05.0rc01 https://github.com/SchedMD/slurm/blob/master/NEWS
+ revert to old process on slurm<19.05.0rc01
+ --format=AllocGRES will return gres/gpu=8
+ --format=AllocTRES will return billing=16,cpu=16,gres/gpu=8,mem=256G,node=1
+func ParseAllocatedGPUs() float64 {
+ var num_gpus = 0.0
+
+ args := []string{"-a", "-X", "--format=Allocgres", "--state=RUNNING", "--noheader", "--parsable2"}
+ output := string(Execute("sacct", args))
+ if len(output) > 0 {
+ for _, line := range strings.Split(output, "\n") {
+ if len(line) > 0 {
+ line = strings.Trim(line, "\"")
+ descriptor := strings.TrimPrefix(line, "gpu:")
+ job_gpus, _ := strconv.ParseFloat(descriptor, 64)
+ num_gpus += job_gpus
+ }
+ }
+ }
+
+ return num_gpus
+}
+*/
+
+func ParseAllocatedGPUs(data []byte) float64 {
+ var num_gpus = 0.0
+ // sinfo -a -h --Format="Nodes: ,GresUsed:" --state=allocated
+ // 3 gpu:2 # slurm>=20.11.8
+ // 1 gpu:(null):3(IDX:0-7) # slurm 21.08.5
+ // 13 gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3) # slurm 21.08.5 sinfo_lines := string(data)
+ re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
+ if len(data) > 0 {
+ for _, line := range strings.Split(string(data), "\n") {
+ if len(line) > 0 && strings.Contains(line, "gpu:") {
+ nodes := strings.Fields(line)[0]
+ num_nodes, _ := strconv.ParseFloat(nodes, 64)
+ node_active_gpus := strings.Fields(line)[1]
+ num_node_active_gpus := 0.0
+ for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") {
+ if strings.Contains(node_active_gpus_type, "gpu:") {
+ node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2]
+ num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64)
+ num_node_active_gpus += num_node_active_gpus_type
+ }
+ }
+ num_gpus += num_nodes * num_node_active_gpus
+ }
+ }
+ }
+ return num_gpus
+}
+
+func ParseIdleGPUs(data []byte) float64 {
+ var num_gpus = 0.0
+ // sinfo -a -h --Format="Nodes: ,Gres: ,GresUsed:" --state=idle,allocated
+ // 3 gpu:4 gpu:2 # slurm 20.11.8
+ // 1 gpu:8(S:0-1) gpu:(null):3(IDX:0-7) # slurm 21.08.5
+ // 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3) sinfo_lines := string(data)
+ re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
+ if len(data) > 0 {
+ for _, line := range strings.Split(string(data), "\n") {
+ if len(line) > 0 && strings.Contains(line, "gpu:") {
+ nodes := strings.Fields(line)[0]
+ num_nodes, _ := strconv.ParseFloat(nodes, 64)
+ node_gpus := strings.Fields(line)[1]
+ num_node_gpus := 0.0
+ for _, node_gpus_type := range strings.Split(node_gpus, ",") {
+ if strings.Contains(node_gpus_type, "gpu:") {
+ node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2]
+ num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64)
+ num_node_gpus += num_node_gpus_type
+ }
+ }
+ num_node_active_gpus := 0.0
+ node_active_gpus := strings.Fields(line)[2]
+ for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") {
+ if strings.Contains(node_active_gpus_type, "gpu:") {
+ node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2]
+ num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64)
+ num_node_active_gpus += num_node_active_gpus_type
+ }
+ }
+ num_gpus += num_nodes * (num_node_gpus - num_node_active_gpus)
+ }
+ }
+ }
+ return num_gpus
+}
+
+func ParseTotalGPUs(data []byte) float64 {
+ var num_gpus = 0.0
+ // sinfo -a -h --Format="Nodes: ,Gres:"
+ // 3 gpu:4 # slurm 20.11.8
+ // 1 gpu:8(S:0-1) # slurm 21.08.5
+ // 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) # slurm 21.08.5
+ re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
+ if len(data) > 0 {
+ for _, line := range strings.Split(string(data), "\n") {
+ if len(line) > 0 && strings.Contains(line, "gpu:") {
+ nodes := strings.Fields(line)[0]
+ num_nodes, _ := strconv.ParseFloat(nodes, 64)
+ node_gpus := strings.Fields(line)[1]
+ num_node_gpus := 0.0
+ for _, node_gpus_type := range strings.Split(node_gpus, ",") {
+ if strings.Contains(node_gpus_type, "gpu:") {
+ node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2]
+ num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64)
+ num_node_gpus += num_node_gpus_type
+ }
+ }
+ num_gpus += num_nodes * num_node_gpus
+ }
+ }
+ }
+ return num_gpus
+}
+
+func ParseGPUsMetrics() *GPUsMetrics {
+ var gm GPUsMetrics
+ total_gpus := ParseTotalGPUs(TotalGPUsData())
+ allocated_gpus := ParseAllocatedGPUs(AllocatedGPUsData())
+ idle_gpus := ParseIdleGPUs(IdleGPUsData())
+ other_gpus := total_gpus - allocated_gpus - idle_gpus
+ gm.alloc = allocated_gpus
+ gm.idle = idle_gpus
+ gm.other = other_gpus
+ gm.total = total_gpus
+ gm.utilization = allocated_gpus / total_gpus
+ gm.UserGPUsDCGM = ParseUserGPUsDCGM()
+ gm.UserGPUsSLURM = ParseUserGPUsSLURM()
+ return &gm
+}
+
+func AllocatedGPUsData() []byte {
+ args := []string{"-a", "-h", "--Format=Nodes: ,GresUsed:", "--state=allocated"}
+ return Execute("sinfo", args)
+}
+
+func IdleGPUsData() []byte {
+ args := []string{"-a", "-h", "--Format=Nodes: ,Gres: ,GresUsed:", "--state=idle,allocated"}
+ return Execute("sinfo", args)
+}
+
+func TotalGPUsData() []byte {
+ args := []string{"-a", "-h", "--Format=Nodes: ,Gres:"}
+ return Execute("sinfo", args)
+}
+
+func Execute(command string, arguments []string) []byte {
+ cmd := exec.Command(command, arguments...)
+ out, err := cmd.CombinedOutput()
+ if err != nil {
+ log.Fatal(err)
+ }
+ return out
+}
+
+/*
+ * Implement the Prometheus Collector interface and feed the
+ * Slurm scheduler metrics into it.
+ * https://godoc.org/github.com/prometheus/client_golang/prometheus#Collector
+ */
+
+ func ParseUserGPUsDCGM() map[string]float64 {
+ userGPUs := make(map[string]float64)
+
+ // Execute a command to get GPU usage information
+ cmd := exec.Command("dcgmi", "dmon", "-c", "1", "-e", "203")
+ out, err := cmd.CombinedOutput()
+ if err != nil {
+ log.Fatalf("Failed to execute dcgmi command: %v", err)
+ }
+
+ // Parse the command output
+ lines := strings.Split(string(out), "\n")
+ for _, line := range lines {
+ fields := strings.Fields(line)
+ if len(fields) >= 3 && fields[0] != "#" {
+ user := fields[1]
+ gpuUsage, err := strconv.ParseFloat(fields[2], 64)
+ if err != nil {
+ log.Errorf("Failed to parse GPU usage for user %s: %v", user, err)
+ continue
+ }
+ userGPUs[user] = gpuUsage
+ }
+ }
+
+ return userGPUs
+}
+
+
+func ParseUserGPUsSLURM() map[string]float64 {
+ userGPUs := make(map[string]float64)
+
+ // Execute a command to get GPU usage information
+ cmd := exec.Command("sacct", "-P", "--format=User,GresUsed", "-a")
+ out, err := cmd.CombinedOutput()
+ if err != nil {
+ log.Fatalf("Failed to execute sacct command: %v", err)
+ }
+
+ // Parse the command output
+ lines := strings.Split(string(out), "\n")
+ for _, line := range lines {
+ fields := strings.Split(line, "|")
+ if len(fields) >= 2 {
+ user := fields[0]
+ gpuUsage, err := strconv.ParseFloat(fields[1], 64)
+ if err != nil {
+ log.Errorf("Failed to parse GPU usage for user %s: %v", user, err)
+ continue
+ }
+ userGPUs[user] = gpuUsage
+ }
+ }
+
+ return userGPUs
+}
+
+type GPUsCollector struct {
+ alloc *prometheus.Desc
+ idle *prometheus.Desc
+ other *prometheus.Desc
+ total *prometheus.Desc
+ utilization *prometheus.Desc
+ userGPUsDCGM *prometheus.Desc
+ userGPUsSLURM *prometheus.Desc
+}
+
+func NewGPUsCollector() *GPUsCollector {
+ return &GPUsCollector{
+ alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
+ idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
+ other: prometheus.NewDesc("slurm_gpus_other", "Other GPUs", nil, nil),
+ total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
+ utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil),
+ userGPUsDCGM: prometheus.NewDesc("slurm_user_gpus_dcgm", "Number of GPUs used per user over time, obtained using DCGM and Linux tools", []string{"user"}, nil),
+ userGPUsSLURM: prometheus.NewDesc("slurm_user_gpus_slurm", "Number of GPUs used per user over time, obtained using SLURM commands", []string{"user"}, nil),
+ }
+}
+
+func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) {
+ ch <- cc.alloc
+ ch <- cc.idle
+ ch <- cc.other
+ ch <- cc.total
+ ch <- cc.utilization
+ ch <- cc.userGPUsDCGM
+ ch <- cc.userGPUsSLURM
+}
+
+func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) {
+ cm := GPUsGetMetrics()
+ ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc)
+ ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle)
+ ch <- prometheus.MustNewConstMetric(cc.other, prometheus.GaugeValue, cm.other)
+ ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total)
+ ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm.utilization)
+ for user, gpus := range cm.UserGPUsDCGM {
+ ch <- prometheus.MustNewConstMetric(cc.userGPUsDCGM, prometheus.GaugeValue, gpus, user)
+ }
+ for user, gpus := range cm.UserGPUsSLURM {
+ ch <- prometheus.MustNewConstMetric(cc.userGPUsSLURM, prometheus.GaugeValue, gpus, user)
+ }
}
\ No newline at end of file
diff --git a/gpus_test.go b/gpus_test.go
index 939bef5..12de113 100644
--- a/gpus_test.go
+++ b/gpus_test.go
@@ -65,3 +65,13 @@ func TestGPUsGetMetrics(t *testing.T) {
t.Logf("User GPUs DCGM: %v", metrics.UserGPUsDCGM)
t.Logf("User GPUs SLURM: %v", metrics.UserGPUsSLURM)
}
+
+func TestUserGPUsDCGM(t *testing.T) {
+ userGPUs := ParseUserGPUsDCGM()
+ t.Logf("User GPUs DCGM: %v", userGPUs)
+}
+
+func TestUserGPUsSLURM(t *testing.T) {
+ userGPUs := ParseUserGPUsSLURM()
+ t.Logf("User GPUs SLURM: %v", userGPUs)
+}
\ No newline at end of file