diff --git a/pkg/collector/cli.go b/pkg/collector/cli.go index 79151a23..8917a083 100644 --- a/pkg/collector/cli.go +++ b/pkg/collector/cli.go @@ -36,6 +36,9 @@ var BatchJobExporterApp = *kingpin.New( "Prometheus Exporter to export batch job metrics.", ) +// emptyHostnameLabel blanks the hostname label (testing only); it starts as a non-nil false so collectors built before Main registers the flag never dereference nil. +var emptyHostnameLabel = new(bool) + // Create a new BatchJobExporter struct func NewBatchJobExporter() (*BatchJobExporter, error) { return &BatchJobExporter{ @@ -91,6 +94,12 @@ func (b *BatchJobExporter) Main() { toolkitFlags = kingpinflag.AddFlags(&b.App, ":9010") ) + // This is a hidden flag used only for e2e testing + emptyHostnameLabel = b.App.Flag( + "collector.empty.hostname.label", + "Use empty hostname in labels. Only for testing. (default is disabled)", + ).Hidden().Default("false").Bool() + promlogConfig := &promlog.Config{} flag.AddFlags(&b.App, promlogConfig) b.App.Version(version.Print(b.appName)) diff --git a/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt b/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt index 4b52ddde..ae630621 100644 --- a/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt +++ b/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt @@ -1,52 +1,52 @@ # HELP batchjob_cpu_system_seconds Cumulative CPU system seconds # TYPE batchjob_cpu_system_seconds gauge -batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.45 +batchjob_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.45 # HELP batchjob_cpu_total_seconds Cumulative CPU total seconds # TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.012410966 
+batchjob_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.012410966 # HELP batchjob_cpu_user_seconds Cumulative CPU user seconds # TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.39 +batchjob_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.39 # HELP batchjob_cpus Number of CPUs # TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +batchjob_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. 
# TYPE batchjob_exporter_build_info gauge # HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts # TYPE batchjob_ipmi_dcmi_watts_total counter -batchjob_ipmi_dcmi_watts_total 332 +batchjob_ipmi_dcmi_watts_total{hostname=""} 332 # HELP batchjob_memory_cache_bytes Memory cache used in bytes # TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.1086208e+07 +batchjob_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.1086208e+07 # HELP batchjob_memory_fail_count Memory fail count # TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +batchjob_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memory_rss_bytes Memory RSS used in bytes # TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.0407936e+07 +batchjob_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.0407936e+07 # HELP batchjob_memory_total_bytes Memory total in bytes # TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.01362030592e+11 +batchjob_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.01362030592e+11 # HELP batchjob_memory_used_bytes Memory used in bytes # TYPE 
batchjob_memory_used_bytes gauge -batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.0194048e+07 +batchjob_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.0194048e+07 # HELP batchjob_memsw_fail_count Swap fail count # TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +batchjob_memsw_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memsw_total_bytes Swap total in bytes # TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 9.223372036854772e+18 +batchjob_memsw_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 9.223372036854772e+18 # HELP batchjob_memsw_used_bytes Swap used in bytes # TYPE batchjob_memsw_used_bytes gauge -batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.032512e+07 +batchjob_memsw_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.032512e+07 # HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU # TYPE batchjob_nvidia_gpu_jobid gauge -batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 -batchjob_nvidia_gpu_jobid{uuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"} 10000 
+batchjob_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hostname="",uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 +batchjob_nvidia_gpu_jobid{UUID="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hostname="",uuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"} 10000 # HELP batchjob_rapl_package_joules_total Current RAPL package value in joules # TYPE batchjob_rapl_package_joules_total counter -batchjob_rapl_package_joules_total{index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 -batchjob_rapl_package_joules_total{index="1",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826 +batchjob_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 +batchjob_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826 # HELP batchjob_scrape_collector_duration_seconds batchjob_exporter: Duration of a collector scrape. # TYPE batchjob_scrape_collector_duration_seconds gauge # HELP batchjob_scrape_collector_success batchjob_exporter: Whether a collector succeeded. 
diff --git a/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt b/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt index b5ccf1ad..a4113b05 100644 --- a/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt +++ b/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt @@ -1,52 +1,52 @@ # HELP batchjob_cpu_system_seconds Cumulative CPU system seconds # TYPE batchjob_cpu_system_seconds gauge -batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502 +batchjob_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502 # HELP batchjob_cpu_total_seconds Cumulative CPU total seconds # TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351 +batchjob_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351 # HELP batchjob_cpu_user_seconds Cumulative CPU user seconds # TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848 +batchjob_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848 # HELP batchjob_cpus Number of CPUs # TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 +batchjob_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, 
revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. # TYPE batchjob_exporter_build_info gauge # HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts # TYPE batchjob_ipmi_dcmi_watts_total counter -batchjob_ipmi_dcmi_watts_total 332 +batchjob_ipmi_dcmi_watts_total{hostname=""} 332 # HELP batchjob_memory_cache_bytes Memory cache used in bytes # TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +batchjob_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memory_fail_count Memory fail count # TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +batchjob_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memory_rss_bytes Memory RSS used in bytes # TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09 +batchjob_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09 # HELP batchjob_memory_total_bytes Memory total in bytes # TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09 +batchjob_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 
4.294967296e+09 # HELP batchjob_memory_used_bytes Memory used in bytes # TYPE batchjob_memory_used_bytes gauge -batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09 +batchjob_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09 # HELP batchjob_memsw_fail_count Swap fail count # TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +batchjob_memsw_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_memsw_total_bytes Swap total in bytes # TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} -1 +batchjob_memsw_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} -1 # HELP batchjob_memsw_used_bytes Swap used in bytes # TYPE batchjob_memsw_used_bytes gauge -batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +batchjob_memsw_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU # TYPE batchjob_nvidia_gpu_jobid gauge -batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 -batchjob_nvidia_gpu_jobid{uuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"} 10000 
+batchjob_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hostname="",uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 +batchjob_nvidia_gpu_jobid{UUID="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hostname="",uuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"} 10000 # HELP batchjob_rapl_package_joules_total Current RAPL package value in joules # TYPE batchjob_rapl_package_joules_total counter -batchjob_rapl_package_joules_total{index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 -batchjob_rapl_package_joules_total{index="1",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826 +batchjob_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 +batchjob_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826 # HELP batchjob_scrape_collector_duration_seconds batchjob_exporter: Duration of a collector scrape. # TYPE batchjob_scrape_collector_duration_seconds gauge # HELP batchjob_scrape_collector_success batchjob_exporter: Whether a collector succeeded. diff --git a/pkg/collector/ipmi.go b/pkg/collector/ipmi.go index e35d8dd7..6be7f57e 100644 --- a/pkg/collector/ipmi.go +++ b/pkg/collector/ipmi.go @@ -8,6 +8,7 @@ package collector import ( "fmt" + "os" "regexp" "strconv" "strings" @@ -22,6 +23,7 @@ const ipmiCollectorSubsystem = "ipmi_dcmi" type impiCollector struct { logger log.Logger + hostname string execMode string wattsMetricDesc *prometheus.Desc } @@ -45,14 +47,23 @@ func init() { // NewIPMICollector returns a new Collector exposing IMPI DCMI power metrics. 
func NewIPMICollector(logger log.Logger) (Collector, error) { + var execMode string + var hostname string + var err error + + // Get hostname + if !*emptyHostnameLabel { + hostname, err = os.Hostname() + if err != nil { + level.Error(logger).Log("msg", "Failed to get hostname", "err", err) + } + } wattsMetricDesc := prometheus.NewDesc( prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "watts_total"), - "Current Power consumption in watts", []string{}, nil, + "Current Power consumption in watts", []string{"hostname"}, nil, ) - var execMode string - // Split command cmdSlice := strings.Split(*ipmiDcmiCmd, " ") @@ -81,6 +92,7 @@ func NewIPMICollector(logger log.Logger) (Collector, error) { outside: collector := impiCollector{ logger: logger, + hostname: hostname, execMode: execMode, wattsMetricDesc: wattsMetricDesc, } @@ -134,7 +146,7 @@ func (c *impiCollector) Update(ch chan<- prometheus.Metric) error { // Returned value negative == Power Measurement is not avail if currentPowerConsumption > -1 { - ch <- prometheus.MustNewConstMetric(c.wattsMetricDesc, prometheus.CounterValue, float64(currentPowerConsumption)) + ch <- prometheus.MustNewConstMetric(c.wattsMetricDesc, prometheus.CounterValue, float64(currentPowerConsumption), c.hostname) } return nil } diff --git a/pkg/collector/nvidia_gpus.go b/pkg/collector/nvidia_gpus.go index a4a49f06..31786207 100644 --- a/pkg/collector/nvidia_gpus.go +++ b/pkg/collector/nvidia_gpus.go @@ -42,6 +42,7 @@ type Device struct { type nvidiaGpuJobMapCollector struct { devices []Device logger log.Logger + hostname string gpuJobMapDesc *prometheus.Desc } @@ -116,16 +117,28 @@ func getAllDevices(logger log.Logger) ([]Device, error) { // NewNvidiaGpuJobMapCollector returns a new Collector exposing batch jobs to nVIDIA GPU ordinals mapping. 
func NewNvidiaGpuJobMapCollector(logger log.Logger) (Collector, error) { + var hostname string + var err error + + // Get hostname + if !*emptyHostnameLabel { + hostname, err = os.Hostname() + if err != nil { + level.Error(logger).Log("msg", "Failed to get hostname", "err", err) + } + } + allDevices, _ := getAllDevices(logger) gpuJobMapDesc := prometheus.NewDesc( prometheus.BuildFQName(Namespace, nvidiaGpuJobMapCollectorSubsystem, "jobid"), "Batch Job ID of current nVIDIA GPU", - []string{"uuid"}, nil, + []string{"hostname", "uuid", "UUID"}, nil, ) collector := nvidiaGpuJobMapCollector{ devices: allDevices, logger: logger, + hostname: hostname, gpuJobMapDesc: gpuJobMapDesc, } return &collector, nil @@ -135,7 +148,7 @@ func NewNvidiaGpuJobMapCollector(logger log.Logger) (Collector, error) { func (c *nvidiaGpuJobMapCollector) Update(ch chan<- prometheus.Metric) error { gpuJobMapper, _ := c.getJobId() for _, dev := range c.devices { - ch <- prometheus.MustNewConstMetric(c.gpuJobMapDesc, prometheus.GaugeValue, gpuJobMapper[dev.uuid], dev.uuid) + ch <- prometheus.MustNewConstMetric(c.gpuJobMapDesc, prometheus.GaugeValue, gpuJobMapper[dev.uuid], c.hostname, dev.uuid, dev.uuid) } return nil } diff --git a/pkg/collector/rapl.go b/pkg/collector/rapl.go index e10c7c68..cfad7ae5 100644 --- a/pkg/collector/rapl.go +++ b/pkg/collector/rapl.go @@ -20,9 +20,9 @@ import ( const raplCollectorSubsystem = "rapl" type raplCollector struct { - fs sysfs.FS - logger log.Logger - + fs sysfs.FS + logger log.Logger + hostname string joulesMetricDesc *prometheus.Desc } @@ -39,8 +39,18 @@ var ( // NewRaplCollector returns a new Collector exposing RAPL metrics. 
func NewRaplCollector(logger log.Logger) (Collector, error) { - fs, err := sysfs.NewFS(*sysPath) + var hostname string + var err error + // Get hostname + if !*emptyHostnameLabel { + hostname, err = os.Hostname() + if err != nil { + level.Error(logger).Log("msg", "Failed to get hostname", "err", err) + } + } + + fs, err := sysfs.NewFS(*sysPath) if err != nil { return nil, err } @@ -48,12 +58,13 @@ func NewRaplCollector(logger log.Logger) (Collector, error) { joulesMetricDesc := prometheus.NewDesc( prometheus.BuildFQName(Namespace, raplCollectorSubsystem, "joules_total"), "Current RAPL value in joules", - []string{"index", "path", "rapl_zone"}, nil, + []string{"hostname", "index", "path", "rapl_zone"}, nil, ) collector := raplCollector{ fs: fs, logger: logger, + hostname: hostname, joulesMetricDesc: joulesMetricDesc, } return &collector, nil @@ -107,13 +118,13 @@ func (c *raplCollector) joulesMetric(z sysfs.RaplZone, v float64) prometheus.Met fmt.Sprintf("%s_joules_total", SanitizeMetricName(z.Name)), ), fmt.Sprintf("Current RAPL %s value in joules", z.Name), - []string{"index", "path"}, nil, + []string{"hostname", "index", "path"}, nil, ) - return prometheus.MustNewConstMetric( descriptor, prometheus.CounterValue, v, + c.hostname, index, z.Path, ) @@ -121,11 +132,11 @@ func (c *raplCollector) joulesMetric(z sysfs.RaplZone, v float64) prometheus.Met func (c *raplCollector) joulesMetricWithZoneLabel(z sysfs.RaplZone, v float64) prometheus.Metric { index := strconv.Itoa(z.Index) - return prometheus.MustNewConstMetric( c.joulesMetricDesc, prometheus.CounterValue, v, + c.hostname, index, z.Path, z.Name, diff --git a/pkg/collector/slurm.go b/pkg/collector/slurm.go index 841f65b8..8d1ea406 100644 --- a/pkg/collector/slurm.go +++ b/pkg/collector/slurm.go @@ -60,13 +60,14 @@ type CgroupMetric struct { memswTotal float64 memswFailCount float64 userslice bool + batch string + hostname string jobuid string jobaccount string jobid string jobuuid string step string task string - 
batch string err bool } @@ -74,6 +75,7 @@ type slurmCollector struct { cgroups string // v1 or v2 cgroupsRootPath string slurmCgroupsPath string + hostname string cpuUser *prometheus.Desc cpuSystem *prometheus.Desc cpuTotal *prometheus.Desc @@ -99,6 +101,8 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { var cgroupsVer string var cgroupsRootPath string var slurmCgroupsPath string + var hostname string + var err error if cgroups.Mode() == cgroups.Unified { cgroupsVer = "v2" @@ -112,7 +116,15 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { slurmCgroupsPath = fmt.Sprintf("%s/slurm", cgroupsRootPath) } - // Snippet for testing e2e tests for cgroups v1 + // Get hostname + if !*emptyHostnameLabel { + hostname, err = os.Hostname() + if err != nil { + level.Error(logger).Log("msg", "Failed to get hostname", "err", err) + } + } + + // // Snippet for testing e2e tests for cgroups v1 // cgroupsVer = "v1" // level.Info(logger).Log("msg", "Cgroup version v2 not detected, will proceed with v1.") // cgroupsRootPath = fmt.Sprintf("%s/cpuacct", *cgroupfsPath) @@ -129,82 +141,83 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { cgroups: cgroupsVer, cgroupsRootPath: cgroupsRootPath, slurmCgroupsPath: slurmCgroupsPath, + hostname: hostname, cpuUser: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "cpu", "user_seconds"), "Cumulative CPU user seconds", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), cpuSystem: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "cpu", "system_seconds"), "Cumulative CPU system seconds", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), cpuTotal: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "cpu", "total_seconds"), "Cumulative CPU total seconds", - []string{"batch", 
"jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), cpus: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "", "cpus"), "Number of CPUs", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memoryRSS: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "memory", "rss_bytes"), "Memory RSS used in bytes", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memoryCache: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "memory", "cache_bytes"), "Memory cache used in bytes", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memoryUsed: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "memory", "used_bytes"), "Memory used in bytes", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memoryTotal: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "memory", "total_bytes"), "Memory total in bytes", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memoryFailCount: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "memory", "fail_count"), "Memory fail count", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memswUsed: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "memsw", "used_bytes"), "Swap used in bytes", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", 
"hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memswTotal: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "memsw", "total_bytes"), "Swap total in bytes", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memswFailCount: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "memsw", "fail_count"), "Swap fail count", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), collectError: prometheus.NewDesc( prometheus.BuildFQName(Namespace, "exporter", "collect_error"), "Indicates collection error, 0=no error, 1=error", - []string{"batch", "jobid", "jobaccount", "jobuuid", "step", "task"}, + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), logger: logger, @@ -230,9 +243,9 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { if m.err { ch <- prometheus.MustNewConstMetric(c.collectError, prometheus.GaugeValue, 1, m.name) } - ch <- prometheus.MustNewConstMetric(c.cpuUser, prometheus.GaugeValue, m.cpuUser, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpuUser, prometheus.GaugeValue, m.cpuUser, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.batch, m.hostname, m.jobid, 
m.jobaccount, m.jobuuid, m.step, m.task) cpus := m.cpus if cpus == 0 { dir := filepath.Dir(n) @@ -241,15 +254,15 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { cpus = metrics[filepath.Dir(dir)].cpus } } - ch <- prometheus.MustNewConstMetric(c.cpus, prometheus.GaugeValue, float64(cpus), m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryCache, prometheus.GaugeValue, m.memoryCache, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswUsed, prometheus.GaugeValue, m.memswUsed, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswTotal, prometheus.GaugeValue, m.memswTotal, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.batch, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.cpus, prometheus.GaugeValue, float64(cpus), m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryCache, prometheus.GaugeValue, m.memoryCache, m.batch, m.hostname, 
m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswUsed, prometheus.GaugeValue, m.memswUsed, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswTotal, prometheus.GaugeValue, m.memswTotal, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) } return nil } @@ -526,7 +539,7 @@ func (c *slurmCollector) getInfoV1(name string, metric *CgroupMetric) { // Get metrics from cgroups v1 func (c *slurmCollector) getCgroupsV1Metrics(name string) (CgroupMetric, error) { - metric := CgroupMetric{name: name, batch: "slurm"} + metric := CgroupMetric{name: name, batch: "slurm", hostname: c.hostname} metric.err = false level.Debug(c.logger).Log("msg", "Loading cgroup v1", "path", name) ctrl, err := cgroup1.Load(cgroup1.StaticPath(name), cgroup1.WithHiearchy(subsystem)) @@ -611,7 +624,7 @@ func (c *slurmCollector) getInfoV2(name string, metric *CgroupMetric) { // Get Job metrics from cgroups v2 func (c *slurmCollector) getCgroupsV2Metrics(name string) (CgroupMetric, error) { - metric := CgroupMetric{name: name, batch: "slurm"} + metric := CgroupMetric{name: name, batch: "slurm", hostname: c.hostname} metric.err = false level.Debug(c.logger).Log("msg", "Loading cgroup v2", "path", name) // Files to parse out of 
the cgroup diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh index bfab4eae..8c94cddb 100755 --- a/scripts/e2e-test.sh +++ b/scripts/e2e-test.sh @@ -118,6 +118,7 @@ then --collector.nvidia_gpu \ --collector.nvidia.smi.path="pkg/collector/fixtures/nvidia-smi" \ --collector.nvidia.gpu.job.map.path="pkg/collector/fixtures/gpujobmap" \ + --collector.empty.hostname.label \ --web.listen-address "127.0.0.1:${port}" \ --log.level="debug" > "${logfile}" 2>&1 &