diff --git a/Makefile b/Makefile index ef52994a..87546719 100644 --- a/Makefile +++ b/Makefile @@ -113,12 +113,16 @@ ifeq ($(CGO_BUILD), 0) .PHONY: test-e2e test-e2e: build pkg/collector/fixtures/sys/.unpacked pkg/collector/fixtures/proc/.unpacked @echo ">> running end-to-end tests" - ./scripts/e2e-test.sh -p exporter + ./scripts/e2e-test.sh -s exporter-cgroups-v1 + ./scripts/e2e-test.sh -s exporter-cgroups-v2 + ./scripts/e2e-test.sh -s exporter-cgroups-v2-nogpu + ./scripts/e2e-test.sh -s exporter-cgroups-v2-procfs + ./scripts/e2e-test.sh -s exporter-cgroups-v2-all-metrics else .PHONY: test-e2e test-e2e: build pkg/collector/fixtures/sys/.unpacked pkg/collector/fixtures/proc/.unpacked @echo ">> running end-to-end tests" - ./scripts/e2e-test.sh -p stats + ./scripts/e2e-test.sh -s stats endif .PHONY: skip-test-e2e diff --git a/README.md b/README.md index 0ec5ebae..74a69139 100644 --- a/README.md +++ b/README.md @@ -127,8 +127,7 @@ CGO_BUILD=1 make tests Currently, the exporter supports only SLURM. `batchjob_exporter` provides following collectors: -- Slurm collector: Exports SLURM job metrics like CPU, memory and IO usage -- nVIDIA GPU collector: Exports GPU indices to job ID maps +- Slurm collector: Exports SLURM job metrics like CPU and memory usage, as well as GPU index to job ID maps - IPMI collector: Exports power usage reported by `ipmi` tools - RAPL collector: Exports RAPL energy metrics - Emissions collector: Exports emission factor (g eCO2/kWh) @@ -170,17 +169,15 @@ userland. If the admins would not want to have the burden of maintaining prolog epilog scripts, it is better to assign capabilities. These two approaches should be always favoured to running the exporter as `root`. -### nVIDIA GPU job map collector - This collector exports the GPU ordinal index to job ID map to Prometheus. The actual GPU metrics are exported using [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter). -Like in the case of SLURM collector, we need to know which GPU is allocated to which -job and this info is not available post job. Thus, similar approaches as SLURM collector -are available for this collector too. +To use `dcgm-exporter`, we need to know which GPU is allocated to which +job, and this info is not available once the job ends. Thus, approaches similar to those used to retrieve +SLURM job properties can be used here as well: - Use prolog and epilog scripts to get the GPU to job ID map. Example prolog script is provided in the [repo](./configs/slurm/prolog.d/gpujobmap.sh). Similarly, this approach -needs `--collector.nvidia.gpu.job.map.path=/run/gpujobmap` command line option. +needs `--collector.slurm.nvidia.gpu.job.map.path=/run/gpujobmap` command line option. - Using capabilities to read the environment variables directly from `/proc` file system. @@ -196,6 +193,8 @@ output of the command expects following lines: ``` Current Power : 332 Watts +Minimum Power over sampling duration : 68 watts +Maximum Power over sampling duration : 504 watts Power Measurement : Active ``` @@ -307,9 +306,8 @@ Using prolog and epilog scripts approach and `sudo` for `ipmi`, ``` /path/to/batchjob_exporter \ --collector.slurm.job.props.path="/run/slurmjobprops" \ + --collector.slurm.nvidia.gpu.job.map.path="/run/gpujobmap" \ --collector.ipmi.dcmi.cmd="sudo /usr/sbin/ipmi-dcmi --get-system-power-statistics" \ - --collector.nvidia_gpu \ - --collector.nvidia.gpu.job.map.path="/run/gpujobmap" \ --log.level="debug" ``` @@ -317,69 +315,64 @@ This will start exporter server on default 9010 port.
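For illustration, here is a minimal Go sketch (an assumption for clarity, not the collector's actual implementation) of how a GPU ordinal index to job ID map under `/run/gpujobmap` could be read, assuming the prolog script writes one file per GPU index containing the SLURM job ID, as the `gpujobmap` fixtures added in this change do:

```go
// Minimal sketch (not the collector's actual code): read a GPU index -> job ID
// map from a directory such as /run/gpujobmap, where a prolog script is assumed
// to have written one file per GPU ordinal index containing the SLURM job ID.
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
)

// readGPUJobMap returns a map of GPU ordinal index to SLURM job ID.
func readGPUJobMap(dir string) (map[int]uint64, error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil, err
	}
	jobMap := make(map[int]uint64)
	for _, e := range entries {
		idx, err := strconv.Atoi(e.Name())
		if err != nil {
			continue // skip files not named after a GPU index
		}
		data, err := os.ReadFile(filepath.Join(dir, e.Name()))
		if err != nil {
			return nil, err
		}
		jobID, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
		if err != nil {
			return nil, err
		}
		jobMap[idx] = jobID
	}
	return jobMap, nil
}

func main() {
	m, err := readGPUJobMap("/run/gpujobmap")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println(m)
}
```

With the fixture files added in this change (`pkg/collector/fixtures/gpujobmap/2` and `.../3`, each containing `1009248`), such a reader would map GPU indices 2 and 3 to job ID 1009248, matching the `batchjob_slurm_job_nvidia_gpu_jobid` samples shown below.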
Metrics can be consulted u `curl http://localhost:9010/metrics` command which will give an output as follows: ``` -# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds -# TYPE batchjob_cpu_system_seconds gauge -batchjob_cpu_system_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502 -# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds -# TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351 -# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds -# TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848 -# HELP batchjob_cpus Number of CPUs -# TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. # TYPE batchjob_exporter_build_info gauge -batchjob_exporter_build_info{branch="main",goarch="amd64",goos="linux",goversion="go1.21.3",revision="50a5db3888711a35341891a2bdd4925549ad6a14",tags="netgo osusergo static_build",version="Unreleased"} 1 -# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts -# TYPE batchjob_ipmi_dcmi_watts_total counter -batchjob_ipmi_dcmi_watts_total 332 -# HELP batchjob_memory_cache_bytes Memory cache used in bytes -# TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 -# HELP batchjob_memory_fail_count Memory fail count -# TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 -# HELP batchjob_memory_rss_bytes Memory RSS used in bytes -# TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09 -# HELP batchjob_memory_total_bytes Memory total in bytes -# TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09 -# HELP batchjob_memory_used_bytes Memory used in bytes -# TYPE batchjob_memory_used_bytes gauge -batchjob_memory_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09 -# HELP batchjob_memsw_fail_count Swap fail count -# TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 -# HELP batchjob_memsw_total_bytes Swap total in bytes -# TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 -# HELP batchjob_memsw_used_bytes Swap used in bytes -# TYPE batchjob_memsw_used_bytes gauge 
-batchjob_memsw_used_bytes{batch="slurm",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts +# TYPE batchjob_ipmi_dcmi_current_watts_total counter +batchjob_ipmi_dcmi_current_watts_total{hostname=""} 332 +# HELP batchjob_ipmi_dcmi_max_watts_total Maximum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_max_watts_total counter +batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504 +# HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_min_watts_total counter +batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68 # HELP batchjob_rapl_package_joules_total Current RAPL package value in joules # TYPE batchjob_rapl_package_joules_total counter -batchjob_rapl_package_joules_total{index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 -batchjob_rapl_package_joules_total{index="1",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826 +batchjob_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 +batchjob_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826 # HELP batchjob_scrape_collector_duration_seconds batchjob_exporter: Duration of a collector scrape. # TYPE batchjob_scrape_collector_duration_seconds gauge -batchjob_scrape_collector_duration_seconds{collector="ipmi_dcmi"} 0.003479042 -batchjob_scrape_collector_duration_seconds{collector="nvidia_gpu"} 1.66e-05 -batchjob_scrape_collector_duration_seconds{collector="rapl"} 0.001222098 -batchjob_scrape_collector_duration_seconds{collector="slurm_job"} 0.005055937 # HELP batchjob_scrape_collector_success batchjob_exporter: Whether a collector succeeded. 
# TYPE batchjob_scrape_collector_success gauge batchjob_scrape_collector_success{collector="ipmi_dcmi"} 1 -batchjob_scrape_collector_success{collector="nvidia_gpu"} 1 batchjob_scrape_collector_success{collector="rapl"} 1 batchjob_scrape_collector_success{collector="slurm_job"} 1 +# HELP batchjob_slurm_job_cpu_system_seconds Cumulative CPU system seconds +# TYPE batchjob_slurm_job_cpu_system_seconds gauge +batchjob_slurm_job_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502 +# HELP batchjob_slurm_job_cpu_total_seconds Cumulative CPU total seconds +# TYPE batchjob_slurm_job_cpu_total_seconds gauge +batchjob_slurm_job_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351 +# HELP batchjob_slurm_job_cpu_user_seconds Cumulative CPU user seconds +# TYPE batchjob_slurm_job_cpu_user_seconds gauge +batchjob_slurm_job_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848 +# HELP batchjob_slurm_job_cpus Number of CPUs +# TYPE batchjob_slurm_job_cpus gauge +batchjob_slurm_job_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 +# HELP batchjob_slurm_job_memory_cache_bytes Memory cache used in bytes +# TYPE batchjob_slurm_job_memory_cache_bytes gauge +batchjob_slurm_job_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_fail_count Memory fail count +# TYPE batchjob_slurm_job_memory_fail_count gauge +batchjob_slurm_job_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_rss_bytes Memory RSS used in bytes +# TYPE batchjob_slurm_job_memory_rss_bytes gauge +batchjob_slurm_job_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09 +# HELP batchjob_slurm_job_memory_total_bytes Memory total in bytes +# TYPE batchjob_slurm_job_memory_total_bytes gauge +batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09 +# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes +# TYPE batchjob_slurm_job_memory_used_bytes gauge +batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09 +# HELP batchjob_slurm_job_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU +# TYPE batchjob_slurm_job_nvidia_gpu_jobid gauge +batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1.009248e+06 +batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1.009248e+06 ``` If the `batchjob_exporter` process have necessary capabilities assigned either _via_ file capabilities or process capabilities, the flags 
`--collector.slurm.job.props.path` -and `--collector.nvidia.gpu.job.map.path` can be omitted and there is no need to +and `--collector.slurm.nvidia.gpu.job.map.path` can be omitted and there is no need to set up prolog and epilog scripts. ### `batchjob_stats_server` diff --git a/go.mod b/go.mod index 4d2cde0f..03529598 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.21 require ( github.com/alecthomas/kingpin/v2 v2.3.2 - github.com/containerd/cgroups/v3 v3.0.2 + github.com/containerd/cgroups/v3 v3.0.3 github.com/go-kit/log v0.2.1 github.com/google/uuid v1.4.0 github.com/gorilla/mux v1.8.1 @@ -14,13 +14,14 @@ require ( github.com/prometheus/exporter-toolkit v0.10.0 github.com/prometheus/procfs v0.12.0 github.com/zeebo/xxh3 v1.0.2 - golang.org/x/sys v0.13.0 + golang.org/x/sys v0.15.0 ) require ( github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cilium/ebpf v0.11.0 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect github.com/docker/go-units v0.4.0 // indirect github.com/go-logfmt/logfmt v0.5.1 // indirect @@ -28,14 +29,15 @@ require ( github.com/golang/protobuf v1.5.3 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/klauspost/cpuid/v2 v2.2.3 // indirect - github.com/kr/text v0.2.0 // indirect github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/opencontainers/runtime-spec v1.0.2 // indirect github.com/prometheus/client_model v0.5.0 // indirect + github.com/sirupsen/logrus v1.9.0 // indirect github.com/stretchr/testify v1.8.4 // indirect github.com/xhit/go-str2duration/v2 v2.1.0 // indirect golang.org/x/crypto v0.14.0 // indirect + golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2 // indirect golang.org/x/net v0.17.0 // indirect golang.org/x/oauth2 v0.12.0 // indirect golang.org/x/sync v0.3.0 // indirect diff --git a/go.sum b/go.sum index 6e6dc644..ddc8cb4a 100644 --- a/go.sum +++ b/go.sum @@ -6,16 +6,19 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/containerd/cgroups/v3 v3.0.2 h1:f5WFqIVSgo5IZmtTT3qVBo6TzI1ON6sycSBKkymb9L0= -github.com/containerd/cgroups/v3 v3.0.2/go.mod h1:JUgITrzdFqp42uI2ryGA+ge0ap/nxzYgkGmIcetmErE= +github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y= +github.com/cilium/ebpf v0.11.0/go.mod h1:WE7CZAnqOL2RouJ4f1uyNhqr2P4CCvXFIqdRDUgWsVs= +github.com/containerd/cgroups/v3 v3.0.3 h1:S5ByHZ/h9PMe5IOQoN7E+nMc2UcLEM/V48DGDJ9kip0= +github.com/containerd/cgroups/v3 v3.0.3/go.mod h1:8HBe7V3aWGLFPd/k03swSIsGjZhHI2WzJmticMgVuz0= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/docker/go-units 
v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= +github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU= github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0= github.com/go-logfmt/logfmt v0.5.1 h1:otpy5pqBCBZ1ng9RQ0dPu4PN7ba75Y/aA+UpowDyNVA= @@ -64,8 +67,11 @@ github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= +github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc= @@ -74,9 +80,13 @@ github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= +go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA= +go.uber.org/goleak v1.1.12/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc= golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= +golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2 h1:Jvc7gsqn21cJHCmAWx0LiimpP18LZmUxkT5Mp7EZ1mI= +golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= @@ -86,8 +96,9 @@ golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= -golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
+golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= @@ -106,5 +117,6 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EV gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/init/systemd/batchjob_exporter_no_privs.service b/init/systemd/batchjob_exporter_no_privs.service index f34dc5b3..63a85d99 100644 --- a/init/systemd/batchjob_exporter_no_privs.service +++ b/init/systemd/batchjob_exporter_no_privs.service @@ -8,9 +8,8 @@ User=batchjob-exp Group=batchjob-exp ExecStart=/usr/local/bin/batchjob_exporter \ --collector.slurm.job.props.path="/run/slurmjobprops" \ + --collector.slurm.nvidia.gpu.job.map.path="/run/gpujobmap" \ --collector.ipmi.dcmi.cmd="sudo /usr/sbin/ipmi-dcmi --get-system-power-statistics" \ - --collector.nvidia_gpu \ - --collector.nvidia.gpu.job.map.path="/run/gpujobmap" \ --log.level="debug" SyslogIdentifier=batchjob_exporter diff --git a/init/systemd/batchjob_exporter_with_caps.service b/init/systemd/batchjob_exporter_with_caps.service index 04c31b8d..08f78d42 100644 --- a/init/systemd/batchjob_exporter_with_caps.service +++ b/init/systemd/batchjob_exporter_with_caps.service @@ -8,7 +8,6 @@ User=batchjob-exp Group=batchjob-exp ExecStart=/usr/local/bin/batchjob_exporter \ --collector.ipmi.dcmi.cmd="sudo /usr/sbin/ipmi-dcmi --get-system-power-statistics" \ - --collector.nvidia_gpu \ --log.level="debug" SyslogIdentifier=batchjob_exporter diff --git a/pkg/collector/emissions.go b/pkg/collector/emissions.go index 91c10585..d39c7055 100644 --- a/pkg/collector/emissions.go +++ b/pkg/collector/emissions.go @@ -69,7 +69,8 @@ func NewEmissionsCollector(logger log.Logger) (Collector, error) { // Create metric description emissionsMetricDesc := prometheus.NewDesc( prometheus.BuildFQName(Namespace, emissionsCollectorSubsystem, "gCo2_kWh"), - "Current emission factor in CO2eq grams per kWh", []string{"provider", "provider_name", "country"}, nil, + "Current emission factor in CO2eq grams per kWh", + []string{"provider", "provider_name", "country"}, nil, ) // Create a new instance of EmissionCollector diff --git a/pkg/collector/fixtures/gpujobmap/2 b/pkg/collector/fixtures/gpujobmap/2 new file mode 100644 index 00000000..e2867791 --- /dev/null +++ b/pkg/collector/fixtures/gpujobmap/2 @@ -0,0 +1 @@ +1009248 diff --git a/pkg/collector/fixtures/gpujobmap/3 b/pkg/collector/fixtures/gpujobmap/3 new file mode 100644 index 00000000..e2867791 --- /dev/null +++ b/pkg/collector/fixtures/gpujobmap/3 @@ -0,0 +1 @@ +1009248 diff --git a/pkg/collector/fixtures/nvidia-smi b/pkg/collector/fixtures/nvidia-smi index 9e030652..2d8a77a8 100755 --- a/pkg/collector/fixtures/nvidia-smi +++ b/pkg/collector/fixtures/nvidia-smi @@ -2,4 +2,6 @@ echo """index, name, uuid 0, Tesla V100-SXM2-32GB, 
GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e -1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3""" +1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 +2, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3 +3, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3""" diff --git a/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt similarity index 64% rename from pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt rename to pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt index c4cbc166..41f1d985 100644 --- a/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt @@ -1,15 +1,3 @@ -# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds -# TYPE batchjob_cpu_system_seconds gauge -batchjob_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.45 -# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds -# TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.012410966 -# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds -# TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.39 -# HELP batchjob_cpus Number of CPUs -# TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. 
# TYPE batchjob_exporter_build_info gauge # HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts @@ -21,34 +9,6 @@ batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504 # HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts # TYPE batchjob_ipmi_dcmi_min_watts_total counter batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68 -# HELP batchjob_memory_cache_bytes Memory cache used in bytes -# TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.1086208e+07 -# HELP batchjob_memory_fail_count Memory fail count -# TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 -# HELP batchjob_memory_rss_bytes Memory RSS used in bytes -# TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.0407936e+07 -# HELP batchjob_memory_total_bytes Memory total in bytes -# TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.01362030592e+11 -# HELP batchjob_memory_used_bytes Memory used in bytes -# TYPE batchjob_memory_used_bytes gauge -batchjob_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.0194048e+07 -# HELP batchjob_memsw_fail_count Swap fail count -# TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 -# HELP batchjob_memsw_total_bytes Swap total in bytes -# TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 9.223372036854772e+18 -# HELP batchjob_memsw_used_bytes Swap used in bytes -# TYPE batchjob_memsw_used_bytes gauge -batchjob_memsw_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.032512e+07 -# HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU -# TYPE batchjob_nvidia_gpu_jobid gauge -batchjob_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hostname="",uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 -batchjob_nvidia_gpu_jobid{UUID="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hostname="",uuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"} 10000 # HELP batchjob_rapl_package_joules_total Current RAPL package value in joules # TYPE batchjob_rapl_package_joules_total counter batchjob_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 @@ -58,9 +18,39 @@ batchjob_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/fix # HELP batchjob_scrape_collector_success batchjob_exporter: Whether a collector succeeded. 
# TYPE batchjob_scrape_collector_success gauge batchjob_scrape_collector_success{collector="ipmi_dcmi"} 1 -batchjob_scrape_collector_success{collector="nvidia_gpu"} 1 batchjob_scrape_collector_success{collector="rapl"} 1 batchjob_scrape_collector_success{collector="slurm_job"} 1 +# HELP batchjob_slurm_job_cpu_system_seconds Cumulative CPU system seconds +# TYPE batchjob_slurm_job_cpu_system_seconds gauge +batchjob_slurm_job_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.45 +# HELP batchjob_slurm_job_cpu_total_seconds Cumulative CPU total seconds +# TYPE batchjob_slurm_job_cpu_total_seconds gauge +batchjob_slurm_job_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.012410966 +# HELP batchjob_slurm_job_cpu_user_seconds Cumulative CPU user seconds +# TYPE batchjob_slurm_job_cpu_user_seconds gauge +batchjob_slurm_job_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0.39 +# HELP batchjob_slurm_job_cpus Number of CPUs +# TYPE batchjob_slurm_job_cpus gauge +batchjob_slurm_job_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_cache_bytes Memory cache used in bytes +# TYPE batchjob_slurm_job_memory_cache_bytes gauge +batchjob_slurm_job_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.1086208e+07 +# HELP batchjob_slurm_job_memory_fail_count Memory fail count +# TYPE batchjob_slurm_job_memory_fail_count gauge +batchjob_slurm_job_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_rss_bytes Memory RSS used in bytes +# TYPE batchjob_slurm_job_memory_rss_bytes gauge +batchjob_slurm_job_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.0407936e+07 +# HELP batchjob_slurm_job_memory_total_bytes Memory total in bytes +# TYPE batchjob_slurm_job_memory_total_bytes gauge +batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.01362030592e+11 +# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes +# TYPE batchjob_slurm_job_memory_used_bytes gauge +batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.0194048e+07 +# HELP batchjob_slurm_job_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU +# TYPE batchjob_slurm_job_nvidia_gpu_jobid gauge +batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1.009248e+06 +batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1.009248e+06 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. 
# TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt new file mode 100644 index 00000000..347f6844 --- /dev/null +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt @@ -0,0 +1,148 @@ +# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. +# TYPE batchjob_exporter_build_info gauge +# HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts +# TYPE batchjob_ipmi_dcmi_current_watts_total counter +batchjob_ipmi_dcmi_current_watts_total{hostname=""} 332 +# HELP batchjob_ipmi_dcmi_max_watts_total Maximum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_max_watts_total counter +batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504 +# HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_min_watts_total counter +batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68 +# HELP batchjob_rapl_package_joules_total Current RAPL package value in joules +# TYPE batchjob_rapl_package_joules_total counter +batchjob_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 +batchjob_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826 +# HELP batchjob_scrape_collector_duration_seconds batchjob_exporter: Duration of a collector scrape. +# TYPE batchjob_scrape_collector_duration_seconds gauge +# HELP batchjob_scrape_collector_success batchjob_exporter: Whether a collector succeeded. 
+# TYPE batchjob_scrape_collector_success gauge +batchjob_scrape_collector_success{collector="ipmi_dcmi"} 1 +batchjob_scrape_collector_success{collector="rapl"} 1 +batchjob_scrape_collector_success{collector="slurm_job"} 1 +# HELP batchjob_slurm_job_cpu_psi_seconds Cumulative CPU PSI seconds +# TYPE batchjob_slurm_job_cpu_psi_seconds gauge +batchjob_slurm_job_cpu_psi_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_cpu_system_seconds Cumulative CPU system seconds +# TYPE batchjob_slurm_job_cpu_system_seconds gauge +batchjob_slurm_job_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502 +# HELP batchjob_slurm_job_cpu_total_seconds Cumulative CPU total seconds +# TYPE batchjob_slurm_job_cpu_total_seconds gauge +batchjob_slurm_job_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351 +# HELP batchjob_slurm_job_cpu_user_seconds Cumulative CPU user seconds +# TYPE batchjob_slurm_job_cpu_user_seconds gauge +batchjob_slurm_job_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848 +# HELP batchjob_slurm_job_cpus Number of CPUs +# TYPE batchjob_slurm_job_cpus gauge +batchjob_slurm_job_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 +# HELP batchjob_slurm_job_memory_cache_bytes Memory cache used in bytes +# TYPE batchjob_slurm_job_memory_cache_bytes gauge +batchjob_slurm_job_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_fail_count Memory fail count +# TYPE batchjob_slurm_job_memory_fail_count gauge +batchjob_slurm_job_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_psi_seconds Cumulative memory PSI seconds +# TYPE batchjob_slurm_job_memory_psi_seconds gauge +batchjob_slurm_job_memory_psi_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_rss_bytes Memory RSS used in bytes +# TYPE batchjob_slurm_job_memory_rss_bytes gauge +batchjob_slurm_job_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09 +# HELP batchjob_slurm_job_memory_total_bytes Memory total in bytes +# TYPE batchjob_slurm_job_memory_total_bytes gauge +batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09 +# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes +# TYPE batchjob_slurm_job_memory_used_bytes gauge +batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09 +# HELP batchjob_slurm_job_memsw_fail_count Swap fail count +# TYPE batchjob_slurm_job_memsw_fail_count gauge 
+batchjob_slurm_job_memsw_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memsw_total_bytes Swap total in bytes +# TYPE batchjob_slurm_job_memsw_total_bytes gauge +batchjob_slurm_job_memsw_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 1.8446744073709552e+19 +# HELP batchjob_slurm_job_memsw_used_bytes Swap used in bytes +# TYPE batchjob_slurm_job_memsw_used_bytes gauge +batchjob_slurm_job_memsw_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU +# TYPE batchjob_slurm_job_nvidia_gpu_jobid gauge +batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1.009248e+06 +batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1.009248e+06 +# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. +# TYPE go_gc_duration_seconds summary +# HELP go_goroutines Number of goroutines that currently exist. +# TYPE go_goroutines gauge +# HELP go_info Information about the Go environment. +# TYPE go_info gauge +# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use. +# TYPE go_memstats_alloc_bytes gauge +# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed. +# TYPE go_memstats_alloc_bytes_total counter +# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table. +# TYPE go_memstats_buck_hash_sys_bytes gauge +# HELP go_memstats_frees_total Total number of frees. +# TYPE go_memstats_frees_total counter +# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata. +# TYPE go_memstats_gc_sys_bytes gauge +# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use. +# TYPE go_memstats_heap_alloc_bytes gauge +# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used. +# TYPE go_memstats_heap_idle_bytes gauge +# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use. +# TYPE go_memstats_heap_inuse_bytes gauge +# HELP go_memstats_heap_objects Number of allocated objects. +# TYPE go_memstats_heap_objects gauge +# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS. +# TYPE go_memstats_heap_released_bytes gauge +# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system. +# TYPE go_memstats_heap_sys_bytes gauge +# HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection. +# TYPE go_memstats_last_gc_time_seconds gauge +# HELP go_memstats_lookups_total Total number of pointer lookups. +# TYPE go_memstats_lookups_total counter +# HELP go_memstats_mallocs_total Total number of mallocs. +# TYPE go_memstats_mallocs_total counter +# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures. +# TYPE go_memstats_mcache_inuse_bytes gauge +# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system. 
+# TYPE go_memstats_mcache_sys_bytes gauge +# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures. +# TYPE go_memstats_mspan_inuse_bytes gauge +# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system. +# TYPE go_memstats_mspan_sys_bytes gauge +# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place. +# TYPE go_memstats_next_gc_bytes gauge +# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations. +# TYPE go_memstats_other_sys_bytes gauge +# HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator. +# TYPE go_memstats_stack_inuse_bytes gauge +# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator. +# TYPE go_memstats_stack_sys_bytes gauge +# HELP go_memstats_sys_bytes Number of bytes obtained from system. +# TYPE go_memstats_sys_bytes gauge +# HELP go_threads Number of OS threads created. +# TYPE go_threads gauge +# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. +# TYPE process_cpu_seconds_total counter +# HELP process_max_fds Maximum number of open file descriptors. +# TYPE process_max_fds gauge +# HELP process_open_fds Number of open file descriptors. +# TYPE process_open_fds gauge +# HELP process_resident_memory_bytes Resident memory size in bytes. +# TYPE process_resident_memory_bytes gauge +# HELP process_start_time_seconds Start time of the process since unix epoch in seconds. +# TYPE process_start_time_seconds gauge +# HELP process_virtual_memory_bytes Virtual memory size in bytes. +# TYPE process_virtual_memory_bytes gauge +# HELP process_virtual_memory_max_bytes Maximum amount of virtual memory available in bytes. +# TYPE process_virtual_memory_max_bytes gauge +# HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler. +# TYPE promhttp_metric_handler_errors_total counter +promhttp_metric_handler_errors_total{cause="encoding"} 0 +promhttp_metric_handler_errors_total{cause="gathering"} 0 +# HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served. +# TYPE promhttp_metric_handler_requests_in_flight gauge +promhttp_metric_handler_requests_in_flight 1 +# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code. +# TYPE promhttp_metric_handler_requests_total counter +promhttp_metric_handler_requests_total{code="200"} 0 +promhttp_metric_handler_requests_total{code="500"} 0 +promhttp_metric_handler_requests_total{code="503"} 0 diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt new file mode 100644 index 00000000..04c3d478 --- /dev/null +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt @@ -0,0 +1,129 @@ +# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. 
+# TYPE batchjob_exporter_build_info gauge +# HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts +# TYPE batchjob_ipmi_dcmi_current_watts_total counter +batchjob_ipmi_dcmi_current_watts_total{hostname=""} 332 +# HELP batchjob_ipmi_dcmi_max_watts_total Maximum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_max_watts_total counter +batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504 +# HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_min_watts_total counter +batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68 +# HELP batchjob_rapl_package_joules_total Current RAPL package value in joules +# TYPE batchjob_rapl_package_joules_total counter +batchjob_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 +batchjob_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826 +# HELP batchjob_scrape_collector_duration_seconds batchjob_exporter: Duration of a collector scrape. +# TYPE batchjob_scrape_collector_duration_seconds gauge +# HELP batchjob_scrape_collector_success batchjob_exporter: Whether a collector succeeded. +# TYPE batchjob_scrape_collector_success gauge +batchjob_scrape_collector_success{collector="ipmi_dcmi"} 1 +batchjob_scrape_collector_success{collector="rapl"} 1 +batchjob_scrape_collector_success{collector="slurm_job"} 1 +# HELP batchjob_slurm_job_cpu_system_seconds Cumulative CPU system seconds +# TYPE batchjob_slurm_job_cpu_system_seconds gauge +batchjob_slurm_job_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502 +# HELP batchjob_slurm_job_cpu_total_seconds Cumulative CPU total seconds +# TYPE batchjob_slurm_job_cpu_total_seconds gauge +batchjob_slurm_job_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351 +# HELP batchjob_slurm_job_cpu_user_seconds Cumulative CPU user seconds +# TYPE batchjob_slurm_job_cpu_user_seconds gauge +batchjob_slurm_job_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848 +# HELP batchjob_slurm_job_cpus Number of CPUs +# TYPE batchjob_slurm_job_cpus gauge +batchjob_slurm_job_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 +# HELP batchjob_slurm_job_memory_cache_bytes Memory cache used in bytes +# TYPE batchjob_slurm_job_memory_cache_bytes gauge +batchjob_slurm_job_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_fail_count Memory fail count +# TYPE batchjob_slurm_job_memory_fail_count gauge +batchjob_slurm_job_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_rss_bytes Memory RSS used in bytes +# TYPE batchjob_slurm_job_memory_rss_bytes gauge +batchjob_slurm_job_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09 +# HELP batchjob_slurm_job_memory_total_bytes Memory total 
in bytes +# TYPE batchjob_slurm_job_memory_total_bytes gauge +batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09 +# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes +# TYPE batchjob_slurm_job_memory_used_bytes gauge +batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09 +# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. +# TYPE go_gc_duration_seconds summary +# HELP go_goroutines Number of goroutines that currently exist. +# TYPE go_goroutines gauge +# HELP go_info Information about the Go environment. +# TYPE go_info gauge +# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use. +# TYPE go_memstats_alloc_bytes gauge +# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed. +# TYPE go_memstats_alloc_bytes_total counter +# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table. +# TYPE go_memstats_buck_hash_sys_bytes gauge +# HELP go_memstats_frees_total Total number of frees. +# TYPE go_memstats_frees_total counter +# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata. +# TYPE go_memstats_gc_sys_bytes gauge +# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use. +# TYPE go_memstats_heap_alloc_bytes gauge +# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used. +# TYPE go_memstats_heap_idle_bytes gauge +# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use. +# TYPE go_memstats_heap_inuse_bytes gauge +# HELP go_memstats_heap_objects Number of allocated objects. +# TYPE go_memstats_heap_objects gauge +# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS. +# TYPE go_memstats_heap_released_bytes gauge +# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system. +# TYPE go_memstats_heap_sys_bytes gauge +# HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection. +# TYPE go_memstats_last_gc_time_seconds gauge +# HELP go_memstats_lookups_total Total number of pointer lookups. +# TYPE go_memstats_lookups_total counter +# HELP go_memstats_mallocs_total Total number of mallocs. +# TYPE go_memstats_mallocs_total counter +# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures. +# TYPE go_memstats_mcache_inuse_bytes gauge +# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system. +# TYPE go_memstats_mcache_sys_bytes gauge +# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures. +# TYPE go_memstats_mspan_inuse_bytes gauge +# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system. +# TYPE go_memstats_mspan_sys_bytes gauge +# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place. +# TYPE go_memstats_next_gc_bytes gauge +# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations. +# TYPE go_memstats_other_sys_bytes gauge +# HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator. 
+# TYPE go_memstats_stack_inuse_bytes gauge +# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator. +# TYPE go_memstats_stack_sys_bytes gauge +# HELP go_memstats_sys_bytes Number of bytes obtained from system. +# TYPE go_memstats_sys_bytes gauge +# HELP go_threads Number of OS threads created. +# TYPE go_threads gauge +# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. +# TYPE process_cpu_seconds_total counter +# HELP process_max_fds Maximum number of open file descriptors. +# TYPE process_max_fds gauge +# HELP process_open_fds Number of open file descriptors. +# TYPE process_open_fds gauge +# HELP process_resident_memory_bytes Resident memory size in bytes. +# TYPE process_resident_memory_bytes gauge +# HELP process_start_time_seconds Start time of the process since unix epoch in seconds. +# TYPE process_start_time_seconds gauge +# HELP process_virtual_memory_bytes Virtual memory size in bytes. +# TYPE process_virtual_memory_bytes gauge +# HELP process_virtual_memory_max_bytes Maximum amount of virtual memory available in bytes. +# TYPE process_virtual_memory_max_bytes gauge +# HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler. +# TYPE promhttp_metric_handler_errors_total counter +promhttp_metric_handler_errors_total{cause="encoding"} 0 +promhttp_metric_handler_errors_total{cause="gathering"} 0 +# HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served. +# TYPE promhttp_metric_handler_requests_in_flight gauge +promhttp_metric_handler_requests_in_flight 1 +# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code. +# TYPE promhttp_metric_handler_requests_total counter +promhttp_metric_handler_requests_total{code="200"} 0 +promhttp_metric_handler_requests_total{code="500"} 0 +promhttp_metric_handler_requests_total{code="503"} 0 diff --git a/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-output.txt similarity index 64% rename from pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt rename to pkg/collector/fixtures/output/e2e-test-cgroupsv2-output.txt index 89de42b5..224e9ee8 100644 --- a/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-output.txt @@ -1,15 +1,3 @@ -# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds -# TYPE batchjob_cpu_system_seconds gauge -batchjob_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502 -# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds -# TYPE batchjob_cpu_total_seconds gauge -batchjob_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351 -# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds -# TYPE batchjob_cpu_user_seconds gauge -batchjob_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848 -# HELP batchjob_cpus Number of CPUs -# TYPE batchjob_cpus gauge -batchjob_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, 
goversion from which batchjob_exporter was built, and the goos and goarch for the build. # TYPE batchjob_exporter_build_info gauge # HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts @@ -21,34 +9,6 @@ batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504 # HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts # TYPE batchjob_ipmi_dcmi_min_watts_total counter batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68 -# HELP batchjob_memory_cache_bytes Memory cache used in bytes -# TYPE batchjob_memory_cache_bytes gauge -batchjob_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 -# HELP batchjob_memory_fail_count Memory fail count -# TYPE batchjob_memory_fail_count gauge -batchjob_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 -# HELP batchjob_memory_rss_bytes Memory RSS used in bytes -# TYPE batchjob_memory_rss_bytes gauge -batchjob_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09 -# HELP batchjob_memory_total_bytes Memory total in bytes -# TYPE batchjob_memory_total_bytes gauge -batchjob_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09 -# HELP batchjob_memory_used_bytes Memory used in bytes -# TYPE batchjob_memory_used_bytes gauge -batchjob_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09 -# HELP batchjob_memsw_fail_count Swap fail count -# TYPE batchjob_memsw_fail_count gauge -batchjob_memsw_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 -# HELP batchjob_memsw_total_bytes Swap total in bytes -# TYPE batchjob_memsw_total_bytes gauge -batchjob_memsw_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} -1 -# HELP batchjob_memsw_used_bytes Swap used in bytes -# TYPE batchjob_memsw_used_bytes gauge -batchjob_memsw_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 -# HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU -# TYPE batchjob_nvidia_gpu_jobid gauge -batchjob_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hostname="",uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000 -batchjob_nvidia_gpu_jobid{UUID="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hostname="",uuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"} 10000 # HELP batchjob_rapl_package_joules_total Current RAPL package value in joules # TYPE batchjob_rapl_package_joules_total counter batchjob_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 @@ -58,9 +18,39 @@ batchjob_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/fix # HELP batchjob_scrape_collector_success batchjob_exporter: Whether a collector succeeded. 
# TYPE batchjob_scrape_collector_success gauge batchjob_scrape_collector_success{collector="ipmi_dcmi"} 1 -batchjob_scrape_collector_success{collector="nvidia_gpu"} 1 batchjob_scrape_collector_success{collector="rapl"} 1 batchjob_scrape_collector_success{collector="slurm_job"} 1 +# HELP batchjob_slurm_job_cpu_system_seconds Cumulative CPU system seconds +# TYPE batchjob_slurm_job_cpu_system_seconds gauge +batchjob_slurm_job_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502 +# HELP batchjob_slurm_job_cpu_total_seconds Cumulative CPU total seconds +# TYPE batchjob_slurm_job_cpu_total_seconds gauge +batchjob_slurm_job_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351 +# HELP batchjob_slurm_job_cpu_user_seconds Cumulative CPU user seconds +# TYPE batchjob_slurm_job_cpu_user_seconds gauge +batchjob_slurm_job_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848 +# HELP batchjob_slurm_job_cpus Number of CPUs +# TYPE batchjob_slurm_job_cpus gauge +batchjob_slurm_job_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 +# HELP batchjob_slurm_job_memory_cache_bytes Memory cache used in bytes +# TYPE batchjob_slurm_job_memory_cache_bytes gauge +batchjob_slurm_job_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_fail_count Memory fail count +# TYPE batchjob_slurm_job_memory_fail_count gauge +batchjob_slurm_job_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_rss_bytes Memory RSS used in bytes +# TYPE batchjob_slurm_job_memory_rss_bytes gauge +batchjob_slurm_job_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09 +# HELP batchjob_slurm_job_memory_total_bytes Memory total in bytes +# TYPE batchjob_slurm_job_memory_total_bytes gauge +batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09 +# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes +# TYPE batchjob_slurm_job_memory_used_bytes gauge +batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09 +# HELP batchjob_slurm_job_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU +# TYPE batchjob_slurm_job_nvidia_gpu_jobid gauge +batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1.009248e+06 +batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1.009248e+06 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. 
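The `batchjob_slurm_job_nvidia_gpu_jobid` samples above carry the owning SLURM job ID as the metric value (1.009248e+06 is the float rendering of job 1009248), keyed by the GPU `index` and `uuid` labels. As a hedged illustration of how a consumer might use this mapping to attribute `dcgm-exporter` GPU metrics to jobs, the sketch below builds a UUID-to-job-ID map through the Prometheus HTTP API; the Prometheus server address is an assumption and not part of this change.

```go
// Sketch only: build a GPU-UUID -> SLURM job ID map from the
// batchjob_slurm_job_nvidia_gpu_jobid metric shown in the fixture above.
// The Prometheus address below is an assumed example value.
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
)

func main() {
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		panic(err)
	}
	promAPI := v1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Instant query for the GPU-to-job mapping exported by batchjob_exporter.
	result, _, err := promAPI.Query(ctx, "batchjob_slurm_job_nvidia_gpu_jobid", time.Now())
	if err != nil {
		panic(err)
	}

	uuidToJob := map[string]int64{}
	if vec, ok := result.(model.Vector); ok {
		for _, sample := range vec {
			// The sample value is the SLURM job ID, e.g. 1.009248e+06 == 1009248.
			uuidToJob[string(sample.Metric["uuid"])] = int64(sample.Value)
		}
	}
	fmt.Println(uuidToJob)
}
```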
# TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt new file mode 100644 index 00000000..224e9ee8 --- /dev/null +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt @@ -0,0 +1,133 @@ +# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. +# TYPE batchjob_exporter_build_info gauge +# HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts +# TYPE batchjob_ipmi_dcmi_current_watts_total counter +batchjob_ipmi_dcmi_current_watts_total{hostname=""} 332 +# HELP batchjob_ipmi_dcmi_max_watts_total Maximum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_max_watts_total counter +batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504 +# HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_min_watts_total counter +batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68 +# HELP batchjob_rapl_package_joules_total Current RAPL package value in joules +# TYPE batchjob_rapl_package_joules_total counter +batchjob_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 +batchjob_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826 +# HELP batchjob_scrape_collector_duration_seconds batchjob_exporter: Duration of a collector scrape. +# TYPE batchjob_scrape_collector_duration_seconds gauge +# HELP batchjob_scrape_collector_success batchjob_exporter: Whether a collector succeeded. 
+# TYPE batchjob_scrape_collector_success gauge +batchjob_scrape_collector_success{collector="ipmi_dcmi"} 1 +batchjob_scrape_collector_success{collector="rapl"} 1 +batchjob_scrape_collector_success{collector="slurm_job"} 1 +# HELP batchjob_slurm_job_cpu_system_seconds Cumulative CPU system seconds +# TYPE batchjob_slurm_job_cpu_system_seconds gauge +batchjob_slurm_job_cpu_system_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 115.777502 +# HELP batchjob_slurm_job_cpu_total_seconds Cumulative CPU total seconds +# TYPE batchjob_slurm_job_cpu_total_seconds gauge +batchjob_slurm_job_cpu_total_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60491.070351 +# HELP batchjob_slurm_job_cpu_user_seconds Cumulative CPU user seconds +# TYPE batchjob_slurm_job_cpu_user_seconds gauge +batchjob_slurm_job_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 60375.292848 +# HELP batchjob_slurm_job_cpus Number of CPUs +# TYPE batchjob_slurm_job_cpus gauge +batchjob_slurm_job_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 +# HELP batchjob_slurm_job_memory_cache_bytes Memory cache used in bytes +# TYPE batchjob_slurm_job_memory_cache_bytes gauge +batchjob_slurm_job_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_fail_count Memory fail count +# TYPE batchjob_slurm_job_memory_fail_count gauge +batchjob_slurm_job_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 +# HELP batchjob_slurm_job_memory_rss_bytes Memory RSS used in bytes +# TYPE batchjob_slurm_job_memory_rss_bytes gauge +batchjob_slurm_job_memory_rss_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.098592768e+09 +# HELP batchjob_slurm_job_memory_total_bytes Memory total in bytes +# TYPE batchjob_slurm_job_memory_total_bytes gauge +batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.294967296e+09 +# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes +# TYPE batchjob_slurm_job_memory_used_bytes gauge +batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 4.111491072e+09 +# HELP batchjob_slurm_job_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU +# TYPE batchjob_slurm_job_nvidia_gpu_jobid gauge +batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",batch="slurm",hostname="",index="3",uuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3"} 1.009248e+06 +batchjob_slurm_job_nvidia_gpu_jobid{UUID="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",batch="slurm",hostname="",index="2",uuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3"} 1.009248e+06 +# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. +# TYPE go_gc_duration_seconds summary +# HELP go_goroutines Number of goroutines that currently exist. 
+# TYPE go_goroutines gauge +# HELP go_info Information about the Go environment. +# TYPE go_info gauge +# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use. +# TYPE go_memstats_alloc_bytes gauge +# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed. +# TYPE go_memstats_alloc_bytes_total counter +# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table. +# TYPE go_memstats_buck_hash_sys_bytes gauge +# HELP go_memstats_frees_total Total number of frees. +# TYPE go_memstats_frees_total counter +# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata. +# TYPE go_memstats_gc_sys_bytes gauge +# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use. +# TYPE go_memstats_heap_alloc_bytes gauge +# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used. +# TYPE go_memstats_heap_idle_bytes gauge +# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use. +# TYPE go_memstats_heap_inuse_bytes gauge +# HELP go_memstats_heap_objects Number of allocated objects. +# TYPE go_memstats_heap_objects gauge +# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS. +# TYPE go_memstats_heap_released_bytes gauge +# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system. +# TYPE go_memstats_heap_sys_bytes gauge +# HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection. +# TYPE go_memstats_last_gc_time_seconds gauge +# HELP go_memstats_lookups_total Total number of pointer lookups. +# TYPE go_memstats_lookups_total counter +# HELP go_memstats_mallocs_total Total number of mallocs. +# TYPE go_memstats_mallocs_total counter +# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures. +# TYPE go_memstats_mcache_inuse_bytes gauge +# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system. +# TYPE go_memstats_mcache_sys_bytes gauge +# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures. +# TYPE go_memstats_mspan_inuse_bytes gauge +# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system. +# TYPE go_memstats_mspan_sys_bytes gauge +# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place. +# TYPE go_memstats_next_gc_bytes gauge +# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations. +# TYPE go_memstats_other_sys_bytes gauge +# HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator. +# TYPE go_memstats_stack_inuse_bytes gauge +# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator. +# TYPE go_memstats_stack_sys_bytes gauge +# HELP go_memstats_sys_bytes Number of bytes obtained from system. +# TYPE go_memstats_sys_bytes gauge +# HELP go_threads Number of OS threads created. +# TYPE go_threads gauge +# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. +# TYPE process_cpu_seconds_total counter +# HELP process_max_fds Maximum number of open file descriptors. +# TYPE process_max_fds gauge +# HELP process_open_fds Number of open file descriptors. +# TYPE process_open_fds gauge +# HELP process_resident_memory_bytes Resident memory size in bytes. +# TYPE process_resident_memory_bytes gauge +# HELP process_start_time_seconds Start time of the process since unix epoch in seconds. 
+# TYPE process_start_time_seconds gauge +# HELP process_virtual_memory_bytes Virtual memory size in bytes. +# TYPE process_virtual_memory_bytes gauge +# HELP process_virtual_memory_max_bytes Maximum amount of virtual memory available in bytes. +# TYPE process_virtual_memory_max_bytes gauge +# HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler. +# TYPE promhttp_metric_handler_errors_total counter +promhttp_metric_handler_errors_total{cause="encoding"} 0 +promhttp_metric_handler_errors_total{cause="gathering"} 0 +# HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served. +# TYPE promhttp_metric_handler_requests_in_flight gauge +promhttp_metric_handler_requests_in_flight 1 +# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code. +# TYPE promhttp_metric_handler_requests_total counter +promhttp_metric_handler_requests_total{code="200"} 0 +promhttp_metric_handler_requests_total{code="500"} 0 +promhttp_metric_handler_requests_total{code="503"} 0 diff --git a/pkg/collector/fixtures/proc.ttar b/pkg/collector/fixtures/proc.ttar index f1f50240..5e9bbadd 100644 --- a/pkg/collector/fixtures/proc.ttar +++ b/pkg/collector/fixtures/proc.ttar @@ -856,52 +856,52 @@ voluntary_ctxt_switches: 4742839 nonvoluntary_ctxt_switches: 1727500 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26240 +Directory: proc/26241 Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/cmdline +Path: proc/26241/cmdline Lines: 1 vimNULLBYTEtest.goNULLBYTE+10NULLBYTEEOF Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/comm +Path: proc/26241/comm Lines: 1 vim Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/cwd +Path: proc/26241/cwd SymlinkTo: /usr/bin # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/environ +Path: proc/26241/environ Lines: 1 -PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTESLURM_JOB_UID=1000NULLBYTESLURM_JOB_ID=1009248NULLBYTESLURM_JOB_ACCOUNT=testaccNULLBYTESLURM_JOB_NODELIST=compute-[0-2]NULLBYTE +PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTESLURM_JOB_UID=1000NULLBYTESLURM_JOB_ID=10000NULLBYTESLURM_JOB_ACCOUNT=testaccNULLBYTESLURM_JOB_NODELIST=compute-[0-2]NULLBYTESLURM_STEP_GPUS=0NULLBYTE Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/exe +Path: proc/26241/exe SymlinkTo: /usr/bin/vim # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26240/fd +Directory: proc/26241/fd Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/fd/0 +Path: proc/26241/fd/0 SymlinkTo: ../../symlinktargets/abc # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/fd/1 +Path: proc/26241/fd/1 SymlinkTo: ../../symlinktargets/def # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: 
proc/26240/fd/10 +Path: proc/26241/fd/10 SymlinkTo: ../../symlinktargets/xyz # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/fd/2 +Path: proc/26241/fd/2 SymlinkTo: ../../symlinktargets/ghi # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/fd/3 +Path: proc/26241/fd/3 SymlinkTo: ../../symlinktargets/uvw # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26240/fdinfo +Directory: proc/26241/fdinfo Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/fdinfo/0 +Path: proc/26241/fdinfo/0 Lines: 6 pos: 0 flags: 02004000 @@ -911,7 +911,7 @@ inotify wd:2 ino:1300016 sdev:fd00002 mask:fce ignored_mask:0 fhandle-bytes:8 fh inotify wd:1 ino:2e0001 sdev:fd00000 mask:fce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:01002e00138e7c65 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/fdinfo/1 +Path: proc/26241/fdinfo/1 Lines: 4 pos: 0 flags: 02004002 @@ -919,28 +919,28 @@ mnt_id: 13 eventfd-count: 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/fdinfo/10 +Path: proc/26241/fdinfo/10 Lines: 3 pos: 0 flags: 02004002 mnt_id: 9 Mode: 400 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/fdinfo/2 +Path: proc/26241/fdinfo/2 Lines: 3 pos: 0 flags: 02004002 mnt_id: 9 Mode: 400 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/fdinfo/3 +Path: proc/26241/fdinfo/3 Lines: 3 pos: 0 flags: 02004002 mnt_id: 9 Mode: 400 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/interrupts +Path: proc/26241/interrupts Lines: 49 CPU0 CPU1 CPU2 CPU3 0: 49 0 0 0 IO-APIC 2-edge timer @@ -993,7 +993,7 @@ NPI: 0 0 0 0 Nested posted-interrupt event PIW: 0 0 0 0 Posted-interrupt wakeup event Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/io +Path: proc/26241/io Lines: 7 rchar: 750339 wchar: 818609 @@ -1004,7 +1004,7 @@ write_bytes: 2048 cancelled_write_bytes: -1024 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/limits +Path: proc/26241/limits Lines: 17 Limit Soft Limit Hard Limit Units Max cpu time unlimited unlimited seconds @@ -1025,7 +1025,7 @@ Max realtime priority 0 0 Max realtime timeout unlimited unlimited us Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/mountstats +Path: proc/26241/mountstats Lines: 20 device rootfs mounted on / with fstype rootfs device sysfs mounted on /sys with fstype sysfs @@ -1049,10 +1049,10 @@ device 192.168.1.1:/srv/test mounted on /mnt/nfs/test with fstype nfs4 statvers= Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26240/net +Directory: proc/26241/net Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/net/dev +Path: proc/26241/net/dev Lines: 4 Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed @@ -1060,7 +1060,7 @@ Inter-| Receive | Transmit eth0: 438 5 0 0 0 0 0 0 648 8 0 0 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
-Path: proc/26240/net/netstat +Path: proc/26241/net/netstat Lines: 4 TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPHPHits TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPMemoryPressuresChrono TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop PFMemallocDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPRcvQDrop TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPFastOpenBlackhole TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess TCPWqueueTooBig TcpExt: 0 0 0 1 0 0 0 0 0 0 83 0 0 0 3640 287 1 7460 0 0 134193 1335 829 0 4 0 1 0 0 0 0 1 19 0 0 0 0 0 3 0 32 100 4 0 0 0 7460 2421 49 1 62 6 0 23 0 7 0 0 0 0 19 2 0 0 0 0 0 6 0 0 0 0 3 0 0 0 0 92425 65515 0 2421 4 4 0 0 0 0 0 0 0 0 0 10 0 0 0 16 2221 0 0 2 45 0 0 3 0 0 0 0 456 0 0 0 @@ -1068,7 +1068,7 @@ IpExt: InNoRoutes InTruncatedPkts InMcastPkts OutMcastPkts InBcastPkts OutBcastP IpExt: 0 0 208 214 118 111 190585481 7512674 26093 25903 14546 13628 0 134215 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/net/snmp +Path: proc/26241/net/snmp Lines: 12 Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates Ip: 2 64 594223 0 1 0 0 0 593186 547253 20 231 0 0 0 0 0 0 0 @@ -1084,7 +1084,7 @@ UdpLite: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InC UdpLite: 0 0 0 0 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/net/snmp6 +Path: proc/26241/net/snmp6 Lines: 92 Ip6InReceives 92166 Ip6InHdrErrors 0 @@ -1180,24 +1180,24 @@ UdpLite6InCsumErrors 0 Mode: 644 Mode: 664 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26240/ns +Directory: proc/26241/ns Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/ns/mnt +Path: proc/26241/ns/mnt SymlinkTo: mnt:[4026531840] # ttar - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - -Path: proc/26240/ns/net +Path: proc/26241/ns/net SymlinkTo: net:[4026531993] # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/root +Path: proc/26241/root SymlinkTo: / # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/schedstat +Path: proc/26241/schedstat Lines: 1 411605849 93680043 79 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/smaps +Path: proc/26241/smaps Lines: 252 00400000-00cb1000 r-xp 00000000 fd:01 952273 /bin/alertmanager Size: 8900 kB @@ -1453,7 +1453,7 @@ Locked: 0 kB VmFlags: rd ex Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/smaps_rollup +Path: proc/26241/smaps_rollup Lines: 17 00400000-ffffffffff601000 ---p 00000000 00:00 0 [rollup] Rss: 29948 kB @@ -1474,12 +1474,12 @@ SwapPss: 1940 kB Locked: 0 kB Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/stat +Path: proc/26241/stat Lines: 1 26231 (vim) R 5392 7446 5392 34835 7446 4218880 32533 309516 26 82 1677 44 158 99 20 0 1 0 82375 56274944 1981 18446744073709551615 4194304 6294284 140736914091744 140736914087944 139965136429984 0 0 12288 1870679807 0 0 0 17 0 0 0 31 0 0 8391624 8481048 16420864 140736914093252 140736914093279 140736914093279 140736914096107 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/status +Path: proc/26241/status Lines: 53 Name: prometheus @@ -1536,57 +1536,57 @@ voluntary_ctxt_switches: 4742839 nonvoluntary_ctxt_switches: 1727500 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26240/wchan +Path: proc/26241/wchan Lines: 1 poll_schedule_timeoutEOF Mode: 664 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26241 +Directory: proc/26242 Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/cmdline +Path: proc/26242/cmdline Lines: 1 vimNULLBYTEtest.goNULLBYTE+10NULLBYTEEOF Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/comm +Path: proc/26242/comm Lines: 1 vim Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/cwd +Path: proc/26242/cwd SymlinkTo: /usr/bin # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/environ +Path: proc/26242/environ Lines: 1 -PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTESLURM_JOB_UID=1000NULLBYTESLURM_JOB_ID=10000NULLBYTESLURM_JOB_ACCOUNT=testaccNULLBYTESLURM_JOB_NODELIST=compute-[0-2]NULLBYTESLURM_STEP_GPUS=0NULLBYTE +PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTESLURM_JOB_UID=1000NULLBYTESLURM_JOB_ID=11000NULLBYTESLURM_JOB_ACCOUNT=testaccNULLBYTESLURM_JOB_NODELIST=compute-[0-2]NULLBYTESLURM_STEP_GPUS=1NULLBYTE Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/exe +Path: proc/26242/exe SymlinkTo: /usr/bin/vim # ttar - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26241/fd +Directory: proc/26242/fd Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/fd/0 +Path: proc/26242/fd/0 SymlinkTo: ../../symlinktargets/abc # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/fd/1 +Path: proc/26242/fd/1 SymlinkTo: ../../symlinktargets/def # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/fd/10 +Path: proc/26242/fd/10 SymlinkTo: ../../symlinktargets/xyz # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/fd/2 +Path: proc/26242/fd/2 SymlinkTo: ../../symlinktargets/ghi # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/fd/3 +Path: proc/26242/fd/3 SymlinkTo: ../../symlinktargets/uvw # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26241/fdinfo +Directory: proc/26242/fdinfo Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/fdinfo/0 +Path: proc/26242/fdinfo/0 Lines: 6 pos: 0 flags: 02004000 @@ -1596,7 +1596,7 @@ inotify wd:2 ino:1300016 sdev:fd00002 mask:fce ignored_mask:0 fhandle-bytes:8 fh inotify wd:1 ino:2e0001 sdev:fd00000 mask:fce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:01002e00138e7c65 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/fdinfo/1 +Path: proc/26242/fdinfo/1 Lines: 4 pos: 0 flags: 02004002 @@ -1604,28 +1604,28 @@ mnt_id: 13 eventfd-count: 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/fdinfo/10 +Path: proc/26242/fdinfo/10 Lines: 3 pos: 0 flags: 02004002 mnt_id: 9 Mode: 400 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/fdinfo/2 +Path: proc/26242/fdinfo/2 Lines: 3 pos: 0 flags: 02004002 mnt_id: 9 Mode: 400 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/fdinfo/3 +Path: proc/26242/fdinfo/3 Lines: 3 pos: 0 flags: 02004002 mnt_id: 9 Mode: 400 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/interrupts +Path: proc/26242/interrupts Lines: 49 CPU0 CPU1 CPU2 CPU3 0: 49 0 0 0 IO-APIC 2-edge timer @@ -1678,7 +1678,7 @@ NPI: 0 0 0 0 Nested posted-interrupt event PIW: 0 0 0 0 Posted-interrupt wakeup event Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/io +Path: proc/26242/io Lines: 7 rchar: 750339 wchar: 818609 @@ -1689,7 +1689,7 @@ write_bytes: 2048 cancelled_write_bytes: -1024 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/limits +Path: proc/26242/limits Lines: 17 Limit Soft Limit Hard Limit Units Max cpu time unlimited unlimited seconds @@ -1710,7 +1710,7 @@ Max realtime priority 0 0 Max realtime timeout unlimited unlimited us Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/mountstats +Path: proc/26242/mountstats Lines: 20 device rootfs mounted on / with fstype rootfs device sysfs mounted on /sys with fstype sysfs @@ -1734,10 +1734,10 @@ device 192.168.1.1:/srv/test mounted on /mnt/nfs/test with fstype nfs4 statvers= Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
-Directory: proc/26241/net +Directory: proc/26242/net Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/net/dev +Path: proc/26242/net/dev Lines: 4 Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed @@ -1745,7 +1745,7 @@ Inter-| Receive | Transmit eth0: 438 5 0 0 0 0 0 0 648 8 0 0 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/net/netstat +Path: proc/26242/net/netstat Lines: 4 TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPHPHits TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPMemoryPressuresChrono TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop PFMemallocDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPRcvQDrop TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPFastOpenBlackhole TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess TCPWqueueTooBig TcpExt: 0 0 0 1 0 0 0 0 0 0 83 0 0 0 3640 287 1 7460 0 0 134193 1335 829 0 4 0 1 0 0 0 0 1 19 0 0 0 0 0 3 0 32 100 4 0 0 0 7460 2421 49 1 62 6 0 23 0 7 0 0 0 0 19 2 0 0 0 0 0 6 0 0 0 0 3 0 0 0 0 92425 65515 0 2421 4 4 0 0 0 0 0 0 0 0 0 10 0 0 0 16 2221 0 0 2 45 0 0 3 0 0 0 0 456 0 0 0 @@ -1753,7 +1753,7 @@ IpExt: InNoRoutes InTruncatedPkts InMcastPkts OutMcastPkts InBcastPkts OutBcastP IpExt: 0 0 208 214 118 111 190585481 7512674 26093 25903 14546 13628 0 134215 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/net/snmp +Path: proc/26242/net/snmp Lines: 12 Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates Ip: 2 64 594223 0 1 0 0 0 593186 547253 20 231 0 0 0 0 0 0 0 @@ -1769,7 +1769,7 @@ UdpLite: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InC UdpLite: 0 0 0 0 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - -Path: proc/26241/net/snmp6 +Path: proc/26242/net/snmp6 Lines: 92 Ip6InReceives 92166 Ip6InHdrErrors 0 @@ -1865,24 +1865,24 @@ UdpLite6InCsumErrors 0 Mode: 644 Mode: 664 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26241/ns +Directory: proc/26242/ns Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/ns/mnt +Path: proc/26242/ns/mnt SymlinkTo: mnt:[4026531840] # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/ns/net +Path: proc/26242/ns/net SymlinkTo: net:[4026531993] # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/root +Path: proc/26242/root SymlinkTo: / # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/schedstat +Path: proc/26242/schedstat Lines: 1 411605849 93680043 79 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/smaps +Path: proc/26242/smaps Lines: 252 00400000-00cb1000 r-xp 00000000 fd:01 952273 /bin/alertmanager Size: 8900 kB @@ -2138,7 +2138,7 @@ Locked: 0 kB VmFlags: rd ex Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/smaps_rollup +Path: proc/26242/smaps_rollup Lines: 17 00400000-ffffffffff601000 ---p 00000000 00:00 0 [rollup] Rss: 29948 kB @@ -2159,12 +2159,12 @@ SwapPss: 1940 kB Locked: 0 kB Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/stat +Path: proc/26242/stat Lines: 1 26231 (vim) R 5392 7446 5392 34835 7446 4218880 32533 309516 26 82 1677 44 158 99 20 0 1 0 82375 56274944 1981 18446744073709551615 4194304 6294284 140736914091744 140736914087944 139965136429984 0 0 12288 1870679807 0 0 0 17 0 0 0 31 0 0 8391624 8481048 16420864 140736914093252 140736914093279 140736914093279 140736914096107 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/status +Path: proc/26242/status Lines: 53 Name: prometheus @@ -2221,57 +2221,108 @@ voluntary_ctxt_switches: 4742839 nonvoluntary_ctxt_switches: 1727500 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26241/wchan +Path: proc/26242/wchan Lines: 1 poll_schedule_timeoutEOF Mode: 664 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26242 +Directory: proc/27079 Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/cmdline +Path: proc/27079/stat +Lines: 1 +27079 (pthread_load) S 1 27079 1 34816 27079 4194304 113 0 1 0 58125 15 0 0 20 0 5 0 4289574 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 17 2 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: proc/27079/task +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: proc/27079/task/27079 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: proc/27079/task/27079/stat +Lines: 1 +27079 (pthread_load) S 1 27079 1 34816 27079 4194304 97 0 1 0 0 0 0 0 20 0 5 0 4289574 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 
0 0 0 0 0 0 17 2 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: proc/27079/task/27080 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: proc/27079/task/27080/stat +Lines: 1 +27080 (pthread_load) R 1 27079 1 34816 27079 4194368 7 0 0 0 34136 3 0 0 20 0 5 0 4289575 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: proc/27079/task/27081 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: proc/27079/task/27081/stat +Lines: 1 +27081 (pthread_load) S 1 27079 1 34816 27079 1077936192 3 0 0 0 13680 4 0 0 20 0 5 0 4289575 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 -1 5 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: proc/27079/task/27082 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: proc/27079/task/27082/stat +Lines: 1 +27082 (pthread_load) S 1 27079 1 34816 27079 1077936192 3 0 0 0 6859 3 0 0 20 0 5 0 4289575 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 -1 1 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: proc/27079/task/27083 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: proc/27079/task/27083/stat +Lines: 1 +27083 (pthread_load) S 1 27079 1 34816 27079 1077936192 3 0 0 0 3452 4 0 0 20 0 5 0 4289575 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 -1 4 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: proc/3346567 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: proc/3346567/cmdline Lines: 1 vimNULLBYTEtest.goNULLBYTE+10NULLBYTEEOF Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/comm +Path: proc/3346567/comm Lines: 1 vim Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/cwd +Path: proc/3346567/cwd SymlinkTo: /usr/bin # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/environ +Path: proc/3346567/environ Lines: 1 -PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTESLURM_JOB_UID=1000NULLBYTESLURM_JOB_ID=11000NULLBYTESLURM_JOB_ACCOUNT=testaccNULLBYTESLURM_JOB_NODELIST=compute-[0-2]NULLBYTESLURM_STEP_GPUS=1NULLBYTE 
+PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTESLURM_JOB_UID=1000NULLBYTESLURM_JOB_ID=1009248NULLBYTESLURM_JOB_ACCOUNT=testaccNULLBYTESLURM_JOB_NODELIST=compute-[0-2]NULLBYTESLURM_JOB_GPUS=2,3NULLBYTE Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/exe +Path: proc/3346567/exe SymlinkTo: /usr/bin/vim # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26242/fd +Directory: proc/3346567/fd Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/fd/0 +Path: proc/3346567/fd/0 SymlinkTo: ../../symlinktargets/abc # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/fd/1 +Path: proc/3346567/fd/1 SymlinkTo: ../../symlinktargets/def # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/fd/10 +Path: proc/3346567/fd/10 SymlinkTo: ../../symlinktargets/xyz # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/fd/2 +Path: proc/3346567/fd/2 SymlinkTo: ../../symlinktargets/ghi # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/fd/3 +Path: proc/3346567/fd/3 SymlinkTo: ../../symlinktargets/uvw # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26242/fdinfo +Directory: proc/3346567/fdinfo Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/fdinfo/0 +Path: proc/3346567/fdinfo/0 Lines: 6 pos: 0 flags: 02004000 @@ -2281,7 +2332,7 @@ inotify wd:2 ino:1300016 sdev:fd00002 mask:fce ignored_mask:0 fhandle-bytes:8 fh inotify wd:1 ino:2e0001 sdev:fd00000 mask:fce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:01002e00138e7c65 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/fdinfo/1 +Path: proc/3346567/fdinfo/1 Lines: 4 pos: 0 flags: 02004002 @@ -2289,28 +2340,28 @@ mnt_id: 13 eventfd-count: 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/fdinfo/10 +Path: proc/3346567/fdinfo/10 Lines: 3 pos: 0 flags: 02004002 mnt_id: 9 Mode: 400 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/fdinfo/2 +Path: proc/3346567/fdinfo/2 Lines: 3 pos: 0 flags: 02004002 mnt_id: 9 Mode: 400 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/fdinfo/3 +Path: proc/3346567/fdinfo/3 Lines: 3 pos: 0 flags: 02004002 mnt_id: 9 Mode: 400 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/interrupts +Path: proc/3346567/interrupts Lines: 49 CPU0 CPU1 CPU2 CPU3 0: 49 0 0 0 IO-APIC 2-edge timer @@ -2363,7 +2414,7 @@ NPI: 0 0 0 0 Nested posted-interrupt event PIW: 0 0 0 0 Posted-interrupt wakeup event Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/io +Path: proc/3346567/io Lines: 7 rchar: 750339 wchar: 818609 @@ -2374,7 +2425,7 @@ write_bytes: 2048 cancelled_write_bytes: -1024 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/limits +Path: proc/3346567/limits Lines: 17 Limit Soft Limit Hard Limit 
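The renamed `proc/3346567` fixture above now carries `SLURM_JOB_ID=1009248` and `SLURM_JOB_GPUS=2,3` in its NUL-separated `environ`, which is what the procfs-based GPU-to-job mapping exercises. The sketch below shows, under stated assumptions, how such ordinals can be pulled out of `/proc/<pid>/environ`; the helper name and error handling are illustrative and not the exporter's actual code.

```go
// Sketch: read SLURM GPU ordinals from a process environment, as exercised by
// the proc/3346567/environ fixture above. Entries in environ are NUL-separated.
// gpuOrdinalsFromEnviron is a hypothetical helper, not part of this diff.
package main

import (
	"fmt"
	"os"
	"strings"
)

func gpuOrdinalsFromEnviron(pid string) ([]string, error) {
	data, err := os.ReadFile("/proc/" + pid + "/environ")
	if err != nil {
		return nil, err
	}
	for _, entry := range strings.Split(string(data), "\x00") {
		// SLURM_JOB_GPUS (e.g. "2,3") or SLURM_STEP_GPUS hold the ordinals.
		for _, key := range []string{"SLURM_JOB_GPUS=", "SLURM_STEP_GPUS="} {
			if strings.HasPrefix(entry, key) {
				return strings.Split(strings.TrimPrefix(entry, key), ","), nil
			}
		}
	}
	return nil, fmt.Errorf("no SLURM GPU variables found for pid %s", pid)
}

func main() {
	ordinals, err := gpuOrdinalsFromEnviron("3346567")
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(ordinals) // for the fixture above this would print [2 3]
}
```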
Units Max cpu time unlimited unlimited seconds @@ -2395,7 +2446,7 @@ Max realtime priority 0 0 Max realtime timeout unlimited unlimited us Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/mountstats +Path: proc/3346567/mountstats Lines: 20 device rootfs mounted on / with fstype rootfs device sysfs mounted on /sys with fstype sysfs @@ -2419,10 +2470,10 @@ device 192.168.1.1:/srv/test mounted on /mnt/nfs/test with fstype nfs4 statvers= Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26242/net +Directory: proc/3346567/net Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/net/dev +Path: proc/3346567/net/dev Lines: 4 Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed @@ -2430,7 +2481,7 @@ Inter-| Receive | Transmit eth0: 438 5 0 0 0 0 0 0 648 8 0 0 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/net/netstat +Path: proc/3346567/net/netstat Lines: 4 TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPHPHits TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPMemoryPressuresChrono TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop PFMemallocDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPRcvQDrop TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPFastOpenBlackhole TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess TCPWqueueTooBig TcpExt: 0 0 0 1 0 0 0 0 0 0 83 0 0 0 3640 287 1 7460 0 0 134193 1335 829 0 4 0 1 0 0 0 0 1 19 0 0 0 0 0 3 0 32 100 4 0 0 0 7460 2421 49 1 62 6 0 23 0 7 0 0 0 0 19 2 0 0 0 0 0 6 0 0 0 0 3 0 0 0 0 92425 65515 0 2421 4 4 0 0 0 0 0 0 0 0 0 10 0 0 0 16 2221 0 0 2 45 0 0 3 0 0 0 0 456 0 0 0 @@ -2438,7 +2489,7 @@ IpExt: InNoRoutes InTruncatedPkts InMcastPkts OutMcastPkts InBcastPkts OutBcastP IpExt: 0 0 208 214 118 111 190585481 7512674 26093 25903 14546 13628 0 134215 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - -Path: proc/26242/net/snmp +Path: proc/3346567/net/snmp Lines: 12 Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates Ip: 2 64 594223 0 1 0 0 0 593186 547253 20 231 0 0 0 0 0 0 0 @@ -2454,7 +2505,7 @@ UdpLite: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InC UdpLite: 0 0 0 0 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/net/snmp6 +Path: proc/3346567/net/snmp6 Lines: 92 Ip6InReceives 92166 Ip6InHdrErrors 0 @@ -2550,24 +2601,24 @@ UdpLite6InCsumErrors 0 Mode: 644 Mode: 664 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/26242/ns +Directory: proc/3346567/ns Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/ns/mnt +Path: proc/3346567/ns/mnt SymlinkTo: mnt:[4026531840] # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/ns/net +Path: proc/3346567/ns/net SymlinkTo: net:[4026531993] # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/root +Path: proc/3346567/root SymlinkTo: / # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/schedstat +Path: proc/3346567/schedstat Lines: 1 411605849 93680043 79 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/smaps +Path: proc/3346567/smaps Lines: 252 00400000-00cb1000 r-xp 00000000 fd:01 952273 /bin/alertmanager Size: 8900 kB @@ -2823,7 +2874,7 @@ Locked: 0 kB VmFlags: rd ex Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/smaps_rollup +Path: proc/3346567/smaps_rollup Lines: 17 00400000-ffffffffff601000 ---p 00000000 00:00 0 [rollup] Rss: 29948 kB @@ -2844,12 +2895,12 @@ SwapPss: 1940 kB Locked: 0 kB Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/stat +Path: proc/3346567/stat Lines: 1 26231 (vim) R 5392 7446 5392 34835 7446 4218880 32533 309516 26 82 1677 44 158 99 20 0 1 0 82375 56274944 1981 18446744073709551615 4194304 6294284 140736914091744 140736914087944 139965136429984 0 0 12288 1870679807 0 0 0 17 0 0 0 31 0 0 8391624 8481048 16420864 140736914093252 140736914093279 140736914093279 140736914096107 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/status +Path: proc/3346567/status Lines: 53 Name: prometheus @@ -2906,62 +2957,11 @@ voluntary_ctxt_switches: 4742839 nonvoluntary_ctxt_switches: 1727500 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/26242/wchan +Path: proc/3346567/wchan Lines: 1 poll_schedule_timeoutEOF Mode: 664 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/27079 -Mode: 775 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/27079/stat -Lines: 1 -27079 (pthread_load) S 1 27079 1 34816 27079 4194304 113 0 1 0 58125 15 0 0 20 0 5 0 4289574 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 17 2 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 
140736878641129 0 -Mode: 644 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/27079/task -Mode: 775 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/27079/task/27079 -Mode: 775 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/27079/task/27079/stat -Lines: 1 -27079 (pthread_load) S 1 27079 1 34816 27079 4194304 97 0 1 0 0 0 0 0 20 0 5 0 4289574 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 17 2 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 -Mode: 644 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/27079/task/27080 -Mode: 775 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/27079/task/27080/stat -Lines: 1 -27080 (pthread_load) R 1 27079 1 34816 27079 4194368 7 0 0 0 34136 3 0 0 20 0 5 0 4289575 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 -Mode: 644 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/27079/task/27081 -Mode: 775 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/27079/task/27081/stat -Lines: 1 -27081 (pthread_load) S 1 27079 1 34816 27079 1077936192 3 0 0 0 13680 4 0 0 20 0 5 0 4289575 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 -1 5 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 -Mode: 644 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/27079/task/27082 -Mode: 775 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/27079/task/27082/stat -Lines: 1 -27082 (pthread_load) S 1 27079 1 34816 27079 1077936192 3 0 0 0 6859 3 0 0 20 0 5 0 4289575 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 -1 1 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 -Mode: 644 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Directory: proc/27079/task/27083 -Mode: 775 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: proc/27079/task/27083/stat -Lines: 1 -27083 (pthread_load) S 1 27079 1 34816 27079 1077936192 3 0 0 0 3452 4 0 0 20 0 5 0 4289575 36282368 138 18446744073709551615 94441498279936 94441498282741 140736878632528 0 0 0 0 0 0 0 0 0 -1 4 0 0 0 0 0 94441498291504 94441498292248 94441510707200 140736878639434 140736878639460 140736878639460 140736878641129 0 -Mode: 644 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: proc/584 Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -5591,3 +5591,7 @@ Node 0, zone Device protection: (0, 0, 0, 0, 0) Mode: 444 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: proc/.unpacked +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/pkg/collector/fixtures/sys.ttar 
b/pkg/collector/fixtures/sys.ttar index 78daa714..f85dbfe0 100644 --- a/pkg/collector/fixtures/sys.ttar +++ b/pkg/collector/fixtures/sys.ttar @@ -1,4 +1,4 @@ -# Archive created by ttar -C collector/fixtures -c -f collector/fixtures/sys.ttar sys +# Archive created by ttar -C pkg/collector/fixtures -c -f pkg/collector/fixtures/sys.ttar sys Directory: sys Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1324,10 +1324,6 @@ Lines: 1 max 100000 Mode: 640 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009248/cpu.pressure -Lines: 0 -Mode: 640 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009248/cpu.stat Lines: 6 usage_usec 60491070351 @@ -1373,10 +1369,6 @@ Lines: 1 0-1 Mode: 440 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009248/io.pressure -Lines: 0 -Mode: 640 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009248/memory.current Lines: 1 4111491072 @@ -1455,10 +1447,6 @@ Lines: 1 0 Mode: 640 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009248/memory.pressure -Lines: 0 -Mode: 640 -# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009248/memory.stat Lines: 40 anon 4098592768 @@ -5083,3 +5071,7 @@ Lines: 1 max Mode: 640 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/.unpacked +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/pkg/collector/helper.go b/pkg/collector/helper.go index ba9292eb..768a615c 100644 --- a/pkg/collector/helper.go +++ b/pkg/collector/helper.go @@ -7,8 +7,19 @@ import ( "strings" "regexp" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/mahendrapaipuri/batchjob_monitoring/internal/helpers" ) +type Device struct { + index string + name string + uuid string + isMig bool +} + var ( metricNameRegex = regexp.MustCompile(`_*[^0-9A-Za-z_]+_*`) ) @@ -75,3 +86,66 @@ func LoadCgroupsV2Metrics( } return data, nil } + +// Get all physical or MIG devices using nvidia-smi command +// Example output: +// bash-4.4$ nvidia-smi --query-gpu=name,uuid --format=csv +// name, uuid +// Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e +// Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 +// +// Here we are using nvidia-smi to avoid having build issues if we use +// nvml go bindings. This way we dont have deps on nvidia stuff and keep +// exporter simple. +// +// NOTE: Hoping this command returns MIG devices too +func GetNvidiaGPUDevices(nvidiaSmiPath string, logger log.Logger) (map[int]Device, error) { + // Check if nvidia-smi binary exists + if _, err := os.Stat(nvidiaSmiPath); err != nil { + level.Error(logger).Log("msg", "Failed to open nvidia-smi executable", "path", nvidiaSmiPath, "err", err) + return nil, err + } + + // Execute nvidia-smi command to get available GPUs + args := []string{"--query-gpu=index,name,uuid", "--format=csv"} + nvidiaSmiOutput, err := helpers.Execute(nvidiaSmiPath, args, logger) + if err != nil { + level.Error(logger). 
+ Log("msg", "nvidia-smi command to get list of devices failed", "err", err) + return nil, err + } + + // Get all devices + gpuDevices := map[int]Device{} + devIndxInt := 0 + for _, line := range strings.Split(string(nvidiaSmiOutput), "\n") { + // Header line, empty line and newlines are ignored + if line == "" || line == "\n" || strings.HasPrefix(line, "index") { + continue + } + + devDetails := strings.Split(line, ",") + if len(devDetails) < 3 { + level.Error(logger). + Log("msg", "Cannot parse output from nvidia-smi command", "output", line) + continue + } + + // Get device index, name and UUID + devIndx := strings.TrimSpace(devDetails[0]) + devName := strings.TrimSpace(devDetails[1]) + devUuid := strings.TrimSpace(devDetails[2]) + + // Check if device is in MiG mode + isMig := false + if strings.HasPrefix(devUuid, "MIG") { + isMig = true + } + level.Debug(logger). + Log("msg", "Found nVIDIA GPU", "name", devName, "UUID", devUuid, "isMig:", isMig) + + gpuDevices[devIndxInt] = Device{index: devIndx, name: devName, uuid: devUuid, isMig: isMig} + devIndxInt++ + } + return gpuDevices, nil +} diff --git a/pkg/collector/nvidia_gpus.go b/pkg/collector/nvidia_gpus.go deleted file mode 100644 index 31786207..00000000 --- a/pkg/collector/nvidia_gpus.go +++ /dev/null @@ -1,264 +0,0 @@ -//go:build !nonvidia -// +build !nonvidia - -package collector - -import ( - "fmt" - "os" - "slices" - "strconv" - "strings" - "sync" - - "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/mahendrapaipuri/batchjob_monitoring/internal/helpers" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/procfs" -) - -const nvidiaGpuJobMapCollectorSubsystem = "nvidia_gpu" - -var ( - jobMapLock = sync.RWMutex{} - nvidiaSmiPath = BatchJobExporterApp.Flag( - "collector.nvidia.smi.path", - "Absolute path to nvidia-smi executable.", - ).Default("/usr/bin/nvidia-smi").String() - gpuStatPath = BatchJobExporterApp.Flag( - "collector.nvidia.gpu.job.map.path", - "Path to file that maps GPU ordinals to job IDs.", - ).Default("/run/gpujobmap").String() -) - -type Device struct { - index string - name string - uuid string - isMig bool -} - -type nvidiaGpuJobMapCollector struct { - devices []Device - logger log.Logger - hostname string - gpuJobMapDesc *prometheus.Desc -} - -func init() { - RegisterCollector( - nvidiaGpuJobMapCollectorSubsystem, - defaultDisabled, - NewNvidiaGpuJobMapCollector, - ) -} - -// Get all physical or MIG devices using nvidia-smi command -// Example output: -// bash-4.4$ nvidia-smi --query-gpu=name,uuid --format=csv -// name, uuid -// Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e -// Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 -// -// Here we are using nvidia-smi to avoid having build issues if we use -// nvml go bindings. This way we dont have deps on nvidia stuff and keep -// exporter simple. -// -// NOTE: Hoping this command returns MIG devices too -func getAllDevices(logger log.Logger) ([]Device, error) { - // Check if nvidia-smi binary exists - if _, err := os.Stat(*nvidiaSmiPath); err != nil { - level.Error(logger).Log("msg", "Failed to open nvidia-smi executable", "path", *nvidiaSmiPath, "err", err) - return nil, err - } - - // Execute nvidia-smi command to get available GPUs - args := []string{"--query-gpu=index,name,uuid", "--format=csv"} - nvidiaSmiOutput, err := helpers.Execute(*nvidiaSmiPath, args, logger) - if err != nil { - level.Error(logger). 
- Log("msg", "nvidia-smi command to get list of devices failed", "err", err) - return nil, err - } - - // Get all devices - allDevices := []Device{} - for _, line := range strings.Split(string(nvidiaSmiOutput), "\n") { - // Header line, empty line and newlines are ignored - if line == "" || line == "\n" || strings.HasPrefix(line, "index") { - continue - } - - devDetails := strings.Split(line, ",") - if len(devDetails) < 3 { - level.Error(logger). - Log("msg", "Cannot parse output from nvidia-smi command", "output", line) - continue - } - - // Get device index, name and UUID - devIndx := strings.TrimSpace(devDetails[0]) - devName := strings.TrimSpace(devDetails[1]) - devUuid := strings.TrimSpace(devDetails[2]) - - // Check if device is in MiG mode - isMig := false - if strings.HasPrefix(devUuid, "MIG") { - isMig = true - } - level.Debug(logger). - Log("msg", "Found nVIDIA GPU", "name", devName, "UUID", devUuid, "isMig:", isMig) - - allDevices = append(allDevices, Device{index: devIndx, name: devName, uuid: devUuid, isMig: isMig}) - } - return allDevices, nil -} - -// NewNvidiaGpuJobMapCollector returns a new Collector exposing batch jobs to nVIDIA GPU ordinals mapping. -func NewNvidiaGpuJobMapCollector(logger log.Logger) (Collector, error) { - var hostname string - var err error - - // Get hostname - if !*emptyHostnameLabel { - hostname, err = os.Hostname() - if err != nil { - level.Error(logger).Log("msg", "Failed to get hostname", "err", err) - } - } - - allDevices, _ := getAllDevices(logger) - gpuJobMapDesc := prometheus.NewDesc( - prometheus.BuildFQName(Namespace, nvidiaGpuJobMapCollectorSubsystem, "jobid"), - "Batch Job ID of current nVIDIA GPU", - []string{"hostname", "uuid", "UUID"}, nil, - ) - - collector := nvidiaGpuJobMapCollector{ - devices: allDevices, - logger: logger, - hostname: hostname, - gpuJobMapDesc: gpuJobMapDesc, - } - return &collector, nil -} - -// Update implements Collector and exposes IPMI DCMI power related metrics. -func (c *nvidiaGpuJobMapCollector) Update(ch chan<- prometheus.Metric) error { - gpuJobMapper, _ := c.getJobId() - for _, dev := range c.devices { - ch <- prometheus.MustNewConstMetric(c.gpuJobMapDesc, prometheus.GaugeValue, gpuJobMapper[dev.uuid], c.hostname, dev.uuid, dev.uuid) - } - return nil -} - -// Read gpustat file and get job ID of each GPU -func (c *nvidiaGpuJobMapCollector) getJobId() (map[string]float64, error) { - gpuJobMapper := make(map[string]float64) - for _, dev := range c.devices { - var jobId int64 = 0 - var slurmInfo string = fmt.Sprintf("%s/%s", *gpuStatPath, dev.index) - - // NOTE: Look for file name with UUID as it will be more appropriate with - // MIG instances. - // If /run/gpustat/0 file is not found, check for the file with UUID as name? 
- if _, err := os.Stat(slurmInfo); err == nil { - content, err := os.ReadFile(slurmInfo) - if err != nil { - level.Error(c.logger).Log( - "msg", "Failed to get job ID for GPU", - "index", dev.index, "uuid", dev.uuid, "err", err, - ) - gpuJobMapper[dev.uuid] = float64(0) - } - fmt.Sscanf(string(content), "%d", &jobId) - gpuJobMapper[dev.uuid] = float64(jobId) - } else { - // Attempt to get GPU dev indices from /proc file system by looking into - // environ for the process that has same SLURM_JOB_ID - // - // Instantiate a new Proc FS - procFS, err := procfs.NewFS(*procfsPath) - if err != nil { - level.Error(c.logger).Log("msg", "Unable to open procfs", "path", *procfsPath) - goto outside - } - - // Get all procs from current proc fs - allProcs, err := procFS.AllProcs() - if err != nil { - level.Error(c.logger).Log("msg", "Failed to read /proc", "err", err) - gpuJobMapper[dev.uuid] = float64(0) - - // If we cannot read procfs break - goto outside - } - - // Initialize a waitgroup for all go routines that we will spawn later - wg := &sync.WaitGroup{} - wg.Add(len(allProcs)) - - // Iterate through all procs and look for SLURM_JOB_ID env entry - for _, proc := range allProcs { - go func(p procfs.Proc) { - // Read process environment variables - // NOTE: This needs CAP_SYS_PTRACE and CAP_DAC_READ_SEARCH caps - // on the current process - environments, err := p.Environ() - - // Skip if we cannot read file - if err != nil { - wg.Done() - return - } - - var gpuIndices []string - var slurmJobId string = "" - - // Loop through all env vars and get SLURM_SETP_GPUS/SLURM_JOB_GPUS - // and SLURM_JOB_ID - for _, env := range environments { - // Check both SLURM_SETP_GPUS and SLURM_JOB_GPUS vars - if strings.Contains(env, "SLURM_STEP_GPUS") || strings.Contains(env, "SLURM_JOB_GPUS") { - gpuIndices = strings.Split(strings.Split(env, "=")[1], ",") - } - if strings.Contains(env, "SLURM_JOB_ID") { - slurmJobId = strings.Split(env, "=")[1] - } - } - - // If gpuIndices has current GPU index, assign the jobID and break loop - if slices.Contains(gpuIndices, dev.index) { - jobMapLock.Lock() - jid, err := strconv.Atoi(slurmJobId) - if err != nil { - gpuJobMapper[dev.uuid] = float64(0) - } - gpuJobMapper[dev.uuid] = float64(jid) - jobMapLock.Unlock() - } - - // Mark routine as done - wg.Done() - - }(proc) - } - - // Wait for all go routines - wg.Wait() - } - outside: - if gpuJobMapper[dev.uuid] == 0 { - level.Error(c.logger).Log( - "msg", "Failed to get job ID for GPU", "index", dev.index, "uuid", dev.uuid, - ) - } else { - level.Debug(c.logger).Log( - "msg", "Foung job ID for GPU", "index", dev.index, "uuid", dev.uuid, - "jobid", gpuJobMapper[dev.uuid], - ) - } - } - return gpuJobMapper, nil -} diff --git a/pkg/collector/nvidia_gpus_test.go b/pkg/collector/nvidia_gpus_test.go deleted file mode 100644 index d06c0d20..00000000 --- a/pkg/collector/nvidia_gpus_test.go +++ /dev/null @@ -1,70 +0,0 @@ -//go:build !nonvidia -// +build !nonvidia - -package collector - -import ( - "testing" - - "github.com/go-kit/log" -) - -var ( - devices = []Device{ - { - index: "0", - name: "fakeGpu1", - uuid: "GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e", - isMig: false, - }, { - index: "1", - name: "fakeGpu2", - uuid: "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3", - isMig: false, - }, - } -) - -func TestNvidiaJobGpuMap(t *testing.T) { - if _, err := BatchJobExporterApp.Parse([]string{"--collector.nvidia.gpu.job.map.path", "fixtures/gpujobmap"}); err != nil { - t.Fatal(err) - } - c := nvidiaGpuJobMapCollector{devices: devices, logger: 
log.NewNopLogger()} - gpuJobMapper, _ := c.getJobId() - if gpuJobMapper["GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"] != 10000 { - t.Fatalf( - "Expected Job ID is %d: \nGot %f", - 10000, - gpuJobMapper["GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"], - ) - } - if gpuJobMapper["GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"] != 11000 { - t.Fatalf( - "Expected Job ID is %d: \nGot %f", - 11000, - gpuJobMapper["GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"], - ) - } -} - -func TestNvidiaJobGpuMapWithProcFs(t *testing.T) { - if _, err := BatchJobExporterApp.Parse([]string{"--path.procfs", "fixtures/proc"}); err != nil { - t.Fatal(err) - } - c := nvidiaGpuJobMapCollector{devices: devices, logger: log.NewNopLogger()} - gpuJobMapper, _ := c.getJobId() - if gpuJobMapper["GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"] != 10000 { - t.Fatalf( - "Expected Job ID is %d: \nGot %f", - 10000, - gpuJobMapper["GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"], - ) - } - if gpuJobMapper["GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"] != 11000 { - t.Fatalf( - "Expected Job ID is %d: \nGot %f", - 11000, - gpuJobMapper["GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"], - ) - } -} diff --git a/pkg/collector/slurm.go b/pkg/collector/slurm.go index 8d1ea406..eeba864e 100644 --- a/pkg/collector/slurm.go +++ b/pkg/collector/slurm.go @@ -16,6 +16,7 @@ import ( "github.com/containerd/cgroups/v3" "github.com/containerd/cgroups/v3/cgroup1" + "github.com/containerd/cgroups/v3/cgroup2" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/mahendrapaipuri/batchjob_monitoring/internal/helpers" @@ -26,13 +27,20 @@ import ( const slurmCollectorSubsystem = "slurm_job" var ( - cgroupsV2 = false metricLock = sync.RWMutex{} collectJobSteps = BatchJobExporterApp.Flag( "collector.slurm.jobsteps.metrics", `Enables collection of metrics of all slurm job steps and tasks (default: disabled). [WARNING: This option can result in very high cardinality of metrics]`, ).Default("false").Bool() + collectSwapMemoryStats = BatchJobExporterApp.Flag( + "collector.slurm.swap.memory.metrics", + "Enables collection of swap memory metrics (default: disabled)", + ).Default("false").Bool() + collectPSIStats = BatchJobExporterApp.Flag( + "collector.slurm.psi.metrics", + "Enables collection of PSI metrics (default: disabled)", + ).Default("false").Bool() useJobIdHash = BatchJobExporterApp.Flag( "collector.slurm.create.unique.jobids", `Enables calculation of a unique hash based job UUID (default: disabled). @@ -43,6 +51,18 @@ UUID is calculated based on SLURM_JOBID, SLURM_JOB_UID, SLURM_JOB_ACCOUNT, SLURM `Directory containing files with job properties. Files should be named after SLURM_JOBID with contents as "$SLURM_JOB_UID $SLURM_JOB_ACCOUNT $SLURM_JOB_NODELIST" in the same order.`, ).Default("/run/slurmjobprops").String() + gpuStatPath = BatchJobExporterApp.Flag( + "collector.slurm.nvidia.gpu.job.map.path", + "Path to file that maps GPU ordinals to job IDs.", + ).Default("/run/gpujobmap").String() + nvidiaSmiPath = BatchJobExporterApp.Flag( + "collector.slurm.nvidia.smi.path", + "Absolute path to nvidia-smi executable.", + ).Default("/usr/bin/nvidia-smi").String() + forceCgroupsVersion = BatchJobExporterApp.Flag( + "collector.slurm.force.cgroups.version", + "Set cgroups version manually. 
Used only for testing.", + ).Hidden().Enum("v1", "v2") ) type CgroupMetric struct { @@ -51,6 +71,7 @@ type CgroupMetric struct { cpuSystem float64 cpuTotal float64 cpus int + cpuPressure float64 memoryRSS float64 memoryCache float64 memoryUsed float64 @@ -59,6 +80,7 @@ type CgroupMetric struct { memswUsed float64 memswTotal float64 memswFailCount float64 + memoryPressure float64 userslice bool batch string hostname string @@ -66,6 +88,7 @@ type CgroupMetric struct { jobaccount string jobid string jobuuid string + jobGpuOrdinals []string step string task string err bool @@ -76,10 +99,12 @@ type slurmCollector struct { cgroupsRootPath string slurmCgroupsPath string hostname string + nvidiaGPUDevs map[int]Device cpuUser *prometheus.Desc cpuSystem *prometheus.Desc cpuTotal *prometheus.Desc cpus *prometheus.Desc + cpuPressure *prometheus.Desc memoryRSS *prometheus.Desc memoryCache *prometheus.Desc memoryUsed *prometheus.Desc @@ -88,6 +113,8 @@ type slurmCollector struct { memswUsed *prometheus.Desc memswTotal *prometheus.Desc memswFailCount *prometheus.Desc + memoryPressure *prometheus.Desc + gpuJobMap *prometheus.Desc collectError *prometheus.Desc logger log.Logger } @@ -98,19 +125,19 @@ func init() { // NewSlurmCollector returns a new Collector exposing a summary of cgroups. func NewSlurmCollector(logger log.Logger) (Collector, error) { - var cgroupsVer string + var cgroupsVersion string var cgroupsRootPath string var slurmCgroupsPath string var hostname string var err error if cgroups.Mode() == cgroups.Unified { - cgroupsVer = "v2" + cgroupsVersion = "v2" level.Info(logger).Log("msg", "Cgroup version v2 detected", "mount", *cgroupfsPath) cgroupsRootPath = *cgroupfsPath slurmCgroupsPath = fmt.Sprintf("%s/system.slice/slurmstepd.scope", *cgroupfsPath) } else { - cgroupsVer = "v1" + cgroupsVersion = "v1" level.Info(logger).Log("msg", "Cgroup version v2 not detected, will proceed with v1.") cgroupsRootPath = fmt.Sprintf("%s/cpuacct", *cgroupfsPath) slurmCgroupsPath = fmt.Sprintf("%s/slurm", cgroupsRootPath) @@ -124,102 +151,124 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { } } - // // Snippet for testing e2e tests for cgroups v1 - // cgroupsVer = "v1" - // level.Info(logger).Log("msg", "Cgroup version v2 not detected, will proceed with v1.") - // cgroupsRootPath = fmt.Sprintf("%s/cpuacct", *cgroupfsPath) - // slurmCgroupsPath = fmt.Sprintf("%s/slurm", cgroupsRootPath) - - // Dont fail starting collector. 
Let it fail during scraping - // Check if cgroups exist - // if _, err := os.Stat(slurmCgroupsPath); err != nil { - // level.Error(logger).Log("msg", "Slurm cgroups hierarchy not found", "path", slurmCgroupsPath, "err", err) - // return nil, err - // } + // If cgroup version is set via CLI flag for testing override the one we got earlier + if *forceCgroupsVersion != "" { + cgroupsVersion = *forceCgroupsVersion + if cgroupsVersion == "v2" { + cgroupsRootPath = *cgroupfsPath + slurmCgroupsPath = fmt.Sprintf("%s/system.slice/slurmstepd.scope", *cgroupfsPath) + } else if cgroupsVersion == "v1" { + cgroupsRootPath = fmt.Sprintf("%s/cpuacct", *cgroupfsPath) + slurmCgroupsPath = fmt.Sprintf("%s/slurm", cgroupsRootPath) + } + } + // Attempt to get nVIDIA GPU devices + nvidiaGPUDevs, err := GetNvidiaGPUDevices(*nvidiaSmiPath, logger) + if err == nil { + level.Info(logger).Log("msg", "nVIDIA GPU devices found") + } return &slurmCollector{ - cgroups: cgroupsVer, + cgroups: cgroupsVersion, cgroupsRootPath: cgroupsRootPath, slurmCgroupsPath: slurmCgroupsPath, hostname: hostname, + nvidiaGPUDevs: nvidiaGPUDevs, cpuUser: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "cpu", "user_seconds"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "cpu_user_seconds"), "Cumulative CPU user seconds", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), cpuSystem: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "cpu", "system_seconds"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "cpu_system_seconds"), "Cumulative CPU system seconds", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), cpuTotal: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "cpu", "total_seconds"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "cpu_total_seconds"), "Cumulative CPU total seconds", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), cpus: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "", "cpus"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "cpus"), "Number of CPUs", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), + cpuPressure: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "cpu_psi_seconds"), + "Cumulative CPU PSI seconds", + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, + nil, + ), memoryRSS: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "memory", "rss_bytes"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memory_rss_bytes"), "Memory RSS used in bytes", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memoryCache: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "memory", "cache_bytes"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memory_cache_bytes"), "Memory cache used in bytes", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memoryUsed: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "memory", "used_bytes"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memory_used_bytes"), "Memory used in bytes", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memoryTotal: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "memory", "total_bytes"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memory_total_bytes"), 
"Memory total in bytes", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memoryFailCount: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "memory", "fail_count"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memory_fail_count"), "Memory fail count", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memswUsed: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "memsw", "used_bytes"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memsw_used_bytes"), "Swap used in bytes", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memswTotal: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "memsw", "total_bytes"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memsw_total_bytes"), "Swap total in bytes", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), memswFailCount: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "memsw", "fail_count"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memsw_fail_count"), "Swap fail count", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), + memoryPressure: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memory_psi_seconds"), + "Cumulative memory PSI seconds", + []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, + nil, + ), collectError: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, "exporter", "collect_error"), + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "collect_error"), "Indicates collection error, 0=no error, 1=error", []string{"batch", "hostname", "jobid", "jobaccount", "jobuuid", "step", "task"}, nil, ), + gpuJobMap: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "nvidia_gpu_jobid"), + "Batch Job ID of current nVIDIA GPU", + []string{"batch", "hostname", "index", "uuid", "UUID"}, nil, + ), logger: logger, }, nil } @@ -240,6 +289,12 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { return err } for n, m := range metrics { + // Convert job id to int + jid, err := strconv.Atoi(m.jobid) + if err != nil { + level.Debug(c.logger).Log("msg", "Failed to convert SLURM jobID to int", "jobID", m.jobid) + jid = 0 + } if m.err { ch <- prometheus.MustNewConstMetric(c.collectError, prometheus.GaugeValue, 1, m.name) } @@ -260,9 +315,26 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { ch <- prometheus.MustNewConstMetric(c.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) ch <- prometheus.MustNewConstMetric(c.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) ch <- prometheus.MustNewConstMetric(c.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswUsed, prometheus.GaugeValue, m.memswUsed, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswTotal, prometheus.GaugeValue, m.memswTotal, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) - ch <- prometheus.MustNewConstMetric(c.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, 
m.task) + if *collectSwapMemoryStats { + ch <- prometheus.MustNewConstMetric(c.memswUsed, prometheus.GaugeValue, m.memswUsed, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswTotal, prometheus.GaugeValue, m.memswTotal, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + } + if *collectPSIStats { + ch <- prometheus.MustNewConstMetric(c.cpuPressure, prometheus.GaugeValue, m.cpuPressure, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + ch <- prometheus.MustNewConstMetric(c.memoryPressure, prometheus.GaugeValue, m.memoryPressure, m.batch, m.hostname, m.jobid, m.jobaccount, m.jobuuid, m.step, m.task) + } + for _, gpuOrdinal := range m.jobGpuOrdinals { + var uuid string + // Check the int index of devices where gpuOrdinal == dev.index + for _, dev := range c.nvidiaGPUDevs { + if gpuOrdinal == dev.index { + uuid = dev.uuid + break + } + } + ch <- prometheus.MustNewConstMetric(c.gpuJobMap, prometheus.GaugeValue, float64(jid), m.batch, c.hostname, gpuOrdinal, uuid, uuid) + } } return nil } @@ -309,26 +381,6 @@ func (c *slurmCollector) getJobsMetrics() (map[string]CgroupMetric, error) { }(name) } wg.Wait() - - // if memory.max = "max" case we set memory max to -1 - // fix it by looking at the parent - // we loop through names once as it was the result of Walk so top paths are seen first - // also some cgroups we ignore, like path=/system.slice/slurmstepd.scope/job_216/step_interactive/user, hence the need to loop through multiple parents - if c.cgroups == "v2" { - for _, name := range names { - metric, ok := metrics[name] - if ok && metric.memoryTotal < 0 { - for upName := name; len(upName) > 1; { - upName = filepath.Dir(upName) - upMetric, ok := metrics[upName] - if ok { - metric.memoryTotal = upMetric.memoryTotal - metrics[name] = metric - } - } - } - } - } return metrics, nil } @@ -400,18 +452,18 @@ func (c *slurmCollector) getCPUs(name string) ([]string, error) { return cpus, nil } -// Get different labels of Job -func (c *slurmCollector) getJobLabels(jobid string) (string, string, string) { +// Get different properties of Job +func (c *slurmCollector) getJobProperties(metric *CgroupMetric, pids []uint64) { + jobid := metric.jobid var jobUuid string var jobUid string = "" var jobAccount string = "" var jobNodelist string = "" + var gpuJobId string = "" + var jobGpuOrdinals []string + var err error - // If useJobIdHash is false return with empty strings - if !*useJobIdHash { - return jobUuid, jobUid, jobAccount - } - + // First try to read files that might be created by SLURM prolog scripts var slurmJobInfo = fmt.Sprintf("%s/%s", *jobStatPath, jobid) if _, err := os.Stat(slurmJobInfo); err == nil { content, err := os.ReadFile(slurmJobInfo) @@ -421,7 +473,46 @@ func (c *slurmCollector) getJobLabels(jobid string) (string, string, string) { } else { fmt.Sscanf(string(content), "%s %s %s", &jobUid, &jobAccount, &jobNodelist) } - } else { + } + + // If there are no GPUs this loop will be skipped anyways + // NOTE: In go loop over map is not reproducible. The order is undefined and thus + // we might end up with a situation where jobGpuOrdinals will [1 2] or [2 1] if + // current Job has two GPUs. 
This will fail unit tests as order in Slice is important + // in Go + // + // So we use map[int]Device to have int indices for devices which we use internally + // We are not using device index as it might be a non-integer. We are not sure about + // it but just to be safe. This will have a small overhead as we need to check the + // correct integer index for each device index. We can live with it as there are + // typically 2/4/8 GPUs per node. + for i := 0; i <= len(c.nvidiaGPUDevs); i++ { + dev := c.nvidiaGPUDevs[i] + gpuJobMapInfo := fmt.Sprintf("%s/%s", *gpuStatPath, dev.index) + + // NOTE: Look for file name with UUID as it will be more appropriate with + // MIG instances. + // If /run/gpustat/0 file is not found, check for the file with UUID as name? + if _, err := os.Stat(gpuJobMapInfo); err == nil { + content, err := os.ReadFile(gpuJobMapInfo) + if err != nil { + level.Error(c.logger).Log( + "msg", "Failed to get job ID for GPU", + "index", dev.index, "uuid", dev.uuid, "err", err, + ) + continue + } + fmt.Sscanf(string(content), "%s", &gpuJobId) + if gpuJobId == jobid { + jobGpuOrdinals = append(jobGpuOrdinals, dev.index) + } + } + } + + // If we fail to get any of the job properties or if there are atleast one GPU devices + // and if we fail to get gpu ordinals for that job, try to get these properties + // by looking into environment variables + if jobUid == "" || jobAccount == "" || jobNodelist == "" || (len(jobGpuOrdinals) == 0 && len(c.nvidiaGPUDevs) > 0) { // Attempt to get UID, Account, Nodelist from /proc file system by looking into // environ for the process that has same SLURM_JOB_ID // @@ -432,11 +523,17 @@ func (c *slurmCollector) getJobLabels(jobid string) (string, string, string) { goto outside } - // Get all procs from current proc fs - allProcs, err := procFS.AllProcs() - if err != nil { - level.Error(c.logger).Log("msg", "Failed to read /proc", "err", err) - goto outside + // Get all procs from current proc fs if passed pids slice is nil + if pids == nil { + allProcs, err := procFS.AllProcs() + if err != nil { + level.Error(c.logger).Log("msg", "Failed to read /proc", "err", err) + goto outside + } + pids = make([]uint64, len(allProcs)) + for idx, proc := range allProcs { + pids[idx] = uint64(proc.PID) + } } // Env var that we will search @@ -444,15 +541,20 @@ func (c *slurmCollector) getJobLabels(jobid string) (string, string, string) { // Initialize a waitgroup for all go routines that we will spawn later wg := &sync.WaitGroup{} - wg.Add(len(allProcs)) + wg.Add(len(pids)) // Iterate through all procs and look for SLURM_JOB_ID env entry - for _, proc := range allProcs { - go func(p procfs.Proc) { + for _, pid := range pids { + go func(p int) { // Read process environment variables // NOTE: This needs CAP_SYS_PTRACE and CAP_DAC_READ_SEARCH caps // on the current process - environments, err := p.Environ() + proc, err := procFS.Proc(p) + if err != nil { + wg.Done() + return + } + environments, err := proc.Environ() // Skip if we cannot read file or job ID env var is not found if err != nil || !slices.Contains(environments, jobIDEnv) { @@ -471,38 +573,52 @@ func (c *slurmCollector) getJobLabels(jobid string) (string, string, string) { if strings.Contains(env, "SLURM_JOB_NODELIST") { jobNodelist = strings.Split(env, "=")[1] } + if strings.Contains(env, "SLURM_STEP_GPUS") || strings.Contains(env, "SLURM_JOB_GPUS") { + jobGpuOrdinals = strings.Split(strings.Split(env, "=")[1], ",") + } } // Mark routine as done wg.Done() - }(proc) + }(int(pid)) } // Wait for all go 
routines to finish wg.Wait() } + outside: // Emit a warning if we could not get all job properties if jobUid == "" && jobAccount == "" && jobNodelist == "" { level.Warn(c.logger). Log("msg", "Failed to get job properties", "jobid", jobid) } + // Emit warning when there are GPUs but no job to GPU map found + if len(c.nvidiaGPUDevs) > 0 && len(jobGpuOrdinals) == 0 { + level.Warn(c.logger). + Log("msg", "Failed to get GPU ordinals for job", "jobid", jobid) + } // Get UUID using job properties - jobUuid, err := helpers.GetUuidFromString( - []string{ - strings.TrimSpace(jobid), - strings.TrimSpace(jobUid), - strings.ToLower(strings.TrimSpace(jobAccount)), - strings.ToLower(strings.TrimSpace(jobNodelist)), - }, - ) - if err != nil { - level.Error(c.logger). - Log("msg", "Failed to generate UUID for job", "jobid", jobid, "err", err) - jobUuid = jobid + if *useJobIdHash { + jobUuid, err = helpers.GetUuidFromString( + []string{ + strings.TrimSpace(jobid), + strings.TrimSpace(jobUid), + strings.ToLower(strings.TrimSpace(jobAccount)), + strings.ToLower(strings.TrimSpace(jobNodelist)), + }, + ) + if err != nil { + level.Error(c.logger). + Log("msg", "Failed to generate UUID for job", "jobid", jobid, "err", err) + jobUuid = jobid + } } - return jobUuid, jobUid, jobAccount + metric.jobuid = jobUid + metric.jobuuid = jobUuid + metric.jobaccount = jobAccount + metric.jobGpuOrdinals = jobGpuOrdinals } // Get job details from cgroups v1 @@ -513,12 +629,9 @@ func (c *slurmCollector) getInfoV1(name string, metric *CgroupMetric) { userSliceMatch := userSlicePattern.FindStringSubmatch(pathBase) if len(userSliceMatch) == 2 { metric.userslice = true - // metric.jobuid, err = userSliceMatch[1] - // if err != nil { - // level.Error(c.logger).Log("msg", "Error getting slurm job's uid number", "uid", pathBase, "err", err) - // } - // return } + + // Get job ID, step and task slurmPattern := regexp.MustCompile( "^/slurm/uid_([0-9]+)/job_([0-9]+)(/step_([^/]+)(/task_([[0-9]+))?)?$", ) @@ -526,10 +639,6 @@ func (c *slurmCollector) getInfoV1(name string, metric *CgroupMetric) { level.Debug(c.logger). 
Log("msg", "Got for match", "name", name, "len(slurmMatch)", len(slurmMatch), "slurmMatch", fmt.Sprintf("%v", slurmMatch)) if len(slurmMatch) >= 3 { - // metric.jobuid, err = slurmMatch[1] - // if err != nil { - // level.Error(c.logger).Log("msg", "Error getting slurm job's uid number", "uid", name, "err", err) - // } metric.jobid = slurmMatch[2] metric.step = slurmMatch[4] metric.task = slurmMatch[6] @@ -548,6 +657,8 @@ func (c *slurmCollector) getCgroupsV1Metrics(name string) (CgroupMetric, error) metric.err = true return metric, err } + + // Load cgroup stats stats, err := ctrl.Stat(cgroup1.IgnoreNotExist) if err != nil { level.Error(c.logger).Log("msg", "Failed to stat cgroups", "path", name, "err", err) @@ -557,6 +668,8 @@ func (c *slurmCollector) getCgroupsV1Metrics(name string) (CgroupMetric, error) level.Error(c.logger).Log("msg", "Cgroup stats are nil", "path", name) return metric, err } + + // Get CPU stats if stats.CPU != nil { if stats.CPU.Usage != nil { metric.cpuUser = float64(stats.CPU.Usage.User) / 1000000000.0 @@ -564,6 +677,11 @@ func (c *slurmCollector) getCgroupsV1Metrics(name string) (CgroupMetric, error) metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0 } } + if cpus, err := c.getCPUs(name); err == nil { + metric.cpus = len(cpus) + } + + // Get memory stats if stats.Memory != nil { metric.memoryRSS = float64(stats.Memory.TotalRSS) metric.memoryCache = float64(stats.Memory.TotalCache) @@ -578,28 +696,13 @@ func (c *slurmCollector) getCgroupsV1Metrics(name string) (CgroupMetric, error) metric.memswFailCount = float64(stats.Memory.Swap.Failcnt) } } - if cpus, err := c.getCPUs(name); err == nil { - metric.cpus = len(cpus) - } + + // Get cgroup info c.getInfoV1(name, &metric) - metric.jobuuid, metric.jobuid, metric.jobaccount = c.getJobLabels(metric.jobid) - return metric, nil -} -// Convenience function that will check if name+metric exists in the data -// and log an error if it does not. 
It returns 0 in such case but otherwise -// returns the value -func (c *slurmCollector) getOneMetric( - name string, - metric string, - required bool, - data map[string]float64, -) float64 { - val, ok := data[metric] - if !ok && required { - level.Error(c.logger).Log("msg", "Failed to load", "metric", metric, "cgroup", name) - } - return val + // Get job Info + c.getJobProperties(&metric, nil) + return metric, nil } // Get Job info for cgroups v2 @@ -627,53 +730,67 @@ func (c *slurmCollector) getCgroupsV2Metrics(name string) (CgroupMetric, error) metric := CgroupMetric{name: name, batch: "slurm", hostname: c.hostname} metric.err = false level.Debug(c.logger).Log("msg", "Loading cgroup v2", "path", name) - // Files to parse out of the cgroup - controllers := []string{ - "cpu.stat", - "memory.current", - "memory.events", - "memory.max", - "memory.stat", - "memory.swap.current", - "memory.swap.max", - "memory.swap.high", - "memory.swap.events", - } - data, err := LoadCgroupsV2Metrics(name, *cgroupfsPath, controllers) + + // Load cgroups + ctrl, err := cgroup2.Load(name, cgroup2.WithMountpoint(*cgroupfsPath)) if err != nil { - level.Error(c.logger).Log("msg", "Failed to load cgroups v2", "path", name, "err", err) + level.Error(c.logger).Log("msg", "Failed to load cgroups", "path", name, "err", err) metric.err = true return metric, err } - metric.cpuUser = c.getOneMetric(name, "cpu.stat.user_usec", true, data) / 1000000.0 - metric.cpuSystem = c.getOneMetric(name, "cpu.stat.system_usec", true, data) / 1000000.0 - metric.cpuTotal = c.getOneMetric(name, "cpu.stat.usage_usec", true, data) / 1000000.0 - // we use Oom entry from memory.events - it maps most closely to FailCount - // TODO: add oom_kill as a separate value - metric.memoryFailCount = c.getOneMetric(name, "memory.events.oom", true, data) - // taking Slurm's cgroup v2 as inspiration, swapcached could be missing if swap is off so OK to ignore that case - metric.memoryRSS = c.getOneMetric( - name, - "memory.stat.anon", - true, - data, - ) + c.getOneMetric( - name, - "memory.stat.swapcached", - false, - data, - ) - // I guess? - metric.memoryCache = c.getOneMetric(name, "memory.stat.file", true, data) - metric.memoryUsed = c.getOneMetric(name, "memory.current", true, data) - metric.memoryTotal = c.getOneMetric(name, "memory.max", true, data) - metric.memswUsed = c.getOneMetric(name, "memory.swap.current", true, data) - metric.memswTotal = c.getOneMetric(name, "memory.swap.max", true, data) - metric.memswFailCount = c.getOneMetric(name, "memory.swap.events.fail", true, data) + + // Get stats from cgroup + stats, err := ctrl.Stat() + if err != nil { + level.Error(c.logger).Log("msg", "Failed to stat cgroups", "path", name, "err", err) + return metric, err + } + if stats == nil { + level.Error(c.logger).Log("msg", "Cgroup stats are nil", "path", name) + return metric, err + } + + // Get CPU stats + if stats.CPU != nil { + metric.cpuUser = float64(stats.CPU.UserUsec) / 1000000.0 + metric.cpuSystem = float64(stats.CPU.SystemUsec) / 1000000.0 + metric.cpuTotal = float64(stats.CPU.UsageUsec) / 1000000.0 + if *collectPSIStats && stats.CPU.PSI != nil { + metric.cpuPressure = float64(stats.CPU.PSI.Full.Total) / 1000000.0 + } + } if cpus, err := c.getCPUs(name); err == nil { metric.cpus = len(cpus) } + + // Get memory stats + // cgroups2 does not expose swap memory events. 
So we dont set memswFailCount + if stats.Memory != nil { + metric.memoryUsed = float64(stats.Memory.Usage) + metric.memoryTotal = float64(stats.Memory.UsageLimit) + metric.memoryCache = float64(stats.Memory.File) // This is page cache + metric.memoryRSS = float64(stats.Memory.Anon) + metric.memswUsed = float64(stats.Memory.SwapUsage) + metric.memswTotal = float64(stats.Memory.SwapLimit) + if *collectPSIStats && stats.Memory.PSI != nil { + metric.memoryPressure = float64(stats.Memory.PSI.Full.Total) / 1000000.0 + } + } + // Get memory events + if stats.MemoryEvents != nil { + metric.memoryFailCount = float64(stats.MemoryEvents.Oom) + } + + // Get cgroup Info c.getInfoV2(name, &metric) - metric.jobuuid, metric.jobuid, metric.jobaccount = c.getJobLabels(metric.jobid) + + // Get job Info + cgroupProcPids, err := ctrl.Procs(true) + if err != nil { + level.Error(c.logger).Log("msg", "Failed to get proc pids in cgroup", "path", name) + } + + // Get job Info + c.getJobProperties(&metric, cgroupProcPids) return metric, nil } diff --git a/pkg/collector/slurm_test.go b/pkg/collector/slurm_test.go index 04b1be13..d56be220 100644 --- a/pkg/collector/slurm_test.go +++ b/pkg/collector/slurm_test.go @@ -6,6 +6,7 @@ package collector import ( "fmt" "reflect" + "strconv" "testing" "github.com/go-kit/log" @@ -13,18 +14,29 @@ import ( var expectedSlurmMetrics CgroupMetric +func mockGPUDevices() map[int]Device { + var devs = make(map[int]Device, 4) + for i := 0; i <= 4; i++ { + idxString := strconv.Itoa(i) + devs[i] = Device{index: idxString, uuid: fmt.Sprintf("GPU-%d", i)} + } + return devs +} + func TestCgroupsV2SlurmJobMetrics(t *testing.T) { if _, err := BatchJobExporterApp.Parse( []string{ "--path.cgroupfs", "fixtures/sys/fs/cgroup", "--collector.slurm.create.unique.jobids", "--collector.slurm.job.props.path", "fixtures/slurmjobprops", + "--collector.slurm.nvidia.gpu.job.map.path", "fixtures/gpujobmap", }, ); err != nil { t.Fatal(err) } c := slurmCollector{ cgroups: "v2", + nvidiaGPUDevs: mockGPUDevices(), cgroupsRootPath: *cgroupfsPath, slurmCgroupsPath: fmt.Sprintf("%s/system.slice/slurmstepd.scope", *cgroupfsPath), logger: log.NewNopLogger(), @@ -36,32 +48,36 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) { cpuSystem: 115.777502, cpuTotal: 60491.070351, cpus: 2, + cpuPressure: 0, memoryRSS: 4.098592768e+09, memoryCache: 0, memoryUsed: 4.111491072e+09, memoryTotal: 4.294967296e+09, memoryFailCount: 0, memswUsed: 0, - memswTotal: -1, + memswTotal: 1.8446744073709552e+19, // cgroupv2 just returns math.MaxUint64 memswFailCount: 0, + memoryPressure: 0, userslice: false, jobuid: "1000", jobaccount: "testacc", jobid: "1009248", jobuuid: "ac28caf5-ce6c-35f6-73fb-47d9d43f7780", + jobGpuOrdinals: []string{"2", "3"}, step: "", task: "", batch: "slurm", - err: false} + err: false, + } if err != nil { t.Fatalf("Cannot fetch data from getJobsMetrics function: %v ", err) } if !reflect.DeepEqual(metrics["1009248"], expectedSlurmMetrics) { - t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics) + t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics["1009248"]) } } -func TestCgroupsV2WithProcFsSlurmJobMetrics(t *testing.T) { +func TestCgroupsV2SlurmJobMetricsWithProcFs(t *testing.T) { if _, err := BatchJobExporterApp.Parse( []string{ "--path.cgroupfs", "fixtures/sys/fs/cgroup", @@ -74,6 +90,7 @@ func TestCgroupsV2WithProcFsSlurmJobMetrics(t *testing.T) { c := slurmCollector{ cgroups: "v2", cgroupsRootPath: *cgroupfsPath, + nvidiaGPUDevs: mockGPUDevices(), 
slurmCgroupsPath: fmt.Sprintf("%s/system.slice/slurmstepd.scope", *cgroupfsPath), logger: log.NewNopLogger(), } @@ -84,28 +101,83 @@ func TestCgroupsV2WithProcFsSlurmJobMetrics(t *testing.T) { cpuSystem: 115.777502, cpuTotal: 60491.070351, cpus: 2, + cpuPressure: 0, memoryRSS: 4.098592768e+09, memoryCache: 0, memoryUsed: 4.111491072e+09, memoryTotal: 4.294967296e+09, memoryFailCount: 0, memswUsed: 0, - memswTotal: -1, + memswTotal: 1.8446744073709552e+19, memswFailCount: 0, + memoryPressure: 0, userslice: false, jobuid: "1000", jobaccount: "testacc", jobid: "1009248", jobuuid: "ac28caf5-ce6c-35f6-73fb-47d9d43f7780", + jobGpuOrdinals: []string{"2", "3"}, + step: "", + task: "", + batch: "slurm", + err: false, + } + if err != nil { + t.Fatalf("Cannot fetch data from getJobsMetrics function: %v ", err) + } + if !reflect.DeepEqual(metrics["1009248"], expectedSlurmMetrics) { + t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics["1009248"]) + } +} + +func TestCgroupsV2SlurmJobMetricsNoJobProps(t *testing.T) { + if _, err := BatchJobExporterApp.Parse( + []string{ + "--path.cgroupfs", "fixtures/sys/fs/cgroup", + "--collector.slurm.create.unique.jobids", + }, + ); err != nil { + t.Fatal(err) + } + c := slurmCollector{ + cgroups: "v2", + cgroupsRootPath: *cgroupfsPath, + nvidiaGPUDevs: mockGPUDevices(), + slurmCgroupsPath: fmt.Sprintf("%s/system.slice/slurmstepd.scope", *cgroupfsPath), + logger: log.NewNopLogger(), + } + metrics, err := c.getJobsMetrics() + expectedSlurmMetrics = CgroupMetric{ + name: "/system.slice/slurmstepd.scope/job_1009248", + cpuUser: 60375.292848, + cpuSystem: 115.777502, + cpuTotal: 60491.070351, + cpus: 2, + cpuPressure: 0, + memoryRSS: 4.098592768e+09, + memoryCache: 0, + memoryUsed: 4.111491072e+09, + memoryTotal: 4.294967296e+09, + memoryFailCount: 0, + memswUsed: 0, + memswTotal: 1.8446744073709552e+19, + memswFailCount: 0, + memoryPressure: 0, + userslice: false, + jobuid: "", + jobaccount: "", + jobid: "1009248", + jobuuid: "a0523e93-a037-c2b1-8b34-410c9996399c", step: "", task: "", batch: "slurm", - err: false} + err: false, + } if err != nil { t.Fatalf("Cannot fetch data from getJobsMetrics function: %v ", err) } if !reflect.DeepEqual(metrics["1009248"], expectedSlurmMetrics) { - t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics) + t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics["1009248"]) } } @@ -113,6 +185,7 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) { if _, err := BatchJobExporterApp.Parse( []string{ "--path.cgroupfs", "fixtures/sys/fs/cgroup", + "--path.procfs", "fixtures/proc", "--collector.slurm.create.unique.jobids", "--collector.slurm.job.props.path", "fixtures/slurmjobprops", }, @@ -122,6 +195,7 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) { c := slurmCollector{ cgroups: "v1", logger: log.NewNopLogger(), + nvidiaGPUDevs: mockGPUDevices(), cgroupsRootPath: fmt.Sprintf("%s/cpuacct", *cgroupfsPath), slurmCgroupsPath: fmt.Sprintf("%s/cpuacct/slurm", *cgroupfsPath), } @@ -132,6 +206,7 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) { cpuSystem: 0.45, cpuTotal: 1.012410966, cpus: 0, + cpuPressure: 0, memoryRSS: 1.0407936e+07, memoryCache: 2.1086208e+07, memoryUsed: 4.0194048e+07, @@ -140,19 +215,22 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) { memswUsed: 4.032512e+07, memswTotal: 9.223372036854772e+18, memswFailCount: 0, + memoryPressure: 0, userslice: false, jobuid: "1000", jobaccount: "testacc", jobid: "1009248", jobuuid: 
"ac28caf5-ce6c-35f6-73fb-47d9d43f7780", + jobGpuOrdinals: []string{"2", "3"}, step: "", task: "", batch: "slurm", - err: false} + err: false, + } if err != nil { t.Fatalf("Cannot fetch data from getJobsMetrics function: %v ", err) } if !reflect.DeepEqual(metrics["1009248"], expectedSlurmMetrics) { - t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics) + t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics["1009248"]) } } diff --git a/scripts/checkmetrics.sh b/scripts/checkmetrics.sh index 082a7a38..e800774a 100755 --- a/scripts/checkmetrics.sh +++ b/scripts/checkmetrics.sh @@ -6,7 +6,7 @@ if [[ ( -z "$1" ) || ( -z "$2" ) ]]; then fi # Ignore known issues in auto-generated and network specific collectors. -lint=$($1 check metrics < "$2" 2>&1 | grep -v -E "^batchjob_(memory_fail_count|memsw_fail_count)") +lint=$($1 check metrics < "$2" 2>&1 | grep -v -E "^batchjob_slurm_job_(memory_fail_count|memsw_fail_count)") if [[ -n $lint ]]; then echo -e "Some Prometheus metrics do not follow best practices:\n" diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh index 8c94cddb..f4856601 100755 --- a/scripts/e2e-test.sh +++ b/scripts/e2e-test.sh @@ -11,12 +11,12 @@ skip_re="^(go_|batchjob_exporter_build_info|batchjob_scrape_collector_duration_s arch="$(uname -m)" -package="exporter"; keep=0; update=0; verbose=0 -while getopts 'hp:kuv' opt +scenario="exporter-cgroups-v1"; keep=0; update=0; verbose=0 +while getopts 'hs:kuv' opt do case "$opt" in - p) - package=$OPTARG + s) + scenario=$OPTARG ;; k) keep=1 @@ -30,7 +30,7 @@ do ;; *) echo "Usage: $0 [-p] [-k] [-u] [-v]" - echo " -p: package to test [options: exporter, stats]" + echo " -s: scenario to test [options: exporter, stats]" echo " -k: keep temporary files and leave batchjob_exporter running" echo " -u: update fixtures" echo " -v: verbose output" @@ -39,21 +39,49 @@ do esac done -if [ "${package}" = "exporter" ] +if [[ "${scenario}" =~ "exporter" ]] then - cgroups_mode=$([ $(stat -fc %T /sys/fs/cgroup/) = "cgroup2fs" ] && echo "unified" || ( [ -e /sys/fs/cgroup/unified/ ] && echo "hybrid" || echo "legacy")) + # cgroups_mode=$([ $(stat -fc %T /sys/fs/cgroup/) = "cgroup2fs" ] && echo "unified" || ( [ -e /sys/fs/cgroup/unified/ ] && echo "hybrid" || echo "legacy")) # cgroups_mode="legacy" - echo "cgroups mode detected: ${cgroups_mode}" - case "${cgroups_mode}" in - legacy|hybrid) fixture='pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt' ;; - *) fixture='pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt' ;; - esac + if [ "${scenario}" = "exporter-cgroups-v1" ] + then + cgroups_mode="legacy" + desc="Cgroups V1" + fixture='pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt' + elif [ "${scenario}" = "exporter-cgroups-v2" ] + then + cgroups_mode="unified" + desc="Cgroups V2" + fixture='pkg/collector/fixtures/output/e2e-test-cgroupsv2-output.txt' + elif [ "${scenario}" = "exporter-cgroups-v2-nogpu" ] + then + cgroups_mode="unified" + desc="Cgroups V2 when there are no GPUs" + fixture='pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt' + elif [ "${scenario}" = "exporter-cgroups-v2-procfs" ] + then + cgroups_mode="unified" + desc="Cgroups V2 using /proc for fetching job properties" + fixture='pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt' + elif [ "${scenario}" = "exporter-cgroups-v2-all-metrics" ] + then + cgroups_mode="unified" + desc="Cgroups V2 enabling all available cgroups metrics" + 
fixture='pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt' + fi + + echo "using scenario: ${scenario}. Description: ${desc}" + + # case "${cgroups_mode}" in + # legacy|hybrid) fixture='pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt' ;; + # *) fixture='pkg/collector/fixtures/output/e2e-test-cgroupsv2-output.txt' ;; + # esac logfile="${tmpdir}/batchjob_exporter.log" fixture_output="${tmpdir}/e2e-test-exporter-output.txt" pidfile="${tmpdir}/batchjob_exporter.pid" -elif [ "${package}" = "stats" ] +elif [[ "${scenario}" =~ "stats" ]] then fixture='pkg/jobstats/fixtures/e2e-test-stats-server-output.txt' logfile="${tmpdir}/batchjob_stats_server.log" @@ -101,7 +129,108 @@ get() { fi } -if [ "${package}" = "exporter" ] +if [ "${scenario}" = "exporter-cgroups-v1" ] +then + if [ ! -x ./bin/batchjob_exporter ] + then + echo './bin/batchjob_exporter not found. Consider running `go build` first.' >&2 + exit 1 + fi + + ./bin/batchjob_exporter \ + --path.sysfs="pkg/collector/fixtures/sys" \ + --path.cgroupfs="pkg/collector/fixtures/sys/fs/cgroup" \ + --collector.slurm.create.unique.jobids \ + --collector.slurm.job.props.path="pkg/collector/fixtures/slurmjobprops" \ + --collector.slurm.nvidia.smi.path="pkg/collector/fixtures/nvidia-smi" \ + --collector.slurm.force.cgroups.version="v1" \ + --collector.slurm.nvidia.gpu.job.map.path="pkg/collector/fixtures/gpujobmap" \ + --collector.ipmi.dcmi.cmd="pkg/collector/fixtures/ipmi-dcmi-wrapper.sh" \ + --collector.empty.hostname.label \ + --web.listen-address "127.0.0.1:${port}" \ + --log.level="debug" > "${logfile}" 2>&1 & + + echo $! > "${pidfile}" + + sleep 1 + + get "127.0.0.1:${port}/metrics" | grep -E -v "${skip_re}" > "${fixture_output}" +elif [ "${scenario}" = "exporter-cgroups-v2" ] +then + if [ ! -x ./bin/batchjob_exporter ] + then + echo './bin/batchjob_exporter not found. Consider running `go build` first.' >&2 + exit 1 + fi + + ./bin/batchjob_exporter \ + --path.sysfs="pkg/collector/fixtures/sys" \ + --path.cgroupfs="pkg/collector/fixtures/sys/fs/cgroup" \ + --collector.slurm.create.unique.jobids \ + --collector.slurm.job.props.path="pkg/collector/fixtures/slurmjobprops" \ + --collector.slurm.nvidia.smi.path="pkg/collector/fixtures/nvidia-smi" \ + --collector.slurm.force.cgroups.version="v2" \ + --collector.slurm.nvidia.gpu.job.map.path="pkg/collector/fixtures/gpujobmap" \ + --collector.ipmi.dcmi.cmd="pkg/collector/fixtures/ipmi-dcmi-wrapper.sh" \ + --collector.empty.hostname.label \ + --web.listen-address "127.0.0.1:${port}" \ + --log.level="debug" > "${logfile}" 2>&1 & + + echo $! > "${pidfile}" + + sleep 1 + + get "127.0.0.1:${port}/metrics" | grep -E -v "${skip_re}" > "${fixture_output}" +elif [ "${scenario}" = "exporter-cgroups-v2-nogpu" ] +then + if [ ! -x ./bin/batchjob_exporter ] + then + echo './bin/batchjob_exporter not found. Consider running `go build` first.' >&2 + exit 1 + fi + + ./bin/batchjob_exporter \ + --path.sysfs="pkg/collector/fixtures/sys" \ + --path.cgroupfs="pkg/collector/fixtures/sys/fs/cgroup" \ + --collector.slurm.create.unique.jobids \ + --collector.slurm.job.props.path="pkg/collector/fixtures/slurmjobprops" \ + --collector.slurm.force.cgroups.version="v2" \ + --collector.ipmi.dcmi.cmd="pkg/collector/fixtures/ipmi-dcmi-wrapper.sh" \ + --collector.empty.hostname.label \ + --web.listen-address "127.0.0.1:${port}" \ + --log.level="debug" > "${logfile}" 2>&1 & + + echo $! 
> "${pidfile}" + + sleep 1 + + get "127.0.0.1:${port}/metrics" | grep -E -v "${skip_re}" > "${fixture_output}" +elif [ "${scenario}" = "exporter-cgroups-v2-procfs" ] +then + if [ ! -x ./bin/batchjob_exporter ] + then + echo './bin/batchjob_exporter not found. Consider running `go build` first.' >&2 + exit 1 + fi + + ./bin/batchjob_exporter \ + --path.sysfs="pkg/collector/fixtures/sys" \ + --path.cgroupfs="pkg/collector/fixtures/sys/fs/cgroup" \ + --path.procfs="pkg/collector/fixtures/proc" \ + --collector.slurm.create.unique.jobids \ + --collector.slurm.nvidia.smi.path="pkg/collector/fixtures/nvidia-smi" \ + --collector.slurm.force.cgroups.version="v2" \ + --collector.ipmi.dcmi.cmd="pkg/collector/fixtures/ipmi-dcmi-wrapper.sh" \ + --collector.empty.hostname.label \ + --web.listen-address "127.0.0.1:${port}" \ + --log.level="debug" > "${logfile}" 2>&1 & + + echo $! > "${pidfile}" + + sleep 1 + + get "127.0.0.1:${port}/metrics" | grep -E -v "${skip_re}" > "${fixture_output}" +elif [ "${scenario}" = "exporter-cgroups-v2-all-metrics" ] then if [ ! -x ./bin/batchjob_exporter ] then @@ -114,10 +243,12 @@ then --path.cgroupfs="pkg/collector/fixtures/sys/fs/cgroup" \ --collector.slurm.create.unique.jobids \ --collector.slurm.job.props.path="pkg/collector/fixtures/slurmjobprops" \ + --collector.slurm.nvidia.smi.path="pkg/collector/fixtures/nvidia-smi" \ + --collector.slurm.force.cgroups.version="v2" \ + --collector.slurm.nvidia.gpu.job.map.path="pkg/collector/fixtures/gpujobmap" \ + --collector.slurm.swap.memory.metrics \ + --collector.slurm.psi.metrics \ --collector.ipmi.dcmi.cmd="pkg/collector/fixtures/ipmi-dcmi-wrapper.sh" \ - --collector.nvidia_gpu \ - --collector.nvidia.smi.path="pkg/collector/fixtures/nvidia-smi" \ - --collector.nvidia.gpu.job.map.path="pkg/collector/fixtures/gpujobmap" \ --collector.empty.hostname.label \ --web.listen-address "127.0.0.1:${port}" \ --log.level="debug" > "${logfile}" 2>&1 & @@ -127,7 +258,7 @@ then sleep 1 get "127.0.0.1:${port}/metrics" | grep -E -v "${skip_re}" > "${fixture_output}" -elif [ "${package}" = "stats" ] +elif [ "${scenario}" = "stats" ] then if [ ! -x ./bin/batchjob_stats_server ] then