From dd92a413cef036188f666a7e1f8922e8b2017cae Mon Sep 17 00:00:00 2001 From: ghokun Date: Mon, 21 Mar 2022 23:15:15 +0300 Subject: [PATCH] This will go on forever Signed-off-by: ghokun --- manifests/device-plugin.yml | 2 +- pkg/gpu/nvidia/metrics.go | 29 +++++++++++++++-------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/manifests/device-plugin.yml b/manifests/device-plugin.yml index b732e50..f6cb4b0 100644 --- a/manifests/device-plugin.yml +++ b/manifests/device-plugin.yml @@ -45,7 +45,7 @@ spec: capabilities: add: ["SYS_ADMIN"] containers: - - image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.4.9 + - image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.4.10 name: kuartis-virtual-gpu-device-plugin-ctr command: - /usr/bin/virtual-gpu-device-plugin diff --git a/pkg/gpu/nvidia/metrics.go b/pkg/gpu/nvidia/metrics.go index 7d50515..cbb11c7 100644 --- a/pkg/gpu/nvidia/metrics.go +++ b/pkg/gpu/nvidia/metrics.go @@ -96,20 +96,21 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) { log.Printf("Found %d processes on GPU %d", len(processes), i) for _, process := range processes { containerId := getContainerId(process.Pid) - container := containerMap[containerId] - log.Printf("Using %s Found container %+v for process: %d", containerId, container, process.Pid) - collected = append(collected, metric{ - Pid: process.Pid, - UsedGpuMemory: process.UsedGpuMemory, - GpuIndex: i, - GpuUUID: getDeviceUUID(d), - Node: container.Node, - Namespace: container.Namespace, - Pod: container.Pod, - PodUid: container.PodUid, - Container: container.Container, - ContainerId: container.ContainerId, - }) + if container, ok := containerMap[strings.TrimSpace(containerId)]; ok { + log.Printf("Using %s Found container %+v for process: %d", containerId, container, process.Pid) + collected = append(collected, metric{ + Pid: process.Pid, + UsedGpuMemory: process.UsedGpuMemory, + GpuIndex: i, + GpuUUID: getDeviceUUID(d), + Node: container.Node, + Namespace: container.Namespace, + Pod: container.Pod, + PodUid: container.PodUid, + Container: container.Container, + ContainerId: container.ContainerId, + }) + } } }