From 1037974c2a46a3fca7bba2365fbfed4d3f5174b4 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Thu, 30 Nov 2023 12:17:30 -0800 Subject: [PATCH] Use GKE's native NVidia driver installer for GPUs - Remove our custom GPU installer daemonset, as GKE now supports automatically doing it (like eksctl does). - Switch from installing 'latest' to using default driver, which is slightly older (version 470 with CUDA 11.4, vs version 530 with CUDA 12). There seems to be a bug with the latest driver causing the GPU to not be usable by non-root users, so let's stick to this until that is resolved. - Apply these changes to LEAP hub already. m2lines is about to be decommissioned, so not necessary. --- config/clusters/leap/support.values.yaml | 5 - config/clusters/m2lines/support.values.yaml | 5 - .../nvidiaDevicePlugin/gke/latest.yaml | 112 ------------------ .../nvidiaDevicePlugin/gke/stable.yaml | 112 ------------------ helm-charts/support/values.schema.yaml | 21 ---- helm-charts/support/values.yaml | 8 +- terraform/gcp/cluster.tf | 4 + 7 files changed, 6 insertions(+), 261 deletions(-) delete mode 100644 helm-charts/support/templates/nvidiaDevicePlugin/gke/latest.yaml delete mode 100644 helm-charts/support/templates/nvidiaDevicePlugin/gke/stable.yaml diff --git a/config/clusters/leap/support.values.yaml b/config/clusters/leap/support.values.yaml index 06420aee63..da5722b3ca 100644 --- a/config/clusters/leap/support.values.yaml +++ b/config/clusters/leap/support.values.yaml @@ -1,8 +1,3 @@ -nvidiaDevicePlugin: - gke: - enabled: true - version: "latest" - prometheusIngressAuthSecret: enabled: true diff --git a/config/clusters/m2lines/support.values.yaml b/config/clusters/m2lines/support.values.yaml index b14e55076a..a2e4fbddb2 100644 --- a/config/clusters/m2lines/support.values.yaml +++ b/config/clusters/m2lines/support.values.yaml @@ -1,8 +1,3 @@ -nvidiaDevicePlugin: - gke: - enabled: true - version: "latest" - grafana: grafana.ini: server: diff --git 
a/helm-charts/support/templates/nvidiaDevicePlugin/gke/latest.yaml b/helm-charts/support/templates/nvidiaDevicePlugin/gke/latest.yaml deleted file mode 100644 index 6c3d8072f2..0000000000 --- a/helm-charts/support/templates/nvidiaDevicePlugin/gke/latest.yaml +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2022 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The Dockerfile and other source for this daemonset are in -# https://cos.googlesource.com/cos/tools/+/refs/heads/master/src/cmd/cos_gpu_installer/ -# -# This is the same as ../../daemonset.yaml except that it assumes that the -# docker image is present on the node instead of downloading from GCR. This -# allows easier upgrades because GKE can preload the correct image on the -# node and the daemonset can just use that image. 
-{{- if .Values.nvidiaDevicePlugin.gke.enabled -}} -{{- if eq .Values.nvidiaDevicePlugin.gke.version "latest" }} -# Documented from https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers -# From https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: nvidia-driver-installer - namespace: kube-system - labels: - k8s-app: nvidia-driver-installer -spec: - selector: - matchLabels: - k8s-app: nvidia-driver-installer - updateStrategy: - type: RollingUpdate - template: - metadata: - labels: - name: nvidia-driver-installer - k8s-app: nvidia-driver-installer - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: cloud.google.com/gke-accelerator - operator: Exists - tolerations: - - operator: "Exists" - hostNetwork: true - hostPID: true - volumes: - - name: dev - hostPath: - path: /dev - - name: vulkan-icd-mount - hostPath: - path: /home/kubernetes/bin/nvidia/vulkan/icd.d - - name: nvidia-install-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia - - name: root-mount - hostPath: - path: / - - name: cos-tools - hostPath: - path: /var/lib/cos-tools - initContainers: - - image: "cos-nvidia-installer:fixed" - imagePullPolicy: Never - name: nvidia-driver-installer - resources: - requests: - cpu: "0.15" - securityContext: - privileged: true - env: - - name: NVIDIA_INSTALL_DIR_HOST - value: /home/kubernetes/bin/nvidia - - name: NVIDIA_INSTALL_DIR_CONTAINER - value: /usr/local/nvidia - - name: VULKAN_ICD_DIR_HOST - value: /home/kubernetes/bin/nvidia/vulkan/icd.d - - name: VULKAN_ICD_DIR_CONTAINER - value: /etc/vulkan/icd.d - - name: ROOT_MOUNT_DIR - value: /root - - name: COS_TOOLS_DIR_HOST - value: /var/lib/cos-tools - - name: COS_TOOLS_DIR_CONTAINER - value: /build/cos-tools - volumeMounts: - - name: 
nvidia-install-dir-host - mountPath: /usr/local/nvidia - - name: vulkan-icd-mount - mountPath: /etc/vulkan/icd.d - - name: dev - mountPath: /dev - - name: root-mount - mountPath: /root - - name: cos-tools - mountPath: /build/cos-tools - command: ['/cos-gpu-installer', 'install', '--version=latest'] - containers: - - image: "gcr.io/google-containers/pause:2.0" - name: pause -{{- end }} -{{- end }} \ No newline at end of file diff --git a/helm-charts/support/templates/nvidiaDevicePlugin/gke/stable.yaml b/helm-charts/support/templates/nvidiaDevicePlugin/gke/stable.yaml deleted file mode 100644 index 53dceceb6d..0000000000 --- a/helm-charts/support/templates/nvidiaDevicePlugin/gke/stable.yaml +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2017 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The Dockerfile and other source for this daemonset are in -# https://cos.googlesource.com/cos/tools/+/refs/heads/master/src/cmd/cos_gpu_installer/ -# -# This is the same as ../../daemonset.yaml except that it assumes that the -# docker image is present on the node instead of downloading from GCR. This -# allows easier upgrades because GKE can preload the correct image on the -# node and the daemonset can just use that image. 
- -{{- if .Values.nvidiaDevicePlugin.gke.enabled -}} -{{- if eq .Values.nvidiaDevicePlugin.gke.version "stable" }} -# Documented from https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers -# From https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: nvidia-driver-installer - namespace: kube-system - labels: - k8s-app: nvidia-driver-installer -spec: - selector: - matchLabels: - k8s-app: nvidia-driver-installer - updateStrategy: - type: RollingUpdate - template: - metadata: - labels: - name: nvidia-driver-installer - k8s-app: nvidia-driver-installer - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: cloud.google.com/gke-accelerator - operator: Exists - tolerations: - - operator: "Exists" - hostNetwork: true - hostPID: true - volumes: - - name: dev - hostPath: - path: /dev - - name: vulkan-icd-mount - hostPath: - path: /home/kubernetes/bin/nvidia/vulkan/icd.d - - name: nvidia-install-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia - - name: root-mount - hostPath: - path: / - - name: cos-tools - hostPath: - path: /var/lib/cos-tools - initContainers: - - image: "cos-nvidia-installer:fixed" - imagePullPolicy: Never - name: nvidia-driver-installer - resources: - requests: - cpu: "0.15" - securityContext: - privileged: true - env: - - name: NVIDIA_INSTALL_DIR_HOST - value: /home/kubernetes/bin/nvidia - - name: NVIDIA_INSTALL_DIR_CONTAINER - value: /usr/local/nvidia - - name: VULKAN_ICD_DIR_HOST - value: /home/kubernetes/bin/nvidia/vulkan/icd.d - - name: VULKAN_ICD_DIR_CONTAINER - value: /etc/vulkan/icd.d - - name: ROOT_MOUNT_DIR - value: /root - - name: COS_TOOLS_DIR_HOST - value: /var/lib/cos-tools - - name: COS_TOOLS_DIR_CONTAINER - value: /build/cos-tools - volumeMounts: - - name: nvidia-install-dir-host - 
mountPath: /usr/local/nvidia - - name: vulkan-icd-mount - mountPath: /etc/vulkan/icd.d - - name: dev - mountPath: /dev - - name: root-mount - mountPath: /root - - name: cos-tools - mountPath: /build/cos-tools - containers: - - image: "gcr.io/google-containers/pause:2.0" - name: pause -{{- end }} -{{- end }} \ No newline at end of file diff --git a/helm-charts/support/values.schema.yaml b/helm-charts/support/values.schema.yaml index d63c7bced4..243acf9ea8 100644 --- a/helm-charts/support/values.schema.yaml +++ b/helm-charts/support/values.schema.yaml @@ -91,7 +91,6 @@ properties: additionalProperties: false required: - azure - - gke properties: azure: type: object @@ -101,26 +100,6 @@ properties: properties: enabled: type: boolean - gke: - type: object - additionalProperties: false - required: - - enabled - - version - properties: - enabled: - type: boolean - version: - type: string - enum: - - stable - - latest - description: | - Install the stable or latest version of nvidia GPU drivers for the node. - - See table in https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers - to determine what versions would be installed. Might need to be matched with appropriate - version of the CUDA libraries used in the images users use. 
prometheusIngressAuthSecret: type: object diff --git a/helm-charts/support/values.yaml b/helm-charts/support/values.yaml index 97f6099380..5b361b8a32 100644 --- a/helm-charts/support/values.yaml +++ b/helm-charts/support/values.yaml @@ -433,15 +433,11 @@ redirects: rules: [] # Enable a daemonset to install nvidia device plugin to GPU nodes -# AWS does not require this to be set, as eksctl sets this up automatically +# Not necessary on GCP & AWS, as it is handled automatically by terraform or eksctl +# respectively nvidiaDevicePlugin: - # For Azure-specific image, default to false azure: enabled: false - # For GKE specific image, defaults to false - gke: - enabled: false - version: "stable" # Setup a separate storageClass specifically for prometheus data prometheusStorageClass: diff --git a/terraform/gcp/cluster.tf b/terraform/gcp/cluster.tf index 3b1472c7b6..e30fa73b13 100644 --- a/terraform/gcp/cluster.tf +++ b/terraform/gcp/cluster.tf @@ -296,6 +296,10 @@ resource "google_container_node_pool" "notebook" { content { type = each.value.gpu.type count = each.value.gpu.count + + gpu_driver_installation_config { + gpu_driver_version = "DEFAULT" + } } }