From 091ff2d549b18c44bff62053332efe2e2377d574 Mon Sep 17 00:00:00 2001 From: Steven Powell Date: Mon, 25 Sep 2023 10:54:36 -0700 Subject: [PATCH] Automate installing NVIDIA Container Toolkit --- cmd/minikube/cmd/start.go | 11 ++- cmd/minikube/cmd/start_test.go | 14 ++- deploy/addons/assets.go | 4 + .../nvidia-device-plugin.yaml.tmpl | 56 +++++++++++ pkg/addons/config.go | 5 + pkg/drivers/kic/kic.go | 3 + pkg/drivers/kic/oci/oci.go | 3 + pkg/drivers/kic/oci/types.go | 3 +- pkg/minikube/assets/addons.go | 13 +++ pkg/minikube/constants/constants.go | 2 + pkg/minikube/cruntime/docker.go | 26 ++++- pkg/minikube/style/style.go | 1 + pkg/minikube/style/style_enum.go | 1 + .../{handbook/addons => tutorials}/nvidia.md | 94 ++++++++++++------- 14 files changed, 197 insertions(+), 39 deletions(-) create mode 100644 deploy/addons/nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl rename site/content/en/docs/{handbook/addons => tutorials}/nvidia.md (74%) diff --git a/cmd/minikube/cmd/start.go b/cmd/minikube/cmd/start.go index 62449d7ba7ae..5ea273026867 100644 --- a/cmd/minikube/cmd/start.go +++ b/cmd/minikube/cmd/start.go @@ -1285,7 +1285,7 @@ func validateFlags(cmd *cobra.Command, drvName string) { } if cmd.Flags().Changed(containerRuntime) { - err := validateRuntime(viper.GetString(containerRuntime)) + err := validateRuntime(viper.GetString(containerRuntime), drvName) if err != nil { exit.Message(reason.Usage, "{{.err}}", out.V{"err": err}) } @@ -1402,7 +1402,7 @@ func validateDiskSize(diskSize string) error { } // validateRuntime validates the supplied runtime -func validateRuntime(rtime string) error { +func validateRuntime(rtime, driverName string) error { validOptions := cruntime.ValidRuntimes() // `crio` is accepted as an alternative spelling to `cri-o` validOptions = append(validOptions, constants.CRIO) @@ -1431,6 +1431,11 @@ func validateRuntime(rtime string) error { if !validRuntime { return errors.Errorf("Invalid Container Runtime: %s. Valid runtimes are: %s", rtime, cruntime.ValidRuntimes()) } + + if rtime == constants.NvidiaDocker && driverName != constants.Docker { + return errors.Errorf("The nvidia-docker container-runtime can only be run with the docker driver") + } + return nil } @@ -1793,7 +1798,7 @@ func validateContainerRuntime(old *config.ClusterConfig) { return } - if err := validateRuntime(old.KubernetesConfig.ContainerRuntime); err != nil { + if err := validateRuntime(old.KubernetesConfig.ContainerRuntime, old.Driver); err != nil { klog.Errorf("Error parsing old runtime %q: %v", old.KubernetesConfig.ContainerRuntime, err) } } diff --git a/cmd/minikube/cmd/start_test.go b/cmd/minikube/cmd/start_test.go index 2eed76a7f367..ad18b611cbfa 100644 --- a/cmd/minikube/cmd/start_test.go +++ b/cmd/minikube/cmd/start_test.go @@ -434,6 +434,7 @@ func TestValidateDiskSize(t *testing.T) { func TestValidateRuntime(t *testing.T) { var tests = []struct { runtime string + driver string errorMsg string }{ { @@ -444,15 +445,24 @@ func TestValidateRuntime(t *testing.T) { runtime: "docker", errorMsg: "", }, - { runtime: "test", errorMsg: fmt.Sprintf("Invalid Container Runtime: test. 
Valid runtimes are: %v", cruntime.ValidRuntimes()), }, + { + runtime: "nvidia-docker", + driver: "docker", + errorMsg: "", + }, + { + runtime: "nvidia-docker", + driver: "kvm", + errorMsg: "The nvidia-docker container-runtime can only be run with the docker driver", + }, } for _, test := range tests { t.Run(test.runtime, func(t *testing.T) { - got := validateRuntime(test.runtime) + got := validateRuntime(test.runtime, test.driver) gotError := "" if got != nil { gotError = got.Error() diff --git a/deploy/addons/assets.go b/deploy/addons/assets.go index 19b3221b8bed..98d08291b6ff 100644 --- a/deploy/addons/assets.go +++ b/deploy/addons/assets.go @@ -159,4 +159,8 @@ var ( // CloudSpanner assets for cloud-spanner addon //go:embed cloud-spanner/*.yaml CloudSpanner embed.FS + + // NvidiaDevicePlugin assets for nvidia-device-plugin addon + //go:embed nvidia-device-plugin/*.tmpl + NvidiaDevicePlugin embed.FS ) diff --git a/deploy/addons/nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl b/deploy/addons/nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl new file mode 100644 index 000000000000..c05c586edcbe --- /dev/null +++ b/deploy/addons/nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl @@ -0,0 +1,56 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-device-plugin-ds + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. 
+ # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + containers: + - image: {{.CustomRegistries.NvidiaDevicePlugin | default .ImageRepository | default .Registries.NvidiaDevicePlugin}}{{.Images.NvidiaDevicePlugin}} + name: nvidia-device-plugin-ctr + env: + - name: FAIL_ON_INIT_ERROR + value: "false" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins diff --git a/pkg/addons/config.go b/pkg/addons/config.go index 351ca0c2c880..2c1e7526fb8d 100644 --- a/pkg/addons/config.go +++ b/pkg/addons/config.go @@ -217,4 +217,9 @@ var Addons = []*Addon{ set: SetBool, callbacks: []setFn{EnableOrDisableAddon}, }, + { + name: "nvidia-device-plugin", + set: SetBool, + callbacks: []setFn{EnableOrDisableAddon}, + }, } diff --git a/pkg/drivers/kic/kic.go b/pkg/drivers/kic/kic.go index 3596a9c243c8..53f0b1dac3ad 100644 --- a/pkg/drivers/kic/kic.go +++ b/pkg/drivers/kic/kic.go @@ -90,6 +90,9 @@ func (d *Driver) Create() error { APIServerPort: d.NodeConfig.APIServerPort, } + if d.NodeConfig.ContainerRuntime == constants.NvidiaDocker { + params.GPUs = true + } networkName := d.NodeConfig.Network if networkName == "" { networkName = d.NodeConfig.ClusterName diff --git a/pkg/drivers/kic/oci/oci.go b/pkg/drivers/kic/oci/oci.go index 29dc26293d8d..4e9e291d7ecd 100644 --- a/pkg/drivers/kic/oci/oci.go +++ b/pkg/drivers/kic/oci/oci.go @@ -190,6 +190,9 @@ func CreateContainerNode(p CreateParams) error { runArgs = append(runArgs, "--network", p.Network) runArgs = append(runArgs, "--ip", p.IP) } + if p.GPUs { + runArgs = append(runArgs, "--gpus", "all") + } memcgSwap := hasMemorySwapCgroup() memcg := HasMemoryCgroup() diff --git a/pkg/drivers/kic/oci/types.go b/pkg/drivers/kic/oci/types.go index 0b0efb471df2..58dce76198ee 100644 --- a/pkg/drivers/kic/oci/types.go +++ b/pkg/drivers/kic/oci/types.go @@ -58,7 +58,8 @@ type CreateParams struct { ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080... 
OCIBinary string // docker or podman Network string // network name that the container will attach to - IP string // static IP to assign for th container in the cluster network + IP string // static IP to assign the container in the cluster network + GPUs bool // add GPU devices to the container } // createOpt is an option for Create diff --git a/pkg/minikube/assets/addons.go b/pkg/minikube/assets/addons.go index 86f35e8c99de..b3f8aef406b0 100644 --- a/pkg/minikube/assets/addons.go +++ b/pkg/minikube/assets/addons.go @@ -93,6 +93,11 @@ func (a *Addon) IsEnabledOrDefault(cc *config.ClusterConfig) bool { return a.enabled } +// EnableByDefault will enable the addon by default on cluster start +func (a *Addon) EnableByDefault() { + a.enabled = true +} + // Addons is the list of addons // TODO: Make dynamically loadable: move this data to a .yaml file within each addon directory var Addons = map[string]*Addon{ @@ -757,6 +762,14 @@ var Addons = map[string]*Addon{ }, map[string]string{ "CloudSpanner": "gcr.io", }), + "nvidia-device-plugin": NewAddon([]*BinAsset{ + MustBinAsset(addons.NvidiaDevicePlugin, "nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl", vmpath.GuestAddonsDir, "nvidia-device-plugin.yaml", "0640"), + }, false, "nvidia-device-plugin", "3rd party (NVIDIA)", "", "", + map[string]string{ + "NvidiaDevicePlugin": "nvidia/k8s-device-plugin:v0.14.1@sha256:15c4280d13a61df703b12d1fd1b5b5eec4658157db3cb4b851d3259502310136", + }, map[string]string{ + "NvidiaDevicePlugin": "nvcr.io", + }), } // parseMapString creates a map based on `str` which is encoded as =,=,... diff --git a/pkg/minikube/constants/constants.go b/pkg/minikube/constants/constants.go index 73649da20db0..3df86fa3d3a4 100644 --- a/pkg/minikube/constants/constants.go +++ b/pkg/minikube/constants/constants.go @@ -63,6 +63,8 @@ const ( CRIO = "crio" // Docker is the default name and spelling for the docker container runtime Docker = "docker" + // NvidiaDocker is the default name and spelling for the nvidia-docker container runtime + NvidiaDocker = "nvidia-docker" // DefaultContainerRuntime is our default container runtime DefaultContainerRuntime = "" diff --git a/pkg/minikube/cruntime/docker.go b/pkg/minikube/cruntime/docker.go index d083f94a3f65..9d839311c47c 100644 --- a/pkg/minikube/cruntime/docker.go +++ b/pkg/minikube/cruntime/docker.go @@ -39,6 +39,7 @@ import ( "k8s.io/minikube/pkg/minikube/docker" "k8s.io/minikube/pkg/minikube/download" "k8s.io/minikube/pkg/minikube/image" + "k8s.io/minikube/pkg/minikube/out" "k8s.io/minikube/pkg/minikube/style" "k8s.io/minikube/pkg/minikube/sysinit" ) @@ -560,7 +561,11 @@ func (r *Docker) configureDocker(driver string) error { }, StorageDriver: "overlay2", } - if r.Type == "nvidia-docker" { + if r.Type == constants.NvidiaDocker { + if err := r.installNvidiaContainerToolkit(); err != nil { + return fmt.Errorf("failed installing the NVIDIA Container Toolkit: %v", err) + } + assets.Addons["nvidia-device-plugin"].EnableByDefault() daemonConfig.DefaultRuntime = "nvidia" runtimes := &dockerDaemonRuntimes{} runtimes.Nvidia.Path = "/usr/bin/nvidia-container-runtime" @@ -574,6 +579,25 @@ func (r *Docker) configureDocker(driver string) error { return r.Runner.Copy(ma) } +// installNvidiaContainerToolkit installs the NVIDIA Container Toolkit +// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html +func (r *Docker) installNvidiaContainerToolkit() error { + out.Styled(style.Toolkit, "Installing the NVIDIA Container Toolkit...") + cmds := []string{ + "curl 
-fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg", + "curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list", + "sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit", + } + + for _, cmd := range cmds { + c := exec.Command("/bin/bash", "-c", cmd) + if _, err := r.Runner.RunCmd(c); err != nil { + return err + } + } + return nil +} + // Preload preloads docker with k8s images: // 1. Copy over the preloaded tarball into the VM // 2. Extract the preloaded tarball to the correct directory diff --git a/pkg/minikube/style/style.go b/pkg/minikube/style/style.go index 597c402b82d0..411ee2162590 100644 --- a/pkg/minikube/style/style.go +++ b/pkg/minikube/style/style.go @@ -139,6 +139,7 @@ var Config = map[Enum]Options{ VerifyingNoLine: {Prefix: "🤔 ", OmitNewline: true}, Verifying: {Prefix: "🤔 "}, CNI: {Prefix: "🔗 "}, + Toolkit: {Prefix: "🛠️ "}, } // LowPrefix returns a 7-bit compatible prefix for a style diff --git a/pkg/minikube/style/style_enum.go b/pkg/minikube/style/style_enum.go index 19dce3a06014..d89ba5eeb3ff 100644 --- a/pkg/minikube/style/style_enum.go +++ b/pkg/minikube/style/style_enum.go @@ -105,4 +105,5 @@ const ( Warning Workaround CNI + Toolkit ) diff --git a/site/content/en/docs/handbook/addons/nvidia.md b/site/content/en/docs/tutorials/nvidia.md similarity index 74% rename from site/content/en/docs/handbook/addons/nvidia.md rename to site/content/en/docs/tutorials/nvidia.md index a28f34be39b9..46b2624c5585 100644 --- a/site/content/en/docs/handbook/addons/nvidia.md +++ b/site/content/en/docs/tutorials/nvidia.md @@ -1,6 +1,6 @@ --- -title: "Using the Nvidia Addons" -linkTitle: "Nvidia" +title: "Using NVIDIA GPUs with minikube" +linkTitle: "Using NVIDIA GPUs with minikube" weight: 1 date: 2018-01-02 --- @@ -8,17 +8,66 @@ date: 2018-01-02 ## Prerequisites - Linux -- kvm2 driver - Latest NVIDIA GPU drivers -## Using the KVM2 driver +## Instructions per driver -When using NVIDIA GPUs with the kvm2 driver, we passthrough spare GPUs on the +{{% tabs %}} +{{% tab docker %}} +## Using the docker driver + +- Check if `bpf_jit_harden` is set to `0` + ```shell + sudo sysctl net.core.bpf_jit_harden + ``` + - If it's not `0` run: + ```shell + echo "net.core.bpf_jit_harden=0" | sudo tee -a /etc/sysctl.conf + sudo sysctl -p + ``` + +- Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) on your host machine + +- Configure Docker: + ```shell + sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker + ``` +- Start minikube: + ```shell + minikube start --driver docker --container-runtime nvidia-docker + ``` +{{% /tab %}} +{{% tab none %}} +## Using the 'none' driver + +NOTE: This approach used to expose GPUs here is different than the approach used +to expose GPUs with `--driver=kvm`. Please don't mix these instructions. + +- Install minikube. + +- Install the nvidia driver, nvidia-docker and configure docker with nvidia as + the default runtime. 
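+  A minimal sketch of that setup on an Ubuntu host is below; it mirrors the
+  commands minikube runs for the docker driver, but the `apt` repository,
+  package names, and the `--set-as-default` flag of `nvidia-ctk` are
+  assumptions that may differ per distribution and toolkit version:
+  ```shell
+  # Add NVIDIA's repository and install the NVIDIA Container Toolkit
+  curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+  curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+  sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
+  # Register the nvidia runtime with Docker, make it the default, and restart Docker
+  sudo nvidia-ctk runtime configure --runtime=docker --set-as-default
+  sudo systemctl restart docker
+  ```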
See instructions at + + +- Start minikube: + ```shell + minikube start --driver=none --apiserver-ips 127.0.0.1 --apiserver-name localhost + ``` + +- Install NVIDIA's device plugin: + ```shell + minikube addons enable nvidia-device-plugin + ``` +{{% /tab %}} +{{% tab kvm %}} +## Using the kvm driver + +When using NVIDIA GPUs with the kvm driver, we passthrough spare GPUs on the host to the minikube VM. Doing so has a few prerequisites: -- You must install the [kvm2 driver]({{< ref "/docs/drivers/kvm2" >}}) If you already had +- You must install the [kvm driver]({{< ref "/docs/drivers/kvm2" >}}) If you already had this installed make sure that you fetch the latest - `docker-machine-driver-kvm2` binary that has GPU support. + `docker-machine-driver-kvm` binary that has GPU support. - Your CPU must support IOMMU. Different vendors have different names for this technology. Intel calls it Intel VT-d. AMD calls it AMD-Vi. Your motherboard @@ -40,9 +89,9 @@ host to the minikube VM. Doing so has a few prerequisites: group of these GPUs. - Once you reboot the system after doing the above, you should be ready to use - GPUs with kvm2. Run the following command to start minikube: + GPUs with kvm. Run the following command to start minikube: ```shell - minikube start --driver kvm2 --kvm-gpu + minikube start --driver kvm --kvm-gpu ``` This command will check if all the above conditions are satisfied and @@ -68,31 +117,12 @@ host to the minikube VM. Doing so has a few prerequisites: See the excellent documentation at -### Why are so many manual steps required to use GPUs with kvm2 on minikube? +### Why are so many manual steps required to use GPUs with kvm on minikube? These steps require elevated privileges which minikube doesn't run with and they are disruptive to the host, so we decided to not do them automatically. - -## Using the 'none' driver - -NOTE: This approach used to expose GPUs here is different than the approach used -to expose GPUs with `--driver=kvm2`. Please don't mix these instructions. - -- Install minikube. - -- Install the nvidia driver, nvidia-docker and configure docker with nvidia as - the default runtime. See instructions at - - -- Start minikube: - ```shell - minikube start --driver=none --apiserver-ips 127.0.0.1 --apiserver-name localhost - ``` - -- Install NVIDIA's device plugin: - ```shell - kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml - ``` +{{% /tab %}} +{{% /tabs %}} ## Why does minikube not support NVIDIA GPUs on macOS? @@ -102,7 +132,7 @@ drivers supported by minikube for macOS doesn't support GPU passthrough: - [moby/hyperkit#159](https://github.com/moby/hyperkit/issues/159) - [VirtualBox docs](https://www.virtualbox.org/manual/ch09.html#pcipassthrough) -Also: +Also: - For quite a while, all Mac hardware (both laptops and desktops) have come with Intel or AMD GPUs (and not with NVIDIA GPUs). Recently, Apple added [support