diff --git a/cmd/minikube/cmd/start.go b/cmd/minikube/cmd/start.go index 11cfeabd458c..f92447103caa 100644 --- a/cmd/minikube/cmd/start.go +++ b/cmd/minikube/cmd/start.go @@ -1302,6 +1302,12 @@ func validateFlags(cmd *cobra.Command, drvName string) { } } + if cmd.Flags().Changed(gpus) { + if err := validateGPUs(viper.GetString(gpus), drvName, viper.GetString(containerRuntime)); err != nil { + exit.Message(reason.Usage, "{{.err}}", out.V{"err": err}) + } + } + if driver.IsSSH(drvName) { sshIPAddress := viper.GetString(sshIPAddress) if sshIPAddress == "" { @@ -1438,6 +1444,20 @@ func validateRuntime(rtime string) error { return nil } +// validateGPUs validates that a valid option was given, and if so, can it be used with the given configuration +func validateGPUs(value, drvName, rtime string) error { + if value == "" { + return nil + } + if value != "nvidia" && value != "all" { + return errors.Errorf(`The gpus flag must be passed a value of "nvidia" or "all"`) + } + if drvName == constants.Docker && (rtime == constants.Docker || rtime == constants.DefaultContainerRuntime) { + return nil + } + return errors.Errorf("The gpus flag can only be used with the docker driver and docker container-runtime") +} + func getContainerRuntime(old *config.ClusterConfig) string { paramRuntime := viper.GetString(containerRuntime) diff --git a/cmd/minikube/cmd/start_flags.go b/cmd/minikube/cmd/start_flags.go index d7b12c8c9f73..b1cba66a2a88 100644 --- a/cmd/minikube/cmd/start_flags.go +++ b/cmd/minikube/cmd/start_flags.go @@ -142,6 +142,7 @@ const ( socketVMnetPath = "socket-vmnet-path" staticIP = "static-ip" autoPauseInterval = "auto-pause-interval" + gpus = "gpus" ) var ( @@ -204,6 +205,7 @@ func initMinikubeFlags() { startCmd.Flags().Bool(disableMetrics, false, "If set, disables metrics reporting (CPU and memory usage), this can improve CPU usage. Defaults to false.") startCmd.Flags().String(staticIP, "", "Set a static IP for the minikube cluster, the IP must be: private, IPv4, and the last octet must be between 2 and 254, for example 192.168.200.200 (Docker and Podman drivers only)") startCmd.Flags().Duration(autoPauseInterval, time.Minute*1, "Duration of inactivity before the minikube VM is paused (default 1m0s). To disable, set to 0s") + startCmd.Flags().StringP(gpus, "g", "", "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)") } // initKubernetesFlags inits the commandline flags for Kubernetes related options @@ -595,6 +597,7 @@ func generateNewConfigFromFlags(cmd *cobra.Command, k8sVersion string, rtime str }, MultiNodeRequested: viper.GetInt(nodes) > 1, AutoPauseInterval: viper.GetDuration(autoPauseInterval), + GPUs: viper.GetString(gpus), } cc.VerifyComponents = interpretWaitFlag(*cmd) if viper.GetBool(createMount) && driver.IsKIC(drvName) { diff --git a/cmd/minikube/cmd/start_test.go b/cmd/minikube/cmd/start_test.go index 2eed76a7f367..a5ad32a6f2b0 100644 --- a/cmd/minikube/cmd/start_test.go +++ b/cmd/minikube/cmd/start_test.go @@ -444,7 +444,6 @@ func TestValidateRuntime(t *testing.T) { runtime: "docker", errorMsg: "", }, - { runtime: "test", errorMsg: fmt.Sprintf("Invalid Container Runtime: test. 
Valid runtimes are: %v", cruntime.ValidRuntimes()), @@ -860,3 +859,32 @@ func TestImageMatchesBinaryVersion(t *testing.T) { } } } + +func TestValidateGPUs(t *testing.T) { + tests := []struct { + gpus string + drvName string + runtime string + errorMsg string + }{ + {"", "kvm", "containerd", ""}, + {"all", "docker", "docker", ""}, + {"nvidia", "docker", "docker", ""}, + {"all", "docker", "", ""}, + {"nvidia", "docker", "", ""}, + {"all", "kvm", "docker", "The gpus flag can only be used with the docker driver and docker container-runtime"}, + {"nvidia", "docker", "containerd", "The gpus flag can only be used with the docker driver and docker container-runtime"}, + {"cat", "docker", "docker", `The gpus flag must be passed a value of "nvidia" or "all"`}, + } + + for _, tc := range tests { + gotError := "" + got := validateGPUs(tc.gpus, tc.drvName, tc.runtime) + if got != nil { + gotError = got.Error() + } + if gotError != tc.errorMsg { + t.Errorf("validateGPUs(%s, %s, %s) = %q; want = %q", tc.gpus, tc.drvName, tc.runtime, got, tc.errorMsg) + } + } +} diff --git a/deploy/addons/assets.go b/deploy/addons/assets.go index d4cf42ae1a85..a1c11e795feb 100644 --- a/deploy/addons/assets.go +++ b/deploy/addons/assets.go @@ -166,4 +166,8 @@ var ( // Kubeflow assets for kubeflow addon //go:embed kubeflow/*.yaml Kubeflow embed.FS + + // NvidiaDevicePlugin assets for nvidia-device-plugin addon + //go:embed nvidia-device-plugin/*.tmpl + NvidiaDevicePlugin embed.FS ) diff --git a/deploy/addons/nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl b/deploy/addons/nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl new file mode 100644 index 000000000000..c05c586edcbe --- /dev/null +++ b/deploy/addons/nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl @@ -0,0 +1,56 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-device-plugin-ds + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. 
+ # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + containers: + - image: {{.CustomRegistries.NvidiaDevicePlugin | default .ImageRepository | default .Registries.NvidiaDevicePlugin}}{{.Images.NvidiaDevicePlugin}} + name: nvidia-device-plugin-ctr + env: + - name: FAIL_ON_INIT_ERROR + value: "false" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins diff --git a/pkg/addons/config.go b/pkg/addons/config.go index 18764ac4022e..3e5f96aeb9f2 100644 --- a/pkg/addons/config.go +++ b/pkg/addons/config.go @@ -227,4 +227,9 @@ var Addons = []*Addon{ set: SetBool, callbacks: []setFn{EnableOrDisableAddon}, }, + { + name: "nvidia-device-plugin", + set: SetBool, + callbacks: []setFn{EnableOrDisableAddon}, + }, } diff --git a/pkg/drivers/kic/kic.go b/pkg/drivers/kic/kic.go index 3596a9c243c8..0b2f3c7ae204 100644 --- a/pkg/drivers/kic/kic.go +++ b/pkg/drivers/kic/kic.go @@ -88,6 +88,7 @@ func (d *Driver) Create() error { ExtraArgs: append([]string{"--expose", fmt.Sprintf("%d", d.NodeConfig.APIServerPort)}, d.NodeConfig.ExtraArgs...), OCIBinary: d.NodeConfig.OCIBinary, APIServerPort: d.NodeConfig.APIServerPort, + GPUs: d.NodeConfig.GPUs, } networkName := d.NodeConfig.Network diff --git a/pkg/drivers/kic/oci/oci.go b/pkg/drivers/kic/oci/oci.go index 29dc26293d8d..986f4a72a245 100644 --- a/pkg/drivers/kic/oci/oci.go +++ b/pkg/drivers/kic/oci/oci.go @@ -190,6 +190,9 @@ func CreateContainerNode(p CreateParams) error { runArgs = append(runArgs, "--network", p.Network) runArgs = append(runArgs, "--ip", p.IP) } + if p.GPUs != "" { + runArgs = append(runArgs, "--gpus", "all") + } memcgSwap := hasMemorySwapCgroup() memcg := HasMemoryCgroup() diff --git a/pkg/drivers/kic/oci/types.go b/pkg/drivers/kic/oci/types.go index 0b0efb471df2..1009df9c70c8 100644 --- a/pkg/drivers/kic/oci/types.go +++ b/pkg/drivers/kic/oci/types.go @@ -58,7 +58,8 @@ type CreateParams struct { ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080... OCIBinary string // docker or podman Network string // network name that the container will attach to - IP string // static IP to assign for th container in the cluster network + IP string // static IP to assign the container in the cluster network + GPUs string // add NVIDIA GPU devices to the container } // createOpt is an option for Create diff --git a/pkg/drivers/kic/types.go b/pkg/drivers/kic/types.go index a064afceaf62..cd08a7b81363 100644 --- a/pkg/drivers/kic/types.go +++ b/pkg/drivers/kic/types.go @@ -69,4 +69,5 @@ type Config struct { StaticIP string // static IP for the kic cluster ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080... 
ListenAddress string // IP Address to listen to + GPUs string // add NVIDIA GPU devices to the container } diff --git a/pkg/minikube/assets/addons.go b/pkg/minikube/assets/addons.go index 1639ba165b06..f926d3b07747 100644 --- a/pkg/minikube/assets/addons.go +++ b/pkg/minikube/assets/addons.go @@ -93,6 +93,11 @@ func (a *Addon) IsEnabledOrDefault(cc *config.ClusterConfig) bool { return a.enabled } +// EnableByDefault will enable the addon by default on cluster start +func (a *Addon) EnableByDefault() { + a.enabled = true +} + // Addons is the list of addons // TODO: Make dynamically loadable: move this data to a .yaml file within each addon directory var Addons = map[string]*Addon{ @@ -774,6 +779,14 @@ var Addons = map[string]*Addon{ MustBinAsset(addons.Kubeflow, "kubeflow/kubeflow.yaml", vmpath.GuestAddonsDir, "kubeflow.yaml", "0640"), }, false, "kubeflow", "3rd party", "", "", nil, nil, ), + "nvidia-device-plugin": NewAddon([]*BinAsset{ + MustBinAsset(addons.NvidiaDevicePlugin, "nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl", vmpath.GuestAddonsDir, "nvidia-device-plugin.yaml", "0640"), + }, false, "nvidia-device-plugin", "3rd party (NVIDIA)", "", "", + map[string]string{ + "NvidiaDevicePlugin": "nvidia/k8s-device-plugin:v0.14.1@sha256:15c4280d13a61df703b12d1fd1b5b5eec4658157db3cb4b851d3259502310136", + }, map[string]string{ + "NvidiaDevicePlugin": "nvcr.io", + }), } // parseMapString creates a map based on `str` which is encoded as =,=,... diff --git a/pkg/minikube/config/types.go b/pkg/minikube/config/types.go index e38b4cf403ec..245f5c10e7e3 100644 --- a/pkg/minikube/config/types.go +++ b/pkg/minikube/config/types.go @@ -108,6 +108,7 @@ type ClusterConfig struct { SSHAuthSock string SSHAgentPID int AutoPauseInterval time.Duration // Specifies interval of time to wait before checking if cluster should be paused + GPUs string } // KubernetesConfig contains the parameters used to configure the VM Kubernetes. 
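With the addon registered above and the `GPUs` field plumbed into `ClusterConfig`, the plugin can also be toggled by hand like any other addon. A minimal usage sketch, not part of the patch itself (the pod label and namespace come from the DaemonSet manifest added in this patch):

```shell
# enable the addon on a running cluster (it is auto-enabled when --gpus is used
# with the Docker runtime; see configureDocker further down in this patch)
minikube addons enable nvidia-device-plugin

# confirm it shows up as enabled
minikube addons list | grep nvidia-device-plugin

# watch the DaemonSet pod from nvidia-device-plugin.yaml.tmpl come up
kubectl -n kube-system get pods -l name=nvidia-device-plugin-ds --watch
```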
diff --git a/pkg/minikube/cruntime/cruntime.go b/pkg/minikube/cruntime/cruntime.go index 85488a7eea77..d96e230a252b 100644 --- a/pkg/minikube/cruntime/cruntime.go +++ b/pkg/minikube/cruntime/cruntime.go @@ -50,7 +50,7 @@ func (cs ContainerState) String() string { // ValidRuntimes lists the supported container runtimes func ValidRuntimes() []string { - return []string{"docker", "nvidia-docker", "cri-o", "containerd"} + return []string{"docker", "cri-o", "containerd"} } // CommandRunner is the subset of command.Runner this package consumes @@ -155,6 +155,8 @@ type Config struct { KubernetesVersion semver.Version // InsecureRegistry list of insecure registries InsecureRegistry []string + // GPUs add GPU devices to the container + GPUs bool } // ListContainersOptions are the options to use for listing containers @@ -210,7 +212,7 @@ func New(c Config) (Manager, error) { sm := sysinit.New(c.Runner) switch c.Type { - case "", "docker", "nvidia-docker": + case "", "docker": sp := c.Socket cs := "" // There is no more dockershim socket, in Kubernetes version 1.24 and beyond @@ -219,7 +221,6 @@ func New(c Config) (Manager, error) { cs = "cri-docker.socket" } return &Docker{ - Type: c.Type, Socket: sp, Runner: c.Runner, NetworkPlugin: c.NetworkPlugin, @@ -228,6 +229,7 @@ func New(c Config) (Manager, error) { Init: sm, UseCRI: (sp != ""), // !dockershim CRIService: cs, + GPUs: c.GPUs, }, nil case "crio", "cri-o": return &CRIO{ diff --git a/pkg/minikube/cruntime/cruntime_test.go b/pkg/minikube/cruntime/cruntime_test.go index ed7e5d6b5561..60e0687321d4 100644 --- a/pkg/minikube/cruntime/cruntime_test.go +++ b/pkg/minikube/cruntime/cruntime_test.go @@ -40,7 +40,6 @@ func TestName(t *testing.T) { }{ {"", "Docker"}, {"docker", "Docker"}, - {"nvidia-docker", "Docker"}, {"crio", "CRI-O"}, {"cri-o", "CRI-O"}, {"containerd", "containerd"}, @@ -125,7 +124,6 @@ func TestCGroupDriver(t *testing.T) { want string }{ {"docker", "cgroupfs"}, - {"nvidia-docker", "cgroupfs"}, {"crio", "cgroupfs"}, {"containerd", "cgroupfs"}, } @@ -157,12 +155,6 @@ func TestKubeletOptions(t *testing.T) { {"docker", "1.24.0", map[string]string{ "container-runtime-endpoint": "unix:///var/run/cri-dockerd.sock", }}, - {"nvidia-docker", "1.23.0", map[string]string{ - "container-runtime": "docker", - }}, - {"nvidia-docker", "1.25.0", map[string]string{ - "container-runtime-endpoint": "unix:///var/run/cri-dockerd.sock", - }}, {"crio", "1.25.0", map[string]string{ "container-runtime-endpoint": "unix:///var/run/crio/crio.sock", }}, @@ -688,13 +680,6 @@ func TestEnable(t *testing.T) { "crio": SvcExited, "crio-shutdown": SvcExited, }}, - {"nvidia-docker", defaultServices, - map[string]serviceState{ - "docker": SvcRestarted, - "containerd": SvcExited, - "crio": SvcExited, - "crio-shutdown": SvcExited, - }}, {"containerd", defaultServices, map[string]serviceState{ "docker": SvcExited, @@ -736,7 +721,6 @@ func TestContainerFunctions(t *testing.T) { runtime string }{ {"docker"}, - {"nvidia-docker"}, {"crio"}, {"containerd"}, } @@ -746,7 +730,7 @@ func TestContainerFunctions(t *testing.T) { t.Run(tc.runtime, func(t *testing.T) { runner := NewFakeRunner(t) prefix := "" - if tc.runtime == "docker" || tc.runtime == "nvidia-docker" { + if tc.runtime == "docker" { prefix = "k8s_" } runner.containers = map[string]string{ diff --git a/pkg/minikube/cruntime/docker.go b/pkg/minikube/cruntime/docker.go index d083f94a3f65..a509df9e12e1 100644 --- a/pkg/minikube/cruntime/docker.go +++ b/pkg/minikube/cruntime/docker.go @@ -39,6 +39,7 @@ import ( 
"k8s.io/minikube/pkg/minikube/docker" "k8s.io/minikube/pkg/minikube/download" "k8s.io/minikube/pkg/minikube/image" + "k8s.io/minikube/pkg/minikube/out" "k8s.io/minikube/pkg/minikube/style" "k8s.io/minikube/pkg/minikube/sysinit" ) @@ -67,7 +68,6 @@ func (e *ErrISOFeature) Error() string { // Docker contains Docker runtime state type Docker struct { - Type string Socket string Runner CommandRunner NetworkPlugin string @@ -76,6 +76,7 @@ type Docker struct { Init sysinit.Manager UseCRI bool CRIService string + GPUs bool } // Name is a human readable name for Docker @@ -560,7 +561,11 @@ func (r *Docker) configureDocker(driver string) error { }, StorageDriver: "overlay2", } - if r.Type == "nvidia-docker" { + if r.GPUs { + if err := r.installNvidiaContainerToolkit(); err != nil { + return fmt.Errorf("failed installing the NVIDIA Container Toolkit: %v", err) + } + assets.Addons["nvidia-device-plugin"].EnableByDefault() daemonConfig.DefaultRuntime = "nvidia" runtimes := &dockerDaemonRuntimes{} runtimes.Nvidia.Path = "/usr/bin/nvidia-container-runtime" @@ -574,6 +579,26 @@ func (r *Docker) configureDocker(driver string) error { return r.Runner.Copy(ma) } +// installNvidiaContainerToolkit installs the NVIDIA Container Toolkit +// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html +func (r *Docker) installNvidiaContainerToolkit() error { + out.Styled(style.Warning, "Using GPUs with the Docker driver is experimental, if you experience any issues please report them at: https://github.com/kubernetes/minikube/issues/new/choose") + out.Styled(style.Toolkit, "Installing the NVIDIA Container Toolkit...") + cmds := []string{ + "curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg", + "curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list", + "sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit", + } + + for _, cmd := range cmds { + c := exec.Command("/bin/bash", "-c", cmd) + if _, err := r.Runner.RunCmd(c); err != nil { + return err + } + } + return nil +} + // Preload preloads docker with k8s images: // 1. Copy over the preloaded tarball into the VM // 2. 
Extract the preloaded tarball to the correct directory diff --git a/pkg/minikube/node/start.go b/pkg/minikube/node/start.go index 9047c8b362f1..7ea3e02d2686 100755 --- a/pkg/minikube/node/start.go +++ b/pkg/minikube/node/start.go @@ -395,6 +395,9 @@ func configureRuntimes(runner cruntime.CommandRunner, cc config.ClusterConfig, k KubernetesVersion: kv, InsecureRegistry: cc.InsecureRegistry, } + if cc.GPUs != "" { + co.GPUs = true + } cr, err := cruntime.New(co) if err != nil { exit.Error(reason.InternalRuntime, "Failed runtime", err) diff --git a/pkg/minikube/registry/drvs/docker/docker.go b/pkg/minikube/registry/drvs/docker/docker.go index ed23b569c98e..5feeefb87c97 100644 --- a/pkg/minikube/registry/drvs/docker/docker.go +++ b/pkg/minikube/registry/drvs/docker/docker.go @@ -90,6 +90,7 @@ func configure(cc config.ClusterConfig, n config.Node) (interface{}, error) { Subnet: cc.Subnet, StaticIP: cc.StaticIP, ListenAddress: cc.ListenAddress, + GPUs: cc.GPUs, }), nil } diff --git a/pkg/minikube/style/style.go b/pkg/minikube/style/style.go index 597c402b82d0..411ee2162590 100644 --- a/pkg/minikube/style/style.go +++ b/pkg/minikube/style/style.go @@ -139,6 +139,7 @@ var Config = map[Enum]Options{ VerifyingNoLine: {Prefix: "🤔 ", OmitNewline: true}, Verifying: {Prefix: "🤔 "}, CNI: {Prefix: "🔗 "}, + Toolkit: {Prefix: "🛠️ "}, } // LowPrefix returns a 7-bit compatible prefix for a style diff --git a/pkg/minikube/style/style_enum.go b/pkg/minikube/style/style_enum.go index 19dce3a06014..d89ba5eeb3ff 100644 --- a/pkg/minikube/style/style_enum.go +++ b/pkg/minikube/style/style_enum.go @@ -105,4 +105,5 @@ const ( Warning Workaround CNI + Toolkit ) diff --git a/site/content/en/docs/handbook/addons/nvidia.md b/site/content/en/docs/tutorials/nvidia.md similarity index 74% rename from site/content/en/docs/handbook/addons/nvidia.md rename to site/content/en/docs/tutorials/nvidia.md index a28f34be39b9..c9e236f8e896 100644 --- a/site/content/en/docs/handbook/addons/nvidia.md +++ b/site/content/en/docs/tutorials/nvidia.md @@ -1,6 +1,6 @@ --- -title: "Using the Nvidia Addons" -linkTitle: "Nvidia" +title: "Using NVIDIA GPUs with minikube" +linkTitle: "Using NVIDIA GPUs with minikube" weight: 1 date: 2018-01-02 --- @@ -8,17 +8,67 @@ date: 2018-01-02 ## Prerequisites - Linux -- kvm2 driver - Latest NVIDIA GPU drivers +- minikube v1.32.0-beta0 or later (docker driver only) -## Using the KVM2 driver +## Instructions per driver -When using NVIDIA GPUs with the kvm2 driver, we passthrough spare GPUs on the +{{% tabs %}} +{{% tab docker %}} +## Using the docker driver + +- Check if `bpf_jit_harden` is set to `0` + ```shell + sudo sysctl net.core.bpf_jit_harden + ``` + - If it's not `0` run: + ```shell + echo "net.core.bpf_jit_harden=0" | sudo tee -a /etc/sysctl.conf + sudo sysctl -p + ``` + +- Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) on your host machine + +- Configure Docker: + ```shell + sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker + ``` +- Start minikube: + ```shell + minikube start --driver docker --container-runtime docker --gpus all + ``` +{{% /tab %}} +{{% tab none %}} +## Using the 'none' driver + +NOTE: This approach used to expose GPUs here is different than the approach used +to expose GPUs with `--driver=kvm`. Please don't mix these instructions. + +- Install minikube. + +- Install the nvidia driver, nvidia-docker and configure docker with nvidia as + the default runtime. 
See instructions at + + +- Start minikube: + ```shell + minikube start --driver=none --apiserver-ips 127.0.0.1 --apiserver-name localhost + ``` + +- Install NVIDIA's device plugin: + ```shell + minikube addons enable nvidia-device-plugin + ``` +{{% /tab %}} +{{% tab kvm %}} +## Using the kvm driver + +When using NVIDIA GPUs with the kvm driver, we passthrough spare GPUs on the host to the minikube VM. Doing so has a few prerequisites: -- You must install the [kvm2 driver]({{< ref "/docs/drivers/kvm2" >}}) If you already had +- You must install the [kvm driver]({{< ref "/docs/drivers/kvm2" >}}) If you already had this installed make sure that you fetch the latest - `docker-machine-driver-kvm2` binary that has GPU support. + `docker-machine-driver-kvm` binary that has GPU support. - Your CPU must support IOMMU. Different vendors have different names for this technology. Intel calls it Intel VT-d. AMD calls it AMD-Vi. Your motherboard @@ -40,9 +90,9 @@ host to the minikube VM. Doing so has a few prerequisites: group of these GPUs. - Once you reboot the system after doing the above, you should be ready to use - GPUs with kvm2. Run the following command to start minikube: + GPUs with kvm. Run the following command to start minikube: ```shell - minikube start --driver kvm2 --kvm-gpu + minikube start --driver kvm --kvm-gpu ``` This command will check if all the above conditions are satisfied and @@ -68,31 +118,12 @@ host to the minikube VM. Doing so has a few prerequisites: See the excellent documentation at -### Why are so many manual steps required to use GPUs with kvm2 on minikube? +### Why are so many manual steps required to use GPUs with kvm on minikube? These steps require elevated privileges which minikube doesn't run with and they are disruptive to the host, so we decided to not do them automatically. - -## Using the 'none' driver - -NOTE: This approach used to expose GPUs here is different than the approach used -to expose GPUs with `--driver=kvm2`. Please don't mix these instructions. - -- Install minikube. - -- Install the nvidia driver, nvidia-docker and configure docker with nvidia as - the default runtime. See instructions at - - -- Start minikube: - ```shell - minikube start --driver=none --apiserver-ips 127.0.0.1 --apiserver-name localhost - ``` - -- Install NVIDIA's device plugin: - ```shell - kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml - ``` +{{% /tab %}} +{{% /tabs %}} ## Why does minikube not support NVIDIA GPUs on macOS? @@ -102,7 +133,7 @@ drivers supported by minikube for macOS doesn't support GPU passthrough: - [moby/hyperkit#159](https://github.com/moby/hyperkit/issues/159) - [VirtualBox docs](https://www.virtualbox.org/manual/ch09.html#pcipassthrough) -Also: +Also: - For quite a while, all Mac hardware (both laptops and desktops) have come with Intel or AMD GPUs (and not with NVIDIA GPUs). 
Recently, Apple added [support diff --git a/test/integration/addons_test.go b/test/integration/addons_test.go index 0ebc38bc871b..38e219fa97ff 100644 --- a/test/integration/addons_test.go +++ b/test/integration/addons_test.go @@ -99,7 +99,7 @@ func TestAddons(t *testing.T) { // so we override that here to let minikube auto-detect appropriate cgroup driver os.Setenv(constants.MinikubeForceSystemdEnv, "") - args := append([]string{"start", "-p", profile, "--wait=true", "--memory=4000", "--alsologtostderr", "--addons=registry", "--addons=metrics-server", "--addons=volumesnapshots", "--addons=csi-hostpath-driver", "--addons=gcp-auth", "--addons=cloud-spanner", "--addons=inspektor-gadget", "--addons=storage-provisioner-rancher"}, StartArgs()...) + args := append([]string{"start", "-p", profile, "--wait=true", "--memory=4000", "--alsologtostderr", "--addons=registry", "--addons=metrics-server", "--addons=volumesnapshots", "--addons=csi-hostpath-driver", "--addons=gcp-auth", "--addons=cloud-spanner", "--addons=inspektor-gadget", "--addons=storage-provisioner-rancher", "--addons=nvidia-device-plugin"}, StartArgs()...) if !NoneDriver() { // none driver does not support ingress args = append(args, "--addons=ingress", "--addons=ingress-dns") } @@ -133,6 +133,7 @@ func TestAddons(t *testing.T) { {"Headlamp", validateHeadlampAddon}, {"CloudSpanner", validateCloudSpannerAddon}, {"LocalPath", validateLocalPathAddon}, + {"NvidiaDevicePlugin", validateNvidiaDevicePlugin}, } for _, tc := range tests { tc := tc @@ -942,3 +943,15 @@ func validateDisablingAddonOnNonExistingCluster(ctx context.Context, t *testing. t.Fatalf("unexpected error was returned: %v", err) } } + +// validateNvidiaDevicePlugin tests the nvidia-device-plugin addon by ensuring the pod comes up and the addon disables +func validateNvidiaDevicePlugin(ctx context.Context, t *testing.T, profile string) { + defer PostMortemLogs(t, profile) + + if _, err := PodWait(ctx, t, profile, "kube-system", "name=nvidia-device-plugin-ds", Minutes(6)); err != nil { + t.Fatalf("failed waiting for nvidia-device-plugin-ds pod: %v", err) + } + if rr, err := Run(t, exec.CommandContext(ctx, Target(), "addons", "disable", "nvidia-device-plugin", "-p", profile)); err != nil { + t.Errorf("failed to disable nvidia-device-plugin: args %q : %v", rr.Command(), err) + } +}
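
To complement the integration test above, a hedged manual smoke test for a cluster started per the tutorial (`minikube start --driver docker --container-runtime docker --gpus all`). The node name assumes the default `minikube` profile, and the `nvidia/cuda` tag is only an example image, not something pinned by this patch; substitute any CUDA-enabled image you have locally:

```shell
# the device plugin should advertise nvidia.com/gpu as an allocatable resource
kubectl describe node minikube | grep -A 2 'nvidia.com/gpu'

# run a throwaway pod that requests one GPU and prints the visible devices
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  containers:
  - name: cuda
    image: nvidia/cuda:12.2.0-base-ubuntu22.04  # example tag only
    command: ["nvidia-smi"]
    resources:
      limits:
        nvidia.com/gpu: 1
EOF

# once the pod completes, the nvidia-smi output confirms the GPU is visible in-cluster
kubectl logs gpu-smoke-test
```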