diff --git a/.github/workflows/update-amd-gpu-device-plugin-version.yml b/.github/workflows/update-amd-gpu-device-plugin-version.yml
new file mode 100644
index 000000000000..ecd16c35fd92
--- /dev/null
+++ b/.github/workflows/update-amd-gpu-device-plugin-version.yml
@@ -0,0 +1,48 @@
+name: "update-amd-gpu-device-plugin-version"
+on:
+  workflow_dispatch:
+  schedule:
+    # every Monday at around 3 am pacific/10 am UTC
+    - cron: "0 10 * * 1"
+env:
+  GOPROXY: https://proxy.golang.org
+  GO_VERSION: '1.23.0'
+permissions:
+  contents: read
+
+jobs:
+  bump-amd-gpu-device-plugin-version:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
+      - uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32
+        with:
+          go-version: ${{env.GO_VERSION}}
+      - name: Bump amd-gpu-device-plugin version
+        id: bumpAmdDevicePlugin
+        run: |
+          echo "OLD_VERSION=$(DEP=amd-gpu-device-plugin make get-dependency-version)" >> "$GITHUB_OUTPUT"
+          make update-amd-gpu-device-plugin-version
+          echo "NEW_VERSION=$(DEP=amd-gpu-device-plugin make get-dependency-version)" >> "$GITHUB_OUTPUT"
+          # The following is to support multiline with GITHUB_OUTPUT, see https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#multiline-strings
+          echo "changes<<EOF" >> "$GITHUB_OUTPUT"
+          echo "$(git status --porcelain)" >> "$GITHUB_OUTPUT"
+          echo "EOF" >> "$GITHUB_OUTPUT"
+      - name: Create PR
+        if: ${{ steps.bumpAmdDevicePlugin.outputs.changes != '' }}
+        uses: peter-evans/create-pull-request@5e914681df9dc83aa4e4905692ca88beb2f9e91f
+        with:
+          token: ${{ secrets.MINIKUBE_BOT_PAT }}
+          commit-message: 'Addon amd-gpu-device-plugin: Update amd/k8s-device-plugin image from ${{ steps.bumpAmdDevicePlugin.outputs.OLD_VERSION }} to ${{ steps.bumpAmdDevicePlugin.outputs.NEW_VERSION }}'
+          committer: minikube-bot <minikube-bot@google.com>
+          author: minikube-bot <minikube-bot@google.com>
+          branch: auto_bump_amd_device_plugin_version
+          push-to-fork: minikube-bot/minikube
+          base: master
+          delete-branch: true
+          title: 'Addon amd-gpu-device-plugin: Update amd/k8s-device-plugin image from ${{ steps.bumpAmdDevicePlugin.outputs.OLD_VERSION }} to ${{ steps.bumpAmdDevicePlugin.outputs.NEW_VERSION }}'
+          labels: ok-to-test
+          body: |
+            The [k8s-device-plugin](https://github.com/ROCm/k8s-device-plugin) project released a new k8s-device-plugin image
+
+            This PR was auto-generated by `make update-amd-gpu-device-plugin-version` using [update-amd-gpu-device-plugin-version.yml](https://github.com/kubernetes/minikube/tree/master/.github/workflows/update-amd-gpu-device-plugin-version.yml) CI Workflow.
diff --git a/Makefile b/Makefile
index f869cc3a1d0e..764de66c620f 100644
--- a/Makefile
+++ b/Makefile
@@ -1222,6 +1222,11 @@ update-nvidia-device-plugin-version:
 	(cd hack/update/nvidia_device_plugin_version && \
 	go run update_nvidia_device_plugin_version.go)
 
+.PHONY: update-amd-gpu-device-plugin-version
+update-amd-gpu-device-plugin-version:
+	(cd hack/update/amd_device_plugin_version && \
+	go run update_amd_device_plugin_version.go)
+
 .PHONY: update-nerctld-version
 update-nerdctld-version:
 	(cd hack/update/nerdctld_version && \
diff --git a/README.md b/README.md
index cdc27d042242..16496f915af4 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,7 @@ As well as developer-friendly features:
 
 * [Addons](https://minikube.sigs.k8s.io/docs/handbook/deploying/#addons) - a marketplace for developers to share configurations for running services on minikube
 * [NVIDIA GPU support](https://minikube.sigs.k8s.io/docs/tutorials/nvidia/) - for machine learning
+* [AMD GPU support](https://minikube.sigs.k8s.io/docs/tutorials/amd/) - for machine learning
 * [Filesystem mounts](https://minikube.sigs.k8s.io/docs/handbook/mount/)
 
 **For more information, see the official [minikube website](https://minikube.sigs.k8s.io)**
diff --git a/cmd/minikube/cmd/start.go b/cmd/minikube/cmd/start.go
index cb585ba73173..c31dd21f43ad 100644
--- a/cmd/minikube/cmd/start.go
+++ b/cmd/minikube/cmd/start.go
@@ -1462,8 +1462,8 @@ func validateGPUs(value, drvName, rtime string) error {
 	if err := validateGPUsArch(); err != nil {
 		return err
 	}
-	if value != "nvidia" && value != "all" {
-		return errors.Errorf(`The gpus flag must be passed a value of "nvidia" or "all"`)
+	if value != "nvidia" && value != "all" && value != "amd" {
+		return errors.Errorf(`The gpus flag must be passed a value of "nvidia", "amd" or "all"`)
 	}
 	if drvName == constants.Docker && (rtime == constants.Docker || rtime == constants.DefaultContainerRuntime) {
 		return nil
diff --git a/cmd/minikube/cmd/start_flags.go b/cmd/minikube/cmd/start_flags.go
index 5f286db46ea3..aac3a8a270b9 100644
--- a/cmd/minikube/cmd/start_flags.go
+++ b/cmd/minikube/cmd/start_flags.go
@@ -206,7 +206,7 @@ func initMinikubeFlags() {
 	startCmd.Flags().Bool(disableOptimizations, false, "If set, disables optimizations that are set for local Kubernetes. Including decreasing CoreDNS replicas from 2 to 1. Defaults to false.")
 	startCmd.Flags().Bool(disableMetrics, false, "If set, disables metrics reporting (CPU and memory usage), this can improve CPU usage. Defaults to false.")
 	startCmd.Flags().String(staticIP, "", "Set a static IP for the minikube cluster, the IP must be: private, IPv4, and the last octet must be between 2 and 254, for example 192.168.200.200 (Docker and Podman drivers only)")
-	startCmd.Flags().StringP(gpus, "g", "", "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)")
+	startCmd.Flags().StringP(gpus, "g", "", "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)")
 	startCmd.Flags().Duration(autoPauseInterval, time.Minute*1, "Duration of inactivity before the minikube VM is paused (default 1m0s)")
 }
diff --git a/cmd/minikube/cmd/start_test.go b/cmd/minikube/cmd/start_test.go
index c19e151d3c47..6f468b632689 100644
--- a/cmd/minikube/cmd/start_test.go
+++ b/cmd/minikube/cmd/start_test.go
@@ -814,7 +814,10 @@ func TestValidateGPUs(t *testing.T) {
 		{"nvidia", "docker", "", ""},
 		{"all", "kvm", "docker", "The gpus flag can only be used with the docker driver and docker container-runtime"},
 		{"nvidia", "docker", "containerd", "The gpus flag can only be used with the docker driver and docker container-runtime"},
-		{"cat", "docker", "docker", `The gpus flag must be passed a value of "nvidia" or "all"`},
+		{"cat", "docker", "docker", `The gpus flag must be passed a value of "nvidia", "amd" or "all"`},
+		{"amd", "docker", "docker", ""},
+		{"amd", "docker", "", ""},
+		{"amd", "docker", "containerd", "The gpus flag can only be used with the docker driver and docker container-runtime"},
 	}
 
 	for _, tc := range tests {
diff --git a/deploy/addons/assets.go b/deploy/addons/assets.go
index 757a9e5ca83f..f1db9821c51a 100644
--- a/deploy/addons/assets.go
+++ b/deploy/addons/assets.go
@@ -107,6 +107,10 @@
 	//go:embed gpu/nvidia-gpu-device-plugin.yaml.tmpl
 	NvidiaGpuDevicePluginAssets embed.FS
 
+	// AmdGpuDevicePluginAssets assets for amd-gpu-device-plugin addon
+	//go:embed gpu/amd-gpu-device-plugin.yaml.tmpl
+	AmdGpuDevicePluginAssets embed.FS
+
 	// LogviewerAssets assets for logviewer addon
 	//go:embed logviewer/*.tmpl logviewer/*.yaml
 	LogviewerAssets embed.FS
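Editor's note (not part of the diff): the `deploy/addons/assets.go` hunk embeds the new manifest template at compile time with `//go:embed`, and the embedded filesystem is what `MustBinAsset` is later given to read from. A minimal standalone sketch of that pattern, assuming a local `gpu/amd-gpu-device-plugin.yaml.tmpl` file exists next to the source:

```go
// Minimal sketch of the //go:embed pattern used by deploy/addons/assets.go.
// Assumes gpu/amd-gpu-device-plugin.yaml.tmpl exists next to this file.
package main

import (
	"embed"
	"fmt"
	"log"
)

// AmdGpuDevicePluginAssets embeds the addon manifest template into the binary.
//
//go:embed gpu/amd-gpu-device-plugin.yaml.tmpl
var AmdGpuDevicePluginAssets embed.FS

func main() {
	b, err := AmdGpuDevicePluginAssets.ReadFile("gpu/amd-gpu-device-plugin.yaml.tmpl")
	if err != nil {
		log.Fatalf("reading embedded template: %v", err)
	}
	fmt.Printf("embedded template is %d bytes\n", len(b))
}
```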
diff --git a/deploy/addons/gpu/amd-gpu-device-plugin.yaml.tmpl b/deploy/addons/gpu/amd-gpu-device-plugin.yaml.tmpl
new file mode 100644
index 000000000000..12bffa56a350
--- /dev/null
+++ b/deploy/addons/gpu/amd-gpu-device-plugin.yaml.tmpl
@@ -0,0 +1,60 @@
+# Copyright 2024 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: amd-gpu-device-plugin
+  namespace: kube-system
+  labels:
+    k8s-app: amd-gpu-device-plugin
+    kubernetes.io/minikube-addons: amd-gpu-device-plugin
+    addonmanager.kubernetes.io/mode: Reconcile
+spec:
+  selector:
+    matchLabels:
+      k8s-app: amd-gpu-device-plugin
+  template:
+    metadata:
+      labels:
+        name: amd-gpu-device-plugin
+        k8s-app: amd-gpu-device-plugin
+    spec:
+      nodeSelector:
+        kubernetes.io/arch: amd64
+      priorityClassName: system-node-critical
+      tolerations:
+        - key: CriticalAddonsOnly
+          operator: Exists
+      volumes:
+        - name: dp
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
+        - name: sys
+          hostPath:
+            path: /sys
+      containers:
+        - image: {{.CustomRegistries.AmdDevicePlugin | default .ImageRepository | default .Registries.AmdDevicePlugin }}{{.Images.AmdDevicePlugin}}
+          name: amd-gpu-device-plugin
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: dp
+              mountPath: /var/lib/kubelet/device-plugins
+            - name: sys
+              mountPath: /sys
+  updateStrategy:
+    type: RollingUpdate
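Editor's note (not part of the diff): the `image:` line above chains the per-addon registry override, the global `--image-repository`, and the built-in registry via a `default` template helper. A rough standalone sketch of how that line resolves; the `default` function is re-implemented here for illustration (in minikube it comes from the addon templating layer), and the trailing slash on the registry value is an assumption of this sketch, standing in for however minikube normalizes registry prefixes:

```go
// Rough sketch of resolving the image line in amd-gpu-device-plugin.yaml.tmpl.
package main

import (
	"log"
	"os"
	"text/template"
)

type addonContext struct {
	CustomRegistries map[string]string // per-addon overrides (e.g. from --registries)
	ImageRepository  string            // global --image-repository override
	Registries       map[string]string // defaults registered in assets/addons.go
	Images           map[string]string // name:tag@sha values from assets/addons.go
}

func main() {
	tmpl := template.Must(template.New("image").Funcs(template.FuncMap{
		// default returns fallback when the piped value is empty.
		"default": func(fallback, value string) string {
			if value == "" {
				return fallback
			}
			return value
		},
	}).Parse(`{{.CustomRegistries.AmdDevicePlugin | default .ImageRepository | default .Registries.AmdDevicePlugin }}{{.Images.AmdDevicePlugin}}`))

	ctx := addonContext{
		CustomRegistries: map[string]string{},
		Registries:       map[string]string{"AmdDevicePlugin": "docker.io/"}, // trailing slash assumed for this sketch
		Images:           map[string]string{"AmdDevicePlugin": "rocm/k8s-device-plugin:1.25.2.8@sha256:f3835498cf2274e0a07c32b38c166c05a876f8eb776d756cc06805e599a3ba5f"},
	}
	if err := tmpl.Execute(os.Stdout, ctx); err != nil {
		log.Fatal(err)
	}
}
```

With no overrides set, the sketch prints the fully qualified image the DaemonSet would pull from docker.io.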
diff --git a/hack/update/amd_device_plugin_version/update_amd_device_plugin_version.go b/hack/update/amd_device_plugin_version/update_amd_device_plugin_version.go
new file mode 100644
index 000000000000..ea90ee2eb429
--- /dev/null
+++ b/hack/update/amd_device_plugin_version/update_amd_device_plugin_version.go
@@ -0,0 +1,57 @@
+/*
+Copyright 2024 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"k8s.io/klog/v2"
+	"k8s.io/minikube/hack/update"
+)
+
+var schema = map[string]update.Item{
+	"pkg/minikube/assets/addons.go": {
+		Replace: map[string]string{
+			`rocm/k8s-device-plugin:.*`: `rocm/k8s-device-plugin:{{.Version}}@{{.SHA}}",`,
+		},
+	},
+}
+
+type Data struct {
+	Version string
+	SHA     string
+}
+
+func main() {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+
+	stable, _, _, err := update.GHReleases(ctx, "ROCm", "k8s-device-plugin")
+	if err != nil {
+		klog.Fatalf("Unable to get stable version: %v", err)
+	}
+	sha, err := update.GetImageSHA(fmt.Sprintf("rocm/k8s-device-plugin:%s", stable.Tag))
+	if err != nil {
+		klog.Fatalf("failed to get image SHA: %v", err)
+	}
+
+	data := Data{Version: stable.Tag, SHA: sha}
+
+	update.Apply(schema, data)
+}
diff --git a/hack/update/get_version/get_version.go b/hack/update/get_version/get_version.go
index ea3892c0843b..383feae303a6 100644
--- a/hack/update/get_version/get_version.go
+++ b/hack/update/get_version/get_version.go
@@ -33,6 +33,7 @@ type dependency struct {
 }
 
 var dependencies = map[string]dependency{
+	"amd-gpu-device-plugin": {addonsFile, `rocm/k8s-device-plugin:(.*)@`},
 	"buildkit": {"deploy/iso/minikube-iso/arch/x86_64/package/buildkit-bin/buildkit-bin.mk", `BUILDKIT_BIN_VERSION = (.*)`},
 	"calico": {"pkg/minikube/bootstrapper/images/images.go", `calicoVersion = "(.*)"`},
 	"cilium": {"pkg/minikube/cni/cilium.yaml", `quay.io/cilium/cilium:(.*)@`},
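Editor's note (not part of the diff): the update script rewrites the pinned image line in `pkg/minikube/assets/addons.go` with a regex replace, and `get_version.go` later reads the current version back out with a capture group. A rough standalone sketch of both regexes applied to that line; the starting tag/SHA below are placeholders for an older pin, and `update.Apply`, `GHReleases`, and `GetImageSHA` are not reproduced here:

```go
// Rough sketch of the two regexes added by this PR, applied to the addons.go image line.
package main

import (
	"bytes"
	"fmt"
	"regexp"
	"text/template"
)

func main() {
	// Placeholder "old" line standing in for the previous pin in assets/addons.go.
	line := `"AmdDevicePlugin": "rocm/k8s-device-plugin:1.25.2.7@sha256:0000000000000000000000000000000000000000000000000000000000000000",`

	// 1) The bump: render the schema's replacement string, then apply it as a regex replace.
	repl := template.Must(template.New("r").Parse(`rocm/k8s-device-plugin:{{.Version}}@{{.SHA}}",`))
	var buf bytes.Buffer
	if err := repl.Execute(&buf, struct{ Version, SHA string }{
		Version: "1.25.2.8",
		SHA:     "sha256:f3835498cf2274e0a07c32b38c166c05a876f8eb776d756cc06805e599a3ba5f",
	}); err != nil {
		panic(err)
	}
	bumped := regexp.MustCompile(`rocm/k8s-device-plugin:.*`).ReplaceAllString(line, buf.String())
	fmt.Println(bumped)

	// 2) The read-back: the get_version.go dependency entry captures the tag before the "@".
	m := regexp.MustCompile(`rocm/k8s-device-plugin:(.*)@`).FindStringSubmatch(bumped)
	fmt.Println("current version:", m[1]) // prints: current version: 1.25.2.8
}
```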
diff --git a/pkg/addons/config.go b/pkg/addons/config.go
index 9bbafc183833..63e26f436d8e 100644
--- a/pkg/addons/config.go
+++ b/pkg/addons/config.go
@@ -131,6 +131,11 @@
 		validations: []setFn{isKVMDriverForNVIDIA},
 		callbacks:   []setFn{EnableOrDisableAddon},
 	},
+	{
+		name:      "amd-gpu-device-plugin",
+		set:       SetBool,
+		callbacks: []setFn{EnableOrDisableAddon},
+	},
 	{
 		name: "olm",
 		set:  SetBool,
diff --git a/pkg/drivers/kic/oci/oci.go b/pkg/drivers/kic/oci/oci.go
index 3fcbe63b3251..5e761db72421 100644
--- a/pkg/drivers/kic/oci/oci.go
+++ b/pkg/drivers/kic/oci/oci.go
@@ -190,8 +190,14 @@ func CreateContainerNode(p CreateParams) error { //nolint to suppress cyclomatic
 		runArgs = append(runArgs, "--network", p.Network)
 		runArgs = append(runArgs, "--ip", p.IP)
 	}
-	if p.GPUs != "" {
+
+	if p.GPUs == "all" || p.GPUs == "nvidia" {
 		runArgs = append(runArgs, "--gpus", "all", "--env", "NVIDIA_DRIVER_CAPABILITIES=all")
+	} else if p.GPUs == "amd" {
+		/* https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html
+		 * "--security-opt seccomp=unconfined" is also required but included above.
+		 */
+		runArgs = append(runArgs, "--device", "/dev/kfd", "--device", "/dev/dri", "--group-add", "video", "--group-add", "render")
 	}
 
 	memcgSwap := hasMemorySwapCgroup()
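Editor's note (not part of the diff): the kic driver now branches on the `--gpus` value when assembling the `docker run` arguments for the node container. A rough sketch of the two flag sets, mirroring the branch added in `pkg/drivers/kic/oci/oci.go`; per the comment in the diff, `--security-opt seccomp=unconfined` is already supplied earlier in `CreateContainerNode`, so it is not repeated here:

```go
// Rough sketch of the GPU-related docker run arguments per --gpus value.
package main

import (
	"fmt"
	"strings"
)

// gpuRunArgs returns the extra container-create arguments for a --gpus value.
func gpuRunArgs(gpus string) []string {
	switch gpus {
	case "all", "nvidia":
		// Hand the whole GPU set to the NVIDIA container runtime.
		return []string{"--gpus", "all", "--env", "NVIDIA_DRIVER_CAPABILITIES=all"}
	case "amd":
		// ROCm workloads need the kfd/dri device nodes plus the video and render groups.
		return []string{"--device", "/dev/kfd", "--device", "/dev/dri", "--group-add", "video", "--group-add", "render"}
	default:
		return nil
	}
}

func main() {
	for _, v := range []string{"nvidia", "amd", ""} {
		fmt.Printf("--gpus %-8q -> docker run %s ...\n", v, strings.Join(gpuRunArgs(v), " "))
	}
}
```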
diff --git a/pkg/drivers/kic/oci/types.go b/pkg/drivers/kic/oci/types.go
index 3cb6ec483f53..894679c4e667 100644
--- a/pkg/drivers/kic/oci/types.go
+++ b/pkg/drivers/kic/oci/types.go
@@ -61,7 +61,7 @@ type CreateParams struct {
 	OCIBinary string // docker or podman
 	Network   string // network name that the container will attach to
 	IP        string // static IP to assign the container in the cluster network
-	GPUs      string // add NVIDIA GPU devices to the container
+	GPUs      string // add GPU devices to the container
 }
 
 // createOpt is an option for Create
diff --git a/pkg/drivers/kic/types.go b/pkg/drivers/kic/types.go
index 5020be74f705..b581c93b128a 100644
--- a/pkg/drivers/kic/types.go
+++ b/pkg/drivers/kic/types.go
@@ -69,5 +69,5 @@ type Config struct {
 	StaticIP      string   // static IP for the kic cluster
 	ExtraArgs     []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080...
 	ListenAddress string   // IP Address to listen to
-	GPUs          string   // add NVIDIA GPU devices to the container
+	GPUs          string   // add GPU devices to the container
 }
diff --git a/pkg/minikube/assets/addons.go b/pkg/minikube/assets/addons.go
index 7bdc0a850b3e..586b5a87e8ed 100644
--- a/pkg/minikube/assets/addons.go
+++ b/pkg/minikube/assets/addons.go
@@ -487,6 +487,17 @@
 	}, map[string]string{
 		"NvidiaDevicePlugin": "registry.k8s.io",
 	}),
+	"amd-gpu-device-plugin": NewAddon([]*BinAsset{
+		MustBinAsset(addons.AmdGpuDevicePluginAssets,
+			"gpu/amd-gpu-device-plugin.yaml.tmpl",
+			vmpath.GuestAddonsDir,
+			"amd-gpu-device-plugin.yaml",
+			"0640"),
+	}, false, "amd-gpu-device-plugin", "3rd party (AMD)", "", "https://minikube.sigs.k8s.io/docs/tutorials/amd/", map[string]string{
+		"AmdDevicePlugin": "rocm/k8s-device-plugin:1.25.2.8@sha256:f3835498cf2274e0a07c32b38c166c05a876f8eb776d756cc06805e599a3ba5f",
+	}, map[string]string{
+		"AmdDevicePlugin": "docker.io",
+	}),
 	"logviewer": NewAddon([]*BinAsset{
 		MustBinAsset(addons.LogviewerAssets,
 			"logviewer/logviewer-dp-and-svc.yaml.tmpl",
diff --git a/pkg/minikube/cruntime/cruntime.go b/pkg/minikube/cruntime/cruntime.go
index d96e230a252b..8b06f042d9d0 100644
--- a/pkg/minikube/cruntime/cruntime.go
+++ b/pkg/minikube/cruntime/cruntime.go
@@ -156,7 +156,7 @@ type Config struct {
 	// InsecureRegistry list of insecure registries
 	InsecureRegistry []string
 	// GPUs add GPU devices to the container
-	GPUs bool
+	GPUs string
 }
 
 // ListContainersOptions are the options to use for listing containers
diff --git a/pkg/minikube/cruntime/docker.go b/pkg/minikube/cruntime/docker.go
index 4c27c11dcfb0..2a4dd310da5e 100644
--- a/pkg/minikube/cruntime/docker.go
+++ b/pkg/minikube/cruntime/docker.go
@@ -75,7 +75,7 @@ type Docker struct {
 	Init       sysinit.Manager
 	UseCRI     bool
 	CRIService string
-	GPUs       bool
+	GPUs       string
 }
 
 // Name is a human readable name for Docker
@@ -580,13 +580,17 @@
 		},
 		StorageDriver: "overlay2",
 	}
-	if r.GPUs {
+
+	if r.GPUs == "all" || r.GPUs == "nvidia" {
 		assets.Addons["nvidia-device-plugin"].EnableByDefault()
 		daemonConfig.DefaultRuntime = "nvidia"
 		runtimes := &dockerDaemonRuntimes{}
 		runtimes.Nvidia.Path = "/usr/bin/nvidia-container-runtime"
 		daemonConfig.Runtimes = runtimes
+	} else if r.GPUs == "amd" {
+		assets.Addons["amd-gpu-device-plugin"].EnableByDefault()
 	}
+
 	daemonConfigBytes, err := json.Marshal(daemonConfig)
 	if err != nil {
 		return err
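Editor's note (not part of the diff): `configureDocker` treats the two GPU vendors differently — for `nvidia`/`all` it switches the Docker daemon's default runtime and enables the nvidia addon, while for `amd` it only enables the amd-gpu-device-plugin addon and adds no GPU-specific daemon settings (the device nodes were already passed to the node container). A rough sketch with a trimmed stand-in for minikube's daemon-config type:

```go
// Rough sketch of what configureDocker changes per --gpus value (reduced stand-in types).
package main

import (
	"encoding/json"
	"fmt"
)

type daemonConfig struct {
	DefaultRuntime string                       `json:"default-runtime,omitempty"`
	Runtimes       map[string]map[string]string `json:"runtimes,omitempty"`
	StorageDriver  string                       `json:"storage-driver"`
}

func render(gpus string) (daemonConfig, []string) {
	cfg := daemonConfig{StorageDriver: "overlay2"}
	var addonsEnabled []string
	switch gpus {
	case "all", "nvidia":
		addonsEnabled = append(addonsEnabled, "nvidia-device-plugin")
		cfg.DefaultRuntime = "nvidia"
		cfg.Runtimes = map[string]map[string]string{"nvidia": {"path": "/usr/bin/nvidia-container-runtime"}}
	case "amd":
		// No daemon changes needed; only the addon is flipped on by default.
		addonsEnabled = append(addonsEnabled, "amd-gpu-device-plugin")
	}
	return cfg, addonsEnabled
}

func main() {
	for _, v := range []string{"nvidia", "amd"} {
		cfg, addons := render(v)
		b, _ := json.Marshal(cfg)
		fmt.Printf("--gpus %s: daemon.json=%s addons=%v\n", v, b, addons)
	}
}
```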
diff --git a/pkg/minikube/node/start.go b/pkg/minikube/node/start.go
index ccf6e830ed10..75de4f1c3f4a 100755
--- a/pkg/minikube/node/start.go
+++ b/pkg/minikube/node/start.go
@@ -419,7 +419,7 @@ func configureRuntimes(runner cruntime.CommandRunner, cc config.ClusterConfig, k
 		InsecureRegistry: cc.InsecureRegistry,
 	}
 	if cc.GPUs != "" {
-		co.GPUs = true
+		co.GPUs = cc.GPUs
 	}
 	cr, err := cruntime.New(co)
 	if err != nil {
diff --git a/site/content/en/docs/commands/start.md b/site/content/en/docs/commands/start.md
index 43f200d4598d..06308e92ea37 100644
--- a/site/content/en/docs/commands/start.md
+++ b/site/content/en/docs/commands/start.md
@@ -57,7 +57,7 @@ minikube start [flags]
       --feature-gates string              A set of key=value pairs that describe feature gates for alpha/experimental features.
       --force                             Force minikube to perform possibly dangerous operations
       --force-systemd                     If set, force the container runtime to use systemd as cgroup manager. Defaults to false.
-  -g, --gpus string                       Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)
+  -g, --gpus string                       Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)
       --ha                                Create Highly Available Multi-Control Plane Cluster with a minimum of three control-plane nodes that will also be marked for work.
       --host-dns-resolver                 Enable host resolver for NAT DNS requests (virtualbox driver only) (default true)
       --host-only-cidr string             The CIDR to be used for the minikube VM (virtualbox driver only) (default "192.168.59.1/24")
diff --git a/site/content/en/docs/contrib/tests.en.md b/site/content/en/docs/contrib/tests.en.md
index 073824364a30..e3b78a4d2e96 100644
--- a/site/content/en/docs/contrib/tests.en.md
+++ b/site/content/en/docs/contrib/tests.en.md
@@ -65,6 +65,9 @@ tests disabling an addon on a non-existing cluster
 #### validateNvidiaDevicePlugin
 tests the nvidia-device-plugin addon by ensuring the pod comes up and the addon disables
 
+#### validateAmdGpuDevicePlugin
+tests the amd-gpu-device-plugin addon by ensuring the pod comes up and the addon disables
+
 #### validateYakdAddon
 
 ## TestCertOptions
diff --git a/site/content/en/docs/tutorials/amd.md b/site/content/en/docs/tutorials/amd.md
new file mode 100644
index 000000000000..8760e6762a78
--- /dev/null
+++ b/site/content/en/docs/tutorials/amd.md
@@ -0,0 +1,114 @@
+---
+title: "Using AMD GPUs with minikube"
+linkTitle: "Using AMD GPUs with minikube"
+weight: 1
+date: 2024-10-04
+---
+
+This tutorial shows how to start minikube with support for AMD GPUs.
+
+Support is provided by the [AMD GPU device plugin for Kubernetes](https://github.com/ROCm/k8s-device-plugin).
+
+
+## Prerequisites
+
+- Linux
+- AMD GPU drivers 6.2.1 or newer
+- minikube v1.35.0 or later (docker driver only)
+
+## Using the docker driver
+
+- Ensure you have an AMD driver installed. You can check whether one is installed by running `rocminfo`; if not, follow the [Radeon™ Driver Installation Guide](https://amdgpu-install.readthedocs.io/en/latest/)
+
+- Delete existing minikube (optional)
+
+  If you have an existing minikube instance, you may need to delete it if it was built before installing the AMD drivers.
+  ```shell
+  minikube delete
+  ```
+
+- Start minikube:
+  ```shell
+  minikube start --driver docker --container-runtime docker --gpus amd
+  ```
+
+## Verifying the GPU is available
+
+Verify that the AMD GPUs are available to the cluster.
+
+1. Create the following Job:
+
+   ```shell
+   cat <<'EOF' | kubectl apply -f -
+   apiVersion: batch/v1
+   kind: Job
+   metadata:
+     name: amd-gpu-check
+     labels:
+       purpose: amd-gpu-check
+   spec:
+     ttlSecondsAfterFinished: 100
+     template:
+       spec:
+         restartPolicy: Never
+         securityContext:
+           supplementalGroups:
+             - 44
+             - 110
+         containers:
+           - name: amd-gpu-checker
+             image: rocm/rocm-terminal
+             workingDir: /root
+             command: ["rocminfo"]
+             args: []
+             resources:
+               limits:
+                 amd.com/gpu: 1 # requesting a GPU
+   EOF
+   ```
+
+2. Check that the Job output (`kubectl logs jobs/amd-gpu-check`) looks something like the following:
+
+   ```plain
+   ROCk module version 6.8.5 is loaded
+   =====================
+   HSA System Attributes
+   =====================
+   Runtime Version:         1.14
+   Runtime Ext Version:     1.6
+   System Timestamp Freq.:  1000.000000MHz
+   Sig. Max Wait Duration:  18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)
+   Machine Model:           LARGE
+   System Endianness:       LITTLE
+   Mwaitx:                  DISABLED
+   DMAbuf Support:          YES
+
+   ==========
+   HSA Agents
+   ==========
+   *******
+   Agent 1
+   *******
+     Name:                    AMD Ryzen 7 7840U w/ Radeon 780M Graphics
+     Uuid:                    CPU-XX
+   ...
+   ```
+
+## Where can I learn more about GPU passthrough?
+
+See the excellent documentation at
+<https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF>
+
+## Why does minikube not support AMD GPUs on Windows?
+
+minikube supports Windows hosts through Hyper-V or VirtualBox.
+
+- VirtualBox doesn't support PCI passthrough for [Windows
+  host](https://www.virtualbox.org/manual/ch09.html#pcipassthrough).
+
+- Hyper-V supports DDA (discrete device assignment) but [only for Windows Server
+  2016](https://docs.microsoft.com/en-us/windows-server/virtualization/hyper-v/plan/plan-for-deploying-devices-using-discrete-device-assignment)
+
+Since the only possibility of supporting GPUs on minikube on Windows is on a
+server OS where users don't usually run minikube, we haven't invested time in
+trying to support GPUs on minikube on Windows.
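Editor's note (not part of the diff): as an alternative to the kubectl-based check in the tutorial above, the advertised capacity can also be read programmatically. A rough sketch using client-go that lists nodes and prints their `amd.com/gpu` allocatable count; it assumes a reachable kubeconfig at the default location:

```go
// Rough sketch: print each node's amd.com/gpu allocatable count via client-go.
package main

import (
	"context"
	"fmt"
	"log"
	"path/filepath"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/client-go/util/homedir"
)

func main() {
	kubeconfig := filepath.Join(homedir.HomeDir(), ".kube", "config")
	cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
	if err != nil {
		log.Fatal(err)
	}
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		log.Fatal(err)
	}
	nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
	if err != nil {
		log.Fatal(err)
	}
	for _, n := range nodes.Items {
		// The device plugin advertises GPUs under the amd.com/gpu resource name.
		gpus := n.Status.Allocatable[corev1.ResourceName("amd.com/gpu")]
		fmt.Printf("%s: amd.com/gpu allocatable = %s\n", n.Name, gpus.String())
	}
}
```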
diff --git a/test/integration/addons_test.go b/test/integration/addons_test.go
index 9bf9d3a2def5..e9ec27e75f83 100644
--- a/test/integration/addons_test.go
+++ b/test/integration/addons_test.go
@@ -100,7 +100,7 @@ func TestAddons(t *testing.T) {
 	// so we override that here to let minikube auto-detect appropriate cgroup driver
 	os.Setenv(constants.MinikubeForceSystemdEnv, "")
 
-	args := append([]string{"start", "-p", profile, "--wait=true", "--memory=4000", "--alsologtostderr", "--addons=registry", "--addons=metrics-server", "--addons=volumesnapshots", "--addons=csi-hostpath-driver", "--addons=gcp-auth", "--addons=cloud-spanner", "--addons=inspektor-gadget", "--addons=nvidia-device-plugin", "--addons=yakd", "--addons=volcano"}, StartArgs()...)
+	args := append([]string{"start", "-p", profile, "--wait=true", "--memory=4000", "--alsologtostderr", "--addons=registry", "--addons=metrics-server", "--addons=volumesnapshots", "--addons=csi-hostpath-driver", "--addons=gcp-auth", "--addons=cloud-spanner", "--addons=inspektor-gadget", "--addons=nvidia-device-plugin", "--addons=yakd", "--addons=volcano", "--addons=amd-gpu-device-plugin"}, StartArgs()...)
 	if !NoneDriver() {
 		args = append(args, "--addons=ingress", "--addons=ingress-dns", "--addons=storage-provisioner-rancher")
 	}
@@ -115,12 +115,13 @@
 		t.Fatalf("Failed setup for addon tests")
 	}
 
+	type TestCase = struct {
+		name      string
+		validator validateFunc
+	}
 	// Run tests in serial to avoid collision
 	t.Run("serial", func(t *testing.T) {
-		tests := []struct {
-			name      string
-			validator validateFunc
-		}{
+		tests := []TestCase{
 			{"Volcano", validateVolcanoAddon},
 			{"GCPAuth", validateGCPAuthAddon},
 		}
@@ -137,10 +138,7 @@
 
 	// Parallelized tests
 	t.Run("parallel", func(t *testing.T) {
-		tests := []struct {
-			name      string
-			validator validateFunc
-		}{
+		tests := []TestCase{
 			{"Registry", validateRegistryAddon},
 			{"Ingress", validateIngressAddon},
 			{"InspektorGadget", validateInspektorGadgetAddon},
@@ -152,7 +150,9 @@
 			{"LocalPath", validateLocalPathAddon},
 			{"NvidiaDevicePlugin", validateNvidiaDevicePlugin},
 			{"Yakd", validateYakdAddon},
+			{"AmdGpuDevicePlugin", validateAmdGpuDevicePlugin},
 		}
+
 		for _, tc := range tests {
 			tc := tc
 			if ctx.Err() == context.DeadlineExceeded {
@@ -962,6 +962,19 @@ func validateNvidiaDevicePlugin(ctx context.Context, t *testing.T, profile strin
 	}
 }
 
+// validateAmdGpuDevicePlugin tests the amd-gpu-device-plugin addon by ensuring the pod comes up and the addon disables
+func validateAmdGpuDevicePlugin(ctx context.Context, t *testing.T, profile string) {
+	if !(DockerDriver() && amd64Platform()) {
+		t.Skipf("skip amd gpu test on all but docker driver and amd64 platform")
+	}
+	defer disableAddon(t, "amd-gpu-device-plugin", profile)
+	defer PostMortemLogs(t, profile)
+
+	if _, err := PodWait(ctx, t, profile, "kube-system", "name=amd-gpu-device-plugin", Minutes(6)); err != nil {
+		t.Fatalf("failed waiting for amd-gpu-device-plugin pod: %v", err)
+	}
+}
+
 func validateYakdAddon(ctx context.Context, t *testing.T, profile string) {
 	defer disableAddon(t, "yakd", profile)
 	defer PostMortemLogs(t, profile)
diff --git a/test/integration/main_test.go b/test/integration/main_test.go
index 051c302b2951..895d15fefa46 100644
--- a/test/integration/main_test.go
+++ b/test/integration/main_test.go
@@ -180,6 +180,11 @@ func arm64Platform() bool {
 	return runtime.GOARCH == "arm64"
 }
 
+// amd64Platform returns true if running on amd64/* platform
+func amd64Platform() bool {
+	return runtime.GOARCH == "amd64"
+}
+
 // NeedsPortForward returns access to endpoints with this driver needs port forwarding
 // (Docker on non-Linux platforms and rootless KIC requires ports to be forwarded to 127.0.0.1)
 func NeedsPortForward() bool {
diff --git a/translations/de.json b/translations/de.json
index 1d5c00a83940..a391325b0fe0 100644
--- a/translations/de.json
+++ b/translations/de.json
@@ -65,7 +65,7 @@
     "After the addon is enabled, please run \"minikube tunnel\" and your ingress resources would be available at \"127.0.0.1\"": "Nachdem das Addon aktiviert wurde, führen Sie bitte \"minikube tunnel\" aus, dann sind ihre Resourcen über \"127.0.0.1\" erreichbar",
     "Aliases": "Aliase",
    "All existing scheduled stops cancelled": "Alle derzeit existierenden und geplanten Stops wurden storniert.",
-    "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)": "Erlaube PODs auf die NVIDIA Grafikkarten zuzugreifen. Mögliche Optionen: [all,nvidia] (nur für Docker Treiber mit Docker Container Runtime)",
+    "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)": "Erlaube PODs auf die Grafikkarten zuzugreifen. Mögliche Optionen: [all,nvidia,amd] (nur für Docker Treiber mit Docker Container Runtime)",
     "Allow user prompts for more information": "Benutzer-Eingabeaufforderungen für zusätzliche Informationen zulassen",
     "Alternative image repository to pull docker images from. This can be used when you have limited access to gcr.io. Set it to \"auto\" to let minikube decide one for you. For Chinese mainland users, you may use local gcr.io mirrors such as registry.cn-hangzhou.aliyuncs.com/google_containers": "Alternatives Bild-Repository zum Abrufen von Docker-Images. Dies ist hilfreich, wenn Sie nur eingeschränkten Zugriff auf gcr.io haben. Stellen Sie \"auto\" ein, dann wählt minikube eins für sie aus. Nutzer vom chinesischen Festland können einen lokalen gcr.io-Mirror wie registry.cn-hangzhou.aliyuncs.com/google_containers verwenden.",
     "Alternatively you could install one of these drivers:": "Alternativ könnten Sie einen dieser Treiber installieren:",
diff --git a/translations/es.json b/translations/es.json
index 011a48d60a5e..46f4a453cd92 100644
--- a/translations/es.json
+++ b/translations/es.json
@@ -66,7 +66,7 @@
     "After the addon is enabled, please run \"minikube tunnel\" and your ingress resources would be available at \"127.0.0.1\"": "",
     "Aliases": "Aliases",
     "All existing scheduled stops cancelled": "",
-    "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)": "",
+    "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)": "",
     "Allow user prompts for more information": "Permitir que el usuario solicite más información",
     "Alternative image repository to pull docker images from. This can be used when you have limited access to gcr.io. Set it to \"auto\" to let minikube decide one for you. For Chinese mainland users, you may use local gcr.io mirrors such as registry.cn-hangzhou.aliyuncs.com/google_containers": "Repositorio de imágenes alternativo del que extraer imágenes de Docker. Puedes usarlo cuando tengas acceso limitado a gcr.io. Si quieres que minikube elija uno por ti, solo tienes que definir el valor como \"auto\". Los usuarios de China continental pueden utilizar réplicas locales de gcr.io, como registry.cn-hangzhou.aliyuncs.com/google_containers",
     "Alternatively you could install one of these drivers:": "Alternativamente, puede installar uno de estos drivers:",
diff --git a/translations/fr.json b/translations/fr.json
index 08d6602559cd..a1f4f274cefc 100644
--- a/translations/fr.json
+++ b/translations/fr.json
@@ -68,7 +68,7 @@
     "After the addon is enabled, please run \"minikube tunnel\" and your ingress resources would be available at \"127.0.0.1\"": "Après que le module est activé, veuiller exécuter \"minikube tunnel\" et vos ressources ingress seront disponibles à \"127.0.0.1\"",
     "Aliases": "Alias",
     "All existing scheduled stops cancelled": "Tous les arrêts programmés existants annulés",
-    "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)": "Autorisez les pods à utiliser vos GPU NVIDIA. Les options incluent : [all,nvidia] (pilote Docker avec environnement d'exécution de conteneur Docker uniquement)",
+    "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)": "Autorisez les pods à utiliser vos GPU. Les options incluent : [all,nvidia,amd] (pilote Docker avec environnement d'exécution de conteneur Docker uniquement)",
     "Allow user prompts for more information": "Autoriser les utilisateurs à saisir plus d'informations",
     "Alternative image repository to pull docker images from. This can be used when you have limited access to gcr.io. Set it to \"auto\" to let minikube decide one for you. For Chinese mainland users, you may use local gcr.io mirrors such as registry.cn-hangzhou.aliyuncs.com/google_containers": "Autre dépôt d'images d'où extraire des images Docker. Il peut être utilisé en cas d'accès limité à gcr.io. Définissez-le sur \"auto\" pour permettre à minikube de choisir la valeur à votre place. Pour les utilisateurs situés en Chine continentale, vous pouvez utiliser des miroirs gcr.io locaux tels que registry.cn-hangzhou.aliyuncs.com/google_containers.",
     "Alternatively you could install one of these drivers:": "Vous pouvez également installer l'un de ces pilotes :",
diff --git a/translations/ja.json b/translations/ja.json
index db61e0358749..86400896c847 100644
--- a/translations/ja.json
+++ b/translations/ja.json
@@ -62,7 +62,7 @@
     "After the addon is enabled, please run \"minikube tunnel\" and your ingress resources would be available at \"127.0.0.1\"": "アドオンを有効にした後、「minikube tunnel」を実行することで、ingress リソースが「127.0.0.1」で利用可能になります",
     "Aliases": "エイリアス",
     "All existing scheduled stops cancelled": "既存のスケジュールされていたすべての停止がキャンセルされました",
-    "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)": "",
+    "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)": "",
     "Allow user prompts for more information": "ユーザーによる詳細情報の入力をできるようにします",
     "Alternative image repository to pull docker images from. This can be used when you have limited access to gcr.io. Set it to \"auto\" to let minikube decide one for you. For Chinese mainland users, you may use local gcr.io mirrors such as registry.cn-hangzhou.aliyuncs.com/google_containers": "Docker イメージを取得するための代替イメージリポジトリー。これは、gcr.io へのアクセスが制限されている場合に使用できます。これを「auto」に設定すると、minikube によって自動的に指定されるようになります。中国本土のユーザーの場合、registry.cn-hangzhou.aliyuncs.com/google_containers などのローカル gcr.io ミラーを使用できます",
     "Alternatively you could install one of these drivers:": "代わりに、これらのドライバーのいずれかをインストールすることもできます:",
diff --git a/translations/ko.json b/translations/ko.json
index 8410eec22702..40ff84cd2332 100644
--- a/translations/ko.json
+++ b/translations/ko.json
@@ -68,7 +68,7 @@
     "After the addon is enabled, please run \"minikube tunnel\" and your ingress resources would be available at \"127.0.0.1\"": "애드온이 활성화된 후 \"minikube tunnel\"을 실행하면 인그레스 리소스를 \"127.0.0.1\"에서 사용할 수 있습니다",
     "Aliases": "별칭",
     "All existing scheduled stops cancelled": "예정된 모든 중지 요청이 취소되었습니다",
-    "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)": "pod 가 NVIDIA GPU를 사용할 수 있도록 허용합니다. 옵션은 다음과 같습니다: [all,nvidia] (Docker 드라이버와 Docker 컨테이너 런타임만 해당)",
+    "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)": "pod 가 GPU를 사용할 수 있도록 허용합니다. 옵션은 다음과 같습니다: [all,nvidia,amd] (Docker 드라이버와 Docker 컨테이너 런타임만 해당)",
     "Allow user prompts for more information": "추가 정보를 위해 사용자 프롬프트를 허용합니다",
     "Alternative image repository to pull docker images from. This can be used when you have limited access to gcr.io. Set it to \"auto\" to let minikube decide one for you. For Chinese mainland users, you may use local gcr.io mirrors such as registry.cn-hangzhou.aliyuncs.com/google_containers": "도커 이미지를 가져올 대체 이미지 저장소입니다. gcr.io에 제한된 액세스 권한이 있는 경우 사용할 수 있습니다. \"auto\"로 설정하여 minikube가 대신 결정하도록 할 수 있습니다. 중국 본토 사용자는 registry.cn-hangzhou.aliyuncs.com/google_containers와 같은 로컬 gcr.io 미러를 사용할 수 있습니다",
     "Alternatively you could install one of these drivers:": "또는 다음 드라이버 중 하나를 설치할 수 있습니다:",
diff --git a/translations/pl.json b/translations/pl.json
index e65b15139be3..99db202497e1 100644
--- a/translations/pl.json
+++ b/translations/pl.json
@@ -65,7 +65,7 @@
     "After the addon is enabled, please run \"minikube tunnel\" and your ingress resources would be available at \"127.0.0.1\"": "Po włączeniu addona wykonaj komendę \"minikube tunnel\". Twoje zasoby będą dostępne pod adresem \"127.0.0.1\"",
     "Aliases": "Aliasy",
     "All existing scheduled stops cancelled": "Wszystkie zaplanowane zatrzymania zostały anulowane",
-    "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)": "",
+    "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)": "",
     "Allow user prompts for more information": "",
     "Alternative image repository to pull docker images from. This can be used when you have limited access to gcr.io. Set it to \"auto\" to let minikube decide one for you. For Chinese mainland users, you may use local gcr.io mirrors such as registry.cn-hangzhou.aliyuncs.com/google_containers": "",
     "Alternatively you could install one of these drivers:": "",
diff --git a/translations/ru.json b/translations/ru.json
index cef057cff443..33124128ed36 100644
--- a/translations/ru.json
+++ b/translations/ru.json
@@ -57,7 +57,7 @@
     "After the addon is enabled, please run \"minikube tunnel\" and your ingress resources would be available at \"127.0.0.1\"": "",
     "Aliases": "",
     "All existing scheduled stops cancelled": "",
-    "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)": "",
+    "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)": "",
     "Allow user prompts for more information": "",
     "Alternative image repository to pull docker images from. This can be used when you have limited access to gcr.io. Set it to \"auto\" to let minikube decide one for you. For Chinese mainland users, you may use local gcr.io mirrors such as registry.cn-hangzhou.aliyuncs.com/google_containers": "",
     "Alternatively you could install one of these drivers:": "",
diff --git a/translations/strings.txt b/translations/strings.txt
index 919c80475d23..a22ec186d6be 100644
--- a/translations/strings.txt
+++ b/translations/strings.txt
@@ -57,7 +57,7 @@
     "After the addon is enabled, please run \"minikube tunnel\" and your ingress resources would be available at \"127.0.0.1\"": "",
     "Aliases": "",
     "All existing scheduled stops cancelled": "",
-    "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)": "",
+    "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)": "",
     "Allow user prompts for more information": "",
     "Alternative image repository to pull docker images from. This can be used when you have limited access to gcr.io. Set it to \"auto\" to let minikube decide one for you. For Chinese mainland users, you may use local gcr.io mirrors such as registry.cn-hangzhou.aliyuncs.com/google_containers": "",
     "Alternatively you could install one of these drivers:": "",
diff --git a/translations/zh-CN.json b/translations/zh-CN.json
index a90cd39f4b71..3bf849bc5e00 100644
--- a/translations/zh-CN.json
+++ b/translations/zh-CN.json
@@ -75,7 +75,7 @@
     "After the addon is enabled, please run \"minikube tunnel\" and your ingress resources would be available at \"127.0.0.1\"": "插件启用后,请运行 \"minikube tunnel\" 您的 ingress 资源将在 \"127.0.0.1\"",
     "Aliases": "别名",
     "All existing scheduled stops cancelled": "取消所有已计划的停止",
-    "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)": "所有 pods 使用您的英伟达 GPUs。选项包括:[all,nvidia](仅支持Docker容器运行时的Docker驱动程序)",
+    "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)": "允许 pods 使用您的 GPUs。选项包括:[all,nvidia,amd](仅支持Docker容器运行时的Docker驱动程序)",
     "Allow user prompts for more information": "允许用户提示以获取更多信息",
     "Alternative image repository to pull docker images from. This can be used when you have limited access to gcr.io. Set it to \"auto\" to let minikube decide one for you. For Chinese mainland users, you may use local gcr.io mirrors such as registry.cn-hangzhou.aliyuncs.com/google_containers": "用于从中拉取 docker 镜像的备选镜像存储库。如果您对 gcr.io 的访问受到限制,则可以使用该镜像存储库。将镜像存储库设置为“auto”可让 minikube 为您选择一个存储库。对于中国大陆用户,您可以使用本地 gcr.io 镜像,例如 registry.cn-hangzhou.aliyuncs.com/google_containers",
     "Alternatively you could install one of these drivers:": "或者你也可以安装以下驱动程序:",