Skip to content

Commit

Permalink
Add support for AMD GPU via --gpus=amd for docker linux amd64.
Browse files Browse the repository at this point in the history
  • Loading branch information
fbyrne committed Oct 11, 2024
1 parent a46a49b commit f101a72
Show file tree
Hide file tree
Showing 35 changed files with 518 additions and 32 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/update-amd-gpu-device-plugin-version.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Scheduled job that runs the amd-gpu-device-plugin updater and, when the bump
# leaves changes in the working tree, opens a pull request against master from
# the minikube-bot fork.
name: "update-amd-gpu-device-plugin-version"
on:
  # Allow manual runs in addition to the weekly schedule.
  workflow_dispatch:
  schedule:
    # every Monday at around 3 am pacific/10 am UTC
    - cron: "0 10 * * 1"
env:
  GOPROXY: https://proxy.golang.org
  GO_VERSION: '1.23.0'
permissions:
  contents: read

jobs:
  bump-amd-gpu-device-plugin-version:
    runs-on: ubuntu-22.04
    steps:
      # Third-party actions are pinned to full commit SHAs.
      - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
      - uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32
        with:
          go-version: ${{env.GO_VERSION}}
      # Records the version before and after the bump, plus the resulting
      # `git status --porcelain` output, as step outputs for the PR step below.
      - name: Bump amd-gpu-device-plugin version
        id: bumpAmdDevicePlugin
        run: |
          echo "OLD_VERSION=$(DEP=amd-gpu-device-plugin make get-dependency-version)" >> "$GITHUB_OUTPUT"
          make update-amd-gpu-device-plugin-version
          echo "NEW_VERSION=$(DEP=amd-gpu-device-plugin make get-dependency-version)" >> "$GITHUB_OUTPUT"
          # The following is to support multiline with GITHUB_OUTPUT, see https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#multiline-strings
          echo "changes<<EOF" >> "$GITHUB_OUTPUT"
          echo "$(git status --porcelain)" >> "$GITHUB_OUTPUT"
          echo "EOF" >> "$GITHUB_OUTPUT"
      - name: Create PR
        # Only open a PR when the bump actually modified files.
        if: ${{ steps.bumpAmdDevicePlugin.outputs.changes != '' }}
        uses: peter-evans/create-pull-request@5e914681df9dc83aa4e4905692ca88beb2f9e91f
        with:
          token: ${{ secrets.MINIKUBE_BOT_PAT }}
          commit-message: 'Addon amd-gpu-device-plugin: Update amd/k8s-device-plugin image from ${{ steps.bumpAmdDevicePlugin.outputs.OLD_VERSION }} to ${{ steps.bumpAmdDevicePlugin.outputs.NEW_VERSION }}'
          # NOTE(review): the address below looks like an e-mail obfuscation
          # placeholder left by the page this was copied from; restore the real
          # bot address before using this file.
          committer: minikube-bot <[email protected]>
          author: minikube-bot <[email protected]>
          branch: auto_bump_amd_device_plugin_version
          push-to-fork: minikube-bot/minikube
          base: master
          delete-branch: true
          title: 'Addon amd-gpu-device-plugin: Update amd/k8s-device-plugin image from ${{ steps.bumpAmdDevicePlugin.outputs.OLD_VERSION }} to ${{ steps.bumpAmdDevicePlugin.outputs.NEW_VERSION }}'
          labels: ok-to-test
          body: |
            The [k8s-device-plugin](https://github.com/ROCm/k8s-device-plugin) project released a new k8s-device-plugin image

            This PR was auto-generated by `make update-amd-gpu-device-plugin-version` using [update-amd-gpu-device-plugin-version.yml](https://github.com/kubernetes/minikube/tree/master/.github/workflows/update-amd-gpu-device-plugin-version.yml) CI Workflow.
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1222,6 +1222,11 @@ update-nvidia-device-plugin-version:
(cd hack/update/nvidia_device_plugin_version && \
go run update_nvidia_device_plugin_version.go)

# Runs the Go updater in hack/update/amd_device_plugin_version; the subshell
# keeps the directory change local to this recipe.
.PHONY: update-amd-gpu-device-plugin-version
update-amd-gpu-device-plugin-version:
	(cd hack/update/amd_device_plugin_version && \
	 go run update_amd_device_plugin_version.go)

# The phony name must match the target below exactly (was misspelled "nerctld").
.PHONY: update-nerdctld-version
update-nerdctld-version:
(cd hack/update/nerdctld_version && \
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ As well as developer-friendly features:

* [Addons](https://minikube.sigs.k8s.io/docs/handbook/deploying/#addons) - a marketplace for developers to share configurations for running services on minikube
* [NVIDIA GPU support](https://minikube.sigs.k8s.io/docs/tutorials/nvidia/) - for machine learning
* [AMD GPU support](https://minikube.sigs.k8s.io/docs/tutorials/amd/) - for machine learning
* [Filesystem mounts](https://minikube.sigs.k8s.io/docs/handbook/mount/)

**For more information, see the official [minikube website](https://minikube.sigs.k8s.io)**
Expand Down
4 changes: 2 additions & 2 deletions cmd/minikube/cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -1462,8 +1462,8 @@ func validateGPUs(value, drvName, rtime string) error {
if err := validateGPUsArch(); err != nil {
return err
}
if value != "nvidia" && value != "all" {
return errors.Errorf(`The gpus flag must be passed a value of "nvidia" or "all"`)
if value != "nvidia" && value != "all" && value != "amd" {
return errors.Errorf(`The gpus flag must be passed a value of "nvidia", "amd" or "all"`)
}
if drvName == constants.Docker && (rtime == constants.Docker || rtime == constants.DefaultContainerRuntime) {
return nil
Expand Down
2 changes: 1 addition & 1 deletion cmd/minikube/cmd/start_flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ func initMinikubeFlags() {
startCmd.Flags().Bool(disableOptimizations, false, "If set, disables optimizations that are set for local Kubernetes. Including decreasing CoreDNS replicas from 2 to 1. Defaults to false.")
startCmd.Flags().Bool(disableMetrics, false, "If set, disables metrics reporting (CPU and memory usage), this can improve CPU usage. Defaults to false.")
startCmd.Flags().String(staticIP, "", "Set a static IP for the minikube cluster, the IP must be: private, IPv4, and the last octet must be between 2 and 254, for example 192.168.200.200 (Docker and Podman drivers only)")
startCmd.Flags().StringP(gpus, "g", "", "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)")
startCmd.Flags().StringP(gpus, "g", "", "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)")
startCmd.Flags().Duration(autoPauseInterval, time.Minute*1, "Duration of inactivity before the minikube VM is paused (default 1m0s)")
}

Expand Down
5 changes: 4 additions & 1 deletion cmd/minikube/cmd/start_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,10 @@ func TestValidateGPUs(t *testing.T) {
{"nvidia", "docker", "", ""},
{"all", "kvm", "docker", "The gpus flag can only be used with the docker driver and docker container-runtime"},
{"nvidia", "docker", "containerd", "The gpus flag can only be used with the docker driver and docker container-runtime"},
{"cat", "docker", "docker", `The gpus flag must be passed a value of "nvidia" or "all"`},
{"cat", "docker", "docker", `The gpus flag must be passed a value of "nvidia", "amd" or "all"`},
{"amd", "docker", "docker", ""},
{"amd", "docker", "", ""},
{"amd", "docker", "containerd", "The gpus flag can only be used with the docker driver and docker container-runtime"},
}

for _, tc := range tests {
Expand Down
4 changes: 4 additions & 0 deletions deploy/addons/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ var (
//go:embed gpu/nvidia-gpu-device-plugin.yaml.tmpl
NvidiaGpuDevicePluginAssets embed.FS

// AmdGpuDevicePluginAssets assets for amd-gpu-device-plugin addon
//go:embed gpu/amd-gpu-device-plugin.yaml.tmpl
AmdGpuDevicePluginAssets embed.FS

// LogviewerAssets assets for logviewer addon
//go:embed logviewer/*.tmpl logviewer/*.yaml
LogviewerAssets embed.FS
Expand Down
60 changes: 60 additions & 0 deletions deploy/addons/gpu/amd-gpu-device-plugin.yaml.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright 2024 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DaemonSet for the amd-gpu-device-plugin addon, deployed into kube-system.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: amd-gpu-device-plugin
  namespace: kube-system
  labels:
    k8s-app: amd-gpu-device-plugin
    kubernetes.io/minikube-addons: amd-gpu-device-plugin
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  selector:
    matchLabels:
      k8s-app: amd-gpu-device-plugin
  template:
    metadata:
      labels:
        name: amd-gpu-device-plugin
        k8s-app: amd-gpu-device-plugin
    spec:
      # Restrict scheduling to amd64 nodes.
      nodeSelector:
        kubernetes.io/arch: amd64
      priorityClassName: system-node-critical
      tolerations:
        - key: CriticalAddonsOnly
          operator: Exists
      volumes:
        # Host directory where the kubelet keeps its device-plugin sockets.
        - name: dp
          hostPath:
            path: /var/lib/kubelet/device-plugins
        # Host /sys; presumably read by the plugin to discover GPUs - TODO confirm.
        - name: sys
          hostPath:
            path: /sys
      containers:
        # Registry prefix resolution: custom registry, then the configured image
        # repository, then the addon's default registry.
        - image: {{.CustomRegistries.AmdDevicePlugin | default .ImageRepository | default .Registries.AmdDevicePlugin }}{{.Images.AmdDevicePlugin}}
          name: amd-gpu-device-plugin
          # Run without privilege escalation and with every capability dropped.
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: dp
              mountPath: /var/lib/kubelet/device-plugins
            - name: sys
              mountPath: /sys
  updateStrategy:
    type: RollingUpdate
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
Copyright 2024 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"context"
"fmt"
"time"

"k8s.io/klog/v2"
"k8s.io/minikube/hack/update"
)

// schema maps each file touched by this updater to the replacements applied to
// it. Each Replace key is the pattern to look for and each value is the text
// substituted for the match, with {{.Version}} and {{.SHA}} filled from Data.
var schema = map[string]update.Item{
	"pkg/minikube/assets/addons.go": update.Item{
		Replace: map[string]string{
			// The pattern runs to the end of the line, so the replacement has
			// to put back the closing quote and trailing comma itself.
			`rocm/k8s-device-plugin:.*`: `rocm/k8s-device-plugin:{{.Version}}@{{.SHA}}",`,
		},
	},
}

// Data holds the values handed to update.Apply alongside schema.
//
// Version is the tag of the stable release and SHA is the digest of the
// matching rocm/k8s-device-plugin image; main fills in both.
type Data struct {
	Version, SHA string
}

// main looks up the stable ROCm/k8s-device-plugin release on GitHub, resolves
// the SHA of the rocm/k8s-device-plugin image carrying that tag, and applies
// schema with both values. Either lookup failing is fatal.
func main() {
	// The release lookup gets at most five minutes.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()

	release, _, _, err := update.GHReleases(ctx, "ROCm", "k8s-device-plugin")
	if err != nil {
		klog.Fatalf("Unable to get stable version: %v", err)
	}

	image := fmt.Sprintf("rocm/k8s-device-plugin:%s", release.Tag)
	digest, err := update.GetImageSHA(image)
	if err != nil {
		klog.Fatalf("failed to get image SHA: %v", err)
	}

	update.Apply(schema, Data{Version: release.Tag, SHA: digest})
}
1 change: 1 addition & 0 deletions hack/update/get_version/get_version.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type dependency struct {
}

var dependencies = map[string]dependency{
"amd-gpu-device-plugin": {addonsFile, `rocm/k8s-device-plugin:(.*)@`},
"buildkit": {"deploy/iso/minikube-iso/arch/x86_64/package/buildkit-bin/buildkit-bin.mk", `BUILDKIT_BIN_VERSION = (.*)`},
"calico": {"pkg/minikube/bootstrapper/images/images.go", `calicoVersion = "(.*)"`},
"cilium": {"pkg/minikube/cni/cilium.yaml", `quay.io/cilium/cilium:(.*)@`},
Expand Down
71 changes: 71 additions & 0 deletions kubeadm.new.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
apiVersion: kubeadm.k8s.io/v1beta4
kind: InitConfiguration
bootstrapTokens:
- groups:
- system:bootstrappers:kubeadm:default-node-token
token: r3db6d.ut3qb84zr8ngrbhf
ttl: 24h0m0s
usages:
- signing
- authentication
localAPIEndpoint:
advertiseAddress: 192.168.49.2
bindPort: 8443
nodeRegistration:
criSocket: unix:///var/run/cri-dockerd.sock
name: minikube
imagePullPolicy: IfNotPresent
imagePullSerial: true
kubeletExtraArgs:
- name: node-ip
value: 192.168.49.2
taints: []
timeouts:
controlPlaneComponentHealthCheck: 4m0s
discovery: 5m0s
etcdAPICall: 2m0s
kubeletHealthCheck: 4m0s
kubernetesAPICall: 1m0s
tlsBootstrap: 5m0s
upgradeManifests: 5m0s
---
apiVersion: kubeadm.k8s.io/v1beta4
kind: ClusterConfiguration
apiServer:
certSANs:
- 127.0.0.1
- localhost
- 192.168.49.2
extraArgs:
- name: enable-admission-plugins
value: NamespaceLifecycle,LimitRanger,ServiceAccount,DefaultStorageClass,DefaultTolerationSeconds,NodeRestriction,MutatingAdmissionWebhook,ValidatingAdmissionWebhook,ResourceQuota
caCertificateValidityPeriod: 87600h0m0s
certificateValidityPeriod: 8760h0m0s
certificatesDir: /var/lib/minikube/certs
clusterName: mk
controlPlaneEndpoint: control-plane.minikube.internal:8443
controllerManager:
extraArgs:
- name: allocate-node-cidrs
value: "true"
- name: leader-elect
value: "false"
dns: {}
encryptionAlgorithm: RSA-2048
etcd:
local:
dataDir: /var/lib/minikube/etcd
extraArgs:
- name: proxy-refresh-interval
value: "70000"
imageRepository: registry.k8s.io
kubernetesVersion: v1.31.0
networking:
dnsDomain: cluster.dev.local
podSubnet: 10.244.0.0/16
serviceSubnet: 10.96.0.0/12
proxy: {}
scheduler:
extraArgs:
- name: leader-elect
value: "false"
75 changes: 75 additions & 0 deletions kubeadm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
apiVersion: kubeadm.k8s.io/v1beta3
kind: InitConfiguration
bootstrapTokens:
- groups:
- system:bootstrappers:kubeadm:default-node-token
ttl: 24h0m0s
usages:
- signing
- authentication
localAPIEndpoint:
advertiseAddress: 192.168.49.2
bindPort: 8443
nodeRegistration:
criSocket: unix:///var/run/cri-dockerd.sock
name: "minikube"
kubeletExtraArgs:
node-ip: 192.168.49.2
taints: []
---
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
apiServer:
certSANs: ["127.0.0.1", "localhost", "192.168.49.2"]
extraArgs:
enable-admission-plugins: NamespaceLifecycle,LimitRanger,ServiceAccount,DefaultStorageClass,DefaultTolerationSeconds,NodeRestriction,MutatingAdmissionWebhook,ValidatingAdmissionWebhook,ResourceQuota
certificatesDir: /var/lib/minikube/certs
clusterName: mk
controlPlaneEndpoint: control-plane.minikube.internal:8443
controllerManager:
extraArgs:
allocate-node-cidrs: "true"
leader-elect: "false"
scheduler:
extraArgs:
leader-elect: "false"
etcd:
local:
dataDir: /var/lib/minikube/etcd
extraArgs:
proxy-refresh-interval: "70000"
kubernetesVersion: v1.31.0
networking:
dnsDomain: cluster.dev.local
podSubnet: "10.244.0.0/16"
serviceSubnet: 10.96.0.0/12
---
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
authentication:
x509:
clientCAFile: /var/lib/minikube/certs/ca.crt
cgroupDriver: systemd
containerRuntimeEndpoint: unix:///var/run/cri-dockerd.sock
hairpinMode: hairpin-veth
runtimeRequestTimeout: 15m
clusterDomain: "cluster.dev.local"
# disable disk resource management by default
imageGCHighThresholdPercent: 100
evictionHard:
nodefs.available: "0%"
nodefs.inodesFree: "0%"
imagefs.available: "0%"
failSwapOn: false
staticPodPath: /etc/kubernetes/manifests
---
apiVersion: kubeproxy.config.k8s.io/v1alpha1
kind: KubeProxyConfiguration
clusterCIDR: "10.244.0.0/16"
metricsBindAddress: 0.0.0.0:10249
conntrack:
maxPerCore: 0
# Skip setting "net.netfilter.nf_conntrack_tcp_timeout_established"
tcpEstablishedTimeout: 0s
# Skip setting "net.netfilter.nf_conntrack_tcp_timeout_close"
tcpCloseWaitTimeout: 0s
5 changes: 5 additions & 0 deletions pkg/addons/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,11 @@ var Addons = []*Addon{
validations: []setFn{isKVMDriverForNVIDIA},
callbacks: []setFn{EnableOrDisableAddon},
},
{
name: "amd-gpu-device-plugin",
set: SetBool,
callbacks: []setFn{EnableOrDisableAddon},
},
{
name: "olm",
set: SetBool,
Expand Down
Loading

0 comments on commit f101a72

Please sign in to comment.