Skip to content

Commit

Permalink
Automate installing NVIDIA Container Toolkit
Browse files Browse the repository at this point in the history
  • Loading branch information
spowelljr committed Sep 25, 2023
1 parent 2a1f5b9 commit 091ff2d
Show file tree
Hide file tree
Showing 14 changed files with 197 additions and 39 deletions.
11 changes: 8 additions & 3 deletions cmd/minikube/cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -1285,7 +1285,7 @@ func validateFlags(cmd *cobra.Command, drvName string) {
}

if cmd.Flags().Changed(containerRuntime) {
err := validateRuntime(viper.GetString(containerRuntime))
err := validateRuntime(viper.GetString(containerRuntime), drvName)
if err != nil {
exit.Message(reason.Usage, "{{.err}}", out.V{"err": err})
}
Expand Down Expand Up @@ -1402,7 +1402,7 @@ func validateDiskSize(diskSize string) error {
}

// validateRuntime validates the supplied runtime
func validateRuntime(rtime string) error {
func validateRuntime(rtime, driverName string) error {
validOptions := cruntime.ValidRuntimes()
// `crio` is accepted as an alternative spelling to `cri-o`
validOptions = append(validOptions, constants.CRIO)
Expand Down Expand Up @@ -1431,6 +1431,11 @@ func validateRuntime(rtime string) error {
if !validRuntime {
return errors.Errorf("Invalid Container Runtime: %s. Valid runtimes are: %s", rtime, cruntime.ValidRuntimes())
}

if rtime == constants.NvidiaDocker && driverName != constants.Docker {
return errors.Errorf("The nvidia-docker container-runtime can only be run with the docker driver")
}

return nil
}

Expand Down Expand Up @@ -1793,7 +1798,7 @@ func validateContainerRuntime(old *config.ClusterConfig) {
return
}

if err := validateRuntime(old.KubernetesConfig.ContainerRuntime); err != nil {
if err := validateRuntime(old.KubernetesConfig.ContainerRuntime, old.Driver); err != nil {
klog.Errorf("Error parsing old runtime %q: %v", old.KubernetesConfig.ContainerRuntime, err)
}
}
Expand Down
14 changes: 12 additions & 2 deletions cmd/minikube/cmd/start_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ func TestValidateDiskSize(t *testing.T) {
func TestValidateRuntime(t *testing.T) {
var tests = []struct {
runtime string
driver string
errorMsg string
}{
{
Expand All @@ -444,15 +445,24 @@ func TestValidateRuntime(t *testing.T) {
runtime: "docker",
errorMsg: "",
},

{
runtime: "test",
errorMsg: fmt.Sprintf("Invalid Container Runtime: test. Valid runtimes are: %v", cruntime.ValidRuntimes()),
},
{
runtime: "nvidia-docker",
driver: "docker",
errorMsg: "",
},
{
runtime: "nvidia-docker",
driver: "kvm",
errorMsg: "The nvidia-docker container-runtime can only be run with the docker driver",
},
}
for _, test := range tests {
t.Run(test.runtime, func(t *testing.T) {
got := validateRuntime(test.runtime)
got := validateRuntime(test.runtime, test.driver)
gotError := ""
if got != nil {
gotError = got.Error()
Expand Down
4 changes: 4 additions & 0 deletions deploy/addons/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,8 @@ var (
// CloudSpanner assets for cloud-spanner addon
//go:embed cloud-spanner/*.yaml
CloudSpanner embed.FS

// NvidiaDevicePlugin assets for nvidia-device-plugin addon
//go:embed nvidia-device-plugin/*.tmpl
NvidiaDevicePlugin embed.FS
)
56 changes: 56 additions & 0 deletions deploy/addons/nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DaemonSet that runs the NVIDIA device plugin container on cluster nodes.
# The image reference is a Go template expression that is filled in when the
# addon manifest is rendered.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
      - image: {{.CustomRegistries.NvidiaDevicePlugin | default .ImageRepository | default .Registries.NvidiaDevicePlugin}}{{.Images.NvidiaDevicePlugin}}
        name: nvidia-device-plugin-ctr
        env:
          - name: FAIL_ON_INIT_ERROR
            value: "false"
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
5 changes: 5 additions & 0 deletions pkg/addons/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,4 +217,9 @@ var Addons = []*Addon{
set: SetBool,
callbacks: []setFn{EnableOrDisableAddon},
},
{
name: "nvidia-device-plugin",
set: SetBool,
callbacks: []setFn{EnableOrDisableAddon},
},
}
3 changes: 3 additions & 0 deletions pkg/drivers/kic/kic.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ func (d *Driver) Create() error {
APIServerPort: d.NodeConfig.APIServerPort,
}

if d.NodeConfig.ContainerRuntime == constants.NvidiaDocker {
params.GPUs = true
}
networkName := d.NodeConfig.Network
if networkName == "" {
networkName = d.NodeConfig.ClusterName
Expand Down
3 changes: 3 additions & 0 deletions pkg/drivers/kic/oci/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ func CreateContainerNode(p CreateParams) error {
runArgs = append(runArgs, "--network", p.Network)
runArgs = append(runArgs, "--ip", p.IP)
}
if p.GPUs {
runArgs = append(runArgs, "--gpus", "all")
}

memcgSwap := hasMemorySwapCgroup()
memcg := HasMemoryCgroup()
Expand Down
3 changes: 2 additions & 1 deletion pkg/drivers/kic/oci/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ type CreateParams struct {
ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080...
OCIBinary string // docker or podman
Network string // network name that the container will attach to
IP string // static IP to assign for th container in the cluster network
IP string // static IP to assign the container in the cluster network
GPUs bool // add GPU devices to the container
}

// createOpt is an option for Create
Expand Down
13 changes: 13 additions & 0 deletions pkg/minikube/assets/addons.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ func (a *Addon) IsEnabledOrDefault(cc *config.ClusterConfig) bool {
return a.enabled
}

// EnableByDefault marks the addon as enabled by default by setting its
// enabled flag to true. The receiver is modified in place, so calling this on
// an entry of the package-level Addons map changes that shared entry.
// NOTE(review): presumably meant to be called before addons are evaluated at
// cluster start — confirm against callers.
func (a *Addon) EnableByDefault() {
a.enabled = true
}

// Addons is the list of addons
// TODO: Make dynamically loadable: move this data to a .yaml file within each addon directory
var Addons = map[string]*Addon{
Expand Down Expand Up @@ -757,6 +762,14 @@ var Addons = map[string]*Addon{
}, map[string]string{
"CloudSpanner": "gcr.io",
}),
"nvidia-device-plugin": NewAddon([]*BinAsset{
MustBinAsset(addons.NvidiaDevicePlugin, "nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl", vmpath.GuestAddonsDir, "nvidia-device-plugin.yaml", "0640"),
}, false, "nvidia-device-plugin", "3rd party (NVIDIA)", "", "",
map[string]string{
"NvidiaDevicePlugin": "nvidia/k8s-device-plugin:v0.14.1@sha256:15c4280d13a61df703b12d1fd1b5b5eec4658157db3cb4b851d3259502310136",
}, map[string]string{
"NvidiaDevicePlugin": "nvcr.io",
}),
}

// parseMapString creates a map based on `str` which is encoded as <key1>=<value1>,<key2>=<value2>,...
Expand Down
2 changes: 2 additions & 0 deletions pkg/minikube/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ const (
CRIO = "crio"
// Docker is the default name and spelling for the docker container runtime
Docker = "docker"
// NvidiaDocker is the default name and spelling for the nvidia-docker container runtime
NvidiaDocker = "nvidia-docker"
// DefaultContainerRuntime is our default container runtime
DefaultContainerRuntime = ""

Expand Down
26 changes: 25 additions & 1 deletion pkg/minikube/cruntime/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"k8s.io/minikube/pkg/minikube/docker"
"k8s.io/minikube/pkg/minikube/download"
"k8s.io/minikube/pkg/minikube/image"
"k8s.io/minikube/pkg/minikube/out"
"k8s.io/minikube/pkg/minikube/style"
"k8s.io/minikube/pkg/minikube/sysinit"
)
Expand Down Expand Up @@ -560,7 +561,11 @@ func (r *Docker) configureDocker(driver string) error {
},
StorageDriver: "overlay2",
}
if r.Type == "nvidia-docker" {
if r.Type == constants.NvidiaDocker {
if err := r.installNvidiaContainerToolkit(); err != nil {
return fmt.Errorf("failed installing the NVIDIA Container Toolkit: %v", err)
}
assets.Addons["nvidia-device-plugin"].EnableByDefault()
daemonConfig.DefaultRuntime = "nvidia"
runtimes := &dockerDaemonRuntimes{}
runtimes.Nvidia.Path = "/usr/bin/nvidia-container-runtime"
Expand All @@ -574,6 +579,25 @@ func (r *Docker) configureDocker(driver string) error {
return r.Runner.Copy(ma)
}

// installNvidiaContainerToolkit installs the NVIDIA Container Toolkit by
// running three shell steps, in order, through r.Runner: import the NVIDIA
// repository signing key, register the apt source list, then install the
// nvidia-container-toolkit package with apt-get.
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
//
// It stops at the first failing step and returns that step's error wrapped
// with a short description of the step; it returns nil when all steps succeed.
func (r *Docker) installNvidiaContainerToolkit() error {
	out.Styled(style.Toolkit, "Installing the NVIDIA Container Toolkit...")

	steps := []struct {
		desc string
		cmd  string
	}{
		{
			desc: "importing the NVIDIA repository signing key",
			// --batch --yes lets gpg overwrite an existing keyring without
			// prompting if the install is run again.
			cmd: "curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg",
		},
		{
			desc: "adding the NVIDIA apt repository",
			// -f makes curl fail on an HTTP error instead of piping the error
			// page into the apt source list.
			cmd: "curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list",
		},
		{
			desc: "installing the nvidia-container-toolkit package",
			cmd:  "sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit",
		},
	}

	for _, s := range steps {
		// pipefail makes a pipeline fail when any stage fails; without it the
		// status is that of the last stage (gpg/tee), so a failed download
		// would go unnoticed until a later step.
		c := exec.Command("/bin/bash", "-o", "pipefail", "-c", s.cmd)
		if _, err := r.Runner.RunCmd(c); err != nil {
			return fmt.Errorf("%s: %w", s.desc, err)
		}
	}
	return nil
}

// Preload preloads docker with k8s images:
// 1. Copy over the preloaded tarball into the VM
// 2. Extract the preloaded tarball to the correct directory
Expand Down
1 change: 1 addition & 0 deletions pkg/minikube/style/style.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ var Config = map[Enum]Options{
VerifyingNoLine: {Prefix: "🤔 ", OmitNewline: true},
Verifying: {Prefix: "🤔 "},
CNI: {Prefix: "🔗 "},
Toolkit: {Prefix: "🛠️ "},
}

// LowPrefix returns a 7-bit compatible prefix for a style
Expand Down
1 change: 1 addition & 0 deletions pkg/minikube/style/style_enum.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,5 @@ const (
Warning
Workaround
CNI
Toolkit
)
Loading

0 comments on commit 091ff2d

Please sign in to comment.