Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automate installing NVIDIA Container Toolkit --container-runtime #17287

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions cmd/minikube/cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -1285,7 +1285,7 @@ func validateFlags(cmd *cobra.Command, drvName string) {
}

if cmd.Flags().Changed(containerRuntime) {
err := validateRuntime(viper.GetString(containerRuntime))
err := validateRuntime(viper.GetString(containerRuntime), drvName)
if err != nil {
exit.Message(reason.Usage, "{{.err}}", out.V{"err": err})
}
Expand Down Expand Up @@ -1402,7 +1402,7 @@ func validateDiskSize(diskSize string) error {
}

// validateRuntime validates the supplied runtime
func validateRuntime(rtime string) error {
func validateRuntime(rtime, driverName string) error {
validOptions := cruntime.ValidRuntimes()
// `crio` is accepted as an alternative spelling to `cri-o`
validOptions = append(validOptions, constants.CRIO)
Expand Down Expand Up @@ -1431,6 +1431,11 @@ func validateRuntime(rtime string) error {
if !validRuntime {
return errors.Errorf("Invalid Container Runtime: %s. Valid runtimes are: %s", rtime, cruntime.ValidRuntimes())
}

if rtime == constants.NvidiaDocker && driverName != constants.Docker {
return errors.Errorf("The nvidia-docker container-runtime can only be run with the docker driver")
}

return nil
}

Expand Down Expand Up @@ -1793,7 +1798,7 @@ func validateContainerRuntime(old *config.ClusterConfig) {
return
}

if err := validateRuntime(old.KubernetesConfig.ContainerRuntime); err != nil {
if err := validateRuntime(old.KubernetesConfig.ContainerRuntime, old.Driver); err != nil {
klog.Errorf("Error parsing old runtime %q: %v", old.KubernetesConfig.ContainerRuntime, err)
}
}
Expand Down
14 changes: 12 additions & 2 deletions cmd/minikube/cmd/start_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ func TestValidateDiskSize(t *testing.T) {
func TestValidateRuntime(t *testing.T) {
var tests = []struct {
runtime string
driver string
errorMsg string
}{
{
Expand All @@ -444,15 +445,24 @@ func TestValidateRuntime(t *testing.T) {
runtime: "docker",
errorMsg: "",
},

{
runtime: "test",
errorMsg: fmt.Sprintf("Invalid Container Runtime: test. Valid runtimes are: %v", cruntime.ValidRuntimes()),
},
{
runtime: "nvidia-docker",
driver: "docker",
errorMsg: "",
},
{
runtime: "nvidia-docker",
driver: "kvm",
errorMsg: "The nvidia-docker container-runtime can only be run with the docker driver",
},
}
for _, test := range tests {
t.Run(test.runtime, func(t *testing.T) {
got := validateRuntime(test.runtime)
got := validateRuntime(test.runtime, test.driver)
gotError := ""
if got != nil {
gotError = got.Error()
Expand Down
4 changes: 4 additions & 0 deletions deploy/addons/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,8 @@ var (
// CloudSpanner assets for cloud-spanner addon
//go:embed cloud-spanner/*.yaml
CloudSpanner embed.FS

// NvidiaDevicePlugin assets for nvidia-device-plugin addon
//go:embed nvidia-device-plugin/*.tmpl
NvidiaDevicePlugin embed.FS
)
56 changes: 56 additions & 0 deletions deploy/addons/nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DaemonSet that runs the NVIDIA device plugin container on cluster nodes.
# The image reference is a Go template expression that is rendered before the
# manifest is applied.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
      - image: {{.CustomRegistries.NvidiaDevicePlugin | default .ImageRepository | default .Registries.NvidiaDevicePlugin}}{{.Images.NvidiaDevicePlugin}}
        name: nvidia-device-plugin-ctr
        env:
          # Keep the plugin running instead of failing when initialization errors occur.
          - name: FAIL_ON_INIT_ERROR
            value: "false"
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        # Host path shared with the container at the same location.
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
5 changes: 5 additions & 0 deletions pkg/addons/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,4 +222,9 @@ var Addons = []*Addon{
set: SetBool,
callbacks: []setFn{EnableOrDisableAddon},
},
{
name: "nvidia-device-plugin",
set: SetBool,
callbacks: []setFn{EnableOrDisableAddon},
},
}
3 changes: 3 additions & 0 deletions pkg/drivers/kic/kic.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ func (d *Driver) Create() error {
APIServerPort: d.NodeConfig.APIServerPort,
}

if d.NodeConfig.ContainerRuntime == constants.NvidiaDocker {
params.GPUs = true
}
networkName := d.NodeConfig.Network
if networkName == "" {
networkName = d.NodeConfig.ClusterName
Expand Down
3 changes: 3 additions & 0 deletions pkg/drivers/kic/oci/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ func CreateContainerNode(p CreateParams) error {
runArgs = append(runArgs, "--network", p.Network)
runArgs = append(runArgs, "--ip", p.IP)
}
if p.GPUs {
runArgs = append(runArgs, "--gpus", "all")
}

memcgSwap := hasMemorySwapCgroup()
memcg := HasMemoryCgroup()
Expand Down
3 changes: 2 additions & 1 deletion pkg/drivers/kic/oci/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ type CreateParams struct {
ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080...
OCIBinary string // docker or podman
Network string // network name that the container will attach to
IP string // static IP to assign for th container in the cluster network
IP string // static IP to assign the container in the cluster network
GPUs bool // add GPU devices to the container
}

// createOpt is an option for Create
Expand Down
13 changes: 13 additions & 0 deletions pkg/minikube/assets/addons.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ func (a *Addon) IsEnabledOrDefault(cc *config.ClusterConfig) bool {
return a.enabled
}

// EnableByDefault sets the addon's enabled flag to true so that the addon is
// treated as enabled by default on cluster start. It mutates the receiver in
// place and returns nothing.
// NOTE(review): Addons is a package-level map of *Addon, so calling this on one
// of its entries changes shared state for the rest of the process — confirm
// callers intend that.
func (a *Addon) EnableByDefault() {
a.enabled = true
}

// Addons is the list of addons
// TODO: Make dynamically loadable: move this data to a .yaml file within each addon directory
var Addons = map[string]*Addon{
Expand Down Expand Up @@ -770,6 +775,14 @@ var Addons = map[string]*Addon{
}, map[string]string{
"CloudSpanner": "gcr.io",
}),
"nvidia-device-plugin": NewAddon([]*BinAsset{
MustBinAsset(addons.NvidiaDevicePlugin, "nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl", vmpath.GuestAddonsDir, "nvidia-device-plugin.yaml", "0640"),
}, false, "nvidia-device-plugin", "3rd party (NVIDIA)", "", "",
map[string]string{
"NvidiaDevicePlugin": "nvidia/k8s-device-plugin:v0.14.1@sha256:15c4280d13a61df703b12d1fd1b5b5eec4658157db3cb4b851d3259502310136",
}, map[string]string{
"NvidiaDevicePlugin": "nvcr.io",
}),
}

// parseMapString creates a map based on `str` which is encoded as <key1>=<value1>,<key2>=<value2>,...
Expand Down
2 changes: 2 additions & 0 deletions pkg/minikube/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ const (
CRIO = "crio"
// Docker is the default name and spelling for the docker container runtime
Docker = "docker"
// NvidiaDocker is the default name and spelling for the nvidia-docker container runtime
NvidiaDocker = "nvidia-docker"
// DefaultContainerRuntime is our default container runtime
DefaultContainerRuntime = ""

Expand Down
26 changes: 25 additions & 1 deletion pkg/minikube/cruntime/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"k8s.io/minikube/pkg/minikube/docker"
"k8s.io/minikube/pkg/minikube/download"
"k8s.io/minikube/pkg/minikube/image"
"k8s.io/minikube/pkg/minikube/out"
"k8s.io/minikube/pkg/minikube/style"
"k8s.io/minikube/pkg/minikube/sysinit"
)
Expand Down Expand Up @@ -560,7 +561,11 @@ func (r *Docker) configureDocker(driver string) error {
},
StorageDriver: "overlay2",
}
if r.Type == "nvidia-docker" {
if r.Type == constants.NvidiaDocker {
if err := r.installNvidiaContainerToolkit(); err != nil {
return fmt.Errorf("failed installing the NVIDIA Container Toolkit: %v", err)
}
assets.Addons["nvidia-device-plugin"].EnableByDefault()
daemonConfig.DefaultRuntime = "nvidia"
runtimes := &dockerDaemonRuntimes{}
runtimes.Nvidia.Path = "/usr/bin/nvidia-container-runtime"
Expand All @@ -574,6 +579,25 @@ func (r *Docker) configureDocker(driver string) error {
return r.Runner.Copy(ma)
}

// installNvidiaContainerToolkit installs the NVIDIA Container Toolkit on the
// node by running the apt-based installation steps through r.Runner:
//
//  1. download NVIDIA's repository signing key and store it de-armored,
//  2. register the libnvidia-container apt repository, pinned to that key,
//  3. refresh the package index and install nvidia-container-toolkit.
//
// Every step runs in bash with pipefail enabled so that a failing download is
// not hidden by the exit status of the last command in the pipeline. The
// first failing step aborts the installation and its error is returned,
// wrapped with a description of the step.
//
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
func (r *Docker) installNvidiaContainerToolkit() error {
	out.Styled(style.Toolkit, "Installing the NVIDIA Container Toolkit...")

	steps := []struct {
		desc   string
		script string
	}{
		{
			desc: "adding NVIDIA repository signing key",
			// --batch --yes lets gpg overwrite a keyring left by a previous
			// run instead of failing because the output file already exists.
			script: "curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg",
		},
		{
			desc: "adding NVIDIA apt repository",
			// -f makes curl exit non-zero on an HTTP error rather than piping
			// the error page into the sources list.
			script: "curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list",
		},
		{
			desc:   "installing nvidia-container-toolkit package",
			script: "sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit",
		},
	}

	for _, s := range steps {
		c := exec.Command("/bin/bash", "-o", "pipefail", "-c", s.script)
		if _, err := r.Runner.RunCmd(c); err != nil {
			return fmt.Errorf("%s: %w", s.desc, err)
		}
	}
	return nil
}

// Preload preloads docker with k8s images:
// 1. Copy over the preloaded tarball into the VM
// 2. Extract the preloaded tarball to the correct directory
Expand Down
1 change: 1 addition & 0 deletions pkg/minikube/style/style.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ var Config = map[Enum]Options{
VerifyingNoLine: {Prefix: "🤔 ", OmitNewline: true},
Verifying: {Prefix: "🤔 "},
CNI: {Prefix: "🔗 "},
Toolkit: {Prefix: "🛠️ "},
}

// LowPrefix returns a 7-bit compatible prefix for a style
Expand Down
1 change: 1 addition & 0 deletions pkg/minikube/style/style_enum.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,5 @@ const (
Warning
Workaround
CNI
Toolkit
)
Loading
Loading