Skip to content

Commit

Permalink
Automate installing NVIDIA Container Toolkit w/ flag
Browse files Browse the repository at this point in the history
  • Loading branch information
spowelljr committed Sep 27, 2023
1 parent b64950e commit 3a592d4
Show file tree
Hide file tree
Showing 14 changed files with 67 additions and 78 deletions.
26 changes: 19 additions & 7 deletions cmd/minikube/cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -1285,7 +1285,7 @@ func validateFlags(cmd *cobra.Command, drvName string) {
}

if cmd.Flags().Changed(containerRuntime) {
err := validateRuntime(viper.GetString(containerRuntime), drvName)
err := validateRuntime(viper.GetString(containerRuntime))
if err != nil {
exit.Message(reason.Usage, "{{.err}}", out.V{"err": err})
}
Expand All @@ -1298,6 +1298,12 @@ func validateFlags(cmd *cobra.Command, drvName string) {
}
}

if cmd.Flags().Changed(enableNvidiaGPUs) {
if err := validateEnableNvidiaGPUs(viper.GetBool(enableNvidiaGPUs), drvName, viper.GetString(containerRuntime)); err != nil {
exit.Message(reason.Usage, "{{.err}}", out.V{"err": err})
}
}

if driver.IsSSH(drvName) {
sshIPAddress := viper.GetString(sshIPAddress)
if sshIPAddress == "" {
Expand Down Expand Up @@ -1402,7 +1408,7 @@ func validateDiskSize(diskSize string) error {
}

// validateRuntime validates the supplied runtime
func validateRuntime(rtime, driverName string) error {
func validateRuntime(rtime string) error {
validOptions := cruntime.ValidRuntimes()
// `crio` is accepted as an alternative spelling to `cri-o`
validOptions = append(validOptions, constants.CRIO)
Expand Down Expand Up @@ -1431,12 +1437,18 @@ func validateRuntime(rtime, driverName string) error {
if !validRuntime {
return errors.Errorf("Invalid Container Runtime: %s. Valid runtimes are: %s", rtime, cruntime.ValidRuntimes())
}
return nil
}

if rtime == constants.NvidiaDocker && driverName != constants.Docker {
return errors.Errorf("The nvidia-docker container-runtime can only be run with the docker driver")
// validateEnableNvidiaGPUs validates that the nvidia GPU(s) can be used with the given configuration
func validateEnableNvidiaGPUs(gpusEnabled bool, drvName, rtime string) error {
if !gpusEnabled {
return nil
}

return nil
if drvName == constants.Docker && rtime == constants.Docker {
return nil
}
return errors.Errorf("The enable-nvidia-gpus flag can only be run with the docker driver and docker container-runtime")
}

func getContainerRuntime(old *config.ClusterConfig) string {
Expand Down Expand Up @@ -1798,7 +1810,7 @@ func validateContainerRuntime(old *config.ClusterConfig) {
return
}

if err := validateRuntime(old.KubernetesConfig.ContainerRuntime, old.Driver); err != nil {
if err := validateRuntime(old.KubernetesConfig.ContainerRuntime); err != nil {
klog.Errorf("Error parsing old runtime %q: %v", old.KubernetesConfig.ContainerRuntime, err)
}
}
Expand Down
3 changes: 3 additions & 0 deletions cmd/minikube/cmd/start_flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ const (
socketVMnetPath = "socket-vmnet-path"
staticIP = "static-ip"
autoPauseInterval = "auto-pause-interval"
enableNvidiaGPUs = "enable-nvidia-gpus"
)

var (
Expand Down Expand Up @@ -204,6 +205,7 @@ func initMinikubeFlags() {
startCmd.Flags().Bool(disableMetrics, false, "If set, disables metrics reporting (CPU and memory usage), this can improve CPU usage. Defaults to false.")
startCmd.Flags().String(staticIP, "", "Set a static IP for the minikube cluster, the IP must be: private, IPv4, and the last octet must be between 2 and 254, for example 192.168.200.200 (Docker and Podman drivers only)")
startCmd.Flags().Duration(autoPauseInterval, time.Minute*1, "Duration of inactivity before the minikube VM is paused (default 1m0s). To disable, set to 0s")
startCmd.Flags().Bool(enableNvidiaGPUs, false, "If set, allows pods to use your NVIDIA GPU(s) (Docker driver with Docker container-runtime only)")
}

// initKubernetesFlags inits the commandline flags for Kubernetes related options
Expand Down Expand Up @@ -595,6 +597,7 @@ func generateNewConfigFromFlags(cmd *cobra.Command, k8sVersion string, rtime str
},
MultiNodeRequested: viper.GetInt(nodes) > 1,
AutoPauseInterval: viper.GetDuration(autoPauseInterval),
EnableNvidiaGPUs: viper.GetBool(enableNvidiaGPUs),
}
cc.VerifyComponents = interpretWaitFlag(*cmd)
if viper.GetBool(createMount) && driver.IsKIC(drvName) {
Expand Down
13 changes: 1 addition & 12 deletions cmd/minikube/cmd/start_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,6 @@ func TestValidateDiskSize(t *testing.T) {
func TestValidateRuntime(t *testing.T) {
var tests = []struct {
runtime string
driver string
errorMsg string
}{
{
Expand All @@ -449,20 +448,10 @@ func TestValidateRuntime(t *testing.T) {
runtime: "test",
errorMsg: fmt.Sprintf("Invalid Container Runtime: test. Valid runtimes are: %v", cruntime.ValidRuntimes()),
},
{
runtime: "nvidia-docker",
driver: "docker",
errorMsg: "",
},
{
runtime: "nvidia-docker",
driver: "kvm",
errorMsg: "The nvidia-docker container-runtime can only be run with the docker driver",
},
}
for _, test := range tests {
t.Run(test.runtime, func(t *testing.T) {
got := validateRuntime(test.runtime, test.driver)
got := validateRuntime(test.runtime)
gotError := ""
if got != nil {
gotError = got.Error()
Expand Down
30 changes: 14 additions & 16 deletions pkg/drivers/kic/kic.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,22 +77,20 @@ func NewDriver(c Config) *Driver {
func (d *Driver) Create() error {
ctx := context.Background()
params := oci.CreateParams{
Mounts: d.NodeConfig.Mounts,
Name: d.NodeConfig.MachineName,
Image: d.NodeConfig.ImageDigest,
ClusterLabel: oci.ProfileLabelKey + "=" + d.MachineName,
NodeLabel: oci.NodeLabelKey + "=" + d.NodeConfig.MachineName,
CPUs: strconv.Itoa(d.NodeConfig.CPU),
Memory: strconv.Itoa(d.NodeConfig.Memory) + "mb",
Envs: d.NodeConfig.Envs,
ExtraArgs: append([]string{"--expose", fmt.Sprintf("%d", d.NodeConfig.APIServerPort)}, d.NodeConfig.ExtraArgs...),
OCIBinary: d.NodeConfig.OCIBinary,
APIServerPort: d.NodeConfig.APIServerPort,
}

if d.NodeConfig.ContainerRuntime == constants.NvidiaDocker {
params.GPUs = true
Mounts: d.NodeConfig.Mounts,
Name: d.NodeConfig.MachineName,
Image: d.NodeConfig.ImageDigest,
ClusterLabel: oci.ProfileLabelKey + "=" + d.MachineName,
NodeLabel: oci.NodeLabelKey + "=" + d.NodeConfig.MachineName,
CPUs: strconv.Itoa(d.NodeConfig.CPU),
Memory: strconv.Itoa(d.NodeConfig.Memory) + "mb",
Envs: d.NodeConfig.Envs,
ExtraArgs: append([]string{"--expose", fmt.Sprintf("%d", d.NodeConfig.APIServerPort)}, d.NodeConfig.ExtraArgs...),
OCIBinary: d.NodeConfig.OCIBinary,
APIServerPort: d.NodeConfig.APIServerPort,
EnableNvidiaGPUs: d.NodeConfig.EnableNvidiaGPUs,
}

networkName := d.NodeConfig.Network
if networkName == "" {
networkName = d.NodeConfig.ClusterName
Expand Down Expand Up @@ -455,7 +453,7 @@ func (d *Driver) Stop() error {
}
}

runtime, err := cruntime.New(cruntime.Config{Type: d.NodeConfig.ContainerRuntime, Runner: d.exec})
runtime, err := cruntime.New(cruntime.Config{Type: d.NodeConfig.ContainerRuntime, Runner: d.exec, EnableNvidiaGPUs: d.NodeConfig.EnableNvidiaGPUs})
if err != nil { // won't return error because:
// even though we can't stop the containers inside, we still want to stop the minikube container itself
klog.Errorf("unable to get container runtime: %v", err)
Expand Down
2 changes: 1 addition & 1 deletion pkg/drivers/kic/oci/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ func CreateContainerNode(p CreateParams) error {
runArgs = append(runArgs, "--network", p.Network)
runArgs = append(runArgs, "--ip", p.IP)
}
if p.GPUs {
if p.EnableNvidiaGPUs {
runArgs = append(runArgs, "--gpus", "all")
}

Expand Down
34 changes: 17 additions & 17 deletions pkg/drivers/kic/oci/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,23 +43,23 @@ const (

// CreateParams are parameters needed to create a container
type CreateParams struct {
ClusterName string // cluster(profile name) that this container belongs to
Name string // used for container name and hostname
Image string // container image to use to create the node.
ClusterLabel string // label the clusters we create using minikube so we can clean up
NodeLabel string // label the nodes so we can clean up by node name
Role string // currently only role supported is control-plane
Mounts []Mount // volume mounts
APIServerPort int // Kubernetes api server port
PortMappings []PortMapping // ports to map to container from host
CPUs string // number of cpu cores assign to container
Memory string // memory (mbs) to assign to the container
Envs map[string]string // environment variables to pass to the container
ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080...
OCIBinary string // docker or podman
Network string // network name that the container will attach to
IP string // static IP to assign the container in the cluster network
GPUs bool // add GPU devices to the container
ClusterName string // cluster(profile name) that this container belongs to
Name string // used for container name and hostname
Image string // container image to use to create the node.
ClusterLabel string // label the clusters we create using minikube so we can clean up
NodeLabel string // label the nodes so we can clean up by node name
Role string // currently only role supported is control-plane
Mounts []Mount // volume mounts
APIServerPort int // Kubernetes api server port
PortMappings []PortMapping // ports to map to container from host
CPUs string // number of cpu cores assign to container
Memory string // memory (mbs) to assign to the container
Envs map[string]string // environment variables to pass to the container
ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080...
OCIBinary string // docker or podman
Network string // network name that the container will attach to
IP string // static IP to assign the container in the cluster network
EnableNvidiaGPUs bool // add NVIDIA GPU devices to the container
}

// createOpt is an option for Create
Expand Down
1 change: 1 addition & 0 deletions pkg/drivers/kic/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,5 @@ type Config struct {
StaticIP string // static IP for the kic cluster
ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080...
ListenAddress string // IP Address to listen to
EnableNvidiaGPUs bool // add NVIDIA GPU devices to the container
}
1 change: 1 addition & 0 deletions pkg/minikube/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ type ClusterConfig struct {
SSHAuthSock string
SSHAgentPID int
AutoPauseInterval time.Duration // Specifies interval of time to wait before checking if cluster should be paused
EnableNvidiaGPUs bool
}

// KubernetesConfig contains the parameters used to configure the VM Kubernetes.
Expand Down
2 changes: 0 additions & 2 deletions pkg/minikube/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,6 @@ const (
CRIO = "crio"
// Docker is the default name and spelling for the docker container runtime
Docker = "docker"
// NvidiaDocker is the default name and spelling for the nvidia-docker container runtime
NvidiaDocker = "nvidia-docker"
// DefaultContainerRuntime is our default container runtime
DefaultContainerRuntime = ""

Expand Down
8 changes: 5 additions & 3 deletions pkg/minikube/cruntime/cruntime.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (cs ContainerState) String() string {

// ValidRuntimes lists the supported container runtimes
func ValidRuntimes() []string {
return []string{"docker", "nvidia-docker", "cri-o", "containerd"}
return []string{"docker", "cri-o", "containerd"}
}

// CommandRunner is the subset of command.Runner this package consumes
Expand Down Expand Up @@ -155,6 +155,8 @@ type Config struct {
KubernetesVersion semver.Version
// InsecureRegistry list of insecure registries
InsecureRegistry []string
// EnableNvidiaGPUs adds NVIDIA GPU devices to the container
EnableNvidiaGPUs bool
}

// ListContainersOptions are the options to use for listing containers
Expand Down Expand Up @@ -210,7 +212,7 @@ func New(c Config) (Manager, error) {
sm := sysinit.New(c.Runner)

switch c.Type {
case "", "docker", "nvidia-docker":
case "", "docker":
sp := c.Socket
cs := ""
// There is no more dockershim socket, in Kubernetes version 1.24 and beyond
Expand All @@ -219,7 +221,6 @@ func New(c Config) (Manager, error) {
cs = "cri-docker.socket"
}
return &Docker{
Type: c.Type,
Socket: sp,
Runner: c.Runner,
NetworkPlugin: c.NetworkPlugin,
Expand All @@ -228,6 +229,7 @@ func New(c Config) (Manager, error) {
Init: sm,
UseCRI: (sp != ""), // !dockershim
CRIService: cs,
NvidiaGPUs: c.EnableNvidiaGPUs,
}, nil
case "crio", "cri-o":
return &CRIO{
Expand Down
18 changes: 1 addition & 17 deletions pkg/minikube/cruntime/cruntime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ func TestName(t *testing.T) {
}{
{"", "Docker"},
{"docker", "Docker"},
{"nvidia-docker", "Docker"},
{"crio", "CRI-O"},
{"cri-o", "CRI-O"},
{"containerd", "containerd"},
Expand Down Expand Up @@ -125,7 +124,6 @@ func TestCGroupDriver(t *testing.T) {
want string
}{
{"docker", "cgroupfs"},
{"nvidia-docker", "cgroupfs"},
{"crio", "cgroupfs"},
{"containerd", "cgroupfs"},
}
Expand Down Expand Up @@ -157,12 +155,6 @@ func TestKubeletOptions(t *testing.T) {
{"docker", "1.24.0", map[string]string{
"container-runtime-endpoint": "unix:///var/run/cri-dockerd.sock",
}},
{"nvidia-docker", "1.23.0", map[string]string{
"container-runtime": "docker",
}},
{"nvidia-docker", "1.25.0", map[string]string{
"container-runtime-endpoint": "unix:///var/run/cri-dockerd.sock",
}},
{"crio", "1.25.0", map[string]string{
"container-runtime-endpoint": "unix:///var/run/crio/crio.sock",
}},
Expand Down Expand Up @@ -688,13 +680,6 @@ func TestEnable(t *testing.T) {
"crio": SvcExited,
"crio-shutdown": SvcExited,
}},
{"nvidia-docker", defaultServices,
map[string]serviceState{
"docker": SvcRestarted,
"containerd": SvcExited,
"crio": SvcExited,
"crio-shutdown": SvcExited,
}},
{"containerd", defaultServices,
map[string]serviceState{
"docker": SvcExited,
Expand Down Expand Up @@ -736,7 +721,6 @@ func TestContainerFunctions(t *testing.T) {
runtime string
}{
{"docker"},
{"nvidia-docker"},
{"crio"},
{"containerd"},
}
Expand All @@ -746,7 +730,7 @@ func TestContainerFunctions(t *testing.T) {
t.Run(tc.runtime, func(t *testing.T) {
runner := NewFakeRunner(t)
prefix := ""
if tc.runtime == "docker" || tc.runtime == "nvidia-docker" {
if tc.runtime == "docker" {
prefix = "k8s_"
}
runner.containers = map[string]string{
Expand Down
4 changes: 2 additions & 2 deletions pkg/minikube/cruntime/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ func (e *ErrISOFeature) Error() string {

// Docker contains Docker runtime state
type Docker struct {
Type string
Socket string
Runner CommandRunner
NetworkPlugin string
Expand All @@ -77,6 +76,7 @@ type Docker struct {
Init sysinit.Manager
UseCRI bool
CRIService string
NvidiaGPUs bool
}

// Name is a human readable name for Docker
Expand Down Expand Up @@ -561,7 +561,7 @@ func (r *Docker) configureDocker(driver string) error {
},
StorageDriver: "overlay2",
}
if r.Type == constants.NvidiaDocker {
if r.NvidiaGPUs {
if err := r.installNvidiaContainerToolkit(); err != nil {
return fmt.Errorf("failed installing the NVIDIA Container Toolkit: %v", err)
}
Expand Down
1 change: 1 addition & 0 deletions pkg/minikube/registry/drvs/docker/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ func configure(cc config.ClusterConfig, n config.Node) (interface{}, error) {
Subnet: cc.Subnet,
StaticIP: cc.StaticIP,
ListenAddress: cc.ListenAddress,
EnableNvidiaGPUs: cc.EnableNvidiaGPUs,
}), nil
}

Expand Down
2 changes: 1 addition & 1 deletion site/content/en/docs/tutorials/nvidia.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ date: 2018-01-02
```
- Start minikube:
```shell
minikube start --driver docker --container-runtime nvidia-docker
minikube start --driver docker --container-runtime docker --enable-nvidia-gpus
```
{{% /tab %}}
{{% tab none %}}
Expand Down

0 comments on commit 3a592d4

Please sign in to comment.