From b138f39acdb0b60f492b421a8007085d7bc4179c Mon Sep 17 00:00:00 2001
From: Ewe Zi Yi <36802364+deadlycoconuts@users.noreply.github.com>
Date: Wed, 14 Feb 2024 17:54:02 +0800
Subject: [PATCH] fix(api): Change default GPU Request value and prevent null
 values from being accepted (#533)

# Description
As of present, users are allowed to submit null or `0` values as a GPU
request value even when a GPU is configured (when the GPU name field is
set) for Merlin models. This causes the Merlin model to be deployed on a
regular default node without the proper node selectors or tolerations
applied which is an unexpected behaviour from users. In no scenario
would a user intend to have 0 GPUs selected for a specific GPU type when
he/she intends to use a GPU. However, this is not prevented by the API
server nor the UI.

A user using the UI to change his/her GPU configuration might thus
potentially end up redeploying his/her model in a regular node despite
thinking that it has been configured to work with a GPU.

This PR addresses this issue with two approaches, 1) to prevent the
'None' GPU request value from being shown in the UI as a default value
when a GPU is selected and to show the first available GPU request
option instead and, 2) to make the API server throw an error if it
receives any deployment request not having a GPU request value set when
a GPU is configured.

The snippet below shows how the UI changes would look like:


https://github.com/caraml-dev/merlin/assets/36802364/e599b373-888c-45b8-9424-85282e035465

# Modifications
- `ui/src/pages/version/components/forms/components/ResourcesPanel.js` -
Display first available GPU request value (set in the Helm chart values
of the Merlin API server deployment) when a new GPU name is selected
- `api/cluster/resource/templater.go` - Add an additional check to
ensure that the GPU request value is not 0 when a GPU is set

# Checklist
- [x] Added PR label
- [x] Added unit test, integration, and/or e2e tests
- [x] Tested locally
- [ ] Updated documentation
- [ ] Update Swagger spec if the PR introduce API changes
- [ ] Regenerated Golang and Python client if the PR introduces API
changes

# Release Notes
```release-note
Users are now no longer able to select a GPU without setting a non-zero GPU request value.
```
---
 api/cluster/resource/templater.go             | 34 +++++++++++--------
 api/cluster/resource/templater_gpu_test.go    | 27 +++++++++++++++
 .../forms/components/ResourcesPanel.js        |  2 +-
 3 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/api/cluster/resource/templater.go b/api/cluster/resource/templater.go
index b4cd26c9c..95ffd54fc 100644
--- a/api/cluster/resource/templater.go
+++ b/api/cluster/resource/templater.go
@@ -222,20 +222,26 @@ func (t *InferenceServiceTemplater) createPredictorSpec(modelService *models.Ser
 
 	nodeSelector := map[string]string{}
 	tolerations := []corev1.Toleration{}
-	if modelService.ResourceRequest.GPUName != "" && !modelService.ResourceRequest.GPURequest.IsZero() {
-		// Look up to the GPU resource type and quantity from DeploymentConfig
-		for _, gpuConfig := range t.deploymentConfig.GPUs {
-			if gpuConfig.Name == modelService.ResourceRequest.GPUName {
-				// Declare and initialize resourceType and resourceQuantity variables
-				resourceType := corev1.ResourceName(gpuConfig.ResourceType)
-				resourceQuantity := modelService.ResourceRequest.GPURequest
-
-				// Set the resourceType as the key in the maps, with resourceQuantity as the value
-				resources.Requests[resourceType] = resourceQuantity
-				resources.Limits[resourceType] = resourceQuantity
-
-				nodeSelector = gpuConfig.NodeSelector
-				tolerations = gpuConfig.Tolerations
+
+	if modelService.ResourceRequest.GPUName != "" {
+		if modelService.ResourceRequest.GPURequest.IsZero() {
+			// This should never be set as zero if a GPU name is specified
+			return kservev1beta1.PredictorSpec{}, fmt.Errorf("GPU request cannot set as be 0")
+		} else {
+			// Look up to the GPU resource type and quantity from DeploymentConfig
+			for _, gpuConfig := range t.deploymentConfig.GPUs {
+				if gpuConfig.Name == modelService.ResourceRequest.GPUName {
+					// Declare and initialize resourceType and resourceQuantity variables
+					resourceType := corev1.ResourceName(gpuConfig.ResourceType)
+					resourceQuantity := modelService.ResourceRequest.GPURequest
+
+					// Set the resourceType as the key in the maps, with resourceQuantity as the value
+					resources.Requests[resourceType] = resourceQuantity
+					resources.Limits[resourceType] = resourceQuantity
+
+					nodeSelector = gpuConfig.NodeSelector
+					tolerations = gpuConfig.Tolerations
+				}
 			}
 		}
 	}
diff --git a/api/cluster/resource/templater_gpu_test.go b/api/cluster/resource/templater_gpu_test.go
index 7067f682d..2faf48bee 100644
--- a/api/cluster/resource/templater_gpu_test.go
+++ b/api/cluster/resource/templater_gpu_test.go
@@ -106,6 +106,14 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
 		},
 	}
 
+	invalidResourceRequest := &models.ResourceRequest{
+		MinReplica:    1,
+		MaxReplica:    2,
+		CPURequest:    resource.MustParse("500m"),
+		MemoryRequest: resource.MustParse("500Mi"),
+		GPUName:       "NVIDIA P4",
+	}
+
 	queueResourcePercentage := "2"
 	storageUri := fmt.Sprintf("%s/model", modelSvc.ArtifactURI)
 
@@ -1563,6 +1571,25 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "invalid resource request with 0 GPU requested",
+			modelSvc: &models.Service{
+				Name:            modelSvc.Name,
+				ModelName:       modelSvc.ModelName,
+				ModelVersion:    modelSvc.ModelVersion,
+				Namespace:       project.Name,
+				ArtifactURI:     modelSvc.ArtifactURI,
+				Type:            models.ModelTypeTensorflow,
+				Options:         &models.ModelOption{},
+				Metadata:        modelSvc.Metadata,
+				Protocol:        protocol.HttpJson,
+				ResourceRequest: invalidResourceRequest,
+			},
+			resourcePercentage: queueResourcePercentage,
+			deploymentScale:    defaultDeploymentScale,
+			exp:                &kservev1beta1.InferenceService{},
+			wantErr:            true,
+		},
 	}
 
 	for _, tt := range tests {
diff --git a/ui/src/pages/version/components/forms/components/ResourcesPanel.js b/ui/src/pages/version/components/forms/components/ResourcesPanel.js
index 0f4465951..34bca68da 100644
--- a/ui/src/pages/version/components/forms/components/ResourcesPanel.js
+++ b/ui/src/pages/version/components/forms/components/ResourcesPanel.js
@@ -82,7 +82,7 @@ export const ResourcesPanel = ({
       return;
     }
     onChange("gpu_name")(gpu_name);
-    onChange("gpu_request")(undefined);
+    onChange("gpu_request")(gpus[gpu_name].values[0]);
     onChange("min_monthly_cost_per_gpu")(
       gpus[gpu_name].min_monthly_cost_per_gpu
     );