Skip to content

Commit

Permalink
fix(api): Change default GPU Request value and prevent null values from being accepted (#533)
Browse files Browse the repository at this point in the history

# Description
At present, users are allowed to submit null or `0` values as a GPU
request value even when a GPU is configured (i.e. when the GPU name field
is set) for Merlin models. This causes the Merlin model to be deployed on
a regular default node without the proper node selectors or tolerations
applied, which is unexpected behaviour for users. In no scenario would a
user intend to have 0 GPUs selected for a specific GPU type when he/she
intends to use a GPU. However, this is not prevented by either the API
server or the UI.

A user using the UI to change his/her GPU configuration might thus
potentially end up redeploying his/her model in a regular node despite
thinking that it has been configured to work with a GPU.

This PR addresses this issue with two approaches: 1) preventing the
'None' GPU request value from being shown in the UI as a default value
when a GPU is selected, showing the first available GPU request
option instead, and 2) making the API server throw an error if it
receives any deployment request that does not have a GPU request value
set when a GPU is configured.

The snippet below shows how the UI changes would look:


https://github.com/caraml-dev/merlin/assets/36802364/e599b373-888c-45b8-9424-85282e035465

# Modifications
- `ui/src/pages/version/components/forms/components/ResourcesPanel.js` -
Display first available GPU request value (set in the Helm chart values
of the Merlin API server deployment) when a new GPU name is selected
- `api/cluster/resource/templater.go` - Add an additional check to
ensure that the GPU request value is not 0 when a GPU is set

# Checklist
- [x] Added PR label
- [x] Added unit test, integration, and/or e2e tests
- [x] Tested locally
- [ ] Updated documentation
- [ ] Update Swagger spec if the PR introduce API changes
- [ ] Regenerated Golang and Python client if the PR introduces API
changes

# Release Notes
```release-note
Users are now no longer able to select a GPU without setting a non-zero GPU request value.
```
  • Loading branch information
deadlycoconuts authored and leonlnj committed Feb 20, 2024
1 parent e4a0a01 commit b138f39
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 15 deletions.
34 changes: 20 additions & 14 deletions api/cluster/resource/templater.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,20 +222,26 @@ func (t *InferenceServiceTemplater) createPredictorSpec(modelService *models.Ser

nodeSelector := map[string]string{}
tolerations := []corev1.Toleration{}
if modelService.ResourceRequest.GPUName != "" && !modelService.ResourceRequest.GPURequest.IsZero() {
// Look up to the GPU resource type and quantity from DeploymentConfig
for _, gpuConfig := range t.deploymentConfig.GPUs {
if gpuConfig.Name == modelService.ResourceRequest.GPUName {
// Declare and initialize resourceType and resourceQuantity variables
resourceType := corev1.ResourceName(gpuConfig.ResourceType)
resourceQuantity := modelService.ResourceRequest.GPURequest

// Set the resourceType as the key in the maps, with resourceQuantity as the value
resources.Requests[resourceType] = resourceQuantity
resources.Limits[resourceType] = resourceQuantity

nodeSelector = gpuConfig.NodeSelector
tolerations = gpuConfig.Tolerations

if modelService.ResourceRequest.GPUName != "" {
if modelService.ResourceRequest.GPURequest.IsZero() {
// This should never be set as zero if a GPU name is specified
return kservev1beta1.PredictorSpec{}, fmt.Errorf("GPU request cannot set as be 0")
} else {
// Look up to the GPU resource type and quantity from DeploymentConfig
for _, gpuConfig := range t.deploymentConfig.GPUs {
if gpuConfig.Name == modelService.ResourceRequest.GPUName {
// Declare and initialize resourceType and resourceQuantity variables
resourceType := corev1.ResourceName(gpuConfig.ResourceType)
resourceQuantity := modelService.ResourceRequest.GPURequest

// Set the resourceType as the key in the maps, with resourceQuantity as the value
resources.Requests[resourceType] = resourceQuantity
resources.Limits[resourceType] = resourceQuantity

nodeSelector = gpuConfig.NodeSelector
tolerations = gpuConfig.Tolerations
}
}
}
}
Expand Down
27 changes: 27 additions & 0 deletions api/cluster/resource/templater_gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
},
}

invalidResourceRequest := &models.ResourceRequest{
MinReplica: 1,
MaxReplica: 2,
CPURequest: resource.MustParse("500m"),
MemoryRequest: resource.MustParse("500Mi"),
GPUName: "NVIDIA P4",
}

queueResourcePercentage := "2"
storageUri := fmt.Sprintf("%s/model", modelSvc.ArtifactURI)

Expand Down Expand Up @@ -1563,6 +1571,25 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
},
},
},
{
name: "invalid resource request with 0 GPU requested",
modelSvc: &models.Service{
Name: modelSvc.Name,
ModelName: modelSvc.ModelName,
ModelVersion: modelSvc.ModelVersion,
Namespace: project.Name,
ArtifactURI: modelSvc.ArtifactURI,
Type: models.ModelTypeTensorflow,
Options: &models.ModelOption{},
Metadata: modelSvc.Metadata,
Protocol: protocol.HttpJson,
ResourceRequest: invalidResourceRequest,
},
resourcePercentage: queueResourcePercentage,
deploymentScale: defaultDeploymentScale,
exp: &kservev1beta1.InferenceService{},
wantErr: true,
},
}

for _, tt := range tests {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ export const ResourcesPanel = ({
return;
}
onChange("gpu_name")(gpu_name);
onChange("gpu_request")(undefined);
onChange("gpu_request")(gpus[gpu_name].values[0]);
onChange("min_monthly_cost_per_gpu")(
gpus[gpu_name].min_monthly_cost_per_gpu
);
Expand Down

0 comments on commit b138f39

Please sign in to comment.