From b138f39acdb0b60f492b421a8007085d7bc4179c Mon Sep 17 00:00:00 2001 From: Ewe Zi Yi <36802364+deadlycoconuts@users.noreply.github.com> Date: Wed, 14 Feb 2024 17:54:02 +0800 Subject: [PATCH] fix(api): Change default GPU Request value and prevent null values from being accepted (#533) # Description As of present, users are allowed to submit null or `0` values as a GPU request value even when a GPU is configured (when the GPU name field is set) for Merlin models. This causes the Merlin model to be deployed on a regular default node without the proper node selectors or tolerations applied which is an unexpected behaviour from users. In no scenario would a user intend to have 0 GPUs selected for a specific GPU type when he/she intends to use a GPU. However, this is not prevented by the API server nor the UI. A user using the UI to change his/her GPU configuration might thus potentially end up redeploying his/her model in a regular node despite thinking that it has been configured to work with a GPU. This PR addresses this issue with two approaches, 1) to prevent the 'None' GPU request value from being shown in the UI as a default value when a GPU is selected and to show the first available GPU request option instead and, 2) to make the API server throw an error if it receives any deployment request not having a GPU request value set when a GPU is configured. The snippet below shows how the UI changes would look like: https://github.com/caraml-dev/merlin/assets/36802364/e599b373-888c-45b8-9424-85282e035465 # Modifications - `ui/src/pages/version/components/forms/components/ResourcesPanel.js` - Display first available GPU request value (set in the Helm chart values of the Merlin API server deployment) when a new GPU name is selected - `api/cluster/resource/templater.go` - Add an additional check to ensure that the GPU request value is not 0 when a GPU is set # Checklist - [x] Added PR label - [x] Added unit test, integration, and/or e2e tests - [x] Tested locally - [ ] Updated documentation - [ ] Update Swagger spec if the PR introduce API changes - [ ] Regenerated Golang and Python client if the PR introduces API changes # Release Notes ```release-note Users are now no longer able to select a GPU without setting a non-zero GPU request value. ``` --- api/cluster/resource/templater.go | 34 +++++++++++-------- api/cluster/resource/templater_gpu_test.go | 27 +++++++++++++++ .../forms/components/ResourcesPanel.js | 2 +- 3 files changed, 48 insertions(+), 15 deletions(-) diff --git a/api/cluster/resource/templater.go b/api/cluster/resource/templater.go index b4cd26c9c..95ffd54fc 100644 --- a/api/cluster/resource/templater.go +++ b/api/cluster/resource/templater.go @@ -222,20 +222,26 @@ func (t *InferenceServiceTemplater) createPredictorSpec(modelService *models.Ser nodeSelector := map[string]string{} tolerations := []corev1.Toleration{} - if modelService.ResourceRequest.GPUName != "" && !modelService.ResourceRequest.GPURequest.IsZero() { - // Look up to the GPU resource type and quantity from DeploymentConfig - for _, gpuConfig := range t.deploymentConfig.GPUs { - if gpuConfig.Name == modelService.ResourceRequest.GPUName { - // Declare and initialize resourceType and resourceQuantity variables - resourceType := corev1.ResourceName(gpuConfig.ResourceType) - resourceQuantity := modelService.ResourceRequest.GPURequest - - // Set the resourceType as the key in the maps, with resourceQuantity as the value - resources.Requests[resourceType] = resourceQuantity - resources.Limits[resourceType] = resourceQuantity - - nodeSelector = gpuConfig.NodeSelector - tolerations = gpuConfig.Tolerations + + if modelService.ResourceRequest.GPUName != "" { + if modelService.ResourceRequest.GPURequest.IsZero() { + // This should never be set as zero if a GPU name is specified + return kservev1beta1.PredictorSpec{}, fmt.Errorf("GPU request cannot set as be 0") + } else { + // Look up to the GPU resource type and quantity from DeploymentConfig + for _, gpuConfig := range t.deploymentConfig.GPUs { + if gpuConfig.Name == modelService.ResourceRequest.GPUName { + // Declare and initialize resourceType and resourceQuantity variables + resourceType := corev1.ResourceName(gpuConfig.ResourceType) + resourceQuantity := modelService.ResourceRequest.GPURequest + + // Set the resourceType as the key in the maps, with resourceQuantity as the value + resources.Requests[resourceType] = resourceQuantity + resources.Limits[resourceType] = resourceQuantity + + nodeSelector = gpuConfig.NodeSelector + tolerations = gpuConfig.Tolerations + } } } } diff --git a/api/cluster/resource/templater_gpu_test.go b/api/cluster/resource/templater_gpu_test.go index 7067f682d..2faf48bee 100644 --- a/api/cluster/resource/templater_gpu_test.go +++ b/api/cluster/resource/templater_gpu_test.go @@ -106,6 +106,14 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { }, } + invalidResourceRequest := &models.ResourceRequest{ + MinReplica: 1, + MaxReplica: 2, + CPURequest: resource.MustParse("500m"), + MemoryRequest: resource.MustParse("500Mi"), + GPUName: "NVIDIA P4", + } + queueResourcePercentage := "2" storageUri := fmt.Sprintf("%s/model", modelSvc.ArtifactURI) @@ -1563,6 +1571,25 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { }, }, }, + { + name: "invalid resource request with 0 GPU requested", + modelSvc: &models.Service{ + Name: modelSvc.Name, + ModelName: modelSvc.ModelName, + ModelVersion: modelSvc.ModelVersion, + Namespace: project.Name, + ArtifactURI: modelSvc.ArtifactURI, + Type: models.ModelTypeTensorflow, + Options: &models.ModelOption{}, + Metadata: modelSvc.Metadata, + Protocol: protocol.HttpJson, + ResourceRequest: invalidResourceRequest, + }, + resourcePercentage: queueResourcePercentage, + deploymentScale: defaultDeploymentScale, + exp: &kservev1beta1.InferenceService{}, + wantErr: true, + }, } for _, tt := range tests { diff --git a/ui/src/pages/version/components/forms/components/ResourcesPanel.js b/ui/src/pages/version/components/forms/components/ResourcesPanel.js index 0f4465951..34bca68da 100644 --- a/ui/src/pages/version/components/forms/components/ResourcesPanel.js +++ b/ui/src/pages/version/components/forms/components/ResourcesPanel.js @@ -82,7 +82,7 @@ export const ResourcesPanel = ({ return; } onChange("gpu_name")(gpu_name); - onChange("gpu_request")(undefined); + onChange("gpu_request")(gpus[gpu_name].values[0]); onChange("min_monthly_cost_per_gpu")( gpus[gpu_name].min_monthly_cost_per_gpu );