Skip to content

Commit

Permalink
fix(api): Change default GPU Request value and prevent null values from being accepted (#533)
Browse files Browse the repository at this point in the history

# Description
At present, users are allowed to submit null or `0` values as a GPU
request value even when a GPU is configured (i.e. when the GPU name field
is set) for Merlin models. This causes the Merlin model to be deployed on
a regular default node without the proper node selectors or tolerations
applied, which is unexpected behaviour for users. In no scenario would a
user intend to have 0 GPUs selected for a specific GPU type when he/she
intends to use a GPU. However, this is not prevented by either the API
server or the UI.

A user using the UI to change his/her GPU configuration might thus
potentially end up redeploying his/her model in a regular node despite
thinking that it has been configured to work with a GPU.

This PR addresses this issue with two approaches: 1) preventing the
'None' GPU request value from being shown in the UI as a default value
when a GPU is selected, showing the first available GPU request
option instead, and 2) making the API server throw an error if it
receives any deployment request that does not have a GPU request value
set when a GPU is configured.

The snippet below shows how the UI changes would look:


https://github.com/caraml-dev/merlin/assets/36802364/e599b373-888c-45b8-9424-85282e035465

# Modifications
- `ui/src/pages/version/components/forms/components/ResourcesPanel.js` -
Display first available GPU request value (set in the Helm chart values
of the Merlin API server deployment) when a new GPU name is selected
- `api/cluster/resource/templater.go` - Add an additional check to
ensure that the GPU request value is not 0 when a GPU is set

# Checklist
- [x] Added PR label
- [x] Added unit test, integration, and/or e2e tests
- [x] Tested locally
- [ ] Updated documentation
- [ ] Update Swagger spec if the PR introduce API changes
- [ ] Regenerated Golang and Python client if the PR introduces API
changes

# Release Notes
```release-note
Users are now no longer able to select a GPU without setting a non-zero GPU request value.
```
  • Loading branch information
deadlycoconuts authored and leonlnj committed Feb 20, 2024
1 parent e4a0a01 commit b138f39
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 15 deletions.
34 changes: 20 additions & 14 deletions api/cluster/resource/templater.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,20 +222,26 @@ func (t *InferenceServiceTemplater) createPredictorSpec(modelService *models.Ser

nodeSelector := map[string]string{}
tolerations := []corev1.Toleration{}
if modelService.ResourceRequest.GPUName != "" && !modelService.ResourceRequest.GPURequest.IsZero() {
// Look up to the GPU resource type and quantity from DeploymentConfig
for _, gpuConfig := range t.deploymentConfig.GPUs {
if gpuConfig.Name == modelService.ResourceRequest.GPUName {
// Declare and initialize resourceType and resourceQuantity variables
resourceType := corev1.ResourceName(gpuConfig.ResourceType)
resourceQuantity := modelService.ResourceRequest.GPURequest

// Set the resourceType as the key in the maps, with resourceQuantity as the value
resources.Requests[resourceType] = resourceQuantity
resources.Limits[resourceType] = resourceQuantity

nodeSelector = gpuConfig.NodeSelector
tolerations = gpuConfig.Tolerations

if modelService.ResourceRequest.GPUName != "" {
if modelService.ResourceRequest.GPURequest.IsZero() {
// This should never be set as zero if a GPU name is specified
return kservev1beta1.PredictorSpec{}, fmt.Errorf("GPU request cannot set as be 0")
} else {
// Look up to the GPU resource type and quantity from DeploymentConfig
for _, gpuConfig := range t.deploymentConfig.GPUs {
if gpuConfig.Name == modelService.ResourceRequest.GPUName {
// Declare and initialize resourceType and resourceQuantity variables
resourceType := corev1.ResourceName(gpuConfig.ResourceType)
resourceQuantity := modelService.ResourceRequest.GPURequest

// Set the resourceType as the key in the maps, with resourceQuantity as the value
resources.Requests[resourceType] = resourceQuantity
resources.Limits[resourceType] = resourceQuantity

nodeSelector = gpuConfig.NodeSelector
tolerations = gpuConfig.Tolerations
}
}
}
}
Expand Down
27 changes: 27 additions & 0 deletions api/cluster/resource/templater_gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
},
}

invalidResourceRequest := &models.ResourceRequest{
MinReplica: 1,
MaxReplica: 2,
CPURequest: resource.MustParse("500m"),
MemoryRequest: resource.MustParse("500Mi"),
GPUName: "NVIDIA P4",
}

queueResourcePercentage := "2"
storageUri := fmt.Sprintf("%s/model", modelSvc.ArtifactURI)

Expand Down Expand Up @@ -1563,6 +1571,25 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
},
},
},
{
name: "invalid resource request with 0 GPU requested",
modelSvc: &models.Service{
Name: modelSvc.Name,
ModelName: modelSvc.ModelName,
ModelVersion: modelSvc.ModelVersion,
Namespace: project.Name,
ArtifactURI: modelSvc.ArtifactURI,
Type: models.ModelTypeTensorflow,
Options: &models.ModelOption{},
Metadata: modelSvc.Metadata,
Protocol: protocol.HttpJson,
ResourceRequest: invalidResourceRequest,
},
resourcePercentage: queueResourcePercentage,
deploymentScale: defaultDeploymentScale,
exp: &kservev1beta1.InferenceService{},
wantErr: true,
},
}

for _, tt := range tests {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ export const ResourcesPanel = ({
return;
}
onChange("gpu_name")(gpu_name);
onChange("gpu_request")(undefined);
onChange("gpu_request")(gpus[gpu_name].values[0]);
onChange("min_monthly_cost_per_gpu")(
gpus[gpu_name].min_monthly_cost_per_gpu
);
Expand Down

0 comments on commit b138f39

Please sign in to comment.