Skip to content

Commit

Permalink
support adding gpu resource to ps role pods for submitting tf job (#313)
Browse files Browse the repository at this point in the history
* support adding gpu resource to ps role pods for submitting tf job

* change tfjob chart version to 0.29.0
  • Loading branch information
happy2048 authored May 8, 2020
1 parent afb9061 commit 6b19e11
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 1 deletion.
4 changes: 4 additions & 0 deletions charts/tfjob/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,7 @@
### 0.28.0

* PS annotations should be an attribute of the metadata yaml node.

### 0.29.0

* support assigning gpu resources for PS when submitting tfjobs
2 changes: 1 addition & 1 deletion charts/tfjob/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ apiVersion: v1
appVersion: "1.0"
description: A Helm chart for TFJob
name: tfjob
version: 0.28.0
version: 0.29.0
7 changes: 7 additions & 0 deletions charts/tfjob/templates/tfjob.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{{- $gpuCount := .Values.gpuCount -}}
{{- $psGpuCount := .Values.psGPU -}}
{{- $syncMode := .Values.syncMode -}}
{{- $cleanPodPolicy := .Values.cleanPodPolicy -}}
{{- $dataDirs := .Values.dataDirs -}}
Expand Down Expand Up @@ -228,6 +229,9 @@ spec:
{{- end }}
resources:
requests:
{{- if gt (int $psGpuCount) 0}}
nvidia.com/gpu: {{ .Values.psGPU | quote }}
{{- end }}
{{- if .Values.psCPU }}
cpu: {{ .Values.psCPU | quote }}
{{- end}}
Expand All @@ -238,6 +242,9 @@ spec:
rdma/hca: "1"
{{- end}}
limits:
{{- if gt (int $psGpuCount) 0}}
nvidia.com/gpu: {{ .Values.psGPU | quote }}
{{- end }}
{{- if .Values.psCPU }}
cpu: {{ .Values.psCPU | quote }}
{{- end}}
Expand Down
3 changes: 3 additions & 0 deletions cmd/arena/commands/submit_tfjob.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ func NewSubmitTFJobCommand() *cobra.Command {
command.Flags().MarkDeprecated("psCpu", "please use --ps-cpu instead")
command.Flags().StringVar(&submitArgs.PSCpu, "ps-cpu", "", "the cpu resource to use for the parameter servers, like 1 for 1 core.")

command.Flags().IntVar(&submitArgs.PSGpu, "ps-gpus", 0, "the gpu resource to use for the parameter servers, like 1 for 1 gpu.")

command.Flags().StringVar(&submitArgs.PSMemory, "psMemory", "", "the memory resource to use for the parameter servers, like 1Gi.")
command.Flags().MarkDeprecated("psMemory", "please use --ps-memory instead")
command.Flags().StringVar(&submitArgs.PSMemory, "ps-memory", "", "the memory resource to use for the parameter servers, like 1Gi.")
Expand Down Expand Up @@ -175,6 +177,7 @@ type submitTFJobArgs struct {
//WorkerNodeSelectors map[string]string `yaml:"workerNodeSelectors"` // --worker-selector
WorkerMemory string `yaml:"workerMemory"` // --workerMemory
PSCpu string `yaml:"psCPU"` // --psCpu
PSGpu int `yaml:"psGPU"` // --ps-gpus
PSMemory string `yaml:"psMemory"` // --psMemory
CleanPodPolicy string `yaml:"cleanPodPolicy"` // --cleanTaskPolicy
// For estimator, it reuses workerImage
Expand Down

0 comments on commit 6b19e11

Please sign in to comment.