Skip to content

Commit

Permalink
fix nproc env in elastic mode for pytorchjob (#1948)
Browse files: browse the repository at this point in the history
  • Loading branch information
kuizhiqing authored Nov 20, 2023
1 parent 230bfb4 commit 2856aa0
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions pkg/controller.v1/pytorch/envvar.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -90,16 +90,19 @@ func setPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype,
Name: "RANK",
Value: strconv.Itoa(rank),
})
podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
Name: EnvNprocPerNode,
Value: *pytorchjob.Spec.NprocPerNode,
})
podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
Name: EnvNodeRank,
Value: strconv.Itoa(rank),
})
}

if pytorchjob.Spec.NprocPerNode != nil {
podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
Name: EnvNprocPerNode,
Value: *pytorchjob.Spec.NprocPerNode,
})
}

// Set the elastic environment variables if the elasticPolicy is not nil.
// In elastic mode, nnodes is set as a range, e.g. nnodes=1:4;
// otherwise, nnodes is set as a single int, e.g. nnodes=2.
Expand Down

0 comments on commit 2856aa0

Please sign in to comment.