Skip to content

Commit

Permalink
KEP-2170: Implement Initializer builders in the JobSet plugin (#2316)
Browse files Browse the repository at this point in the history
* KEP-2170: Implement Initializer builder in the JobSet plugin

Signed-off-by: Andrey Velichkevich <[email protected]>

* Update the SDK models

Signed-off-by: Andrey Velichkevich <[email protected]>

* Remove Info from Initializer builder

Signed-off-by: Andrey Velichkevich <[email protected]>

* Update manifests

Signed-off-by: Andrey Velichkevich <[email protected]>

* Update pkg/constants/constants.go

Co-authored-by: Yuki Iwai <[email protected]>
Signed-off-by: Andrey Velichkevich <[email protected]>

* Use var for envs

Signed-off-by: Andrey Velichkevich <[email protected]>

* Remove check manifests from GitHub actions

Signed-off-by: Andrey Velichkevich <[email protected]>

* Move consts to JobSet plugin

Signed-off-by: Andrey Velichkevich <[email protected]>

---------

Signed-off-by: Andrey Velichkevich <[email protected]>
Co-authored-by: Yuki Iwai <[email protected]>
  • Loading branch information
andreyvelich and tenzen-y authored Nov 1, 2024
1 parent 7c5ea70 commit 2e1e125
Show file tree
Hide file tree
Showing 24 changed files with 595 additions and 143 deletions.
10 changes: 3 additions & 7 deletions .github/workflows/test-go.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,10 @@ jobs:
run: |
go mod tidy && pushd hack/swagger && go mod tidy && popd && git add go.* &&
git diff --cached --exit-code || (echo 'Please run "go mod tidy" to sync Go modules' && exit 1);
- name: Check manifests
- name: Check auto-generated assets
run: |
make manifests && git add manifests &&
git diff --cached --exit-code || (echo 'Please run "make manifests" to generate manifests' && exit 1);
- name: Check auto-generated codes
run: |
make generate && git add pkg sdk &&
git diff --cached --exit-code || (echo 'Please run "make generate" to generate Go codes' && exit 1);
make generate && git add pkg sdk manifests &&
git diff --cached --exit-code || (echo 'Please run "make generate" to generate assets' && exit 1);
- name: Verify gofmt
run: |
make fmt && git add pkg cmd &&
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and Cust
output:rbac:artifacts:config=manifests/v2/base/rbac \
output:webhook:artifacts:config=manifests/v2/base/webhook

generate: controller-gen ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
generate: controller-gen manifests ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
$(CONTROLLER_GEN) object:headerFile="hack/boilerplate/boilerplate.go.txt" paths="./pkg/apis/..."
hack/update-codegen.sh
hack/python-sdk/gen-sdk.sh
Expand Down
12 changes: 6 additions & 6 deletions api.v2/openapi-spec/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@
}
},
"secretRef": {
"description": "Reference to the TrainJob's secrets to download dataset.",
"$ref": "#/definitions/v1.SecretReference"
"description": "Reference to the secret with credentials to download dataset. Secret must be created in the TrainJob's namespace.",
"$ref": "#/definitions/v1.LocalObjectReference"
},
"storageUri": {
"description": "Storage uri for the dataset provider.",
Expand All @@ -159,8 +159,8 @@
}
},
"secretRef": {
"description": "Reference to the TrainJob's secrets to download model.",
"$ref": "#/definitions/v1.SecretReference"
"description": "Reference to the secret with credentials to download model. Secret must be created in the TrainJob's namespace.",
"$ref": "#/definitions/v1.LocalObjectReference"
},
"storageUri": {
"description": "Storage uri for the model provider.",
Expand Down Expand Up @@ -315,8 +315,8 @@
}
},
"secretRef": {
"description": "Reference to the TrainJob's secrets to export model.",
"$ref": "#/definitions/v1.SecretReference"
"description": "Reference to the secret with credentials to export model. Secret must be created in the TrainJob's namespace.",
"$ref": "#/definitions/v1.LocalObjectReference"
},
"storageUri": {
"description": "Storage uri for the model exporter.",
Expand Down
6 changes: 3 additions & 3 deletions docs/proposals/2170-kubeflow-training-v2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,7 @@ type DatasetConfig struct {
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to download dataset.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}
```

Expand Down Expand Up @@ -665,7 +665,7 @@ type InputModel struct {
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to download model.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}

type OutputModel struct {
Expand All @@ -677,7 +677,7 @@ type OutputModel struct {
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to export model.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}
```

Expand Down
44 changes: 21 additions & 23 deletions manifests/v2/base/crds/kubeflow.org_trainjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -173,15 +173,15 @@ spec:
type: object
type: array
secretRef:
description: Reference to the TrainJob's secrets to download dataset.
description: |-
Reference to the secret with credentials to download dataset.
Secret must be created in the TrainJob's namespace.
properties:
name:
description: name is unique within a namespace to reference
a secret resource.
type: string
namespace:
description: namespace defines the space within which the
secret name must be unique.
description: |-
Name of the referent.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?
type: string
type: object
x-kubernetes-map-type: atomic
Expand Down Expand Up @@ -339,16 +339,15 @@ spec:
type: object
type: array
secretRef:
description: Reference to the TrainJob's secrets to download
model.
description: |-
Reference to the secret with credentials to download model.
Secret must be created in the TrainJob's namespace.
properties:
name:
description: name is unique within a namespace to reference
a secret resource.
type: string
namespace:
description: namespace defines the space within which
the secret name must be unique.
description: |-
Name of the referent.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?
type: string
type: object
x-kubernetes-map-type: atomic
Expand Down Expand Up @@ -479,16 +478,15 @@ spec:
type: object
type: array
secretRef:
description: Reference to the TrainJob's secrets to export
model.
description: |-
Reference to the secret with credentials to export model.
Secret must be created in the TrainJob's namespace.
properties:
name:
description: name is unique within a namespace to reference
a secret resource.
type: string
namespace:
description: namespace defines the space within which
the secret name must be unique.
description: |-
Name of the referent.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?
type: string
type: object
x-kubernetes-map-type: atomic
Expand Down
18 changes: 9 additions & 9 deletions pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 9 additions & 6 deletions pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,9 @@ type DatasetConfig struct {
// These values will be merged with the TrainingRuntime's dataset initializer environments.
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to download dataset.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
// Reference to the secret with credentials to download dataset.
// Secret must be created in the TrainJob's namespace.
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}

// ModelConfig represents the desired model configuration.
Expand All @@ -193,8 +194,9 @@ type InputModel struct {
// These values will be merged with the TrainingRuntime's model initializer environments.
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to download model.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
// Reference to the secret with credentials to download model.
// Secret must be created in the TrainJob's namespace.
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}

// OutputModel represents the desired trained model configuration.
Expand All @@ -206,8 +208,9 @@ type OutputModel struct {
// These values will be merged with the TrainingRuntime's model exporter environments.
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to export model.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
// Reference to the secret with credentials to export model.
// Secret must be created in the TrainJob's namespace.
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}

// PodSpecOverride represents the custom overrides that will be applied for the TrainJob's resources.
Expand Down
6 changes: 3 additions & 3 deletions pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pkg/controller.v1/pytorch/elastic.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func GetElasticEnvVarGenerator() EnvVarGenerator {

func (e ElasticEnvVarGenerator) Generate(
job *kubeflowv1.PyTorchJob) ([]corev1.EnvVar, error) {
envVars := []corev1.EnvVar{}
var envVars []corev1.EnvVar

elasticPolicy := job.Spec.ElasticPolicy
if elasticPolicy == nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller.v1/pytorch/master.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func GetMasterEnvVarGenerator() EnvVarGenerator {

func (e MasterEnvVarGenerator) Generate(
job *kubeflowv1.PyTorchJob) ([]corev1.EnvVar, error) {
envVars := []corev1.EnvVar{}
var envVars []corev1.EnvVar
if job.Spec.PyTorchReplicaSpecs[kubeflowv1.PyTorchJobReplicaTypeMaster] != nil {
masterPort, err := getPortFromPyTorchJob(job, kubeflowv1.PyTorchJobReplicaTypeMaster)
if err != nil {
Expand Down
6 changes: 3 additions & 3 deletions pkg/runtime.v2/core/clustertrainingruntime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ func TestClusterTrainingRuntimeNewObjects(t *testing.T) {
"succeeded to build PodGroup and JobSet with NumNodes from the Runtime and container from the Trainer.": {
clusterTrainingRuntime: testingutil.MakeClusterTrainingRuntimeWrapper("test-runtime").RuntimeSpec(
testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeClusterTrainingRuntimeWrapper("test-runtime").Spec).
InitContainerDatasetModelInitializer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
NumNodes(100).
ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
ContainerDatasetModelInitializer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
PodGroupPolicyCoschedulingSchedulingTimeout(120).
Obj(),
).Obj(),
Expand All @@ -59,15 +59,15 @@ func TestClusterTrainingRuntimeNewObjects(t *testing.T) {
RuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.ClusterTrainingRuntimeKind), "test-runtime").
Trainer(
testingutil.MakeTrainJobTrainerWrapper().
ContainerTrainer("test:trainjob", []string{"trainjob"}, []string{"trainjob"}, resRequests).
Container("test:trainjob", []string{"trainjob"}, []string{"trainjob"}, resRequests).
Obj(),
).
Obj(),
wantObjs: []client.Object{
testingutil.MakeJobSetWrapper(metav1.NamespaceDefault, "test-job").
InitContainerDatasetModelInitializer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
NumNodes(100).
ContainerTrainer("test:trainjob", []string{"trainjob"}, []string{"trainjob"}, resRequests).
ContainerDatasetModelInitializer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
Suspend(true).
PodLabel(schedulerpluginsv1alpha1.PodGroupLabel, "test-job").
ControllerReference(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainJobKind), "test-job", "uid").
Expand Down
Loading

0 comments on commit 2e1e125

Please sign in to comment.