diff --git a/docs/proposals/2170-kubeflow-training-v2/README.md b/docs/proposals/2170-kubeflow-training-v2/README.md index c627e89527..81ba10ecad 100644 --- a/docs/proposals/2170-kubeflow-training-v2/README.md +++ b/docs/proposals/2170-kubeflow-training-v2/README.md @@ -281,7 +281,7 @@ type TrainJob struct { type TrainJobSpec struct { // Reference to the training runtime. - TrainingRuntimeRef TrainingRuntimeRef `json:"trainingRuntimeRef"` + RuntimeRef RuntimeRef `json:"runtimeRef"` // Configuration of the desired trainer. Trainer *Trainer `json:"trainer,omitempty"` @@ -317,7 +317,7 @@ type TrainJobSpec struct { ManagedBy *string `json:"managedBy,omitempty"` } -type TrainingRuntimeRef struct { +type RuntimeRef struct { // Name of the runtime being referenced. // When namespaced-scoped TrainingRuntime is used, the TrainJob must have // the same namespace as the deployed runtime. @@ -375,7 +375,7 @@ This table explains the rationale for each `TrainJob` parameter: - TrainingRuntimeRef + RuntimeRef Reference to the existing TrainingRuntime that is pre-deployed by platform engineers @@ -430,7 +430,7 @@ metadata: name: torch-ddp namespace: tenant-alpha spec: - trainingRuntimeRef: + runtimeRef: name: torch-distributed-multi-node trainer: image: docker.io/custom-training @@ -488,7 +488,7 @@ metadata: name: tune-llama-with-yelp namespace: tenant-alpha spec: - trainingRuntimeRef: + runtimeRef: name: torch-tune-llama-7b datasetConfig: storageUri: s3://dataset/custom-dataset/yelp-review @@ -890,7 +890,7 @@ metadata: name: pytorch-distributed namespace: tenant-alpha spec: - trainingRuntimeRef: + runtimeRef: name: pytorch-distributed-gpu trainer: image: docker.io/custom-training @@ -939,7 +939,7 @@ to control versions of `TrainingRuntime` and enable rolling updates. We are going to create two CRDs: `TrainingRuntime` and `ClusterTrainingRuntime`. These runtimes have exactly the same APIs, but the first one is the namespace-scoped, the second is the cluster-scoped. 
-User can set the `kind` and `apiGroup` parameters in the `trainingRuntimeRef` to use +Users can set the `kind` and `apiGroup` parameters in the `runtimeRef` to use the `TrainingRuntime` from the `TrainJob's` namespace, otherwise the `ClusterTrainingRuntime` will be used. @@ -1228,7 +1228,7 @@ metadata: name: torch-test namespace: tenant-alpha spec: - trainingRuntimeRef: + runtimeRef: name: torch-distributed-multi-node trainer: resourcesPerNode: @@ -1698,7 +1698,7 @@ Note that we should implement the status transitions validations to once we supp ### Support Multiple API Versions of TrainingRuntime -We can consider to introduce the `version` field for runtime API version to the `.spec.trainingRuntimeRef` +We can consider introducing the `version` field for runtime API version to the `.spec.runtimeRef` so that we can support multiple API versions of TrainingRuntime. It could mitigate the pain points when users upgrade the older API Version to newer API Version like alpha to beta. @@ -1706,7 +1706,7 @@ But, we do not aim to support both Alpha and Beta versions or both first Alpha a Hence, the `version` field was not introduced. ```go -type TrainingRuntimeRef struct { +type RuntimeRef struct { [...] // APIVersion is the apiVersion for the runtime. diff --git a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml index 6da29d74d0..f28b08b6c5 100644 --- a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml +++ b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml @@ -19,7 +19,7 @@ spec: openAPIV3Schema: description: |- ClusterTrainingRuntime represents a training runtime which can be referenced as part of - `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced + `runtimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced by TrainJob that created in *any* namespace. 
properties: apiVersion: diff --git a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml index 90f7d8e209..d5769ed073 100644 --- a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml +++ b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml @@ -19,7 +19,7 @@ spec: openAPIV3Schema: description: |- TrainingRuntime represents a training runtime which can be referenced as part of - `trainingRuntimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced + `runtimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced by TrainJob that created in the *same* namespace as the TrainingRuntime. properties: apiVersion: diff --git a/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml index 771b30ebdd..d0e0e6f86e 100644 --- a/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml +++ b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml @@ -2732,6 +2732,29 @@ spec: - targetReplicatedJobs type: object type: array + runtimeRef: + description: Reference to the training runtime. + properties: + apiGroup: + description: |- + APIGroup of the runtime being referenced. + Defaults to `kubeflow.org`. + type: string + kind: + description: |- + Kind of the runtime being referenced. + It must be one of TrainingRuntime or ClusterTrainingRuntime. + Defaults to ClusterTrainingRuntime. + type: string + name: + description: |- + Name of the runtime being referenced. + When namespaced-scoped TrainingRuntime is used, the TrainJob must have + the same namespace as the deployed runtime. + type: string + required: + - name + type: object suspend: description: |- Whether the controller should suspend the running TrainJob. @@ -2937,31 +2960,8 @@ spec: type: object type: object type: object - trainingRuntimeRef: - description: Reference to the training runtime. 
- properties: - apiGroup: - description: |- - APIGroup of the runtime being referenced. - Defaults to `kubeflow.org`. - type: string - kind: - description: |- - Kind of the runtime being referenced. - It must be one of TrainingRuntime or ClusterTrainingRuntime. - Defaults to ClusterTrainingRuntime. - type: string - name: - description: |- - Name of the runtime being referenced. - When namespaced-scoped TrainingRuntime is used, the TrainJob must have - the same namespace as the deployed runtime. - type: string - required: - - name - type: object required: - - trainingRuntimeRef + - runtimeRef type: object status: description: Current status of TrainJob. diff --git a/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go b/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go index 10a47600a9..6cb9daabe8 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go +++ b/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go @@ -43,6 +43,7 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.PodGroupPolicy": schema_pkg_apis_kubefloworg_v2alpha1_PodGroupPolicy(ref), "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.PodGroupPolicySource": schema_pkg_apis_kubefloworg_v2alpha1_PodGroupPolicySource(ref), "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.PodSpecOverride": schema_pkg_apis_kubefloworg_v2alpha1_PodSpecOverride(ref), + "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.RuntimeRef": schema_pkg_apis_kubefloworg_v2alpha1_RuntimeRef(ref), "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TorchElasticPolicy": schema_pkg_apis_kubefloworg_v2alpha1_TorchElasticPolicy(ref), "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TorchMLPolicySource": schema_pkg_apis_kubefloworg_v2alpha1_TorchMLPolicySource(ref), 
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainJob": schema_pkg_apis_kubefloworg_v2alpha1_TrainJob(ref), @@ -52,7 +53,6 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.Trainer": schema_pkg_apis_kubefloworg_v2alpha1_Trainer(ref), "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntime": schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntime(ref), "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntimeList": schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeList(ref), - "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntimeRef": schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeRef(ref), "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntimeSpec": schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeSpec(ref), } } @@ -61,7 +61,7 @@ func schema_pkg_apis_kubefloworg_v2alpha1_ClusterTrainingRuntime(ref common.Refe return common.OpenAPIDefinition{ Schema: spec.Schema{ SchemaProps: spec.SchemaProps{ - Description: "ClusterTrainingRuntime represents a training runtime which can be referenced as part of `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced by TrainJob that created in *any* namespace.", + Description: "ClusterTrainingRuntime represents a training runtime which can be referenced as part of `runtimeRef` API in TrainJob. 
This resource is a cluster-scoped and can be referenced by TrainJob that created in *any* namespace.", Type: []string{"object"}, Properties: map[string]spec.Schema{ "kind": { @@ -703,6 +703,42 @@ func schema_pkg_apis_kubefloworg_v2alpha1_PodSpecOverride(ref common.ReferenceCa } } +func schema_pkg_apis_kubefloworg_v2alpha1_RuntimeRef(ref common.ReferenceCallback) common.OpenAPIDefinition { + return common.OpenAPIDefinition{ + Schema: spec.Schema{ + SchemaProps: spec.SchemaProps{ + Description: "RuntimeRef represents the reference to the existing training runtime.", + Type: []string{"object"}, + Properties: map[string]spec.Schema{ + "name": { + SchemaProps: spec.SchemaProps{ + Description: "Name of the runtime being referenced. When namespaced-scoped TrainingRuntime is used, the TrainJob must have the same namespace as the deployed runtime.", + Default: "", + Type: []string{"string"}, + Format: "", + }, + }, + "apiGroup": { + SchemaProps: spec.SchemaProps{ + Description: "APIGroup of the runtime being referenced. Defaults to `kubeflow.org`.", + Type: []string{"string"}, + Format: "", + }, + }, + "kind": { + SchemaProps: spec.SchemaProps{ + Description: "Kind of the runtime being referenced. It must be one of TrainingRuntime or ClusterTrainingRuntime. 
Defaults to ClusterTrainingRuntime.", + Type: []string{"string"}, + Format: "", + }, + }, + }, + Required: []string{"name"}, + }, + }, + } +} + func schema_pkg_apis_kubefloworg_v2alpha1_TorchElasticPolicy(ref common.ReferenceCallback) common.OpenAPIDefinition { return common.OpenAPIDefinition{ Schema: spec.Schema{ @@ -889,11 +925,11 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TrainJobSpec(ref common.ReferenceCallb Description: "TrainJobSpec represents specification of the desired TrainJob.", Type: []string{"object"}, Properties: map[string]spec.Schema{ - "trainingRuntimeRef": { + "runtimeRef": { SchemaProps: spec.SchemaProps{ Description: "Reference to the training runtime.", Default: map[string]interface{}{}, - Ref: ref("github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntimeRef"), + Ref: ref("github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.RuntimeRef"), }, }, "trainer": { @@ -975,11 +1011,11 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TrainJobSpec(ref common.ReferenceCallb }, }, }, - Required: []string{"trainingRuntimeRef"}, + Required: []string{"runtimeRef"}, }, }, Dependencies: []string{ - "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.DatasetConfig", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.ModelConfig", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.PodSpecOverride", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.Trainer", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntimeRef"}, + "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.DatasetConfig", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.ModelConfig", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.PodSpecOverride", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.RuntimeRef", 
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.Trainer"}, } } @@ -1116,7 +1152,7 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntime(ref common.ReferenceCa return common.OpenAPIDefinition{ Schema: spec.Schema{ SchemaProps: spec.SchemaProps{ - Description: "TrainingRuntime represents a training runtime which can be referenced as part of `trainingRuntimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced by TrainJob that created in the *same* namespace as the TrainingRuntime.", + Description: "TrainingRuntime represents a training runtime which can be referenced as part of `runtimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced by TrainJob that created in the *same* namespace as the TrainingRuntime.", Type: []string{"object"}, Properties: map[string]spec.Schema{ "kind": { @@ -1206,42 +1242,6 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeList(ref common.Referen } } -func schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeRef(ref common.ReferenceCallback) common.OpenAPIDefinition { - return common.OpenAPIDefinition{ - Schema: spec.Schema{ - SchemaProps: spec.SchemaProps{ - Description: "TrainingRuntimeRef represents the reference to the existing training runtime.", - Type: []string{"object"}, - Properties: map[string]spec.Schema{ - "name": { - SchemaProps: spec.SchemaProps{ - Description: "Name of the runtime being referenced. When namespaced-scoped TrainingRuntime is used, the TrainJob must have the same namespace as the deployed runtime.", - Default: "", - Type: []string{"string"}, - Format: "", - }, - }, - "apiGroup": { - SchemaProps: spec.SchemaProps{ - Description: "APIGroup of the runtime being referenced. Defaults to `kubeflow.org`.", - Type: []string{"string"}, - Format: "", - }, - }, - "kind": { - SchemaProps: spec.SchemaProps{ - Description: "Kind of the runtime being referenced. It must be one of TrainingRuntime or ClusterTrainingRuntime. 
Defaults to ClusterTrainingRuntime.", - Type: []string{"string"}, - Format: "", - }, - }, - }, - Required: []string{"name"}, - }, - }, - } -} - func schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeSpec(ref common.ReferenceCallback) common.OpenAPIDefinition { return common.OpenAPIDefinition{ Schema: spec.Schema{ diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index 318d22be0d..c25e2c49b3 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -38,7 +38,7 @@ const ( // +kubebuilder:resource:scope=Cluster // ClusterTrainingRuntime represents a training runtime which can be referenced as part of -// `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced +// `runtimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced // by TrainJob that created in *any* namespace. type ClusterTrainingRuntime struct { metav1.TypeMeta `json:",inline"` @@ -72,7 +72,7 @@ type ClusterTrainingRuntimeList struct { // +kubebuilder:storageversion // TrainingRuntime represents a training runtime which can be referenced as part of -// `trainingRuntimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced +// `runtimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced // by TrainJob that created in the *same* namespace as the TrainingRuntime. type TrainingRuntime struct { metav1.TypeMeta `json:",inline"` diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go index f77f50de95..0a2f95770d 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go @@ -63,7 +63,7 @@ type TrainJobList struct { // TrainJobSpec represents specification of the desired TrainJob. 
type TrainJobSpec struct { // Reference to the training runtime. - TrainingRuntimeRef TrainingRuntimeRef `json:"trainingRuntimeRef"` + RuntimeRef RuntimeRef `json:"runtimeRef"` // Configuration of the desired trainer. Trainer *Trainer `json:"trainer,omitempty"` @@ -99,8 +99,8 @@ type TrainJobSpec struct { ManagedBy *string `json:"managedBy,omitempty"` } -// TrainingRuntimeRef represents the reference to the existing training runtime. -type TrainingRuntimeRef struct { +// RuntimeRef represents the reference to the existing training runtime. +type RuntimeRef struct { // Name of the runtime being referenced. // When namespaced-scoped TrainingRuntime is used, the TrainJob must have // the same namespace as the deployed runtime. diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go index 65773f998e..2c87b2a838 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go @@ -460,6 +460,31 @@ func (in *PodSpecOverride) DeepCopy() *PodSpecOverride { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RuntimeRef) DeepCopyInto(out *RuntimeRef) { + *out = *in + if in.APIGroup != nil { + in, out := &in.APIGroup, &out.APIGroup + *out = new(string) + **out = **in + } + if in.Kind != nil { + in, out := &in.Kind, &out.Kind + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RuntimeRef. +func (in *RuntimeRef) DeepCopy() *RuntimeRef { + if in == nil { + return nil + } + out := new(RuntimeRef) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *TorchElasticPolicy) DeepCopyInto(out *TorchElasticPolicy) { *out = *in @@ -584,7 +609,7 @@ func (in *TrainJobList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TrainJobSpec) DeepCopyInto(out *TrainJobSpec) { *out = *in - in.TrainingRuntimeRef.DeepCopyInto(&out.TrainingRuntimeRef) + in.RuntimeRef.DeepCopyInto(&out.RuntimeRef) if in.Trainer != nil { in, out := &in.Trainer, &out.Trainer *out = new(Trainer) @@ -780,31 +805,6 @@ func (in *TrainingRuntimeList) DeepCopyObject() runtime.Object { return nil } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TrainingRuntimeRef) DeepCopyInto(out *TrainingRuntimeRef) { - *out = *in - if in.APIGroup != nil { - in, out := &in.APIGroup, &out.APIGroup - *out = new(string) - **out = **in - } - if in.Kind != nil { - in, out := &in.Kind, &out.Kind - *out = new(string) - **out = **in - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeRef. -func (in *TrainingRuntimeRef) DeepCopy() *TrainingRuntimeRef { - if in == nil { - return nil - } - out := new(TrainingRuntimeRef) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *TrainingRuntimeSpec) DeepCopyInto(out *TrainingRuntimeSpec) { *out = *in diff --git a/pkg/runtime.v2/core/clustertrainingruntime.go b/pkg/runtime.v2/core/clustertrainingruntime.go index de819363b5..35c35fe0c9 100644 --- a/pkg/runtime.v2/core/clustertrainingruntime.go +++ b/pkg/runtime.v2/core/clustertrainingruntime.go @@ -53,7 +53,7 @@ func NewClusterTrainingRuntime(context.Context, client.Client, client.FieldIndex func (r *ClusterTrainingRuntime) NewObjects(ctx context.Context, trainJob *kubeflowv2.TrainJob) ([]client.Object, error) { var clTrainingRuntime kubeflowv2.ClusterTrainingRuntime - if err := r.client.Get(ctx, client.ObjectKey{Name: trainJob.Spec.TrainingRuntimeRef.Name}, &clTrainingRuntime); err != nil { + if err := r.client.Get(ctx, client.ObjectKey{Name: trainJob.Spec.RuntimeRef.Name}, &clTrainingRuntime); err != nil { return nil, fmt.Errorf("%w: %w", errorNotFoundSpecifiedClusterTrainingRuntime, err) } return r.buildObjects(ctx, trainJob, clTrainingRuntime.Spec.Template, clTrainingRuntime.Spec.MLPolicy, clTrainingRuntime.Spec.PodGroupPolicy) @@ -66,10 +66,10 @@ func (r *ClusterTrainingRuntime) EventHandlerRegistrars() []runtime.ReconcilerBu func (r *ClusterTrainingRuntime) ValidateObjects(ctx context.Context, old, new *kubeflowv2.TrainJob) (admission.Warnings, field.ErrorList) { if err := r.client.Get(ctx, client.ObjectKey{ Namespace: old.Namespace, - Name: old.Spec.TrainingRuntimeRef.Name, + Name: old.Spec.RuntimeRef.Name, }, &kubeflowv2.ClusterTrainingRuntime{}); err != nil { return nil, field.ErrorList{ - field.Invalid(field.NewPath("spec", "trainingRuntimeRef"), old.Spec.TrainingRuntimeRef, + field.Invalid(field.NewPath("spec", "runtimeRef"), old.Spec.RuntimeRef, fmt.Sprintf("%v: specified clusterTrainingRuntime must be created before the TrainJob is created", err)), } } diff --git a/pkg/runtime.v2/core/clustertrainingruntime_test.go b/pkg/runtime.v2/core/clustertrainingruntime_test.go index 5665c10fe5..696d486ab5 100644 --- 
a/pkg/runtime.v2/core/clustertrainingruntime_test.go +++ b/pkg/runtime.v2/core/clustertrainingruntime_test.go @@ -47,7 +47,7 @@ func TestClusterTrainingRuntimeNewObjects(t *testing.T) { "succeeded to build JobSet and PodGroup": { trainJob: testingutil.MakeTrainJobWrapper(metav1.NamespaceDefault, "test-job"). UID("uid"). - TrainingRuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.ClusterTrainingRuntimeKind), "test-runtime"). + RuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.ClusterTrainingRuntimeKind), "test-runtime"). Trainer( testingutil.MakeTrainJobTrainerWrapper(). ContainerImage("test:trainjob"). @@ -93,7 +93,7 @@ func TestClusterTrainingRuntimeNewObjects(t *testing.T) { "missing trainingRuntime resource": { trainJob: testingutil.MakeTrainJobWrapper(metav1.NamespaceDefault, "test-job"). UID("uid"). - TrainingRuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.ClusterTrainingRuntimeKind), "test-runtime"). + RuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.ClusterTrainingRuntimeKind), "test-runtime"). Trainer( testingutil.MakeTrainJobTrainerWrapper(). ContainerImage("test:trainjob"). 
diff --git a/pkg/runtime.v2/core/trainingruntime.go b/pkg/runtime.v2/core/trainingruntime.go index 1597bbf0a8..621d4eb533 100644 --- a/pkg/runtime.v2/core/trainingruntime.go +++ b/pkg/runtime.v2/core/trainingruntime.go @@ -55,10 +55,10 @@ var _ runtime.Runtime = (*TrainingRuntime)(nil) var trainingRuntimeFactory *TrainingRuntime func NewTrainingRuntime(ctx context.Context, c client.Client, indexer client.FieldIndexer) (runtime.Runtime, error) { - if err := indexer.IndexField(ctx, &kubeflowv2.TrainJob{}, idxer.TrainJobTrainingRuntimeRefKey, idxer.IndexTrainJobTrainingRuntime); err != nil { + if err := indexer.IndexField(ctx, &kubeflowv2.TrainJob{}, idxer.TrainJobRuntimeRefKey, idxer.IndexTrainJobTrainingRuntime); err != nil { return nil, fmt.Errorf("setting index on TrainingRuntime for TrainJob: %w", err) } - if err := indexer.IndexField(ctx, &kubeflowv2.TrainJob{}, idxer.TrainJobClusterTrainingRuntimeRefKey, idxer.IndexTrainJobClusterTrainingRuntime); err != nil { + if err := indexer.IndexField(ctx, &kubeflowv2.TrainJob{}, idxer.TrainJobClusterRuntimeRefKey, idxer.IndexTrainJobClusterTrainingRuntime); err != nil { return nil, fmt.Errorf("setting index on ClusterTrainingRuntime for TrainJob: %w", err) } fwk, err := fwkcore.New(ctx, c, fwkplugins.NewRegistry(), indexer) @@ -74,7 +74,7 @@ func NewTrainingRuntime(ctx context.Context, c client.Client, indexer client.Fie func (r *TrainingRuntime) NewObjects(ctx context.Context, trainJob *kubeflowv2.TrainJob) ([]client.Object, error) { var trainingRuntime kubeflowv2.TrainingRuntime - err := r.client.Get(ctx, client.ObjectKey{Namespace: trainJob.Namespace, Name: trainJob.Spec.TrainingRuntimeRef.Name}, &trainingRuntime) + err := r.client.Get(ctx, client.ObjectKey{Namespace: trainJob.Namespace, Name: trainJob.Spec.RuntimeRef.Name}, &trainingRuntime) if err != nil { return nil, fmt.Errorf("%w: %w", errorNotFoundSpecifiedTrainingRuntime, err) } @@ -139,10 +139,10 @@ func (r *TrainingRuntime) EventHandlerRegistrars() 
[]runtime.ReconcilerBuilder { func (r *TrainingRuntime) ValidateObjects(ctx context.Context, old, new *kubeflowv2.TrainJob) (admission.Warnings, field.ErrorList) { if err := r.client.Get(ctx, client.ObjectKey{ Namespace: old.Namespace, - Name: old.Spec.TrainingRuntimeRef.Name, + Name: old.Spec.RuntimeRef.Name, }, &kubeflowv2.TrainingRuntime{}); err != nil { return nil, field.ErrorList{ - field.Invalid(field.NewPath("spec", "trainingRuntimeRef"), old.Spec.TrainingRuntimeRef, + field.Invalid(field.NewPath("spec", "runtimeRef"), old.Spec.RuntimeRef, fmt.Sprintf("%v: specified trainingRuntime must be created before the TrainJob is created", err)), } } diff --git a/pkg/runtime.v2/core/trainingruntime_test.go b/pkg/runtime.v2/core/trainingruntime_test.go index a3bd63efa6..a32ad33852 100644 --- a/pkg/runtime.v2/core/trainingruntime_test.go +++ b/pkg/runtime.v2/core/trainingruntime_test.go @@ -47,7 +47,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) { "succeeded to build JobSet and PodGroup": { trainJob: testingutil.MakeTrainJobWrapper(metav1.NamespaceDefault, "test-job"). UID("uid"). - TrainingRuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainingRuntimeKind), "test-runtime"). + RuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainingRuntimeKind), "test-runtime"). SpecLabel("conflictLabel", "override"). SpecAnnotation("conflictAnnotation", "override"). Trainer( @@ -100,7 +100,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) { "missing trainingRuntime resource": { trainJob: testingutil.MakeTrainJobWrapper(metav1.NamespaceDefault, "test-job"). UID("uid"). - TrainingRuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainingRuntimeKind), "test-runtime"). + RuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainingRuntimeKind), "test-runtime"). Trainer( testingutil.MakeTrainJobTrainerWrapper(). ContainerImage("test:trainjob"). 
diff --git a/pkg/runtime.v2/framework/plugins/coscheduling/coscheduling.go b/pkg/runtime.v2/framework/plugins/coscheduling/coscheduling.go index 36b7d6813d..aa0ef3b3a6 100644 --- a/pkg/runtime.v2/framework/plugins/coscheduling/coscheduling.go +++ b/pkg/runtime.v2/framework/plugins/coscheduling/coscheduling.go @@ -207,7 +207,7 @@ func (h *PodGroupRuntimeClassHandler) queueSuspendedTrainJobs(ctx context.Contex var trainJobs []kubeflowv2.TrainJob for _, trainingRuntime := range trainingRuntimes.Items { var trainJobsWithTrainingRuntime kubeflowv2.TrainJobList - err := h.client.List(ctx, &trainJobsWithTrainingRuntime, client.MatchingFields{runtimeindexer.TrainJobTrainingRuntimeRefKey: trainingRuntime.Name}) + err := h.client.List(ctx, &trainJobsWithTrainingRuntime, client.MatchingFields{runtimeindexer.TrainJobRuntimeRefKey: trainingRuntime.Name}) if err != nil { return err } @@ -215,7 +215,7 @@ func (h *PodGroupRuntimeClassHandler) queueSuspendedTrainJobs(ctx context.Contex } for _, clusterTrainingRuntime := range clusterTrainingRuntimes.Items { var trainJobsWithClTrainingRuntime kubeflowv2.TrainJobList - err := h.client.List(ctx, &trainJobsWithClTrainingRuntime, client.MatchingFields{runtimeindexer.TrainJobClusterTrainingRuntimeRefKey: clusterTrainingRuntime.Name}) + err := h.client.List(ctx, &trainJobsWithClTrainingRuntime, client.MatchingFields{runtimeindexer.TrainJobClusterRuntimeRefKey: clusterTrainingRuntime.Name}) if err != nil { return err } diff --git a/pkg/runtime.v2/indexer/indexer.go b/pkg/runtime.v2/indexer/indexer.go index dacbfcd050..730f4267de 100644 --- a/pkg/runtime.v2/indexer/indexer.go +++ b/pkg/runtime.v2/indexer/indexer.go @@ -24,8 +24,8 @@ import ( ) const ( - TrainJobTrainingRuntimeRefKey = ".spec.trainingRuntimeRef.kind=trainingRuntime" - TrainJobClusterTrainingRuntimeRefKey = ".spec.trainingRuntimeRef.kind=clusterTrainingRuntime" + TrainJobRuntimeRefKey = ".spec.runtimeRef.kind=trainingRuntime" + TrainJobClusterRuntimeRefKey = 
".spec.runtimeRef.kind=clusterTrainingRuntime" ) func IndexTrainJobTrainingRuntime(obj client.Object) []string { @@ -33,9 +33,9 @@ func IndexTrainJobTrainingRuntime(obj client.Object) []string { if !ok { return nil } - if ptr.Deref(trainJob.Spec.TrainingRuntimeRef.APIGroup, "") == kubeflowv2.GroupVersion.Group && - ptr.Deref(trainJob.Spec.TrainingRuntimeRef.Kind, "") == kubeflowv2.TrainingRuntimeKind { - return []string{trainJob.Spec.TrainingRuntimeRef.Name} + if ptr.Deref(trainJob.Spec.RuntimeRef.APIGroup, "") == kubeflowv2.GroupVersion.Group && + ptr.Deref(trainJob.Spec.RuntimeRef.Kind, "") == kubeflowv2.TrainingRuntimeKind { + return []string{trainJob.Spec.RuntimeRef.Name} } return nil } @@ -45,9 +45,9 @@ func IndexTrainJobClusterTrainingRuntime(obj client.Object) []string { if !ok { return nil } - if ptr.Deref(trainJob.Spec.TrainingRuntimeRef.APIGroup, "") == kubeflowv2.GroupVersion.Group && - ptr.Deref(trainJob.Spec.TrainingRuntimeRef.Kind, "") == kubeflowv2.ClusterTrainingRuntimeKind { - return []string{trainJob.Spec.TrainingRuntimeRef.Name} + if ptr.Deref(trainJob.Spec.RuntimeRef.APIGroup, "") == kubeflowv2.GroupVersion.Group && + ptr.Deref(trainJob.Spec.RuntimeRef.Kind, "") == kubeflowv2.ClusterTrainingRuntimeKind { + return []string{trainJob.Spec.RuntimeRef.Name} } return nil } diff --git a/pkg/util.v2/testing/wrapper.go b/pkg/util.v2/testing/wrapper.go index df3f8cbfce..3be7f4f194 100644 --- a/pkg/util.v2/testing/wrapper.go +++ b/pkg/util.v2/testing/wrapper.go @@ -249,8 +249,8 @@ func (t *TrainJobWrapper) Trainer(trainer *kubeflowv2.Trainer) *TrainJobWrapper return t } -func (t *TrainJobWrapper) TrainingRuntimeRef(gvk schema.GroupVersionKind, name string) *TrainJobWrapper { - t.Spec.TrainingRuntimeRef = kubeflowv2.TrainingRuntimeRef{ +func (t *TrainJobWrapper) RuntimeRef(gvk schema.GroupVersionKind, name string) *TrainJobWrapper { + t.Spec.RuntimeRef = kubeflowv2.RuntimeRef{ APIGroup: &gvk.Group, Kind: &gvk.Kind, Name: name,