diff --git a/docs/proposals/2170-kubeflow-training-v2/README.md b/docs/proposals/2170-kubeflow-training-v2/README.md
index c627e89527..81ba10ecad 100644
--- a/docs/proposals/2170-kubeflow-training-v2/README.md
+++ b/docs/proposals/2170-kubeflow-training-v2/README.md
@@ -281,7 +281,7 @@ type TrainJob struct {
type TrainJobSpec struct {
// Reference to the training runtime.
- TrainingRuntimeRef TrainingRuntimeRef `json:"trainingRuntimeRef"`
+ RuntimeRef RuntimeRef `json:"runtimeRef"`
// Configuration of the desired trainer.
Trainer *Trainer `json:"trainer,omitempty"`
@@ -317,7 +317,7 @@ type TrainJobSpec struct {
ManagedBy *string `json:"managedBy,omitempty"`
}
-type TrainingRuntimeRef struct {
+type RuntimeRef struct {
// Name of the runtime being referenced.
// When namespaced-scoped TrainingRuntime is used, the TrainJob must have
// the same namespace as the deployed runtime.
@@ -375,7 +375,7 @@ This table explains the rationale for each `TrainJob` parameter:
- TrainingRuntimeRef
+ | RuntimeRef
|
Reference to the existing TrainingRuntime that is pre-deployed by platform engineers
|
@@ -430,7 +430,7 @@ metadata:
name: torch-ddp
namespace: tenant-alpha
spec:
- trainingRuntimeRef:
+ runtimeRef:
name: torch-distributed-multi-node
trainer:
image: docker.io/custom-training
@@ -488,7 +488,7 @@ metadata:
name: tune-llama-with-yelp
namespace: tenant-alpha
spec:
- trainingRuntimeRef:
+ runtimeRef:
name: torch-tune-llama-7b
datasetConfig:
storageUri: s3://dataset/custom-dataset/yelp-review
@@ -890,7 +890,7 @@ metadata:
name: pytorch-distributed
namespace: tenant-alpha
spec:
- trainingRuntimeRef:
+ runtimeRef:
name: pytorch-distributed-gpu
trainer:
image: docker.io/custom-training
@@ -939,7 +939,7 @@ to control versions of `TrainingRuntime` and enable rolling updates.
We are going to create two CRDs: `TrainingRuntime` and `ClusterTrainingRuntime`. These runtimes have
exactly the same APIs, but the first one is the namespace-scoped, the second is the cluster-scoped.
-User can set the `kind` and `apiGroup` parameters in the `trainingRuntimeRef` to use
+Users can set the `kind` and `apiGroup` parameters in the `runtimeRef` to use
the `TrainingRuntime` from the `TrainJob's` namespace, otherwise the `ClusterTrainingRuntime` will
be used.
@@ -1228,7 +1228,7 @@ metadata:
name: torch-test
namespace: tenant-alpha
spec:
- trainingRuntimeRef:
+ runtimeRef:
name: torch-distributed-multi-node
trainer:
resourcesPerNode:
@@ -1698,7 +1698,7 @@ Note that we should implement the status transitions validations to once we supp
### Support Multiple API Versions of TrainingRuntime
-We can consider to introduce the `version` field for runtime API version to the `.spec.trainingRuntimeRef`
+We can consider introducing the `version` field for runtime API version to the `.spec.runtimeRef`
so that we can support multiple API versions of TrainingRuntime.
It could mitigate the pain points when users upgrade the older API Version to newer API Version like alpha to beta.
@@ -1706,7 +1706,7 @@ But, we do not aim to support both Alpha and Beta versions or both first Alpha a
Hence, the `version` field was not introduced.
```go
-type TrainingRuntimeRef struct {
+type RuntimeRef struct {
[...]
// APIVersion is the apiVersion for the runtime.
diff --git a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml
index 6da29d74d0..f28b08b6c5 100644
--- a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml
+++ b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml
@@ -19,7 +19,7 @@ spec:
openAPIV3Schema:
description: |-
ClusterTrainingRuntime represents a training runtime which can be referenced as part of
- `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced
+ `runtimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced
by TrainJob that created in *any* namespace.
properties:
apiVersion:
diff --git a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml
index 90f7d8e209..d5769ed073 100644
--- a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml
+++ b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml
@@ -19,7 +19,7 @@ spec:
openAPIV3Schema:
description: |-
TrainingRuntime represents a training runtime which can be referenced as part of
- `trainingRuntimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced
+ `runtimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced
by TrainJob that created in the *same* namespace as the TrainingRuntime.
properties:
apiVersion:
diff --git a/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml
index 771b30ebdd..d0e0e6f86e 100644
--- a/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml
+++ b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml
@@ -2732,6 +2732,29 @@ spec:
- targetReplicatedJobs
type: object
type: array
+ runtimeRef:
+ description: Reference to the training runtime.
+ properties:
+ apiGroup:
+ description: |-
+ APIGroup of the runtime being referenced.
+ Defaults to `kubeflow.org`.
+ type: string
+ kind:
+ description: |-
+ Kind of the runtime being referenced.
+ It must be one of TrainingRuntime or ClusterTrainingRuntime.
+ Defaults to ClusterTrainingRuntime.
+ type: string
+ name:
+ description: |-
+ Name of the runtime being referenced.
+ When namespaced-scoped TrainingRuntime is used, the TrainJob must have
+ the same namespace as the deployed runtime.
+ type: string
+ required:
+ - name
+ type: object
suspend:
description: |-
Whether the controller should suspend the running TrainJob.
@@ -2937,31 +2960,8 @@ spec:
type: object
type: object
type: object
- trainingRuntimeRef:
- description: Reference to the training runtime.
- properties:
- apiGroup:
- description: |-
- APIGroup of the runtime being referenced.
- Defaults to `kubeflow.org`.
- type: string
- kind:
- description: |-
- Kind of the runtime being referenced.
- It must be one of TrainingRuntime or ClusterTrainingRuntime.
- Defaults to ClusterTrainingRuntime.
- type: string
- name:
- description: |-
- Name of the runtime being referenced.
- When namespaced-scoped TrainingRuntime is used, the TrainJob must have
- the same namespace as the deployed runtime.
- type: string
- required:
- - name
- type: object
required:
- - trainingRuntimeRef
+ - runtimeRef
type: object
status:
description: Current status of TrainJob.
diff --git a/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go b/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go
index 10a47600a9..6cb9daabe8 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go
@@ -43,6 +43,7 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.PodGroupPolicy": schema_pkg_apis_kubefloworg_v2alpha1_PodGroupPolicy(ref),
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.PodGroupPolicySource": schema_pkg_apis_kubefloworg_v2alpha1_PodGroupPolicySource(ref),
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.PodSpecOverride": schema_pkg_apis_kubefloworg_v2alpha1_PodSpecOverride(ref),
+ "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.RuntimeRef": schema_pkg_apis_kubefloworg_v2alpha1_RuntimeRef(ref),
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TorchElasticPolicy": schema_pkg_apis_kubefloworg_v2alpha1_TorchElasticPolicy(ref),
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TorchMLPolicySource": schema_pkg_apis_kubefloworg_v2alpha1_TorchMLPolicySource(ref),
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainJob": schema_pkg_apis_kubefloworg_v2alpha1_TrainJob(ref),
@@ -52,7 +53,6 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.Trainer": schema_pkg_apis_kubefloworg_v2alpha1_Trainer(ref),
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntime": schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntime(ref),
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntimeList": schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeList(ref),
- "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntimeRef": schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeRef(ref),
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntimeSpec": schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeSpec(ref),
}
}
@@ -61,7 +61,7 @@ func schema_pkg_apis_kubefloworg_v2alpha1_ClusterTrainingRuntime(ref common.Refe
return common.OpenAPIDefinition{
Schema: spec.Schema{
SchemaProps: spec.SchemaProps{
- Description: "ClusterTrainingRuntime represents a training runtime which can be referenced as part of `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced by TrainJob that created in *any* namespace.",
+ Description: "ClusterTrainingRuntime represents a training runtime which can be referenced as part of `runtimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced by TrainJob that created in *any* namespace.",
Type: []string{"object"},
Properties: map[string]spec.Schema{
"kind": {
@@ -703,6 +703,42 @@ func schema_pkg_apis_kubefloworg_v2alpha1_PodSpecOverride(ref common.ReferenceCa
}
}
+func schema_pkg_apis_kubefloworg_v2alpha1_RuntimeRef(ref common.ReferenceCallback) common.OpenAPIDefinition {
+ return common.OpenAPIDefinition{
+ Schema: spec.Schema{
+ SchemaProps: spec.SchemaProps{
+ Description: "RuntimeRef represents the reference to the existing training runtime.",
+ Type: []string{"object"},
+ Properties: map[string]spec.Schema{
+ "name": {
+ SchemaProps: spec.SchemaProps{
+ Description: "Name of the runtime being referenced. When namespaced-scoped TrainingRuntime is used, the TrainJob must have the same namespace as the deployed runtime.",
+ Default: "",
+ Type: []string{"string"},
+ Format: "",
+ },
+ },
+ "apiGroup": {
+ SchemaProps: spec.SchemaProps{
+ Description: "APIGroup of the runtime being referenced. Defaults to `kubeflow.org`.",
+ Type: []string{"string"},
+ Format: "",
+ },
+ },
+ "kind": {
+ SchemaProps: spec.SchemaProps{
+ Description: "Kind of the runtime being referenced. It must be one of TrainingRuntime or ClusterTrainingRuntime. Defaults to ClusterTrainingRuntime.",
+ Type: []string{"string"},
+ Format: "",
+ },
+ },
+ },
+ Required: []string{"name"},
+ },
+ },
+ }
+}
+
func schema_pkg_apis_kubefloworg_v2alpha1_TorchElasticPolicy(ref common.ReferenceCallback) common.OpenAPIDefinition {
return common.OpenAPIDefinition{
Schema: spec.Schema{
@@ -889,11 +925,11 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TrainJobSpec(ref common.ReferenceCallb
Description: "TrainJobSpec represents specification of the desired TrainJob.",
Type: []string{"object"},
Properties: map[string]spec.Schema{
- "trainingRuntimeRef": {
+ "runtimeRef": {
SchemaProps: spec.SchemaProps{
Description: "Reference to the training runtime.",
Default: map[string]interface{}{},
- Ref: ref("github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntimeRef"),
+ Ref: ref("github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.RuntimeRef"),
},
},
"trainer": {
@@ -975,11 +1011,11 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TrainJobSpec(ref common.ReferenceCallb
},
},
},
- Required: []string{"trainingRuntimeRef"},
+ Required: []string{"runtimeRef"},
},
},
Dependencies: []string{
- "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.DatasetConfig", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.ModelConfig", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.PodSpecOverride", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.Trainer", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TrainingRuntimeRef"},
+ "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.DatasetConfig", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.ModelConfig", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.PodSpecOverride", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.RuntimeRef", "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.Trainer"},
}
}
@@ -1116,7 +1152,7 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntime(ref common.ReferenceCa
return common.OpenAPIDefinition{
Schema: spec.Schema{
SchemaProps: spec.SchemaProps{
- Description: "TrainingRuntime represents a training runtime which can be referenced as part of `trainingRuntimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced by TrainJob that created in the *same* namespace as the TrainingRuntime.",
+ Description: "TrainingRuntime represents a training runtime which can be referenced as part of `runtimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced by TrainJob that created in the *same* namespace as the TrainingRuntime.",
Type: []string{"object"},
Properties: map[string]spec.Schema{
"kind": {
@@ -1206,42 +1242,6 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeList(ref common.Referen
}
}
-func schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeRef(ref common.ReferenceCallback) common.OpenAPIDefinition {
- return common.OpenAPIDefinition{
- Schema: spec.Schema{
- SchemaProps: spec.SchemaProps{
- Description: "TrainingRuntimeRef represents the reference to the existing training runtime.",
- Type: []string{"object"},
- Properties: map[string]spec.Schema{
- "name": {
- SchemaProps: spec.SchemaProps{
- Description: "Name of the runtime being referenced. When namespaced-scoped TrainingRuntime is used, the TrainJob must have the same namespace as the deployed runtime.",
- Default: "",
- Type: []string{"string"},
- Format: "",
- },
- },
- "apiGroup": {
- SchemaProps: spec.SchemaProps{
- Description: "APIGroup of the runtime being referenced. Defaults to `kubeflow.org`.",
- Type: []string{"string"},
- Format: "",
- },
- },
- "kind": {
- SchemaProps: spec.SchemaProps{
- Description: "Kind of the runtime being referenced. It must be one of TrainingRuntime or ClusterTrainingRuntime. Defaults to ClusterTrainingRuntime.",
- Type: []string{"string"},
- Format: "",
- },
- },
- },
- Required: []string{"name"},
- },
- },
- }
-}
-
func schema_pkg_apis_kubefloworg_v2alpha1_TrainingRuntimeSpec(ref common.ReferenceCallback) common.OpenAPIDefinition {
return common.OpenAPIDefinition{
Schema: spec.Schema{
diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
index 318d22be0d..c25e2c49b3 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
@@ -38,7 +38,7 @@ const (
// +kubebuilder:resource:scope=Cluster
// ClusterTrainingRuntime represents a training runtime which can be referenced as part of
-// `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced
+// `runtimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced
// by TrainJob that created in *any* namespace.
type ClusterTrainingRuntime struct {
metav1.TypeMeta `json:",inline"`
@@ -72,7 +72,7 @@ type ClusterTrainingRuntimeList struct {
// +kubebuilder:storageversion
// TrainingRuntime represents a training runtime which can be referenced as part of
-// `trainingRuntimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced
+// `runtimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced
// by TrainJob that created in the *same* namespace as the TrainingRuntime.
type TrainingRuntime struct {
metav1.TypeMeta `json:",inline"`
diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
index f77f50de95..0a2f95770d 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
@@ -63,7 +63,7 @@ type TrainJobList struct {
// TrainJobSpec represents specification of the desired TrainJob.
type TrainJobSpec struct {
// Reference to the training runtime.
- TrainingRuntimeRef TrainingRuntimeRef `json:"trainingRuntimeRef"`
+ RuntimeRef RuntimeRef `json:"runtimeRef"`
// Configuration of the desired trainer.
Trainer *Trainer `json:"trainer,omitempty"`
@@ -99,8 +99,8 @@ type TrainJobSpec struct {
ManagedBy *string `json:"managedBy,omitempty"`
}
-// TrainingRuntimeRef represents the reference to the existing training runtime.
-type TrainingRuntimeRef struct {
+// RuntimeRef represents the reference to the existing training runtime.
+type RuntimeRef struct {
// Name of the runtime being referenced.
// When namespaced-scoped TrainingRuntime is used, the TrainJob must have
// the same namespace as the deployed runtime.
diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
index 65773f998e..2c87b2a838 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
@@ -460,6 +460,31 @@ func (in *PodSpecOverride) DeepCopy() *PodSpecOverride {
return out
}
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *RuntimeRef) DeepCopyInto(out *RuntimeRef) {
+ *out = *in
+ if in.APIGroup != nil {
+ in, out := &in.APIGroup, &out.APIGroup
+ *out = new(string)
+ **out = **in
+ }
+ if in.Kind != nil {
+ in, out := &in.Kind, &out.Kind
+ *out = new(string)
+ **out = **in
+ }
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RuntimeRef.
+func (in *RuntimeRef) DeepCopy() *RuntimeRef {
+ if in == nil {
+ return nil
+ }
+ out := new(RuntimeRef)
+ in.DeepCopyInto(out)
+ return out
+}
+
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *TorchElasticPolicy) DeepCopyInto(out *TorchElasticPolicy) {
*out = *in
@@ -584,7 +609,7 @@ func (in *TrainJobList) DeepCopyObject() runtime.Object {
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *TrainJobSpec) DeepCopyInto(out *TrainJobSpec) {
*out = *in
- in.TrainingRuntimeRef.DeepCopyInto(&out.TrainingRuntimeRef)
+ in.RuntimeRef.DeepCopyInto(&out.RuntimeRef)
if in.Trainer != nil {
in, out := &in.Trainer, &out.Trainer
*out = new(Trainer)
@@ -780,31 +805,6 @@ func (in *TrainingRuntimeList) DeepCopyObject() runtime.Object {
return nil
}
-// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *TrainingRuntimeRef) DeepCopyInto(out *TrainingRuntimeRef) {
- *out = *in
- if in.APIGroup != nil {
- in, out := &in.APIGroup, &out.APIGroup
- *out = new(string)
- **out = **in
- }
- if in.Kind != nil {
- in, out := &in.Kind, &out.Kind
- *out = new(string)
- **out = **in
- }
-}
-
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeRef.
-func (in *TrainingRuntimeRef) DeepCopy() *TrainingRuntimeRef {
- if in == nil {
- return nil
- }
- out := new(TrainingRuntimeRef)
- in.DeepCopyInto(out)
- return out
-}
-
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *TrainingRuntimeSpec) DeepCopyInto(out *TrainingRuntimeSpec) {
*out = *in
diff --git a/pkg/runtime.v2/core/clustertrainingruntime.go b/pkg/runtime.v2/core/clustertrainingruntime.go
index de819363b5..35c35fe0c9 100644
--- a/pkg/runtime.v2/core/clustertrainingruntime.go
+++ b/pkg/runtime.v2/core/clustertrainingruntime.go
@@ -53,7 +53,7 @@ func NewClusterTrainingRuntime(context.Context, client.Client, client.FieldIndex
func (r *ClusterTrainingRuntime) NewObjects(ctx context.Context, trainJob *kubeflowv2.TrainJob) ([]client.Object, error) {
var clTrainingRuntime kubeflowv2.ClusterTrainingRuntime
- if err := r.client.Get(ctx, client.ObjectKey{Name: trainJob.Spec.TrainingRuntimeRef.Name}, &clTrainingRuntime); err != nil {
+ if err := r.client.Get(ctx, client.ObjectKey{Name: trainJob.Spec.RuntimeRef.Name}, &clTrainingRuntime); err != nil {
return nil, fmt.Errorf("%w: %w", errorNotFoundSpecifiedClusterTrainingRuntime, err)
}
return r.buildObjects(ctx, trainJob, clTrainingRuntime.Spec.Template, clTrainingRuntime.Spec.MLPolicy, clTrainingRuntime.Spec.PodGroupPolicy)
@@ -66,10 +66,10 @@ func (r *ClusterTrainingRuntime) EventHandlerRegistrars() []runtime.ReconcilerBu
func (r *ClusterTrainingRuntime) ValidateObjects(ctx context.Context, old, new *kubeflowv2.TrainJob) (admission.Warnings, field.ErrorList) {
if err := r.client.Get(ctx, client.ObjectKey{
Namespace: old.Namespace,
- Name: old.Spec.TrainingRuntimeRef.Name,
+ Name: old.Spec.RuntimeRef.Name,
}, &kubeflowv2.ClusterTrainingRuntime{}); err != nil {
return nil, field.ErrorList{
- field.Invalid(field.NewPath("spec", "trainingRuntimeRef"), old.Spec.TrainingRuntimeRef,
+ field.Invalid(field.NewPath("spec", "runtimeRef"), old.Spec.RuntimeRef,
fmt.Sprintf("%v: specified clusterTrainingRuntime must be created before the TrainJob is created", err)),
}
}
diff --git a/pkg/runtime.v2/core/clustertrainingruntime_test.go b/pkg/runtime.v2/core/clustertrainingruntime_test.go
index 5665c10fe5..696d486ab5 100644
--- a/pkg/runtime.v2/core/clustertrainingruntime_test.go
+++ b/pkg/runtime.v2/core/clustertrainingruntime_test.go
@@ -47,7 +47,7 @@ func TestClusterTrainingRuntimeNewObjects(t *testing.T) {
"succeeded to build JobSet and PodGroup": {
trainJob: testingutil.MakeTrainJobWrapper(metav1.NamespaceDefault, "test-job").
UID("uid").
- TrainingRuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.ClusterTrainingRuntimeKind), "test-runtime").
+ RuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.ClusterTrainingRuntimeKind), "test-runtime").
Trainer(
testingutil.MakeTrainJobTrainerWrapper().
ContainerImage("test:trainjob").
@@ -93,7 +93,7 @@ func TestClusterTrainingRuntimeNewObjects(t *testing.T) {
"missing trainingRuntime resource": {
trainJob: testingutil.MakeTrainJobWrapper(metav1.NamespaceDefault, "test-job").
UID("uid").
- TrainingRuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.ClusterTrainingRuntimeKind), "test-runtime").
+ RuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.ClusterTrainingRuntimeKind), "test-runtime").
Trainer(
testingutil.MakeTrainJobTrainerWrapper().
ContainerImage("test:trainjob").
diff --git a/pkg/runtime.v2/core/trainingruntime.go b/pkg/runtime.v2/core/trainingruntime.go
index 1597bbf0a8..621d4eb533 100644
--- a/pkg/runtime.v2/core/trainingruntime.go
+++ b/pkg/runtime.v2/core/trainingruntime.go
@@ -55,10 +55,10 @@ var _ runtime.Runtime = (*TrainingRuntime)(nil)
var trainingRuntimeFactory *TrainingRuntime
func NewTrainingRuntime(ctx context.Context, c client.Client, indexer client.FieldIndexer) (runtime.Runtime, error) {
- if err := indexer.IndexField(ctx, &kubeflowv2.TrainJob{}, idxer.TrainJobTrainingRuntimeRefKey, idxer.IndexTrainJobTrainingRuntime); err != nil {
+ if err := indexer.IndexField(ctx, &kubeflowv2.TrainJob{}, idxer.TrainJobRuntimeRefKey, idxer.IndexTrainJobTrainingRuntime); err != nil {
return nil, fmt.Errorf("setting index on TrainingRuntime for TrainJob: %w", err)
}
- if err := indexer.IndexField(ctx, &kubeflowv2.TrainJob{}, idxer.TrainJobClusterTrainingRuntimeRefKey, idxer.IndexTrainJobClusterTrainingRuntime); err != nil {
+ if err := indexer.IndexField(ctx, &kubeflowv2.TrainJob{}, idxer.TrainJobClusterRuntimeRefKey, idxer.IndexTrainJobClusterTrainingRuntime); err != nil {
return nil, fmt.Errorf("setting index on ClusterTrainingRuntime for TrainJob: %w", err)
}
fwk, err := fwkcore.New(ctx, c, fwkplugins.NewRegistry(), indexer)
@@ -74,7 +74,7 @@ func NewTrainingRuntime(ctx context.Context, c client.Client, indexer client.Fie
func (r *TrainingRuntime) NewObjects(ctx context.Context, trainJob *kubeflowv2.TrainJob) ([]client.Object, error) {
var trainingRuntime kubeflowv2.TrainingRuntime
- err := r.client.Get(ctx, client.ObjectKey{Namespace: trainJob.Namespace, Name: trainJob.Spec.TrainingRuntimeRef.Name}, &trainingRuntime)
+ err := r.client.Get(ctx, client.ObjectKey{Namespace: trainJob.Namespace, Name: trainJob.Spec.RuntimeRef.Name}, &trainingRuntime)
if err != nil {
return nil, fmt.Errorf("%w: %w", errorNotFoundSpecifiedTrainingRuntime, err)
}
@@ -139,10 +139,10 @@ func (r *TrainingRuntime) EventHandlerRegistrars() []runtime.ReconcilerBuilder {
func (r *TrainingRuntime) ValidateObjects(ctx context.Context, old, new *kubeflowv2.TrainJob) (admission.Warnings, field.ErrorList) {
if err := r.client.Get(ctx, client.ObjectKey{
Namespace: old.Namespace,
- Name: old.Spec.TrainingRuntimeRef.Name,
+ Name: old.Spec.RuntimeRef.Name,
}, &kubeflowv2.TrainingRuntime{}); err != nil {
return nil, field.ErrorList{
- field.Invalid(field.NewPath("spec", "trainingRuntimeRef"), old.Spec.TrainingRuntimeRef,
+ field.Invalid(field.NewPath("spec", "runtimeRef"), old.Spec.RuntimeRef,
fmt.Sprintf("%v: specified trainingRuntime must be created before the TrainJob is created", err)),
}
}
diff --git a/pkg/runtime.v2/core/trainingruntime_test.go b/pkg/runtime.v2/core/trainingruntime_test.go
index a3bd63efa6..a32ad33852 100644
--- a/pkg/runtime.v2/core/trainingruntime_test.go
+++ b/pkg/runtime.v2/core/trainingruntime_test.go
@@ -47,7 +47,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) {
"succeeded to build JobSet and PodGroup": {
trainJob: testingutil.MakeTrainJobWrapper(metav1.NamespaceDefault, "test-job").
UID("uid").
- TrainingRuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainingRuntimeKind), "test-runtime").
+ RuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainingRuntimeKind), "test-runtime").
SpecLabel("conflictLabel", "override").
SpecAnnotation("conflictAnnotation", "override").
Trainer(
@@ -100,7 +100,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) {
"missing trainingRuntime resource": {
trainJob: testingutil.MakeTrainJobWrapper(metav1.NamespaceDefault, "test-job").
UID("uid").
- TrainingRuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainingRuntimeKind), "test-runtime").
+ RuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainingRuntimeKind), "test-runtime").
Trainer(
testingutil.MakeTrainJobTrainerWrapper().
ContainerImage("test:trainjob").
diff --git a/pkg/runtime.v2/framework/plugins/coscheduling/coscheduling.go b/pkg/runtime.v2/framework/plugins/coscheduling/coscheduling.go
index 36b7d6813d..aa0ef3b3a6 100644
--- a/pkg/runtime.v2/framework/plugins/coscheduling/coscheduling.go
+++ b/pkg/runtime.v2/framework/plugins/coscheduling/coscheduling.go
@@ -207,7 +207,7 @@ func (h *PodGroupRuntimeClassHandler) queueSuspendedTrainJobs(ctx context.Contex
var trainJobs []kubeflowv2.TrainJob
for _, trainingRuntime := range trainingRuntimes.Items {
var trainJobsWithTrainingRuntime kubeflowv2.TrainJobList
- err := h.client.List(ctx, &trainJobsWithTrainingRuntime, client.MatchingFields{runtimeindexer.TrainJobTrainingRuntimeRefKey: trainingRuntime.Name})
+ err := h.client.List(ctx, &trainJobsWithTrainingRuntime, client.MatchingFields{runtimeindexer.TrainJobRuntimeRefKey: trainingRuntime.Name})
if err != nil {
return err
}
@@ -215,7 +215,7 @@ func (h *PodGroupRuntimeClassHandler) queueSuspendedTrainJobs(ctx context.Contex
}
for _, clusterTrainingRuntime := range clusterTrainingRuntimes.Items {
var trainJobsWithClTrainingRuntime kubeflowv2.TrainJobList
- err := h.client.List(ctx, &trainJobsWithClTrainingRuntime, client.MatchingFields{runtimeindexer.TrainJobClusterTrainingRuntimeRefKey: clusterTrainingRuntime.Name})
+ err := h.client.List(ctx, &trainJobsWithClTrainingRuntime, client.MatchingFields{runtimeindexer.TrainJobClusterRuntimeRefKey: clusterTrainingRuntime.Name})
if err != nil {
return err
}
diff --git a/pkg/runtime.v2/indexer/indexer.go b/pkg/runtime.v2/indexer/indexer.go
index dacbfcd050..730f4267de 100644
--- a/pkg/runtime.v2/indexer/indexer.go
+++ b/pkg/runtime.v2/indexer/indexer.go
@@ -24,8 +24,8 @@ import (
)
const (
- TrainJobTrainingRuntimeRefKey = ".spec.trainingRuntimeRef.kind=trainingRuntime"
- TrainJobClusterTrainingRuntimeRefKey = ".spec.trainingRuntimeRef.kind=clusterTrainingRuntime"
+ TrainJobRuntimeRefKey = ".spec.runtimeRef.kind=trainingRuntime"
+ TrainJobClusterRuntimeRefKey = ".spec.runtimeRef.kind=clusterTrainingRuntime"
)
func IndexTrainJobTrainingRuntime(obj client.Object) []string {
@@ -33,9 +33,9 @@ func IndexTrainJobTrainingRuntime(obj client.Object) []string {
if !ok {
return nil
}
- if ptr.Deref(trainJob.Spec.TrainingRuntimeRef.APIGroup, "") == kubeflowv2.GroupVersion.Group &&
- ptr.Deref(trainJob.Spec.TrainingRuntimeRef.Kind, "") == kubeflowv2.TrainingRuntimeKind {
- return []string{trainJob.Spec.TrainingRuntimeRef.Name}
+ if ptr.Deref(trainJob.Spec.RuntimeRef.APIGroup, "") == kubeflowv2.GroupVersion.Group &&
+ ptr.Deref(trainJob.Spec.RuntimeRef.Kind, "") == kubeflowv2.TrainingRuntimeKind {
+ return []string{trainJob.Spec.RuntimeRef.Name}
}
return nil
}
@@ -45,9 +45,9 @@ func IndexTrainJobClusterTrainingRuntime(obj client.Object) []string {
if !ok {
return nil
}
- if ptr.Deref(trainJob.Spec.TrainingRuntimeRef.APIGroup, "") == kubeflowv2.GroupVersion.Group &&
- ptr.Deref(trainJob.Spec.TrainingRuntimeRef.Kind, "") == kubeflowv2.ClusterTrainingRuntimeKind {
- return []string{trainJob.Spec.TrainingRuntimeRef.Name}
+ if ptr.Deref(trainJob.Spec.RuntimeRef.APIGroup, "") == kubeflowv2.GroupVersion.Group &&
+ ptr.Deref(trainJob.Spec.RuntimeRef.Kind, "") == kubeflowv2.ClusterTrainingRuntimeKind {
+ return []string{trainJob.Spec.RuntimeRef.Name}
}
return nil
}
diff --git a/pkg/util.v2/testing/wrapper.go b/pkg/util.v2/testing/wrapper.go
index df3f8cbfce..3be7f4f194 100644
--- a/pkg/util.v2/testing/wrapper.go
+++ b/pkg/util.v2/testing/wrapper.go
@@ -249,8 +249,8 @@ func (t *TrainJobWrapper) Trainer(trainer *kubeflowv2.Trainer) *TrainJobWrapper
return t
}
-func (t *TrainJobWrapper) TrainingRuntimeRef(gvk schema.GroupVersionKind, name string) *TrainJobWrapper {
- t.Spec.TrainingRuntimeRef = kubeflowv2.TrainingRuntimeRef{
+func (t *TrainJobWrapper) RuntimeRef(gvk schema.GroupVersionKind, name string) *TrainJobWrapper {
+ t.Spec.RuntimeRef = kubeflowv2.RuntimeRef{
APIGroup: &gvk.Group,
Kind: &gvk.Kind,
Name: name,