Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

KEP-2170: Implement Initializer builders in the JobSet plugin #2316

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions .github/workflows/test-go.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,10 @@ jobs:
run: |
go mod tidy && pushd hack/swagger && go mod tidy && popd && git add go.* &&
git diff --cached --exit-code || (echo 'Please run "go mod tidy" to sync Go modules' && exit 1);
- name: Check manifests
- name: Check auto-generated assets
run: |
make manifests && git add manifests &&
git diff --cached --exit-code || (echo 'Please run "make manifests" to generate manifests' && exit 1);
- name: Check auto-generated codes
run: |
make generate && git add pkg sdk &&
git diff --cached --exit-code || (echo 'Please run "make generate" to generate Go codes' && exit 1);
make generate && git add pkg sdk manifests &&
git diff --cached --exit-code || (echo 'Please run "make generate" to generate assets' && exit 1);
- name: Verify gofmt
run: |
make fmt && git add pkg cmd &&
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and Cust
output:rbac:artifacts:config=manifests/v2/base/rbac \
output:webhook:artifacts:config=manifests/v2/base/webhook

generate: controller-gen ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
generate: controller-gen manifests ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you remove this?

- name: Check manifests
run: |
make manifests && git add manifests &&
git diff --cached --exit-code || (echo 'Please run "make manifests" to generate manifests' && exit 1);

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tenzen-y Oh, you want to separate it.
Don't we want to include manifests command into generate script ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to say removing "Check manifests" from workflow because we will perform manifests verification with make generate since this PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, you are right!

$(CONTROLLER_GEN) object:headerFile="hack/boilerplate/boilerplate.go.txt" paths="./pkg/apis/..."
hack/update-codegen.sh
hack/python-sdk/gen-sdk.sh
Expand Down
12 changes: 6 additions & 6 deletions api.v2/openapi-spec/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@
}
},
"secretRef": {
"description": "Reference to the TrainJob's secrets to download dataset.",
"$ref": "#/definitions/v1.SecretReference"
"description": "Reference to the secret with credentials to download dataset. Secret must be created in the TrainJob's namespace.",
"$ref": "#/definitions/v1.LocalObjectReference"
},
"storageUri": {
"description": "Storage uri for the dataset provider.",
Expand All @@ -159,8 +159,8 @@
}
},
"secretRef": {
"description": "Reference to the TrainJob's secrets to download model.",
"$ref": "#/definitions/v1.SecretReference"
"description": "Reference to the secret with credentials to download model. Secret must be created in the TrainJob's namespace.",
"$ref": "#/definitions/v1.LocalObjectReference"
},
"storageUri": {
"description": "Storage uri for the model provider.",
Expand Down Expand Up @@ -315,8 +315,8 @@
}
},
"secretRef": {
"description": "Reference to the TrainJob's secrets to export model.",
"$ref": "#/definitions/v1.SecretReference"
"description": "Reference to the secret with credentials to export model. Secret must be created in the TrainJob's namespace.",
"$ref": "#/definitions/v1.LocalObjectReference"
},
"storageUri": {
"description": "Storage uri for the model exporter.",
Expand Down
6 changes: 3 additions & 3 deletions docs/proposals/2170-kubeflow-training-v2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,7 @@ type DatasetConfig struct {
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to download dataset.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}
```

Expand Down Expand Up @@ -665,7 +665,7 @@ type InputModel struct {
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to download model.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}

type OutputModel struct {
Expand All @@ -677,7 +677,7 @@ type OutputModel struct {
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to export model.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}
```

Expand Down
44 changes: 21 additions & 23 deletions manifests/v2/base/crds/kubeflow.org_trainjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -173,15 +173,15 @@ spec:
type: object
type: array
secretRef:
description: Reference to the TrainJob's secrets to download dataset.
description: |-
Reference to the secret with credentials to download dataset.
Secret must be created in the TrainJob's namespace.
properties:
name:
description: name is unique within a namespace to reference
a secret resource.
type: string
namespace:
description: namespace defines the space within which the
secret name must be unique.
description: |-
Name of the referent.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?
type: string
type: object
x-kubernetes-map-type: atomic
Expand Down Expand Up @@ -339,16 +339,15 @@ spec:
type: object
type: array
secretRef:
description: Reference to the TrainJob's secrets to download
model.
description: |-
Reference to the secret with credentials to download model.
Secret must be created in the TrainJob's namespace.
properties:
name:
description: name is unique within a namespace to reference
a secret resource.
type: string
namespace:
description: namespace defines the space within which
the secret name must be unique.
description: |-
Name of the referent.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?
type: string
type: object
x-kubernetes-map-type: atomic
Expand Down Expand Up @@ -479,16 +478,15 @@ spec:
type: object
type: array
secretRef:
description: Reference to the TrainJob's secrets to export
model.
description: |-
Reference to the secret with credentials to export model.
Secret must be created in the TrainJob's namespace.
properties:
name:
description: name is unique within a namespace to reference
a secret resource.
type: string
namespace:
description: namespace defines the space within which
the secret name must be unique.
description: |-
Name of the referent.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?
type: string
type: object
x-kubernetes-map-type: atomic
Expand Down
18 changes: 9 additions & 9 deletions pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 9 additions & 6 deletions pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,9 @@ type DatasetConfig struct {
// These values will be merged with the TrainingRuntime's dataset initializer environments.
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to download dataset.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
// Reference to the secret with credentials to download dataset.
// Secret must be created in the TrainJob's namespace.
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tenzen-y I updated the API to use *corev1.LocalObjectReference type for the SecretRef.
I think, we should use this since we don't want to support cross-namespace reference for this secret.

}

// ModelConfig represents the desired model configuration.
Expand All @@ -193,8 +194,9 @@ type InputModel struct {
// These values will be merged with the TrainingRuntime's model initializer environments.
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to download model.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
// Reference to the secret with credentials to download model.
// Secret must be created in the TrainJob's namespace.
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}

// OutputModel represents the desired trained model configuration.
Expand All @@ -206,8 +208,9 @@ type OutputModel struct {
// These values will be merged with the TrainingRuntime's model exporter environments.
Env []corev1.EnvVar `json:"env,omitempty"`

// Reference to the TrainJob's secrets to export model.
SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
// Reference to the secret with credentials to export model.
// Secret must be created in the TrainJob's namespace.
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}

// PodSpecOverride represents the custom overrides that will be applied for the TrainJob's resources.
Expand Down
6 changes: 3 additions & 3 deletions pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pkg/controller.v1/pytorch/elastic.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func GetElasticEnvVarGenerator() EnvVarGenerator {

func (e ElasticEnvVarGenerator) Generate(
job *kubeflowv1.PyTorchJob) ([]corev1.EnvVar, error) {
envVars := []corev1.EnvVar{}
var envVars []corev1.EnvVar

elasticPolicy := job.Spec.ElasticPolicy
if elasticPolicy == nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller.v1/pytorch/master.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func GetMasterEnvVarGenerator() EnvVarGenerator {

func (e MasterEnvVarGenerator) Generate(
job *kubeflowv1.PyTorchJob) ([]corev1.EnvVar, error) {
envVars := []corev1.EnvVar{}
var envVars []corev1.EnvVar
if job.Spec.PyTorchReplicaSpecs[kubeflowv1.PyTorchJobReplicaTypeMaster] != nil {
masterPort, err := getPortFromPyTorchJob(job, kubeflowv1.PyTorchJobReplicaTypeMaster)
if err != nil {
Expand Down
6 changes: 3 additions & 3 deletions pkg/runtime.v2/core/clustertrainingruntime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ func TestClusterTrainingRuntimeNewObjects(t *testing.T) {
"succeeded to build PodGroup and JobSet with NumNodes from the Runtime and container from the Trainer.": {
clusterTrainingRuntime: testingutil.MakeClusterTrainingRuntimeWrapper("test-runtime").RuntimeSpec(
testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeClusterTrainingRuntimeWrapper("test-runtime").Spec).
InitContainerDatasetModelInitializer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
NumNodes(100).
ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
ContainerDatasetModelInitializer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
PodGroupPolicyCoschedulingSchedulingTimeout(120).
Obj(),
).Obj(),
Expand All @@ -59,15 +59,15 @@ func TestClusterTrainingRuntimeNewObjects(t *testing.T) {
RuntimeRef(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.ClusterTrainingRuntimeKind), "test-runtime").
Trainer(
testingutil.MakeTrainJobTrainerWrapper().
ContainerTrainer("test:trainjob", []string{"trainjob"}, []string{"trainjob"}, resRequests).
Container("test:trainjob", []string{"trainjob"}, []string{"trainjob"}, resRequests).
Obj(),
).
Obj(),
wantObjs: []client.Object{
testingutil.MakeJobSetWrapper(metav1.NamespaceDefault, "test-job").
InitContainerDatasetModelInitializer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
NumNodes(100).
ContainerTrainer("test:trainjob", []string{"trainjob"}, []string{"trainjob"}, resRequests).
ContainerDatasetModelInitializer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
Suspend(true).
PodLabel(schedulerpluginsv1alpha1.PodGroupLabel, "test-job").
ControllerReference(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainJobKind), "test-job", "uid").
Expand Down
Loading
Loading