-
Notifications
You must be signed in to change notification settings - Fork 705
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
KEP-2170: Add TrainJob conditions #2322
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,6 +48,43 @@ type TrainJob struct { | |
Status TrainJobStatus `json:"status,omitempty"` | ||
} | ||
|
||
const ( | ||
// TrainJobSuspended means the TrainJob is suspended. | ||
TrainJobSuspended string = "Suspended" | ||
|
||
// TrainJobComplete means that the TrainJob has completed its execution. | ||
TrainJobComplete string = "Complete" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please can you create an issue in JobSet to rename Completed condition to Complete as we discussed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, we can |
||
|
||
// TrainJobFailed means that the actual jobs have failed their execution. | ||
TrainJobFailed string = "Failed" | ||
|
||
// TrainJobCreated means that the actual jobs creation has succeeded. | ||
TrainJobCreated string = "Created" | ||
) | ||
|
||
const ( | ||
// TrainJobSuspendedReason is the "Suspended" condition reason. | ||
// When the TrainJob is suspended, this is added. | ||
TrainJobSuspendedReason string = "Suspended" | ||
|
||
// TrainJobResumedReason is the "Suspended" condition reason. | ||
// When the TrainJob suspension is changed from True to False, this is added. | ||
TrainJobResumedReason string = "Resumed" | ||
|
||
// TrainJobJobsCreationSucceededReason is the "Created" condition reason. | ||
// This is added when creating the objects succeeded after building them succeeded. | ||
TrainJobJobsCreationSucceededReason string = "JobsCreationSucceeded" | ||
|
||
// TrainJobJobsBuildFailedReason is the "Created" condition reason. | ||
// This is added when building the objects based on the TrainJob and the | ||
// specified runtime failed. | ||
TrainJobJobsBuildFailedReason string = "JobsBuildFailed" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't we need this reason as well: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In that case, the Created reason is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Basically, we preserve the failed reason rather than succeeded reason (e.g., JobsBuildSucceeded vs JobsCreationFailed) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are right, so the reasons should be the one that user can always see on the TrainJob. |
||
|
||
// TrainJobJobsCreationFailedReason is the "Created" condition reason. | ||
// This is added when creating the objects failed even though building them succeeded. | ||
TrainJobJobsCreationFailedReason string = "JobsCreationFailed" | ||
) | ||
|
||
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object | ||
// +resource:path=trainjobs | ||
|
||
|
@@ -269,7 +306,13 @@ type ContainerOverride struct { | |
// TrainJobStatus represents the current status of TrainJob. | ||
type TrainJobStatus struct { | ||
// Conditions for the TrainJob. | ||
Conditions []metav1.Condition `json:"conditions,omitempty"` | ||
// | ||
// +optional | ||
// +listType=map | ||
// +listMapKey=type | ||
// +patchStrategy=merge | ||
// +patchMergeKey=type | ||
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` | ||
|
||
// JobsStatus tracks the child Jobs in TrainJob. | ||
JobsStatus []JobStatus `json:"jobsStatus,omitempty"` | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -22,6 +22,9 @@ import ( | |||||
"fmt" | ||||||
|
||||||
"github.com/go-logr/logr" | ||||||
"k8s.io/apimachinery/pkg/api/equality" | ||||||
"k8s.io/apimachinery/pkg/api/meta" | ||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||
"k8s.io/apimachinery/pkg/runtime/schema" | ||||||
"k8s.io/client-go/tools/record" | ||||||
"k8s.io/klog/v2" | ||||||
|
@@ -36,6 +39,15 @@ import ( | |||||
|
||||||
var errorUnsupportedRuntime = errors.New("the specified runtime is not supported") | ||||||
|
||||||
type objsOpState int | ||||||
|
||||||
const ( | ||||||
succeeded objsOpState = iota | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's call it
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||||||
buildFailed objsOpState = iota | ||||||
creationFailed objsOpState = iota | ||||||
updateFailed objsOpState = iota | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the goal to represent conditions as increment int constants ? Does it require less memory than just using strings ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is Go World ENUM. Shouldn't the state be represented as Enums? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, do you know if k8s uses the same representation for the conditions ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Are you asking whether any K8s controllers (like the Deployment controller) use the same transition calculation mechanism? |
||||||
) | ||||||
|
||||||
type TrainJobReconciler struct { | ||||||
log logr.Logger | ||||||
client client.Client | ||||||
|
@@ -63,29 +75,41 @@ func (r *TrainJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c | |||||
log := ctrl.LoggerFrom(ctx).WithValues("trainJob", klog.KObj(&trainJob)) | ||||||
ctx = ctrl.LoggerInto(ctx, log) | ||||||
log.V(2).Info("Reconciling TrainJob") | ||||||
if err := r.createOrUpdateObjs(ctx, &trainJob); err != nil { | ||||||
return ctrl.Result{}, err | ||||||
if isTrainJobFinished(&trainJob) { | ||||||
log.V(5).Info("TrainJob has already been finished") | ||||||
return ctrl.Result{}, nil | ||||||
} | ||||||
// TODO (tenzen-y): Do update the status. | ||||||
return ctrl.Result{}, nil | ||||||
} | ||||||
|
||||||
func (r *TrainJobReconciler) createOrUpdateObjs(ctx context.Context, trainJob *kubeflowv2.TrainJob) error { | ||||||
log := ctrl.LoggerFrom(ctx) | ||||||
|
||||||
runtimeRefGK := runtimeRefToGroupKind(trainJob.Spec.RuntimeRef).String() | ||||||
runtime, ok := r.runtimes[runtimeRefGK] | ||||||
if !ok { | ||||||
return fmt.Errorf("%w: %s", errorUnsupportedRuntime, runtimeRefGK) | ||||||
return ctrl.Result{}, fmt.Errorf("%w: %s", errorUnsupportedRuntime, runtimeRefGK) | ||||||
} | ||||||
opState, err := r.reconcileObjects(ctx, runtime, &trainJob) | ||||||
|
||||||
originStatus := trainJob.Status.DeepCopy() | ||||||
setSuspendedCondition(&trainJob) | ||||||
setCreatedCondition(&trainJob, opState) | ||||||
if terminalCondErr := setTerminalCondition(ctx, runtime, &trainJob); terminalCondErr != nil { | ||||||
return ctrl.Result{}, errors.Join(err, terminalCondErr) | ||||||
} | ||||||
if !equality.Semantic.DeepEqual(&trainJob, originStatus) { | ||||||
return ctrl.Result{}, errors.Join(err, r.client.Status().Update(ctx, &trainJob)) | ||||||
Comment on lines
+97
to
+98
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we return error if objects are not equal and TrainJob condition needs to be updated ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This error will be returned only when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, you are right! |
||||||
} | ||||||
return ctrl.Result{}, err | ||||||
} | ||||||
|
||||||
func (r *TrainJobReconciler) reconcileObjects(ctx context.Context, runtime jobruntimes.Runtime, trainJob *kubeflowv2.TrainJob) (objsOpState, error) { | ||||||
log := ctrl.LoggerFrom(ctx) | ||||||
|
||||||
objs, err := runtime.NewObjects(ctx, trainJob) | ||||||
if err != nil { | ||||||
return err | ||||||
return buildFailed, err | ||||||
} | ||||||
for _, obj := range objs { | ||||||
var gvk schema.GroupVersionKind | ||||||
if gvk, err = apiutil.GVKForObject(obj.DeepCopyObject(), r.client.Scheme()); err != nil { | ||||||
return err | ||||||
return buildFailed, err | ||||||
} | ||||||
logKeysAndValues := []any{ | ||||||
"groupVersionKind", gvk.String(), | ||||||
|
@@ -102,21 +126,91 @@ func (r *TrainJobReconciler) createOrUpdateObjs(ctx context.Context, trainJob *k | |||||
} | ||||||
switch { | ||||||
case created: | ||||||
log.V(5).Info("Succeeded to create object", logKeysAndValues) | ||||||
log.V(5).Info("Succeeded to create object", logKeysAndValues...) | ||||||
continue | ||||||
case client.IgnoreAlreadyExists(creationErr) != nil: | ||||||
return creationErr | ||||||
return creationFailed, creationErr | ||||||
default: | ||||||
// This indicates CREATE operation has not been performed or the object has already existed in the cluster. | ||||||
if err = r.client.Update(ctx, obj); err != nil { | ||||||
return err | ||||||
return updateFailed, err | ||||||
} | ||||||
log.V(5).Info("Succeeded to update object", logKeysAndValues) | ||||||
log.V(5).Info("Succeeded to update object", logKeysAndValues...) | ||||||
} | ||||||
} | ||||||
return succeeded, nil | ||||||
} | ||||||
|
||||||
func setCreatedCondition(trainJob *kubeflowv2.TrainJob, opState objsOpState) { | ||||||
var newCond metav1.Condition | ||||||
switch opState { | ||||||
case succeeded: | ||||||
newCond = metav1.Condition{ | ||||||
Type: kubeflowv2.TrainJobCreated, | ||||||
Status: metav1.ConditionTrue, | ||||||
Message: "Succeeded to create Jobs", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We might need to move messages to the Constants as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||||||
Reason: kubeflowv2.TrainJobJobsCreationSucceededReason, | ||||||
} | ||||||
case buildFailed: | ||||||
newCond = metav1.Condition{ | ||||||
Type: kubeflowv2.TrainJobCreated, | ||||||
Status: metav1.ConditionFalse, | ||||||
Message: "Failed to build Jobs", | ||||||
Reason: kubeflowv2.TrainJobJobsBuildFailedReason, | ||||||
} | ||||||
// TODO (tenzen-y): Provide a more granular message based on whether creation or update failed. | ||||||
case creationFailed, updateFailed: | ||||||
newCond = metav1.Condition{ | ||||||
Type: kubeflowv2.TrainJobCreated, | ||||||
Status: metav1.ConditionFalse, | ||||||
Message: "Failed to create Jobs", | ||||||
Reason: kubeflowv2.TrainJobJobsCreationFailedReason, | ||||||
} | ||||||
default: | ||||||
return | ||||||
} | ||||||
meta.SetStatusCondition(&trainJob.Status.Conditions, newCond) | ||||||
} | ||||||
|
||||||
// setSuspendedCondition updates the "Suspended" condition in trainJob.Status.Conditions
// so that it follows trainJob.Spec.Suspend.
//
//   - If Spec.Suspend dereferences to true (false is the fallback passed to ptr.Deref),
//     the condition is set to True with the TrainJobSuspendedReason reason.
//   - Otherwise, if the "Suspended" condition is currently True, it is flipped to False
//     with the TrainJobResumedReason reason.
//   - Otherwise the conditions are left untouched.
func setSuspendedCondition(trainJob *kubeflowv2.TrainJob) { | ||||||
var newCond metav1.Condition | ||||||
switch { | ||||||
// The TrainJob is requested to be suspended.
case ptr.Deref(trainJob.Spec.Suspend, false): | ||||||
newCond = metav1.Condition{ | ||||||
Type: kubeflowv2.TrainJobSuspended, | ||||||
Status: metav1.ConditionTrue, | ||||||
Message: "TrainJob is suspended", | ||||||
Reason: kubeflowv2.TrainJobSuspendedReason, | ||||||
} | ||||||
// Suspension is not requested, but the status still reports Suspended=True: record the resume.
case meta.IsStatusConditionTrue(trainJob.Status.Conditions, kubeflowv2.TrainJobSuspended): | ||||||
newCond = metav1.Condition{ | ||||||
Type: kubeflowv2.TrainJobSuspended, | ||||||
Status: metav1.ConditionFalse, | ||||||
Message: "TrainJob is resumed", | ||||||
Reason: kubeflowv2.TrainJobResumedReason, | ||||||
} | ||||||
// Neither suspended nor previously reported as suspended: nothing to record.
default: | ||||||
return | ||||||
} | ||||||
meta.SetStatusCondition(&trainJob.Status.Conditions, newCond) | ||||||
} | ||||||
|
||||||
// setTerminalCondition asks the runtime for the TrainJob's terminal condition and,
// when one is returned, stores it in trainJob.Status.Conditions.
//
// It returns the error from runtime.TerminalCondition unchanged; in that case the
// conditions are not modified. A nil condition without an error is a no-op.
func setTerminalCondition(ctx context.Context, runtime jobruntimes.Runtime, trainJob *kubeflowv2.TrainJob) error { | ||||||
terminalCond, err := runtime.TerminalCondition(ctx, trainJob) | ||||||
if err != nil { | ||||||
return err | ||||||
} | ||||||
// A nil condition is skipped; presumably it means the job has not reached a terminal state yet (confirm against the Runtime implementations).
if terminalCond != nil { | ||||||
meta.SetStatusCondition(&trainJob.Status.Conditions, *terminalCond) | ||||||
} | ||||||
return nil | ||||||
} | ||||||
|
||||||
// isTrainJobFinished reports whether the TrainJob has a "Complete" or "Failed"
// condition whose status is True.
func isTrainJobFinished(trainJob *kubeflowv2.TrainJob) bool { | ||||||
return meta.IsStatusConditionTrue(trainJob.Status.Conditions, kubeflowv2.TrainJobComplete) || | ||||||
meta.IsStatusConditionTrue(trainJob.Status.Conditions, kubeflowv2.TrainJobFailed) | ||||||
} | ||||||
|
||||||
func runtimeRefToGroupKind(runtimeRef kubeflowv2.RuntimeRef) schema.GroupKind { | ||||||
return schema.GroupKind{ | ||||||
Group: ptr.Deref(runtimeRef.APIGroup, ""), | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.