Skip to content

Commit

Permalink
chore(trial): retry reconcilation when reporting unavailable metrics …
Browse files Browse the repository at this point in the history
…failed.

Signed-off-by: Electronic-Waste <[email protected]>
  • Loading branch information
Electronic-Waste committed Aug 25, 2024
1 parent 45e4446 commit e0bd76a
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pkg/controller.v1beta1/trial/trial_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error {
// if job has succeeded and if observation field is available.
// if job has failed
// This will ensure that trial is set to be complete only if metric is collected at least once
r.UpdateTrialStatusCondition(instance, deployedJob.GetName(), jobStatus)
return r.UpdateTrialStatusCondition(instance, deployedJob.GetName(), jobStatus)
}
return nil
}
Expand Down
34 changes: 34 additions & 0 deletions pkg/controller.v1beta1/trial/trial_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,40 @@ func TestReconcileBatchJob(t *testing.T) {
}, timeout).Should(gomega.BeTrue())
})

t.Run(`Trail with "Complete" BatchJob and Unavailable metrics(Push MC, failed once).`, func(t *testing.T) {
g := gomega.NewGomegaWithT(t)
gomock.InOrder(
mockManagerClient.EXPECT().GetTrialObservationLog(gomock.Any()).Return(observationLogUnavailable, nil).MinTimes(2),
mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, errMetricsNotReported).MinTimes(1),
mockManagerClient.EXPECT().ReportTrialObservationLog(gomock.Any(), gomock.Any()).Return(nil, nil).MinTimes(1),
mockManagerClient.EXPECT().DeleteTrialObservationLog(gomock.Any()).Return(nil, nil),
)
// Create the Trial with Push MC
trial := newFakeTrialBatchJob(commonv1beta1.PushCollector)
g.Expect(c.Create(ctx, trial)).NotTo(gomega.HaveOccurred())

// Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason.
// Metrics unavailable because GetTrialObservationLog returns "unavailable".
g.Eventually(func() bool {
if err = c.Get(ctx, trialKey, trial); err != nil {
return false
}
return trial.IsMetricsUnavailable() &&
len(trial.Status.Observation.Metrics) > 0 &&
trial.Status.Observation.Metrics[0].Min == consts.UnavailableMetricValue &&
trial.Status.Observation.Metrics[0].Max == consts.UnavailableMetricValue &&
trial.Status.Observation.Metrics[0].Latest == consts.UnavailableMetricValue
}, timeout).Should(gomega.BeTrue())

// Delete the Trial
g.Expect(c.Delete(ctx, trial)).NotTo(gomega.HaveOccurred())

// Expect that Trial is deleted
g.Eventually(func() bool {
return errors.IsNotFound(c.Get(ctx, trialKey, &trialsv1beta1.Trial{}))
}, timeout).Should(gomega.BeTrue())
})

t.Run("Update status for empty Trial", func(t *testing.T) {
g := gomega.NewGomegaWithT(t)
g.Expect(r.updateStatus(&trialsv1beta1.Trial{})).To(gomega.HaveOccurred())
Expand Down
7 changes: 5 additions & 2 deletions pkg/controller.v1beta1/trial/trial_controller_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ const (
)

// UpdateTrialStatusCondition updates Trial status from current deployed Job status
func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Trial, deployedJobName string, jobStatus *trialutil.TrialJobStatus) {
func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Trial, deployedJobName string, jobStatus *trialutil.TrialJobStatus) error {
logger := log.WithValues("Trial", types.NamespacedName{Name: instance.GetName(), Namespace: instance.GetNamespace()})

timeNow := metav1.Now()
Expand Down Expand Up @@ -70,10 +70,12 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria
msg := "Metrics are not available"
reason := TrialMetricsUnavailableReason

// If the type of metrics collector is Push, We should insert an unavailable value to Katib DB
// If the type of metrics collector is Push, We should insert an unavailable value to Katib DB.
// We would retry reconcilation if some error occurs while we report unavailable metrics.
if instance.Spec.MetricsCollector.Collector.Kind == commonv1beta1.PushCollector {
if err := r.reportUnavailableMetrics(instance); err != nil {
logger.Error(err, "Failed to insert unavailable value to Katib DB")
return errMetricsNotReported
}
}

Expand Down Expand Up @@ -126,6 +128,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria
// TODO(gaocegege): Should we maintain a TrialsRunningCount?
}
// else nothing to do
return nil
}

func (r *ReconcileTrial) UpdateTrialStatusObservation(instance *trialsv1beta1.Trial) error {
Expand Down

0 comments on commit e0bd76a

Please sign in to comment.