From 20d68b74d20e698388cf845c48bd3a1fccf2fc75 Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Tue, 31 Oct 2023 14:50:38 +0200 Subject: [PATCH 1/4] Keep the populator pod on failure This patch will keep the populator pod existence when failing. It won't restart or recreated. This is in the mind of aligning with v2v pod failure and the thought that once we fail, we will most likely keep failing. This change will allow to get the populator logs in case of failure in a easy way. Signed-off-by: Liran Rotenberg --- pkg/controller/plan/migration.go | 18 ++++++++++++++++-- .../populator-machinery/BUILD.bazel | 1 - .../populator-machinery/controller.go | 8 -------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 0e9506bb4..cf3a34fe3 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -1453,7 +1453,7 @@ func (r *Migration) updateCopyProgress(vm *plan.VMStatus, step *plan.Step) (err if r.Plan.Spec.Warm && len(importer.Status.ContainerStatuses) > 0 { vm.Warm.Failures = int(importer.Status.ContainerStatuses[0].RestartCount) } - if RestartLimitExceeded(importer) { + if restartLimitExceeded(importer) { task.MarkedCompleted() msg, _ := terminationMessage(importer) task.AddError(msg) @@ -1598,6 +1598,10 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St if err != nil { return } + populatorPods, err := r.kubevirt.getPopulatorPods() + if err != nil { + return + } for _, pvc := range pvcs { if _, ok := pvc.Annotations["lun"]; ok { @@ -1616,6 +1620,16 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } + for _, pod := range populatorPods { + pvcId := strings.Split(pod.Name, "populate-")[1] + if string(pvc.UID) != pvcId { + continue + } + if pod.Status.Phase == core.PodFailed { + return fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + } + } + if pvc.Status.Phase == core.ClaimBound { task.Phase = Completed task.Reason = TransferCompleted @@ -1701,7 +1715,7 @@ func terminationMessage(pod *core.Pod) (msg string, ok bool) { } // Return whether the pod has failed and restarted too many times. -func RestartLimitExceeded(pod *core.Pod) (exceeded bool) { +func restartLimitExceeded(pod *core.Pod) (exceeded bool) { if len(pod.Status.ContainerStatuses) == 0 { return } diff --git a/pkg/lib-volume-populator/populator-machinery/BUILD.bazel b/pkg/lib-volume-populator/populator-machinery/BUILD.bazel index d9ec31e6f..af0c99c7b 100644 --- a/pkg/lib-volume-populator/populator-machinery/BUILD.bazel +++ b/pkg/lib-volume-populator/populator-machinery/BUILD.bazel @@ -10,7 +10,6 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/apis/forklift/v1beta1", - "//pkg/controller/plan", "//vendor/k8s.io/api/core/v1:core", "//vendor/k8s.io/api/storage/v1:storage", "//vendor/k8s.io/apimachinery/pkg/api/errors", diff --git a/pkg/lib-volume-populator/populator-machinery/controller.go b/pkg/lib-volume-populator/populator-machinery/controller.go index 28d862585..44f40cb42 100644 --- a/pkg/lib-volume-populator/populator-machinery/controller.go +++ b/pkg/lib-volume-populator/populator-machinery/controller.go @@ -31,7 +31,6 @@ import ( "time" "github.com/konveyor/forklift-controller/pkg/apis/forklift/v1beta1" - "github.com/konveyor/forklift-controller/pkg/controller/plan" corev1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -698,13 +697,6 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str if corev1.PodSucceeded != pod.Status.Phase { if corev1.PodFailed == pod.Status.Phase { c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message) - // Delete failed pods so we can try again - if !plan.RestartLimitExceeded(pod) { - err = c.kubeClient.CoreV1().Pods(populatorNamespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}) - if err != nil { - return err - } - } } // We'll get called again later when the pod succeeds return nil From 4fabbf5678465d0489b202c70a74853532a2c3cb Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Tue, 31 Oct 2023 15:41:40 +0200 Subject: [PATCH 2/4] Decreace updatePopulatorCopyProgress complexity Signed-off-by: Liran Rotenberg --- pkg/controller/plan/migration.go | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index cf3a34fe3..6227bd52d 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -1620,14 +1620,9 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } - for _, pod := range populatorPods { - pvcId := strings.Split(pod.Name, "populate-")[1] - if string(pvc.UID) != pvcId { - continue - } - if pod.Status.Phase == core.PodFailed { - return fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) - } + err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) + if err != nil { + return } if pvc.Status.Phase == core.ClaimBound { @@ -1652,6 +1647,19 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St return } +func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) error { + for _, pod := range populatorPods { + pvcId := strings.Split(pod.Name, "populate-")[1] + if givenPvcId != pvcId { + continue + } + if pod.Status.Phase == core.PodFailed { + return fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + } + } + return nil +} + func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) { podList, err := r.kubevirt.GetPodsWithLabels(map[string]string{}) if err != nil { From 92195e12b12d0b254c080c8b0111664f16c29a76 Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Wed, 1 Nov 2023 16:35:11 +0200 Subject: [PATCH 3/4] Refactor the code Signed-off-by: Liran Rotenberg --- pkg/controller/plan/kubevirt.go | 4 ++-- pkg/controller/plan/migration.go | 16 +++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pkg/controller/plan/kubevirt.go b/pkg/controller/plan/kubevirt.go index 69eeb49fc..1eff5b3e6 100644 --- a/pkg/controller/plan/kubevirt.go +++ b/pkg/controller/plan/kubevirt.go @@ -903,7 +903,7 @@ func (r *KubeVirt) SetPopulatorPodOwnership(vm *plan.VMStatus) (err error) { return } for _, pod := range pods { - pvcId := strings.Split(pod.Name, "populate-")[1] + pvcId := pod.Name[len(PopulatorPodPrefix):] for _, pvc := range pvcs { if string(pvc.UID) != pvcId { continue @@ -987,7 +987,7 @@ func (r *KubeVirt) getPopulatorPods() (pods []core.Pod, err error) { return nil, liberr.Wrap(err) } for _, pod := range migrationPods.Items { - if strings.HasPrefix(pod.Name, "populate-") { + if strings.HasPrefix(pod.Name, PopulatorPodPrefix) { pods = append(pods, pod) } } diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 6227bd52d..61df1d0e1 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -89,7 +89,8 @@ const ( ) const ( - TransferCompleted = "Transfer completed." + TransferCompleted = "Transfer completed." + PopulatorPodPrefix = "populate-" ) var ( @@ -1620,7 +1621,7 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } - err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) + _, err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) if err != nil { return } @@ -1647,17 +1648,18 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St return } -func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) error { +func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) (bool, error) { for _, pod := range populatorPods { - pvcId := strings.Split(pod.Name, "populate-")[1] + pvcId := pod.Name[len(PopulatorPodPrefix):] if givenPvcId != pvcId { continue } if pod.Status.Phase == core.PodFailed { - return fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + return true, fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) } + break } - return nil + return false, nil } func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) { @@ -1666,7 +1668,7 @@ func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID st return } for _, pod := range podList.Items { - if strings.HasPrefix(pod.Name, "populate-") { + if strings.HasPrefix(pod.Name, PopulatorPodPrefix) { // it's populator pod if _, ok := pod.Labels["migration"]; !ok { // un-labeled pod, we need to set it From 51d4ce3bfe31e97d7e9163eba9a174e73bf1a4fd Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Tue, 7 Nov 2023 18:27:16 +0200 Subject: [PATCH 4/4] Save annotation of restarts This change will count the number of restarts for the populator pod on the destination PVC. That allows us to limit the number of recreations of the pod. After 3 attempts, it will stop recreating it. Signed-off-by: Liran Rotenberg --- pkg/controller/plan/migration.go | 31 ++++++++++-------- .../populator-machinery/controller.go | 32 ++++++++++++++++++- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 61df1d0e1..a29323c83 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -1599,10 +1599,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St if err != nil { return } - populatorPods, err := r.kubevirt.getPopulatorPods() - if err != nil { - return - } for _, pvc := range pvcs { if _, ok := pvc.Annotations["lun"]; ok { @@ -1621,11 +1617,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } - _, err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) - if err != nil { - return - } - if pvc.Status.Phase == core.ClaimBound { task.Phase = Completed task.Reason = TransferCompleted @@ -1641,25 +1632,39 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St } percent := float64(transferredBytes/0x100000) / float64(task.Progress.Total) - task.Progress.Completed = int64(percent * float64(task.Progress.Total)) + newProgress := int64(percent * float64(task.Progress.Total)) + if newProgress == task.Progress.Completed { + pvcId := string(pvc.UID) + populatorFailed := r.isPopulatorPodFailed(pvcId) + if populatorFailed { + return fmt.Errorf("populator pod failed for PVC %s. Please check the pod logs", pvcId) + } + } + task.Progress.Completed = newProgress } step.ReflectTasks() return } -func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) (bool, error) { +// Checks if the populator pod failed when the progress didn't change +func (r *Migration) isPopulatorPodFailed(givenPvcId string) bool { + populatorPods, err := r.kubevirt.getPopulatorPods() + if err != nil { + r.Log.Error(err, "couldn't get the populator pods") + return false + } for _, pod := range populatorPods { pvcId := pod.Name[len(PopulatorPodPrefix):] if givenPvcId != pvcId { continue } if pod.Status.Phase == core.PodFailed { - return true, fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + return true } break } - return false, nil + return false } func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) { diff --git a/pkg/lib-volume-populator/populator-machinery/controller.go b/pkg/lib-volume-populator/populator-machinery/controller.go index 44f40cb42..e92f690b8 100644 --- a/pkg/lib-volume-populator/populator-machinery/controller.go +++ b/pkg/lib-volume-populator/populator-machinery/controller.go @@ -76,6 +76,7 @@ const ( reasonPVCCreationError = "PopulatorPVCCreationError" reasonPopulatorProgress = "PopulatorProgress" AnnDefaultNetwork = "v1.multus-cni.io/default-network" + AnnPopulatorReCreations = "recreations" qemuGroup = 107 ) @@ -696,7 +697,18 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str if corev1.PodSucceeded != pod.Status.Phase { if corev1.PodFailed == pod.Status.Phase { - c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message) + restarts, ok := pvc.Annotations[AnnPopulatorReCreations] + if !ok { + return c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, 1) + } + restartsInteger, err := strconv.Atoi(restarts) + if err != nil { + return err + } + if restartsInteger < 3 { + return c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, restartsInteger+1) + } + c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed after few (3) attempts: Please check the logs of the populator pod, %s/%s", populatorNamespace, pod.Name) } // We'll get called again later when the pod succeeds return nil @@ -791,6 +803,24 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str return nil } +func (c *controller) retryFailedPopulator(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace, podName string, counter int) error { + pvc.Annotations[AnnPopulatorReCreations] = strconv.Itoa(counter) + err := c.updatePvc(ctx, pvc, namespace) + if err != nil { + return err + } + err = c.kubeClient.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{}) + if err != nil { + return err + } + return nil +} + +func (c *controller) updatePvc(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace string) (err error) { + _, err = c.kubeClient.CoreV1().PersistentVolumeClaims(namespace).Update(ctx, pvc, metav1.UpdateOptions{}) + return err +} + func (c *controller) updateProgress(pvc *corev1.PersistentVolumeClaim, podIP string, cr *unstructured.Unstructured) error { populatorKind := pvc.Spec.DataSourceRef.Kind var diskRegex = regexp.MustCompile(fmt.Sprintf(`volume_populators_%s\{%s=\"([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})"\} (\d{1,3}.*)`,