diff --git a/src/components/cl/hier/Makefile.am b/src/components/cl/hier/Makefile.am
index 5377ef4528..ed86a4e395 100644
--- a/src/components/cl/hier/Makefile.am
+++ b/src/components/cl/hier/Makefile.am
@@ -1,54 +1,56 @@
 #
-# Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 
-allgatherv = 					\
-	allgatherv/allgatherv.h 	\
-	allgatherv/allgatherv.c
+allgatherv =                         \
+    allgatherv/unpack.h              \
+    allgatherv/unpack.c              \
+    allgatherv/allgatherv.h          \
+    allgatherv/allgatherv.c
 
 allreduce =                          \
-	allreduce/allreduce.h            \
-	allreduce/allreduce.c            \
-	allreduce/allreduce_rab.c        \
-	allreduce/allreduce_split_rail.c
-
-alltoallv =                   \
-	alltoallv/alltoallv.h     \
-	alltoallv/alltoallv.c
-
-alltoall =                    \
-	alltoall/alltoall.h       \
-	alltoall/alltoall.c
-
-barrier =                     \
-	barrier/barrier.h         \
-	barrier/barrier.c
-
-bcast =                       \
-	bcast/bcast.h             \
-	bcast/bcast.c             \
-	bcast/bcast_2step.c
-
-reduce =                      \
-	reduce/reduce.h           \
-	reduce/reduce.c           \
-	reduce/reduce_2step.c
-
-sources =             \
-	cl_hier.h         \
-	cl_hier.c         \
-	cl_hier_lib.c     \
-	cl_hier_context.c \
-	cl_hier_team.c    \
-	cl_hier_coll.c    \
-	cl_hier_coll.h    \
-	$(allgatherv)     \
-	$(allreduce)      \
-	$(alltoallv)      \
-	$(alltoall)       \
-	$(barrier)        \
-	$(bcast)          \
-	$(reduce)
+    allreduce/allreduce.h            \
+    allreduce/allreduce.c            \
+    allreduce/allreduce_rab.c        \
+    allreduce/allreduce_split_rail.c
+
+alltoallv =                          \
+    alltoallv/alltoallv.h            \
+    alltoallv/alltoallv.c
+
+alltoall =                           \
+    alltoall/alltoall.h              \
+    alltoall/alltoall.c
+
+barrier =                            \
+    barrier/barrier.h                \
+    barrier/barrier.c
+
+bcast =                              \
+    bcast/bcast.h                    \
+    bcast/bcast.c                    \
+    bcast/bcast_2step.c
+
+reduce =                             \
+    reduce/reduce.h                  \
+    reduce/reduce.c                  \
+    reduce/reduce_2step.c
+
+sources =                            \
+    cl_hier.h                        \
+    cl_hier.c                        \
+    cl_hier_lib.c                    \
+    cl_hier_context.c                \
+    cl_hier_team.c                   \
+    cl_hier_coll.c                   \
+    cl_hier_coll.h                   \
+    $(allgatherv)                    \
+    $(allreduce)                     \
+    $(alltoallv)                     \
+    $(alltoall)                      \
+    $(barrier)                       \
+    $(bcast)                         \
+    $(reduce)
 
 module_LTLIBRARIES         = libucc_cl_hier.la
 libucc_cl_hier_la_SOURCES  = $(sources)
diff --git a/src/components/cl/hier/allgatherv/allgatherv.c b/src/components/cl/hier/allgatherv/allgatherv.c
index 6710517c77..778c17427f 100755
--- a/src/components/cl/hier/allgatherv/allgatherv.c
+++ b/src/components/cl/hier/allgatherv/allgatherv.c
@@ -1,14 +1,15 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
 
 #include "allgatherv.h"
+#include "unpack.h"
 #include "../cl_hier_coll.h"
 #include "core/ucc_team.h"
 
-#define MAX_ALLGATHERV_TASKS 3
+#define MAX_ALLGATHERV_TASKS 4
 
 ucc_base_coll_alg_info_t
     ucc_cl_hier_allgatherv_algs[UCC_CL_HIER_ALLGATHERV_ALG_LAST + 1] = {
@@ -119,8 +120,6 @@ UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_allgatherv_init,
     size_t leader_disps_size;
     size_t total_count;
     void *node_gathered_data;
-    //ucc_ee_executor_task_args_t eargs = {0};
-    //ucc_ee_executor_t *exec;
 
     schedule = &ucc_cl_hier_get_schedule(cl_team)->super.super;
     if (ucc_unlikely(!schedule)) {
@@ -230,10 +229,9 @@ UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_allgatherv_init,
         n_tasks++;
     }
 
-    args = args_old;
-
     if (SBGP_ENABLED(cl_team, NODE) &&
         cl_team->top_sbgp != UCC_HIER_SBGP_NODE) {
+        args = args_old;
         args.args.coll_type = UCC_COLL_TYPE_BCAST;
         args.args.mask |= UCC_COLL_ARGS_FIELD_FLAGS;
         args.args.flags |= UCC_COLL_ARGS_FLAG_IN_PLACE;
@@ -249,20 +247,9 @@ UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_allgatherv_init,
         n_tasks++;
 
         if (!is_contig) {
-            printf("not contig, scheduling unpack operation\n"); /*
-            UCC_CHECK_GOTO(ucc_coll_task_get_executor(&schedule->super, &exec), out, status);
-            eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY;
-            size_t disp_counter = ucc_coll_args_get_total_count(&args.args, args.args.dst.info_v.counts, team_size);
-            for (i = team_size - 1; i >= 0; i++) {
-                size_t this_rank_count = ucc_coll_args_get_count(&args.args, args.args.dst.info_v.counts, i);
-                disp_counter -= this_rank_count;
-                eargs.copy.src  = PTR_OFFSET(args.args.dst.info_v.buffer, disp_counter * dt_size);
-                eargs.copy.dst  = PTR_OFFSET(args.args.dst.info_v.buffer, ucc_coll_args_get_displacement(&args.args, args.args.dst.info_v.displacements, i) * dt_size);
-                eargs.copy.len  = this_rank_count * dt_size;
-                ucc_ee_executor_task_t **ee_task_pp = NULL;
-                UCC_CHECK_GOTO(ucc_ee_executor_task_post(exec, &eargs, &tasks[n_tasks]), out, status);
-                n_tasks++;
-            }*/
+            args = args_old;
+            UCC_CHECK_GOTO(ucc_cl_hier_allgatherv_unpack_init(&args, team, &tasks[n_tasks]), out, status);
+            n_tasks++;
         }
     }
 
@@ -279,6 +266,7 @@ UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_allgatherv_init,
         UCC_CHECK_GOTO(ucc_schedule_add_task(schedule, tasks[i]), out, status);
     }
 
+    schedule->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
     schedule->super.post     = ucc_cl_hier_allgatherv_start;
     schedule->super.finalize = ucc_cl_hier_allgatherv_finalize;
     *task                    = &schedule->super;
diff --git a/src/components/cl/hier/allgatherv/unpack.c b/src/components/cl/hier/allgatherv/unpack.c
new file mode 100644
index 0000000000..a4bb25ecb9
--- /dev/null
+++ b/src/components/cl/hier/allgatherv/unpack.c
@@ -0,0 +1,124 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "unpack.h"
+
+ucc_status_t ucc_cl_hier_allgatherv_unpack_finalize(ucc_coll_task_t *task)
+{
+    ucc_schedule_t *schedule = ucc_derived_of(task, ucc_schedule_t);
+    ucc_cl_hier_schedule_t *cl_schedule = ucc_derived_of(schedule, ucc_cl_hier_schedule_t);
+
+    ucc_mc_free(cl_schedule->scratch);
+
+    return UCC_OK;
+}
+
+void ucc_cl_hier_allgatherv_unpack_progress(ucc_coll_task_t *task)
+{
+    ucc_schedule_t *schedule = ucc_derived_of(task, ucc_schedule_t);
+    ucc_cl_hier_team_t  *cl_team = ucc_derived_of(task->team, ucc_cl_hier_team_t);
+    ucc_rank_t           team_size = UCC_CL_TEAM_SIZE(cl_team);
+    ucc_cl_hier_schedule_t *cl_schedule = ucc_derived_of(schedule, ucc_cl_hier_schedule_t);
+    ucc_ee_executor_task_t **tasks = cl_schedule->scratch->addr;
+    ucc_status_t st;
+    ucc_rank_t i;
+
+    for (i = 0; i < team_size; i++) {
+        ucc_ee_executor_task_t *etask = tasks[i];
+        if (etask != NULL) {
+            st = ucc_ee_executor_task_test(etask);
+            if (st == UCC_OK) {
+                ucc_ee_executor_task_finalize(etask);
+                tasks[i] = NULL;
+            } else {
+                if (ucc_likely(st > 0)) {
+                    st = UCC_INPROGRESS;
+                }
+                goto out;
+            }
+        }
+    }
+
+out:
+    schedule->super.status       = st;
+    schedule->super.super.status = st;
+}
+
+ucc_status_t ucc_cl_hier_allgatherv_unpack_start(ucc_coll_task_t *task)
+{
+    ucc_schedule_t *schedule = ucc_derived_of(task, ucc_schedule_t);
+    ucc_cl_hier_team_t  *cl_team = ucc_derived_of(task->team, ucc_cl_hier_team_t);
+    ucc_rank_t           team_size = UCC_CL_TEAM_SIZE(cl_team);
+    ucc_coll_args_t       *args = &task->bargs.args;
+    //ucc_rank_t           rank = UCC_CL_TEAM_RANK(cl_team);
+    ucc_ee_executor_task_args_t eargs = {0};
+    ucc_cl_hier_schedule_t *cl_schedule = ucc_derived_of(schedule, ucc_cl_hier_schedule_t);
+    ucc_ee_executor_task_t **tasks = cl_schedule->scratch->addr;
+    ucc_rank_t n_tasks = 0;
+    size_t dt_size = ucc_dt_size(args->dst.info_v.datatype);
+    ucc_ee_executor_t *exec;
+    ucc_status_t status;
+    int i;
+
+    UCC_CHECK_GOTO(ucc_coll_task_get_executor(&schedule->super, &exec), out, status);
+    eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY;
+    size_t disp_counter = ucc_coll_args_get_total_count(args, args->dst.info_v.counts, team_size);
+
+    for (i = team_size - 1; i >= 0; i--) {
+        size_t this_rank_count = ucc_coll_args_get_count(args, args->dst.info_v.counts, i);
+        disp_counter -= this_rank_count;
+        eargs.copy.src  = PTR_OFFSET(args->dst.info_v.buffer, disp_counter * dt_size);
+        eargs.copy.dst  = PTR_OFFSET(args->dst.info_v.buffer, ucc_coll_args_get_displacement(args, args->dst.info_v.displacements, i) * dt_size);
+        eargs.copy.len  = this_rank_count * dt_size;
+        UCC_CHECK_GOTO(ucc_ee_executor_task_post(exec, &eargs, &tasks[n_tasks]), out, status);
+        n_tasks++;
+    }
+
+    schedule->super.status       = UCC_INPROGRESS;
+    schedule->super.super.status = UCC_INPROGRESS;
+
+    ucc_progress_queue_enqueue(cl_team->super.super.context->ucc_context->pq, task);
+
+    return UCC_OK;
+out:
+    return status;
+}
+
+ucc_status_t ucc_cl_hier_allgatherv_unpack_init(ucc_base_coll_args_t *coll_args,
+                                                ucc_base_team_t      *team,
+                                                ucc_coll_task_t     **task_h)
+{
+    ucc_cl_hier_team_t  *cl_team = ucc_derived_of(team, ucc_cl_hier_team_t);
+    ucc_rank_t           team_size = UCC_CL_TEAM_SIZE(cl_team);
+    //ucc_rank_t           rank = UCC_CL_TEAM_RANK(cl_team);
+    ucc_status_t         status;
+    ucc_schedule_t      *schedule;
+    ucc_cl_hier_schedule_t *cl_schedule;
+    size_t scratch_size;
+    
+
+    schedule = &ucc_cl_hier_get_schedule(cl_team)->super.super;
+    if (ucc_unlikely(!schedule)) {
+        return UCC_ERR_NO_MEMORY;
+    }
+    cl_schedule = ucc_derived_of(schedule, ucc_cl_hier_schedule_t);
+
+    UCC_CHECK_GOTO(ucc_schedule_init(schedule, coll_args, team), out, status);
+
+    scratch_size = team_size * sizeof(ucc_ee_executor_task_t*);
+    UCC_CHECK_GOTO(ucc_mc_alloc(&cl_schedule->scratch, scratch_size, UCC_MEMORY_TYPE_HOST), out, status);
+
+    schedule->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
+    schedule->super.post     = ucc_cl_hier_allgatherv_unpack_start;
+    schedule->super.progress = ucc_cl_hier_allgatherv_unpack_progress;
+    schedule->super.finalize = ucc_cl_hier_allgatherv_unpack_finalize;
+
+    *task_h = &schedule->super;
+    return UCC_OK;
+out:
+    ucc_cl_hier_put_schedule(schedule);
+    return status;
+}
diff --git a/src/components/cl/hier/allgatherv/unpack.h b/src/components/cl/hier/allgatherv/unpack.h
new file mode 100644
index 0000000000..8e59774dc2
--- /dev/null
+++ b/src/components/cl/hier/allgatherv/unpack.h
@@ -0,0 +1,15 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "../cl_hier_coll.h"
+#include "core/ucc_team.h"
+
+ucc_status_t ucc_cl_hier_allgatherv_unpack_init(ucc_base_coll_args_t *coll_args,
+                                                ucc_base_team_t      *team,
+                                                ucc_coll_task_t     **task_h);
+ucc_status_t ucc_cl_hier_allgatherv_unpack_start(ucc_coll_task_t *task);
+void         ucc_cl_hier_allgatherv_unpack_progress(ucc_coll_task_t *task);
+ucc_status_t ucc_cl_hier_allgatherv_unpack_finalize(ucc_coll_task_t *task);
diff --git a/src/components/topo/ucc_sbgp.c b/src/components/topo/ucc_sbgp.c
index ae0001f38e..cee2226f4b 100644
--- a/src/components/topo/ucc_sbgp.c
+++ b/src/components/topo/ucc_sbgp.c
@@ -245,6 +245,7 @@ static ucc_status_t sbgp_create_node_leaders(ucc_topo_t *topo, ucc_sbgp_t *sbgp,
             nl_array_3[sbgp_id + host_id * max_ctx_sbgp_size]++;
         }
 
+        /* Find the first rank that maps to this node, store in nl_array_2 */
         if (nl_array_1[host_id] == 0 || nl_array_1[host_id] == ctx_nlr) {
             nl_array_2[host_id] = i;
         }