diff --git a/src/components/cl/urom/Makefile.am b/src/components/cl/urom/Makefile.am
index 6e67c38406..8562ca5e58 100644
--- a/src/components/cl/urom/Makefile.am
+++ b/src/components/cl/urom/Makefile.am
@@ -13,14 +13,13 @@ sources =             \
 	cl_urom_context.c \
 	cl_urom_team.c    \
 	cl_urom_coll.c    \
-    cl_urom_memcpy.c  \
 	$(alltoall)
 
 module_LTLIBRARIES         = libucc_cl_urom.la
 libucc_cl_urom_la_SOURCES  = $(sources)
-libucc_cl_urom_la_CPPFLAGS = $(AM_CPPFLAGS) $(BASE_CPPFLAGS) $(UROM_CPPFLAGS) $(CUDA_CPPFLAGS)
+libucc_cl_urom_la_CPPFLAGS = $(AM_CPPFLAGS) $(BASE_CPPFLAGS) $(UROM_CPPFLAGS)
 libucc_cl_urom_la_CFLAGS   = $(BASE_CFLAGS)
-libucc_cl_urom_la_LDFLAGS  = -version-info $(SOVERSION) --as-needed $(UROM_LDFLAGS) $(CUDA_LDFLAGS)
-libucc_cl_urom_la_LIBADD   = $(UROM_LIBADD) $(CUDA_LIBS) $(UCC_TOP_BUILDDIR)/src/libucc.la 
+libucc_cl_urom_la_LDFLAGS  = -version-info $(SOVERSION) --as-needed $(UROM_LDFLAGS)
+libucc_cl_urom_la_LIBADD   = $(UROM_LIBADD) $(UCC_TOP_BUILDDIR)/src/libucc.la
 
 include $(top_srcdir)/config/module.am
diff --git a/src/components/cl/urom/alltoall/alltoall.c b/src/components/cl/urom/alltoall/alltoall.c
index 783d33c118..146fb52ed9 100644
--- a/src/components/cl/urom/alltoall/alltoall.c
+++ b/src/components/cl/urom/alltoall/alltoall.c
@@ -5,7 +5,6 @@
  */
 
 #include "alltoall.h"
-//#include "utils/arch/cuda_def.h"
 
 ucc_base_coll_alg_info_t
     ucc_cl_urom_alltoall_algs[UCC_CL_UROM_ALLTOALL_ALG_LAST + 1] = {
@@ -65,34 +64,41 @@ static ucc_status_t ucc_cl_urom_alltoall_full_start(ucc_coll_task_t *task)
         .ucc.cmd_type      = UROM_WORKER_CMD_UCC_COLL,
         .ucc.coll_cmd.coll_args = coll_args,
         .ucc.coll_cmd.team = cl_team->teams[0],
-        .ucc.coll_cmd.use_xgvmi = cl_lib->xgvmi_enabled,
+        .ucc.coll_cmd.use_xgvmi = ctx->xgvmi_enabled,
     };
+    ucc_memory_type_t prev_src, prev_dst;
+
+    prev_src = coll_args->src.info.mem_type;
+    prev_dst = coll_args->dst.info.mem_type;
+    coll_args->src.info.mem_type = UCC_MEMORY_TYPE_HOST;
+    coll_args->dst.info.mem_type = UCC_MEMORY_TYPE_HOST;
+    urom_status = urom_worker_push_cmdq(cl_lib->urom_ctx.urom_worker, 0, &coll_cmd);
+    if (UROM_OK != urom_status) {
+        cl_debug(&cl_lib->super, "failed to push collective to urom");
+        return UCC_ERR_NO_MESSAGE;
+    }
+    coll_args->src.info.mem_type = prev_src;
+    coll_args->dst.info.mem_type = prev_dst;
 
+/*
     if (coll_args->src.info.mem_type != UCC_MEMORY_TYPE_CUDA) {
-        urom_status = urom_worker_push_cmdq(cl_lib->urom_worker, 0, &coll_cmd);
+        urom_status = urom_worker_push_cmdq(cl_lib->urom_ctx.urom_worker, 0, &coll_cmd);
         if (UROM_OK != urom_status) {
             cl_debug(&cl_lib->super, "failed to push collective to urom");
             return UCC_ERR_NO_MESSAGE;
         }
     } else {
-#if HAVE_CUDA  
-        // FIXME: a better way is to tweak args in urom 
-        cudaStreamSynchronize(cl_lib->cuda_stream);
-
         coll_args->src.info.mem_type = UCC_MEMORY_TYPE_HOST;
         coll_args->dst.info.mem_type = UCC_MEMORY_TYPE_HOST;
-        urom_status = urom_worker_push_cmdq(cl_lib->urom_worker, 0, &coll_cmd);
+        urom_status = urom_worker_push_cmdq(cl_lib->urom_ctx.urom_worker, 0, &coll_cmd);
         if (UROM_OK != urom_status) {
             cl_debug(&cl_lib->super, "failed to push collective to urom");
             return UCC_ERR_NO_MESSAGE;
         }
         coll_args->src.info.mem_type = UCC_MEMORY_TYPE_CUDA;
         coll_args->dst.info.mem_type = UCC_MEMORY_TYPE_CUDA;
-#else
-        cl_error(&cl_lib->super, "attempting to use CUDA without CUDA support");
-        return UCC_ERR_NO_RESOURCE;
-#endif
     }
+*/
     task->status = UCC_INPROGRESS;
     cl_debug(&cl_lib->super, "pushed the collective to urom");
     return ucc_progress_queue_enqueue(ctx->super.super.ucc_context->pq, task);
@@ -117,7 +123,7 @@ static void ucc_cl_urom_alltoall_full_progress(ucc_coll_task_t *ctask)
     urom_status_t           urom_status = 0;
     urom_worker_notify_t   *notif;
 
-    urom_status = urom_worker_pop_notifyq(cl_lib->urom_worker, 0, &notif);
+    urom_status = urom_worker_pop_notifyq(cl_lib->urom_ctx.urom_worker, 0, &notif);
     if (UROM_ERR_QUEUE_EMPTY == urom_status) {
         return;
     }
@@ -133,25 +139,14 @@ static void ucc_cl_urom_alltoall_full_progress(ucc_coll_task_t *ctask)
         return;
     }
 
-    if (cl_lib->xgvmi_enabled) {
+    if (ctx->req_mc) {
         size_t size_mod = dt_size(ctask->bargs.args.dst.info.datatype);
 
-        if (cl_lib->req_mc) {
-            if (ctask->bargs.args.dst.info.mem_type != UCC_MEMORY_TYPE_CUDA) {
-                memcpy(cl_lib->old_dest, ctask->bargs.args.dst.info.buffer, ctask->bargs.args.src.info.count * size_mod);
-            } else {
-    #if HAVE_CUDA
-                cudaMemcpyAsync(cl_lib->old_dest, ctask->bargs.args.dst.info.buffer, ctask->bargs.args.src.info.count * size_mod , cudaMemcpyHostToDevice, cl_lib->cuda_stream);
-                cudaStreamSynchronize(cl_lib->cuda_stream);
-    #else
-                cl_error(&cl_lib->super, "attempting to use CUDA without CUDA support");
-                return UCC_ERR_NO_RESOURCE;
-    #endif
-            }
-            ctask->bargs.args.dst.info.buffer = cl_lib->old_dest;
-            ctask->bargs.args.src.info.buffer = cl_lib->old_src;
+        if ((ucc_status_t) notif->ucc.status == UCC_OK) {
+            ucc_mc_memcpy(ctx->old_dest, ctask->bargs.args.dst.info.buffer, ctask->bargs.args.dst.info.count * size_mod, ctask->bargs.args.dst.info.mem_type, UCC_MEMORY_TYPE_HOST);
+            ctask->bargs.args.dst.info.buffer = ctx->old_dest;
+            ctask->bargs.args.src.info.buffer = ctx->old_src;
         }
-
     }
     cl_debug(&cl_lib->super, "completed the collective from urom");
 
@@ -165,6 +160,7 @@ ucc_status_t ucc_cl_urom_alltoall_full_init(
     ucc_cl_urom_team_t     *cl_team = ucc_derived_of(team, ucc_cl_urom_team_t);
     ucc_cl_urom_context_t *ctx  = UCC_CL_UROM_TEAM_CTX(cl_team);
     ucc_cl_urom_lib_t *cl_lib = ucc_derived_of(ctx->super.super.lib, ucc_cl_urom_lib_t);
+
     ucc_cl_urom_schedule_t *cl_schedule;
     ucc_base_coll_args_t    args;
     ucc_schedule_t         *schedule;
@@ -175,26 +171,17 @@ ucc_status_t ucc_cl_urom_alltoall_full_init(
         return UCC_ERR_NO_MEMORY;
     }
     schedule = &cl_schedule->super.super;
-    if (cl_lib->xgvmi_enabled) {
+    if (ctx->req_mc) {
         size_t size_mod = dt_size(coll_args->args.src.info.datatype);
-        if (cl_lib->req_mc) {
-            //memcpy args to xgvmi buffer
-            void * ptr = cl_lib->xgvmi_buffer + (cl_lib->cfg.xgvmi_buffer_size * (schedule->super.seq_num % cl_lib->cfg.num_buffers));
-            if (coll_args->args.src.info.mem_type != UCC_MEMORY_TYPE_CUDA) {
-                memcpy(ptr, coll_args->args.src.info.buffer, coll_args->args.src.info.count * size_mod);
-            } else {
-    #if HAVE_CUDA
-                cudaMemcpyAsync(ptr, coll_args->args.src.info.buffer, coll_args->args.src.info.count * size_mod, cudaMemcpyDeviceToHost, cl_lib->cuda_stream);
-    #else
-                cl_error(&cl_lib->super, "attempting to use CUDA without CUDA support");
-                return UCC_ERR_NO_RESOURCE;
-    #endif
-            }
-            cl_lib->old_src = coll_args->args.src.info.buffer;
-            coll_args->args.src.info.buffer = ptr;
-            cl_lib->old_dest = coll_args->args.dst.info.buffer;
-            coll_args->args.dst.info.buffer = ptr + coll_args->args.src.info.count * size_mod;
-        } 
+        size_t count = coll_args->args.src.info.count * size_mod;
+        //memcpy args to xgvmi buffer
+        void * ptr = ctx->xgvmi.xgvmi_buffer + (cl_lib->cfg.xgvmi_buffer_size * (schedule->super.seq_num % cl_lib->cfg.num_buffers));
+        ucc_mc_memcpy(ptr, coll_args->args.src.info.buffer, count, UCC_MEMORY_TYPE_HOST, coll_args->args.src.info.mem_type);
+
+        ctx->old_src = coll_args->args.src.info.buffer;
+        coll_args->args.src.info.buffer = ptr;
+        ctx->old_dest = coll_args->args.dst.info.buffer;
+        coll_args->args.dst.info.buffer = ptr + count;
     } 
     memcpy(&args, coll_args, sizeof(args));
     status = ucc_schedule_init(schedule, &args, team); 
@@ -211,5 +198,6 @@ ucc_status_t ucc_cl_urom_alltoall_full_init(
         ucc_cl_urom_alltoall_triggered_post_setup;
 
     *task = &schedule->super;
+    cl_debug(cl_lib, "urom coll init'd");
     return UCC_OK;
 }
diff --git a/src/components/cl/urom/alltoall/tags b/src/components/cl/urom/alltoall/tags
deleted file mode 100644
index 0a18dd4a17..0000000000
--- a/src/components/cl/urom/alltoall/tags
+++ /dev/null
@@ -1,36 +0,0 @@
-!_TAG_FILE_FORMAT	2	/extended format; --format=1 will not append ;" to lines/
-!_TAG_FILE_SORTED	1	/0=unsorted, 1=sorted, 2=foldcase/
-!_TAG_OUTPUT_EXCMD	mixed	/number, pattern, mixed, or combineV2/
-!_TAG_OUTPUT_FILESEP	slash	/slash or backslash/
-!_TAG_OUTPUT_MODE	u-ctags	/u-ctags or e-ctags/
-!_TAG_PATTERN_LENGTH_LIMIT	96	/0 for no limit/
-!_TAG_PROC_CWD	/work/nvidia/ucc-urom-cl/src/components/cl/urom/alltoall/	//
-!_TAG_PROGRAM_AUTHOR	Universal Ctags Team	//
-!_TAG_PROGRAM_NAME	Universal Ctags	/Derived from Exuberant Ctags/
-!_TAG_PROGRAM_URL	https://ctags.io/	/official site/
-!_TAG_PROGRAM_VERSION	5.9.0	//
-UCC_CL_UROM_ALLTOALL_ALG_FULL	alltoall.h	/^    UCC_CL_UROM_ALLTOALL_ALG_FULL,$/;"	e	enum:__anon795cd0f00103
-UCC_CL_UROM_ALLTOALL_ALG_LAST	alltoall.h	/^    UCC_CL_UROM_ALLTOALL_ALG_LAST,$/;"	e	enum:__anon795cd0f00103
-UROM_ALLTOALL_H_	alltoall.h	/^#define UROM_ALLTOALL_H_$/;"	d
-__anon795cd0f00103	alltoall.h	/^{$/;"	g
-args	alltoall.c	/^    ucc_base_coll_args_t    args;$/;"	v	typeref:typename:ucc_base_coll_args_t
-cl_schedule	alltoall.c	/^    ucc_cl_urom_schedule_t *cl_schedule;$/;"	v	typeref:typename:ucc_cl_urom_schedule_t *
-coll_cmd	alltoall.c	/^    urom_worker_cmd_t   coll_cmd = {$/;"	v	typeref:typename:urom_worker_cmd_t
-elem_size	alltoall.c	/^    size_t                  elem_size;$/;"	v	typeref:typename:size_t
-full_only	alltoall.c	/^    int                     full_only = 0;$/;"	v	typeref:typename:int
-full_size	alltoall.c	/^    ucc_rank_t              full_size, node_size;$/;"	v	typeref:typename:ucc_rank_t
-node_size	alltoall.c	/^    ucc_rank_t              full_size, node_size;$/;"	v	typeref:typename:ucc_rank_t
-rdt_size	alltoall.c	/^    size_t                  sdt_size, rdt_size;$/;"	v	typeref:typename:size_t
-sbgp	alltoall.c	/^    ucc_sbgp_t             *sbgp;$/;"	v	typeref:typename:ucc_sbgp_t *
-schedule	alltoall.c	/^    ucc_schedule_t         *schedule;$/;"	v	typeref:typename:ucc_schedule_t *
-sdt_size	alltoall.c	/^    size_t                  sdt_size, rdt_size;$/;"	v	typeref:typename:size_t
-status	alltoall.c	/^    ucc_status_t            status;$/;"	v	typeref:typename:ucc_status_t
-task_full	alltoall.c	/^    ucc_coll_task_t        *task_node, *task_full;$/;"	v	typeref:typename:ucc_coll_task_t *
-task_node	alltoall.c	/^    ucc_coll_task_t        *task_node, *task_full;$/;"	v	typeref:typename:ucc_coll_task_t *
-ucc_cl_urom_alltoall_alg_from_str	alltoall.h	/^static inline int ucc_cl_urom_alltoall_alg_from_str(const char *str)$/;"	f	typeref:typename:int
-ucc_cl_urom_alltoall_algs	alltoall.c	/^    ucc_cl_urom_alltoall_algs[UCC_CL_UROM_ALLTOALL_ALG_LAST + 1] = {$/;"	v	typeref:typename:ucc_base_coll_alg_info_t[]
-ucc_cl_urom_alltoall_finalize	alltoall.c	/^static ucc_status_t ucc_cl_urom_alltoall_finalize(ucc_coll_task_t *task)$/;"	f	typeref:typename:ucc_status_t	file:
-ucc_cl_urom_alltoall_start	alltoall.c	/^static ucc_status_t ucc_cl_urom_alltoall_start(ucc_coll_task_t *task)$/;"	f	typeref:typename:ucc_status_t	file:
-ucc_cl_urom_alltoall_triggered_post_setup	alltoall.c	/^ucc_status_t ucc_cl_urom_alltoall_triggered_post_setup(ucc_coll_task_t *task)$/;"	f	typeref:typename:ucc_status_t
-ucc_status	alltoall.c	/^    ucc_status_t        ucc_status;$/;"	v	typeref:typename:ucc_status_t
-urom_status	alltoall.c	/^    urom_status_t       urom_status;$/;"	v	typeref:typename:urom_status_t
diff --git a/src/components/cl/urom/cl_urom.h b/src/components/cl/urom/cl_urom.h
index 6eb3d712cd..d838fc6f29 100644
--- a/src/components/cl/urom/cl_urom.h
+++ b/src/components/cl/urom/cl_urom.h
@@ -15,15 +15,10 @@
 #include <urom/api/urom.h>
 #include <urom/api/urom_ucc.h>
 
-#include "utils/arch/cuda_def.h"
-
 #ifndef UCC_CL_UROM_DEFAULT_SCORE
 #define UCC_CL_UROM_DEFAULT_SCORE 20
 #endif
 
-#define OFFSET_SIZE (128*1024*1024)
-#define NUM_OFFSETS 8
-
 typedef struct ucc_cl_urom_iface {
     ucc_cl_iface_t super;
 } ucc_cl_urom_iface_t;
@@ -47,54 +42,56 @@ typedef struct ucc_cl_urom_context_config {
     ucc_cl_context_config_t super;
 } ucc_cl_urom_context_config_t;
 
+typedef struct urom_ctx {
+    urom_service_h urom_service;
+    urom_worker_h  urom_worker;
+    void          *urom_worker_addr;
+    size_t         urom_worker_len;
+    uint64_t       worker_id;
+    int            pass_dc_exist;
+} urom_ctx_t;
+
+typedef struct xgvmi_info {
+    ucp_mem_h xgvmi_memh;
+    void     *packed_mkey;
+    uint64_t  packed_mkey_len;
+    void     *packed_xgvmi_memh;
+    uint64_t  packed_xgvmi_len;
+    void     *xgvmi_buffer;
+    size_t    xgvmi_size;
+} xgvmi_info_t;
+
 typedef struct ucc_cl_urom_lib {
     ucc_cl_lib_t             super;
     ucc_cl_urom_lib_config_t cfg;
-    urom_service_h           urom_service;
-    urom_worker_h            urom_worker;
-    void *                   urom_ucc_ctx_h;
-    void                    *urom_worker_addr;
-    size_t                   urom_worker_len;
-    uint64_t                 worker_id;
-    int                      pass_dc_exist;
-    int                      xgvmi_enabled;
-    ucp_mem_h                xgvmi_memh;
-    void *                   packed_mkey;
-    uint64_t                 packed_mkey_len;
-    void *                   packed_xgvmi_memh;
-    uint64_t                 packed_xgvmi_len;
-    void *                   xgvmi_buffer;
-    size_t                   xgvmi_size;
-    int                      req_mc;
-    void *                   old_dest;
-    void *                   old_src;
-    int                      xgvmi_offsets[NUM_OFFSETS];
-    int                      seq_num;
+    urom_ctx_t               urom_ctx;
     int                      tl_ucp_index; //FIXME: make this better
-    //void *                   cuda_stream;
-    #if HAVE_CUDA
-    cudaStream_t             cuda_stream;
-    #endif
-    ucc_rank_t               ctx_rank; //FIXME: this is not right
 } ucc_cl_urom_lib_t;
 UCC_CLASS_DECLARE(ucc_cl_urom_lib_t, const ucc_base_lib_params_t *,
                   const ucc_base_config_t *);
 
 typedef struct ucc_cl_urom_context {
-    ucc_cl_context_t   super;
-    urom_domain_h      urom_domain;
-    ucc_mpool_t        sched_mp;
+    ucc_cl_context_t super;
+    urom_domain_h    urom_domain;
+    void            *urom_ucc_ctx_h;
+    ucc_mpool_t      sched_mp;
+    xgvmi_info_t     xgvmi;
+    int              xgvmi_enabled;
+    int              req_mc;
+    void            *old_dest;
+    void            *old_src;
+    ucc_rank_t       ctx_rank;     //FIXME: this is not right
 } ucc_cl_urom_context_t;
 UCC_CLASS_DECLARE(ucc_cl_urom_context_t, const ucc_base_context_params_t *,
                   const ucc_base_config_t *);
 
 typedef struct ucc_cl_urom_team {
-    ucc_cl_team_t            super;
-    int                      team_posted;
-    ucc_team_h             **teams;
-    unsigned                 n_teams;
-    ucc_coll_score_t        *score;
-    ucc_score_map_t         *score_map;
+    ucc_cl_team_t     super;
+    int               team_posted;
+    ucc_team_h      **teams;
+    unsigned          n_teams;
+    ucc_coll_score_t *score;
+    ucc_score_map_t  *score_map;
 } ucc_cl_urom_team_t;
 UCC_CLASS_DECLARE(ucc_cl_urom_team_t, ucc_base_context_t *,
                   const ucc_base_team_params_t *);
@@ -103,7 +100,7 @@ ucc_status_t ucc_cl_urom_coll_init(ucc_base_coll_args_t *coll_args,
                                    ucc_base_team_t      *team,
                                    ucc_coll_task_t     **task);
 
-#define UCC_CL_UROM_TEAM_CTX(_team)                                           \
+#define UCC_CL_UROM_TEAM_CTX(_team)                                            \
     (ucc_derived_of((_team)->super.super.context, ucc_cl_urom_context_t))
 
 #endif
diff --git a/src/components/cl/urom/cl_urom_coll.c b/src/components/cl/urom/cl_urom_coll.c
index 5db56c1fdf..52decf3613 100644
--- a/src/components/cl/urom/cl_urom_coll.c
+++ b/src/components/cl/urom/cl_urom_coll.c
@@ -26,25 +26,25 @@ ucc_status_t ucc_cl_urom_coll_init(ucc_base_coll_args_t *coll_args,
     int ucp_index = urom_lib->tl_ucp_index;
     ucc_tl_ucp_context_t *tl_ctx = ucc_derived_of(ctx->super.tl_ctxs[ucp_index], ucc_tl_ucp_context_t);
     urom_status_t urom_status;
-    if (!urom_lib->pass_dc_exist) {
+    if (!urom_lib->urom_ctx.pass_dc_exist) {
         urom_worker_cmd_t pass_dc_cmd = {
             .cmd_type = UROM_WORKER_CMD_UCC,
             .ucc.cmd_type = UROM_WORKER_CMD_CREATE_PASSIVE_DATA_CHANNEL,
-            .ucc.dpu_worker_id = urom_lib->ctx_rank,
+            .ucc.dpu_worker_id = ctx->ctx_rank,
             .ucc.pass_dc_create_cmd.ucp_addr = tl_ctx->worker.worker_address,
             .ucc.pass_dc_create_cmd.addr_len = tl_ctx->worker.ucp_addrlen,
         };
         urom_worker_notify_t *notif;
 
-        urom_worker_push_cmdq(urom_lib->urom_worker, 0, &pass_dc_cmd);
+        urom_worker_push_cmdq(urom_lib->urom_ctx.urom_worker, 0, &pass_dc_cmd);
         while (UROM_ERR_QUEUE_EMPTY ==
-               (urom_status = urom_worker_pop_notifyq(urom_lib->urom_worker, 0, &notif))) {
+               (urom_status = urom_worker_pop_notifyq(urom_lib->urom_ctx.urom_worker, 0, &notif))) {
             sched_yield();
         }
         if ((ucc_status_t) notif->ucc.status != UCC_OK) {
             return (ucc_status_t) notif->ucc.status;
         }
-        urom_lib->pass_dc_exist = 1;
+        urom_lib->urom_ctx.pass_dc_exist = 1;
     }
     switch (coll_args->args.coll_type) {
         case UCC_COLL_TYPE_ALLTOALL:
diff --git a/src/components/cl/urom/cl_urom_context.c b/src/components/cl/urom/cl_urom_context.c
index 111cc13fc1..ac62e75632 100644
--- a/src/components/cl/urom/cl_urom_context.c
+++ b/src/components/cl/urom/cl_urom_context.c
@@ -94,26 +94,34 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t,
         self->super.tl_ctxs = NULL;
         return UCC_ERR_NOT_FOUND;
     }
-    urom_lib->worker_id = UROM_WORKER_ID_ANY;
-    urom_lib->ctx_rank = params->params.oob.oob_ep;
-    urom_status = urom_worker_spawn(
-        urom_lib->urom_service, UROM_WORKER_TYPE_UCC, urom_lib->urom_worker_addr,
-        &urom_lib->urom_worker_len, &urom_lib->worker_id);
-    if (UROM_OK != urom_status) {
-        cl_error(&urom_lib->super, "failed to connect to urom worker");
-        return UCC_ERR_NO_MESSAGE;
-    }
 
-    worker_params.serviceh        = urom_lib->urom_service;
-    worker_params.addr            = urom_lib->urom_worker_addr;
-    worker_params.addr_len        = urom_lib->urom_worker_len;
-    worker_params.num_cmd_notifyq = 1;
+    if (!urom_lib->urom_ctx.urom_worker) {
+        /* issues
+         *  1. user creates multiple contexts? 
+         *  2. worker sharing
+         */
+        urom_lib->urom_ctx.worker_id = 0;//UROM_WORKER_ID_ANY;
+        self->ctx_rank = params->params.oob.oob_ep;
+        urom_status = urom_worker_spawn(
+            urom_lib->urom_ctx.urom_service, UROM_WORKER_TYPE_UCC, urom_lib->urom_ctx.urom_worker_addr,
+            &urom_lib->urom_ctx.urom_worker_len, &urom_lib->urom_ctx.worker_id);
+        if (UROM_OK != urom_status) {
+            cl_error(&urom_lib->super, "failed to connect to urom worker");
+            return UCC_ERR_NO_MESSAGE;
+        }
+
+        worker_params.serviceh        = urom_lib->urom_ctx.urom_service;
+        worker_params.addr            = urom_lib->urom_ctx.urom_worker_addr;
+        worker_params.addr_len        = urom_lib->urom_ctx.urom_worker_len;
+        worker_params.num_cmd_notifyq = 1;
 
-    urom_status = urom_worker_connect(&worker_params, &urom_lib->urom_worker);
-    if (UROM_OK != urom_status) {
-        cl_error(&urom_lib->super, "failed to perform urom_worker_connect() with error: %s",
-                 urom_status_string(urom_status));
-        return UCC_ERR_NO_MESSAGE;
+        urom_status = urom_worker_connect(&worker_params, &urom_lib->urom_ctx.urom_worker);
+        if (UROM_OK != urom_status) {
+            cl_error(&urom_lib->super, "failed to perform urom_worker_connect() with error: %s",
+                     urom_status_string(urom_status));
+            return UCC_ERR_NO_MESSAGE;
+        }
+        cl_debug(&urom_lib->super, "connected to the urom worker");
     }
     tl_ctx = ucc_derived_of(self->super.tl_ctxs[ucp_index], ucc_tl_ucp_context_t);
     urom_domain_params.flags = UROM_DOMAIN_WORKER_ADDR;
@@ -128,24 +136,24 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t,
     urom_domain_params.oob.oob_index = params->params.oob.oob_ep;
 
     urom_domain_params.domain_worker_id = params->params.oob.oob_ep;
-    urom_domain_params.workers = &urom_lib->urom_worker;
+    urom_domain_params.workers = &urom_lib->urom_ctx.urom_worker;
     urom_domain_params.num_workers = 1,
     urom_domain_params.domain_size = params->params.oob.n_oob_eps;
-    urom_lib->req_mc = 1; /* requires a memcpy */
+    self->req_mc = 1; /* requires a memcpy */
 
     if (params->context->params.mask & UCC_CONTEXT_PARAM_FIELD_OOB &&
         params->context->params.mask & UCC_CONTEXT_PARAM_FIELD_MEM_PARAMS) {
         
         /* remap the segments for xgvmi if enabled */
-        urom_lib->xgvmi_buffer = ucc_mem_params.segments[0].address;
-        urom_lib->xgvmi_size = ucc_mem_params.segments[0].len;
+        self->xgvmi.xgvmi_buffer = ucc_mem_params.segments[0].address;
+        self->xgvmi.xgvmi_size = ucc_mem_params.segments[0].len;
         if (urom_lib->cfg.use_xgvmi) {
-            urom_lib->req_mc = 0;
+            self->req_mc = 0;
         }
     } else {
-        urom_lib->xgvmi_size = urom_lib->cfg.num_buffers * urom_lib->cfg.xgvmi_buffer_size;
-        urom_lib->xgvmi_buffer = ucc_calloc(1, urom_lib->xgvmi_size, "xgvmi buffer");
-        if (!urom_lib->xgvmi_buffer) {
+        self->xgvmi.xgvmi_size = urom_lib->cfg.num_buffers * urom_lib->cfg.xgvmi_buffer_size;
+        self->xgvmi.xgvmi_buffer = ucc_calloc(1, self->xgvmi.xgvmi_size, "xgvmi buffer");
+        if (!self->xgvmi.xgvmi_buffer) {
             return UCC_ERR_NO_MEMORY;
         }
     }
@@ -160,17 +168,17 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t,
 
     // mem_map the segment
     mem_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | UCP_MEM_MAP_PARAM_FIELD_LENGTH;
-    mem_params.address = urom_lib->xgvmi_buffer;
-    mem_params.length = urom_lib->xgvmi_size;
+    mem_params.address = self->xgvmi.xgvmi_buffer;
+    mem_params.length = self->xgvmi.xgvmi_size;
 
-    ucs_status = ucp_mem_map(tl_ctx->worker.ucp_context, &mem_params, &urom_lib->xgvmi_memh);
+    ucs_status = ucp_mem_map(tl_ctx->worker.ucp_context, &mem_params, &self->xgvmi.xgvmi_memh);
     assert(ucs_status == UCS_OK);
 
     if (urom_lib->cfg.use_xgvmi) {
         pack_params.field_mask = UCP_MEMH_PACK_PARAM_FIELD_FLAGS;
         pack_params.flags = UCP_MEMH_PACK_FLAG_EXPORT;
 
-        ucs_status = ucp_memh_pack(urom_lib->xgvmi_memh, &pack_params, &urom_lib->packed_xgvmi_memh, &urom_lib->packed_xgvmi_len);
+        ucs_status = ucp_memh_pack(self->xgvmi.xgvmi_memh, &pack_params, &self->xgvmi.packed_xgvmi_memh, &self->xgvmi.packed_xgvmi_len);
         if (ucs_status != UCS_OK) {
             cl_error(&urom_lib->super.super, "ucp_memh_pack() returned error: %s", ucs_status_string(ucs_status));
             cl_error(&urom_lib->super.super, "xgvmi will be disabled");
@@ -180,27 +188,27 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t,
         }
     }
 
-    ucs_status = ucp_rkey_pack(tl_ctx->worker.ucp_context, urom_lib->xgvmi_memh, &urom_lib->packed_mkey,
-                               &urom_lib->packed_mkey_len);
+    ucs_status = ucp_rkey_pack(tl_ctx->worker.ucp_context, self->xgvmi.xgvmi_memh, &self->xgvmi.packed_mkey,
+                               &self->xgvmi.packed_mkey_len);
     if (UCS_OK != ucs_status) {
         printf("ucp_rkey_pack() returned error: %s\n",
                ucs_status_string(ucs_status));
         return UCC_ERR_NO_RESOURCE;
     }
     domain_mem_map[0].mask = UROM_WORKER_MEM_MAP_FIELD_BASE_VA | UROM_WORKER_MEM_MAP_FIELD_MKEY;
-    domain_mem_map[0].base_va = (uint64_t)urom_lib->xgvmi_buffer;
-    domain_mem_map[0].len = urom_lib->xgvmi_size;
-    domain_mem_map[0].mkey = urom_lib->packed_mkey;
-    domain_mem_map[0].mkey_len = urom_lib->packed_mkey_len;
+    domain_mem_map[0].base_va = (uint64_t)self->xgvmi.xgvmi_buffer;
+    domain_mem_map[0].len = self->xgvmi.xgvmi_size;
+    domain_mem_map[0].mkey = self->xgvmi.packed_mkey;
+    domain_mem_map[0].mkey_len = self->xgvmi.packed_mkey_len;
     if (xgvmi_level) {
         domain_mem_map[0].mask |= UROM_WORKER_MEM_MAP_FIELD_MEMH;
-        domain_mem_map[0].memh = urom_lib->packed_xgvmi_memh;
-        domain_mem_map[0].memh_len = urom_lib->packed_xgvmi_len;
+        domain_mem_map[0].memh = self->xgvmi.packed_xgvmi_memh;
+        domain_mem_map[0].memh_len = self->xgvmi.packed_xgvmi_len;
     }
     urom_domain_params.mask |= UROM_DOMAIN_PARAM_FIELD_MEM_MAP;
     urom_domain_params.mem_map.segments = domain_mem_map;
     urom_domain_params.mem_map.n_segments = 1;
-    urom_lib->xgvmi_enabled = xgvmi_level; //FIXME: for now, just use xgvmi buffers
+    self->xgvmi_enabled = xgvmi_level; //FIXME: for now, just use xgvmi buffers
 
     urom_status = urom_domain_create_post(&urom_domain_params, &self->urom_domain);
     if (urom_status < UROM_OK) {
@@ -214,10 +222,10 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t,
         return UCC_ERR_NO_MESSAGE;
     }
 
-    urom_worker_push_cmdq(urom_lib->urom_worker, 0, &init_cmd);
+    urom_worker_push_cmdq(urom_lib->urom_ctx.urom_worker, 0, &init_cmd);
     
     while (UROM_ERR_QUEUE_EMPTY ==
-           (urom_status = urom_worker_pop_notifyq(urom_lib->urom_worker, 0, &notif_lib))) {
+           (urom_status = urom_worker_pop_notifyq(urom_lib->urom_ctx.urom_worker, 0, &notif_lib))) {
         sched_yield();
     }
     if ((ucc_status_t) notif_lib->ucc.status != UCC_OK) {
@@ -227,18 +235,11 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t,
         cl_debug(&urom_lib->super.super, "debug: lib created\n");
     }
 
-    if (urom_lib->xgvmi_enabled || n_segments == 1) {
-        ctx_cmd.ucc.context_create_cmd.base_va = urom_lib->xgvmi_buffer;
-        ctx_cmd.ucc.context_create_cmd.len = urom_lib->xgvmi_size;
-    } else {
-        if (params->context->params.mask & UCC_CONTEXT_PARAM_FIELD_MEM_PARAMS) {
-            ctx_cmd.ucc.context_create_cmd.base_va = ucc_mem_params.segments[0].address;
-            ctx_cmd.ucc.context_create_cmd.len = ucc_mem_params.segments[0].len;
-        }
-    }
-    urom_worker_push_cmdq(urom_lib->urom_worker, 0, &ctx_cmd);
+    ctx_cmd.ucc.context_create_cmd.base_va = self->xgvmi.xgvmi_buffer;
+    ctx_cmd.ucc.context_create_cmd.len = self->xgvmi.xgvmi_size;
+    urom_worker_push_cmdq(urom_lib->urom_ctx.urom_worker, 0, &ctx_cmd);
     while (UROM_ERR_QUEUE_EMPTY ==
-           (urom_status = urom_worker_pop_notifyq(urom_lib->urom_worker, 0, &notif_ctx)) 
+           (urom_status = urom_worker_pop_notifyq(urom_lib->urom_ctx.urom_worker, 0, &notif_ctx)) 
             ) {
         sched_yield();
     }
@@ -247,7 +248,7 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t,
         return (ucc_status_t) notif_ctx->ucc.status;
     }
 
-    urom_lib->urom_ucc_ctx_h = notif_ctx->ucc.context_create_nqe.context;
+    self->urom_ucc_ctx_h = notif_ctx->ucc.context_create_nqe.context;
 
     status = ucc_mpool_init(&self->sched_mp, 0, sizeof(ucc_cl_urom_schedule_t),
                             0, UCC_CACHE_LINE_SIZE, 2, UINT_MAX,
@@ -258,10 +259,6 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t,
         return UCC_ERR_NO_MESSAGE;
     }
 
-#if HAVE_CUDA
-    cudaStreamCreateWithFlags(&urom_lib->cuda_stream, cudaStreamNonBlocking);
-#endif
-
     cl_debug(cl_config->cl_lib, "initialized cl context: %p", self);
     return UCC_OK;
 }
@@ -274,7 +271,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_cl_urom_context_t)
         .ucc.cmd_type = UROM_WORKER_CMD_UCC_CONTEXT_DESTROY,
         .ucc.context_destroy_cmd = 
         {
-            .context_h = urom_lib->urom_ucc_ctx_h,
+            .context_h = self->urom_ucc_ctx_h,
         },
     };
     urom_worker_notify_t *notif;
@@ -287,16 +284,14 @@ UCC_CLASS_CLEANUP_FUNC(ucc_cl_urom_context_t)
     }
     ucc_free(self->super.tl_ctxs);
     
-    urom_worker_push_cmdq(urom_lib->urom_worker, 0, &ctx_destroy_cmd);
+    urom_worker_push_cmdq(urom_lib->urom_ctx.urom_worker, 0, &ctx_destroy_cmd);
     while (UROM_ERR_QUEUE_EMPTY ==
-           (urom_status = urom_worker_pop_notifyq(urom_lib->urom_worker, 0, &notif))) {
+           (urom_status = urom_worker_pop_notifyq(urom_lib->urom_ctx.urom_worker, 0, &notif))) {
         sched_yield();
     }
-    if (urom_lib->req_mc) {
-        ucc_free(urom_lib->xgvmi_buffer);
+    if (self->req_mc) {
+        ucc_free(self->xgvmi.xgvmi_buffer);
     }
-
-    cudaStreamDestroy(urom_lib->cuda_stream);
 }
 
 UCC_CLASS_DEFINE(ucc_cl_urom_context_t, ucc_cl_context_t);
diff --git a/src/components/cl/urom/cl_urom_lib.c b/src/components/cl/urom/cl_urom_lib.c
index f23714e796..4fdc1339e6 100644
--- a/src/components/cl/urom/cl_urom_lib.c
+++ b/src/components/cl/urom/cl_urom_lib.c
@@ -90,31 +90,22 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_lib_t, const ucc_base_lib_params_t *params,
     memcpy(&self->cfg, cl_config, sizeof(*cl_config));
 
     cl_debug(&self->super, "initialized lib object: %p", self);
+    memset(&self->urom_ctx, 0, sizeof(urom_ctx_t));
 
     /* how to know service here? */
-    ret = device_connect(self, device, &self->urom_service);
+    ret = device_connect(self, device, &self->urom_ctx.urom_service);
     if (ret) {
         cl_error(&self->super, "failed to connect to urom");
         return UCC_ERR_NO_RESOURCE;
     }
 
-    self->urom_worker_addr = ucc_calloc(1, UROM_WORKER_ADDR_MAX_LEN, "urom worker addr");
-    if (!self->urom_worker_addr) {
+    self->urom_ctx.urom_worker_addr = ucc_calloc(1, UROM_WORKER_ADDR_MAX_LEN, "urom worker addr");
+    if (!self->urom_ctx.urom_worker_addr) {
         cl_error(&self->super, "failed to allocate %d bytes", UROM_WORKER_ADDR_MAX_LEN);
         return UCC_ERR_NO_MEMORY;
     }
 
-    self->pass_dc_exist = 0;
-    self->xgvmi_enabled = 0;
-/*
-    if (!self->cfg.num_buffers) {
-        self->cfg.num_buffers = 1;
-    }
-    if (!self->cfg.xgvmi_buffer_size) {
-        self->cfg.xgvmi_buffer_size = OFFSET_SIZE;
-    } 
-*/
-//    memset(self->xgvmi_offsets, 0, sizeof(int) * self->cfg.num_buffers);
+    self->tl_ucp_index = -1;
     return UCC_OK;
 }
 
@@ -122,17 +113,17 @@ UCC_CLASS_CLEANUP_FUNC(ucc_cl_urom_lib_t)
 {
     urom_status_t urom_status;
     cl_debug(&self->super, "finalizing lib object: %p", self);
-    urom_status = urom_worker_disconnect(self->urom_worker);
+    urom_status = urom_worker_disconnect(self->urom_ctx.urom_worker);
     if (urom_status != UROM_OK) {
         cl_error(self, "Failed to disconnect to UROM Worker");
     }
 
-    urom_status = urom_worker_destroy(self->urom_service, self->worker_id);
+    urom_status = urom_worker_destroy(self->urom_ctx.urom_service, self->urom_ctx.worker_id);
     if (urom_status != UROM_OK) {
         cl_error(self, "Failed to destroy UROM Worker");
     }
 
-    urom_status = urom_service_disconnect(self->urom_service);
+    urom_status = urom_service_disconnect(self->urom_ctx.urom_service);
     if (urom_status != UROM_OK) {
         cl_error(self, "Failed to disconnect from UROM service");
     }
diff --git a/src/components/cl/urom/cl_urom_memcpy.c b/src/components/cl/urom/cl_urom_memcpy.c
deleted file mode 100644
index 8ab547bd1a..0000000000
--- a/src/components/cl/urom/cl_urom_memcpy.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/**
- * Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- *
- * See file LICENSE for terms.
- */
-
-#include "cl_urom.h"
-#include "utils/ucc_malloc.h"
-
-#include <string.h>
-
-#include "utils/arch/cuda_def.h"
-
-ucc_status_t memcpy_init(ucc_cl_urom_lib_t *cl_lib)
-{
-#if 0 
-    cudaStreamCreateWithFlags(&cl_lib->cuda_stream, cudaStreamNonBlocking);
-#endif
-    return UCC_OK;
-}
-
-ucc_status_t memcpy_nb(void *dst,
-              void *src,
-              ucc_memory_type_t src_mem_type,
-              ucc_memory_type_t dst_mem_type,
-              size_t n,
-              ucc_cl_urom_lib_t *cl_lib)
-{
-#ifndef HAVE_CUDA
-    if (src_mem_type == UCC_MEMORY_TYPE_CUDA ||
-        dst_mem_type == UCC_MEMORY_TYPE_CUDA) {
-        cl_error(cl_lib, "Unsupported operation. Did you build UCC with CUDA support?");
-        return UCC_ERR_NOT_SUPPORTED;
-    }
-#else 
-    /* copy from host to cuda */
-    if (dst_mem_type == UCC_MEMORY_TYPE_CUDA) {
-        printf("IM HOST->CUDA");
-#if 0
-        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, cl_lib->cuda_stream);
-#endif
-        return UCC_OK;
-    } else if (src_mem_type == UCC_MEMORY_TYPE_CUDA) {
-        /* copy from cuda to host */
-        printf("IM CUDA->HOST");
-#if 0
-        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, cl_lib->cuda_stream);
-#endif
-        return UCC_OK;
-    }
-#endif
-    else {
-        /* copy from host to host */
-        memcpy(dst, src, n);
-    }
-    return UCC_OK;
-}
-
-ucc_status_t memcpy_sync(ucc_cl_urom_lib_t *cl_lib)
-{
-#if 0
-    cudaStreamSynchronize(cl_lib->cuda_stream);
-#endif
-    return UCC_OK;
-}
diff --git a/src/components/cl/urom/cl_urom_team.c b/src/components/cl/urom/cl_urom_team.c
index 68dd93cb58..1fc3f77e0f 100644
--- a/src/components/cl/urom/cl_urom_team.c
+++ b/src/components/cl/urom/cl_urom_team.c
@@ -18,7 +18,7 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_team_t, ucc_base_context_t *cl_context,
     ucc_status_t            status;
     urom_worker_cmd_t team_cmd = {
         .cmd_type = UROM_WORKER_CMD_UCC,
-        .ucc.dpu_worker_id = params->rank,
+        .ucc.dpu_worker_id = ctx->ctx_rank,
         .ucc.cmd_type = UROM_WORKER_CMD_UCC_TEAM_CREATE,
         /* FIXME: proper way: use ec map.. for now assume linear */
         .ucc.team_create_cmd = 
@@ -39,7 +39,7 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_team_t, ucc_base_context_t *cl_context,
     self->n_teams = 0;
     self->score_map = NULL;
    
-    urom_status = urom_worker_push_cmdq(urom_lib->urom_worker, 0, &team_cmd);
+    urom_status = urom_worker_push_cmdq(urom_lib->urom_ctx.urom_worker, 0, &team_cmd);
     if (UROM_OK != urom_status) {
         cl_error(cl_context->lib, "failed to create team");
         return UCC_ERR_NO_MESSAGE;
@@ -73,7 +73,7 @@ ucc_status_t ucc_cl_urom_team_create_test(ucc_base_team_t *cl_team)
     ucc_status_t            ucc_status;
     urom_worker_notify_t   *notif;
 
-    urom_status = urom_worker_pop_notifyq(urom_lib->urom_worker, 0, &notif);
+    urom_status = urom_worker_pop_notifyq(urom_lib->urom_ctx.urom_worker, 0, &notif);
     if (UROM_ERR_QUEUE_EMPTY != urom_status) {
         if (urom_status == UROM_OK) {
             if (notif->ucc.status == (urom_status_t)UCC_OK) {
@@ -124,8 +124,6 @@ ucc_status_t ucc_cl_urom_team_get_scores(ucc_base_team_t   *cl_team,
         team_info.size                = UCC_CL_TEAM_SIZE(team);
 
         status = ucc_coll_score_update_from_str(ctx->score_str, &team_info, &team->super.super, *score);
-/*            ctx->score_str, *score, UCC_CL_TEAM_SIZE(team), NULL, cl_team,
-            UCC_CL_UROM_DEFAULT_SCORE, NULL, NULL, 0);*/
 
         /* If INVALID_PARAM - User provided incorrect input - try to proceed */
         if ((status < 0) && (status != UCC_ERR_INVALID_PARAM) &&