diff --git a/src/components/cl/urom/Makefile.am b/src/components/cl/urom/Makefile.am index 6e67c38406..8562ca5e58 100644 --- a/src/components/cl/urom/Makefile.am +++ b/src/components/cl/urom/Makefile.am @@ -13,14 +13,13 @@ sources = \ cl_urom_context.c \ cl_urom_team.c \ cl_urom_coll.c \ - cl_urom_memcpy.c \ $(alltoall) module_LTLIBRARIES = libucc_cl_urom.la libucc_cl_urom_la_SOURCES = $(sources) -libucc_cl_urom_la_CPPFLAGS = $(AM_CPPFLAGS) $(BASE_CPPFLAGS) $(UROM_CPPFLAGS) $(CUDA_CPPFLAGS) +libucc_cl_urom_la_CPPFLAGS = $(AM_CPPFLAGS) $(BASE_CPPFLAGS) $(UROM_CPPFLAGS) libucc_cl_urom_la_CFLAGS = $(BASE_CFLAGS) -libucc_cl_urom_la_LDFLAGS = -version-info $(SOVERSION) --as-needed $(UROM_LDFLAGS) $(CUDA_LDFLAGS) -libucc_cl_urom_la_LIBADD = $(UROM_LIBADD) $(CUDA_LIBS) $(UCC_TOP_BUILDDIR)/src/libucc.la +libucc_cl_urom_la_LDFLAGS = -version-info $(SOVERSION) --as-needed $(UROM_LDFLAGS) +libucc_cl_urom_la_LIBADD = $(UROM_LIBADD) $(UCC_TOP_BUILDDIR)/src/libucc.la include $(top_srcdir)/config/module.am diff --git a/src/components/cl/urom/alltoall/alltoall.c b/src/components/cl/urom/alltoall/alltoall.c index 783d33c118..146fb52ed9 100644 --- a/src/components/cl/urom/alltoall/alltoall.c +++ b/src/components/cl/urom/alltoall/alltoall.c @@ -5,7 +5,6 @@ */ #include "alltoall.h" -//#include "utils/arch/cuda_def.h" ucc_base_coll_alg_info_t ucc_cl_urom_alltoall_algs[UCC_CL_UROM_ALLTOALL_ALG_LAST + 1] = { @@ -65,34 +64,41 @@ static ucc_status_t ucc_cl_urom_alltoall_full_start(ucc_coll_task_t *task) .ucc.cmd_type = UROM_WORKER_CMD_UCC_COLL, .ucc.coll_cmd.coll_args = coll_args, .ucc.coll_cmd.team = cl_team->teams[0], - .ucc.coll_cmd.use_xgvmi = cl_lib->xgvmi_enabled, + .ucc.coll_cmd.use_xgvmi = ctx->xgvmi_enabled, }; + ucc_memory_type_t prev_src, prev_dst; + + prev_src = coll_args->src.info.mem_type; + prev_dst = coll_args->dst.info.mem_type; + coll_args->src.info.mem_type = UCC_MEMORY_TYPE_HOST; + coll_args->dst.info.mem_type = UCC_MEMORY_TYPE_HOST; + urom_status = urom_worker_push_cmdq(cl_lib->urom_ctx.urom_worker, 0, &coll_cmd); + if (UROM_OK != urom_status) { + cl_debug(&cl_lib->super, "failed to push collective to urom"); + return UCC_ERR_NO_MESSAGE; + } + coll_args->src.info.mem_type = prev_src; + coll_args->dst.info.mem_type = prev_dst; +/* if (coll_args->src.info.mem_type != UCC_MEMORY_TYPE_CUDA) { - urom_status = urom_worker_push_cmdq(cl_lib->urom_worker, 0, &coll_cmd); + urom_status = urom_worker_push_cmdq(cl_lib->urom_ctx.urom_worker, 0, &coll_cmd); if (UROM_OK != urom_status) { cl_debug(&cl_lib->super, "failed to push collective to urom"); return UCC_ERR_NO_MESSAGE; } } else { -#if HAVE_CUDA - // FIXME: a better way is to tweak args in urom - cudaStreamSynchronize(cl_lib->cuda_stream); - coll_args->src.info.mem_type = UCC_MEMORY_TYPE_HOST; coll_args->dst.info.mem_type = UCC_MEMORY_TYPE_HOST; - urom_status = urom_worker_push_cmdq(cl_lib->urom_worker, 0, &coll_cmd); + urom_status = urom_worker_push_cmdq(cl_lib->urom_ctx.urom_worker, 0, &coll_cmd); if (UROM_OK != urom_status) { cl_debug(&cl_lib->super, "failed to push collective to urom"); return UCC_ERR_NO_MESSAGE; } coll_args->src.info.mem_type = UCC_MEMORY_TYPE_CUDA; coll_args->dst.info.mem_type = UCC_MEMORY_TYPE_CUDA; -#else - cl_error(&cl_lib->super, "attempting to use CUDA without CUDA support"); - return UCC_ERR_NO_RESOURCE; -#endif } +*/ task->status = UCC_INPROGRESS; cl_debug(&cl_lib->super, "pushed the collective to urom"); return ucc_progress_queue_enqueue(ctx->super.super.ucc_context->pq, task); @@ -117,7 +123,7 @@ static void ucc_cl_urom_alltoall_full_progress(ucc_coll_task_t *ctask) urom_status_t urom_status = 0; urom_worker_notify_t *notif; - urom_status = urom_worker_pop_notifyq(cl_lib->urom_worker, 0, ¬if); + urom_status = urom_worker_pop_notifyq(cl_lib->urom_ctx.urom_worker, 0, ¬if); if (UROM_ERR_QUEUE_EMPTY == urom_status) { return; } @@ -133,25 +139,14 @@ static void ucc_cl_urom_alltoall_full_progress(ucc_coll_task_t *ctask) return; } - if (cl_lib->xgvmi_enabled) { + if (ctx->req_mc) { size_t size_mod = dt_size(ctask->bargs.args.dst.info.datatype); - if (cl_lib->req_mc) { - if (ctask->bargs.args.dst.info.mem_type != UCC_MEMORY_TYPE_CUDA) { - memcpy(cl_lib->old_dest, ctask->bargs.args.dst.info.buffer, ctask->bargs.args.src.info.count * size_mod); - } else { - #if HAVE_CUDA - cudaMemcpyAsync(cl_lib->old_dest, ctask->bargs.args.dst.info.buffer, ctask->bargs.args.src.info.count * size_mod , cudaMemcpyHostToDevice, cl_lib->cuda_stream); - cudaStreamSynchronize(cl_lib->cuda_stream); - #else - cl_error(&cl_lib->super, "attempting to use CUDA without CUDA support"); - return UCC_ERR_NO_RESOURCE; - #endif - } - ctask->bargs.args.dst.info.buffer = cl_lib->old_dest; - ctask->bargs.args.src.info.buffer = cl_lib->old_src; + if ((ucc_status_t) notif->ucc.status == UCC_OK) { + ucc_mc_memcpy(ctx->old_dest, ctask->bargs.args.dst.info.buffer, ctask->bargs.args.dst.info.count * size_mod, ctask->bargs.args.dst.info.mem_type, UCC_MEMORY_TYPE_HOST); + ctask->bargs.args.dst.info.buffer = ctx->old_dest; + ctask->bargs.args.src.info.buffer = ctx->old_src; } - } cl_debug(&cl_lib->super, "completed the collective from urom"); @@ -165,6 +160,7 @@ ucc_status_t ucc_cl_urom_alltoall_full_init( ucc_cl_urom_team_t *cl_team = ucc_derived_of(team, ucc_cl_urom_team_t); ucc_cl_urom_context_t *ctx = UCC_CL_UROM_TEAM_CTX(cl_team); ucc_cl_urom_lib_t *cl_lib = ucc_derived_of(ctx->super.super.lib, ucc_cl_urom_lib_t); + ucc_cl_urom_schedule_t *cl_schedule; ucc_base_coll_args_t args; ucc_schedule_t *schedule; @@ -175,26 +171,17 @@ ucc_status_t ucc_cl_urom_alltoall_full_init( return UCC_ERR_NO_MEMORY; } schedule = &cl_schedule->super.super; - if (cl_lib->xgvmi_enabled) { + if (ctx->req_mc) { size_t size_mod = dt_size(coll_args->args.src.info.datatype); - if (cl_lib->req_mc) { - //memcpy args to xgvmi buffer - void * ptr = cl_lib->xgvmi_buffer + (cl_lib->cfg.xgvmi_buffer_size * (schedule->super.seq_num % cl_lib->cfg.num_buffers)); - if (coll_args->args.src.info.mem_type != UCC_MEMORY_TYPE_CUDA) { - memcpy(ptr, coll_args->args.src.info.buffer, coll_args->args.src.info.count * size_mod); - } else { - #if HAVE_CUDA - cudaMemcpyAsync(ptr, coll_args->args.src.info.buffer, coll_args->args.src.info.count * size_mod, cudaMemcpyDeviceToHost, cl_lib->cuda_stream); - #else - cl_error(&cl_lib->super, "attempting to use CUDA without CUDA support"); - return UCC_ERR_NO_RESOURCE; - #endif - } - cl_lib->old_src = coll_args->args.src.info.buffer; - coll_args->args.src.info.buffer = ptr; - cl_lib->old_dest = coll_args->args.dst.info.buffer; - coll_args->args.dst.info.buffer = ptr + coll_args->args.src.info.count * size_mod; - } + size_t count = coll_args->args.src.info.count * size_mod; + //memcpy args to xgvmi buffer + void * ptr = ctx->xgvmi.xgvmi_buffer + (cl_lib->cfg.xgvmi_buffer_size * (schedule->super.seq_num % cl_lib->cfg.num_buffers)); + ucc_mc_memcpy(ptr, coll_args->args.src.info.buffer, count, UCC_MEMORY_TYPE_HOST, coll_args->args.src.info.mem_type); + + ctx->old_src = coll_args->args.src.info.buffer; + coll_args->args.src.info.buffer = ptr; + ctx->old_dest = coll_args->args.dst.info.buffer; + coll_args->args.dst.info.buffer = ptr + count; } memcpy(&args, coll_args, sizeof(args)); status = ucc_schedule_init(schedule, &args, team); @@ -211,5 +198,6 @@ ucc_status_t ucc_cl_urom_alltoall_full_init( ucc_cl_urom_alltoall_triggered_post_setup; *task = &schedule->super; + cl_debug(cl_lib, "urom coll init'd"); return UCC_OK; } diff --git a/src/components/cl/urom/alltoall/tags b/src/components/cl/urom/alltoall/tags deleted file mode 100644 index 0a18dd4a17..0000000000 --- a/src/components/cl/urom/alltoall/tags +++ /dev/null @@ -1,36 +0,0 @@ -!_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/ -!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted, 2=foldcase/ -!_TAG_OUTPUT_EXCMD mixed /number, pattern, mixed, or combineV2/ -!_TAG_OUTPUT_FILESEP slash /slash or backslash/ -!_TAG_OUTPUT_MODE u-ctags /u-ctags or e-ctags/ -!_TAG_PATTERN_LENGTH_LIMIT 96 /0 for no limit/ -!_TAG_PROC_CWD /work/nvidia/ucc-urom-cl/src/components/cl/urom/alltoall/ // -!_TAG_PROGRAM_AUTHOR Universal Ctags Team // -!_TAG_PROGRAM_NAME Universal Ctags /Derived from Exuberant Ctags/ -!_TAG_PROGRAM_URL https://ctags.io/ /official site/ -!_TAG_PROGRAM_VERSION 5.9.0 // -UCC_CL_UROM_ALLTOALL_ALG_FULL alltoall.h /^ UCC_CL_UROM_ALLTOALL_ALG_FULL,$/;" e enum:__anon795cd0f00103 -UCC_CL_UROM_ALLTOALL_ALG_LAST alltoall.h /^ UCC_CL_UROM_ALLTOALL_ALG_LAST,$/;" e enum:__anon795cd0f00103 -UROM_ALLTOALL_H_ alltoall.h /^#define UROM_ALLTOALL_H_$/;" d -__anon795cd0f00103 alltoall.h /^{$/;" g -args alltoall.c /^ ucc_base_coll_args_t args;$/;" v typeref:typename:ucc_base_coll_args_t -cl_schedule alltoall.c /^ ucc_cl_urom_schedule_t *cl_schedule;$/;" v typeref:typename:ucc_cl_urom_schedule_t * -coll_cmd alltoall.c /^ urom_worker_cmd_t coll_cmd = {$/;" v typeref:typename:urom_worker_cmd_t -elem_size alltoall.c /^ size_t elem_size;$/;" v typeref:typename:size_t -full_only alltoall.c /^ int full_only = 0;$/;" v typeref:typename:int -full_size alltoall.c /^ ucc_rank_t full_size, node_size;$/;" v typeref:typename:ucc_rank_t -node_size alltoall.c /^ ucc_rank_t full_size, node_size;$/;" v typeref:typename:ucc_rank_t -rdt_size alltoall.c /^ size_t sdt_size, rdt_size;$/;" v typeref:typename:size_t -sbgp alltoall.c /^ ucc_sbgp_t *sbgp;$/;" v typeref:typename:ucc_sbgp_t * -schedule alltoall.c /^ ucc_schedule_t *schedule;$/;" v typeref:typename:ucc_schedule_t * -sdt_size alltoall.c /^ size_t sdt_size, rdt_size;$/;" v typeref:typename:size_t -status alltoall.c /^ ucc_status_t status;$/;" v typeref:typename:ucc_status_t -task_full alltoall.c /^ ucc_coll_task_t *task_node, *task_full;$/;" v typeref:typename:ucc_coll_task_t * -task_node alltoall.c /^ ucc_coll_task_t *task_node, *task_full;$/;" v typeref:typename:ucc_coll_task_t * -ucc_cl_urom_alltoall_alg_from_str alltoall.h /^static inline int ucc_cl_urom_alltoall_alg_from_str(const char *str)$/;" f typeref:typename:int -ucc_cl_urom_alltoall_algs alltoall.c /^ ucc_cl_urom_alltoall_algs[UCC_CL_UROM_ALLTOALL_ALG_LAST + 1] = {$/;" v typeref:typename:ucc_base_coll_alg_info_t[] -ucc_cl_urom_alltoall_finalize alltoall.c /^static ucc_status_t ucc_cl_urom_alltoall_finalize(ucc_coll_task_t *task)$/;" f typeref:typename:ucc_status_t file: -ucc_cl_urom_alltoall_start alltoall.c /^static ucc_status_t ucc_cl_urom_alltoall_start(ucc_coll_task_t *task)$/;" f typeref:typename:ucc_status_t file: -ucc_cl_urom_alltoall_triggered_post_setup alltoall.c /^ucc_status_t ucc_cl_urom_alltoall_triggered_post_setup(ucc_coll_task_t *task)$/;" f typeref:typename:ucc_status_t -ucc_status alltoall.c /^ ucc_status_t ucc_status;$/;" v typeref:typename:ucc_status_t -urom_status alltoall.c /^ urom_status_t urom_status;$/;" v typeref:typename:urom_status_t diff --git a/src/components/cl/urom/cl_urom.h b/src/components/cl/urom/cl_urom.h index 6eb3d712cd..d838fc6f29 100644 --- a/src/components/cl/urom/cl_urom.h +++ b/src/components/cl/urom/cl_urom.h @@ -15,15 +15,10 @@ #include #include -#include "utils/arch/cuda_def.h" - #ifndef UCC_CL_UROM_DEFAULT_SCORE #define UCC_CL_UROM_DEFAULT_SCORE 20 #endif -#define OFFSET_SIZE (128*1024*1024) -#define NUM_OFFSETS 8 - typedef struct ucc_cl_urom_iface { ucc_cl_iface_t super; } ucc_cl_urom_iface_t; @@ -47,54 +42,56 @@ typedef struct ucc_cl_urom_context_config { ucc_cl_context_config_t super; } ucc_cl_urom_context_config_t; +typedef struct urom_ctx { + urom_service_h urom_service; + urom_worker_h urom_worker; + void *urom_worker_addr; + size_t urom_worker_len; + uint64_t worker_id; + int pass_dc_exist; +} urom_ctx_t; + +typedef struct xgvmi_info { + ucp_mem_h xgvmi_memh; + void *packed_mkey; + uint64_t packed_mkey_len; + void *packed_xgvmi_memh; + uint64_t packed_xgvmi_len; + void *xgvmi_buffer; + size_t xgvmi_size; +} xgvmi_info_t; + typedef struct ucc_cl_urom_lib { ucc_cl_lib_t super; ucc_cl_urom_lib_config_t cfg; - urom_service_h urom_service; - urom_worker_h urom_worker; - void * urom_ucc_ctx_h; - void *urom_worker_addr; - size_t urom_worker_len; - uint64_t worker_id; - int pass_dc_exist; - int xgvmi_enabled; - ucp_mem_h xgvmi_memh; - void * packed_mkey; - uint64_t packed_mkey_len; - void * packed_xgvmi_memh; - uint64_t packed_xgvmi_len; - void * xgvmi_buffer; - size_t xgvmi_size; - int req_mc; - void * old_dest; - void * old_src; - int xgvmi_offsets[NUM_OFFSETS]; - int seq_num; + urom_ctx_t urom_ctx; int tl_ucp_index; //FIXME: make this better - //void * cuda_stream; - #if HAVE_CUDA - cudaStream_t cuda_stream; - #endif - ucc_rank_t ctx_rank; //FIXME: this is not right } ucc_cl_urom_lib_t; UCC_CLASS_DECLARE(ucc_cl_urom_lib_t, const ucc_base_lib_params_t *, const ucc_base_config_t *); typedef struct ucc_cl_urom_context { - ucc_cl_context_t super; - urom_domain_h urom_domain; - ucc_mpool_t sched_mp; + ucc_cl_context_t super; + urom_domain_h urom_domain; + void *urom_ucc_ctx_h; + ucc_mpool_t sched_mp; + xgvmi_info_t xgvmi; + int xgvmi_enabled; + int req_mc; + void *old_dest; + void *old_src; + ucc_rank_t ctx_rank; //FIXME: this is not right } ucc_cl_urom_context_t; UCC_CLASS_DECLARE(ucc_cl_urom_context_t, const ucc_base_context_params_t *, const ucc_base_config_t *); typedef struct ucc_cl_urom_team { - ucc_cl_team_t super; - int team_posted; - ucc_team_h **teams; - unsigned n_teams; - ucc_coll_score_t *score; - ucc_score_map_t *score_map; + ucc_cl_team_t super; + int team_posted; + ucc_team_h **teams; + unsigned n_teams; + ucc_coll_score_t *score; + ucc_score_map_t *score_map; } ucc_cl_urom_team_t; UCC_CLASS_DECLARE(ucc_cl_urom_team_t, ucc_base_context_t *, const ucc_base_team_params_t *); @@ -103,7 +100,7 @@ ucc_status_t ucc_cl_urom_coll_init(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task); -#define UCC_CL_UROM_TEAM_CTX(_team) \ +#define UCC_CL_UROM_TEAM_CTX(_team) \ (ucc_derived_of((_team)->super.super.context, ucc_cl_urom_context_t)) #endif diff --git a/src/components/cl/urom/cl_urom_coll.c b/src/components/cl/urom/cl_urom_coll.c index 5db56c1fdf..52decf3613 100644 --- a/src/components/cl/urom/cl_urom_coll.c +++ b/src/components/cl/urom/cl_urom_coll.c @@ -26,25 +26,25 @@ ucc_status_t ucc_cl_urom_coll_init(ucc_base_coll_args_t *coll_args, int ucp_index = urom_lib->tl_ucp_index; ucc_tl_ucp_context_t *tl_ctx = ucc_derived_of(ctx->super.tl_ctxs[ucp_index], ucc_tl_ucp_context_t); urom_status_t urom_status; - if (!urom_lib->pass_dc_exist) { + if (!urom_lib->urom_ctx.pass_dc_exist) { urom_worker_cmd_t pass_dc_cmd = { .cmd_type = UROM_WORKER_CMD_UCC, .ucc.cmd_type = UROM_WORKER_CMD_CREATE_PASSIVE_DATA_CHANNEL, - .ucc.dpu_worker_id = urom_lib->ctx_rank, + .ucc.dpu_worker_id = ctx->ctx_rank, .ucc.pass_dc_create_cmd.ucp_addr = tl_ctx->worker.worker_address, .ucc.pass_dc_create_cmd.addr_len = tl_ctx->worker.ucp_addrlen, }; urom_worker_notify_t *notif; - urom_worker_push_cmdq(urom_lib->urom_worker, 0, &pass_dc_cmd); + urom_worker_push_cmdq(urom_lib->urom_ctx.urom_worker, 0, &pass_dc_cmd); while (UROM_ERR_QUEUE_EMPTY == - (urom_status = urom_worker_pop_notifyq(urom_lib->urom_worker, 0, ¬if))) { + (urom_status = urom_worker_pop_notifyq(urom_lib->urom_ctx.urom_worker, 0, ¬if))) { sched_yield(); } if ((ucc_status_t) notif->ucc.status != UCC_OK) { return (ucc_status_t) notif->ucc.status; } - urom_lib->pass_dc_exist = 1; + urom_lib->urom_ctx.pass_dc_exist = 1; } switch (coll_args->args.coll_type) { case UCC_COLL_TYPE_ALLTOALL: diff --git a/src/components/cl/urom/cl_urom_context.c b/src/components/cl/urom/cl_urom_context.c index 111cc13fc1..ac62e75632 100644 --- a/src/components/cl/urom/cl_urom_context.c +++ b/src/components/cl/urom/cl_urom_context.c @@ -94,26 +94,34 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t, self->super.tl_ctxs = NULL; return UCC_ERR_NOT_FOUND; } - urom_lib->worker_id = UROM_WORKER_ID_ANY; - urom_lib->ctx_rank = params->params.oob.oob_ep; - urom_status = urom_worker_spawn( - urom_lib->urom_service, UROM_WORKER_TYPE_UCC, urom_lib->urom_worker_addr, - &urom_lib->urom_worker_len, &urom_lib->worker_id); - if (UROM_OK != urom_status) { - cl_error(&urom_lib->super, "failed to connect to urom worker"); - return UCC_ERR_NO_MESSAGE; - } - worker_params.serviceh = urom_lib->urom_service; - worker_params.addr = urom_lib->urom_worker_addr; - worker_params.addr_len = urom_lib->urom_worker_len; - worker_params.num_cmd_notifyq = 1; + if (!urom_lib->urom_ctx.urom_worker) { + /* issues + * 1. user creates multiple contexts? + * 2. worker sharing + */ + urom_lib->urom_ctx.worker_id = 0;//UROM_WORKER_ID_ANY; + self->ctx_rank = params->params.oob.oob_ep; + urom_status = urom_worker_spawn( + urom_lib->urom_ctx.urom_service, UROM_WORKER_TYPE_UCC, urom_lib->urom_ctx.urom_worker_addr, + &urom_lib->urom_ctx.urom_worker_len, &urom_lib->urom_ctx.worker_id); + if (UROM_OK != urom_status) { + cl_error(&urom_lib->super, "failed to connect to urom worker"); + return UCC_ERR_NO_MESSAGE; + } + + worker_params.serviceh = urom_lib->urom_ctx.urom_service; + worker_params.addr = urom_lib->urom_ctx.urom_worker_addr; + worker_params.addr_len = urom_lib->urom_ctx.urom_worker_len; + worker_params.num_cmd_notifyq = 1; - urom_status = urom_worker_connect(&worker_params, &urom_lib->urom_worker); - if (UROM_OK != urom_status) { - cl_error(&urom_lib->super, "failed to perform urom_worker_connect() with error: %s", - urom_status_string(urom_status)); - return UCC_ERR_NO_MESSAGE; + urom_status = urom_worker_connect(&worker_params, &urom_lib->urom_ctx.urom_worker); + if (UROM_OK != urom_status) { + cl_error(&urom_lib->super, "failed to perform urom_worker_connect() with error: %s", + urom_status_string(urom_status)); + return UCC_ERR_NO_MESSAGE; + } + cl_debug(&urom_lib->super, "connected to the urom worker"); } tl_ctx = ucc_derived_of(self->super.tl_ctxs[ucp_index], ucc_tl_ucp_context_t); urom_domain_params.flags = UROM_DOMAIN_WORKER_ADDR; @@ -128,24 +136,24 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t, urom_domain_params.oob.oob_index = params->params.oob.oob_ep; urom_domain_params.domain_worker_id = params->params.oob.oob_ep; - urom_domain_params.workers = &urom_lib->urom_worker; + urom_domain_params.workers = &urom_lib->urom_ctx.urom_worker; urom_domain_params.num_workers = 1, urom_domain_params.domain_size = params->params.oob.n_oob_eps; - urom_lib->req_mc = 1; /* requires a memcpy */ + self->req_mc = 1; /* requires a memcpy */ if (params->context->params.mask & UCC_CONTEXT_PARAM_FIELD_OOB && params->context->params.mask & UCC_CONTEXT_PARAM_FIELD_MEM_PARAMS) { /* remap the segments for xgvmi if enabled */ - urom_lib->xgvmi_buffer = ucc_mem_params.segments[0].address; - urom_lib->xgvmi_size = ucc_mem_params.segments[0].len; + self->xgvmi.xgvmi_buffer = ucc_mem_params.segments[0].address; + self->xgvmi.xgvmi_size = ucc_mem_params.segments[0].len; if (urom_lib->cfg.use_xgvmi) { - urom_lib->req_mc = 0; + self->req_mc = 0; } } else { - urom_lib->xgvmi_size = urom_lib->cfg.num_buffers * urom_lib->cfg.xgvmi_buffer_size; - urom_lib->xgvmi_buffer = ucc_calloc(1, urom_lib->xgvmi_size, "xgvmi buffer"); - if (!urom_lib->xgvmi_buffer) { + self->xgvmi.xgvmi_size = urom_lib->cfg.num_buffers * urom_lib->cfg.xgvmi_buffer_size; + self->xgvmi.xgvmi_buffer = ucc_calloc(1, self->xgvmi.xgvmi_size, "xgvmi buffer"); + if (!self->xgvmi.xgvmi_buffer) { return UCC_ERR_NO_MEMORY; } } @@ -160,17 +168,17 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t, // mem_map the segment mem_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | UCP_MEM_MAP_PARAM_FIELD_LENGTH; - mem_params.address = urom_lib->xgvmi_buffer; - mem_params.length = urom_lib->xgvmi_size; + mem_params.address = self->xgvmi.xgvmi_buffer; + mem_params.length = self->xgvmi.xgvmi_size; - ucs_status = ucp_mem_map(tl_ctx->worker.ucp_context, &mem_params, &urom_lib->xgvmi_memh); + ucs_status = ucp_mem_map(tl_ctx->worker.ucp_context, &mem_params, &self->xgvmi.xgvmi_memh); assert(ucs_status == UCS_OK); if (urom_lib->cfg.use_xgvmi) { pack_params.field_mask = UCP_MEMH_PACK_PARAM_FIELD_FLAGS; pack_params.flags = UCP_MEMH_PACK_FLAG_EXPORT; - ucs_status = ucp_memh_pack(urom_lib->xgvmi_memh, &pack_params, &urom_lib->packed_xgvmi_memh, &urom_lib->packed_xgvmi_len); + ucs_status = ucp_memh_pack(self->xgvmi.xgvmi_memh, &pack_params, &self->xgvmi.packed_xgvmi_memh, &self->xgvmi.packed_xgvmi_len); if (ucs_status != UCS_OK) { cl_error(&urom_lib->super.super, "ucp_memh_pack() returned error: %s", ucs_status_string(ucs_status)); cl_error(&urom_lib->super.super, "xgvmi will be disabled"); @@ -180,27 +188,27 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t, } } - ucs_status = ucp_rkey_pack(tl_ctx->worker.ucp_context, urom_lib->xgvmi_memh, &urom_lib->packed_mkey, - &urom_lib->packed_mkey_len); + ucs_status = ucp_rkey_pack(tl_ctx->worker.ucp_context, self->xgvmi.xgvmi_memh, &self->xgvmi.packed_mkey, + &self->xgvmi.packed_mkey_len); if (UCS_OK != ucs_status) { printf("ucp_rkey_pack() returned error: %s\n", ucs_status_string(ucs_status)); return UCC_ERR_NO_RESOURCE; } domain_mem_map[0].mask = UROM_WORKER_MEM_MAP_FIELD_BASE_VA | UROM_WORKER_MEM_MAP_FIELD_MKEY; - domain_mem_map[0].base_va = (uint64_t)urom_lib->xgvmi_buffer; - domain_mem_map[0].len = urom_lib->xgvmi_size; - domain_mem_map[0].mkey = urom_lib->packed_mkey; - domain_mem_map[0].mkey_len = urom_lib->packed_mkey_len; + domain_mem_map[0].base_va = (uint64_t)self->xgvmi.xgvmi_buffer; + domain_mem_map[0].len = self->xgvmi.xgvmi_size; + domain_mem_map[0].mkey = self->xgvmi.packed_mkey; + domain_mem_map[0].mkey_len = self->xgvmi.packed_mkey_len; if (xgvmi_level) { domain_mem_map[0].mask |= UROM_WORKER_MEM_MAP_FIELD_MEMH; - domain_mem_map[0].memh = urom_lib->packed_xgvmi_memh; - domain_mem_map[0].memh_len = urom_lib->packed_xgvmi_len; + domain_mem_map[0].memh = self->xgvmi.packed_xgvmi_memh; + domain_mem_map[0].memh_len = self->xgvmi.packed_xgvmi_len; } urom_domain_params.mask |= UROM_DOMAIN_PARAM_FIELD_MEM_MAP; urom_domain_params.mem_map.segments = domain_mem_map; urom_domain_params.mem_map.n_segments = 1; - urom_lib->xgvmi_enabled = xgvmi_level; //FIXME: for now, just use xgvmi buffers + self->xgvmi_enabled = xgvmi_level; //FIXME: for now, just use xgvmi buffers urom_status = urom_domain_create_post(&urom_domain_params, &self->urom_domain); if (urom_status < UROM_OK) { @@ -214,10 +222,10 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t, return UCC_ERR_NO_MESSAGE; } - urom_worker_push_cmdq(urom_lib->urom_worker, 0, &init_cmd); + urom_worker_push_cmdq(urom_lib->urom_ctx.urom_worker, 0, &init_cmd); while (UROM_ERR_QUEUE_EMPTY == - (urom_status = urom_worker_pop_notifyq(urom_lib->urom_worker, 0, ¬if_lib))) { + (urom_status = urom_worker_pop_notifyq(urom_lib->urom_ctx.urom_worker, 0, ¬if_lib))) { sched_yield(); } if ((ucc_status_t) notif_lib->ucc.status != UCC_OK) { @@ -227,18 +235,11 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t, cl_debug(&urom_lib->super.super, "debug: lib created\n"); } - if (urom_lib->xgvmi_enabled || n_segments == 1) { - ctx_cmd.ucc.context_create_cmd.base_va = urom_lib->xgvmi_buffer; - ctx_cmd.ucc.context_create_cmd.len = urom_lib->xgvmi_size; - } else { - if (params->context->params.mask & UCC_CONTEXT_PARAM_FIELD_MEM_PARAMS) { - ctx_cmd.ucc.context_create_cmd.base_va = ucc_mem_params.segments[0].address; - ctx_cmd.ucc.context_create_cmd.len = ucc_mem_params.segments[0].len; - } - } - urom_worker_push_cmdq(urom_lib->urom_worker, 0, &ctx_cmd); + ctx_cmd.ucc.context_create_cmd.base_va = self->xgvmi.xgvmi_buffer; + ctx_cmd.ucc.context_create_cmd.len = self->xgvmi.xgvmi_size; + urom_worker_push_cmdq(urom_lib->urom_ctx.urom_worker, 0, &ctx_cmd); while (UROM_ERR_QUEUE_EMPTY == - (urom_status = urom_worker_pop_notifyq(urom_lib->urom_worker, 0, ¬if_ctx)) + (urom_status = urom_worker_pop_notifyq(urom_lib->urom_ctx.urom_worker, 0, ¬if_ctx)) ) { sched_yield(); } @@ -247,7 +248,7 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t, return (ucc_status_t) notif_ctx->ucc.status; } - urom_lib->urom_ucc_ctx_h = notif_ctx->ucc.context_create_nqe.context; + self->urom_ucc_ctx_h = notif_ctx->ucc.context_create_nqe.context; status = ucc_mpool_init(&self->sched_mp, 0, sizeof(ucc_cl_urom_schedule_t), 0, UCC_CACHE_LINE_SIZE, 2, UINT_MAX, @@ -258,10 +259,6 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_context_t, return UCC_ERR_NO_MESSAGE; } -#if HAVE_CUDA - cudaStreamCreateWithFlags(&urom_lib->cuda_stream, cudaStreamNonBlocking); -#endif - cl_debug(cl_config->cl_lib, "initialized cl context: %p", self); return UCC_OK; } @@ -274,7 +271,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_cl_urom_context_t) .ucc.cmd_type = UROM_WORKER_CMD_UCC_CONTEXT_DESTROY, .ucc.context_destroy_cmd = { - .context_h = urom_lib->urom_ucc_ctx_h, + .context_h = self->urom_ucc_ctx_h, }, }; urom_worker_notify_t *notif; @@ -287,16 +284,14 @@ UCC_CLASS_CLEANUP_FUNC(ucc_cl_urom_context_t) } ucc_free(self->super.tl_ctxs); - urom_worker_push_cmdq(urom_lib->urom_worker, 0, &ctx_destroy_cmd); + urom_worker_push_cmdq(urom_lib->urom_ctx.urom_worker, 0, &ctx_destroy_cmd); while (UROM_ERR_QUEUE_EMPTY == - (urom_status = urom_worker_pop_notifyq(urom_lib->urom_worker, 0, ¬if))) { + (urom_status = urom_worker_pop_notifyq(urom_lib->urom_ctx.urom_worker, 0, ¬if))) { sched_yield(); } - if (urom_lib->req_mc) { - ucc_free(urom_lib->xgvmi_buffer); + if (self->req_mc) { + ucc_free(self->xgvmi.xgvmi_buffer); } - - cudaStreamDestroy(urom_lib->cuda_stream); } UCC_CLASS_DEFINE(ucc_cl_urom_context_t, ucc_cl_context_t); diff --git a/src/components/cl/urom/cl_urom_lib.c b/src/components/cl/urom/cl_urom_lib.c index f23714e796..4fdc1339e6 100644 --- a/src/components/cl/urom/cl_urom_lib.c +++ b/src/components/cl/urom/cl_urom_lib.c @@ -90,31 +90,22 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_lib_t, const ucc_base_lib_params_t *params, memcpy(&self->cfg, cl_config, sizeof(*cl_config)); cl_debug(&self->super, "initialized lib object: %p", self); + memset(&self->urom_ctx, 0, sizeof(urom_ctx_t)); /* how to know service here? */ - ret = device_connect(self, device, &self->urom_service); + ret = device_connect(self, device, &self->urom_ctx.urom_service); if (ret) { cl_error(&self->super, "failed to connect to urom"); return UCC_ERR_NO_RESOURCE; } - self->urom_worker_addr = ucc_calloc(1, UROM_WORKER_ADDR_MAX_LEN, "urom worker addr"); - if (!self->urom_worker_addr) { + self->urom_ctx.urom_worker_addr = ucc_calloc(1, UROM_WORKER_ADDR_MAX_LEN, "urom worker addr"); + if (!self->urom_ctx.urom_worker_addr) { cl_error(&self->super, "failed to allocate %d bytes", UROM_WORKER_ADDR_MAX_LEN); return UCC_ERR_NO_MEMORY; } - self->pass_dc_exist = 0; - self->xgvmi_enabled = 0; -/* - if (!self->cfg.num_buffers) { - self->cfg.num_buffers = 1; - } - if (!self->cfg.xgvmi_buffer_size) { - self->cfg.xgvmi_buffer_size = OFFSET_SIZE; - } -*/ -// memset(self->xgvmi_offsets, 0, sizeof(int) * self->cfg.num_buffers); + self->tl_ucp_index = -1; return UCC_OK; } @@ -122,17 +113,17 @@ UCC_CLASS_CLEANUP_FUNC(ucc_cl_urom_lib_t) { urom_status_t urom_status; cl_debug(&self->super, "finalizing lib object: %p", self); - urom_status = urom_worker_disconnect(self->urom_worker); + urom_status = urom_worker_disconnect(self->urom_ctx.urom_worker); if (urom_status != UROM_OK) { cl_error(self, "Failed to disconnect to UROM Worker"); } - urom_status = urom_worker_destroy(self->urom_service, self->worker_id); + urom_status = urom_worker_destroy(self->urom_ctx.urom_service, self->urom_ctx.worker_id); if (urom_status != UROM_OK) { cl_error(self, "Failed to destroy UROM Worker"); } - urom_status = urom_service_disconnect(self->urom_service); + urom_status = urom_service_disconnect(self->urom_ctx.urom_service); if (urom_status != UROM_OK) { cl_error(self, "Failed to disconnect from UROM service"); } diff --git a/src/components/cl/urom/cl_urom_memcpy.c b/src/components/cl/urom/cl_urom_memcpy.c deleted file mode 100644 index 8ab547bd1a..0000000000 --- a/src/components/cl/urom/cl_urom_memcpy.c +++ /dev/null @@ -1,65 +0,0 @@ -/** - * Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * See file LICENSE for terms. - */ - -#include "cl_urom.h" -#include "utils/ucc_malloc.h" - -#include - -#include "utils/arch/cuda_def.h" - -ucc_status_t memcpy_init(ucc_cl_urom_lib_t *cl_lib) -{ -#if 0 - cudaStreamCreateWithFlags(&cl_lib->cuda_stream, cudaStreamNonBlocking); -#endif - return UCC_OK; -} - -ucc_status_t memcpy_nb(void *dst, - void *src, - ucc_memory_type_t src_mem_type, - ucc_memory_type_t dst_mem_type, - size_t n, - ucc_cl_urom_lib_t *cl_lib) -{ -#ifndef HAVE_CUDA - if (src_mem_type == UCC_MEMORY_TYPE_CUDA || - dst_mem_type == UCC_MEMORY_TYPE_CUDA) { - cl_error(cl_lib, "Unsupported operation. Did you build UCC with CUDA support?"); - return UCC_ERR_NOT_SUPPORTED; - } -#else - /* copy from host to cuda */ - if (dst_mem_type == UCC_MEMORY_TYPE_CUDA) { - printf("IM HOST->CUDA"); -#if 0 - cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, cl_lib->cuda_stream); -#endif - return UCC_OK; - } else if (src_mem_type == UCC_MEMORY_TYPE_CUDA) { - /* copy from cuda to host */ - printf("IM CUDA->HOST"); -#if 0 - cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, cl_lib->cuda_stream); -#endif - return UCC_OK; - } -#endif - else { - /* copy from host to host */ - memcpy(dst, src, n); - } - return UCC_OK; -} - -ucc_status_t memcpy_sync(ucc_cl_urom_lib_t *cl_lib) -{ -#if 0 - cudaStreamSynchronize(cl_lib->cuda_stream); -#endif - return UCC_OK; -} diff --git a/src/components/cl/urom/cl_urom_team.c b/src/components/cl/urom/cl_urom_team.c index 68dd93cb58..1fc3f77e0f 100644 --- a/src/components/cl/urom/cl_urom_team.c +++ b/src/components/cl/urom/cl_urom_team.c @@ -18,7 +18,7 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_team_t, ucc_base_context_t *cl_context, ucc_status_t status; urom_worker_cmd_t team_cmd = { .cmd_type = UROM_WORKER_CMD_UCC, - .ucc.dpu_worker_id = params->rank, + .ucc.dpu_worker_id = ctx->ctx_rank, .ucc.cmd_type = UROM_WORKER_CMD_UCC_TEAM_CREATE, /* FIXME: proper way: use ec map.. for now assume linear */ .ucc.team_create_cmd = @@ -39,7 +39,7 @@ UCC_CLASS_INIT_FUNC(ucc_cl_urom_team_t, ucc_base_context_t *cl_context, self->n_teams = 0; self->score_map = NULL; - urom_status = urom_worker_push_cmdq(urom_lib->urom_worker, 0, &team_cmd); + urom_status = urom_worker_push_cmdq(urom_lib->urom_ctx.urom_worker, 0, &team_cmd); if (UROM_OK != urom_status) { cl_error(cl_context->lib, "failed to create team"); return UCC_ERR_NO_MESSAGE; @@ -73,7 +73,7 @@ ucc_status_t ucc_cl_urom_team_create_test(ucc_base_team_t *cl_team) ucc_status_t ucc_status; urom_worker_notify_t *notif; - urom_status = urom_worker_pop_notifyq(urom_lib->urom_worker, 0, ¬if); + urom_status = urom_worker_pop_notifyq(urom_lib->urom_ctx.urom_worker, 0, ¬if); if (UROM_ERR_QUEUE_EMPTY != urom_status) { if (urom_status == UROM_OK) { if (notif->ucc.status == (urom_status_t)UCC_OK) { @@ -124,8 +124,6 @@ ucc_status_t ucc_cl_urom_team_get_scores(ucc_base_team_t *cl_team, team_info.size = UCC_CL_TEAM_SIZE(team); status = ucc_coll_score_update_from_str(ctx->score_str, &team_info, &team->super.super, *score); -/* ctx->score_str, *score, UCC_CL_TEAM_SIZE(team), NULL, cl_team, - UCC_CL_UROM_DEFAULT_SCORE, NULL, NULL, 0);*/ /* If INVALID_PARAM - User provided incorrect input - try to proceed */ if ((status < 0) && (status != UCC_ERR_INVALID_PARAM) &&