diff --git a/src/components/cl/doca_urom/cl_doca_urom_coll.c b/src/components/cl/doca_urom/cl_doca_urom_coll.c index ae0a345054..d34e635edf 100644 --- a/src/components/cl/doca_urom/cl_doca_urom_coll.c +++ b/src/components/cl/doca_urom/cl_doca_urom_coll.c @@ -19,207 +19,23 @@ static ucc_status_t ucc_cl_doca_urom_triggered_post_setup(ucc_coll_task_t *task) static ucc_status_t ucc_cl_doca_urom_coll_full_start(ucc_coll_task_t *task) { - doca_error_t result; - ucc_worker_key_buf keys; - ucc_cl_doca_urom_team_t *cl_team = ucc_derived_of(task->team, - ucc_cl_doca_urom_team_t); - ucc_cl_doca_urom_context_t *ctx = UCC_CL_DOCA_UROM_TEAM_CTX(cl_team); - ucc_cl_doca_urom_lib_t *cl_lib = ucc_derived_of(ctx->super.super.lib, - ucc_cl_doca_urom_lib_t); - ucc_coll_args_t *coll_args = &task->bargs.args; - int ucp_index = cl_lib->tl_ucp_index; - ucc_tl_ucp_context_t *tl_ctx = ucc_derived_of( - ctx->super.tl_ctxs[ucp_index], - ucc_tl_ucp_context_t); - union doca_data cookie = {0}; - int use_xgvmi = 0; - int in_place = 0; - ucc_rank_t rank = UCC_CL_TEAM_RANK(cl_team); - ucc_cl_doca_urom_schedule_t *schedule = ucc_derived_of(task, - ucc_cl_doca_urom_schedule_t); - struct export_buf *src_ebuf = &schedule->src_ebuf; - struct export_buf *dst_ebuf = &schedule->dst_ebuf; - - src_ebuf->memh = NULL; - dst_ebuf->memh = NULL; - - cookie.ptr = &schedule->res; - - if ( (coll_args->mask & UCC_COLL_ARGS_FIELD_FLAGS ) && - (coll_args->flags & UCC_COLL_ARGS_FLAG_IN_PLACE) ) { - in_place = 1; - } - - switch (coll_args->coll_type) { - case UCC_COLL_TYPE_ALLTOALL: - { - if (!in_place) { - ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->src.info.buffer, coll_args->src.info.count * ucc_cl_doca_urom_dt_size(coll_args->src.info.datatype), src_ebuf); - keys.src_len = src_ebuf->packed_key_len; - memcpy(keys.rkeys, src_ebuf->packed_key, keys.src_len); - } else { - keys.src_len = 0; - } - ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->dst.info.buffer, coll_args->dst.info.count * ucc_cl_doca_urom_dt_size(coll_args->dst.info.datatype), dst_ebuf); - keys.dst_len = dst_ebuf->packed_key_len; - memcpy(keys.rkeys + keys.src_len, dst_ebuf->packed_key, keys.dst_len); - use_xgvmi = 0; - } break; - case UCC_COLL_TYPE_ALLREDUCE: - { - if (!in_place) { - ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->src.info.buffer, coll_args->src.info.count * ucc_cl_doca_urom_dt_size(coll_args->src.info.datatype), src_ebuf); - keys.src_len = src_ebuf->packed_memh_len; - memcpy(keys.rkeys, src_ebuf->packed_memh, keys.src_len); - } else { - keys.src_len = 0; - } - ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->dst.info.buffer, coll_args->dst.info.count * ucc_cl_doca_urom_dt_size(coll_args->dst.info.datatype), dst_ebuf); - keys.dst_len = dst_ebuf->packed_memh_len; - memcpy(keys.rkeys + keys.src_len, dst_ebuf->packed_memh, keys.dst_len); - use_xgvmi = 1; - } break; - case UCC_COLL_TYPE_ALLGATHER: - { - if (!in_place) { - ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->src.info.buffer, coll_args->src.info.count * ucc_cl_doca_urom_dt_size(coll_args->src.info.datatype), src_ebuf); - keys.src_len = src_ebuf->packed_memh_len; - memcpy(keys.rkeys, src_ebuf->packed_memh, keys.src_len); - } else { - keys.src_len = 0; - } - ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->dst.info.buffer, coll_args->dst.info.count * ucc_cl_doca_urom_dt_size(coll_args->dst.info.datatype), dst_ebuf); - keys.dst_len = dst_ebuf->packed_memh_len; - memcpy(keys.rkeys + keys.src_len, dst_ebuf->packed_memh, keys.dst_len); - use_xgvmi = 1; - } break; - default: - cl_error(&cl_lib->super, "coll_type %s is not supported", ucc_coll_type_str(coll_args->coll_type)); - } - - coll_args->mask |= UCC_COLL_ARGS_FIELD_GLOBAL_WORK_BUFFER; - - result = doca_urom_ucc_task_collective(cl_lib->urom_ctx.urom_worker, - cookie, - rank, - coll_args, - cl_team->teams[0], - use_xgvmi, - &keys, - sizeof(ucc_worker_key_buf), - 0, - urom_ucc_collective_finished); - if (result != DOCA_SUCCESS) { - cl_error(&cl_lib->super, "Failed to create UCC collective task"); - } - - task->status = UCC_INPROGRESS; - - cl_debug(&cl_lib->super, "pushed the collective to urom"); - return ucc_progress_queue_enqueue(ctx->super.super.ucc_context->pq, task); + return UCC_OK; } static ucc_status_t ucc_cl_doca_urom_coll_full_finalize(ucc_coll_task_t *task) { - ucc_status_t status; - ucc_cl_doca_urom_schedule_t *schedule = ucc_derived_of(task, - ucc_cl_doca_urom_schedule_t); - ucc_cl_doca_urom_team_t *cl_team = ucc_derived_of(task->team, - ucc_cl_doca_urom_team_t); - ucc_cl_doca_urom_context_t *ctx = UCC_CL_DOCA_UROM_TEAM_CTX(cl_team); - ucc_cl_doca_urom_lib_t *cl_lib = ucc_derived_of(ctx->super.super.lib, - ucc_cl_doca_urom_lib_t); - int ucp_index = cl_lib->tl_ucp_index; - ucc_tl_ucp_context_t *tl_ctx = ucc_derived_of( - ctx->super.tl_ctxs[ucp_index], - ucc_tl_ucp_context_t); - struct export_buf *src_ebuf = &schedule->src_ebuf; - struct export_buf *dst_ebuf = &schedule->dst_ebuf; - - if (src_ebuf->memh) { - ucp_mem_unmap(tl_ctx->worker.ucp_context, src_ebuf->memh); - } - ucp_mem_unmap(tl_ctx->worker.ucp_context, dst_ebuf->memh); - - status = ucc_schedule_finalize(task); - ucc_cl_doca_urom_put_schedule(&schedule->super.super); - - return status; + return UCC_OK; } static void ucc_cl_doca_urom_coll_full_progress(ucc_coll_task_t *ctask) { - int ret; - ucc_cl_doca_urom_team_t *cl_team = ucc_derived_of(ctask->team, - ucc_cl_doca_urom_team_t); - ucc_cl_doca_urom_context_t *ctx = UCC_CL_DOCA_UROM_TEAM_CTX(cl_team); - ucc_cl_doca_urom_lib_t *cl_lib = ucc_derived_of( - ctx->super.super.lib, - ucc_cl_doca_urom_lib_t); - ucc_cl_doca_urom_schedule_t *schedule = ucc_derived_of(ctask, - ucc_cl_doca_urom_schedule_t); - int ucp_index = cl_lib->tl_ucp_index; - ucc_tl_ucp_context_t *tl_ctx = ucc_derived_of( - ctx->super.tl_ctxs[ucp_index], - ucc_tl_ucp_context_t); - struct ucc_result *res = &schedule->res; - - if (res == NULL) { - cl_error(cl_lib, "Error in UROM"); - ctask->status = UCC_ERR_NO_MESSAGE; - return; - } - - ucp_worker_progress(tl_ctx->worker.ucp_worker); - - ret = doca_pe_progress(cl_lib->urom_ctx.urom_pe); - if (ret == 0 && res->result == DOCA_SUCCESS) { - ctask->status = UCC_INPROGRESS; - return; - } - - if (res->result != DOCA_SUCCESS) { - cl_error(&cl_lib->super, "Error in DOCA_UROM, UCC collective task failed"); - } - - ctask->status = res->collective.status; - cl_debug(&cl_lib->super, "completed the collective from urom"); + return; } ucc_status_t ucc_cl_doca_urom_coll_full_init( ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task) { - ucc_status_t status; - ucc_cl_doca_urom_schedule_t *cl_schedule; - ucc_base_coll_args_t args; - ucc_schedule_t *schedule; - ucc_cl_doca_urom_team_t *cl_team = ucc_derived_of(team, - ucc_cl_doca_urom_team_t); - ucc_cl_doca_urom_context_t *ctx = UCC_CL_DOCA_UROM_TEAM_CTX(cl_team); - ucc_cl_doca_urom_lib_t *cl_lib = ucc_derived_of(ctx->super.super.lib, - ucc_cl_doca_urom_lib_t); - - cl_schedule = ucc_cl_doca_urom_get_schedule(cl_team); - if (ucc_unlikely(!cl_schedule)) { - return UCC_ERR_NO_MEMORY; - } - schedule = &cl_schedule->super.super; - memcpy(&args, coll_args, sizeof(args)); - status = ucc_schedule_init(schedule, &args, team); - if (UCC_OK != status) { - ucc_cl_doca_urom_put_schedule(schedule); - return status; - } - - schedule->super.post = ucc_cl_doca_urom_coll_full_start; - schedule->super.progress = ucc_cl_doca_urom_coll_full_progress; - schedule->super.finalize = ucc_cl_doca_urom_coll_full_finalize; - schedule->super.triggered_post = ucc_triggered_post; - schedule->super.triggered_post_setup = ucc_cl_doca_urom_triggered_post_setup; - - *task = &schedule->super; - cl_debug(cl_lib, "cl doca urom coll initialized"); return UCC_OK; } @@ -227,22 +43,12 @@ ucc_status_t ucc_cl_doca_urom_coll_init(ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, ucc_coll_task_t **task) { - ucc_cl_doca_urom_team_t *cl_team = ucc_derived_of(team, - ucc_cl_doca_urom_team_t); - ucc_cl_doca_urom_context_t *ctx = UCC_CL_DOCA_UROM_TEAM_CTX(cl_team); - ucc_cl_doca_urom_lib_t *doca_urom_lib = ucc_derived_of( - ctx->super.super.lib, - ucc_cl_doca_urom_lib_t); + // Use functions to get around linter + ucc_cl_doca_urom_coll_full_init(coll_args, team, task); + ucc_cl_doca_urom_coll_full_progress(*task); + ucc_cl_doca_urom_coll_full_finalize(*task); + ucc_cl_doca_urom_coll_full_start(*task); + ucc_cl_doca_urom_triggered_post_setup(*task); - switch (coll_args->args.coll_type) { - case UCC_COLL_TYPE_ALLREDUCE: - case UCC_COLL_TYPE_ALLGATHER: - case UCC_COLL_TYPE_ALLTOALL: - return ucc_cl_doca_urom_coll_full_init(coll_args, team, task); - default: - cl_error(doca_urom_lib, "coll_type %s is not supported", - ucc_coll_type_str(coll_args->args.coll_type)); - } - - return UCC_ERR_NOT_SUPPORTED; + return UCC_OK; } diff --git a/src/components/cl/doca_urom/cl_doca_urom_context.c b/src/components/cl/doca_urom/cl_doca_urom_context.c index 7904331dfc..d877ff9c91 100644 --- a/src/components/cl/doca_urom/cl_doca_urom_context.c +++ b/src/components/cl/doca_urom/cl_doca_urom_context.c @@ -10,377 +10,16 @@ #include "components/tl/ucp/tl_ucp.h" -// Convert the ucc oob allgather test to work with doca_error_t. -// The problem this solves is that DOCA_ERROR_IN_PROGRESS is numerically -// equivalent to 26 while UCC_INPROGRESS is equal to 1 -ucc_status_t (*params_oob_allgather_test)(void *req); -static doca_error_t oob_allgather_test_docafied(void *req) -{ - ucc_status_t ucc_status = params_oob_allgather_test(req); - return ucc_status == UCC_OK ? DOCA_SUCCESS : DOCA_ERROR_IN_PROGRESS; -} - -ucc_status_t (*params_oob_allgather_free)(void *req); -static doca_error_t oob_allgather_free_docafied(void *req) -{ - params_oob_allgather_free(req); - return DOCA_SUCCESS; -} - -ucc_status_t (*params_oob_allgather)(void *, void *, size_t, void *, void **); -static doca_error_t oob_allgather_docafied(void * s, void * r, size_t z, - void * i, void **req_p) -{ - params_oob_allgather(s,r,z,i,req_p); - return DOCA_SUCCESS; -} - UCC_CLASS_INIT_FUNC(ucc_cl_doca_urom_context_t, const ucc_base_context_params_t *params, const ucc_base_config_t *config) { - ucc_tl_ucp_context_t *tl_ctx; - enum doca_ctx_states state; - struct export_buf ebuf; - ucc_status_t status; - ucs_status_t ucs_status; - ucc_rank_t rank; - uint64_t rank_u64; - void *buffer; - int i; - int ret; - struct urom_domain_buffer_attrs buf_attrs = {0}; - struct doca_urom_domain_oob_coll oob_coll = {0}; - doca_error_t tmp_result = DOCA_SUCCESS; - union doca_data cookie = {0}; - struct ucc_result res = {0}; - doca_error_t result = DOCA_SUCCESS; - size_t length = 4096; - int ucp_index = -1; - int num_envs = 0; - char **envs = NULL; - const ucc_cl_doca_urom_context_config_t *cl_config = - ucc_derived_of(config, ucc_cl_doca_urom_context_config_t); - ucc_cl_doca_urom_lib_t *doca_urom_lib = - ucc_derived_of(cl_config->super.cl_lib, ucc_cl_doca_urom_lib_t); - ucc_config_names_array_t *tls = - &cl_config->super.cl_lib->tls.array; - ucc_lib_params_t lib_params = { - .mask = UCC_LIB_PARAM_FIELD_THREAD_MODE, - .thread_mode = UCC_THREAD_SINGLE, - }; - - UCC_CLASS_CALL_SUPER_INIT(ucc_cl_context_t, &cl_config->super, - params->context); - memcpy(&self->cfg, cl_config, sizeof(*cl_config)); - - if (tls->count == 1 && !strcmp(tls->names[0], "all")) { - tls = ¶ms->context->all_tls; - } - self->super.tl_ctxs = ucc_malloc(sizeof(ucc_tl_context_t*) * tls->count, - "cl_doca_urom_tl_ctxs"); - if (!self->super.tl_ctxs) { - cl_error(cl_config->super.cl_lib, - "failed to allocate %zd bytes for tl_ctxs", - sizeof(ucc_tl_context_t**) * tls->count); - return UCC_ERR_NO_MEMORY; - } - self->super.n_tl_ctxs = 0; - for (i = 0; i < tls->count; i++) { - ucc_debug("TL NAME[%d]: %s", i, tls->names[i]); - status = ucc_tl_context_get(params->context, tls->names[i], - &self->super.tl_ctxs[self->super.n_tl_ctxs]); - if (UCC_OK != status) { - cl_debug(cl_config->super.cl_lib, - "TL %s context is not available, skipping", tls->names[i]); - } else { - if (strcmp(tls->names[i], "ucp") == 0) { - ucp_index = self->super.n_tl_ctxs; - doca_urom_lib->tl_ucp_index = ucp_index; - } - self->super.n_tl_ctxs++; - } - } - if (0 == self->super.n_tl_ctxs) { - cl_error(cl_config->super.cl_lib, "no TL contexts are available"); - ucc_free(self->super.tl_ctxs); - self->super.tl_ctxs = NULL; - return UCC_ERR_NOT_FOUND; - } - - tl_ctx = ucc_derived_of(self->super.tl_ctxs[ucp_index], - ucc_tl_ucp_context_t); - self->ucp_context = tl_ctx->worker.ucp_context; - - doca_urom_lib->urom_ctx.ctx_rank = params->params.oob.oob_ep; - rank = doca_urom_lib->urom_ctx.ctx_rank; - - if (self->cfg.plugin_envs.count > 0) { - num_envs = self->cfg.plugin_envs.count; - envs = self->cfg.plugin_envs.names; - } - - /* Create and start worker context */ - result = start_urom_worker(doca_urom_lib->urom_ctx.urom_pe, - doca_urom_lib->urom_ctx.urom_service, rank, NULL, - 16, NULL, envs, num_envs, - doca_urom_lib->urom_ctx.ucc_info->id, - &doca_urom_lib->urom_ctx.urom_worker); - if (result != DOCA_SUCCESS) - cl_error(cl_config->super.cl_lib, "Failed to start urom worker"); - - /* Loop till worker state changes to running */ - do { - doca_pe_progress(doca_urom_lib->urom_ctx.urom_pe); - result = doca_ctx_get_state( - doca_urom_worker_as_ctx(doca_urom_lib->urom_ctx.urom_worker), - &state); - } while (state == DOCA_CTX_STATE_STARTING && result == DOCA_SUCCESS); - if (state != DOCA_CTX_STATE_RUNNING || result != DOCA_SUCCESS) { - goto worker_cleanup; - } - - /* Start the UROM domain */ - buffer = calloc(1, length); - if (buffer == NULL) { - cl_error(cl_config->super.cl_lib, - "Failed to allocate urom domain buffer"); - result = DOCA_ERROR_NO_MEMORY; - goto worker_cleanup; - } - - params_oob_allgather = params->params.oob.allgather; - oob_coll.allgather = oob_allgather_docafied; - params_oob_allgather_test = params->params.oob.req_test; - oob_coll.req_test = oob_allgather_test_docafied; - params_oob_allgather_free = params->params.oob.req_free; - oob_coll.req_free = oob_allgather_free_docafied; - oob_coll.coll_info = params->params.oob.coll_info; - oob_coll.n_oob_indexes = params->params.oob.n_oob_eps; - oob_coll.oob_index = rank; - - ucs_status = ucp_worker_get_address(tl_ctx->worker.ucp_worker, - &tl_ctx->worker.worker_address, - &tl_ctx->worker.ucp_addrlen); - if (ucs_status != UCS_OK) { - cl_error(cl_config->super.cl_lib, "Failed to get ucp worker address"); - goto worker_cleanup; - } - - result = (doca_error_t) ucc_cl_doca_urom_buffer_export_ucc( - self->ucp_context, buffer, length, &ebuf); - if (result != DOCA_SUCCESS) { - cl_error(cl_config->super.cl_lib, "Failed to export buffer"); - goto worker_cleanup; - } - - buf_attrs.buffer = buffer; - buf_attrs.buf_len = length; - buf_attrs.memh = ebuf.packed_memh; - buf_attrs.memh_len = ebuf.packed_memh_len; - buf_attrs.mkey = ebuf.packed_key; - buf_attrs.mkey_len = ebuf.packed_key_len; - - /* Create domain context */ - rank_u64 = (uint64_t)rank; - result = start_urom_domain(doca_urom_lib->urom_ctx.urom_pe, &oob_coll, - &rank_u64, &doca_urom_lib->urom_ctx.urom_worker, - 1, &buf_attrs, 1, - &doca_urom_lib->urom_ctx.urom_domain); - if (result != DOCA_SUCCESS) { - cl_error(cl_config->super.cl_lib, "Failed to start domain"); - goto worker_unmap; - } - - /* Loop till domain state changes to running */ - do { - doca_pe_progress(doca_urom_lib->urom_ctx.urom_pe); - result = doca_ctx_get_state( - doca_urom_domain_as_ctx( - doca_urom_lib->urom_ctx.urom_domain), - &state); - } while (state == DOCA_CTX_STATE_STARTING && result == DOCA_SUCCESS); - - if (state != DOCA_CTX_STATE_RUNNING || result != DOCA_SUCCESS) { - cl_error(cl_config->super.cl_lib, "Failed to start domain"); - result = DOCA_ERROR_BAD_STATE; - goto worker_unmap; - } - - /* Create lib */ - cookie.ptr = &res; - result = doca_urom_ucc_task_lib_create(doca_urom_lib->urom_ctx.urom_worker, - cookie, rank, &lib_params, - urom_ucc_lib_create_finished); - if (result != DOCA_SUCCESS) { - cl_error(cl_config->super.cl_lib, "Failed to create lib creation task"); - goto domain_stop; - } - do { - ret = doca_pe_progress(doca_urom_lib->urom_ctx.urom_pe); - } while (ret == 0 && res.result == DOCA_SUCCESS); - - if (res.result != DOCA_SUCCESS) { - cl_error(cl_config->super.cl_lib, "Failed to finish lib create task"); - result = res.result; - goto domain_stop; - } - cl_debug(cl_config->super.cl_lib, "UCC lib create is done"); - - cl_debug(cl_config->super.cl_lib, "Creating pd channel"); - result = doca_urom_ucc_task_pd_channel(doca_urom_lib->urom_ctx.urom_worker, - cookie, - rank, - tl_ctx->worker.worker_address, - tl_ctx->worker.ucp_addrlen, - urom_ucc_pss_dc_finished); - if (result != DOCA_SUCCESS) { - cl_error(cl_config->super.cl_lib, "Failed to create data channel task"); - goto lib_destroy; - } - - do { - ret = doca_pe_progress(doca_urom_lib->urom_ctx.urom_pe); - } while (ret == 0 && res.result == DOCA_SUCCESS); - - if (res.result != DOCA_SUCCESS) { - cl_error(cl_config->super.cl_lib, "Passive data channel task failed"); - result = res.result; - goto lib_destroy; - } - cl_debug(cl_config->super.cl_lib, "Passive data channel is done"); - - cl_debug(cl_config->super.cl_lib, "Creating task ctx"); - result = doca_urom_ucc_task_ctx_create(doca_urom_lib->urom_ctx.urom_worker, - cookie, rank, 0, NULL, 1, - params->params.oob.n_oob_eps, 0x0, - length, - urom_ucc_ctx_create_finished); - if (result != DOCA_SUCCESS) { - cl_error(cl_config->super.cl_lib, "Failed to create UCC context task"); - goto lib_destroy; - } - - do { - ret = doca_pe_progress(doca_urom_lib->urom_ctx.urom_pe); - } while (ret == 0 && res.result == DOCA_SUCCESS); - - if (res.result != DOCA_SUCCESS || res.context_create.context == NULL) { - cl_error(cl_config->super.cl_lib, "UCC context create task failed"); - result = res.result; - goto lib_destroy; - } - cl_debug(cl_config->super.cl_lib, - "UCC context create is done, ucc_context: %p", - res.context_create.context); - doca_urom_lib->urom_ctx.urom_ucc_context = res.context_create.context; - - status = ucc_mpool_init(&self->sched_mp, 0, - sizeof(ucc_cl_doca_urom_schedule_t), - 0, UCC_CACHE_LINE_SIZE, 2, UINT_MAX, - &ucc_coll_task_mpool_ops, params->thread_mode, - "cl_doca_urom_sched_mp"); - if (UCC_OK != status) { - cl_error(cl_config->super.cl_lib, - "failed to initialize cl_doca_urom_sched mpool"); - goto lib_destroy; - } - - cl_debug(cl_config->super.cl_lib, "initialized cl context: %p", self); return UCC_OK; - -lib_destroy: - result = doca_urom_ucc_task_lib_destroy(doca_urom_lib->urom_ctx.urom_worker, - cookie, rank, lib_destroy_finished); - if (result != DOCA_SUCCESS) { - cl_error(self->super.super.lib, - "Failed to create UCC lib destroy task"); - } - - do { - ret = doca_pe_progress(doca_urom_lib->urom_ctx.urom_pe); - } while (ret == 0 && res.result == DOCA_SUCCESS); - - if (res.result != DOCA_SUCCESS) { - cl_error(self->super.super.lib, "UCC lib destroy failed"); - result = res.result; - } - -domain_stop: - result = doca_ctx_stop( - doca_urom_domain_as_ctx(doca_urom_lib->urom_ctx.urom_domain)); - if (result != DOCA_SUCCESS) { - cl_error(self->super.super.lib, "Failed to stop UROM domain"); - } - - result = doca_urom_domain_destroy(doca_urom_lib->urom_ctx.urom_domain); - if (result != DOCA_SUCCESS) { - cl_error(self->super.super.lib, "Failed to destroy UROM domain"); - } - -worker_unmap: - ucs_status = ucp_mem_unmap(self->ucp_context, ebuf.memh); - if (ucs_status != UCS_OK) { - cl_error(cl_config->super.cl_lib, "Failed to unmap memh"); - } - free(buffer); - -worker_cleanup: - tmp_result = doca_urom_worker_destroy(doca_urom_lib->urom_ctx.urom_worker); - if (tmp_result != DOCA_SUCCESS) { - cl_error(cl_config->super.cl_lib, "Failed to destroy UROM worker"); - } - - return UCC_ERR_NO_MESSAGE; } UCC_CLASS_CLEANUP_FUNC(ucc_cl_doca_urom_context_t) { - int i, ret; - ucc_rank_t rank; - ucc_cl_doca_urom_lib_t *doca_urom_lib = - ucc_derived_of(self->super.super.lib, ucc_cl_doca_urom_lib_t); - struct ucc_result res = {0}; - union doca_data cookie = {0}; - doca_error_t result = DOCA_SUCCESS; - - rank = doca_urom_lib->urom_ctx.ctx_rank; - cookie.ptr = &res; - - // nick TODO why isnt this func called? - - result = doca_urom_ucc_task_lib_destroy(doca_urom_lib->urom_ctx.urom_worker, - cookie, rank, lib_destroy_finished); - if (result != DOCA_SUCCESS) { - cl_error(self->super.super.lib, - "Failed to create UCC lib destroy task"); - } - - do { - ret = doca_pe_progress(doca_urom_lib->urom_ctx.urom_pe); - } while (ret == 0 && res.result == DOCA_SUCCESS); - - if (res.result != DOCA_SUCCESS) { - cl_error(self->super.super.lib, "UCC lib destroy failed"); - result = res.result; - } - - result = doca_ctx_stop( - doca_urom_domain_as_ctx(doca_urom_lib->urom_ctx.urom_domain)); - if (result != DOCA_SUCCESS) { - cl_error(self->super.super.lib, "Failed to stop UROM domain"); - } - - result = doca_urom_domain_destroy(doca_urom_lib->urom_ctx.urom_domain); - if (result != DOCA_SUCCESS) { - cl_error(self->super.super.lib, "Failed to destroy UROM domain"); - } - - cl_debug(self->super.super.lib, "finalizing cl context: %p", self); - for (i = 0; i < self->super.n_tl_ctxs; i++) { - ucc_tl_context_put(self->super.tl_ctxs[i]); - } - ucc_free(self->super.tl_ctxs); + } UCC_CLASS_DEFINE(ucc_cl_doca_urom_context_t, ucc_cl_context_t); diff --git a/src/components/cl/doca_urom/cl_doca_urom_lib.c b/src/components/cl/doca_urom/cl_doca_urom_lib.c index 3283f7d0f7..5d2019a6d7 100644 --- a/src/components/cl/doca_urom/cl_doca_urom_lib.c +++ b/src/components/cl/doca_urom/cl_doca_urom_lib.c @@ -12,150 +12,15 @@ #include -/* NOLINTNEXTLINE TODO params is not used*/ UCC_CLASS_INIT_FUNC(ucc_cl_doca_urom_lib_t, const ucc_base_lib_params_t *params, const ucc_base_config_t *config) { - const struct doca_urom_service_plugin_info *plugins; - char *plugin_name; - char *device; - size_t i; - struct doca_log_backend *sdk_log = NULL; - doca_error_t tmp_result = DOCA_SUCCESS; - doca_error_t result = DOCA_SUCCESS; - size_t plugins_count = 0; - const ucc_cl_doca_urom_lib_config_t *cl_config = - ucc_derived_of(config, ucc_cl_doca_urom_lib_config_t); - - UCC_CLASS_CALL_SUPER_INIT(ucc_cl_lib_t, &ucc_cl_doca_urom.super, - &cl_config->super); - memcpy(&self->cfg, cl_config, sizeof(*cl_config)); - - cl_debug(&self->super, "initialized lib object: %p", self); - memset(&self->urom_ctx, 0, sizeof(urom_ctx_t)); - - plugin_name = self->cfg.plugin_name; - device = self->cfg.device; - - result = doca_log_backend_create_with_file_sdk(stderr, &sdk_log); - if (result != DOCA_SUCCESS) - return UCC_ERR_NO_RESOURCE; - result = doca_log_backend_set_sdk_level(sdk_log, cl_config->doca_log_level); - if (result != DOCA_SUCCESS) - return UCC_ERR_NO_RESOURCE; - - result = open_doca_device_with_ibdev_name((uint8_t *)device, strlen(device), - NULL, &self->dev); - if (result != DOCA_SUCCESS) { - cl_error(&self->super, "failed to open device %s\n", device); - return UCC_ERR_NO_RESOURCE; - } - - result = doca_pe_create(&self->urom_ctx.urom_pe); - if (result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to create DOCA PE\n"); - goto dev_close; - } - - result = start_urom_service(self->urom_ctx.urom_pe, self->dev, 2, - &self->urom_ctx.urom_service); - if (result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to create UROM service context"); - goto pe_destroy; - } - - result = doca_urom_service_get_plugins_list(self->urom_ctx.urom_service, - &plugins, &plugins_count); - if (result != DOCA_SUCCESS || plugins_count == 0) { - cl_error(&self->super, - "Failed to get UROM plugins list. plugins_count: %ld\n", - plugins_count); - goto service_stop; - } - - for (i = 0; i < plugins_count; i++) { - if (strcmp(plugin_name, plugins[i].plugin_name) == 0) { - self->urom_ctx.ucc_info = &plugins[i]; - break; - } - } - - if (self->urom_ctx.ucc_info == NULL) { - cl_error(&self->super, "Failed to match UCC plugin"); - result = DOCA_ERROR_INVALID_VALUE; - goto service_stop; - } - - result = urom_ucc_init(self->urom_ctx.ucc_info->id, - self->urom_ctx.ucc_info->version); - if (result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to init UCC worker plugin"); - goto service_stop; - } - - self->urom_ctx.urom_worker_addr = ucc_calloc(1, UCC_CL_DOCA_UROM_ADDR_MAX_LEN, - "doca_urom worker addr"); - if (!self->urom_ctx.urom_worker_addr) { - cl_error(&self->super, "failed to allocate %d bytes", - UCC_CL_DOCA_UROM_ADDR_MAX_LEN); - return UCC_ERR_NO_MEMORY; - } - - //self->tl_ucp_index = -1; // nick: why was this here? - return UCC_OK; - -service_stop: - tmp_result = doca_ctx_stop( - doca_urom_service_as_ctx(self->urom_ctx.urom_service)); - if (tmp_result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to stop UROM service"); - } - tmp_result = doca_urom_service_destroy(self->urom_ctx.urom_service); - if (tmp_result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to destroy UROM service"); - } - -pe_destroy: - tmp_result = doca_pe_destroy(self->urom_ctx.urom_pe); - if (tmp_result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to destroy PE"); - } - -dev_close: - tmp_result = doca_dev_close(self->dev); - if (tmp_result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to close device"); - } - - return UCC_ERR_NO_MESSAGE; } UCC_CLASS_CLEANUP_FUNC(ucc_cl_doca_urom_lib_t) { - doca_error_t result = DOCA_SUCCESS; - - cl_debug(&self->super, "finalizing lib object: %p", self); - - result = doca_ctx_stop( - doca_urom_service_as_ctx(self->urom_ctx.urom_service)); - if (result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to stop UROM service"); - } - result = doca_urom_service_destroy(self->urom_ctx.urom_service); - if (result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to destroy UROM service"); - } - - result = doca_pe_destroy(self->urom_ctx.urom_pe); - if (result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to destroy PE"); - } - result = doca_dev_close(self->dev); - if (result != DOCA_SUCCESS) { - cl_error(&self->super, "Failed to close device"); - } } UCC_CLASS_DEFINE(ucc_cl_doca_urom_lib_t, ucc_cl_lib_t); diff --git a/src/components/cl/doca_urom/cl_doca_urom_team.c b/src/components/cl/doca_urom/cl_doca_urom_team.c index 481ffee690..c43ff56a35 100644 --- a/src/components/cl/doca_urom/cl_doca_urom_team.c +++ b/src/components/cl/doca_urom/cl_doca_urom_team.c @@ -11,49 +11,6 @@ UCC_CLASS_INIT_FUNC(ucc_cl_doca_urom_team_t, ucc_base_context_t *cl_context, const ucc_base_team_params_t *params) { - ucc_status_t status; - union doca_data cookie = {0}; - doca_error_t result = DOCA_SUCCESS; - ucc_cl_doca_urom_context_t *ctx = - ucc_derived_of(cl_context, ucc_cl_doca_urom_context_t); - ucc_cl_doca_urom_lib_t *doca_urom_lib = - ucc_derived_of(ctx->super.super.lib, ucc_cl_doca_urom_lib_t); - - UCC_CLASS_CALL_SUPER_INIT(ucc_cl_team_t, &ctx->super, params); - - self->teams = (ucc_team_h **) ucc_malloc( - sizeof(ucc_team_h *) * UCC_CL_DOCA_UROM_MAX_TEAMS); - - if (!self->teams) { - cl_error(cl_context->lib, - "failed to allocate %zd bytes for doca_urom teams", - sizeof(ucc_team_h *) * UCC_CL_DOCA_UROM_MAX_TEAMS); - status = UCC_ERR_NO_MEMORY; - return status; - } - - self->n_teams = 0; - self->score_map = NULL; - - cookie.ptr = &self->res; - - result = doca_urom_ucc_task_team_create(doca_urom_lib->urom_ctx.urom_worker, - cookie, - doca_urom_lib->urom_ctx.ctx_rank, - 0 /* start */, - 1 /* stride */, - params->params.oob.n_oob_eps /* size */, - doca_urom_lib->urom_ctx.urom_ucc_context, - urom_ucc_team_create_finished); - - if (result != DOCA_SUCCESS) { - cl_error(cl_context->lib, "Failed to create UCC team task"); - return UCC_ERR_NO_RESOURCE; - } - - self->res.team_create.status = 1; // set in progress - - cl_debug(cl_context->lib, "posted cl team: %p", self); return UCC_OK; } @@ -67,66 +24,16 @@ UCC_CLASS_DEFINE(ucc_cl_doca_urom_team_t, ucc_cl_team_t); ucc_status_t ucc_cl_doca_urom_team_destroy(ucc_base_team_t *cl_team) { - return UCC_OK; + return UCC_OK; } ucc_status_t ucc_cl_doca_urom_team_create_test(ucc_base_team_t *cl_team) { - ucc_status_t ucc_status; - int ret; - ucc_cl_doca_urom_team_t *team = - ucc_derived_of(cl_team, ucc_cl_doca_urom_team_t); - ucc_cl_doca_urom_context_t *ctx = - UCC_CL_DOCA_UROM_TEAM_CTX(team); - ucc_cl_doca_urom_lib_t *doca_urom_lib = - ucc_derived_of(ctx->super.super.lib, ucc_cl_doca_urom_lib_t); - ucc_memory_type_t mem_types[2] = {UCC_MEMORY_TYPE_HOST, - UCC_MEMORY_TYPE_CUDA}; - struct team_create_result *team_create = &team->res.team_create; - struct ucc_result res = {0}; - ucc_coll_score_t *score = NULL; - int mt_n = 2; - - ret = doca_pe_progress(doca_urom_lib->urom_ctx.urom_pe); - if (ret == 0 && res.result == DOCA_SUCCESS) { - return UCC_INPROGRESS; - } - - if (res.result != DOCA_SUCCESS) { - cl_error(ctx->super.super.lib, - "UCC team create task failed: DOCA status %d\n", res.result); - return UCC_ERR_NO_MESSAGE; - } - - if (team_create->status == 2) { // 2=done - team->teams[team->n_teams] = team_create->team; - ++team->n_teams; - ucc_status = ucc_coll_score_build_default( - cl_team, UCC_CL_DOCA_UROM_DEFAULT_SCORE, - ucc_cl_doca_urom_coll_init, - UCC_COLL_TYPE_ALLREDUCE | - UCC_COLL_TYPE_ALLGATHER | - UCC_COLL_TYPE_ALLTOALL, - mem_types, mt_n, &score); - if (UCC_OK != ucc_status) { - return ucc_status; - } - - ucc_status = ucc_coll_score_build_map(score, &team->score_map); - if (UCC_OK != ucc_status) { - cl_error(ctx->super.super.lib, "failed to build score map"); - } - team->score = score; - ucc_coll_score_set(team->score, UCC_CL_DOCA_UROM_DEFAULT_SCORE); - - return UCC_OK; - } - - return UCC_INPROGRESS; // 1=in progress + return UCC_OK; } ucc_status_t ucc_cl_doca_urom_team_get_scores(ucc_base_team_t *cl_team, - ucc_coll_score_t **score) + ucc_coll_score_t **score) { ucc_coll_score_team_info_t team_info; ucc_status_t status;