Skip to content

Commit

Permalink
CORE: correctly handle failure in ctx_create epilog (#612)
Browse files Browse the repository at this point in the history
  • Loading branch information
valentin petrov authored Oct 4, 2022
1 parent 0bfbcb2 commit ec1251b
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 41 deletions.
2 changes: 0 additions & 2 deletions src/components/cl/basic/cl_basic.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ UCC_CLASS_DECLARE(ucc_cl_basic_lib_t, const ucc_base_lib_params_t *,

typedef struct ucc_cl_basic_context {
ucc_cl_context_t super;
ucc_tl_context_t **tl_ctxs;
unsigned n_tl_ctxs;
} ucc_cl_basic_context_t;
UCC_CLASS_DECLARE(ucc_cl_basic_context_t, const ucc_base_context_params_t *,
const ucc_base_config_t *);
Expand Down
30 changes: 15 additions & 15 deletions src/components/cl/basic/cl_basic_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,28 +23,28 @@ UCC_CLASS_INIT_FUNC(ucc_cl_basic_context_t,
tls = &params->context->all_tls;
}

self->tl_ctxs = ucc_malloc(sizeof(ucc_tl_context_t*) * tls->count,
self->super.tl_ctxs = ucc_malloc(sizeof(ucc_tl_context_t*) * tls->count,
"cl_basic_tl_ctxs");
if (!self->tl_ctxs) {
if (!self->super.tl_ctxs) {
cl_error(cl_config->cl_lib, "failed to allocate %zd bytes for tl_ctxs",
sizeof(ucc_tl_context_t**) * tls->count);
return UCC_ERR_NO_MEMORY;
}
self->n_tl_ctxs = 0;
self->super.n_tl_ctxs = 0;
for (i = 0; i < tls->count; i++) {
status = ucc_tl_context_get(params->context, tls->names[i],
&self->tl_ctxs[self->n_tl_ctxs]);
&self->super.tl_ctxs[self->super.n_tl_ctxs]);
if (UCC_OK != status) {
cl_info(cl_config->cl_lib,
"TL %s context is not available, skipping", tls->names[i]);
} else {
self->n_tl_ctxs++;
self->super.n_tl_ctxs++;
}
}
if (0 == self->n_tl_ctxs) {
if (0 == self->super.n_tl_ctxs) {
cl_error(cl_config->cl_lib, "no TL contexts are available");
ucc_free(self->tl_ctxs);
self->tl_ctxs = NULL;
ucc_free(self->super.tl_ctxs);
self->super.tl_ctxs = NULL;
return UCC_ERR_NOT_FOUND;
}
cl_info(cl_config->cl_lib, "initialized cl context: %p", self);
Expand All @@ -55,10 +55,10 @@ UCC_CLASS_CLEANUP_FUNC(ucc_cl_basic_context_t)
{
int i;
cl_info(self->super.super.lib, "finalizing cl context: %p", self);
for (i = 0; i < self->n_tl_ctxs; i++) {
ucc_tl_context_put(self->tl_ctxs[i]);
for (i = 0; i < self->super.n_tl_ctxs; i++) {
ucc_tl_context_put(self->super.tl_ctxs[i]);
}
ucc_free(self->tl_ctxs);
ucc_free(self->super.tl_ctxs);
}

UCC_CLASS_DEFINE(ucc_cl_basic_context_t, ucc_cl_context_t);
Expand All @@ -80,13 +80,13 @@ ucc_cl_basic_get_context_attr(const ucc_base_context_t *context,
/* CL BASIC reports topo_required if any of the available
TL contexts needs it */
attr->topo_required = 0;
for (i = 0; i < ctx->n_tl_ctxs; i++) {
for (i = 0; i < ctx->super.n_tl_ctxs; i++) {
memset(&tl_attr, 0, sizeof(tl_attr));
status = UCC_TL_CTX_IFACE(ctx->tl_ctxs[i])
->context.get_attr(&ctx->tl_ctxs[i]->super, &tl_attr);
status = UCC_TL_CTX_IFACE(ctx->super.tl_ctxs[i])
->context.get_attr(&ctx->super.tl_ctxs[i]->super, &tl_attr);
if (UCC_OK != status) {
cl_error(ctx->super.super.lib, "failed to get %s ctx attr",
UCC_TL_CTX_IFACE(ctx->tl_ctxs[i])->super.name);
UCC_TL_CTX_IFACE(ctx->super.tl_ctxs[i])->super.name);
return status;
}
if (tl_attr.topo_required) {
Expand Down
17 changes: 9 additions & 8 deletions src/components/cl/basic/cl_basic_team.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,36 +11,37 @@
UCC_CLASS_INIT_FUNC(ucc_cl_basic_team_t, ucc_base_context_t *cl_context,
const ucc_base_team_params_t *params)
{
ucc_cl_basic_context_t *ctx =
ucc_cl_basic_context_t *ctx =
ucc_derived_of(cl_context, ucc_cl_basic_context_t);
unsigned n_tl_ctxs = ctx->super.n_tl_ctxs;
int i;
ucc_status_t status;

UCC_CLASS_CALL_SUPER_INIT(ucc_cl_team_t, &ctx->super, params);
self->tl_teams = ucc_malloc(sizeof(ucc_tl_team_t *) * ctx->n_tl_ctxs,
self->tl_teams = ucc_malloc(sizeof(ucc_tl_team_t *) * n_tl_ctxs,
"cl_basic_tl_teams");
if (!self->tl_teams) {
cl_error(cl_context->lib, "failed to allocate %zd bytes for tl_teams",
sizeof(ucc_tl_team_t *) * ctx->n_tl_ctxs);
sizeof(ucc_tl_team_t *) * n_tl_ctxs);
status = UCC_ERR_NO_MEMORY;
goto err;
}
self->n_tl_teams = 0;
self->score_map = NULL;
status = ucc_team_multiple_req_alloc(&self->team_create_req,
ctx->n_tl_ctxs);
n_tl_ctxs);
if (UCC_OK != status) {
cl_error(cl_context->lib, "failed to allocate team req multiple");
goto err;
}
for (i = 0; i < ctx->n_tl_ctxs; i++) {
for (i = 0; i < n_tl_ctxs; i++) {
memcpy(&self->team_create_req->descs[i].param, params,
sizeof(ucc_base_team_params_t));
self->team_create_req->descs[i].ctx = ctx->tl_ctxs[i];
self->team_create_req->descs[i].ctx = ctx->super.tl_ctxs[i];
self->team_create_req->descs[i].param.scope = UCC_CL_BASIC;
self->team_create_req->descs[i].param.scope_id = 0;
}
self->team_create_req->n_teams = ctx->n_tl_ctxs;
self->team_create_req->n_teams = n_tl_ctxs;

status = ucc_tl_team_create_multiple(self->team_create_req);
if (status < 0) {
Expand Down Expand Up @@ -112,7 +113,7 @@ ucc_status_t ucc_cl_basic_team_create_test(ucc_base_team_t *cl_team)

status = ucc_tl_team_create_multiple(team->team_create_req);
if (status == UCC_OK) {
for (i = 0; i < ctx->n_tl_ctxs; i++) {
for (i = 0; i < ctx->super.n_tl_ctxs; i++) {
if (team->team_create_req->descs[i].status == UCC_OK) {
team->tl_teams[team->n_tl_teams++] =
team->team_create_req->descs[i].team;
Expand Down
2 changes: 0 additions & 2 deletions src/components/cl/hier/cl_hier.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ UCC_CLASS_DECLARE(ucc_cl_hier_lib_t, const ucc_base_lib_params_t *,

typedef struct ucc_cl_hier_context {
ucc_cl_context_t super;
ucc_tl_context_t **tl_ctxs;
unsigned n_tl_ctxs;
ucc_mpool_t sched_mp;
} ucc_cl_hier_context_t;
UCC_CLASS_DECLARE(ucc_cl_hier_context_t, const ucc_base_context_params_t *,
Expand Down
20 changes: 10 additions & 10 deletions src/components/cl/hier/cl_hier_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,25 @@ UCC_CLASS_INIT_FUNC(ucc_cl_hier_context_t,
tls = &params->context->all_tls;
}

self->tl_ctxs =
self->super.tl_ctxs =
ucc_malloc(sizeof(ucc_tl_context_t *) * tls->count, "cl_hier_tl_ctxs");
if (!self->tl_ctxs) {
if (!self->super.tl_ctxs) {
cl_error(cl_config->cl_lib, "failed to allocate %zd bytes for tl_ctxs",
sizeof(ucc_tl_context_t **) * tls->count);
return UCC_ERR_NO_MEMORY;
}
self->n_tl_ctxs = 0;
self->super.n_tl_ctxs = 0;
for (i = 0; i < tls->count; i++) {
status = ucc_tl_context_get(params->context, tls->names[i],
&self->tl_ctxs[self->n_tl_ctxs]);
&self->super.tl_ctxs[self->super.n_tl_ctxs]);
if (UCC_OK != status) {
cl_info(cl_config->cl_lib,
"TL %s context is not available, skipping", tls->names[i]);
} else {
self->n_tl_ctxs++;
self->super.n_tl_ctxs++;
}
}
if (0 == self->n_tl_ctxs) {
if (0 == self->super.n_tl_ctxs) {
cl_error(cl_config->cl_lib, "no TL contexts are available");
status = UCC_ERR_NOT_FOUND;
goto out;
Expand All @@ -64,7 +64,7 @@ UCC_CLASS_INIT_FUNC(ucc_cl_hier_context_t,
return UCC_OK;

out:
ucc_free(self->tl_ctxs);
ucc_free(self->super.tl_ctxs);
return status;
}

Expand All @@ -74,10 +74,10 @@ UCC_CLASS_CLEANUP_FUNC(ucc_cl_hier_context_t)
cl_info(self->super.super.lib, "finalizing cl context: %p", self);

ucc_mpool_cleanup(&self->sched_mp, 1);
for (i = 0; i < self->n_tl_ctxs; i++) {
ucc_tl_context_put(self->tl_ctxs[i]);
for (i = 0; i < self->super.n_tl_ctxs; i++) {
ucc_tl_context_put(self->super.tl_ctxs[i]);
}
ucc_free(self->tl_ctxs);
ucc_free(self->super.tl_ctxs);
}

UCC_CLASS_DEFINE(ucc_cl_hier_context_t, ucc_cl_context_t);
Expand Down
3 changes: 3 additions & 0 deletions src/components/cl/ucc_cl.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ typedef struct ucc_cl_lib ucc_cl_lib_t;
typedef struct ucc_cl_iface ucc_cl_iface_t;
typedef struct ucc_cl_context ucc_cl_context_t;
typedef struct ucc_cl_team ucc_cl_team_t;
typedef struct ucc_tl_context ucc_tl_context_t;

typedef struct ucc_cl_lib_config {
ucc_base_lib_config_t super;
Expand Down Expand Up @@ -81,6 +82,8 @@ UCC_CLASS_DECLARE(ucc_cl_lib_t, ucc_cl_iface_t *, const ucc_cl_lib_config_t *);

/* Common CL context state shared by the concrete CL implementations,
   which embed this struct as their "super" member. */
typedef struct ucc_cl_context {
/* Base context this CL context derives from. */
ucc_base_context_t super;
/* Array of pointers to the TL contexts used by this CL context.
   NOTE(review): allocation and release appear to be the responsibility
   of the concrete CL (cl_basic/cl_hier) - confirm for new CLs. */
ucc_tl_context_t **tl_ctxs;
/* Number of valid entries currently stored in tl_ctxs. */
unsigned n_tl_ctxs;
} ucc_cl_context_t;
UCC_CLASS_DECLARE(ucc_cl_context_t, const ucc_cl_context_config_t *,
ucc_context_t *);
Expand Down
42 changes: 38 additions & 4 deletions src/core/ucc_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,27 @@ ucc_status_t ucc_core_addr_exchange(ucc_context_t *context,
return UCC_OK;
}

/**
 * Remove the first occurrence of @a tl_ctx from @a array, preserving the
 * relative order of the remaining entries.
 *
 * @param array   Array of TL context pointers to search.
 * @param size    In/out: number of valid entries in @a array. Decremented
 *                by one when @a tl_ctx is found and removed; left unchanged
 *                otherwise.
 * @param tl_ctx  TL context pointer to remove.
 *
 * If @a tl_ctx is not present, neither @a array nor @a size is modified.
 * The slot just past the new end of the array is not cleared and keeps
 * its previous value.
 */
static void remove_tl_ctx_from_array(ucc_tl_context_t **array, unsigned *size,
                                     ucc_tl_context_t *tl_ctx)
{
    /* unsigned index: matches the type of *size and avoids a
       signed/unsigned comparison */
    unsigned i;

    for (i = 0; i < (*size); i++) {
        if (array[i] == tl_ctx) {
            break;
        }
    }
    if (i == (*size)) {
        /* given tl_ctx is not part of array */
        return;
    }
    /* decrement array size and shift the tail left by one to close the gap */
    (*size)--;
    for (; i < (*size); i++) {
        array[i] = array[i + 1];
    }
}

ucc_status_t ucc_context_create_proc_info(ucc_lib_h lib,
const ucc_context_params_t *params,
const ucc_context_config_h config,
Expand All @@ -567,7 +588,7 @@ ucc_status_t ucc_context_create_proc_info(ucc_lib_h lib,
ucc_tl_lib_t *tl_lib;
ucc_context_t *ctx;
ucc_status_t status;
uint64_t i;
uint64_t i, j;
int num_cls;

num_cls = config->n_cl_cfg;
Expand Down Expand Up @@ -760,9 +781,22 @@ ucc_status_t ucc_context_create_proc_info(ucc_lib_h lib,
if (tl_lib->iface->context.create_epilog) {
status = tl_lib->iface->context.create_epilog(&tl_ctx->super);
if (UCC_OK != status) {
ucc_error("ctx create epilog for %s failed: %s",
tl_lib->iface->super.name, ucc_status_string(status));
goto error_ctx_create;
if (ucc_tl_is_required(lib, tl_lib->iface, 1)) {
ucc_error("ctx create epilog for %s failed: %s",
tl_lib->iface->super.name, ucc_status_string(status));
goto error_ctx_create;
} else {
ucc_debug("ctx create epilog for %s failed: %s",
tl_lib->iface->super.name, ucc_status_string(status));
tl_lib->iface->context.destroy(&tl_ctx->super);
for (j = 0; j < ctx->n_cl_ctx; j++) {
remove_tl_ctx_from_array(ctx->cl_ctx[j]->tl_ctxs,
&ctx->cl_ctx[j]->n_tl_ctxs,
tl_ctx);
}
remove_tl_ctx_from_array(ctx->tl_ctx, &ctx->n_tl_ctx,
tl_ctx);
}
}
}
}
Expand Down

0 comments on commit ec1251b

Please sign in to comment.