Skip to content

Commit

Permalink
Merge pull request #746 from bureddy/sharp-oob
Browse files (browse the repository at this point in the history)
TL/SHARP: SHARP OOB fixes
  • Loading branch information
bureddy authored Mar 15, 2023
2 parents 45a0b49 + 6a35346 commit b837e87
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 1 deletion.
2 changes: 2 additions & 0 deletions config/m4/sharp.m4
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ AS_IF([test "x$with_sharp" != "xno"],
[
AC_SUBST(SHARP_CPPFLAGS, "-I$check_sharp_dir/include/ ")
AC_SUBST(SHARP_LDFLAGS, "-lsharp_coll -L$check_sharp_dir/lib")
AC_CHECK_DECLS([SHARP_COLL_HIDE_ERRORS], [], [], [[#include <sharp/api/sharp_coll.h>]])
AC_CHECK_DECLS([SHARP_COLL_DISABLE_LAZY_GROUP_RESOURCE_ALLOC], [], [], [[#include <sharp/api/sharp_coll.h>]])
],
[
AS_IF([test "x$with_sharp" != "xguess"],
Expand Down
9 changes: 8 additions & 1 deletion src/components/tl/sharp/tl_sharp.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ static ucc_config_field_t ucc_tl_sharp_context_config_table[] = {
{"", "", NULL, ucc_offsetof(ucc_tl_sharp_context_config_t, super),
UCC_CONFIG_TYPE_TABLE(ucc_tl_context_config_table)},

-{"DEVICES", "mlx5_0:1",
+{"DEVICES", "",
"SHARP device list",
ucc_offsetof(ucc_tl_sharp_context_config_t, dev_list),
UCC_CONFIG_TYPE_STRING},
Expand All @@ -55,6 +55,13 @@ static ucc_config_field_t ucc_tl_sharp_context_config_table[] = {
ucc_offsetof(ucc_tl_sharp_context_config_t, context_per_team),
UCC_CONFIG_TYPE_BOOL},

#if HAVE_DECL_SHARP_COLL_DISABLE_LAZY_GROUP_RESOURCE_ALLOC
{"ENABLE_LAZY_GROUP_ALLOC", "n",
"Enable lazy group resource allocation",
ucc_offsetof(ucc_tl_sharp_context_config_t, enable_lazy_group_alloc),
UCC_CONFIG_TYPE_BOOL},
#endif

{"RAND_SEED", "0",
"Seed for random sharp job ID. (0 - use default).",
ucc_offsetof(ucc_tl_sharp_context_config_t, rand_seed),
Expand Down
1 change: 1 addition & 0 deletions src/components/tl/sharp/tl_sharp.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ typedef struct ucc_tl_sharp_context_config {
unsigned int rand_seed;
unsigned int uprogress_num_polls;
int context_per_team;
int enable_lazy_group_alloc;
} ucc_tl_sharp_context_config_t;

typedef struct ucc_tl_sharp_lib {
Expand Down
11 changes: 11 additions & 0 deletions src/components/tl/sharp/tl_sharp_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,17 @@ ucc_status_t ucc_tl_sharp_context_init(ucc_tl_sharp_context_t *sharp_ctx,
init_spec.config = sharp_coll_default_config;
init_spec.config.user_progress_num_polls = sharp_ctx->cfg.uprogress_num_polls;
init_spec.config.ib_dev_list = sharp_ctx->cfg.dev_list;
#if HAVE_DECL_SHARP_COLL_HIDE_ERRORS
if (lib->super.super.log_component.log_level < UCC_LOG_LEVEL_DEBUG) {
init_spec.config.flags |= SHARP_COLL_HIDE_ERRORS;
}
#endif
#if HAVE_DECL_SHARP_COLL_DISABLE_LAZY_GROUP_RESOURCE_ALLOC
if(!sharp_ctx->cfg.enable_lazy_group_alloc) {
init_spec.config.flags |= SHARP_COLL_DISABLE_LAZY_GROUP_RESOURCE_ALLOC;
}
#endif

init_spec.job_id = ((getpid() ^ pthread_self())
^ rand_r(&sharp_ctx->cfg.rand_seed));
init_spec.enable_thread_support =
Expand Down

0 comments on commit b837e87

Please sign in to comment.