Skip to content

Commit

Permalink
TEST: build ucc with hpcsdk (#871)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergei-Lebedev authored Nov 7, 2023
1 parent 8a7b494 commit 21a424f
Show file tree
Hide file tree
Showing 18 changed files with 78 additions and 41 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/hpcsdk.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# CI workflow: configures and builds UCC with the NVIDIA HPC SDK compilers
# (nvc / nvc++) inside an nvhpc container image.
name: HPC_SDK

on: [push, pull_request]

# Install prefixes of the communication libraries and CUDA toolkit under the
# HPC SDK tree; presumably these match the layout of the nvhpc 23.9 image
# used below -- re-check them whenever the image tag changes.
env:
  HPCXDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/hpcx/latest/
  NCCLDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/nccl/
  CUDADIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/cuda/12.2/

jobs:
  build:
    runs-on: ubuntu-20.04
    # The steps below execute inside this container, not directly on the runner.
    container:
      image: nvcr.io/nvidia/nvhpc:23.9-devel-cuda12.2-ubuntu22.04
    steps:
      - name: Install dependencies
        run: |
          apt-get update
          apt-get install -y --no-install-recommends libiberty-dev
      - uses: actions/checkout@v1
      - name: Build UCC
        # Build with nvc/nvc++ against the HPC-X (OpenMPI, SHARP, UCX), CUDA
        # and NCCL installations referenced by the env variables above.
        run: |
          ./autogen.sh
          CC=nvc CXX=nvc++ ./configure --with-tls=ucp,mlx5,cuda,self,nccl,sharp --with-mpi=${HPCXDIR}/ompi --with-sharp=${HPCXDIR}/sharp --with-ucx=${HPCXDIR}/ucx --with-cuda=${CUDADIR} --with-nccl=${NCCLDIR} --with-nvcc-gencode="-gencode=arch=compute_80,code=sm_80"
          make -j`nproc` install
2 changes: 1 addition & 1 deletion src/coll_patterns/recursive_knomial.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ typedef struct ucc_knomial_pattern {
size_t block_size_counts;
size_t count; /* collective buffer size */
ucc_rank_t block_size;
size_t block_offset;
ptrdiff_t block_offset;
} ucc_knomial_pattern_t;

/**
Expand Down
9 changes: 5 additions & 4 deletions src/coll_patterns/sra_knomial.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ ucc_kn_seg_desc_compute(ucc_knomial_pattern_t *p, ucc_kn_seg_desc_t *seg,

static inline void
ucc_knx_block(ucc_rank_t rank, ucc_rank_t size, ucc_kn_radix_t radix,
size_t count, int iter, size_t *b_count, size_t *b_offset)
size_t count, int iter, size_t *b_count, ptrdiff_t *b_offset)
{
ucc_rank_t offset = 0;
ucc_rank_t block_count;
Expand Down Expand Up @@ -213,7 +213,7 @@ ucc_kn_agx_pattern_init(ucc_rank_t size, ucc_rank_t rank, ucc_kn_radix_t radix,

static inline void
ucc_kn_ag_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p,
size_t *seg_count, size_t *seg_offset)
size_t *seg_count, ptrdiff_t *seg_offset)
{
ucc_rank_t step_radix, seg_index;
ucc_kn_seg_desc_t s;
Expand Down Expand Up @@ -278,7 +278,7 @@ static inline void ucc_kn_rsx_pattern_init(ucc_rank_t size, ucc_rank_t rank,

static inline void
ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p,
size_t *peer_seg_count, size_t *peer_seg_offset)
size_t *peer_seg_count, ptrdiff_t *peer_seg_offset)
{
ucc_rank_t step_radix, seg_index;

Expand All @@ -305,7 +305,8 @@ ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p,

static inline void ucc_kn_rs_pattern_next_iter(ucc_knomial_pattern_t *p)
{
size_t offset, bs;
size_t bs;
ptrdiff_t offset;

ucc_kn_rs_pattern_peer_seg(p->rank, p, &bs, &offset);
p->block_size_counts = bs;
Expand Down
3 changes: 2 additions & 1 deletion src/components/tl/cuda/tl_cuda_topo.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,8 @@ static ucc_status_t ucc_tl_cuda_topo_graph_create(ucc_tl_cuda_topo_t *topo)
ucc_tl_cuda_topo_dev_type_t dev_type;
ucc_tl_cuda_device_pci_id_t pci_id;
ucc_tl_cuda_topo_node_t *node, *peer_node;
int num_gpus, num_nvlinks, link, i;
int num_nvlinks, link, i;
unsigned int num_gpus;
nvmlReturn_t nvml_st;

nvml_st = nvmlInit_v2();
Expand Down
3 changes: 1 addition & 2 deletions src/components/tl/mlx5/alltoall/alltoall_mkeys.c
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,6 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team,
if (!node->ops[i].send_mkeys) {
tl_error(lib, "failed to malloc");
goto err_malloc;
return UCC_ERR_NO_MEMORY;
}
node->ops[i].recv_mkeys = (struct mlx5dv_mkey **)ucc_malloc(
sizeof(struct mlx5dv_mkey *) * a2a->max_num_of_columns);
Expand All @@ -230,7 +229,7 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team,
status = create_master_key(node->sbgp->group_size + 1, a2a->pd,
&node->ops[i].send_mkeys[j], lib);
if (status != UCC_OK) {
tl_error(lib, " failed to create send masterkey [%d,%d]", i, j);
tl_error(lib, "failed to create send masterkey [%d,%d]", i, j);
goto err_create_mkey;
}
status = create_master_key(node->sbgp->group_size + 1, a2a->pd,
Expand Down
6 changes: 3 additions & 3 deletions src/components/tl/sharp/tl_sharp_coll.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ enum sharp_datatype ucc_to_sharp_dtype[] = {
[UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT64)] = SHARP_DTYPE_DOUBLE,
[UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT128)] = SHARP_DTYPE_NULL,
#if SHARP_API > SHARP_VERSION(3, 0)
[UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = SHARP_DTYPE_UNKNOWN,
[UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = SHARP_DTYPE_UNKNOWN,
[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_UNKNOWN,
[UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN,
[UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN,
[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN,
#else
[UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = SHARP_DTYPE_NULL,
[UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = SHARP_DTYPE_NULL,
Expand Down
4 changes: 2 additions & 2 deletions src/components/tl/sharp/tl_sharp_coll.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -10,7 +10,7 @@
#include "tl_sharp.h"

/* need to query for datatype support at runtime */
#define SHARP_DTYPE_UNKNOWN -1
#define SHARP_DTYPE_UNKNOWN 0xFFFF

extern enum sharp_datatype ucc_to_sharp_dtype[];

Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/sharp/tl_sharp_team.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context,

if (sharp_caps.support_mask.dtypes & UCC_BIT(SHARP_DTYPE_BFLOAT16)) {
tl_debug(ctx->super.super.lib, "enabling support for UCC_DT_BFLOAT16");
ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = UCC_DT_BFLOAT16;
ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_BFLOAT16;
} else {
tl_debug(ctx->super.super.lib, "disabling support for UCC_DT_BFLOAT16");
ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_NULL;
Expand Down
4 changes: 3 additions & 1 deletion src/components/tl/ucp/allgather/allgather_neighbor.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
static ucc_rank_t get_recv_from_rank(ucc_rank_t rank, ucc_rank_t size, int i)
{
const int i_parity = i % 2;
ucc_rank_t offset_at_step[2], recv_data_from;
int offset_at_step[2];
ucc_rank_t recv_data_from;

if (rank % 2) {
recv_data_from = (rank - 1 + size) % size;
offset_at_step[0] = (-2);
Expand Down
18 changes: 10 additions & 8 deletions src/components/tl/ucp/alltoallv/alltoallv_hybrid.c
Original file line number Diff line number Diff line change
Expand Up @@ -510,9 +510,11 @@ ucc_status_t post_recv(ucc_rank_t recvfrom, ucc_rank_t tsize, size_t dt_size,

/* check if we have space for maximum receive. If not, recycle */
if (meta->offset * dt_size + step_buf_size > tmp_buf_size) {
new_offset = receive_buffer_recycler(tsize, (int *)op_metadata, (int *)op_metadata + tsize,
seg_st, p_tmp_recv_region, dt_size, BytesForPacking,
step, user_rbuf, rdisps, trank, radix, node_edge_id);
new_offset = receive_buffer_recycler(tsize, (unsigned int *)op_metadata,
(int *)op_metadata + tsize,
seg_st, p_tmp_recv_region, dt_size,
BytesForPacking, step, user_rbuf,
rdisps, trank, radix, node_edge_id);
meta->offset = new_offset;
}
ucc_assert(meta->offset * dt_size + step_buf_size <= tmp_buf_size);
Expand Down Expand Up @@ -595,8 +597,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step,
temp_offset = PTR_OFFSET(temp_offset, cur_buf_length * dt_size);
} else {
/* data will be sent pairwise */
((int *)op_metadata)[i] = COUNT_DIRECT;
((int *)op_metadata)[i + tsize] = COUNT_DIRECT;
((int *)op_metadata)[i] = (int)COUNT_DIRECT;
((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT;
if (i < (step * radix)) {
int pairwise_src = (trank - i + tsize) % tsize;
if (rcounts[pairwise_src] > 0) {
Expand Down Expand Up @@ -636,8 +638,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step,
next_p = tsize;
}
} else {
((int *)op_metadata)[i] = COUNT_DIRECT;
((int *)op_metadata)[i + tsize] = COUNT_DIRECT;
((int *)op_metadata)[i] = (int)COUNT_DIRECT;
((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT;
if (i < (step * radix)) {
int pairwise_src = (trank - i + tsize) % tsize;
if (rcounts[pairwise_src] > 0) {
Expand Down Expand Up @@ -709,7 +711,7 @@ ucc_status_t pairwise_manager(ucc_rank_t trank, ucc_rank_t tsize,
int *r_disps = (int*)TASK_ARGS(task).dst.info_v.displacements;
int *scounts = (int*)TASK_ARGS(task).src.info_v.counts;
int *rcounts = (int*)TASK_ARGS(task).dst.info_v.counts;
int* cur = &task->alltoallv_hybrid.cur_out;
ucc_rank_t *cur = &task->alltoallv_hybrid.cur_out;
int chunk_num_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_pairwise_num_posts;
int chunk_byte_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_chunk_byte_limit;
ucc_status_t status;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ static inline void get_sbuf_rbuf(ucc_tl_ucp_task_t *task, size_t block_count,
size_t dt_size = ucc_dt_size(args->dst.info.datatype);
void *scratch = task->reduce_scatter_kn.scratch;
ucc_knomial_pattern_t *p = &task->reduce_scatter_kn.p;
size_t offset, local_seg_offset, local_seg_count;
size_t offset, local_seg_count;
ptrdiff_t local_seg_offset;

if (ucc_knomial_pattern_loop_first_iteration(p)) {
*sbuf = ((KN_NODE_PROXY == p->node_type) || UCC_IS_INPLACE(*args))
Expand Down
2 changes: 1 addition & 1 deletion src/ucc/api/ucc.h
Original file line number Diff line number Diff line change
Expand Up @@ -1337,7 +1337,7 @@ struct ucc_ep_map_cb {
* @ingroup UCC_TEAM_DT
*/
typedef enum {
UCC_EP_MAP_FULL = 1, /*!< The ep range of the team spans all eps from a context*/
UCC_EP_MAP_FULL = 1, /*!< The ep range of the team spans all eps from a context. */
UCC_EP_MAP_STRIDED = 2, /*!< The ep range of the team can be described by the 2 values: start, stride.*/
UCC_EP_MAP_ARRAY = 3, /*!< The ep range is given as an array of integers that map the ep in the team to
the team_context rank. */
Expand Down
4 changes: 3 additions & 1 deletion src/utils/ucc_coll_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,11 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size,
ucc_rank_t full_size, int need_free, int is64)
{
int is_const_stride = 0;
ucc_ep_map_t map = {0};
ucc_ep_map_t map;
int64_t stride;
ucc_rank_t i;

map.type = (ucc_ep_map_type_t)0;
map.ep_num = size;
if (size > 1) {
/* try to detect strided pattern */
Expand Down Expand Up @@ -303,6 +304,7 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size,
map.array.map = (void *)(*array);
map.array.elem_size = is64 ? sizeof(uint64_t) : sizeof(ucc_rank_t);
}

return map;
}

Expand Down
6 changes: 5 additions & 1 deletion src/utils/ucc_coll_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,11 @@
#define UCC_COLL_ARGS_ACTIVE_SET(_args) \
((_args)->mask & UCC_COLL_ARGS_FIELD_ACTIVE_SET)

#define UCC_MEM_TYPE_MASK_FULL -1
#define UCC_MEM_TYPE_MASK_FULL (UCC_BIT(UCC_MEMORY_TYPE_HOST) | \
UCC_BIT(UCC_MEMORY_TYPE_CUDA) | \
UCC_BIT(UCC_MEMORY_TYPE_CUDA_MANAGED) | \
UCC_BIT(UCC_MEMORY_TYPE_ROCM) | \
UCC_BIT(UCC_MEMORY_TYPE_ROCM_MANAGED))

static inline int ucc_coll_args_is_reduction(ucc_coll_type_t ct)
{
Expand Down
17 changes: 10 additions & 7 deletions src/utils/ucc_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,25 +86,28 @@ static inline int ucc_check_range(char *range_str, ucc_rank_t *begin,
char **range = ucc_str_split(range_str, "-");
char *str_end;
unsigned n_range;
long pbegin, pend;

if (!range) {
goto split_err;
}

n_range = ucc_str_split_count(range);
*begin = (size_t) strtol(range[0], &str_end, 10);
*end = *begin;
pbegin = strtol(range[0], &str_end, 10);
pend = pbegin;

if (n_range > 2 || *str_end != '\0' || *begin < 0) {
if (n_range > 2 || *str_end != '\0' || pbegin < 0) {
goto val_err;
}

if (n_range == 2) {
*end = (size_t) strtol(range[1], &str_end, 10);
if (*str_end != '\0' || *end < 0) {
pend = strtol(range[1], &str_end, 10);
if (*str_end != '\0' || pend < 0) {
goto val_err;
}
}
*begin = (ucc_rank_t)pbegin;
*end = (ucc_rank_t)pend;
ucc_str_split_free(range);
return 1;

Expand Down Expand Up @@ -852,7 +855,7 @@ int ucc_config_sscanf_uint_ranged(const char *buf, void *dest,
if (!r) {
goto err_tokens;
}
r->mtypes = -1; //mask all types
r->mtypes = UCC_MEM_TYPE_MASK_FULL;
r->start = 0;
r->end = SIZE_MAX;

Expand Down Expand Up @@ -905,7 +908,7 @@ int ucc_config_sprintf_uint_ranged(char *buf, size_t max, const void *src,
ucc_list_for_each(r, &s->ranges, list_elem) {
ucs_memunits_to_str(r->start, tmp_start, tmp_max);
ucs_memunits_to_str(r->end, tmp_end, tmp_max);
if (r->mtypes == -1) {
if (r->mtypes == UCC_MEM_TYPE_MASK_FULL) {
ucc_snprintf_safe(buf, max, "%s-%s:%u", tmp_start, tmp_end,
r->value);
} else {
Expand Down
1 change: 0 additions & 1 deletion test/mpi/test_mpi.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ static inline const char* skip_str(test_skip_cause_t s) {
default:
return "unknown";
}
return NULL;
}

static inline const char* team_str(ucc_test_mpi_team_t t) {
Expand Down
5 changes: 2 additions & 3 deletions tools/perf/ucc_pt_cuda.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -15,7 +15,7 @@ ucc_pt_cuda_iface_t ucc_pt_cuda_iface = {

#define LOAD_CUDA_SYM(_sym, _pt_sym) ({ \
void *h = dlsym(handle, _sym); \
if ((error = dlerror()) != NULL) { \
if (dlerror() != NULL) { \
return; \
} \
ucc_pt_cuda_iface. _pt_sym = \
Expand All @@ -24,7 +24,6 @@ ucc_pt_cuda_iface_t ucc_pt_cuda_iface = {

void ucc_pt_cuda_init(void)
{
char *error;
void *handle;

handle = dlopen ("libcudart.so", RTLD_LAZY);
Expand Down
5 changes: 2 additions & 3 deletions tools/perf/ucc_pt_rocm.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (C) Advanced Micro Devices, Inc. 2022. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
Expand All @@ -16,7 +16,7 @@ ucc_pt_rocm_iface_t ucc_pt_rocm_iface = {

#define LOAD_ROCM_SYM(_sym, _pt_sym) ({ \
void *h = dlsym(handle, _sym); \
if ((error = dlerror()) != NULL) { \
if (dlerror() != NULL) { \
return; \
} \
ucc_pt_rocm_iface. _pt_sym = \
Expand All @@ -25,7 +25,6 @@ ucc_pt_rocm_iface_t ucc_pt_rocm_iface = {

void ucc_pt_rocm_init(void)
{
char *error;
void *handle;

handle = dlopen ("libamdhip64.so", RTLD_LAZY);
Expand Down

0 comments on commit 21a424f

Please sign in to comment.