From fce4f0b16b4541c741abf184eba1990197c927a4 Mon Sep 17 00:00:00 2001 From: Sergey Lebedev Date: Fri, 3 Nov 2023 16:35:48 +0100 Subject: [PATCH] TEST: build ucc with hpcsdk --- .github/workflows/hpcsdk.yaml | 25 +++++++++++++++++++ src/coll_patterns/recursive_knomial.h | 2 +- src/coll_patterns/sra_knomial.h | 9 ++++--- src/components/tl/cuda/tl_cuda_topo.c | 3 ++- .../tl/mlx5/alltoall/alltoall_mkeys.c | 3 +-- src/components/tl/sharp/tl_sharp_coll.c | 6 ++--- src/components/tl/sharp/tl_sharp_coll.h | 4 +-- src/components/tl/sharp/tl_sharp_team.c | 2 +- .../tl/ucp/allgather/allgather_neighbor.c | 4 ++- .../tl/ucp/alltoallv/alltoallv_hybrid.c | 18 +++++++------ .../reduce_scatter/reduce_scatter_knomial.c | 3 ++- src/ucc/api/ucc.h | 2 +- src/utils/ucc_coll_utils.c | 4 ++- src/utils/ucc_coll_utils.h | 6 ++++- src/utils/ucc_parser.c | 17 +++++++------ test/mpi/test_mpi.h | 1 - tools/perf/ucc_pt_cuda.cc | 5 ++-- tools/perf/ucc_pt_rocm.cc | 5 ++-- 18 files changed, 78 insertions(+), 41 deletions(-) create mode 100644 .github/workflows/hpcsdk.yaml diff --git a/.github/workflows/hpcsdk.yaml b/.github/workflows/hpcsdk.yaml new file mode 100644 index 0000000000..77188cd96a --- /dev/null +++ b/.github/workflows/hpcsdk.yaml @@ -0,0 +1,25 @@ +name: HPC_SDK + +on: [push, pull_request] + +env: + HPCXDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/hpcx/latest/ + NCCLDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/nccl/ + CUDADIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/cuda/12.2/ + +jobs: + build: + runs-on: ubuntu-20.04 + container: + image: nvcr.io/nvidia/nvhpc:23.9-devel-cuda12.2-ubuntu22.04 + steps: + - name: Install dependencies + run: | + apt-get update + apt-get install -y --no-install-recommends libiberty-dev + - uses: actions/checkout@v1 + - name: Build UCC + run: | + ./autogen.sh + CC=nvc CXX=nvc++ ./configure --with-tls=ucp,mlx5,cuda,self,nccl,sharp --with-mpi=${HPCXDIR}/ompi --with-sharp=${HPCXDIR}/sharp --with-ucx=${HPCXDIR}/ucx --with-cuda=${CUDADIR} --with-nccl=${NCCLDIR} --with-nvcc-gencode="-gencode=arch=compute_80,code=sm_80" + make -j`nproc` install diff --git a/src/coll_patterns/recursive_knomial.h b/src/coll_patterns/recursive_knomial.h index 4f8981957c..ebf9a0981b 100644 --- a/src/coll_patterns/recursive_knomial.h +++ b/src/coll_patterns/recursive_knomial.h @@ -50,7 +50,7 @@ typedef struct ucc_knomial_pattern { size_t block_size_counts; size_t count; /* collective buffer size */ ucc_rank_t block_size; - size_t block_offset; + ptrdiff_t block_offset; } ucc_knomial_pattern_t; /** diff --git a/src/coll_patterns/sra_knomial.h b/src/coll_patterns/sra_knomial.h index 1574389632..2f63a243f2 100644 --- a/src/coll_patterns/sra_knomial.h +++ b/src/coll_patterns/sra_knomial.h @@ -159,7 +159,7 @@ ucc_kn_seg_desc_compute(ucc_knomial_pattern_t *p, ucc_kn_seg_desc_t *seg, static inline void ucc_knx_block(ucc_rank_t rank, ucc_rank_t size, ucc_kn_radix_t radix, - size_t count, int iter, size_t *b_count, size_t *b_offset) + size_t count, int iter, size_t *b_count, ptrdiff_t *b_offset) { ucc_rank_t offset = 0; ucc_rank_t block_count; @@ -213,7 +213,7 @@ ucc_kn_agx_pattern_init(ucc_rank_t size, ucc_rank_t rank, ucc_kn_radix_t radix, static inline void ucc_kn_ag_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p, - size_t *seg_count, size_t *seg_offset) + size_t *seg_count, ptrdiff_t *seg_offset) { ucc_rank_t step_radix, seg_index; ucc_kn_seg_desc_t s; @@ -278,7 +278,7 @@ static inline void ucc_kn_rsx_pattern_init(ucc_rank_t size, ucc_rank_t rank, static inline void ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p, - size_t *peer_seg_count, size_t *peer_seg_offset) + size_t *peer_seg_count, ptrdiff_t *peer_seg_offset) { ucc_rank_t step_radix, seg_index; @@ -305,7 +305,8 @@ ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p, static inline void ucc_kn_rs_pattern_next_iter(ucc_knomial_pattern_t *p) { - size_t offset, bs; + size_t bs; + ptrdiff_t offset; ucc_kn_rs_pattern_peer_seg(p->rank, p, &bs, &offset); p->block_size_counts = bs; diff --git a/src/components/tl/cuda/tl_cuda_topo.c b/src/components/tl/cuda/tl_cuda_topo.c index 96862e921e..a0f54d57e6 100644 --- a/src/components/tl/cuda/tl_cuda_topo.c +++ b/src/components/tl/cuda/tl_cuda_topo.c @@ -220,7 +220,8 @@ static ucc_status_t ucc_tl_cuda_topo_graph_create(ucc_tl_cuda_topo_t *topo) ucc_tl_cuda_topo_dev_type_t dev_type; ucc_tl_cuda_device_pci_id_t pci_id; ucc_tl_cuda_topo_node_t *node, *peer_node; - int num_gpus, num_nvlinks, link, i; + int num_nvlinks, link, i; + unsigned int num_gpus; nvmlReturn_t nvml_st; nvml_st = nvmlInit_v2(); diff --git a/src/components/tl/mlx5/alltoall/alltoall_mkeys.c b/src/components/tl/mlx5/alltoall/alltoall_mkeys.c index 7dd90d49b8..0fa197e6c7 100644 --- a/src/components/tl/mlx5/alltoall/alltoall_mkeys.c +++ b/src/components/tl/mlx5/alltoall/alltoall_mkeys.c @@ -217,7 +217,6 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team, if (!node->ops[i].send_mkeys) { tl_error(lib, "failed to malloc"); goto err_malloc; - return UCC_ERR_NO_MEMORY; } node->ops[i].recv_mkeys = (struct mlx5dv_mkey **)ucc_malloc( sizeof(struct mlx5dv_mkey *) * a2a->max_num_of_columns); @@ -230,7 +229,7 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team, status = create_master_key(node->sbgp->group_size + 1, a2a->pd, &node->ops[i].send_mkeys[j], lib); if (status != UCC_OK) { - tl_error(lib, " failed to create send masterkey [%d,%d]", i, j); + tl_error(lib, "failed to create send masterkey [%d,%d]", i, j); goto err_create_mkey; } status = create_master_key(node->sbgp->group_size + 1, a2a->pd, diff --git a/src/components/tl/sharp/tl_sharp_coll.c b/src/components/tl/sharp/tl_sharp_coll.c index d246fcc563..1dcf2465c1 100644 --- a/src/components/tl/sharp/tl_sharp_coll.c +++ b/src/components/tl/sharp/tl_sharp_coll.c @@ -26,9 +26,9 @@ enum sharp_datatype ucc_to_sharp_dtype[] = { [UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT64)] = SHARP_DTYPE_DOUBLE, [UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT128)] = SHARP_DTYPE_NULL, #if SHARP_API > SHARP_VERSION(3, 0) - [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = SHARP_DTYPE_UNKNOWN, - [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = SHARP_DTYPE_UNKNOWN, - [UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_UNKNOWN, + [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN, + [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN, + [UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN, #else [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = SHARP_DTYPE_NULL, [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = SHARP_DTYPE_NULL, diff --git a/src/components/tl/sharp/tl_sharp_coll.h b/src/components/tl/sharp/tl_sharp_coll.h index 4b0dba17b6..6b12c69900 100644 --- a/src/components/tl/sharp/tl_sharp_coll.h +++ b/src/components/tl/sharp/tl_sharp_coll.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -10,7 +10,7 @@ #include "tl_sharp.h" /* need to query for datatype support at runtime */ -#define SHARP_DTYPE_UNKNOWN -1 +#define SHARP_DTYPE_UNKNOWN 0xFFFF extern enum sharp_datatype ucc_to_sharp_dtype[]; diff --git a/src/components/tl/sharp/tl_sharp_team.c b/src/components/tl/sharp/tl_sharp_team.c index fe4a5875fb..6b8f369c7c 100644 --- a/src/components/tl/sharp/tl_sharp_team.c +++ b/src/components/tl/sharp/tl_sharp_team.c @@ -117,7 +117,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context, if (sharp_caps.support_mask.dtypes & UCC_BIT(SHARP_DTYPE_BFLOAT16)) { tl_debug(ctx->super.super.lib, "enabling support for UCC_DT_BFLOAT16"); - ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = UCC_DT_BFLOAT16; + ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_BFLOAT16; } else { tl_debug(ctx->super.super.lib, "disabling support for UCC_DT_BFLOAT16"); ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_NULL; diff --git a/src/components/tl/ucp/allgather/allgather_neighbor.c b/src/components/tl/ucp/allgather/allgather_neighbor.c index 771ba2d3b8..534c197e4e 100644 --- a/src/components/tl/ucp/allgather/allgather_neighbor.c +++ b/src/components/tl/ucp/allgather/allgather_neighbor.c @@ -15,7 +15,9 @@ static ucc_rank_t get_recv_from_rank(ucc_rank_t rank, ucc_rank_t size, int i) { const int i_parity = i % 2; - ucc_rank_t offset_at_step[2], recv_data_from; + int offset_at_step[2]; + ucc_rank_t recv_data_from; + if (rank % 2) { recv_data_from = (rank - 1 + size) % size; offset_at_step[0] = (-2); diff --git a/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c b/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c index 61b130eaa5..7b8c7b7b67 100644 --- a/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c +++ b/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c @@ -510,9 +510,11 @@ ucc_status_t post_recv(ucc_rank_t recvfrom, ucc_rank_t tsize, size_t dt_size, /* check if we have space for maximum recieve. If not, recycle */ if (meta->offset * dt_size + step_buf_size > tmp_buf_size) { - new_offset = receive_buffer_recycler(tsize, (int *)op_metadata, (int *)op_metadata + tsize, - seg_st, p_tmp_recv_region, dt_size, BytesForPacking, - step, user_rbuf, rdisps, trank, radix, node_edge_id); + new_offset = receive_buffer_recycler(tsize, (unsigned int *)op_metadata, + (int *)op_metadata + tsize, + seg_st, p_tmp_recv_region, dt_size, + BytesForPacking, step, user_rbuf, + rdisps, trank, radix, node_edge_id); meta->offset = new_offset; } ucc_assert(meta->offset * dt_size + step_buf_size <= tmp_buf_size); @@ -595,8 +597,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step, temp_offset = PTR_OFFSET(temp_offset, cur_buf_length * dt_size); } else { /* data will be sent pairwise */ - ((int *)op_metadata)[i] = COUNT_DIRECT; - ((int *)op_metadata)[i + tsize] = COUNT_DIRECT; + ((int *)op_metadata)[i] = (int)COUNT_DIRECT; + ((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT; if (i < (step * radix)) { int pairwise_src = (trank - i + tsize) % tsize; if (rcounts[pairwise_src] > 0) { @@ -636,8 +638,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step, next_p = tsize; } } else { - ((int *)op_metadata)[i] = COUNT_DIRECT; - ((int *)op_metadata)[i + tsize] = COUNT_DIRECT; + ((int *)op_metadata)[i] = (int)COUNT_DIRECT; + ((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT; if (i < (step * radix)) { int pairwise_src = (trank - i + tsize) % tsize; if (rcounts[pairwise_src] > 0) { @@ -709,7 +711,7 @@ ucc_status_t pairwise_manager(ucc_rank_t trank, ucc_rank_t tsize, int *r_disps = (int*)TASK_ARGS(task).dst.info_v.displacements; int *scounts = (int*)TASK_ARGS(task).src.info_v.counts; int *rcounts = (int*)TASK_ARGS(task).dst.info_v.counts; - int* cur = &task->alltoallv_hybrid.cur_out; + ucc_rank_t *cur = &task->alltoallv_hybrid.cur_out; int chunk_num_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_pairwise_num_posts; int chunk_byte_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_chunk_byte_limit; ucc_status_t status; diff --git a/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c b/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c index 11a2abc859..ca5457dfb4 100644 --- a/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c +++ b/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c @@ -22,7 +22,8 @@ static inline void get_sbuf_rbuf(ucc_tl_ucp_task_t *task, size_t block_count, size_t dt_size = ucc_dt_size(args->dst.info.datatype); void *scratch = task->reduce_scatter_kn.scratch; ucc_knomial_pattern_t *p = &task->reduce_scatter_kn.p; - size_t offset, local_seg_offset, local_seg_count; + size_t offset, local_seg_count; + ptrdiff_t local_seg_offset; if (ucc_knomial_pattern_loop_first_iteration(p)) { *sbuf = ((KN_NODE_PROXY == p->node_type) || UCC_IS_INPLACE(*args)) diff --git a/src/ucc/api/ucc.h b/src/ucc/api/ucc.h index c7c0ce10b0..02e5e11540 100644 --- a/src/ucc/api/ucc.h +++ b/src/ucc/api/ucc.h @@ -1337,7 +1337,7 @@ struct ucc_ep_map_cb { * @ingroup UCC_TEAM_DT */ typedef enum { - UCC_EP_MAP_FULL = 1, /*!< The ep range of the team spans all eps from a context*/ + UCC_EP_MAP_FULL = 1, /*!< The ep range of the team spans all eps from a context. */ UCC_EP_MAP_STRIDED = 2, /*!< The ep range of the team can be described by the 2 values: start, stride.*/ UCC_EP_MAP_ARRAY = 3, /*!< The ep range is given as an array of intergers that map the ep in the team to the team_context rank. */ diff --git a/src/utils/ucc_coll_utils.c b/src/utils/ucc_coll_utils.c index 3921f1262e..2f3b858cbb 100644 --- a/src/utils/ucc_coll_utils.c +++ b/src/utils/ucc_coll_utils.c @@ -266,10 +266,11 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size, ucc_rank_t full_size, int need_free, int is64) { int is_const_stride = 0; - ucc_ep_map_t map = {0}; + ucc_ep_map_t map; int64_t stride; ucc_rank_t i; + map.type = (ucc_ep_map_type_t)0; map.ep_num = size; if (size > 1) { /* try to detect strided pattern */ @@ -303,6 +304,7 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size, map.array.map = (void *)(*array); map.array.elem_size = is64 ? sizeof(uint64_t) : sizeof(ucc_rank_t); } + return map; } diff --git a/src/utils/ucc_coll_utils.h b/src/utils/ucc_coll_utils.h index 2d3a919f08..ead7fe4081 100644 --- a/src/utils/ucc_coll_utils.h +++ b/src/utils/ucc_coll_utils.h @@ -71,7 +71,11 @@ #define UCC_COLL_ARGS_ACTIVE_SET(_args) \ ((_args)->mask & UCC_COLL_ARGS_FIELD_ACTIVE_SET) -#define UCC_MEM_TYPE_MASK_FULL -1 +#define UCC_MEM_TYPE_MASK_FULL (UCC_BIT(UCC_MEMORY_TYPE_HOST) | \ + UCC_BIT(UCC_MEMORY_TYPE_CUDA) | \ + UCC_BIT(UCC_MEMORY_TYPE_CUDA_MANAGED) | \ + UCC_BIT(UCC_MEMORY_TYPE_ROCM) | \ + UCC_BIT(UCC_MEMORY_TYPE_ROCM_MANAGED)) static inline int ucc_coll_args_is_reduction(ucc_coll_type_t ct) { diff --git a/src/utils/ucc_parser.c b/src/utils/ucc_parser.c index fff69e47c6..6db8ef52f8 100644 --- a/src/utils/ucc_parser.c +++ b/src/utils/ucc_parser.c @@ -86,25 +86,28 @@ static inline int ucc_check_range(char *range_str, ucc_rank_t *begin, char **range = ucc_str_split(range_str, "-"); char *str_end; unsigned n_range; + long pbegin, pend; if (!range) { goto split_err; } n_range = ucc_str_split_count(range); - *begin = (size_t) strtol(range[0], &str_end, 10); - *end = *begin; + pbegin = strtol(range[0], &str_end, 10); + pend = pbegin; - if (n_range > 2 || *str_end != '\0' || *begin < 0) { + if (n_range > 2 || *str_end != '\0' || pbegin < 0) { goto val_err; } if (n_range == 2) { - *end = (size_t) strtol(range[1], &str_end, 10); - if (*str_end != '\0' || *end < 0) { + pend = strtol(range[1], &str_end, 10); + if (*str_end != '\0' || pend < 0) { goto val_err; } } + *begin = (ucc_rank_t)pbegin; + *end = (ucc_rank_t)pend; ucc_str_split_free(range); return 1; @@ -852,7 +855,7 @@ int ucc_config_sscanf_uint_ranged(const char *buf, void *dest, if (!r) { goto err_tokens; } - r->mtypes = -1; //mask all types + r->mtypes = UCC_MEM_TYPE_MASK_FULL; r->start = 0; r->end = SIZE_MAX; @@ -905,7 +908,7 @@ int ucc_config_sprintf_uint_ranged(char *buf, size_t max, const void *src, ucc_list_for_each(r, &s->ranges, list_elem) { ucs_memunits_to_str(r->start, tmp_start, tmp_max); ucs_memunits_to_str(r->end, tmp_end, tmp_max); - if (r->mtypes == -1) { + if (r->mtypes == UCC_MEM_TYPE_MASK_FULL) { ucc_snprintf_safe(buf, max, "%s-%s:%u", tmp_start, tmp_end, r->value); } else { diff --git a/test/mpi/test_mpi.h b/test/mpi/test_mpi.h index d96a08a3f9..4196d85d52 100644 --- a/test/mpi/test_mpi.h +++ b/test/mpi/test_mpi.h @@ -145,7 +145,6 @@ static inline const char* skip_str(test_skip_cause_t s) { default: return "unknown"; } - return NULL; } static inline const char* team_str(ucc_test_mpi_team_t t) { diff --git a/tools/perf/ucc_pt_cuda.cc b/tools/perf/ucc_pt_cuda.cc index fdf17457be..bcadabc955 100644 --- a/tools/perf/ucc_pt_cuda.cc +++ b/tools/perf/ucc_pt_cuda.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -15,7 +15,7 @@ ucc_pt_cuda_iface_t ucc_pt_cuda_iface = { #define LOAD_CUDA_SYM(_sym, _pt_sym) ({ \ void *h = dlsym(handle, _sym); \ - if ((error = dlerror()) != NULL) { \ + if (dlerror() != NULL) { \ return; \ } \ ucc_pt_cuda_iface. _pt_sym = \ @@ -24,7 +24,6 @@ ucc_pt_cuda_iface_t ucc_pt_cuda_iface = { void ucc_pt_cuda_init(void) { - char *error; void *handle; handle = dlopen ("libcudart.so", RTLD_LAZY); diff --git a/tools/perf/ucc_pt_rocm.cc b/tools/perf/ucc_pt_rocm.cc index 2e1f121b8f..2851be0deb 100644 --- a/tools/perf/ucc_pt_rocm.cc +++ b/tools/perf/ucc_pt_rocm.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (C) Advanced Micro Devices, Inc. 2022. ALL RIGHTS RESERVED. * * See file LICENSE for terms. @@ -16,7 +16,7 @@ ucc_pt_rocm_iface_t ucc_pt_rocm_iface = { #define LOAD_ROCM_SYM(_sym, _pt_sym) ({ \ void *h = dlsym(handle, _sym); \ - if ((error = dlerror()) != NULL) { \ + if (dlerror() != NULL) { \ return; \ } \ ucc_pt_rocm_iface. _pt_sym = \ @@ -25,7 +25,6 @@ ucc_pt_rocm_iface_t ucc_pt_rocm_iface = { void ucc_pt_rocm_init(void) { - char *error; void *handle; handle = dlopen ("libamdhip64.so", RTLD_LAZY);