Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEST: build ucc with hpcsdk #871

Merged
merged 1 commit into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/hpcsdk.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# CI workflow: configure and build UCC with the NVIDIA HPC SDK compilers
# (nvc / nvc++) inside the nvhpc container image.
name: HPC_SDK

on: [push, pull_request]

# Install locations of the communication libraries and CUDA toolkit that are
# passed to ./configure below.
env:
  HPCXDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/hpcx/latest/
  NCCLDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/nccl/
  CUDADIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/cuda/12.2/

jobs:
  build:
    runs-on: ubuntu-20.04
    # All steps run inside this container rather than directly on the runner.
    container:
      image: nvcr.io/nvidia/nvhpc:23.9-devel-cuda12.2-ubuntu22.04
    steps:
    - name: Install dependencies
      run: |
        apt-get update
        apt-get install -y --no-install-recommends libiberty-dev
    - uses: actions/checkout@v1
    - name: Build UCC
      # Enables the ucp, mlx5, cuda, self, nccl and sharp TLs; MPI, SHARP and
      # UCX are taken from HPCXDIR.
      run: |
        ./autogen.sh
        CC=nvc CXX=nvc++ ./configure --with-tls=ucp,mlx5,cuda,self,nccl,sharp --with-mpi=${HPCXDIR}/ompi --with-sharp=${HPCXDIR}/sharp --with-ucx=${HPCXDIR}/ucx --with-cuda=${CUDADIR} --with-nccl=${NCCLDIR} --with-nvcc-gencode="-gencode=arch=compute_80,code=sm_80"
        make -j`nproc` install
2 changes: 1 addition & 1 deletion src/coll_patterns/recursive_knomial.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ typedef struct ucc_knomial_pattern {
size_t block_size_counts;
size_t count; /* collective buffer size */
ucc_rank_t block_size;
size_t block_offset;
ptrdiff_t block_offset;
} ucc_knomial_pattern_t;

/**
Expand Down
9 changes: 5 additions & 4 deletions src/coll_patterns/sra_knomial.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ ucc_kn_seg_desc_compute(ucc_knomial_pattern_t *p, ucc_kn_seg_desc_t *seg,

static inline void
ucc_knx_block(ucc_rank_t rank, ucc_rank_t size, ucc_kn_radix_t radix,
size_t count, int iter, size_t *b_count, size_t *b_offset)
size_t count, int iter, size_t *b_count, ptrdiff_t *b_offset)
{
ucc_rank_t offset = 0;
ucc_rank_t block_count;
Expand Down Expand Up @@ -213,7 +213,7 @@ ucc_kn_agx_pattern_init(ucc_rank_t size, ucc_rank_t rank, ucc_kn_radix_t radix,

static inline void
ucc_kn_ag_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p,
size_t *seg_count, size_t *seg_offset)
size_t *seg_count, ptrdiff_t *seg_offset)
{
ucc_rank_t step_radix, seg_index;
ucc_kn_seg_desc_t s;
Expand Down Expand Up @@ -278,7 +278,7 @@ static inline void ucc_kn_rsx_pattern_init(ucc_rank_t size, ucc_rank_t rank,

static inline void
ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p,
size_t *peer_seg_count, size_t *peer_seg_offset)
size_t *peer_seg_count, ptrdiff_t *peer_seg_offset)
{
ucc_rank_t step_radix, seg_index;

Expand All @@ -305,7 +305,8 @@ ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p,

static inline void ucc_kn_rs_pattern_next_iter(ucc_knomial_pattern_t *p)
{
size_t offset, bs;
size_t bs;
ptrdiff_t offset;

ucc_kn_rs_pattern_peer_seg(p->rank, p, &bs, &offset);
p->block_size_counts = bs;
Expand Down
3 changes: 2 additions & 1 deletion src/components/tl/cuda/tl_cuda_topo.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,8 @@ static ucc_status_t ucc_tl_cuda_topo_graph_create(ucc_tl_cuda_topo_t *topo)
ucc_tl_cuda_topo_dev_type_t dev_type;
ucc_tl_cuda_device_pci_id_t pci_id;
ucc_tl_cuda_topo_node_t *node, *peer_node;
int num_gpus, num_nvlinks, link, i;
int num_nvlinks, link, i;
unsigned int num_gpus;
nvmlReturn_t nvml_st;

nvml_st = nvmlInit_v2();
Expand Down
3 changes: 1 addition & 2 deletions src/components/tl/mlx5/alltoall/alltoall_mkeys.c
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,6 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team,
if (!node->ops[i].send_mkeys) {
tl_error(lib, "failed to malloc");
goto err_malloc;
return UCC_ERR_NO_MEMORY;
}
node->ops[i].recv_mkeys = (struct mlx5dv_mkey **)ucc_malloc(
sizeof(struct mlx5dv_mkey *) * a2a->max_num_of_columns);
Expand All @@ -230,7 +229,7 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team,
status = create_master_key(node->sbgp->group_size + 1, a2a->pd,
&node->ops[i].send_mkeys[j], lib);
if (status != UCC_OK) {
tl_error(lib, " failed to create send masterkey [%d,%d]", i, j);
tl_error(lib, "failed to create send masterkey [%d,%d]", i, j);
goto err_create_mkey;
}
status = create_master_key(node->sbgp->group_size + 1, a2a->pd,
Expand Down
6 changes: 3 additions & 3 deletions src/components/tl/sharp/tl_sharp_coll.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ enum sharp_datatype ucc_to_sharp_dtype[] = {
[UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT64)] = SHARP_DTYPE_DOUBLE,
[UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT128)] = SHARP_DTYPE_NULL,
#if SHARP_API > SHARP_VERSION(3, 0)
[UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = SHARP_DTYPE_UNKNOWN,
[UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = SHARP_DTYPE_UNKNOWN,
[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_UNKNOWN,
[UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN,
[UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN,
[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN,
#else
[UCC_DT_PREDEFINED_ID(UCC_DT_INT8)] = SHARP_DTYPE_NULL,
[UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)] = SHARP_DTYPE_NULL,
Expand Down
4 changes: 2 additions & 2 deletions src/components/tl/sharp/tl_sharp_coll.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -10,7 +10,7 @@
#include "tl_sharp.h"

/* need to query for datatype support at runtime */
#define SHARP_DTYPE_UNKNOWN -1
#define SHARP_DTYPE_UNKNOWN 0xFFFF

extern enum sharp_datatype ucc_to_sharp_dtype[];

Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/sharp/tl_sharp_team.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context,

if (sharp_caps.support_mask.dtypes & UCC_BIT(SHARP_DTYPE_BFLOAT16)) {
tl_debug(ctx->super.super.lib, "enabling support for UCC_DT_BFLOAT16");
ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = UCC_DT_BFLOAT16;
ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_BFLOAT16;
} else {
tl_debug(ctx->super.super.lib, "disabling support for UCC_DT_BFLOAT16");
ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_NULL;
Expand Down
4 changes: 3 additions & 1 deletion src/components/tl/ucp/allgather/allgather_neighbor.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
static ucc_rank_t get_recv_from_rank(ucc_rank_t rank, ucc_rank_t size, int i)
{
const int i_parity = i % 2;
ucc_rank_t offset_at_step[2], recv_data_from;
int offset_at_step[2];
ucc_rank_t recv_data_from;

if (rank % 2) {
recv_data_from = (rank - 1 + size) % size;
offset_at_step[0] = (-2);
Expand Down
18 changes: 10 additions & 8 deletions src/components/tl/ucp/alltoallv/alltoallv_hybrid.c
Original file line number Diff line number Diff line change
Expand Up @@ -510,9 +510,11 @@ ucc_status_t post_recv(ucc_rank_t recvfrom, ucc_rank_t tsize, size_t dt_size,

/* check if we have space for maximum receive. If not, recycle */
if (meta->offset * dt_size + step_buf_size > tmp_buf_size) {
new_offset = receive_buffer_recycler(tsize, (int *)op_metadata, (int *)op_metadata + tsize,
seg_st, p_tmp_recv_region, dt_size, BytesForPacking,
step, user_rbuf, rdisps, trank, radix, node_edge_id);
new_offset = receive_buffer_recycler(tsize, (unsigned int *)op_metadata,
(int *)op_metadata + tsize,
seg_st, p_tmp_recv_region, dt_size,
BytesForPacking, step, user_rbuf,
rdisps, trank, radix, node_edge_id);
meta->offset = new_offset;
}
ucc_assert(meta->offset * dt_size + step_buf_size <= tmp_buf_size);
Expand Down Expand Up @@ -595,8 +597,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step,
temp_offset = PTR_OFFSET(temp_offset, cur_buf_length * dt_size);
} else {
/* data will be sent pairwise */
((int *)op_metadata)[i] = COUNT_DIRECT;
((int *)op_metadata)[i + tsize] = COUNT_DIRECT;
((int *)op_metadata)[i] = (int)COUNT_DIRECT;
((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT;
if (i < (step * radix)) {
int pairwise_src = (trank - i + tsize) % tsize;
if (rcounts[pairwise_src] > 0) {
Expand Down Expand Up @@ -636,8 +638,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step,
next_p = tsize;
}
} else {
((int *)op_metadata)[i] = COUNT_DIRECT;
((int *)op_metadata)[i + tsize] = COUNT_DIRECT;
((int *)op_metadata)[i] = (int)COUNT_DIRECT;
((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT;
if (i < (step * radix)) {
int pairwise_src = (trank - i + tsize) % tsize;
if (rcounts[pairwise_src] > 0) {
Expand Down Expand Up @@ -709,7 +711,7 @@ ucc_status_t pairwise_manager(ucc_rank_t trank, ucc_rank_t tsize,
int *r_disps = (int*)TASK_ARGS(task).dst.info_v.displacements;
int *scounts = (int*)TASK_ARGS(task).src.info_v.counts;
int *rcounts = (int*)TASK_ARGS(task).dst.info_v.counts;
int* cur = &task->alltoallv_hybrid.cur_out;
ucc_rank_t *cur = &task->alltoallv_hybrid.cur_out;
int chunk_num_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_pairwise_num_posts;
int chunk_byte_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_chunk_byte_limit;
ucc_status_t status;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ static inline void get_sbuf_rbuf(ucc_tl_ucp_task_t *task, size_t block_count,
size_t dt_size = ucc_dt_size(args->dst.info.datatype);
void *scratch = task->reduce_scatter_kn.scratch;
ucc_knomial_pattern_t *p = &task->reduce_scatter_kn.p;
size_t offset, local_seg_offset, local_seg_count;
size_t offset, local_seg_count;
ptrdiff_t local_seg_offset;

if (ucc_knomial_pattern_loop_first_iteration(p)) {
*sbuf = ((KN_NODE_PROXY == p->node_type) || UCC_IS_INPLACE(*args))
Expand Down
2 changes: 1 addition & 1 deletion src/ucc/api/ucc.h
Original file line number Diff line number Diff line change
Expand Up @@ -1337,7 +1337,7 @@ struct ucc_ep_map_cb {
* @ingroup UCC_TEAM_DT
*/
typedef enum {
UCC_EP_MAP_FULL = 1, /*!< The ep range of the team spans all eps from a context*/
UCC_EP_MAP_FULL = 1, /*!< The ep range of the team spans all eps from a context. */
UCC_EP_MAP_STRIDED = 2, /*!< The ep range of the team can be described by the 2 values: start, stride.*/
UCC_EP_MAP_ARRAY = 3, /*!< The ep range is given as an array of integers that map the ep in the team to
the team_context rank. */
Expand Down
4 changes: 3 additions & 1 deletion src/utils/ucc_coll_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,11 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size,
ucc_rank_t full_size, int need_free, int is64)
{
int is_const_stride = 0;
ucc_ep_map_t map = {0};
ucc_ep_map_t map;
int64_t stride;
ucc_rank_t i;

map.type = (ucc_ep_map_type_t)0;
map.ep_num = size;
if (size > 1) {
/* try to detect strided pattern */
Expand Down Expand Up @@ -303,6 +304,7 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size,
map.array.map = (void *)(*array);
map.array.elem_size = is64 ? sizeof(uint64_t) : sizeof(ucc_rank_t);
}

return map;
}

Expand Down
6 changes: 5 additions & 1 deletion src/utils/ucc_coll_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,11 @@
#define UCC_COLL_ARGS_ACTIVE_SET(_args) \
((_args)->mask & UCC_COLL_ARGS_FIELD_ACTIVE_SET)

#define UCC_MEM_TYPE_MASK_FULL -1
#define UCC_MEM_TYPE_MASK_FULL (UCC_BIT(UCC_MEMORY_TYPE_HOST) | \
UCC_BIT(UCC_MEMORY_TYPE_CUDA) | \
UCC_BIT(UCC_MEMORY_TYPE_CUDA_MANAGED) | \
UCC_BIT(UCC_MEMORY_TYPE_ROCM) | \
UCC_BIT(UCC_MEMORY_TYPE_ROCM_MANAGED))

static inline int ucc_coll_args_is_reduction(ucc_coll_type_t ct)
{
Expand Down
17 changes: 10 additions & 7 deletions src/utils/ucc_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,25 +86,28 @@ static inline int ucc_check_range(char *range_str, ucc_rank_t *begin,
char **range = ucc_str_split(range_str, "-");
char *str_end;
unsigned n_range;
long pbegin, pend;

if (!range) {
goto split_err;
}

n_range = ucc_str_split_count(range);
*begin = (size_t) strtol(range[0], &str_end, 10);
*end = *begin;
pbegin = strtol(range[0], &str_end, 10);
pend = pbegin;

if (n_range > 2 || *str_end != '\0' || *begin < 0) {
if (n_range > 2 || *str_end != '\0' || pbegin < 0) {
goto val_err;
}

if (n_range == 2) {
*end = (size_t) strtol(range[1], &str_end, 10);
if (*str_end != '\0' || *end < 0) {
pend = strtol(range[1], &str_end, 10);
if (*str_end != '\0' || pend < 0) {
goto val_err;
}
}
*begin = (ucc_rank_t)pbegin;
*end = (ucc_rank_t)pend;
ucc_str_split_free(range);
return 1;

Expand Down Expand Up @@ -852,7 +855,7 @@ int ucc_config_sscanf_uint_ranged(const char *buf, void *dest,
if (!r) {
goto err_tokens;
}
r->mtypes = -1; //mask all types
r->mtypes = UCC_MEM_TYPE_MASK_FULL;
r->start = 0;
r->end = SIZE_MAX;

Expand Down Expand Up @@ -905,7 +908,7 @@ int ucc_config_sprintf_uint_ranged(char *buf, size_t max, const void *src,
ucc_list_for_each(r, &s->ranges, list_elem) {
ucs_memunits_to_str(r->start, tmp_start, tmp_max);
ucs_memunits_to_str(r->end, tmp_end, tmp_max);
if (r->mtypes == -1) {
if (r->mtypes == UCC_MEM_TYPE_MASK_FULL) {
ucc_snprintf_safe(buf, max, "%s-%s:%u", tmp_start, tmp_end,
r->value);
} else {
Expand Down
1 change: 0 additions & 1 deletion test/mpi/test_mpi.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ static inline const char* skip_str(test_skip_cause_t s) {
default:
return "unknown";
}
return NULL;
}

static inline const char* team_str(ucc_test_mpi_team_t t) {
Expand Down
5 changes: 2 additions & 3 deletions tools/perf/ucc_pt_cuda.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -15,7 +15,7 @@ ucc_pt_cuda_iface_t ucc_pt_cuda_iface = {

#define LOAD_CUDA_SYM(_sym, _pt_sym) ({ \
void *h = dlsym(handle, _sym); \
if ((error = dlerror()) != NULL) { \
if (dlerror() != NULL) { \
return; \
} \
ucc_pt_cuda_iface. _pt_sym = \
Expand All @@ -24,7 +24,6 @@ ucc_pt_cuda_iface_t ucc_pt_cuda_iface = {

void ucc_pt_cuda_init(void)
{
char *error;
void *handle;

handle = dlopen ("libcudart.so", RTLD_LAZY);
Expand Down
5 changes: 2 additions & 3 deletions tools/perf/ucc_pt_rocm.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (C) Advanced Micro Devices, Inc. 2022. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
Expand All @@ -16,7 +16,7 @@ ucc_pt_rocm_iface_t ucc_pt_rocm_iface = {

#define LOAD_ROCM_SYM(_sym, _pt_sym) ({ \
void *h = dlsym(handle, _sym); \
if ((error = dlerror()) != NULL) { \
if (dlerror() != NULL) { \
return; \
} \
ucc_pt_rocm_iface. _pt_sym = \
Expand All @@ -25,7 +25,6 @@ ucc_pt_rocm_iface_t ucc_pt_rocm_iface = {

void ucc_pt_rocm_init(void)
{
char *error;
void *handle;

handle = dlopen ("libamdhip64.so", RTLD_LAZY);
Expand Down
Loading