From fce4f0b16b4541c741abf184eba1990197c927a4 Mon Sep 17 00:00:00 2001
From: Sergey Lebedev <sergeyle@nvidia.com>
Date: Fri, 3 Nov 2023 16:35:48 +0100
Subject: [PATCH] TEST: build ucc with hpcsdk

---
 .github/workflows/hpcsdk.yaml                 | 25 +++++++++++++++++++
 src/coll_patterns/recursive_knomial.h         |  2 +-
 src/coll_patterns/sra_knomial.h               |  9 ++++---
 src/components/tl/cuda/tl_cuda_topo.c         |  3 ++-
 .../tl/mlx5/alltoall/alltoall_mkeys.c         |  3 +--
 src/components/tl/sharp/tl_sharp_coll.c       |  6 ++---
 src/components/tl/sharp/tl_sharp_coll.h       |  4 +--
 src/components/tl/sharp/tl_sharp_team.c       |  2 +-
 .../tl/ucp/allgather/allgather_neighbor.c     |  4 ++-
 .../tl/ucp/alltoallv/alltoallv_hybrid.c       | 18 +++++++------
 .../reduce_scatter/reduce_scatter_knomial.c   |  3 ++-
 src/ucc/api/ucc.h                             |  2 +-
 src/utils/ucc_coll_utils.c                    |  4 ++-
 src/utils/ucc_coll_utils.h                    |  6 ++++-
 src/utils/ucc_parser.c                        | 17 +++++++------
 test/mpi/test_mpi.h                           |  1 -
 tools/perf/ucc_pt_cuda.cc                     |  5 ++--
 tools/perf/ucc_pt_rocm.cc                     |  5 ++--
 18 files changed, 78 insertions(+), 41 deletions(-)
 create mode 100644 .github/workflows/hpcsdk.yaml

diff --git a/.github/workflows/hpcsdk.yaml b/.github/workflows/hpcsdk.yaml
new file mode 100644
index 0000000000..77188cd96a
--- /dev/null
+++ b/.github/workflows/hpcsdk.yaml
@@ -0,0 +1,25 @@
+name: HPC_SDK  # builds UCC with the NVIDIA HPC SDK compilers (nvc / nvc++)
+
+on: [push, pull_request]
+
+env:  # install prefixes passed to configure; presumably shipped in the container image below - verify on image bump
+  HPCXDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/hpcx/latest/
+  NCCLDIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/comm_libs/12.2/nccl/
+  CUDADIR: /opt/nvidia/hpc_sdk/Linux_x86_64/2023/cuda/12.2/
+
+jobs:
+  build:
+    runs-on: ubuntu-22.04  # ubuntu-20.04 hosted runners are retired; steps run inside the container anyway
+    container:
+      image: nvcr.io/nvidia/nvhpc:23.9-devel-cuda12.2-ubuntu22.04
+    steps:
+    - name: Install dependencies
+      run: |
+        apt-get update
+        apt-get install -y --no-install-recommends libiberty-dev
+    - uses: actions/checkout@v1  # NOTE(review): v1 is no longer maintained; confirm a newer major works in this container before bumping
+    - name: Build UCC
+      run: |
+        ./autogen.sh
+        CC=nvc CXX=nvc++ ./configure --with-tls=ucp,mlx5,cuda,self,nccl,sharp --with-mpi=${HPCXDIR}/ompi --with-sharp=${HPCXDIR}/sharp --with-ucx=${HPCXDIR}/ucx --with-cuda=${CUDADIR} --with-nccl=${NCCLDIR} --with-nvcc-gencode="-gencode=arch=compute_80,code=sm_80"
+        make -j`nproc` install
diff --git a/src/coll_patterns/recursive_knomial.h b/src/coll_patterns/recursive_knomial.h
index 4f8981957c..ebf9a0981b 100644
--- a/src/coll_patterns/recursive_knomial.h
+++ b/src/coll_patterns/recursive_knomial.h
@@ -50,7 +50,7 @@ typedef struct ucc_knomial_pattern {
     size_t         block_size_counts;
     size_t         count;         /* collective buffer size */
     ucc_rank_t     block_size;
-    size_t         block_offset;
+    ptrdiff_t      block_offset;
 } ucc_knomial_pattern_t;
 
 /**
diff --git a/src/coll_patterns/sra_knomial.h b/src/coll_patterns/sra_knomial.h
index 1574389632..2f63a243f2 100644
--- a/src/coll_patterns/sra_knomial.h
+++ b/src/coll_patterns/sra_knomial.h
@@ -159,7 +159,7 @@ ucc_kn_seg_desc_compute(ucc_knomial_pattern_t *p, ucc_kn_seg_desc_t *seg,
 
 static inline void
 ucc_knx_block(ucc_rank_t rank, ucc_rank_t size, ucc_kn_radix_t radix,
-              size_t count, int iter, size_t *b_count, size_t *b_offset)
+              size_t count, int iter, size_t *b_count, ptrdiff_t *b_offset)
 {
     ucc_rank_t            offset = 0;
     ucc_rank_t            block_count;
@@ -213,7 +213,7 @@ ucc_kn_agx_pattern_init(ucc_rank_t size, ucc_rank_t rank, ucc_kn_radix_t radix,
 
 static inline void
 ucc_kn_ag_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p,
-                           size_t *seg_count, size_t *seg_offset)
+                           size_t *seg_count, ptrdiff_t *seg_offset)
 {
     ucc_rank_t step_radix, seg_index;
     ucc_kn_seg_desc_t s;
@@ -278,7 +278,7 @@ static inline void ucc_kn_rsx_pattern_init(ucc_rank_t size, ucc_rank_t rank,
 
 static inline void
 ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p,
-                           size_t *peer_seg_count, size_t *peer_seg_offset)
+                           size_t *peer_seg_count, ptrdiff_t *peer_seg_offset)
 {
     ucc_rank_t step_radix, seg_index;
 
@@ -305,7 +305,8 @@ ucc_kn_rs_pattern_peer_seg(ucc_rank_t peer, ucc_knomial_pattern_t *p,
 
 static inline void ucc_kn_rs_pattern_next_iter(ucc_knomial_pattern_t *p)
 {
-    size_t offset, bs;
+    size_t bs;
+    ptrdiff_t offset;
 
     ucc_kn_rs_pattern_peer_seg(p->rank, p, &bs, &offset);
     p->block_size_counts = bs;
diff --git a/src/components/tl/cuda/tl_cuda_topo.c b/src/components/tl/cuda/tl_cuda_topo.c
index 96862e921e..a0f54d57e6 100644
--- a/src/components/tl/cuda/tl_cuda_topo.c
+++ b/src/components/tl/cuda/tl_cuda_topo.c
@@ -220,7 +220,8 @@ static ucc_status_t ucc_tl_cuda_topo_graph_create(ucc_tl_cuda_topo_t *topo)
     ucc_tl_cuda_topo_dev_type_t dev_type;
     ucc_tl_cuda_device_pci_id_t pci_id;
     ucc_tl_cuda_topo_node_t *node, *peer_node;
-    int num_gpus, num_nvlinks, link, i;
+    int num_nvlinks, link, i;
+    unsigned int num_gpus;
     nvmlReturn_t nvml_st;
 
     nvml_st = nvmlInit_v2();
diff --git a/src/components/tl/mlx5/alltoall/alltoall_mkeys.c b/src/components/tl/mlx5/alltoall/alltoall_mkeys.c
index 7dd90d49b8..0fa197e6c7 100644
--- a/src/components/tl/mlx5/alltoall/alltoall_mkeys.c
+++ b/src/components/tl/mlx5/alltoall/alltoall_mkeys.c
@@ -217,7 +217,6 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team,
         if (!node->ops[i].send_mkeys) {
             tl_error(lib, "failed to malloc");
             goto err_malloc;
-            return UCC_ERR_NO_MEMORY;
         }
         node->ops[i].recv_mkeys = (struct mlx5dv_mkey **)ucc_malloc(
             sizeof(struct mlx5dv_mkey *) * a2a->max_num_of_columns);
@@ -230,7 +229,7 @@ ucc_status_t ucc_tl_mlx5_init_mkeys(ucc_tl_mlx5_team_t *team,
             status = create_master_key(node->sbgp->group_size + 1, a2a->pd,
                                        &node->ops[i].send_mkeys[j], lib);
             if (status != UCC_OK) {
-                tl_error(lib, " failed to create send masterkey [%d,%d]", i, j);
+                tl_error(lib, "failed to create send masterkey [%d,%d]", i, j);
                 goto err_create_mkey;
             }
             status = create_master_key(node->sbgp->group_size + 1, a2a->pd,
diff --git a/src/components/tl/sharp/tl_sharp_coll.c b/src/components/tl/sharp/tl_sharp_coll.c
index d246fcc563..1dcf2465c1 100644
--- a/src/components/tl/sharp/tl_sharp_coll.c
+++ b/src/components/tl/sharp/tl_sharp_coll.c
@@ -26,9 +26,9 @@ enum sharp_datatype ucc_to_sharp_dtype[] = {
     [UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT64)]          = SHARP_DTYPE_DOUBLE,
     [UCC_DT_PREDEFINED_ID(UCC_DT_FLOAT128)]         = SHARP_DTYPE_NULL,
 #if SHARP_API > SHARP_VERSION(3, 0)
-    [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)]             = SHARP_DTYPE_UNKNOWN,
-    [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)]            = SHARP_DTYPE_UNKNOWN,
-    [UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)]         = SHARP_DTYPE_UNKNOWN,
+    [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)]             = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN,
+    [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)]            = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN,
+    [UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)]         = (enum sharp_datatype)SHARP_DTYPE_UNKNOWN,
 #else
     [UCC_DT_PREDEFINED_ID(UCC_DT_INT8)]             = SHARP_DTYPE_NULL,
     [UCC_DT_PREDEFINED_ID(UCC_DT_UINT8)]            = SHARP_DTYPE_NULL,
diff --git a/src/components/tl/sharp/tl_sharp_coll.h b/src/components/tl/sharp/tl_sharp_coll.h
index 4b0dba17b6..6b12c69900 100644
--- a/src/components/tl/sharp/tl_sharp_coll.h
+++ b/src/components/tl/sharp/tl_sharp_coll.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
@@ -10,7 +10,7 @@
 #include "tl_sharp.h"
 
 /* need to query for datatype support at runtime */
-#define SHARP_DTYPE_UNKNOWN -1
+#define SHARP_DTYPE_UNKNOWN 0xFFFF
 
 extern enum sharp_datatype ucc_to_sharp_dtype[];
 
diff --git a/src/components/tl/sharp/tl_sharp_team.c b/src/components/tl/sharp/tl_sharp_team.c
index fe4a5875fb..6b8f369c7c 100644
--- a/src/components/tl/sharp/tl_sharp_team.c
+++ b/src/components/tl/sharp/tl_sharp_team.c
@@ -117,7 +117,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context,
 
         if (sharp_caps.support_mask.dtypes & UCC_BIT(SHARP_DTYPE_BFLOAT16)) {
             tl_debug(ctx->super.super.lib, "enabling support for UCC_DT_BFLOAT16");
-            ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = UCC_DT_BFLOAT16;
+            ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_BFLOAT16;
         } else {
             tl_debug(ctx->super.super.lib, "disabling support for UCC_DT_BFLOAT16");
             ucc_to_sharp_dtype[UCC_DT_PREDEFINED_ID(UCC_DT_BFLOAT16)] = SHARP_DTYPE_NULL;
diff --git a/src/components/tl/ucp/allgather/allgather_neighbor.c b/src/components/tl/ucp/allgather/allgather_neighbor.c
index 771ba2d3b8..534c197e4e 100644
--- a/src/components/tl/ucp/allgather/allgather_neighbor.c
+++ b/src/components/tl/ucp/allgather/allgather_neighbor.c
@@ -15,7 +15,9 @@
 static ucc_rank_t get_recv_from_rank(ucc_rank_t rank, ucc_rank_t size, int i)
 {
     const int  i_parity = i % 2;
-    ucc_rank_t offset_at_step[2], recv_data_from;
+    int offset_at_step[2];
+    ucc_rank_t recv_data_from;
+
     if (rank % 2) {
         recv_data_from    = (rank - 1 + size) % size;
         offset_at_step[0] = (-2);
diff --git a/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c b/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c
index 61b130eaa5..7b8c7b7b67 100644
--- a/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c
+++ b/src/components/tl/ucp/alltoallv/alltoallv_hybrid.c
@@ -510,9 +510,11 @@ ucc_status_t post_recv(ucc_rank_t recvfrom, ucc_rank_t tsize, size_t dt_size,
 
     /* check if we have space for maximum recieve. If not, recycle */
     if (meta->offset * dt_size + step_buf_size > tmp_buf_size) {
-        new_offset = receive_buffer_recycler(tsize, (int *)op_metadata, (int *)op_metadata + tsize,
-                                             seg_st, p_tmp_recv_region, dt_size, BytesForPacking,
-                                             step, user_rbuf, rdisps, trank, radix, node_edge_id);
+        new_offset = receive_buffer_recycler(tsize, (unsigned int *)op_metadata,
+                                             (int *)op_metadata + tsize,
+                                             seg_st, p_tmp_recv_region, dt_size,
+                                             BytesForPacking, step, user_rbuf,
+                                             rdisps, trank, radix, node_edge_id);
         meta->offset = new_offset;
     }
     ucc_assert(meta->offset * dt_size + step_buf_size <= tmp_buf_size);
@@ -595,8 +597,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step,
                     temp_offset = PTR_OFFSET(temp_offset, cur_buf_length * dt_size);
                 } else {
                     /* data will be sent pairwise */
-                    ((int *)op_metadata)[i]         = COUNT_DIRECT;
-                    ((int *)op_metadata)[i + tsize] = COUNT_DIRECT;
+                    ((int *)op_metadata)[i]         = (int)COUNT_DIRECT;
+                    ((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT;
                     if (i < (step * radix)) {
                         int pairwise_src = (trank - i + tsize) % tsize;
                         if (rcounts[pairwise_src] > 0) {
@@ -636,8 +638,8 @@ static ucc_status_t complete_current_step_receives(ucc_rank_t tsize, int step,
                         next_p = tsize;
                     }
                 } else {
-                    ((int *)op_metadata)[i]         = COUNT_DIRECT;
-                    ((int *)op_metadata)[i + tsize] = COUNT_DIRECT;
+                    ((int *)op_metadata)[i]         = (int)COUNT_DIRECT;
+                    ((int *)op_metadata)[i + tsize] = (int)COUNT_DIRECT;
                     if (i < (step * radix)) {
                         int pairwise_src = (trank - i + tsize) % tsize;
                         if (rcounts[pairwise_src] > 0) {
@@ -709,7 +711,7 @@ ucc_status_t pairwise_manager(ucc_rank_t trank, ucc_rank_t tsize,
     int               *r_disps          = (int*)TASK_ARGS(task).dst.info_v.displacements;
     int               *scounts          = (int*)TASK_ARGS(task).src.info_v.counts;
     int               *rcounts          = (int*)TASK_ARGS(task).dst.info_v.counts;
-    int*               cur              = &task->alltoallv_hybrid.cur_out;
+    ucc_rank_t        *cur              = &task->alltoallv_hybrid.cur_out;
     int                chunk_num_limit  = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_pairwise_num_posts;
     int                chunk_byte_limit = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_hybrid_chunk_byte_limit;
     ucc_status_t status;
diff --git a/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c b/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c
index 11a2abc859..ca5457dfb4 100644
--- a/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c
+++ b/src/components/tl/ucp/reduce_scatter/reduce_scatter_knomial.c
@@ -22,7 +22,8 @@ static inline void get_sbuf_rbuf(ucc_tl_ucp_task_t *task, size_t block_count,
     size_t                 dt_size   = ucc_dt_size(args->dst.info.datatype);
     void                  *scratch   = task->reduce_scatter_kn.scratch;
     ucc_knomial_pattern_t *p         = &task->reduce_scatter_kn.p;
-    size_t offset, local_seg_offset, local_seg_count;
+    size_t offset, local_seg_count;
+    ptrdiff_t local_seg_offset;
 
     if (ucc_knomial_pattern_loop_first_iteration(p)) {
         *sbuf = ((KN_NODE_PROXY ==  p->node_type) || UCC_IS_INPLACE(*args))
diff --git a/src/ucc/api/ucc.h b/src/ucc/api/ucc.h
index c7c0ce10b0..02e5e11540 100644
--- a/src/ucc/api/ucc.h
+++ b/src/ucc/api/ucc.h
@@ -1337,7 +1337,7 @@ struct ucc_ep_map_cb {
  *  @ingroup UCC_TEAM_DT
  */
 typedef enum {
-    UCC_EP_MAP_FULL     = 1, /*!< The ep range of the team  spans all eps from a context*/
+    UCC_EP_MAP_FULL     = 1, /*!< The ep range of the team spans all eps from a context. */
     UCC_EP_MAP_STRIDED  = 2, /*!< The ep range of the team can be described by the 2 values: start, stride.*/
     UCC_EP_MAP_ARRAY    = 3, /*!< The ep range is given as an array of intergers that map the ep in the team to
                                        the team_context rank. */
diff --git a/src/utils/ucc_coll_utils.c b/src/utils/ucc_coll_utils.c
index 3921f1262e..2f3b858cbb 100644
--- a/src/utils/ucc_coll_utils.c
+++ b/src/utils/ucc_coll_utils.c
@@ -266,10 +266,11 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size,
                               ucc_rank_t full_size, int need_free, int is64)
 {
     int          is_const_stride = 0;
-    ucc_ep_map_t map             = {0};
+    ucc_ep_map_t map             = {.type = (ucc_ep_map_type_t)0};
     int64_t      stride;
     ucc_rank_t   i;
 
+    /* map starts fully zeroed; type-specific fields are filled in below */
     map.ep_num = size;
     if (size > 1) {
         /* try to detect strided pattern */
@@ -303,6 +304,7 @@ ucc_ep_map_from_array_generic(void **array, ucc_rank_t size,
         map.array.map       = (void *)(*array);
         map.array.elem_size = is64 ? sizeof(uint64_t) : sizeof(ucc_rank_t);
     }
+
     return map;
 }
 
diff --git a/src/utils/ucc_coll_utils.h b/src/utils/ucc_coll_utils.h
index 2d3a919f08..ead7fe4081 100644
--- a/src/utils/ucc_coll_utils.h
+++ b/src/utils/ucc_coll_utils.h
@@ -71,7 +71,11 @@
 #define UCC_COLL_ARGS_ACTIVE_SET(_args)                                        \
     ((_args)->mask & UCC_COLL_ARGS_FIELD_ACTIVE_SET)
 
-#define UCC_MEM_TYPE_MASK_FULL -1
+#define UCC_MEM_TYPE_MASK_FULL (UCC_BIT(UCC_MEMORY_TYPE_HOST) |                \
+                                UCC_BIT(UCC_MEMORY_TYPE_CUDA) |                \
+                                UCC_BIT(UCC_MEMORY_TYPE_CUDA_MANAGED) |        \
+                                UCC_BIT(UCC_MEMORY_TYPE_ROCM) |                \
+                                UCC_BIT(UCC_MEMORY_TYPE_ROCM_MANAGED))
 
 static inline int ucc_coll_args_is_reduction(ucc_coll_type_t ct)
 {
diff --git a/src/utils/ucc_parser.c b/src/utils/ucc_parser.c
index fff69e47c6..6db8ef52f8 100644
--- a/src/utils/ucc_parser.c
+++ b/src/utils/ucc_parser.c
@@ -86,25 +86,28 @@ static inline int ucc_check_range(char *range_str, ucc_rank_t *begin,
     char   **range = ucc_str_split(range_str, "-");
     char    *str_end;
     unsigned n_range;
+    long pbegin, pend;
 
     if (!range) {
         goto split_err;
     }
 
     n_range = ucc_str_split_count(range);
-    *begin  = (size_t) strtol(range[0], &str_end, 10);
-    *end    = *begin;
+    pbegin  = strtol(range[0], &str_end, 10);
+    pend    = pbegin;
 
-    if (n_range > 2 || *str_end != '\0' || *begin < 0) {
+    if (n_range > 2 || *str_end != '\0' || pbegin < 0) {
         goto val_err;
     }
 
     if (n_range == 2) {
-        *end = (size_t) strtol(range[1], &str_end, 10);
-        if (*str_end != '\0' || *end < 0) {
+        pend = strtol(range[1], &str_end, 10);
+        if (*str_end != '\0' || pend < 0) {
             goto val_err;
         }
     }
+    *begin = (ucc_rank_t)pbegin;
+    *end = (ucc_rank_t)pend;
     ucc_str_split_free(range);
     return 1;
 
@@ -852,7 +855,7 @@ int ucc_config_sscanf_uint_ranged(const char *buf, void *dest,
             if (!r) {
                 goto err_tokens;
             }
-            r->mtypes = -1; //mask all types
+            r->mtypes = UCC_MEM_TYPE_MASK_FULL;
             r->start  = 0;
             r->end    = SIZE_MAX;
 
@@ -905,7 +908,7 @@ int ucc_config_sprintf_uint_ranged(char *buf, size_t max, const void *src,
     ucc_list_for_each(r, &s->ranges, list_elem) {
         ucs_memunits_to_str(r->start, tmp_start, tmp_max);
         ucs_memunits_to_str(r->end, tmp_end, tmp_max);
-        if (r->mtypes == -1) {
+        if (r->mtypes == UCC_MEM_TYPE_MASK_FULL) {
             ucc_snprintf_safe(buf, max, "%s-%s:%u", tmp_start, tmp_end,
                               r->value);
         } else {
diff --git a/test/mpi/test_mpi.h b/test/mpi/test_mpi.h
index d96a08a3f9..4196d85d52 100644
--- a/test/mpi/test_mpi.h
+++ b/test/mpi/test_mpi.h
@@ -145,7 +145,6 @@ static inline const char* skip_str(test_skip_cause_t s) {
     default:
         return "unknown";
     }
-    return NULL;
 }
 
 static inline const char* team_str(ucc_test_mpi_team_t t) {
diff --git a/tools/perf/ucc_pt_cuda.cc b/tools/perf/ucc_pt_cuda.cc
index fdf17457be..bcadabc955 100644
--- a/tools/perf/ucc_pt_cuda.cc
+++ b/tools/perf/ucc_pt_cuda.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
@@ -15,7 +15,7 @@ ucc_pt_cuda_iface_t ucc_pt_cuda_iface = {
 
 #define LOAD_CUDA_SYM(_sym, _pt_sym) ({                                    \
             void *h = dlsym(handle, _sym);                                 \
-            if ((error = dlerror()) != NULL)  {                            \
+            if (dlerror() != NULL)  {                                      \
                 return;                                                    \
             }                                                              \
             ucc_pt_cuda_iface. _pt_sym =                                   \
@@ -24,7 +24,6 @@ ucc_pt_cuda_iface_t ucc_pt_cuda_iface = {
 
 void ucc_pt_cuda_init(void)
 {
-    char *error;
     void *handle;
 
     handle = dlopen ("libcudart.so", RTLD_LAZY);
diff --git a/tools/perf/ucc_pt_rocm.cc b/tools/perf/ucc_pt_rocm.cc
index 2e1f121b8f..2851be0deb 100644
--- a/tools/perf/ucc_pt_rocm.cc
+++ b/tools/perf/ucc_pt_rocm.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * Copyright (C) Advanced Micro Devices, Inc. 2022. ALL RIGHTS RESERVED.
  *
  * See file LICENSE for terms.
@@ -16,7 +16,7 @@ ucc_pt_rocm_iface_t ucc_pt_rocm_iface = {
 
 #define LOAD_ROCM_SYM(_sym, _pt_sym) ({                                    \
             void *h = dlsym(handle, _sym);                                 \
-            if ((error = dlerror()) != NULL)  {                            \
+            if (dlerror() != NULL)  {                                      \
                 return;                                                    \
             }                                                              \
             ucc_pt_rocm_iface. _pt_sym =                                   \
@@ -25,7 +25,6 @@ ucc_pt_rocm_iface_t ucc_pt_rocm_iface = {
 
 void ucc_pt_rocm_init(void)
 {
-    char *error;
     void *handle;
 
     handle = dlopen ("libamdhip64.so", RTLD_LAZY);