Skip to content

Commit

Permalink
CL/DOCA_UROM: Add CL_DOCA_UROM
Browse files Browse the repository at this point in the history
  • Loading branch information
Nicholas Sarkauskas authored and nsarka committed Jun 24, 2024
1 parent 5e4b986 commit 1ae5756
Show file tree
Hide file tree
Showing 25 changed files with 6,329 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/codestyle.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
fi
fi
H1="CODESTYLE|REVIEW|CORE|UTIL|TEST|API|DOCS|TOOLS|BUILD|MC|EC|SCHEDULE|TOPO"
H2="CI|CL/|TL/|MC/|EC/|UCP|SHM|NCCL|SHARP|BASIC|HIER|CUDA|CPU|EE|RCCL|ROCM|SELF|MLX5"
H2="CI|CL/|TL/|MC/|EC/|UCP|SHM|NCCL|SHARP|BASIC|HIER|DOCA_UROM|CUDA|CPU|EE|RCCL|ROCM|SELF|MLX5"
if ! echo $msg | grep -qP '^Merge |^'"(($H1)|($H2))"'+: \w'
then
echo "Wrong header"
Expand Down
1 change: 1 addition & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
if !DOCS_ONLY
SUBDIRS = \
src \
contrib \
tools/info \
cmake

Expand Down
75 changes: 75 additions & 0 deletions config/m4/doca_urom.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# See file LICENSE for terms.
#

# CHECK_DOCA_UROM
# ---------------
# Probe for the DOCA UROM SDK: the doca_urom.h header and the
# doca_urom_service_create symbol in libdoca_urom.
#
# User input:
#   --with-doca_urom=DIR    probe DIR/include and DIR/lib64
#   --with-doca_urom=no     skip the probe entirely
#   no option               guess mode: probe the default search paths and
#                           only warn when nothing is found
#   with_doca_urom_libdir   when set it replaces the library directory
#
# Results:
#   doca_urom_happy         yes or no
#   DOCA_UROM_CPPFLAGS      substituted on success when a directory was given
#   DOCA_UROM_LDFLAGS       substituted on success when a libdir is known
#   DOCA_UROM_LIBADD        substituted on success
#   HAVE_DOCA_UROM          config.h define on success plus automake conditional
#
# Failing to find the SDK is fatal unless the macro runs in guess mode.
# The body runs once per configure run and is guarded by doca_urom_checked.
AC_DEFUN([CHECK_DOCA_UROM],[
AS_IF([test "x$doca_urom_checked" != "xyes"],[
    doca_urom_happy="no"

    AC_ARG_WITH([doca_urom],
        [AS_HELP_STRING([--with-doca_urom=(DIR)], [Enable the use of DOCA_UROM (default is guess).])],
        [], [with_doca_urom=guess])

    AS_IF([test "x$with_doca_urom" != "xno"],
    [
        # Probe with temporary flags and restore the caller values afterwards
        save_CPPFLAGS="$CPPFLAGS"
        save_LDFLAGS="$LDFLAGS"

        AS_IF([test ! -z "$with_doca_urom" -a "x$with_doca_urom" != "xyes" -a "x$with_doca_urom" != "xguess"],
        [
            # Quoted so a value containing whitespace does not break the test command
            AS_IF([test ! -d "$with_doca_urom"],
                  [AC_MSG_ERROR([Provided "--with-doca_urom=${with_doca_urom}" location does not exist])])
            check_doca_urom_dir="$with_doca_urom"
            check_doca_urom_libdir="$with_doca_urom/lib64"
            CPPFLAGS="-I$with_doca_urom/include $UCS_CPPFLAGS $save_CPPFLAGS"
            LDFLAGS="-L$check_doca_urom_libdir $save_LDFLAGS"
        ])

        AS_IF([test ! -z "$with_doca_urom_libdir" -a "x$with_doca_urom_libdir" != "xyes"],
        [
            check_doca_urom_libdir="$with_doca_urom_libdir"
            LDFLAGS="-L$check_doca_urom_libdir $save_LDFLAGS"
        ])

        AC_CHECK_HEADERS([doca_urom.h],
        [
            AC_CHECK_LIB([doca_urom], [doca_urom_service_create],
            [
                doca_urom_happy="yes"
            ],
            [
                AC_MSG_NOTICE([CPPFLAGS: $CPPFLAGS])
                doca_urom_happy="no"
            ], [-ldoca_common -ldoca_argp -ldoca_urom])
        ],
        [
            doca_urom_happy="no"
        ])

        AS_IF([test "x$doca_urom_happy" = "xyes"],
        [
            AS_IF([test "x$check_doca_urom_dir" != "x"],
            [
                AC_MSG_RESULT([DOCA_UROM dir: $check_doca_urom_dir])
                # NOTE(review): doca_urom_old_headers is not assigned anywhere
                # in this macro - confirm whether it is still needed
                AC_SUBST(DOCA_UROM_CPPFLAGS, "-I$check_doca_urom_dir/include/ $doca_urom_old_headers")
            ])
            AS_IF([test "x$check_doca_urom_libdir" != "x"],
            [
                AC_SUBST(DOCA_UROM_LDFLAGS, "-L$check_doca_urom_libdir")
            ])
            AC_SUBST(DOCA_UROM_LIBADD, "-ldoca_common -ldoca_argp -ldoca_urom")
            AC_DEFINE([HAVE_DOCA_UROM], 1, [Enable DOCA_UROM support])
        ],
        [
            AS_IF([test "x$with_doca_urom" != "xguess"],
            [
                AC_MSG_ERROR([DOCA_UROM support is requested but DOCA_UROM packages cannot be found! $CPPFLAGS $LDFLAGS])
            ],
            [
                AC_MSG_WARN([DOCA_UROM not found])
            ])
        ])

        CPPFLAGS="$save_CPPFLAGS"
        LDFLAGS="$save_LDFLAGS"
    ],
    [
        AC_MSG_WARN([DOCA_UROM was explicitly disabled])
    ])

    doca_urom_checked=yes
    AM_CONDITIONAL([HAVE_DOCA_UROM], [test "x$doca_urom_happy" != xno])
])])
8 changes: 8 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ AS_IF([test "x$with_docs_only" = xyes],
AM_CONDITIONAL([HAVE_IBVERBS],[false])
AM_CONDITIONAL([HAVE_RDMACM],[false])
AM_CONDITIONAL([HAVE_MLX5DV],[false])
AM_CONDITIONAL([HAVE_DOCA_UROM], [false])
],
[
AM_CONDITIONAL([DOCS_ONLY], [false])
Expand All @@ -172,6 +173,7 @@ AS_IF([test "x$with_docs_only" = xyes],
m4_include([config/m4/cuda.m4])
m4_include([config/m4/nccl.m4])
m4_include([config/m4/rocm.m4])
m4_include([config/m4/doca_urom.m4])
m4_include([config/m4/rccl.m4])
m4_include([config/m4/sharp.m4])
m4_include([config/m4/mpi.m4])
Expand Down Expand Up @@ -205,6 +207,9 @@ AS_IF([test "x$with_docs_only" = xyes],
mc_modules="${mc_modules}:rocm"
fi
CHECK_DOCA_UROM
AC_MSG_RESULT([DOCA_UROM support: $doca_urom_happy])
CHECK_GTEST
AC_MSG_RESULT([GTEST support: $gtest_happy])
Expand All @@ -224,11 +229,13 @@ LDFLAGS="$LDFLAGS $UCS_LDFLAGS $UCS_LIBADD"
CHECK_TL_COLL_PLUGINS
AC_CONFIG_FILES([
Makefile
contrib/Makefile
src/Makefile
src/ucc/api/ucc_version.h
src/core/ucc_version.c
src/components/cl/basic/Makefile
src/components/cl/hier/Makefile
src/components/cl/doca_urom/Makefile
src/components/mc/cpu/Makefile
src/components/mc/cuda/Makefile
src/components/ec/cpu/Makefile
Expand Down Expand Up @@ -265,6 +272,7 @@ AC_MSG_NOTICE([ C++ compiler: ${CXX} ${CXXFLAGS} ${BASE_CXXFLAGS}])
AS_IF([test "x$cuda_happy" = "xyes"],[
AC_MSG_NOTICE([ NVCC gencodes: ${NVCC_ARCH}])
])
AC_MSG_NOTICE([ DOCA UROM enabled: ${doca_urom_happy}])
AC_MSG_NOTICE([ Perftest: ${mpi_enable}])
AC_MSG_NOTICE([ Gtest: ${gtest_enable}])
AC_MSG_NOTICE([ MC modules: <$(echo ${mc_modules}|tr ':' ' ') >])
Expand Down
22 changes: 22 additions & 0 deletions contrib/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#

if HAVE_DOCA_UROM

# The plugin is installed into a doca_plugins directory below $(moduledir).
plugindir          = $(moduledir)/doca_plugins
plugin_LTLIBRARIES = libucc_doca_urom_plugin.la

# Shared command/notification header plus the worker sources under dpu/.
libucc_doca_urom_plugin_la_SOURCES = \
    doca_urom_ucc_plugin/common/urom_ucc.h    \
    doca_urom_ucc_plugin/dpu/worker_ucc_p2p.c \
    doca_urom_ucc_plugin/dpu/worker_ucc.h     \
    doca_urom_ucc_plugin/dpu/worker_ucc.c

# Preprocessor and compiler flags: project defaults, then UCX, then DOCA UROM.
libucc_doca_urom_plugin_la_CPPFLAGS = \
    $(AM_CPPFLAGS)   \
    $(BASE_CPPFLAGS) \
    $(UCX_CPPFLAGS)  \
    $(DOCA_UROM_CPPFLAGS)
libucc_doca_urom_plugin_la_CFLAGS = $(BASE_CFLAGS)

# Link against UCX, the DOCA libraries and the libucc built in this tree.
libucc_doca_urom_plugin_la_LDFLAGS = \
    -version-info $(SOVERSION) --as-needed $(DOCA_UROM_LDFLAGS)
libucc_doca_urom_plugin_la_LIBADD = \
    $(UCX_LIBADD)       \
    $(DOCA_UROM_LIBADD) \
    $(UCC_TOP_BUILDDIR)/src/libucc.la

endif
171 changes: 171 additions & 0 deletions contrib/doca_urom_ucc_plugin/common/urom_ucc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
/*
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES, ALL RIGHTS RESERVED.
*
* This software product is a proprietary product of NVIDIA CORPORATION &
* AFFILIATES (the "Company") and all right, title, and interest in and to the
* software product, including all associated intellectual property rights, are
* and shall remain exclusively with the Company.
*
* This software product is governed by the End User License Agreement
* provided with the software product.
*
*/

#ifndef UROM_UCC_H_
#define UROM_UCC_H_

#include <ucp/api/ucp.h>
#include <ucc/api/ucc.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Take the next raw field out of a serialized buffer.
 *
 * _iter   - address of the cursor pointer into the buffer
 * _type   - element type the returned pointer is cast to
 * _offset - offset handed to UCS_PTR_BYTE_OFFSET to move the cursor
 *
 * Evaluates to the cursor value on entry, cast to (_type *). As a side effect
 * *_iter is replaced by UCS_PTR_BYTE_OFFSET(*_iter, _offset). Written as a GNU
 * statement expression, so the compiler must support that extension.
 */
#define urom_ucc_serialize_next_raw(_iter, _type, _offset)          \
    ({                                                              \
        _type *_field = (_type *)(*(_iter));                        \
        *(_iter)      = UCS_PTR_BYTE_OFFSET(*(_iter), _offset);     \
        _field;                                                     \
    })

/*
 * UCC command types.
 *
 * The value is carried in urom_worker_ucc_cmd.cmd_type and presumably
 * identifies which member of that structure's union is valid.
 * Enumerators take the implicit values 0..6 in declaration order.
 *
 * NOTE(review): there is no team-destroy enumerator although
 * struct urom_worker_cmd_ucc_team_destroy and the team_destroy_cmd union
 * member exist - confirm whether one is missing.
 */
enum urom_worker_ucc_cmd_type {
UROM_WORKER_CMD_UCC_LIB_CREATE, /* UCC library create command */
UROM_WORKER_CMD_UCC_LIB_DESTROY, /* UCC library destroy command */
UROM_WORKER_CMD_UCC_CONTEXT_CREATE, /* UCC context create command */
UROM_WORKER_CMD_UCC_CONTEXT_DESTROY, /* UCC context destroy command */
UROM_WORKER_CMD_UCC_TEAM_CREATE, /* UCC team create command */
UROM_WORKER_CMD_UCC_COLL, /* UCC collective command */
UROM_WORKER_CMD_UCC_CREATE_PASSIVE_DATA_CHANNEL, /* UCC passive data channel create command */
};

/*
 * UCC library create command structure.
 *
 * Input parameters for creating the library handle. The semantics of the
 * parameters are defined by ucc.h.
 * On successful completion of urom_worker_cmd_ucc_lib_create, the UROM worker
 * will generate a notification on the notification queue. This notification
 * has a reference to the local library handle on the worker. The
 * implementation can choose to create shadow handles or safely pack the
 * library handle on the BlueCC worker to the AEU.
 *
 * Carried as the lib_create_cmd member of struct urom_worker_ucc_cmd.
 */
struct urom_worker_cmd_ucc_lib_create {
void *params; /* UCC library parameters (untyped; presumably ucc_lib_params_t - TODO confirm) */
};

/*
 * UCC context create command structure.
 *
 * Carried as the context_create_cmd member of struct urom_worker_ucc_cmd.
 * The first field is an anonymous union: either a start index or a pointer to
 * an explicit array; per the field comment, a stride <= 0 marks the array form.
 *
 * NOTE(review): the stride/size comments below read as "number of strides" and
 * "stride size", while the names suggest a stride value and an element count -
 * confirm the intended meaning against the worker implementation.
 */
struct urom_worker_cmd_ucc_context_create {
union {
int64_t start; /* The started index */
int64_t *array; /* Set stride to <= 0 if array is used */
};
int64_t stride; /* Set number of strides */
int64_t size; /* Set stride size */
void *base_va; /* Shared buffer address */
uint64_t len; /* Buffer length */
};

/*
 * UCC passive data channel command structure.
 *
 * Carried as the pass_dc_create_cmd member of struct urom_worker_ucc_cmd.
 * Describes a UCP worker address as a pointer/length pair.
 */
struct urom_worker_cmd_ucc_pass_dc {
void *ucp_addr; /* UCP worker address on host */
size_t addr_len; /* UCP worker address length (presumably in bytes - TODO confirm) */
};

/*
 * UCC context destroy command structure.
 *
 * Carried as the context_destroy_cmd member of struct urom_worker_ucc_cmd.
 */
struct urom_worker_cmd_ucc_context_destroy {
void *context_h; /* UCC context pointer (untyped handle of the context to destroy) */
};

/*
 * UCC team create command structure.
 *
 * Carried as the team_create_cmd member of struct urom_worker_ucc_cmd.
 * Unlike the context create command, only the start-index form is available
 * here; there is no array alternative.
 *
 * NOTE(review): the stride/size comments mirror the ones in
 * struct urom_worker_cmd_ucc_context_create - confirm their exact meaning.
 */
struct urom_worker_cmd_ucc_team_create {
int64_t start; /* Team start index */
int64_t stride; /* Number of strides */
int64_t size; /* Stride size */
void *context_h; /* UCC context */
};

/*
 * UCC team destroy command structure.
 *
 * Carried as the team_destroy_cmd member of struct urom_worker_ucc_cmd.
 * NOTE(review): enum urom_worker_ucc_cmd_type declares no matching
 * team-destroy command value - confirm how this payload is selected.
 */
struct urom_worker_cmd_ucc_team_destroy {
void *team; /* UCC team to destroy */
};

/*
 * UCC collective command structure.
 *
 * Carried as the coll_cmd member of struct urom_worker_ucc_cmd.
 */
struct urom_worker_cmd_ucc_coll {
void *coll_args; /* Collective arguments (untyped; presumably ucc_coll_args_t - TODO confirm) */
void *team; /* UCC team */
int use_xgvmi; /* If operation uses XGVMI (used as a flag; presumably nonzero means yes) */
void *work_buffer; /* Work buffer */
size_t work_buffer_size; /* Work buffer size */
size_t team_size; /* Team size */
};

/*
 * UROM UCC worker command structure.
 *
 * A tagged command: cmd_type holds a value of enum urom_worker_ucc_cmd_type
 * (stored as uint64_t) and the anonymous union holds the per-command payload.
 * The union has no member for the library destroy command.
 */
struct urom_worker_ucc_cmd {
uint64_t cmd_type; /* Type of command as defined by urom_worker_ucc_cmd_type */
uint64_t dpu_worker_id; /* DPU worker id as part of the team */
union {
struct urom_worker_cmd_ucc_lib_create lib_create_cmd; /* Lib create command */
struct urom_worker_cmd_ucc_context_create context_create_cmd; /* Context create command */
struct urom_worker_cmd_ucc_context_destroy context_destroy_cmd; /* Context destroy command */
struct urom_worker_cmd_ucc_team_create team_create_cmd; /* Team create command */
struct urom_worker_cmd_ucc_team_destroy team_destroy_cmd; /* Team destroy command */
struct urom_worker_cmd_ucc_coll coll_cmd; /* UCC collective command */
struct urom_worker_cmd_ucc_pass_dc pass_dc_create_cmd; /* Passive data channel command */
};
};

/*
 * UCC notification types.
 *
 * The value is carried in urom_worker_notify_ucc.notify_type. Enumerators take
 * the implicit values 0..6 in declaration order, parallel to the entries of
 * enum urom_worker_ucc_cmd_type.
 */
enum urom_worker_ucc_notify_type {
UROM_WORKER_NOTIFY_UCC_LIB_CREATE_COMPLETE, /* Create UCC library on DPU notification */
UROM_WORKER_NOTIFY_UCC_LIB_DESTROY_COMPLETE, /* Destroy UCC library on DPU notification */
UROM_WORKER_NOTIFY_UCC_CONTEXT_CREATE_COMPLETE, /* Create UCC context on DPU notification */
UROM_WORKER_NOTIFY_UCC_CONTEXT_DESTROY_COMPLETE, /* Destroy UCC context on DPU notification */
UROM_WORKER_NOTIFY_UCC_TEAM_CREATE_COMPLETE, /* Create UCC team on DPU notification */
UROM_WORKER_NOTIFY_UCC_COLLECTIVE_COMPLETE, /* UCC collective completion notification */
UROM_WORKER_NOTIFY_UCC_PASSIVE_DATA_CHANNEL_COMPLETE, /* UCC data channel completion notification */
};

/*
 * UCC context create notification structure.
 *
 * Carried as the context_create_nqe member of struct urom_worker_notify_ucc.
 */
struct urom_worker_ucc_notify_context_create {
void *context; /* Pointer to UCC context (untyped handle) */
};

/*
 * UCC team create notification structure.
 *
 * Carried as the team_create_nqe member of struct urom_worker_notify_ucc.
 */
struct urom_worker_ucc_notify_team_create {
void *team; /* Pointer to UCC team (untyped handle) */
};

/*
 * UCC collective notification structure.
 *
 * Carried as the coll_nqe member of struct urom_worker_notify_ucc.
 */
struct urom_worker_ucc_notify_collective {
ucc_status_t status; /* UCC collective status */
};

/*
 * UCC passive data channel notification structure.
 *
 * Carried as the pass_dc_nqe member of struct urom_worker_notify_ucc.
 */
struct urom_worker_ucc_notify_pass_dc {
ucc_status_t status; /* UCC data channel status */
};

/*
 * UROM UCC worker notification structure.
 *
 * A tagged notification: notify_type holds a value of
 * enum urom_worker_ucc_notify_type (stored as uint64_t) and the anonymous
 * union holds the per-notification payload. The union has no member for the
 * library create, library destroy or context destroy notifications.
 */
struct urom_worker_notify_ucc {
uint64_t notify_type; /* Notify type as defined by urom_worker_ucc_notify_type */
uint64_t dpu_worker_id; /* DPU worker id */
union {
struct urom_worker_ucc_notify_context_create context_create_nqe; /* Context create notification */
struct urom_worker_ucc_notify_team_create team_create_nqe; /* Team create notification */
struct urom_worker_ucc_notify_collective coll_nqe; /* Collective notification */
struct urom_worker_ucc_notify_pass_dc pass_dc_nqe; /* Passive data channel notification */
};
};

/*
 * Fixed-size key buffer: two lengths followed by 1024 bytes of storage.
 *
 * NOTE(review): presumably src_len and dst_len are the byte lengths of a
 * source and a destination key stored back to back in rkeys - confirm against
 * the code that fills this structure, including how overflow of the
 * 1024-byte array is prevented.
 */
typedef struct ucc_worker_key_buf {
size_t src_len; /* Length associated with the source key data */
size_t dst_len; /* Length associated with the destination key data */
char rkeys[1024]; /* Raw key storage, fixed capacity of 1024 bytes */
} ucc_worker_key_buf;

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* UROM_UCC_H_ */
Loading

0 comments on commit 1ae5756

Please sign in to comment.