Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DOCA CL #3

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/codestyle.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
fi
fi
H1="CODESTYLE|REVIEW|CORE|UTIL|TEST|API|DOCS|TOOLS|BUILD|MC|EC|SCHEDULE|TOPO"
H2="CI|CL/|TL/|MC/|EC/|UCP|SHM|NCCL|SHARP|BASIC|HIER|CUDA|CPU|EE|RCCL|ROCM|SELF|MLX5"
H2="CI|CL/|TL/|MC/|EC/|UCP|SHM|NCCL|SHARP|BASIC|HIER|DOCA_UROM|CUDA|CPU|EE|RCCL|ROCM|SELF|MLX5"
if ! echo $msg | grep -qP '^Merge |^'"(($H1)|($H2))"'+: \w'
then
echo "Wrong header"
Expand Down
3 changes: 2 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#
# Copyright (c) 2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (C) Huawei Technologies Co., Ltd. 2020. All rights reserved.
# $HEADER$
#

if !DOCS_ONLY
SUBDIRS = \
src \
contrib \
tools/info \
cmake

Expand Down
74 changes: 74 additions & 0 deletions config/m4/doca_urom.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# See file LICENSE for terms.
#

# Detect the DOCA UROM SDK: the doca_urom.h header and the doca_urom library.
#
# Options:
#   --with-doca_urom=DIR         SDK prefix. Headers are taken from DIR/include
#                                and libraries from DIR/lib64 unless a library
#                                directory is given. "no" disables the check;
#                                "guess" is the default and probes whatever
#                                the compiler and linker already search.
#   --with-doca_urom_libdir=DIR  Library directory that overrides DIR/lib64.
#
# Results:
#   doca_urom_happy      "yes" or "no"
#   HAVE_DOCA_UROM       preprocessor define and automake conditional
#   DOCA_UROM_CPPFLAGS   DOCA_UROM_LDFLAGS   DOCA_UROM_LIBADD   substitutions
#
# The body runs at most once; doca_urom_checked guards repeated invocations.
AC_DEFUN([CHECK_DOCA_UROM],[
AS_IF([test "x$doca_urom_checked" != "xyes"],[
    doca_urom_happy="no"
    AC_ARG_WITH([doca_urom],
                [AS_HELP_STRING([--with-doca_urom=(DIR)], [Enable the use of DOCA_UROM (default is guess).])],
                [], [with_doca_urom=guess])
    # Declare the library directory override so that it shows up in the help
    # output and is not reported as an unrecognized option.
    AC_ARG_WITH([doca_urom_libdir],
                [AS_HELP_STRING([--with-doca_urom_libdir=(DIR)], [Search for DOCA_UROM libraries in DIR (default is lib64 under the --with-doca_urom directory).])],
                [], [with_doca_urom_libdir=])
    AS_IF([test "x$with_doca_urom" != "xno"],
    [
        # The flags are changed only for the probes below and are restored
        # before leaving this branch.
        save_CPPFLAGS="$CPPFLAGS"
        save_LDFLAGS="$LDFLAGS"
        AS_IF([test ! -z "$with_doca_urom" -a "x$with_doca_urom" != "xyes" -a "x$with_doca_urom" != "xguess"],
        [
            # Quote the path so that directories containing spaces are tested
            # correctly.
            AS_IF([test ! -d "$with_doca_urom"],
                  [AC_MSG_ERROR([Provided "--with-doca_urom=${with_doca_urom}" location does not exist])])
            check_doca_urom_dir="$with_doca_urom"
            check_doca_urom_libdir="$with_doca_urom/lib64"
            CPPFLAGS="-I$with_doca_urom/include $UCS_CPPFLAGS $save_CPPFLAGS"
            LDFLAGS="-L$check_doca_urom_libdir $save_LDFLAGS"
        ])
        # An explicit library directory wins over the lib64 default. The
        # values "yes" and "no" are option switches and not directories.
        AS_IF([test ! -z "$with_doca_urom_libdir" -a "x$with_doca_urom_libdir" != "xyes" -a "x$with_doca_urom_libdir" != "xno"],
        [
            check_doca_urom_libdir="$with_doca_urom_libdir"
            LDFLAGS="-L$check_doca_urom_libdir $save_LDFLAGS"
        ])
        AC_CHECK_HEADERS([doca_urom.h],
        [
            AC_CHECK_LIB([doca_urom], [doca_urom_service_create],
            [
                doca_urom_happy="yes"
            ],
            [
                doca_urom_happy="no"
            ], [-ldoca_common -ldoca_argp -ldoca_urom])
        ],
        [
            doca_urom_happy="no"
        ])
        AS_IF([test "x$doca_urom_happy" = "xyes"],
        [
            AS_IF([test "x$check_doca_urom_dir" != "x"],
            [
                AC_MSG_RESULT([DOCA_UROM dir: $check_doca_urom_dir])
                AC_SUBST(DOCA_UROM_CPPFLAGS, "-I$check_doca_urom_dir/include/")
            ])
            AS_IF([test "x$check_doca_urom_libdir" != "x"],
            [
                AC_SUBST(DOCA_UROM_LDFLAGS, "-L$check_doca_urom_libdir")
            ])
            AC_SUBST(DOCA_UROM_LIBADD, "-ldoca_common -ldoca_argp -ldoca_urom")
            AC_DEFINE([HAVE_DOCA_UROM], 1, [Enable DOCA_UROM support])
        ],
        [
            # Anything other than the default guess mode is an explicit
            # request, so a failed probe is fatal in that case.
            AS_IF([test "x$with_doca_urom" != "xguess"],
            [
                AC_MSG_ERROR([DOCA_UROM support is requested but DOCA_UROM packages cannot be found! $CPPFLAGS $LDFLAGS])
            ],
            [
                AC_MSG_WARN([DOCA_UROM not found])
            ])
        ])
        CPPFLAGS="$save_CPPFLAGS"
        LDFLAGS="$save_LDFLAGS"
    ],
    [
        AC_MSG_WARN([DOCA_UROM was explicitly disabled])
    ])
    doca_urom_checked=yes
    AM_CONDITIONAL([HAVE_DOCA_UROM], [test "x$doca_urom_happy" != xno])
])])
11 changes: 10 additions & 1 deletion configure.ac
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2001-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2001-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# This software product is a proprietary product of Mellanox Technologies Ltd.
# (the "Company") and all right, title, and interest and to the software product,
# including all associated intellectual property rights, are and shall
Expand Down Expand Up @@ -162,6 +162,7 @@ AS_IF([test "x$with_docs_only" = xyes],
AM_CONDITIONAL([HAVE_IBVERBS],[false])
AM_CONDITIONAL([HAVE_RDMACM],[false])
AM_CONDITIONAL([HAVE_MLX5DV],[false])
AM_CONDITIONAL([HAVE_DOCA_UROM], [false])
],
[
AM_CONDITIONAL([DOCS_ONLY], [false])
Expand All @@ -172,6 +173,7 @@ AS_IF([test "x$with_docs_only" = xyes],
m4_include([config/m4/cuda.m4])
m4_include([config/m4/nccl.m4])
m4_include([config/m4/rocm.m4])
m4_include([config/m4/doca_urom.m4])
m4_include([config/m4/rccl.m4])
m4_include([config/m4/sharp.m4])
m4_include([config/m4/mpi.m4])
Expand Down Expand Up @@ -205,6 +207,9 @@ AS_IF([test "x$with_docs_only" = xyes],
mc_modules="${mc_modules}:rocm"
fi

CHECK_DOCA_UROM
AC_MSG_RESULT([DOCA_UROM support: $doca_urom_happy])

CHECK_GTEST
AC_MSG_RESULT([GTEST support: $gtest_happy])

Expand All @@ -224,11 +229,14 @@ LDFLAGS="$LDFLAGS $UCS_LDFLAGS $UCS_LIBADD"
CHECK_TL_COLL_PLUGINS
AC_CONFIG_FILES([
Makefile
contrib/Makefile
contrib/doca_urom_ucc_plugin/Makefile
src/Makefile
src/ucc/api/ucc_version.h
src/core/ucc_version.c
src/components/cl/basic/Makefile
src/components/cl/hier/Makefile
src/components/cl/doca_urom/Makefile
src/components/mc/cpu/Makefile
src/components/mc/cuda/Makefile
src/components/ec/cpu/Makefile
Expand Down Expand Up @@ -265,6 +273,7 @@ AC_MSG_NOTICE([ C++ compiler: ${CXX} ${CXXFLAGS} ${BASE_CXXFLAGS}])
AS_IF([test "x$cuda_happy" = "xyes"],[
AC_MSG_NOTICE([ NVCC gencodes: ${NVCC_ARCH}])
])
AC_MSG_NOTICE([ DOCA UROM enabled: ${doca_urom_happy}])
AS_IF([test "x$rocm_happy" = xyes],[
AC_MSG_NOTICE([ROCM architectures: ${ROCM_ARCH}])
])
Expand Down
5 changes: 5 additions & 0 deletions contrib/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#

# Always descend into the plugin directory; its own Makefile.am wraps the
# build rules in the HAVE_DOCA_UROM conditional.
SUBDIRS = doca_urom_ucc_plugin
22 changes: 22 additions & 0 deletions contrib/doca_urom_ucc_plugin/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#

# Everything below is built only when the HAVE_DOCA_UROM automake conditional
# is true.
if HAVE_DOCA_UROM

# Plugin sources: the command/notification definitions under common/ plus the
# worker sources under dpu/.
sources = \
common/urom_ucc.h \
dpu/worker_ucc_p2p.c \
dpu/worker_ucc.h \
dpu/worker_ucc.c

# The plugin library is installed into a doca_plugins subdirectory of
# $(moduledir).
plugindir = $(moduledir)/doca_plugins

plugin_LTLIBRARIES = libucc_doca_urom_plugin.la
libucc_doca_urom_plugin_la_SOURCES = $(sources)
libucc_doca_urom_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) $(BASE_CPPFLAGS) $(UCX_CPPFLAGS) $(DOCA_UROM_CPPFLAGS)
libucc_doca_urom_plugin_la_CFLAGS = $(BASE_CFLAGS)
# NOTE(review): --as-needed is passed bare rather than as -Wl,--as-needed;
# confirm libtool forwards it to the linker as intended.
libucc_doca_urom_plugin_la_LDFLAGS = -version-info $(SOVERSION) --as-needed $(UCX_LDFLAGS) $(DOCA_UROM_LDFLAGS)
# Links against UCX, the DOCA UROM libraries and the libucc built under src/.
libucc_doca_urom_plugin_la_LIBADD = $(UCX_LIBADD) $(DOCA_UROM_LIBADD) $(UCC_TOP_BUILDDIR)/src/libucc.la

endif
166 changes: 166 additions & 0 deletions contrib/doca_urom_ucc_plugin/common/urom_ucc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
/*
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES, ALL RIGHTS RESERVED.
*
* This software product is a proprietary product of NVIDIA CORPORATION &
* AFFILIATES (the "Company") and all right, title, and interest in and to the
* software product, including all associated intellectual property rights, are
* and shall remain exclusively with the Company.
*
* This software product is governed by the End User License Agreement
* provided with the software product.
*
*/

#ifndef UROM_UCC_H_
#define UROM_UCC_H_

#include <ucp/api/ucp.h>
#include <ucc/api/ucc.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Consume a raw region from a serialization cursor.
 *
 * _iter   - pointer to the cursor; it is dereferenced and updated in place
 * _type   - element type that the resulting pointer is cast to
 * _offset - byte offset to advance the cursor by
 *
 * Evaluates to the cursor value from before the advance, cast to (_type *),
 * i.e. the start of the region that was just consumed. Written as a GNU
 * statement expression and relies on UCS_PTR_BYTE_OFFSET being in scope.
 */
#define urom_ucc_serialize_next_raw(_iter, _type, _offset) \
({ \
_type *_result = (_type *)(*(_iter)); \
*(_iter) = UCS_PTR_BYTE_OFFSET(*(_iter), _offset); \
_result; \
})

/*
 * UCC command types.
 *
 * Carried in the cmd_type field of struct urom_worker_ucc_cmd. Enumerators
 * have no explicit values, so they are numbered from 0 in declaration order.
 */
enum urom_worker_ucc_cmd_type {
UROM_WORKER_CMD_UCC_LIB_CREATE, /* UCC library create command */
UROM_WORKER_CMD_UCC_LIB_DESTROY, /* UCC library destroy command */
UROM_WORKER_CMD_UCC_CONTEXT_CREATE, /* UCC context create command */
UROM_WORKER_CMD_UCC_CONTEXT_DESTROY, /* UCC context destroy command */
UROM_WORKER_CMD_UCC_TEAM_CREATE, /* UCC team create command */
UROM_WORKER_CMD_UCC_COLL, /* UCC collective command */
UROM_WORKER_CMD_UCC_CREATE_PASSIVE_DATA_CHANNEL, /* UCC passive data channel create command */
};

/*
 * UCC library create command structure
 *
 * Input parameters for creating the library handle. The semantics of the
 * parameters are defined by ucc.h. On successful completion of
 * urom_worker_cmd_ucc_lib_create, the UROM worker will generate a notification
 * on the notification queue. This notification has a reference to the local
 * library handle on the worker. The implementation can choose to create shadow
 * handles or safely pack the library handle on the BlueCC worker to the AEU.
 *
 * Embedded in struct urom_worker_ucc_cmd as lib_create_cmd.
 */
struct urom_worker_cmd_ucc_lib_create {
void *params; /* UCC library parameters */
};

/*
 * UCC context create command structure
 *
 * Embedded in struct urom_worker_ucc_cmd as context_create_cmd.
 *
 * NOTE(review): start/stride/size read like a strided index description with
 * array as the explicit alternative, selected by stride <= 0 -- confirm
 * against the code that consumes this command.
 */
struct urom_worker_cmd_ucc_context_create {
/* start and array share storage; only one of them is meaningful at a time */
union {
int64_t start; /* Start index */
int64_t *array; /* Set stride to <= 0 if array is used */
};
int64_t stride; /* Number of strides */
int64_t size; /* Stride size */
void *base_va; /* Shared buffer address */
uint64_t len; /* Shared buffer length */
};

/*
 * UCC passive data channel command structure
 *
 * Embedded in struct urom_worker_ucc_cmd as pass_dc_create_cmd.
 */
struct urom_worker_cmd_ucc_pass_dc {
void *ucp_addr; /* UCP worker address on host */
size_t addr_len; /* UCP worker address length */
};

/*
 * UCC context destroy command structure
 *
 * Embedded in struct urom_worker_ucc_cmd as context_destroy_cmd.
 */
struct urom_worker_cmd_ucc_context_destroy {
ucc_context_h context_h; /* UCC context handle to destroy */
};

/*
 * UCC team create command structure
 *
 * Embedded in struct urom_worker_ucc_cmd as team_create_cmd.
 */
struct urom_worker_cmd_ucc_team_create {
int64_t start; /* Team start index */
int64_t stride; /* Number of strides */
int64_t size; /* Stride size */
ucc_context_h context_h; /* UCC context the team is created on */
};

/*
 * UCC collective command structure
 *
 * Embedded in struct urom_worker_ucc_cmd as coll_cmd.
 */
struct urom_worker_cmd_ucc_coll {
ucc_coll_args_t *coll_args; /* Collective arguments */
ucc_team_h team; /* UCC team */
int use_xgvmi; /* If operation uses XGVMI */
void *work_buffer; /* Work buffer */
size_t work_buffer_size; /* Work buffer size */
size_t team_size; /* Team size */
};

/*
 * UROM UCC worker command structure
 *
 * A command type, the DPU worker id, and an anonymous union holding one
 * payload per command kind.
 *
 * NOTE(review): presumably cmd_type tells which union member is valid, and
 * UROM_WORKER_CMD_UCC_LIB_DESTROY (which has no dedicated member) carries no
 * payload -- confirm against the worker implementation.
 */
struct urom_worker_ucc_cmd {
enum urom_worker_ucc_cmd_type cmd_type; /* Command type */
uint64_t dpu_worker_id; /* DPU worker id as part of the team */
union {
struct urom_worker_cmd_ucc_lib_create lib_create_cmd; /* Lib create command */
struct urom_worker_cmd_ucc_context_create context_create_cmd; /* Context create command */
struct urom_worker_cmd_ucc_context_destroy context_destroy_cmd; /* Context destroy command */
struct urom_worker_cmd_ucc_team_create team_create_cmd; /* Team create command */
struct urom_worker_cmd_ucc_coll coll_cmd; /* UCC collective command */
struct urom_worker_cmd_ucc_pass_dc pass_dc_create_cmd; /* Passive data channel command */
};
};

/*
 * UCC notification types.
 *
 * Carried in the notify_type field of struct urom_worker_notify_ucc.
 * Enumerators have no explicit values, so they are numbered from 0 in
 * declaration order.
 */
enum urom_worker_ucc_notify_type {
UROM_WORKER_NOTIFY_UCC_LIB_CREATE_COMPLETE, /* Create UCC library on DPU notification */
UROM_WORKER_NOTIFY_UCC_LIB_DESTROY_COMPLETE, /* Destroy UCC library on DPU notification */
UROM_WORKER_NOTIFY_UCC_CONTEXT_CREATE_COMPLETE, /* Create UCC context on DPU notification */
UROM_WORKER_NOTIFY_UCC_CONTEXT_DESTROY_COMPLETE, /* Destroy UCC context on DPU notification */
UROM_WORKER_NOTIFY_UCC_TEAM_CREATE_COMPLETE, /* Create UCC team on DPU notification */
UROM_WORKER_NOTIFY_UCC_COLLECTIVE_COMPLETE, /* UCC collective completion notification */
UROM_WORKER_NOTIFY_UCC_PASSIVE_DATA_CHANNEL_COMPLETE, /* UCC data channel completion notification */
};

/*
 * UCC context create notification structure
 *
 * Embedded in struct urom_worker_notify_ucc as context_create_nqe.
 */
struct urom_worker_ucc_notify_context_create {
ucc_context_h context; /* Pointer to UCC context */
};

/*
 * UCC team create notification structure
 *
 * Embedded in struct urom_worker_notify_ucc as team_create_nqe.
 */
struct urom_worker_ucc_notify_team_create {
ucc_team_h team; /* Pointer to UCC team */
};

/*
 * UCC collective notification structure
 *
 * Embedded in struct urom_worker_notify_ucc as coll_nqe.
 */
struct urom_worker_ucc_notify_collective {
ucc_status_t status; /* UCC collective status */
};

/*
 * UCC passive data channel notification structure
 *
 * Embedded in struct urom_worker_notify_ucc as pass_dc_nqe.
 */
struct urom_worker_ucc_notify_pass_dc {
ucc_status_t status; /* UCC data channel status */
};

/*
 * UROM UCC worker notification structure
 *
 * A notification type, the DPU worker id, and an anonymous union holding one
 * payload per notification kind.
 *
 * NOTE(review): presumably notify_type tells which union member is valid; the
 * library create/destroy and context destroy notifications have no dedicated
 * member here -- confirm against the worker implementation.
 */
struct urom_worker_notify_ucc {
enum urom_worker_ucc_notify_type notify_type; /* Notification type */
uint64_t dpu_worker_id; /* DPU worker id */
union {
struct urom_worker_ucc_notify_context_create context_create_nqe; /* Context create notification */
struct urom_worker_ucc_notify_team_create team_create_nqe; /* Team create notification */
struct urom_worker_ucc_notify_collective coll_nqe; /* Collective notification */
struct urom_worker_ucc_notify_pass_dc pass_dc_nqe; /* Passive data channel notification */
};
};

/*
 * Fixed-size key buffer: two lengths followed by a 1024-byte array.
 *
 * NOTE(review): presumably rkeys holds packed source and destination remote
 * keys whose byte lengths are src_len and dst_len -- confirm the layout and
 * that src_len + dst_len is bounded by sizeof(rkeys) at every writer.
 */
typedef struct ucc_worker_key_buf {
size_t src_len; /* presumably length of the source key data in rkeys -- TODO confirm */
size_t dst_len; /* presumably length of the destination key data in rkeys -- TODO confirm */
char rkeys[1024]; /* Key storage, 1024 bytes */
} ucc_worker_key_buf;

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* UROM_UCC_H_ */
Loading
Loading