Skip to content

Commit

Permalink
CL/DOCA_UROM: Clean up
Browse files Browse the repository at this point in the history
  • Loading branch information
nsarka committed Jun 18, 2024
1 parent 20f37d9 commit f9537b3
Show file tree
Hide file tree
Showing 14 changed files with 242 additions and 918 deletions.
2 changes: 1 addition & 1 deletion contrib/doca_urom_ucc_plugin/dpu/worker_ucc_p2p.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include "worker_ucc.h"
#include "../common/urom_ucc.h"

DOCA_LOG_REGISTER(UROM::WORKER::UCC::P2P);
DOCA_LOG_REGISTER(UCC::DOCA_CL : WORKER_UCC_P2P);

void urom_ep_err_cb(void *arg, ucp_ep_h ep, ucs_status_t ucs_status)
{
Expand Down
6 changes: 2 additions & 4 deletions src/components/cl/doca_urom/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,8 @@ sources = \
cl_doca_urom_lib.c \
cl_doca_urom_context.c \
cl_doca_urom_team.c \
cl_doca_urom_common_doca.c \
cl_doca_urom_common_doca.h \
cl_doca_urom_common_doca_urom.c \
cl_doca_urom_common_doca_urom.h \
cl_doca_urom_common.c \
cl_doca_urom_common.h \
cl_doca_urom_worker_ucc.c \
cl_doca_urom_worker_ucc.h \
cl_doca_urom_coll.c
Expand Down
5 changes: 2 additions & 3 deletions src/components/cl/doca_urom/cl_doca_urom.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@
#include <doca_ctx.h>
#include <doca_buf.h>

#include "cl_doca_urom_common_doca.h"
#include "cl_doca_urom_common_doca_urom.h"
#include "cl_doca_urom_common.h"
#include "cl_doca_urom_worker_ucc.h"

#include <urom_ucc.h>
Expand Down Expand Up @@ -90,7 +89,7 @@ typedef struct ucc_cl_doca_urom_team {
unsigned n_teams;
ucc_coll_score_t *score;
ucc_score_map_t *score_map;
struct ucc_result res; // used for the cookie
struct ucc_cl_doca_urom_result res; // used for the cookie
} ucc_cl_doca_urom_team_t;
UCC_CLASS_DECLARE(ucc_cl_doca_urom_team_t, ucc_base_context_t *,
const ucc_base_team_params_t *);
Expand Down
18 changes: 9 additions & 9 deletions src/components/cl/doca_urom/cl_doca_urom_coll.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,41 +53,41 @@ static ucc_status_t ucc_cl_doca_urom_coll_full_start(ucc_coll_task_t *task)
case UCC_COLL_TYPE_ALLTOALL:
{
if (!in_place) {
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->src.info.buffer, coll_args->src.info.count * ucc_cl_doca_urom_dt_size(coll_args->src.info.datatype), src_ebuf);
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->src.info.buffer, coll_args->src.info.count * ucc_dt_size(coll_args->src.info.datatype), src_ebuf);
keys.src_len = src_ebuf->packed_key_len;
memcpy(keys.rkeys, src_ebuf->packed_key, keys.src_len);
} else {
keys.src_len = 0;
}
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->dst.info.buffer, coll_args->dst.info.count * ucc_cl_doca_urom_dt_size(coll_args->dst.info.datatype), dst_ebuf);
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->dst.info.buffer, coll_args->dst.info.count * ucc_dt_size(coll_args->dst.info.datatype), dst_ebuf);
keys.dst_len = dst_ebuf->packed_key_len;
memcpy(keys.rkeys + keys.src_len, dst_ebuf->packed_key, keys.dst_len);
use_xgvmi = 0;
} break;
case UCC_COLL_TYPE_ALLREDUCE:
{
if (!in_place) {
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->src.info.buffer, coll_args->src.info.count * ucc_cl_doca_urom_dt_size(coll_args->src.info.datatype), src_ebuf);
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->src.info.buffer, coll_args->src.info.count * ucc_dt_size(coll_args->src.info.datatype), src_ebuf);
keys.src_len = src_ebuf->packed_memh_len;
memcpy(keys.rkeys, src_ebuf->packed_memh, keys.src_len);
} else {
keys.src_len = 0;
}
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->dst.info.buffer, coll_args->dst.info.count * ucc_cl_doca_urom_dt_size(coll_args->dst.info.datatype), dst_ebuf);
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->dst.info.buffer, coll_args->dst.info.count * ucc_dt_size(coll_args->dst.info.datatype), dst_ebuf);
keys.dst_len = dst_ebuf->packed_memh_len;
memcpy(keys.rkeys + keys.src_len, dst_ebuf->packed_memh, keys.dst_len);
use_xgvmi = 1;
} break;
case UCC_COLL_TYPE_ALLGATHER:
{
if (!in_place) {
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->src.info.buffer, coll_args->src.info.count * ucc_cl_doca_urom_dt_size(coll_args->src.info.datatype), src_ebuf);
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->src.info.buffer, coll_args->src.info.count * ucc_dt_size(coll_args->src.info.datatype), src_ebuf);
keys.src_len = src_ebuf->packed_memh_len;
memcpy(keys.rkeys, src_ebuf->packed_memh, keys.src_len);
} else {
keys.src_len = 0;
}
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->dst.info.buffer, coll_args->dst.info.count * ucc_cl_doca_urom_dt_size(coll_args->dst.info.datatype), dst_ebuf);
ucc_cl_doca_urom_buffer_export_ucc(tl_ctx->worker.ucp_context, coll_args->dst.info.buffer, coll_args->dst.info.count * ucc_dt_size(coll_args->dst.info.datatype), dst_ebuf);
keys.dst_len = dst_ebuf->packed_memh_len;
memcpy(keys.rkeys + keys.src_len, dst_ebuf->packed_memh, keys.dst_len);
use_xgvmi = 1;
Expand All @@ -98,7 +98,7 @@ static ucc_status_t ucc_cl_doca_urom_coll_full_start(ucc_coll_task_t *task)

coll_args->mask |= UCC_COLL_ARGS_FIELD_GLOBAL_WORK_BUFFER;

result = doca_urom_ucc_task_collective(cl_lib->urom_ctx.urom_worker,
result = ucc_cl_doca_urom_task_collective(cl_lib->urom_ctx.urom_worker,
cookie,
rank,
coll_args,
Expand All @@ -107,7 +107,7 @@ static ucc_status_t ucc_cl_doca_urom_coll_full_start(ucc_coll_task_t *task)
&keys,
sizeof(ucc_worker_key_buf),
0,
urom_ucc_collective_finished);
ucc_cl_doca_urom_collective_finished);
if (result != DOCA_SUCCESS) {
cl_error(&cl_lib->super, "Failed to create UCC collective task");
}
Expand Down Expand Up @@ -161,7 +161,7 @@ static void ucc_cl_doca_urom_coll_full_progress(ucc_coll_task_t *ctask)
ucc_tl_ucp_context_t *tl_ctx = ucc_derived_of(
ctx->super.tl_ctxs[ucp_index],
ucc_tl_ucp_context_t);
struct ucc_result *res = &schedule->res;
struct ucc_cl_doca_urom_result *res = &schedule->res;

if (res == NULL) {
cl_error(cl_lib, "Error in UROM");
Expand Down
2 changes: 1 addition & 1 deletion src/components/cl/doca_urom/cl_doca_urom_coll.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ extern const char

typedef struct ucc_cl_doca_urom_schedule_t {
ucc_schedule_pipelined_t super;
struct ucc_result res;
struct ucc_cl_doca_urom_result res;
struct export_buf src_ebuf;
struct export_buf dst_ebuf;
} ucc_cl_doca_urom_schedule_t;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES, ALL RIGHTS RESERVED.
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES, ALL RIGHTS RESERVED.
*
* This software product is a proprietary product of NVIDIA CORPORATION &
* AFFILIATES (the "Company") and all right, title, and interest in and to the
Expand All @@ -26,61 +26,11 @@
#include <doca_ctx.h>
#include <doca_log.h>

#include "cl_doca_urom_common_doca_urom.h"
#include "cl_doca_urom_common.h"

DOCA_LOG_REGISTER(UROM::SAMPLES : COMMON);
DOCA_LOG_REGISTER(UCC::DOCA_CL : UROM_COMMON);

/*
* ARGP Callback - Handle IB device name parameter
*
* @param [in]: Input parameter
* @config [in/out]: Program configuration context
* @return: DOCA_SUCCESS on success and DOCA_ERROR otherwise
*/
static doca_error_t device_address_callback(void *param, void *config)
{
struct urom_common_cfg *urom_cfg = (struct urom_common_cfg *)config;
char *device_name = (char *)param;
int len;

len = strnlen(device_name, DOCA_DEVINFO_IBDEV_NAME_SIZE);
if (len == DOCA_DEVINFO_IBDEV_NAME_SIZE) {
DOCA_LOG_ERR("Entered IB device name exceeding the maximum size of %d",
DOCA_DEVINFO_IBDEV_NAME_SIZE - 1);
return DOCA_ERROR_INVALID_VALUE;
}
strncpy(urom_cfg->device_name, device_name, len + 1);

return DOCA_SUCCESS;
}

doca_error_t register_urom_common_params(void)
{
doca_error_t result;
struct doca_argp_param *device_param;

/* Create and register device param */
result = doca_argp_param_create(&device_param);
if (result != DOCA_SUCCESS) {
DOCA_LOG_ERR("Failed to create ARGP param: %s", doca_error_get_descr(result));
return result;
}
doca_argp_param_set_short_name(device_param, "d");
doca_argp_param_set_long_name(device_param, "device");
doca_argp_param_set_arguments(device_param, "<IB device name>");
doca_argp_param_set_description(device_param, "IB device name.");
doca_argp_param_set_callback(device_param, device_address_callback);
doca_argp_param_set_type(device_param, DOCA_ARGP_TYPE_STRING);
result = doca_argp_register_param(device_param);
if (result != DOCA_SUCCESS) {
DOCA_LOG_ERR("Failed to register program param: %s", doca_error_get_descr(result));
return result;
}

return result;
}

doca_error_t start_urom_service(struct doca_pe *pe,
doca_error_t ucc_cl_doca_urom_start_urom_service(struct doca_pe *pe,
struct doca_dev *dev,
uint64_t nb_workers,
struct doca_urom_service **service)
Expand Down Expand Up @@ -133,7 +83,7 @@ doca_error_t start_urom_service(struct doca_pe *pe,
return result;
}

doca_error_t start_urom_worker(struct doca_pe *pe,
doca_error_t ucc_cl_doca_urom_start_urom_worker(struct doca_pe *pe,
struct doca_urom_service *service,
uint64_t worker_id,
uint32_t *gid,
Expand Down Expand Up @@ -228,12 +178,12 @@ doca_error_t start_urom_worker(struct doca_pe *pe,
return result;
}

doca_error_t start_urom_domain(struct doca_pe *pe,
doca_error_t ucc_cl_doca_urom_start_urom_domain(struct doca_pe *pe,
struct doca_urom_domain_oob_coll *oob,
uint64_t *worker_ids,
struct doca_urom_worker **workers,
size_t nb_workers,
struct urom_domain_buffer_attrs *buffers,
struct ucc_cl_doca_urom_domain_buffer_attrs *buffers,
size_t nb_buffers,
struct doca_urom_domain **domain)
{
Expand Down Expand Up @@ -309,3 +259,56 @@ doca_error_t start_urom_domain(struct doca_pe *pe,
}
return result;
}


doca_error_t ucc_cl_doca_urom_open_doca_device_with_ibdev_name(const uint8_t *value,
size_t val_size,
ucc_cl_doca_urom_tasks_check func,
struct doca_dev **retval)
{
struct doca_devinfo **dev_list;
uint32_t nb_devs;
char buf[DOCA_DEVINFO_IBDEV_NAME_SIZE] = {};
char val_copy[DOCA_DEVINFO_IBDEV_NAME_SIZE] = {};
int res;
size_t i;

/* Set default return value */
*retval = NULL;

/* Setup */
if (val_size > DOCA_DEVINFO_IBDEV_NAME_SIZE) {
DOCA_LOG_ERR("Value size too large. Failed to locate device");
return DOCA_ERROR_INVALID_VALUE;
}
memcpy(val_copy, value, val_size);

res = doca_devinfo_create_list(&dev_list, &nb_devs);
if (res != DOCA_SUCCESS) {
DOCA_LOG_ERR("Failed to load doca devices list: %s", doca_error_get_descr(res));
return res;
}

/* Search */
for (i = 0; i < nb_devs; i++) {
res = doca_devinfo_get_ibdev_name(dev_list[i], buf, DOCA_DEVINFO_IBDEV_NAME_SIZE);
if (res == DOCA_SUCCESS && strncmp(buf, val_copy, val_size) == 0) {
/* If any special capabilities are needed */
if (func != NULL && func(dev_list[i]) != DOCA_SUCCESS)
continue;

/* if device can be opened */
res = doca_dev_open(dev_list[i], retval);
if (res == DOCA_SUCCESS) {
doca_devinfo_destroy_list(dev_list);
return res;
}
}
}

DOCA_LOG_WARN("Matching device not found");
res = DOCA_ERROR_NOT_FOUND;

doca_devinfo_destroy_list(dev_list);
return res;
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES, ALL RIGHTS RESERVED.
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES, ALL RIGHTS RESERVED.
*
* This software product is a proprietary product of NVIDIA CORPORATION &
* AFFILIATES (the "Company") and all right, title, and interest in and to the
Expand All @@ -15,24 +15,21 @@
#define _GNU_SOURCE
#endif

#ifndef UROM_COMMON_H_
#define UROM_COMMON_H_
#ifndef UCC_CL_DOCA_UROM_COMMON_H_
#define UCC_CL_DOCA_UROM_COMMON_H_

#include <doca_dev.h>
#include <doca_urom.h>
#include <doca_pe.h>
#include <doca_error.h>

/*
* Struct contains all the common configurations that needed for DOCA UROM samples.
*/
struct urom_common_cfg {
char device_name[DOCA_DEVINFO_IBDEV_NAME_SIZE]; /* DOCA device name */
};
/* Function to check if a given device is capable of executing some task */
typedef doca_error_t (*ucc_cl_doca_urom_tasks_check)(struct doca_devinfo *);

/*
* Struct contains domain shared buffer details
*/
struct urom_domain_buffer_attrs {
struct ucc_cl_doca_urom_domain_buffer_attrs {
void *buffer; /* Buffer address */
size_t buf_len; /* Buffer length */
void *memh; /* Buffer packed memory handle */
Expand All @@ -42,11 +39,18 @@ struct urom_domain_buffer_attrs {
};

/*
* Register the common command line parameter for the sample.
* Open a DOCA device according to a given IB device name
*
* @value [in]: IB device name
* @val_size [in]: input length, in bytes
* @func [in]: pointer to a function that checks if the device have some task capabilities (Ignored if set to NULL)
* @retval [out]: pointer to doca_dev struct, NULL if not found
* @return: DOCA_SUCCESS on success and DOCA_ERROR otherwise
*/
doca_error_t register_urom_common_params(void);
doca_error_t ucc_cl_doca_urom_open_doca_device_with_ibdev_name(const uint8_t *value,
size_t val_size,
ucc_cl_doca_urom_tasks_check func,
struct doca_dev **retval);

/*
* Start UROM service context
Expand All @@ -57,7 +61,7 @@ doca_error_t register_urom_common_params(void);
* @service [out]: service context
* @return: DOCA_SUCCESS on success and DOCA_ERROR otherwise
*/
doca_error_t start_urom_service(struct doca_pe *pe,
doca_error_t ucc_cl_doca_urom_start_urom_service(struct doca_pe *pe,
struct doca_dev *dev,
uint64_t nb_workers,
struct doca_urom_service **service);
Expand All @@ -77,7 +81,7 @@ doca_error_t start_urom_service(struct doca_pe *pe,
* @worker [out]: set worker context
* @return: DOCA_SUCCESS on success and DOCA_ERROR otherwise
*/
doca_error_t start_urom_worker(struct doca_pe *pe,
doca_error_t ucc_cl_doca_urom_start_urom_worker(struct doca_pe *pe,
struct doca_urom_service *service,
uint64_t worker_id,
uint32_t *gid,
Expand All @@ -101,12 +105,13 @@ doca_error_t start_urom_worker(struct doca_pe *pe,
* @domain [out]: domain context
* @return: DOCA_SUCCESS on success and DOCA_ERROR otherwise
*/
doca_error_t start_urom_domain(struct doca_pe *pe,
doca_error_t ucc_cl_doca_urom_start_urom_domain(struct doca_pe *pe,
struct doca_urom_domain_oob_coll *oob,
uint64_t *worker_ids,
struct doca_urom_worker **workers,
size_t nb_workers,
struct urom_domain_buffer_attrs *buffers,
struct ucc_cl_doca_urom_domain_buffer_attrs *buffers,
size_t nb_buffers,
struct doca_urom_domain **domain);
#endif /* UROM_COMMON_H_ */

#endif /* UCC_CL_DOCA_UROM_COMMON_H_ */
Loading

0 comments on commit f9537b3

Please sign in to comment.