Skip to content

Commit

Permalink
TL/MLX5: adding ip over ib mcast helper functions (openucx#861)
Browse files Browse the repository at this point in the history
  • Loading branch information
MamziB authored and janjust committed Jan 31, 2024
1 parent dcf8716 commit 46a6e56
Show file tree
Hide file tree
Showing 5 changed files with 227 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/components/tl/mlx5/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ mcast = \
mcast/p2p/ucc_tl_mlx5_mcast_p2p.c \
mcast/tl_mlx5_mcast_progress.h \
mcast/tl_mlx5_mcast_helper.h \
mcast/tl_mlx5_mcast_helper.c \
mcast/tl_mlx5_mcast_team.c

sources = \
Expand Down
191 changes: 191 additions & 0 deletions src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
/**
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/

#include "tl_mlx5_mcast_helper.h"
#include <glob.h>
#include <net/if.h>
#include <ifaddrs.h>

#define PREF "/sys/class/net/"
#define SUFF "/device/resource"
#define MAX_STR_LEN 128

static ucc_status_t ucc_tl_mlx5_get_ipoib_ip(char *ifname, struct sockaddr_storage *addr)
{
ucc_status_t status = UCC_ERR_NO_RESOURCE;
struct ifaddrs *ifaddr = NULL;
struct ifaddrs *ifa = NULL;
int is_ipv4 = 0;
int family;
int n;
int is_up;

if (getifaddrs(&ifaddr) == -1) {
return UCC_ERR_NO_RESOURCE;
}

for (ifa = ifaddr, n = 0; ifa != NULL; ifa=ifa->ifa_next, n++) {
if (ifa->ifa_addr == NULL) {
continue;
}

family = ifa->ifa_addr->sa_family;
if (family != AF_INET && family != AF_INET6) {
continue;
}

is_up = (ifa->ifa_flags & IFF_UP) == IFF_UP;
is_ipv4 = (family == AF_INET) ? 1 : 0;

if (is_up && !strncmp(ifa->ifa_name, ifname, strlen(ifname)) ) {
if (is_ipv4) {
memcpy((struct sockaddr_in *) addr,
(struct sockaddr_in *) ifa->ifa_addr,
sizeof(struct sockaddr_in));
} else {
memcpy((struct sockaddr_in6 *) addr,
(struct sockaddr_in6 *) ifa->ifa_addr,
sizeof(struct sockaddr_in6));
}

status = UCC_OK;
break;
}
}

freeifaddrs(ifaddr);
return status;
}

static int cmp_files(char *f1, char *f2)
{
int answer = 0;
FILE *fp1;
FILE *fp2;
int ch1;
int ch2;

if ((fp1 = fopen(f1, "r")) == NULL) {
goto out;
} else if ((fp2 = fopen(f2, "r")) == NULL) {
goto close;
}

do {
ch1 = getc(fp1);
ch2 = getc(fp2);
} while((ch1 != EOF) && (ch2 != EOF) && (ch1 == ch2));


if (ch1 == ch2) {
answer = 1;
}

if (fclose(fp2) != 0) {
return 0;
}
close:
if (fclose(fp1) != 0) {
return 0;
}
out:
return answer;
}

static int port_from_file(char *port_file)
{
int res = -1;
char buf1[MAX_STR_LEN];
char buf2[MAX_STR_LEN];
FILE *fp;
int len;

if ((fp = fopen(port_file, "r")) == NULL) {
return -1;
}

if (fgets(buf1, MAX_STR_LEN - 1, fp) == NULL) {
goto out;
}

len = strlen(buf1) - 2;
strncpy(buf2, buf1 + 2, len);
buf2[len] = 0;
res = atoi(buf2);

out:
if (fclose(fp) != 0) {
return -1;
}
return res;
}

static ucc_status_t dev2if(char *dev_name, char *port, struct sockaddr_storage
*rdma_src_addr)
{
ucc_status_t status = UCC_OK;
glob_t glob_el = {0,};
char dev_file [MAX_STR_LEN];
char port_file[MAX_STR_LEN];
char net_file [MAX_STR_LEN];
char if_name [MAX_STR_LEN];
char glob_path[MAX_STR_LEN];
int i;
char **p;
int len;

sprintf(glob_path, PREF"*");

sprintf(dev_file, "/sys/class/infiniband/%s"SUFF, dev_name);
if (glob(glob_path, 0, 0, &glob_el)) {
return UCC_ERR_NO_RESOURCE;
}
p = glob_el.gl_pathv;

if (glob_el.gl_pathc >= 1) {
for (i = 0; i < glob_el.gl_pathc; i++, p++) {
sprintf(port_file, "%s/dev_id", *p);
sprintf(net_file, "%s"SUFF, *p);
if(cmp_files(net_file, dev_file) && port != NULL &&
port_from_file(port_file) == atoi(port) - 1) {
len = strlen(net_file) - strlen(PREF) - strlen(SUFF);
strncpy(if_name, net_file + strlen(PREF), len);
if_name[len] = 0;

status = ucc_tl_mlx5_get_ipoib_ip(if_name, rdma_src_addr);
if (UCC_OK == status) {
break;
}
}
}
}

globfree(&glob_el);
return status;
}

ucc_status_t ucc_tl_mlx5_probe_ip_over_ib(char* ib_dev, struct
sockaddr_storage *addr)
{
char *ib_name = NULL;
char *port = NULL;
ucc_status_t status;
struct sockaddr_storage rdma_src_addr;

if (NULL == ib_dev) {
status = UCC_ERR_NO_RESOURCE;
} else {
ucc_string_split(ib_dev, ":", 2, &ib_name, &port);
status = dev2if(ib_name, port, &rdma_src_addr);
}

if (UCC_OK == status) {
*addr = rdma_src_addr;
}

return status;
}

7 changes: 5 additions & 2 deletions src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,14 +352,17 @@ static inline ucc_status_t ucc_tl_mlx5_mcast_reliable(ucc_tl_mlx5_mcast_coll_com
return UCC_INPROGRESS;
}

ucc_status_t ucc_tl_setup_mcast(ucc_tl_mlx5_mcast_coll_comm_t *comm);
ucc_status_t ucc_tl_mlx5_probe_ip_over_ib(char* ib_dev_list,
struct sockaddr_storage *addr);

ucc_status_t ucc_tl_mlx5_setup_mcast(ucc_tl_mlx5_mcast_coll_comm_t *comm);

ucc_status_t ucc_tl_mlx5_mcast_init_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx,
ucc_tl_mlx5_mcast_coll_comm_t *comm);

ucc_status_t ucc_tl_mlx5_mcast_setup_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx,
ucc_tl_mlx5_mcast_coll_comm_t *comm);

ucc_status_t ucc_tl_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm);
ucc_status_t ucc_tl_mlx5_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm);

#endif /* TL_MLX5_MCAST_HELPER_H_ */
29 changes: 29 additions & 0 deletions src/components/tl/mlx5/tl_mlx5.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,27 @@ static ucc_config_field_t ucc_tl_mlx5_lib_config_table[] = {
ucc_offsetof(ucc_tl_mlx5_lib_config_t, qp_conf.qp_max_atomic),
UCC_CONFIG_TYPE_UINT},

{"MCAST_SX_DEPTH", "512", "Send context depth of the Mcast comm",
ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.sx_depth),
UCC_CONFIG_TYPE_INT},

{"MCAST_SX_INLINE", "128", "Minimal size for inline data send in Mcast",
ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.sx_inline),
UCC_CONFIG_TYPE_MEMUNITS},

{"MCAST_RX_DEPTH", "4096", "Recv context depth of the Mcast comm",
ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.rx_depth),
UCC_CONFIG_TYPE_INT},

{"MCAST_POST_RECV_THRESH", "64",
"Threshold for posting recv into rx ctx of the Mcast comm",
ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.post_recv_thresh),
UCC_CONFIG_TYPE_INT},

{"MCAST_WINDOW_SIZE", "64", "Reliability Mcast window size",
ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.wsize),
UCC_CONFIG_TYPE_INT},

{NULL}};

static ucc_config_field_t ucc_tl_mlx5_context_config_table[] = {
Expand All @@ -77,6 +98,14 @@ static ucc_config_field_t ucc_tl_mlx5_context_config_table[] = {
ucc_offsetof(ucc_tl_mlx5_context_config_t, devices),
UCC_CONFIG_TYPE_STRING_ARRAY},

{"MCAST_TIMEOUT", "10000", "Timeout [usec] for the reliability NACK in Mcast",
ucc_offsetof(ucc_tl_mlx5_context_config_t, mcast_ctx_conf.timeout),
UCC_CONFIG_TYPE_INT},

{"MCAST_NET_DEVICE", "", "Specifies which network device to use for Mcast",
ucc_offsetof(ucc_tl_mlx5_context_config_t, mcast_ctx_conf.ib_dev_name),
UCC_CONFIG_TYPE_STRING},

{NULL}};

UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_mlx5_lib_t, ucc_base_lib_t,
Expand Down
1 change: 1 addition & 0 deletions src/utils/ucc_compiler_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#define ucc_snprintf_safe snprintf
#define ucc_likely ucs_likely
#define ucc_unlikely ucs_unlikely
#define ucc_string_split ucs_string_split

/**
* Prevent compiler from reordering instructions
Expand Down

0 comments on commit 46a6e56

Please sign in to comment.