diff --git a/src/include/mpir_hwtopo.h b/src/include/mpir_hwtopo.h
index 272d26d403e..0e101d15d6e 100644
--- a/src/include/mpir_hwtopo.h
+++ b/src/include/mpir_hwtopo.h
@@ -12,6 +12,7 @@ typedef enum {
     MPIR_HWTOPO_TYPE__NODE,
     MPIR_HWTOPO_TYPE__PACKAGE,
     MPIR_HWTOPO_TYPE__SOCKET,
+    MPIR_HWTOPO_TYPE__GROUP,
     MPIR_HWTOPO_TYPE__CPU,
     MPIR_HWTOPO_TYPE__CORE,
     MPIR_HWTOPO_TYPE__HWTHREAD,
@@ -146,4 +147,33 @@ bool MPIR_hwtopo_is_dev_close_by_pci(int domain, int bus, int dev, int func);
  * Return the global id of the first non-io object above the PCI device
  */
 MPIR_hwtopo_gid_t MPIR_hwtopo_get_dev_parent_by_pci(int domain, int bus, int dev, int func);
+
+/*
+ * Return the number of numa nodes.
+ * This function is used to determine if a node is in SPR SNC4 mode.
+ * 0 is returned when hwloc support is not compiled in.
+ */
+int MPIR_hwtopo_get_num_numa_nodes(void);
+
+/*
+ * Return the global id of the group ancestor of the first bound PU.
+ * This function is used for nic binding in SPR SNC4 mode.
+ * If the PU has no group ancestor, the global id of the topmost object is returned.
+ * MPIR_HWTOPO_GID_ROOT is returned when the first bound PU cannot be determined
+ * or when hwloc support is not compiled in.
+ */
+MPIR_hwtopo_gid_t MPIR_hwtopo_get_first_pu_group(void);
+
+/*
+ * Return the global id of the socket ancestor of the passed gid.
+ * MPIR_HWTOPO_GID_ROOT is returned when no socket is found.
+ */
+MPIR_hwtopo_gid_t MPIR_hwtopo_get_parent_socket(MPIR_hwtopo_gid_t gid);
+
+/*
+ * Return the local index of my nic in my first non io ancestor, i.e. the number
+ * of network devices counted before the given PCI device below that ancestor.
+ * 0 is returned when hwloc support is not compiled in.
+ */
+int MPIR_hwtopo_get_pci_network_lid(int domain, int bus, int dev, int func);
 #endif /* MPIR_HWTOPO_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_nic.c b/src/mpid/ch4/netmod/ofi/ofi_nic.c
index 8ab3ea5c1be..ea0ce347cc9 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_nic.c
+++ b/src/mpid/ch4/netmod/ofi/ofi_nic.c
@@ -31,6 +31,57 @@ static bool is_nic_close(struct fi_info *info)
     return MPIR_hwtopo_is_dev_close_by_name(info->domain_attr->name);
 }
 
+#ifdef HAVE_LIBFABRIC_NIC
+/* Return true if the NIC is close to the group of the calling process.
+ *
+ * nic_info    - NIC entry whose nic and parent fields are already filled in
+ * num_parents - number of distinct NIC parent topology objects that were found
+ *
+ * The only caller, setup_multi_nic(), is compiled under HAVE_LIBFABRIC_NIC; the
+ * same guard is used here so that the helper is not left unused otherwise. */
+static bool is_nic_close_snc4(const MPIDI_OFI_nic_info_t * nic_info, int num_parents)
+{
+    /* Look up the group of the first bound PU once and reuse it below */
+    MPIR_hwtopo_gid_t rank_group_gid = MPIR_hwtopo_get_first_pu_group();
+    MPIR_hwtopo_gid_t nic_socket_gid = MPIR_hwtopo_get_parent_socket(nic_info->parent);
+    MPIR_hwtopo_gid_t rank_socket_gid = MPIR_hwtopo_get_parent_socket(rank_group_gid);
+
+    /* A nic on a different socket than the rank is never considered close */
+    if (nic_socket_gid != rank_socket_gid)
+        return false;
+
+    /* On using a different configuration than having 4 num_parents, simply
+     * compare parent socket of the nic and the rank */
+    if (num_parents != 4)
+        return true;
+
+    /* In SNC4 mode, when there are 4 groups that have nics, it means that there are 4
+     * other adjacent groups with no nics. This leads to each set of 2 groups having 2 nics
+     * such that, the first group has no nics and the second group has 2 nics.
+     * The correct assignment strategy is such that the 2 nics of the second group are
+     * considered close to the ranks on both the groups. */
+    int nic_group_lid = MPIR_hwtopo_get_lid(nic_info->parent);
+    int rank_group_lid = MPIR_hwtopo_get_lid(rank_group_gid);
+    int group_offset = nic_group_lid - rank_group_lid;
+    if (group_offset != 0 && group_offset != 1)
+        return false;
+
+    /* Without PCI bus information the nic cannot be ordered inside its group */
+    struct fi_info *info = (struct fi_info *) (nic_info->nic);
+    if (info->nic == NULL || info->nic->bus_attr == NULL ||
+        info->nic->bus_attr->bus_type != FI_BUS_PCI)
+        return false;
+
+    struct fi_pci_attr pci = info->nic->bus_attr->attr.pci;
+    int nic_lid = MPIR_hwtopo_get_pci_network_lid(pci.domain_id, pci.bus_id,
+                                                  pci.device_id, pci.function_id);
+
+    /* Map 1st nic of the group to the previous group and
+     * 2nd nic of the group to the current group */
+    return (nic_lid == 0 && group_offset == 1) || (nic_lid == 1 && group_offset == 0);
+}
+#endif /* HAVE_LIBFABRIC_NIC */
+
 /* Comparison function for NIC names. Used in qsort() */
 static int compare_nic_names(const void *info1, const void *info2)
 {
@@ -170,6 +221,19 @@ static int setup_single_nic(void)
 }
 
 #ifdef HAVE_LIBFABRIC_NIC
+/* Comparison function for NICs in SPR SNC4 mode: close NICs first, then by NIC name. Used in qsort(). */
+static int compare_nics_snc4(const void *nic1, const void *nic2)
+{
+    const MPIDI_OFI_nic_info_t *i1 = (const MPIDI_OFI_nic_info_t *) nic1;
+    const MPIDI_OFI_nic_info_t *i2 = (const MPIDI_OFI_nic_info_t *) nic2;
+
+    if (i1->close && !i2->close)
+        return -1;
+    else if (i2->close && !i1->close)
+        return 1;
+    return compare_nic_names(&(i1->nic), &(i2->nic));
+}
+
 /* TODO: Now that multiple NICs are detected, sort them based on preferred-ness,
  * closeness and count of other processes using the NIC. */
 static int setup_multi_nic(int nic_count)
@@ -195,24 +259,26 @@ static int setup_multi_nic(int nic_count)
         MPIDI_OFI_global.num_nics = MPIR_CVAR_CH4_OFI_MAX_NICS;
     }
 
-    /* Now go through every NIC and set initial information
-     * from current process's perspective */
-    for (int i = 0; i < MPIDI_OFI_global.num_nics; ++i) {
-        nics[i].nic = MPIDI_OFI_global.prov_use[i];
-        nics[i].id = i;
-        /* Determine NIC's "closeness" to current process */
-        nics[i].close = is_nic_close(nics[i].nic);
-        if (nics[i].close)
-            MPIDI_OFI_global.num_close_nics++;
-        /* Set the preference of all NICs to least preferable (lower is more preferable) */
-        nics[i].prefer = MPIDI_OFI_global.num_nics + 1;
-        nics[i].count = 0;
-        nics[i].num_close_ranks = 0;
-        /* Determine NIC's first normal parent topology
-         * item (e.g., typically the socket parent) */
-        nics[i].parent = get_nic_parent(nics[i].nic);
-        /* Expand list of close NIC-parent topology items or increment */
-        if (nics[i].close) {
+    int num_numa_nodes = MPIR_hwtopo_get_num_numa_nodes();
+    bool is_snc4_with_cxi_nics = false;
+
+    if ((num_numa_nodes == 8 || num_numa_nodes == 16))
+        if (MPIDI_OFI_global.num_nics > 1)
+            if (strstr(MPIDI_OFI_global.prov_use[0]->domain_attr->name, "cxi"))
+                is_snc4_with_cxi_nics = true;
+
+    /* Special case of nic assignment for SPR in SNC4 mode */
+    if (is_snc4_with_cxi_nics) {
+        for (int i = 0; i < MPIDI_OFI_global.num_nics; ++i) {
+            nics[i].nic = MPIDI_OFI_global.prov_use[i];
+            nics[i].id = i;
+            /* Set the preference of all NICs to least preferable (lower is more preferable) */
+            nics[i].prefer = MPIDI_OFI_global.num_nics + 1;
+            nics[i].count = 0;
+            nics[i].num_close_ranks = 0;
+
+            nics[i].parent = get_nic_parent(nics[i].nic);
+
             int found = 0;
             for (int j = 0; j < num_parents; ++j) {
                 if (parents[j] == nics[i].parent) {
@@ -225,6 +291,47 @@ static int setup_multi_nic(int nic_count)
                 num_parents++;
             }
         }
+        /* Use num_parents to determine nic closeness */
+        for (int i = 0; i < MPIDI_OFI_global.num_nics; ++i) {
+            nics[i].close = is_nic_close_snc4(&nics[i], num_parents);
+            if (nics[i].close)
+                MPIDI_OFI_global.num_close_nics++;
+        }
+
+    } else {
+        /* General case of nic assignment */
+
+        /* Now go through every NIC and set initial information
+         * from current process's perspective */
+        for (int i = 0; i < MPIDI_OFI_global.num_nics; ++i) {
+            nics[i].nic = MPIDI_OFI_global.prov_use[i];
+            nics[i].id = i;
+            /* Determine NIC's "closeness" to current process */
+            nics[i].close = is_nic_close(nics[i].nic);
+            if (nics[i].close)
+                MPIDI_OFI_global.num_close_nics++;
+            /* Set the preference of all NICs to least preferable (lower is more preferable) */
+            nics[i].prefer = MPIDI_OFI_global.num_nics + 1;
+            nics[i].count = 0;
+            nics[i].num_close_ranks = 0;
+            /* Determine NIC's first normal parent topology
+             * item (e.g., typically the socket parent) */
+            nics[i].parent = get_nic_parent(nics[i].nic);
+            /* Expand list of close NIC-parent topology items or increment */
+            if (nics[i].close) {
+                int found = 0;
+                for (int j = 0; j < num_parents; ++j) {
+                    if (parents[j] == nics[i].parent) {
+                        found = 1;
+                        break;
+                    }
+                }
+                if (!found) {
+                    parents[num_parents] = nics[i].parent;
+                    num_parents++;
+                }
+            }
+        }
     }
 
     /* If there were zero NICs on my socket, then just consider every NIC close
@@ -237,7 +344,13 @@ static int setup_multi_nic(int nic_count)
 
     /* Sort the NICs array based on closeness first. This way all the close
      * NICs are at the beginning of the array */
-    qsort(nics, MPIDI_OFI_global.num_nics, sizeof(nics[0]), compare_nics);
+    if (is_snc4_with_cxi_nics) {
+        /* Use a separate sorting function for snc4 nics in order to just compare
+         * closeness followed by nic name */
+        qsort(nics, MPIDI_OFI_global.num_nics, sizeof(nics[0]), compare_nics_snc4);
+    } else {
+        qsort(nics, MPIDI_OFI_global.num_nics, sizeof(nics[0]), compare_nics);
+    }
 
     /* Because we cannot communicate with the other local processes to avoid collisions with the
      * same NICs, just shift NICs that have multiple close NICs around according to their local
diff --git a/src/util/mpir_hwtopo.c b/src/util/mpir_hwtopo.c
index 33e88bcf49d..2279a415bd8 100644
--- a/src/util/mpir_hwtopo.c
+++ b/src/util/mpir_hwtopo.c
@@ -106,6 +106,9 @@ static hwloc_obj_type_t get_hwloc_obj_type(MPIR_hwtopo_type_e type)
         case MPIR_HWTOPO_TYPE__CPU:
             hwloc_obj_type = HWLOC_OBJ_PACKAGE;
             break;
+        case MPIR_HWTOPO_TYPE__GROUP:
+            hwloc_obj_type = HWLOC_OBJ_GROUP;
+            break;
         case MPIR_HWTOPO_TYPE__CORE:
             hwloc_obj_type = HWLOC_OBJ_CORE;
             break;
@@ -320,6 +323,7 @@ MPIR_hwtopo_type_e MPIR_hwtopo_get_type_id(const char *name)
         {"machine", MPIR_HWTOPO_TYPE__NODE},
         {"socket", MPIR_HWTOPO_TYPE__SOCKET},
         {"package", MPIR_HWTOPO_TYPE__PACKAGE},
+        {"group", MPIR_HWTOPO_TYPE__GROUP},
         {"cpu", MPIR_HWTOPO_TYPE__CPU},
         {"core", MPIR_HWTOPO_TYPE__CORE},
         {"hwthread", MPIR_HWTOPO_TYPE__HWTHREAD},
@@ -628,3 +632,149 @@ MPIR_hwtopo_gid_t MPIR_hwtopo_get_dev_parent_by_pci(int domain, int bus, int dev
 #endif
     return gid;
 }
+
+int MPIR_hwtopo_get_num_numa_nodes(void)
+{
+    int num_numa_nodes = 0;
+
+#ifdef HAVE_HWLOC
+    MPIR_hwtopo_gid_t gid = MPIR_hwtopo_get_obj_by_name("node");
+    hwloc_obj_t obj =
+        hwloc_get_obj_by_depth(hwloc_topology, HWTOPO_GET_DEPTH(gid), HWTOPO_GET_INDEX(gid));
+
+    hwloc_obj_t tmp = NULL;
+
+    /* Count the numa nodes whose os index is set in the nodeset of the "node" object */
+    while ((tmp = hwloc_get_next_obj_by_type(hwloc_topology, HWLOC_OBJ_NUMANODE, tmp)) != NULL) {
+        if (hwloc_bitmap_isset(obj->nodeset, tmp->os_index)) {
+            num_numa_nodes++;
+        }
+    }
+#endif
+    return num_numa_nodes;
+}
+
+MPIR_hwtopo_gid_t MPIR_hwtopo_get_first_pu_group(void)
+{
+    MPIR_hwtopo_gid_t gid = MPIR_HWTOPO_GID_ROOT;
+#ifdef HAVE_HWLOC
+    hwloc_obj_t obj = NULL;
+    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
+
+    /* Find the PU object of the first cpu this process is bound to. obj stays NULL
+     * when the bitmap cannot be allocated, the binding cannot be queried or the
+     * binding is empty (hwloc_bitmap_first() returns -1 in that case). */
+    if (cpuset != NULL) {
+        int err = hwloc_get_proc_cpubind(hwloc_topology, getpid(), cpuset, HWLOC_CPUBIND_PROCESS);
+        int first_pu = (err == 0) ? hwloc_bitmap_first(cpuset) : -1;
+        if (first_pu >= 0)
+            obj = hwloc_get_pu_obj_by_os_index(hwloc_topology, (unsigned) first_pu);
+        /* The bitmap is only needed for the lookup above; release it */
+        hwloc_bitmap_free(cpuset);
+    }
+
+    /* Traverse up the PU object until a group object is reached */
+    while (obj && obj->type != HWLOC_OBJ_GROUP && obj->parent)
+        obj = obj->parent;
+
+    /* Keep the root gid when no PU object could be determined */
+    if (obj)
+        gid = HWTOPO_GET_GID(get_type_class(obj->type), obj->depth, obj->logical_index);
+#endif
+    return gid;
+}
+
+MPIR_hwtopo_gid_t MPIR_hwtopo_get_parent_socket(MPIR_hwtopo_gid_t gid)
+{
+    MPIR_hwtopo_gid_t parent_gid = MPIR_HWTOPO_GID_ROOT;
+#ifdef HAVE_HWLOC
+    hwloc_obj_t obj =
+        hwloc_get_obj_by_depth(hwloc_topology, HWTOPO_GET_DEPTH(gid), HWTOPO_GET_INDEX(gid));
+
+    /* Walk up from the object itself until a package or the topmost object is reached */
+    while (obj && obj->parent && obj->type != HWLOC_OBJ_PACKAGE)
+        obj = obj->parent;
+
+    /* obj is NULL when the lookup by depth and index found no object */
+    if (obj && obj->type == HWLOC_OBJ_PACKAGE)
+        parent_gid = HWTOPO_GET_GID(get_type_class(obj->type), obj->depth, obj->logical_index);
+#endif
+    return parent_gid;
+}
+
+#ifdef HAVE_HWLOC
+/* Convert a hwloc object into its global id */
+static MPIR_hwtopo_gid_t obj_to_gid(hwloc_obj_t obj)
+{
+    hwtopo_class_e class = get_type_class(obj->type);
+    return HWTOPO_GET_GID(class, obj->depth, obj->logical_index);
+}
+
+/* Count the network os devices in the subtree rooted at obj (obj included) */
+static int get_number_of_nics_below_me(hwloc_obj_t obj)
+{
+    int num = 0;
+
+    /* Found a network device, increment by 1. attr is a union, so its osdev
+     * member is only meaningful for os device objects. */
+    if (obj->type == HWLOC_OBJ_OS_DEVICE && obj->attr &&
+        obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK)
+        num++;
+
+    /* Find network devices among all my 'regular' children */
+    for (unsigned i = 0; i < obj->arity; i++) {
+        num += get_number_of_nics_below_me(obj->children[i]);
+    }
+
+    /* Find network devices among all my io children */
+    hwloc_obj_t io_child = obj->io_first_child;
+    while (io_child) {
+        num += get_number_of_nics_below_me(io_child);
+        io_child = io_child->next_sibling;
+    }
+    return num;
+}
+#endif
+
+int MPIR_hwtopo_get_pci_network_lid(int domain, int bus, int dev, int func)
+{
+    int myIndex = 0;
+#ifdef HAVE_HWLOC
+    hwloc_obj_t my_io_device = hwloc_get_pcidev_by_busid(hwloc_topology, domain, bus, dev, func);
+    MPIR_Assert(my_io_device);
+    hwloc_obj_t my_first_non_io = hwloc_get_non_io_ancestor_obj(hwloc_topology, my_io_device);
+    MPIR_Assert(my_first_non_io);
+
+    MPIR_hwtopo_gid_t my_parent_gid = obj_to_gid(my_first_non_io);
+    hwloc_obj_t io_device = my_io_device;
+
+    /* Determine the number of network devices before me in my first non io ancestor. This
+     * can be used to determine my local network nic, which is used for nic mapping.
+     * First, look for network devices among my previous siblings. */
+    while (io_device->prev_sibling) {
+        MPIR_hwtopo_gid_t prev_sibling_parent_gid =
+            obj_to_gid(hwloc_get_non_io_ancestor_obj(hwloc_topology, io_device->prev_sibling));
+
+        if (my_parent_gid != prev_sibling_parent_gid)
+            break;
+
+        myIndex += get_number_of_nics_below_me(io_device->prev_sibling);
+        io_device = io_device->prev_sibling;
+    }
+
+    /* Next, look for network devices among my previous cousins.
+     * NOTE(review): a previous sibling of the same object type is presumably also
+     * a previous cousin and would then be counted by both loops - confirm. */
+    io_device = my_io_device;
+    while (io_device->prev_cousin) {
+        MPIR_hwtopo_gid_t prev_cousin_parent_gid =
+            obj_to_gid(hwloc_get_non_io_ancestor_obj(hwloc_topology, io_device->prev_cousin));
+
+        if (my_parent_gid != prev_cousin_parent_gid)
+            break;
+
+        myIndex += get_number_of_nics_below_me(io_device->prev_cousin);
+        io_device = io_device->prev_cousin;
+    }
+#endif
+    return myIndex;
+}