From 03831440019db047fd9432e13e226a69bc15d07c Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 7 Jan 2025 22:59:19 -0600 Subject: [PATCH] ch4/ofi: fix the av table assumptions Because we insert all remote endpoints into all local endpoints at the same time, and thus follow the exact same insertion order, all local endpoints will share the same av table index for a given remote endpoint, except for the local root endpoint, because it has already inserted the other remote root endpoints at init time. The addresses from the local root to remote non-root endpoints will have a fixed offset from those of the local non-root endpoints. --- src/mpid/ch4/netmod/ofi/ofi_pre.h | 9 +++++++-- src/mpid/ch4/netmod/ofi/ofi_vci.c | 19 +++++++++++++++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h index 92166fe33dc..52f205e104f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_pre.h +++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h @@ -311,9 +311,14 @@ typedef struct { /* Maximum number of network interfaces CH4 can support. */ #define MPIDI_OFI_MAX_NICS 8 +/* Imagine a dimension of [local_vci][local_nic][rank][vci][nic] - + * all local endpoints will share the same remote address due to the same insertion order + * and use of FI_AV_TABLE except the local root endpoint. 
+ */ typedef struct { - fi_addr_t root_dest; - fi_addr_t *all_dest; /* to be allocated into an array of [nic * vci] */ + fi_addr_t root_dest; /* [0][0][r][0][0] */ + fi_addr_t root_offset; /* [0][0][r][vci][nic] - [*][*][r][vci][nic] */ + fi_addr_t *all_dest; /* [*][*][r][vci][nic] */ } MPIDI_OFI_addr_t; #endif /* OFI_PRE_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_vci.c b/src/mpid/ch4/netmod/ofi/ofi_vci.c index 9f52d12ae7a..baa65ccac3c 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_vci.c +++ b/src/mpid/ch4/netmod/ofi/ofi_vci.c @@ -231,12 +231,13 @@ static int addr_exchange_all_ctx(MPIR_Comm * comm, int *all_num_vcis) /* insert and store non-root nic/vci on the root context */ for (int r = 0; r < nprocs; r++) { + fi_addr_t expect_addr = FI_ADDR_NOTAVAIL; + fi_addr_t root_offset = 0; GET_AV_AND_ADDRNAMES(r); /* for each remote endpoints */ for (int nic = 0; nic < num_nics; nic++) { for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) { /* for each local endpoints */ - fi_addr_t expect_addr = FI_ADDR_NOTAVAIL; for (int nic_local = 0; nic_local < num_nics; nic_local++) { for (int vci_local = 0; vci_local < my_num_vcis; vci_local++) { /* skip root */ @@ -245,18 +246,28 @@ static int addr_exchange_all_ctx(MPIR_Comm * comm, int *all_num_vcis) } int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic_local); DO_AV_INSERT(ctx_idx, nic, vci); - /* we expect all resulting addr to be the same */ + /* we expect all resulting addr to be the same except for local root endpoint, which + * will have an offset */ if (expect_addr == FI_ADDR_NOTAVAIL) { expect_addr = addr; + } else if (nic_local == 0 && vci_local == 0) { + if (root_offset == 0) { + root_offset = addr - expect_addr; + } else { + MPIR_Assert(addr == expect_addr + root_offset); + } } else { - MPIR_Assert(expect_addr == addr); + MPIR_Assert(addr == expect_addr); } } } MPIR_Assert(expect_addr != FI_ADDR_NOTAVAIL); - MPIDI_OFI_AV_ADDR_NONROOT(av, vci, nic) = expect_addr; + MPIDI_OFI_AV_ADDR_NO_OFFSET(av, vci, nic) = 
expect_addr; + /* next */ + expect_addr++; } } + MPIDI_OFI_AV(av).root_offset = root_offset; } mpi_errno = MPIR_Barrier_fallback(comm, MPIR_ERR_NONE);