From 10a58a0e8cb828be180d4ea7048d801e8a2c2bb4 Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Wed, 4 Dec 2019 12:04:17 +0200 Subject: [PATCH] SHM: refactoring for reachable test --- src/ucs/sys/sys.c | 16 +++++++------- src/ucs/sys/sys.h | 9 +++++--- src/uct/sm/base/sm_iface.c | 2 +- src/uct/sm/cma/cma_ep.c | 3 ++- src/uct/sm/cma/cma_iface.c | 40 +++++++++++++++++++++++++++++++--- src/uct/sm/cma/cma_iface.h | 3 +++ src/uct/sm/mm/base/mm_iface.c | 22 ++++++++++++++++++- src/uct/sm/mm/base/mm_md.h | 10 +++++++++ src/uct/sm/mm/posix/mm_posix.c | 37 ++++++++++++++++++++++++++----- src/uct/sm/mm/sysv/mm_sysv.c | 3 ++- src/uct/sm/mm/xpmem/mm_xpmem.c | 3 ++- 11 files changed, 124 insertions(+), 24 deletions(-) diff --git a/src/ucs/sys/sys.c b/src/ucs/sys/sys.c index b7851478dba0..7f6190050c5d 100644 --- a/src/ucs/sys/sys.c +++ b/src/ucs/sys/sys.c @@ -48,9 +48,9 @@ struct { - const char *name; - ino_t ino; - ino_t dflt; + const char *name; + ucs_sys_ns_t ino; + ucs_sys_ns_t dflt; } static ucs_sys_namespace_info[] = { [UCS_SYS_NS_IPC] = {.name = "ipc", .ino = 0, .dflt = UCS_PROCESS_NS_FIRST - 1}, [UCS_SYS_NS_MNT] = {.name = "mnt", .ino = 0, .dflt = UCS_PROCESS_NS_FIRST - 0}, @@ -1180,13 +1180,13 @@ void ucs_sys_cpuset_copy(ucs_cpu_set_t *dst, const ucs_sys_cpuset_t *src) } } -ino_t ucs_sys_get_ns(ucs_sys_get_ns_name_t name) +ucs_sys_ns_t ucs_sys_get_ns(ucs_sys_ns_name_t name) { static ucs_init_once_t init_once = UCS_INIT_ONCE_INITIALIZER; char filename[MAXPATHLEN]; int res; struct stat st; - ucs_sys_get_ns_name_t ns; + ucs_sys_ns_name_t ns; if (name >= UCS_SYS_NS_LAST) { return 0; @@ -1199,7 +1199,7 @@ ino_t ucs_sys_get_ns(ucs_sys_get_ns_name_t name) res = stat(filename, &st); if (res == 0) { - ucs_sys_namespace_info[ns].ino = st.st_ino; + ucs_sys_namespace_info[ns].ino = (ucs_sys_ns_t)st.st_ino; } else { ucs_sys_namespace_info[ns].ino = ucs_sys_namespace_info[ns].dflt; } @@ -1209,9 +1209,9 @@ ino_t ucs_sys_get_ns(ucs_sys_get_ns_name_t name) return ucs_sys_namespace_info[name].ino; } -int ucs_sys_ns_is_root(ucs_sys_get_ns_name_t name) +int ucs_sys_ns_is_root(ucs_sys_ns_name_t name) { - ino_t ns = ucs_sys_get_ns(name); + ucs_sys_ns_t ns = ucs_sys_get_ns(name); return ns == ucs_sys_namespace_info[name].dflt; } diff --git a/src/ucs/sys/sys.h b/src/ucs/sys/sys.h index 34d13eb02ed9..9ec6cf7cd0fd 100644 --- a/src/ucs/sys/sys.h +++ b/src/ucs/sys/sys.h @@ -67,6 +67,9 @@ BEGIN_C_DECLS /** @file sys.h */ +typedef ino_t ucs_sys_ns_t; + + typedef enum { UCS_SYS_NS_IPC, UCS_SYS_NS_MNT, @@ -75,7 +78,7 @@ typedef enum { UCS_SYS_NS_USER, UCS_SYS_NS_UTS, UCS_SYS_NS_LAST -} ucs_sys_get_ns_name_t; +} ucs_sys_ns_name_t; /** @@ -431,7 +434,7 @@ void ucs_sys_cpuset_copy(ucs_cpu_set_t *dst, const ucs_sys_cpuset_t *src); * * @return namespace value or 0 if namespaces are not supported */ -ino_t ucs_sys_get_ns(ucs_sys_get_ns_name_t name); +ucs_sys_ns_t ucs_sys_get_ns(ucs_sys_ns_name_t name); /** @@ -441,7 +444,7 @@ ino_t ucs_sys_get_ns(ucs_sys_get_ns_name_t name); * * @return 1 in case if namespace is root, 0 - in other cases */ -int ucs_sys_ns_is_root(ucs_sys_get_ns_name_t name); +int ucs_sys_ns_is_root(ucs_sys_ns_name_t name); END_C_DECLS diff --git a/src/uct/sm/base/sm_iface.c b/src/uct/sm/base/sm_iface.c index 7bfb11a5974d..e8e548afad18 100644 --- a/src/uct/sm/base/sm_iface.c +++ b/src/uct/sm/base/sm_iface.c @@ -25,7 +25,7 @@ typedef struct { typedef struct { ucs_sm_iface_base_device_addr_t super; - ino_t ipc_ns; + ucs_sys_ns_t ipc_ns; } ucs_sm_iface_ext_device_addr_t; diff --git a/src/uct/sm/cma/cma_ep.c b/src/uct/sm/cma/cma_ep.c index aff9ea2842a2..7c079e90f4f4 100644 --- a/src/uct/sm/cma/cma_ep.c +++ b/src/uct/sm/cma/cma_ep.c @@ -26,7 +26,8 @@ static UCS_CLASS_INIT_FUNC(uct_cma_ep_t, const uct_ep_params_t *params) "UCT_EP_PARAM_FIELD_IFACE_ADDR and UCT_EP_PARAM_FIELD_DEV_ADDR are not defined"); UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super.super); - self->remote_pid = *(const pid_t*)params->iface_addr; + self->remote_pid = *(const pid_t*)params->iface_addr & + ~UCT_CMA_IFACE_ADDR_FLAG_PID_NS; return UCS_OK; } diff --git a/src/uct/sm/cma/cma_iface.c b/src/uct/sm/cma/cma_iface.c index ab000d48dc93..00fec9393719 100644 --- a/src/uct/sm/cma/cma_iface.c +++ b/src/uct/sm/cma/cma_iface.c @@ -12,6 +12,12 @@ #include +typedef struct { + uint32_t pid_ns:1; /* PID ns is used */ + uint32_t pid :31; +} uct_cma_iface_addr_t; + + static ucs_config_field_t uct_cma_iface_config_table[] = { {"", "ALLOC=huge,thp,mmap,heap;BW=11145MBs", NULL, ucs_offsetof(uct_cma_iface_config_t, super), @@ -23,7 +29,13 @@ static ucs_config_field_t uct_cma_iface_config_table[] = { static ucs_status_t uct_cma_iface_get_address(uct_iface_t *tl_iface, uct_iface_addr_t *addr) { - *(pid_t*)addr = getpid(); + pid_t *iface_addr = (void*)addr; + + *iface_addr = getpid() & ~UCT_CMA_IFACE_ADDR_FLAG_PID_NS; + if (!ucs_sys_ns_is_root(UCS_SYS_NS_PID)) { + *iface_addr |= UCT_CMA_IFACE_ADDR_FLAG_PID_NS; + *(ucs_sys_ns_t*)(iface_addr + 1) = ucs_sys_get_ns(UCS_SYS_NS_PID); + } return UCS_OK; } @@ -51,7 +63,9 @@ static ucs_status_t uct_cma_iface_query(uct_iface_h tl_iface, iface_attr->cap.am.opt_zcopy_align = 1; iface_attr->cap.am.align_mtu = iface_attr->cap.am.opt_zcopy_align; - iface_attr->iface_addr_len = sizeof(pid_t); + iface_attr->iface_addr_len = sizeof(pid_t) + + (ucs_sys_ns_is_root(UCS_SYS_NS_PID) ? + 0 : sizeof(ucs_sys_ns_t)); iface_attr->device_addr_len = uct_sm_iface_get_device_addr_len(); iface_attr->ep_addr_len = 0; iface_attr->max_conn_priv = 0; @@ -68,6 +82,26 @@ static ucs_status_t uct_cma_iface_query(uct_iface_h tl_iface, return UCS_OK; } +static int +uct_cma_iface_is_reachable(const uct_iface_h tl_iface, + const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *tl_iface_addr) +{ + pid_t *iface_addr = (void*)tl_iface_addr; + int reachable; + + reachable = uct_sm_iface_is_reachable(tl_iface, dev_addr, tl_iface_addr); + if (!reachable) { + return 0; + } + + if (*iface_addr & UCT_CMA_IFACE_ADDR_FLAG_PID_NS) { + return ucs_sys_get_ns(UCS_SYS_NS_PID) == *(ucs_sys_ns_t*)(iface_addr + 1); + } + + return ucs_sys_ns_is_root(UCS_SYS_NS_PID); +} + static UCS_CLASS_DECLARE_DELETE_FUNC(uct_cma_iface_t, uct_iface_t); static uct_iface_ops_t uct_cma_iface_ops = { @@ -88,7 +122,7 @@ static uct_iface_ops_t uct_cma_iface_ops = { .iface_query = uct_cma_iface_query, .iface_get_address = uct_cma_iface_get_address, .iface_get_device_address = uct_sm_iface_get_device_address, - .iface_is_reachable = uct_sm_iface_is_reachable + .iface_is_reachable = uct_cma_iface_is_reachable }; static UCS_CLASS_INIT_FUNC(uct_cma_iface_t, uct_md_h md, uct_worker_h worker, diff --git a/src/uct/sm/cma/cma_iface.h b/src/uct/sm/cma/cma_iface.h index 5d175f67cb9b..f14a046da965 100644 --- a/src/uct/sm/cma/cma_iface.h +++ b/src/uct/sm/cma/cma_iface.h @@ -11,6 +11,9 @@ #include +#define UCT_CMA_IFACE_ADDR_FLAG_PID_NS UCS_BIT(31) /* use PID NS in address */ + + typedef struct uct_cma_iface_config { uct_sm_iface_config_t super; } uct_cma_iface_config_t; diff --git a/src/uct/sm/mm/base/mm_iface.c b/src/uct/sm/mm/base/mm_iface.c index cb655c240ea2..4be514bc7bb7 100644 --- a/src/uct/sm/mm/base/mm_iface.c +++ b/src/uct/sm/mm/base/mm_iface.c @@ -73,6 +73,26 @@ static ucs_status_t uct_mm_iface_get_address(uct_iface_t *tl_iface, return uct_mm_md_mapper_ops(md)->iface_addr_pack(md, iface_addr + 1); } +static int +uct_mm_iface_is_reachable(const uct_iface_h tl_iface, + const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *tl_iface_addr) +{ + uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t); + uct_mm_md_t *md = ucs_derived_of(iface->super.super.md, + uct_mm_md_t); + uct_mm_iface_addr_t *iface_addr = (void*)tl_iface_addr; + int is_reachable; + + is_reachable = uct_sm_iface_is_reachable(tl_iface, dev_addr, tl_iface_addr); + if (!is_reachable) { + return 0; + } + + return uct_mm_md_mapper_ops(md)->is_reachable(md, iface_addr->fifo_seg_id, + iface_addr + 1); +} + void uct_mm_iface_release_desc(uct_recv_desc_t *self, void *desc) { void *mm_desc; @@ -343,7 +363,7 @@ static uct_iface_ops_t uct_mm_iface_ops = { .iface_query = uct_mm_iface_query, .iface_get_device_address = uct_sm_iface_get_device_address, .iface_get_address = uct_mm_iface_get_address, - .iface_is_reachable = uct_sm_iface_is_reachable + .iface_is_reachable = uct_mm_iface_is_reachable }; static void uct_mm_iface_recv_desc_init(uct_iface_h tl_iface, void *obj, diff --git a/src/uct/sm/mm/base/mm_md.h b/src/uct/sm/mm/base/mm_md.h index d26ba80841f8..2eb971e9d0f8 100644 --- a/src/uct/sm/mm/base/mm_md.h +++ b/src/uct/sm/mm/base/mm_md.h @@ -88,6 +88,15 @@ typedef ucs_status_t uct_mm_remote_seg_t *rseg); +/* Check if memory may be attached using mem_attach. seg_id is from + * 'uct_mm_seg_t' structure, and iface_addr is from iface_addr_pack() on the + * remote process + */ +typedef int +(*uct_mm_mapper_is_reachable_func_t)(uct_mm_md_t *md, uct_mm_seg_id_t seg_id, + const void *iface_addr); + + /* Clean up the remote segment handle created by mem_attach() */ typedef void (*uct_mm_mapper_mem_detach_func_t)(uct_mm_md_t *md, @@ -104,6 +113,7 @@ typedef struct uct_mm_mapper_ops { uct_mm_mapper_iface_addr_pack_func_t iface_addr_pack; uct_mm_mapper_mem_attach_func_t mem_attach; uct_mm_mapper_mem_detach_func_t mem_detach; + uct_mm_mapper_is_reachable_func_t is_reachable; } uct_mm_md_mapper_ops_t; diff --git a/src/uct/sm/mm/posix/mm_posix.c b/src/uct/sm/mm/posix/mm_posix.c index d54728ac6565..7906917c4655 100644 --- a/src/uct/sm/mm/posix/mm_posix.c +++ b/src/uct/sm/mm/posix/mm_posix.c @@ -30,13 +30,15 @@ open fd symlink from procfs */ #define UCT_POSIX_SEG_FLAG_SHM_OPEN UCS_BIT(62) /* use shm_open() rather than open() */ #define UCT_POSIX_SEG_FLAG_HUGETLB UCS_BIT(61) /* use MAP_HUGETLB */ +#define UCT_POSIX_SEG_FLAG_PID_NS UCS_BIT(60) /* use PID NS in address */ #define UCT_POSIX_SEG_FLAGS_MASK (UCT_POSIX_SEG_FLAG_PROCFS | \ UCT_POSIX_SEG_FLAG_SHM_OPEN | \ + UCT_POSIX_SEG_FLAG_PID_NS | \ UCT_POSIX_SEG_FLAG_HUGETLB) #define UCT_POSIX_SEG_MMID_MASK (~UCT_POSIX_SEG_FLAGS_MASK) /* Packing mmid for procfs mode */ -#define UCT_POSIX_PROCFS_MMID_FD_BITS 31 /* how many bits for file descriptor */ +#define UCT_POSIX_PROCFS_MMID_FD_BITS 30 /* how many bits for file descriptor */ #define UCT_POSIX_PROCFS_MMID_PID_BITS 30 /* how many bits for pid */ /* Filesystem paths */ @@ -90,8 +92,12 @@ static size_t uct_posix_iface_addr_length(uct_mm_md_t *md) * requested backing file is needed so that the user would know how much * space to allocate for the rkey. */ - return uct_posix_use_shm_open(posix_config) ? 0 : - (strlen(posix_config->dir) + 1); + if (posix_config->use_proc_link) { + return ucs_sys_ns_is_root(UCS_SYS_NS_PID) ? 0 : sizeof(ucs_sys_ns_t); + } + + return uct_posix_use_shm_open(posix_config) ? + 0 : (strlen(posix_config->dir) + 1); } static ucs_status_t uct_posix_md_query(uct_md_h tl_md, uct_md_attr_t *md_attr) @@ -354,6 +360,17 @@ uct_posix_mem_attach_common(uct_mm_seg_id_t seg_id, size_t length, return status; } +static int +uct_posix_is_reachable(uct_mm_md_t *md, uct_mm_seg_id_t seg_id, + const void *iface_addr) +{ + if (seg_id & UCT_POSIX_SEG_FLAG_PID_NS) { + return ucs_sys_get_ns(UCS_SYS_NS_PID) == *(ucs_sys_ns_t*)iface_addr; + } + + return ucs_sys_ns_is_root(UCS_SYS_NS_PID); +} + static ucs_status_t uct_posix_mem_detach_common(const uct_mm_remote_seg_t *rseg) { return uct_posix_munmap(rseg->address, (size_t)rseg->cookie); @@ -434,7 +451,9 @@ uct_posix_mem_alloc(uct_md_h tl_md, size_t *length_p, void **address_p, /* Replace mmid by pid+fd. Keep previous SHM_OPEN flag for mkey_pack() */ seg->seg_id = uct_posix_mmid_procfs_pack(fd) | (seg->seg_id & UCT_POSIX_SEG_FLAG_SHM_OPEN) | - UCT_POSIX_SEG_FLAG_PROCFS; + UCT_POSIX_SEG_FLAG_PROCFS | + (ucs_sys_ns_is_root(UCS_SYS_NS_PID) ? 0 : + UCT_POSIX_SEG_FLAG_PID_NS); } /* mmap the shared memory segment that was created by shm_open */ @@ -544,6 +563,13 @@ static ucs_status_t uct_posix_iface_addr_pack(uct_mm_md_t *md, void *buffer) const uct_posix_md_config_t *posix_config = ucs_derived_of(md->config, uct_posix_md_config_t); + if (posix_config->use_proc_link) { + if (!ucs_sys_ns_is_root(UCS_SYS_NS_PID)) { + *(ucs_sys_ns_t*)buffer = ucs_sys_get_ns(UCS_SYS_NS_PID); + } + return UCS_OK; + } + if (!uct_posix_use_shm_open(posix_config)) { uct_posix_copy_dir(md, buffer); } @@ -640,7 +666,8 @@ static uct_mm_md_mapper_ops_t uct_posix_md_ops = { .iface_addr_length = uct_posix_iface_addr_length, .iface_addr_pack = uct_posix_iface_addr_pack, .mem_attach = uct_posix_mem_attach, - .mem_detach = uct_posix_mem_detach + .mem_detach = uct_posix_mem_detach, + .is_reachable = uct_posix_is_reachable }; UCT_MM_TL_DEFINE(posix, &uct_posix_md_ops, uct_posix_rkey_unpack, diff --git a/src/uct/sm/mm/sysv/mm_sysv.c b/src/uct/sm/mm/sysv/mm_sysv.c index daaa3eeb9df1..bfce99fe0cad 100644 --- a/src/uct/sm/mm/sysv/mm_sysv.c +++ b/src/uct/sm/mm/sysv/mm_sysv.c @@ -189,7 +189,8 @@ static uct_mm_md_mapper_ops_t uct_sysv_md_ops = { .iface_addr_pack = (uct_mm_mapper_iface_addr_pack_func_t) ucs_empty_function_return_success, .mem_attach = uct_sysv_mem_attach, - .mem_detach = uct_sysv_mem_detach + .mem_detach = uct_sysv_mem_detach, + .is_reachable = (uct_mm_mapper_is_reachable_func_t)ucs_empty_function_return_one }; UCT_MM_TL_DEFINE(sysv, &uct_sysv_md_ops, uct_sysv_rkey_unpack, diff --git a/src/uct/sm/mm/xpmem/mm_xpmem.c b/src/uct/sm/mm/xpmem/mm_xpmem.c index 09e9399839fd..5a4885de0f36 100644 --- a/src/uct/sm/mm/xpmem/mm_xpmem.c +++ b/src/uct/sm/mm/xpmem/mm_xpmem.c @@ -538,7 +538,8 @@ static uct_mm_md_mapper_ops_t uct_xpmem_md_ops = { .iface_addr_length = uct_xpmem_iface_addr_length, .iface_addr_pack = uct_xpmem_iface_addr_pack, .mem_attach = uct_xpmem_mem_attach, - .mem_detach = uct_xpmem_mem_detach + .mem_detach = uct_xpmem_mem_detach, + .is_reachable = (uct_mm_mapper_is_reachable_func_t)ucs_empty_function_return_one }; UCT_MM_TL_DEFINE(xpmem, &uct_xpmem_md_ops, uct_xpmem_rkey_unpack,