Skip to content

Commit

Permalink
Use much faster getcpu() via vDSO
Browse files Browse the repository at this point in the history
vDSO (virtual dynamic shared object) is exported by Linux kernel into
every userspace program, designed to speed up this process for certain
system calls. For Linux/x86_64, getcpu() can be called via vDSO, which
makes getcpu() much faster. The faster getcpu() invocation is beneficial
when retrieving NUMA node information.

Benchmarking[1] on AMD Ryzen Threadripper 2990WX 32-Core Processor:
  getcpu: syscall: 103 nsec/call
  getcpu:    vdso: 18 nsec/call

We can not use dlsym to resolve the vDSO symbol "__vdso_getcpu" directly
becase it would cause recursive malloc calls when MI_DEBUG_FULL is enabled.

[1] https://github.com/nathanlynch/vdsotest

Co-authored-by: Chin-Hao Lo <[email protected]>
Signed-off-by: Jim Huang <[email protected]>
  • Loading branch information
jserv and hankluo6 committed Jun 26, 2021
1 parent 076f815 commit 6cd59aa
Showing 1 changed file with 87 additions and 3 deletions.
90 changes: 87 additions & 3 deletions src/os.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,81 @@ void _mi_os_init() {
os_alloc_granularity = 16;
}
#else
#if defined(__linux__) && defined(__x86_64__)
#include <elf.h>
#include <sys/auxv.h>
typedef int (*getcpu_vdso_t)(unsigned*, unsigned*, void*);
static getcpu_vdso_t mi_os_getcpu_vdso = NULL;
static struct vdso_info {
uintptr_t load_addr, load_offset;
Elf64_Sym *symtab; // ELF symbol table
const char *symstrings;
void *bucket, *chain;
Elf64_Word nbucket;
} vdso_info;
static void mi_os_vdso_init(void) {
unsigned long base = getauxval(AT_SYSINFO_EHDR);
if (!base)
return;
bool found_vaddr = false;
vdso_info.load_addr = base;
Elf64_Ehdr* hdr = (Elf64_Ehdr*)base;
if (hdr->e_ident[EI_CLASS] != ELFCLASS64)
return;
Elf64_Phdr* pt = (Elf64_Phdr*)(vdso_info.load_addr + hdr->e_phoff);
Elf64_Dyn* dyn = 0;
// we need to load offset and the dynamic table from the segment table
for (size_t i = 0; i < hdr->e_phnum; i++) {
if (pt[i].p_type == PT_LOAD && !found_vaddr) {
found_vaddr = true;
vdso_info.load_offset = base + (uintptr_t)pt[i].p_offset - (uintptr_t)pt[i].p_vaddr;
} else if (pt[i].p_type == PT_DYNAMIC) {
dyn = (Elf64_Dyn*)(base + pt[i].p_offset);
}
}
if (!found_vaddr || !dyn)
return;
Elf64_Word* hash = 0;
vdso_info.symstrings = NULL;
vdso_info.symtab = NULL;
for (size_t i = 0; dyn[i].d_tag != DT_NULL; i++) {
switch (dyn[i].d_tag) {
case DT_STRTAB:
vdso_info.symstrings = (const char*)((uintptr_t)dyn[i].d_un.d_ptr + vdso_info.load_offset);
break;
case DT_SYMTAB:
vdso_info.symtab = (Elf64_Sym*)((uintptr_t)dyn[i].d_un.d_ptr + vdso_info.load_offset);
break;
case DT_HASH:
hash = (Elf64_Word*)((uintptr_t)dyn[i].d_un.d_ptr + vdso_info.load_offset);
break;
}
}
if (!vdso_info.symstrings || !vdso_info.symtab || !hash)
return;
vdso_info.nbucket = hash[0];
vdso_info.bucket = &hash[2];
vdso_info.chain = &hash[vdso_info.nbucket + 2];
}
static void* mi_os_vdso_get_sym(void) {
const char *name = "__vdso_getcpu";
Elf64_Word chain = ((Elf64_Word*)vdso_info.bucket)[11538501 % vdso_info.nbucket];
for (; chain != STN_UNDEF; chain = ((Elf64_Word*)vdso_info.chain)[chain]) {
Elf64_Sym* sym = &vdso_info.symtab[chain];
// Check for a defined global or weak function with right name
if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
continue;
if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && ELF64_ST_BIND(sym->st_info) != STB_WEAK)
continue;
if (sym->st_shndx == SHN_UNDEF)
continue;
if (strcmp(name, vdso_info.symstrings + sym->st_name))
continue;
return (void*)(vdso_info.load_offset + sym->st_value);
}
return 0;
}
#endif
void _mi_os_init() {
// get the page size
long result = sysconf(_SC_PAGESIZE);
Expand All @@ -222,6 +297,11 @@ void _mi_os_init() {
os_alloc_granularity = os_page_size;
}
large_os_page_size = 2*MiB; // TODO: can we query the OS for this?
#if defined(__linux__) && defined(__x86_64__)
// set up symbols exported by vDSO (virtual dynamic shared object)
mi_os_vdso_init();
mi_os_getcpu_vdso = (getcpu_vdso_t)mi_os_vdso_get_sym();
#endif
}
#endif

Expand Down Expand Up @@ -1173,9 +1253,13 @@ static size_t mi_os_numa_node_countx(void) {
#include <stdio.h> // access

static size_t mi_os_numa_nodex(void) {
#ifdef SYS_getcpu
unsigned long node = 0;
unsigned long ncpu = 0;
#if defined(SYS_getcpu)
unsigned int node = 0, ncpu = 0;
#if defined(__x86_64__)
if (mi_likely(mi_os_getcpu_vdso != NULL)) {
return (mi_os_getcpu_vdso(&ncpu, &node, NULL) != -1) ? node : 0;
}
#endif
long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
if (err != 0) return 0;
return node;
Expand Down

0 comments on commit 6cd59aa

Please sign in to comment.