From 323ed2e3a8c88c7db89b4119b10192af4303d29c Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Wed, 11 Sep 2024 15:08:24 -0700
Subject: [PATCH] Optimize fast path to allow static size class computation.

After inlining at LTO time, many callsites have a compile-time-known input
size, which means the size class index and the usable size can be computed
statically.  However, the size-to-index lookup table prevents that -- this
commit solves it by switching to the compute approach when the size is
detected to be a compile-time constant.
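
As a rough standalone sketch of the dispatch pattern (illustration only, not
part of this patch; the toy_* names below are hypothetical rather than
jemalloc internals, and a GCC/Clang-style compiler providing
__builtin_constant_p is assumed):

    #include <stdio.h>

    /* Toy size classes 8, 16, 32, ...: pure arithmetic the optimizer can
     * fold to an immediate once the argument is a known constant. */
    static inline unsigned
    toy_size2index_compute(unsigned long size) {
    	unsigned index = 0;
    	while ((8UL << index) < size) {
    		index++;
    	}
    	return index;
    }

    /* Table lookup: a load the compiler cannot fold at compile time. */
    static const unsigned char toy_index_tab[9] = {0, 0, 1, 2, 2, 3, 3, 3, 3};

    static inline unsigned
    toy_size2index(unsigned long size) {
    	if (__builtin_constant_p(size)) {
    		/* Known-constant argument: take the computable path. */
    		return toy_size2index_compute(size);
    	}
    	/* Runtime argument: fall back to the table as before. */
    	return toy_index_tab[(size + 7) >> 3];
    }

    int
    main(void) {
    	/* Constant argument: with -O2 this should fold to an immediate 3. */
    	printf("%u\n", toy_size2index(64));
    	return 0;
    }

Compiling the sketch with -O2 and inspecting the assembly should show the
constant-argument call reduced to an immediate; that is the effect this
commit enables for LTO-inlined jemalloc callsites, where the table load
previously forced a runtime memory access.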
---
 .../internal/jemalloc_internal_inlines_c.h |  5 +--
 include/jemalloc/internal/sz.h             | 33 +++++++++++++++----
 include/jemalloc/internal/util.h           | 10 ++++++
 3 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/include/jemalloc/internal/jemalloc_internal_inlines_c.h
index 6dcffac9..432ec17c 100644
--- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h
+++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h
@@ -496,6 +496,7 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 	    *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd) == 0);
 
 	emap_alloc_ctx_t alloc_ctx;
+	size_t usize;
 	if (!size_hint) {
 		bool err = emap_alloc_ctx_try_lookup_fast(tsd,
 		    &arena_emap_global, ptr, &alloc_ctx);
@@ -507,6 +508,7 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 			return false;
 		}
 		assert(alloc_ctx.szind != SC_NSIZES);
+		usize = sz_index2size(alloc_ctx.szind);
 	} else {
 		/*
 		 * Check for both sizes that are too large, and for sampled /
@@ -518,7 +520,7 @@
 		    /* check_prof */ true))) {
 			return false;
 		}
-		alloc_ctx.szind = sz_size2index_lookup(size);
+		sz_size2index_usize_fastpath(size, &alloc_ctx.szind, &usize);
 		/* Max lookup class must be small. */
 		assert(alloc_ctx.szind < SC_NBINS);
 		/* This is a dead store, except when opt size checking is on. */
@@ -534,7 +536,6 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 	uint64_t deallocated, threshold;
 	te_free_fastpath_ctx(tsd, &deallocated, &threshold);
 
-	size_t usize = sz_index2size(alloc_ctx.szind);
 	uint64_t deallocated_after = deallocated + usize;
 	/*
 	 * Check for events and tsd non-nominal (fast_threshold will be set to
diff --git a/include/jemalloc/internal/sz.h b/include/jemalloc/internal/sz.h
index 955d8ec0..a2d2debc 100644
--- a/include/jemalloc/internal/sz.h
+++ b/include/jemalloc/internal/sz.h
@@ -152,8 +152,8 @@ sz_psz2u(size_t psz) {
 	return usize;
 }
 
-static inline szind_t
-sz_size2index_compute(size_t size) {
+JEMALLOC_ALWAYS_INLINE szind_t
+sz_size2index_compute_inline(size_t size) {
 	if (unlikely(size > SC_LARGE_MAXCLASS)) {
 		return SC_NSIZES;
 	}
@@ -186,6 +186,11 @@
 	}
 }
 
+static inline szind_t
+sz_size2index_compute(size_t size) {
+	return sz_size2index_compute_inline(size);
+}
+
 JEMALLOC_ALWAYS_INLINE szind_t
 sz_size2index_lookup_impl(size_t size) {
 	assert(size <= SC_LOOKUP_MAXCLASS);
@@ -208,8 +213,8 @@ sz_size2index(size_t size) {
 	return sz_size2index_compute(size);
 }
-static inline size_t
-sz_index2size_compute(szind_t index) {
+JEMALLOC_ALWAYS_INLINE size_t
+sz_index2size_compute_inline(szind_t index) {
 #if (SC_NTINY > 0)
 	if (index < SC_NTINY) {
 		return (ZU(1) << (SC_LG_TINY_MAXCLASS - SC_NTINY + 1 + index));
 	}
@@ -234,6 +239,11 @@
 	}
 }
+static inline size_t
+sz_index2size_compute(szind_t index) {
+	return sz_index2size_compute_inline(index);
+}
+
 JEMALLOC_ALWAYS_INLINE size_t
 sz_index2size_lookup_impl(szind_t index) {
 	return sz_index2size_tab[index];
 }
@@ -254,8 +264,19 @@
 
 JEMALLOC_ALWAYS_INLINE void
 sz_size2index_usize_fastpath(size_t size, szind_t *ind, size_t *usize) {
-	*ind = sz_size2index_lookup_impl(size);
-	*usize = sz_index2size_lookup_impl(*ind);
+	if (util_compile_time_const(size)) {
+		/*
+		 * When inlined, the size may become known at compile
+		 * time, which allows static computation through LTO.
+		 */
+		*ind = sz_size2index_compute_inline(size);
+		assert(*ind == sz_size2index_lookup_impl(size));
+		*usize = sz_index2size_compute_inline(*ind);
+		assert(*usize == sz_index2size_lookup_impl(*ind));
+	} else {
+		*ind = sz_size2index_lookup_impl(size);
+		*usize = sz_index2size_lookup_impl(*ind);
+	}
 }
 
 JEMALLOC_ALWAYS_INLINE size_t
diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h
index f4035095..24f23629 100644
--- a/include/jemalloc/internal/util.h
+++ b/include/jemalloc/internal/util.h
@@ -79,6 +79,16 @@ get_errno(void) {
 } while(0)
 #endif
 
+/* Allows compiler constant folding on inlined paths. */
+#if defined(__has_builtin)
+#  if __has_builtin(__builtin_constant_p)
+#    define util_compile_time_const(x) __builtin_constant_p(x)
+#  endif
+#endif
+#ifndef util_compile_time_const
+#  define util_compile_time_const(x) (false)
+#endif
+
 /* ptr should be valid. */
 JEMALLOC_ALWAYS_INLINE void
 util_prefetch_read(void *ptr) {
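
Note (illustration only, not part of the patch): the new debug asserts in
sz_size2index_usize_fastpath cross-check the compute path against the lookup
path.  A self-contained sketch of that style of equivalence check, reusing
the hypothetical toy_* helpers from the sketch above:

    #include <assert.h>
    #include <stdio.h>

    /* Same hypothetical helpers as in the sketch above. */
    static inline unsigned
    toy_size2index_compute(unsigned long size) {
    	unsigned index = 0;
    	while ((8UL << index) < size) {
    		index++;
    	}
    	return index;
    }

    static const unsigned char toy_index_tab[9] = {0, 0, 1, 2, 2, 3, 3, 3, 3};

    int
    main(void) {
    	/*
    	 * Mirror the patch's debug asserts: the computed index must agree
    	 * with the table lookup for every size the table covers.
    	 */
    	for (unsigned long size = 1; size <= 64; size++) {
    		assert(toy_size2index_compute(size) ==
    		    toy_index_tab[(size + 7) >> 3]);
    	}
    	printf("compute and lookup agree for sizes 1..64\n");
    	return 0;
    }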