diff --git a/configure.ac b/configure.ac
index c615cab2..56ac2c6c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1434,6 +1434,22 @@ if test "x$enable_experimental_smallocx" = "x1" ; then
 fi
 AC_SUBST([enable_experimental_smallocx])
 
+dnl Do not enable fastpath prefetch by default.
+AC_ARG_ENABLE([experimental_fp_prefetch],
+  [AS_HELP_STRING([--enable-experimental-fp-prefetch], [Enable experimental fastpath prefetch])],
+[if test "x$enable_experimental_fp_prefetch" = "xno" ; then
+enable_experimental_fp_prefetch="0"
+else
+enable_experimental_fp_prefetch="1"
+fi
+],
+[enable_experimental_fp_prefetch="0"]
+)
+if test "x$enable_experimental_fp_prefetch" = "x1" ; then
+  AC_DEFINE([JEMALLOC_EXPERIMENTAL_FASTPATH_PREFETCH], [ ], [ ])
+fi
+AC_SUBST([enable_experimental_fp_prefetch])
+
 dnl Do not enable profiling by default.
 AC_ARG_ENABLE([prof],
   [AS_HELP_STRING([--enable-prof], [Enable allocation profiling])],
diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h
index 7ab48dc9..e9e93b07 100644
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@@ -376,6 +376,38 @@ cache_bin_low_water_adjust(cache_bin_t *bin) {
 	}
 }
 
+#ifdef JEMALLOC_JET
+typedef void (*test_prefetch_hook_t)(void *ptr, bool is_write);
+test_prefetch_hook_t
+cache_bin_prefetch_hook_set(test_prefetch_hook_t);
+extern test_prefetch_hook_t cache_bin_prefetch_test_hook;
+#endif
+
+#ifdef JEMALLOC_EXPERIMENTAL_FASTPATH_PREFETCH
+/*
+ * We pad each non-disabled bin with a slot so that we can safely prefetch the
+ * next pointer after the one returned on the fast path.
+ */
+static inline void
+prefetch_one_w(void *ptr) {
+#ifdef JEMALLOC_JET
+	if (cache_bin_prefetch_test_hook) {
+		cache_bin_prefetch_test_hook(ptr, /* write */ true);
+	} else {
+		/* Still want to exercise the code in tests without the hook */
+		util_prefetch_write(ptr);
+	}
+#else
+	util_prefetch_write(ptr);
+#endif /* JEMALLOC_JET */
+}
+
+#else
+
+static inline void prefetch_one_w(void *ptr) {}
+
+#endif /* JEMALLOC_EXPERIMENTAL_FASTPATH_PREFETCH */
+
 JEMALLOC_ALWAYS_INLINE void *
 cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) {
 	/*
@@ -400,6 +432,7 @@ cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) {
 	 */
 	if (likely(low_bits != bin->low_bits_low_water)) {
 		bin->stack_head = new_head;
+		prefetch_one_w(*new_head);
 		*success = true;
 		return ret;
 	}
@@ -414,6 +447,7 @@ cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) {
 	 */
 	if (likely(low_bits != bin->low_bits_empty)) {
 		bin->stack_head = new_head;
+		prefetch_one_w(*new_head);
 		bin->low_bits_low_water = (cache_bin_sz_t)(uintptr_t)new_head;
 		*success = true;
 		return ret;
diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in
index 6d557959..8dc3ad47 100644
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -160,6 +160,11 @@
 /* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */
 #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API
 
+/* JEMALLOC_EXPERIMENTAL_FASTPATH_PREFETCH enables prefetch
+ * on malloc fast path.
+ */
+#undef JEMALLOC_EXPERIMENTAL_FASTPATH_PREFETCH
+
 /* JEMALLOC_PROF enables allocation profiling. */
 #undef JEMALLOC_PROF
 
diff --git a/src/cache_bin.c b/src/cache_bin.c
index 2f5afeb9..92561ad9 100644
--- a/src/cache_bin.c
+++ b/src/cache_bin.c
@@ -36,10 +36,18 @@ cache_bin_info_compute_alloc(const cache_bin_info_t *infos, szind_t ninfos,
 	 * checking "is_empty"; and
 	 * 2) the cur_ptr can go beyond the empty position by 1 step safely on
 	 * the fast path (i.e. no overflow).
+	 *
+	 * For each non-disabled cache_bin reserve extra slot to allow prefetch
+	 * without checking the boundary on the fast path
 	 */
 	*size = sizeof(void *) * 2;
 	for (szind_t i = 0; i < ninfos; i++) {
 		*size += infos[i].ncached_max * sizeof(void *);
+#ifdef JEMALLOC_EXPERIMENTAL_FASTPATH_PREFETCH
+		if (infos[i].ncached_max > 0) {
+			*size += sizeof(void *);
+		}
+#endif
 	}
 
 	/*
@@ -100,6 +108,12 @@ cache_bin_init(cache_bin_t *bin, const cache_bin_info_t *info, void *alloc,
 	    bin->low_bits_full, (cache_bin_sz_t)(uintptr_t)bin->stack_head);
 	assert(free_spots == bin_stack_size);
 	if (!cache_bin_disabled(bin)) {
+#ifdef JEMALLOC_EXPERIMENTAL_FASTPATH_PREFETCH
+		/* Address will be mapped to physical page already */
+		void **addr = (void **)((byte_t *)alloc + *cur_offset);
+		*addr = addr;
+		*cur_offset += sizeof(void *);
+#endif
 		assert(cache_bin_ncached_get_local(bin) == 0);
 	}
 	assert(cache_bin_empty_position_get(bin) == empty_position);
@@ -117,3 +131,13 @@ cache_bin_init_disabled(cache_bin_t *bin, cache_bin_sz_t ncached_max) {
 	cache_bin_info_init(&bin->bin_info, ncached_max);
 	assert(fake_offset == 0);
 }
+
+#ifdef JEMALLOC_JET
+test_prefetch_hook_t cache_bin_prefetch_test_hook = NULL;
+test_prefetch_hook_t
+cache_bin_prefetch_hook_set(test_prefetch_hook_t f) {
+	test_prefetch_hook_t old = cache_bin_prefetch_test_hook;
+	cache_bin_prefetch_test_hook = f;
+	return old;
+}
+#endif
diff --git a/test/unit/cache_bin.c b/test/unit/cache_bin.c
index 1bb750d7..530b33aa 100644
--- a/test/unit/cache_bin.c
+++ b/test/unit/cache_bin.c
@@ -1,5 +1,11 @@
 #include "test/jemalloc_test.h"
 
+#ifdef JEMALLOC_EXPERIMENTAL_FASTPATH_PREFETCH
+static bool experimental_fast_prefetch_enabled = true;
+#else
+static bool experimental_fast_prefetch_enabled = false;
+#endif
+
 static void
 do_fill_test(cache_bin_t *bin, void **ptrs, cache_bin_sz_t ncached_max,
     cache_bin_sz_t nfill_attempt, cache_bin_sz_t nfill_succeed) {
@@ -379,8 +385,117 @@ TEST_BEGIN(test_cache_bin_stash) {
 }
 TEST_END
 
+typedef struct {
+	void *ptr;
+	bool is_write;
+} prefetch_arg_t;
+
+#define PREFETCH_SZ 256
+static prefetch_arg_t prefetch_calls[PREFETCH_SZ];
+static unsigned nprefetch_calls;
+
+static void
+prefetch_hook(void *p, bool is_write) {
+	prefetch_calls[nprefetch_calls].ptr = p;
+	prefetch_calls[nprefetch_calls].is_write = is_write;
+	++nprefetch_calls;
+}
+
+static void
+reset_prefetch_calls(void) {
+	nprefetch_calls = 0;
+	cache_bin_prefetch_hook_set(prefetch_hook);
+}
+
+static void**
+do_dallocs_allocs(cache_bin_t *bin, int ncached_max) {
+	bool success = false;
+	assert(ncached_max < PREFETCH_SZ);
+	/*
+	 * We allocate fully, so we can test
+	 * prefetch at the end of the cache bin.
+	 */
+	void **ptrs = mallocx(sizeof(void *) * (ncached_max + 1), 0);
+	assert_ptr_not_null(ptrs, "Unexpected mallocx failure");
+	for (cache_bin_sz_t i = 0; i < ncached_max; i++) {
+		expect_true(cache_bin_ncached_get_local(bin) == i, "");
+		success = cache_bin_dalloc_easy(bin, &ptrs[i]);
+	}
+	expect_true(cache_bin_ncached_get_local(bin) == ncached_max,
+	    "");
+
+	reset_prefetch_calls();
+	for (cache_bin_sz_t i = 0; i < ncached_max; i++) {
+		void *ptr = cache_bin_alloc_easy(bin, &success);
+		expect_true(success, "");
+		expect_ptr_eq(ptr, &ptrs[ncached_max - i - 1], "");
+	}
+	return ptrs;
+}
+
+TEST_BEGIN(test_cache_bin_alloc_easy_prefetch_disabled) {
+	test_skip_if(experimental_fast_prefetch_enabled);
+
+	const int ncached_max = 10;
+	cache_bin_info_t info;
+	cache_bin_info_init(&info, ncached_max);
+	cache_bin_t bin;
+	test_bin_init(&bin, &info);
+
+	/* Initialize to empty; should then have 0 elements. */
+	expect_d_eq(ncached_max, cache_bin_ncached_max_get(&bin), "");
+	expect_true(cache_bin_ncached_get_local(&bin) == 0, "");
+
+	void **ptrs = do_dallocs_allocs(&bin, ncached_max);
+	/* Check prefetch calls */
+	expect_zu_eq(nprefetch_calls, 0, "No calls when prefetch disabled");
+
+	free(ptrs);
+	cache_bin_prefetch_hook_set(NULL);
+}
+TEST_END
+
+TEST_BEGIN(test_cache_bin_alloc_easy_prefetch_enabled) {
+	test_skip_if(!experimental_fast_prefetch_enabled);
+	const int ncached_max = 10;
+
+	cache_bin_info_t info;
+	cache_bin_info_init(&info, ncached_max);
+	cache_bin_t bin;
+	test_bin_init(&bin, &info);
+
+	/* Initialize to empty; should then have 0 elements. */
+	expect_d_eq(ncached_max, cache_bin_ncached_max_get(&bin), "");
+	expect_true(cache_bin_ncached_get_local(&bin) == 0, "");
+
+	void **ptrs = do_dallocs_allocs(&bin, ncached_max);
+	/* Check prefetch calls */
+	expect_zu_eq(nprefetch_calls, ncached_max, "Not enough prefetch calls");
+	/*
+	 * Each prefetched pointer should match one ahead in original array
+	 * in the opposite order as bin's head moves backwards on allocations.
+	 */
+	for (cache_bin_sz_t i = 1; i < ncached_max; i++) {
+		expect_ptr_eq(prefetch_calls[i-1].ptr,
+		    &ptrs[ncached_max - 1 - i], "prefetch address wrong");
+	}
+
+	/* Bin is empty now. stack_head points one past the "real" slots */
+	expect_true(cache_bin_ncached_get_local(&bin) == 0, "");
+	void **expected_ptr = bin.stack_head;
+	expect_ptr_eq(prefetch_calls[ncached_max - 1].ptr, expected_ptr,
+	    "prefetch address wrong for out of boundary");
+	expect_ptr_eq(expected_ptr, *expected_ptr, "Content is the address");
+
+	free(ptrs);
+	cache_bin_prefetch_hook_set(NULL);
+}
+TEST_END
+
 int
 main(void) {
 	return test(test_cache_bin,
-	    test_cache_bin_stash);
+	    test_cache_bin_stash,
+	    test_cache_bin_alloc_easy_prefetch_disabled,
+	    test_cache_bin_alloc_easy_prefetch_enabled);
 }