diff --git a/Makefile.in b/Makefile.in index ef6e1764..94208f37 100644 --- a/Makefile.in +++ b/Makefile.in @@ -206,6 +206,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/base.c \ $(srcroot)test/unit/batch_alloc.c \ $(srcroot)test/unit/batcher.c \ + $(srcroot)test/unit/bin_batching.c \ $(srcroot)test/unit/binshard.c \ $(srcroot)test/unit/bitmap.c \ $(srcroot)test/unit/bit_util.c \ diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index 18a72e7c..7f5f6bb0 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -563,10 +563,11 @@ arena_dalloc_bin_locked_begin(arena_dalloc_bin_locked_info_t *info, * stats updates, which happen during finish (this lets running counts get left * in a register). */ -JEMALLOC_ALWAYS_INLINE bool +JEMALLOC_ALWAYS_INLINE void arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin, arena_dalloc_bin_locked_info_t *info, szind_t binind, edata_t *slab, - void *ptr) { + void *ptr, edata_t **dalloc_slabs, unsigned ndalloc_slabs, + unsigned *dalloc_slabs_count, edata_list_active_t *dalloc_slabs_extra) { const bin_info_t *bin_info = &bin_infos[binind]; size_t regind = arena_slab_regind(info, binind, slab, ptr); slab_data_t *slab_data = edata_slab_data_get(slab); @@ -586,12 +587,17 @@ arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin, if (nfree == bin_info->nregs) { arena_dalloc_bin_locked_handle_newly_empty(tsdn, arena, slab, bin); - return true; + + if (*dalloc_slabs_count < ndalloc_slabs) { + dalloc_slabs[*dalloc_slabs_count] = slab; + (*dalloc_slabs_count)++; + } else { + edata_list_active_append(dalloc_slabs_extra, slab); + } } else if (nfree == 1 && slab != bin->slabcur) { arena_dalloc_bin_locked_handle_newly_nonempty(tsdn, arena, slab, bin); } - return false; } JEMALLOC_ALWAYS_INLINE void @@ -604,11 +610,129 @@ arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin, } } +JEMALLOC_ALWAYS_INLINE void +arena_bin_flush_batch_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + arena_dalloc_bin_locked_info_t *dalloc_bin_info, unsigned binind, + edata_t **dalloc_slabs, unsigned ndalloc_slabs, unsigned *dalloc_count, + edata_list_active_t *dalloc_slabs_extra) { + assert(binind < bin_info_nbatched_sizes); + bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin; + size_t nelems_to_pop = batcher_pop_begin(tsdn, + &batched_bin->remote_frees); + + bin_batching_test_mid_pop(nelems_to_pop); + if (nelems_to_pop == BATCHER_NO_IDX) { + malloc_mutex_assert_not_owner(tsdn, + &batched_bin->remote_frees.mtx); + return; + } else { + malloc_mutex_assert_owner(tsdn, + &batched_bin->remote_frees.mtx); + } + + bin_remote_free_data_t remote_free_data[BIN_REMOTE_FREE_ELEMS_MAX]; + for (size_t i = 0; i < nelems_to_pop; i++) { + remote_free_data[i] = batched_bin->remote_free_data[i]; + } + batcher_pop_end(tsdn, &batched_bin->remote_frees); + + for (size_t i = 0; i < nelems_to_pop; i++) { + arena_dalloc_bin_locked_step(tsdn, arena, bin, dalloc_bin_info, + binind, remote_free_data[i].slab, remote_free_data[i].ptr, + dalloc_slabs, ndalloc_slabs, dalloc_count, + dalloc_slabs_extra); + } +} + +typedef struct arena_bin_flush_batch_state_s arena_bin_flush_batch_state_t; +struct arena_bin_flush_batch_state_s { + arena_dalloc_bin_locked_info_t info; + + /* + * Bin batching is subtle in that there are unusual edge cases in which + * it can trigger the deallocation of more slabs than there were items + * flushed (say, if every original deallocation triggered a 
slab
+	 * deallocation, and so did every batched one).  So we keep a small
+	 * backup array for any "extra" slabs, as well as a list to hold a
+	 * dynamic number of slabs beyond that array.
+	 */
+	edata_t *dalloc_slabs[8];
+	unsigned dalloc_slab_count;
+	edata_list_active_t dalloc_slabs_extra;
+};
+
+JEMALLOC_ALWAYS_INLINE unsigned
+arena_bin_batch_get_ndalloc_slabs(unsigned preallocated_slabs) {
+	if (preallocated_slabs > bin_batching_test_ndalloc_slabs_max) {
+		return bin_batching_test_ndalloc_slabs_max;
+	}
+	return preallocated_slabs;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_bin_flush_batch_after_lock(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    unsigned binind, arena_bin_flush_batch_state_t *state) {
+	if (binind >= bin_info_nbatched_sizes) {
+		return;
+	}
+
+	arena_dalloc_bin_locked_begin(&state->info, binind);
+	state->dalloc_slab_count = 0;
+	edata_list_active_init(&state->dalloc_slabs_extra);
+
+	unsigned preallocated_slabs = (unsigned)(sizeof(state->dalloc_slabs)
+	    / sizeof(state->dalloc_slabs[0]));
+	unsigned ndalloc_slabs = arena_bin_batch_get_ndalloc_slabs(
+	    preallocated_slabs);
+
+	arena_bin_flush_batch_impl(tsdn, arena, bin, &state->info, binind,
+	    state->dalloc_slabs, ndalloc_slabs,
+	    &state->dalloc_slab_count, &state->dalloc_slabs_extra);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_bin_flush_batch_before_unlock(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    unsigned binind, arena_bin_flush_batch_state_t *state) {
+	if (binind >= bin_info_nbatched_sizes) {
+		return;
+	}
+
+	arena_dalloc_bin_locked_finish(tsdn, arena, bin, &state->info);
+}
+
 static inline bool
 arena_bin_has_batch(szind_t binind) {
 	return binind < bin_info_nbatched_sizes;
 }
+
+JEMALLOC_ALWAYS_INLINE void
+arena_bin_flush_batch_after_unlock(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    unsigned binind, arena_bin_flush_batch_state_t *state) {
+	if (!arena_bin_has_batch(binind)) {
+		return;
+	}
+	/*
+	 * The initialization of dalloc_slabs_extra is guarded by an
+	 * arena_bin_has_batch check higher up the stack.  But the clang
+	 * analyzer forgets this down the stack, triggering a spurious error
+	 * reported here.
+ */ + JEMALLOC_CLANG_ANALYZER_SUPPRESS { + bin_batching_test_after_unlock(state->dalloc_slab_count, + edata_list_active_empty(&state->dalloc_slabs_extra)); + } + for (unsigned i = 0; i < state->dalloc_slab_count; i++) { + edata_t *slab = state->dalloc_slabs[i]; + arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab); + } + while (!edata_list_active_empty(&state->dalloc_slabs_extra)) { + edata_t *slab = edata_list_active_first( + &state->dalloc_slabs_extra); + edata_list_active_remove(&state->dalloc_slabs_extra, slab); + arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab); + } +} + static inline bin_t * arena_get_bin(arena_t *arena, szind_t binind, unsigned binshard) { bin_t *shard0 = (bin_t *)((byte_t *)arena + arena_bin_offsets[binind]); diff --git a/include/jemalloc/internal/bin.h b/include/jemalloc/internal/bin.h index 36fce04f..5b776c17 100644 --- a/include/jemalloc/internal/bin.h +++ b/include/jemalloc/internal/bin.h @@ -11,6 +11,51 @@ #define BIN_REMOTE_FREE_ELEMS_MAX 16 +#ifdef JEMALLOC_JET +extern void (*bin_batching_test_after_push_hook)(size_t idx); +extern void (*bin_batching_test_mid_pop_hook)(size_t elems_to_pop); +extern void (*bin_batching_test_after_unlock_hook)(unsigned slab_dalloc_count, + bool list_empty); +#endif + +#ifdef JEMALLOC_JET +extern unsigned bin_batching_test_ndalloc_slabs_max; +#else +static const unsigned bin_batching_test_ndalloc_slabs_max = (unsigned)-1; +#endif + +JEMALLOC_ALWAYS_INLINE void +bin_batching_test_after_push(size_t idx) { + (void)idx; +#ifdef JEMALLOC_JET + if (bin_batching_test_after_push_hook != NULL) { + bin_batching_test_after_push_hook(idx); + } +#endif +} + +JEMALLOC_ALWAYS_INLINE void +bin_batching_test_mid_pop(size_t elems_to_pop) { + (void)elems_to_pop; +#ifdef JEMALLOC_JET + if (bin_batching_test_mid_pop_hook != NULL) { + bin_batching_test_mid_pop_hook(elems_to_pop); + } +#endif +} + +JEMALLOC_ALWAYS_INLINE void +bin_batching_test_after_unlock(unsigned slab_dalloc_count, bool list_empty) { + (void)slab_dalloc_count; + (void)list_empty; +#ifdef JEMALLOC_JET + if (bin_batching_test_after_unlock_hook != NULL) { + bin_batching_test_after_unlock_hook(slab_dalloc_count, + list_empty); + } +#endif +} + /* * A bin contains a set of extents that are currently being used for slab * allocations. @@ -70,7 +115,7 @@ bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size, size_t end_size, size_t nshards); /* Initializes a bin to empty. Returns true on error. */ -bool bin_init(bin_t *bin); +bool bin_init(bin_t *bin, unsigned binind); /* Forking. */ void bin_prefork(tsdn_t *tsdn, bin_t *bin, bool has_batch); diff --git a/include/jemalloc/internal/bin_info.h b/include/jemalloc/internal/bin_info.h index f743b7d8..88d58c91 100644 --- a/include/jemalloc/internal/bin_info.h +++ b/include/jemalloc/internal/bin_info.h @@ -48,6 +48,8 @@ struct bin_info_s { extern size_t opt_bin_info_max_batched_size; /* The number of batches per batched size class. 
 */
 extern size_t opt_bin_info_remote_free_max_batch;
+/* The max number of pending elems (across all batches). */
+extern size_t opt_bin_info_remote_free_max;
 
 extern szind_t bin_info_nbatched_sizes;
 extern unsigned bin_info_nbatched_bins;
diff --git a/src/arena.c b/src/arena.c
index 71ef26f5..21010279 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -661,10 +661,17 @@ arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, edata_t *slab) {
 }
 
 static void
-arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin) {
+arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin, unsigned binind) {
 	edata_t *slab;
 
 	malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+
+	if (arena_bin_has_batch(binind)) {
+		bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin;
+		batcher_init(&batched_bin->remote_frees,
+		    BIN_REMOTE_FREE_ELEMS_MAX);
+	}
+
 	if (bin->slabcur != NULL) {
 		slab = bin->slabcur;
 		bin->slabcur = NULL;
@@ -815,7 +822,8 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
 	/* Bins. */
 	for (unsigned i = 0; i < SC_NBINS; i++) {
 		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
-			arena_bin_reset(tsd, arena, arena_get_bin(arena, i, j));
+			arena_bin_reset(tsd, arena, arena_get_bin(arena, i, j),
+			    i);
 		}
 	}
 	pa_shard_reset(tsd_tsdn(tsd), &arena->pa_shard);
@@ -1080,8 +1088,18 @@ arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena,
 	unsigned binshard;
 	bin_t *bin = arena_bin_choose(tsdn, arena, binind, &binshard);
 
+	/*
+	 * This has some fields that are conditionally initialized down batch
+	 * flush pathways.  This can trigger static analysis warnings deeper
+	 * down in the stack.  The accesses are guarded by the same checks as
+	 * the initialization, but the analysis isn't able to track that across
+	 * multiple stack frames.
+	 */
+	arena_bin_flush_batch_state_t batch_flush_state
+	    JEMALLOC_CLANG_ANALYZER_SILENCE_INIT({0});
 label_refill:
 	malloc_mutex_lock(tsdn, &bin->lock);
+	arena_bin_flush_batch_after_lock(tsdn, arena, bin, binind, &batch_flush_state);
 	while (filled < nfill) {
 		/* Try batch-fill from slabcur first.
*/ @@ -1136,7 +1154,11 @@ label_refill: cache_bin->tstats.nrequests = 0; } + arena_bin_flush_batch_before_unlock(tsdn, arena, bin, binind, + &batch_flush_state); malloc_mutex_unlock(tsdn, &bin->lock); + arena_bin_flush_batch_after_unlock(tsdn, arena, bin, binind, + &batch_flush_state); if (alloc_and_retry) { assert(fresh_slab == NULL); @@ -1427,12 +1449,16 @@ arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) { malloc_mutex_lock(tsdn, &bin->lock); arena_dalloc_bin_locked_info_t info; arena_dalloc_bin_locked_begin(&info, binind); - bool ret = arena_dalloc_bin_locked_step(tsdn, arena, bin, - &info, binind, edata, ptr); + edata_t *dalloc_slabs[1]; + unsigned dalloc_slabs_count = 0; + arena_dalloc_bin_locked_step(tsdn, arena, bin, &info, binind, edata, + ptr, dalloc_slabs, /* ndalloc_slabs */ 1, &dalloc_slabs_count, + /* dalloc_slabs_extra */ NULL); arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info); malloc_mutex_unlock(tsdn, &bin->lock); - if (ret) { + if (dalloc_slabs_count != 0) { + assert(dalloc_slabs[0] == edata); arena_slab_dalloc(tsdn, arena, edata); } } @@ -1731,7 +1757,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) { for (unsigned i = 0; i < SC_NBINS; i++) { for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { bin_t *bin = arena_get_bin(arena, i, j); - bool err = bin_init(bin); + bool err = bin_init(bin, i); if (err) { goto label_error; } diff --git a/src/bin.c b/src/bin.c index b9b4be2c..267aa0f3 100644 --- a/src/bin.c +++ b/src/bin.c @@ -6,6 +6,14 @@ #include "jemalloc/internal/sc.h" #include "jemalloc/internal/witness.h" +#ifdef JEMALLOC_JET +unsigned bin_batching_test_ndalloc_slabs_max = (unsigned)-1; +void (*bin_batching_test_after_push_hook)(size_t push_idx); +void (*bin_batching_test_mid_pop_hook)(size_t nelems_to_pop); +void (*bin_batching_test_after_unlock_hook)(unsigned slab_dalloc_count, + bool list_empty); +#endif + bool bin_update_shard_size(unsigned bin_shard_sizes[SC_NBINS], size_t start_size, size_t end_size, size_t nshards) { @@ -39,7 +47,7 @@ bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]) { } bool -bin_init(bin_t *bin) { +bin_init(bin_t *bin, unsigned binind) { if (malloc_mutex_init(&bin->lock, "bin", WITNESS_RANK_BIN, malloc_mutex_rank_exclusive)) { return true; @@ -50,6 +58,11 @@ bin_init(bin_t *bin) { if (config_stats) { memset(&bin->stats, 0, sizeof(bin_stats_t)); } + if (arena_bin_has_batch(binind)) { + bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin; + batcher_init(&batched_bin->remote_frees, + opt_bin_info_remote_free_max); + } return false; } @@ -57,8 +70,23 @@ void bin_prefork(tsdn_t *tsdn, bin_t *bin, bool has_batch) { malloc_mutex_prefork(tsdn, &bin->lock); if (has_batch) { + /* + * The batch mutex has lower rank than the bin mutex (as it must + * -- it's acquired later). But during forking, we go + * bin-at-a-time, so that we acquire mutex on bin 0, then on + * the bin 0 batcher, then on bin 1. This is a safe ordering + * (it's ordered by the index of arenas and bins within those + * arenas), but will trigger witness errors that would + * otherwise force another level of arena forking that breaks + * bin encapsulation (because the witness API doesn't "know" + * about arena or bin ordering -- it just sees that the batcher + * has a lower rank than the bin). So instead we exclude the + * batcher mutex from witness checking during fork (which is + * the only time we touch multiple bins at once) by passing + * TSDN_NULL. 
+ */ bin_with_batch_t *batched = (bin_with_batch_t *)bin; - batcher_prefork(tsdn, &batched->remote_frees); + batcher_prefork(TSDN_NULL, &batched->remote_frees); } } @@ -67,7 +95,7 @@ bin_postfork_parent(tsdn_t *tsdn, bin_t *bin, bool has_batch) { malloc_mutex_postfork_parent(tsdn, &bin->lock); if (has_batch) { bin_with_batch_t *batched = (bin_with_batch_t *)bin; - batcher_postfork_parent(tsdn, &batched->remote_frees); + batcher_postfork_parent(TSDN_NULL, &batched->remote_frees); } } @@ -76,6 +104,6 @@ bin_postfork_child(tsdn_t *tsdn, bin_t *bin, bool has_batch) { malloc_mutex_postfork_child(tsdn, &bin->lock); if (has_batch) { bin_with_batch_t *batched = (bin_with_batch_t *)bin; - batcher_postfork_child(tsdn, &batched->remote_frees); + batcher_postfork_child(TSDN_NULL, &batched->remote_frees); } } diff --git a/src/bin_info.c b/src/bin_info.c index 27f0be17..f8a64ae3 100644 --- a/src/bin_info.c +++ b/src/bin_info.c @@ -3,8 +3,19 @@ #include "jemalloc/internal/bin_info.h" -size_t opt_bin_info_max_batched_size; -size_t opt_bin_info_remote_free_max_batch; +/* + * We leave bin-batching disabled by default, with other settings chosen mostly + * empirically; across the test programs I looked at they provided the most bang + * for the buck. With other default settings, these choices for bin batching + * result in them consuming far less memory (even in the worst case) than the + * tcaches themselves, the arena, etc. + * Note that we always try to pop all bins on every arena cache bin lock + * operation, so the typical memory waste is far less than this (and only on + * hot bins, which tend to be large anyways). + */ +size_t opt_bin_info_max_batched_size = 0; /* 192 is a good default. */ +size_t opt_bin_info_remote_free_max_batch = 4; +size_t opt_bin_info_remote_free_max = BIN_REMOTE_FREE_ELEMS_MAX; bin_info_t bin_infos[SC_NBINS]; diff --git a/src/ctl.c b/src/ctl.c index 1b76b792..ab40050d 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -130,6 +130,7 @@ CTL_PROTO(opt_utrace) CTL_PROTO(opt_xmalloc) CTL_PROTO(opt_experimental_infallible_new) CTL_PROTO(opt_max_batched_size) +CTL_PROTO(opt_remote_free_max) CTL_PROTO(opt_remote_free_max_batch) CTL_PROTO(opt_tcache) CTL_PROTO(opt_tcache_max) @@ -483,6 +484,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("experimental_infallible_new"), CTL(opt_experimental_infallible_new)}, {NAME("max_batched_size"), CTL(opt_max_batched_size)}, + {NAME("remote_free_max"), CTL(opt_remote_free_max)}, {NAME("remote_free_max_batch"), CTL(opt_remote_free_max_batch)}, {NAME("tcache"), CTL(opt_tcache)}, {NAME("tcache_max"), CTL(opt_tcache_max)}, @@ -2208,6 +2210,8 @@ CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool) CTL_RO_NL_CGEN(config_enable_cxx, opt_experimental_infallible_new, opt_experimental_infallible_new, bool) CTL_RO_NL_GEN(opt_max_batched_size, opt_bin_info_max_batched_size, size_t) +CTL_RO_NL_GEN(opt_remote_free_max, opt_bin_info_remote_free_max, + size_t) CTL_RO_NL_GEN(opt_remote_free_max_batch, opt_bin_info_remote_free_max_batch, size_t) CTL_RO_NL_GEN(opt_tcache, opt_tcache, bool) diff --git a/src/jemalloc.c b/src/jemalloc.c index 8f40e0cc..89f4b29d 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1334,6 +1334,11 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], BIN_REMOTE_FREE_ELEMS_MAX, CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + CONF_HANDLE_SIZE_T(opt_bin_info_remote_free_max, + "remote_free_max", 0, + BIN_REMOTE_FREE_ELEMS_MAX, + CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, + /* clip */ true) if 
(CONF_MATCH("tcache_ncached_max")) {
 				bool err = tcache_bin_info_default_init(
diff --git a/src/stats.c b/src/stats.c
index 359a252c..f057e722 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -1556,6 +1556,7 @@ stats_general_print(emitter_t *emitter) {
 	OPT_WRITE_BOOL("xmalloc")
 	OPT_WRITE_BOOL("experimental_infallible_new")
 	OPT_WRITE_SIZE_T("max_batched_size")
+	OPT_WRITE_SIZE_T("remote_free_max")
 	OPT_WRITE_SIZE_T("remote_free_max_batch")
 	OPT_WRITE_BOOL("tcache")
 	OPT_WRITE_SIZE_T("tcache_max")
diff --git a/src/tcache.c b/src/tcache.c
index 4dd5ccd6..564b5d9c 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -325,6 +325,7 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin
 	assert(binind < SC_NBINS);
 	arena_t *tcache_arena = tcache_slow->arena;
 	assert(tcache_arena != NULL);
+	unsigned tcache_binshard = tsd_binshardsp_get(tsdn_tsd(tsdn))->binshard[binind];
 
 	/*
 	 * Variable length array must have > 0 length; the last element is never
@@ -341,6 +342,18 @@
 	unsigned dalloc_count = 0;
 	VARIABLE_ARRAY(edata_t *, dalloc_slabs, nflush + 1);
 
+	/*
+	 * There's an edge case where we need to deallocate more slabs than we
+	 * have elements of dalloc_slabs.  This can happen if we end up
+	 * deallocating items batched by another thread in addition to ones
+	 * flushed from the cache.  Since this is not very likely (most small
+	 * object deallocations don't free up a whole slab), we don't want to
+	 * burn the stack space to keep those excess slabs in an array.
+	 * Instead we'll maintain an overflow list.
+	 */
+	edata_list_active_t dalloc_slabs_extra;
+	edata_list_active_init(&dalloc_slabs_extra);
+
 	/*
 	 * We're about to grab a bunch of locks.  If one of them happens to be
 	 * the one guarding the arena-level stats counters we flush our
@@ -418,40 +431,136 @@
 			}
 		}
-		/* Actually do the flushing. */
-		malloc_mutex_lock(tsdn, &cur_bin->lock);
 		/*
-		 * Flush stats first, if that was the right lock. Note that we
-		 * don't actually have to flush stats into the current thread's
-		 * binshard. Flushing into any binshard in the same arena is
-		 * enough; we don't expose stats on per-binshard basis (just
-		 * per-bin).
+		 * We never batch when flushing to our home-base bin shard,
+		 * since it's likely that we'll have to acquire that lock anyway
+		 * when flushing stats.
+		 *
+		 * A plausible check we could add to can_batch is
+		 * '&& arena_is_auto(cur_arena)'.  The motivation would be that
+		 * we have a higher tolerance for dubious user assumptions
+		 * around non-auto arenas (e.g. "if I deallocate every object I
+		 * allocated, and then call tcache.flush, then the arena stats
+		 * must reflect zero live allocations").
+		 *
+		 * This is dubious for a couple reasons:
+		 * - We already don't provide perfect fidelity for stats
+		 *   counting (e.g. for profiled allocations, whose size can
+		 *   inflate in stats).
+		 * - Hanging load-bearing guarantees around stats impedes
+		 *   scalability in general.
+		 *
+		 * There are some "complete" strategies we could do instead:
+		 * - Add an arena..quiesce call to pop all bins for users who
+		 *   do want those stats accounted for.
+		 * - Make batchability a user-controllable per-arena option.
+		 * - Do a batch pop after every mutex acquisition for which we
+		 *   want to provide accurate stats.  This gives perfectly
+		 *   accurate stats, but can cause weird performance effects
+		 *   (because doing stats collection can now result in slabs
+		 *   becoming empty, and therefore purging, large mutex
+		 *   acquisition, etc.).
+		 * - Propagate the "why" behind a flush down to the level of the
+		 *   batcher, and include a batch pop attempt down full tcache
+		 *   flushing pathways.  This is just a lot of plumbing and
+		 *   internal complexity.
+		 *
+		 * We don't do any of these right now, but the decision calculus
+		 * and tradeoffs are subtle enough that the reasoning was worth
+		 * leaving in this comment.
+		 */
+		bool bin_is_batched = arena_bin_has_batch(binind);
+		bool home_binshard = (cur_arena == tcache_arena
+		    && cur_binshard == tcache_binshard);
+		bool can_batch = (flush_start - prev_flush_start
+		    <= opt_bin_info_remote_free_max_batch)
+		    && !home_binshard && bin_is_batched;
+
+		/*
+		 * We try to avoid the batching pathway if we can, so we always
+		 * at least *try* to lock.
+		 */
+		bool locked = false;
+		bool batched = false;
+		if (can_batch) {
+			locked = !malloc_mutex_trylock(tsdn, &cur_bin->lock);
 		}
-		/* Next flush objects. */
-		/* Init only to avoid used-uninitialized warning. */
-		arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
-		arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
-		for (unsigned i = prev_flush_start; i < flush_start; i++) {
-			void *ptr = ptrs->ptr[i];
-			edata_t *edata = item_edata[i].edata;
-			if (arena_dalloc_bin_locked_step(tsdn,
-			    cur_arena, cur_bin, &dalloc_bin_info,
-			    binind, edata, ptr)) {
-				dalloc_slabs[dalloc_count] = edata;
-				dalloc_count++;
+		if (can_batch && !locked) {
+			bin_with_batch_t *batched_bin =
+			    (bin_with_batch_t *)cur_bin;
+			size_t push_idx = batcher_push_begin(tsdn,
+			    &batched_bin->remote_frees,
+			    flush_start - prev_flush_start);
+			bin_batching_test_after_push(push_idx);
+
+			if (push_idx != BATCHER_NO_IDX) {
+				batched = true;
+				unsigned nbatched
+				    = flush_start - prev_flush_start;
+				for (unsigned i = 0; i < nbatched; i++) {
+					unsigned src_ind = prev_flush_start + i;
+					batched_bin->remote_free_data[
+					    push_idx + i].ptr
+					    = ptrs->ptr[src_ind];
+					batched_bin->remote_free_data[
+					    push_idx + i].slab
+					    = item_edata[src_ind].edata;
+				}
+				batcher_push_end(tsdn,
+				    &batched_bin->remote_frees);
+			}
 		}
-		arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
-		    &dalloc_bin_info);
-		malloc_mutex_unlock(tsdn, &cur_bin->lock);
+		if (!batched) {
+			if (!locked) {
+				malloc_mutex_lock(tsdn, &cur_bin->lock);
+			}
+			/*
+			 * Flush stats first, if that was the right lock. Note
+			 * that we don't actually have to flush stats into the
+			 * current thread's binshard. Flushing into any binshard
+			 * in the same arena is enough; we don't expose stats on
+			 * per-binshard basis (just per-bin).
+			 */
+			if (config_stats && tcache_arena == cur_arena
+			    && !merged_stats) {
+				merged_stats = true;
+				cur_bin->stats.nflushes++;
+				cur_bin->stats.nrequests +=
+				    cache_bin->tstats.nrequests;
+				cache_bin->tstats.nrequests = 0;
+			}
+			unsigned preallocated_slabs = nflush;
+			unsigned ndalloc_slabs = arena_bin_batch_get_ndalloc_slabs(
+			    preallocated_slabs);
+
+			/* Next, flush our own objects. */
+			/* Init only to avoid used-uninitialized warning.
*/ + arena_dalloc_bin_locked_info_t dalloc_bin_info = {0}; + arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind); + for (unsigned i = prev_flush_start; i < flush_start; + i++) { + void *ptr = ptrs->ptr[i]; + edata_t *edata = item_edata[i].edata; + arena_dalloc_bin_locked_step(tsdn, cur_arena, + cur_bin, &dalloc_bin_info, binind, edata, + ptr, dalloc_slabs, ndalloc_slabs, + &dalloc_count, &dalloc_slabs_extra); + } + /* + * Lastly, flush any batched objects (from other + * threads). + */ + if (bin_is_batched) { + arena_bin_flush_batch_impl(tsdn, cur_arena, + cur_bin, &dalloc_bin_info, binind, + dalloc_slabs, ndalloc_slabs, + &dalloc_count, &dalloc_slabs_extra); + } + + arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin, + &dalloc_bin_info); + malloc_mutex_unlock(tsdn, &cur_bin->lock); + } arena_decay_ticks(tsdn, cur_arena, flush_start - prev_flush_start); } @@ -460,7 +569,11 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin for (unsigned i = 0; i < dalloc_count; i++) { edata_t *slab = dalloc_slabs[i]; arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab); - + } + while (!edata_list_active_empty(&dalloc_slabs_extra)) { + edata_t *slab = edata_list_active_first(&dalloc_slabs_extra); + edata_list_active_remove(&dalloc_slabs_extra, slab); + arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab); } if (config_stats && !merged_stats) { diff --git a/test/include/test/fork.h b/test/include/test/fork.h new file mode 100644 index 00000000..ac9b1858 --- /dev/null +++ b/test/include/test/fork.h @@ -0,0 +1,32 @@ +#ifndef JEMALLOC_TEST_FORK_H +#define JEMALLOC_TEST_FORK_H + +#ifndef _WIN32 + +#include + +static inline void +fork_wait_for_child_exit(int pid) { + int status; + while (true) { + if (waitpid(pid, &status, 0) == -1) { + test_fail("Unexpected waitpid() failure."); + } + if (WIFSIGNALED(status)) { + test_fail("Unexpected child termination due to " + "signal %d", WTERMSIG(status)); + break; + } + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) != 0) { + test_fail("Unexpected child exit value %d", + WEXITSTATUS(status)); + } + break; + } + } +} + +#endif + +#endif /* JEMALLOC_TEST_FORK_H */ diff --git a/test/include/test/jemalloc_test.h.in b/test/include/test/jemalloc_test.h.in index f9c506da..8b139db1 100644 --- a/test/include/test/jemalloc_test.h.in +++ b/test/include/test/jemalloc_test.h.in @@ -1,3 +1,6 @@ +#ifndef JEMALLOC_TEST_H +#define JEMALLOC_TEST_H + #ifdef __cplusplus extern "C" { #endif @@ -172,3 +175,5 @@ extern "C" { #ifdef __cplusplus } #endif + +#endif diff --git a/test/unit/bin_batching.c b/test/unit/bin_batching.c new file mode 100644 index 00000000..525f59e0 --- /dev/null +++ b/test/unit/bin_batching.c @@ -0,0 +1,264 @@ +#include "test/jemalloc_test.h" +#include "test/fork.h" + +enum { + STRESS_THREADS = 3, + STRESS_OBJECTS_PER_THREAD = 1000, + STRESS_ALLOC_SZ = PAGE / 2, +}; + +typedef struct stress_thread_data_s stress_thread_data_t; +struct stress_thread_data_s { + unsigned thd_id; + atomic_zu_t *ready_thds; + atomic_zu_t *done_thds; + void **to_dalloc; +}; + +static atomic_zu_t push_failure_count; +static atomic_zu_t pop_attempt_results[2]; +static atomic_zu_t dalloc_zero_slab_count; +static atomic_zu_t dalloc_nonzero_slab_count; +static atomic_zu_t dalloc_nonempty_list_count; + +static bool +should_skip() { + return + /* + * We do batching operations on tcache flush pathways; we can't if + * caching is disabled. + */ + !opt_tcache || + /* We rely on tcache fill/flush operations of the size we use. 
*/ + opt_tcache_max < STRESS_ALLOC_SZ + /* + * Some of the races we want to trigger are fiddly enough that they + * only show up under real concurrency. We add 1 to account for the + * main thread, which also does some work. + */ + || ncpus < STRESS_THREADS + 1; +} + +static void +increment_push_failure(size_t push_idx) { + if (push_idx == BATCHER_NO_IDX) { + atomic_fetch_add_zu(&push_failure_count, 1, ATOMIC_RELAXED); + } else { + assert_zu_lt(push_idx, 4, "Only 4 elems"); + volatile int x = 10000; + while (--x) { + /* Spin for a while, to try to provoke a failure. */ + } + } +} + +static void +increment_pop_attempt(size_t elems_to_pop) { + bool elems = (elems_to_pop != BATCHER_NO_IDX); + atomic_fetch_add_zu(&pop_attempt_results[elems], 1, ATOMIC_RELAXED); +} + +static void +increment_slab_dalloc_count(unsigned slab_dalloc_count, bool list_empty) { + if (slab_dalloc_count > 0) { + atomic_fetch_add_zu(&dalloc_nonzero_slab_count, 1, + ATOMIC_RELAXED); + } else { + atomic_fetch_add_zu(&dalloc_zero_slab_count, 1, + ATOMIC_RELAXED); + } + if (!list_empty) { + atomic_fetch_add_zu(&dalloc_nonempty_list_count, 1, + ATOMIC_RELAXED); + } +} + +static void flush_tcache() { + assert_d_eq(0, mallctl("thread.tcache.flush", NULL, NULL, NULL, 0), + "Unexpected mallctl failure"); +} + +static void * +stress_thread(void *arg) { + stress_thread_data_t *data = arg; + uint64_t prng_state = data->thd_id; + atomic_fetch_add_zu(data->ready_thds, 1, ATOMIC_RELAXED); + while (atomic_load_zu(data->ready_thds, ATOMIC_RELAXED) + != STRESS_THREADS) { + /* Spin */ + } + for (int i = 0; i < STRESS_OBJECTS_PER_THREAD; i++) { + dallocx(data->to_dalloc[i], 0); + if (prng_range_u64(&prng_state, 3) == 0) { + flush_tcache(); + } + + } + flush_tcache(); + atomic_fetch_add_zu(data->done_thds, 1, ATOMIC_RELAXED); + return NULL; +} + +/* + * Run main_thread_fn in conditions that trigger all the various edge cases and + * subtle race conditions. 
+ */ +static void +stress_run(void (*main_thread_fn)(), int nruns) { + bin_batching_test_ndalloc_slabs_max = 1; + bin_batching_test_after_push_hook = &increment_push_failure; + bin_batching_test_mid_pop_hook = &increment_pop_attempt; + bin_batching_test_after_unlock_hook = &increment_slab_dalloc_count; + + atomic_store_zu(&push_failure_count, 0, ATOMIC_RELAXED); + atomic_store_zu(&pop_attempt_results[2], 0, ATOMIC_RELAXED); + atomic_store_zu(&dalloc_zero_slab_count, 0, ATOMIC_RELAXED); + atomic_store_zu(&dalloc_nonzero_slab_count, 0, ATOMIC_RELAXED); + atomic_store_zu(&dalloc_nonempty_list_count, 0, ATOMIC_RELAXED); + + for (int run = 0; run < nruns; run++) { + thd_t thds[STRESS_THREADS]; + stress_thread_data_t thd_datas[STRESS_THREADS]; + atomic_zu_t ready_thds; + atomic_store_zu(&ready_thds, 0, ATOMIC_RELAXED); + atomic_zu_t done_thds; + atomic_store_zu(&done_thds, 0, ATOMIC_RELAXED); + + void *ptrs[STRESS_THREADS][STRESS_OBJECTS_PER_THREAD]; + for (int i = 0; i < STRESS_THREADS; i++) { + thd_datas[i].thd_id = i; + thd_datas[i].ready_thds = &ready_thds; + thd_datas[i].done_thds = &done_thds; + thd_datas[i].to_dalloc = ptrs[i]; + for (int j = 0; j < STRESS_OBJECTS_PER_THREAD; j++) { + void *ptr = mallocx(STRESS_ALLOC_SZ, 0); + assert_ptr_not_null(ptr, "alloc failure"); + ptrs[i][j] = ptr; + } + } + for (int i = 0; i < STRESS_THREADS; i++) { + thd_create(&thds[i], stress_thread, &thd_datas[i]); + } + while (atomic_load_zu(&done_thds, ATOMIC_RELAXED) + != STRESS_THREADS) { + main_thread_fn(); + } + for (int i = 0; i < STRESS_THREADS; i++) { + thd_join(thds[i], NULL); + } + } + + bin_batching_test_ndalloc_slabs_max = (unsigned)-1; + bin_batching_test_after_push_hook = NULL; + bin_batching_test_mid_pop_hook = NULL; + bin_batching_test_after_unlock_hook = NULL; +} + +static void +do_allocs_frees() { + enum {NALLOCS = 32}; + flush_tcache(); + void *ptrs[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + ptrs[i] = mallocx(STRESS_ALLOC_SZ, 0); + } + for (int i = 0; i < NALLOCS; i++) { + dallocx(ptrs[i], 0); + } + flush_tcache(); +} + +static void +test_arena_reset_main_fn() { + do_allocs_frees(); +} + +TEST_BEGIN(test_arena_reset) { + int err; + unsigned arena; + unsigned old_arena; + + test_skip_if(should_skip()); + test_skip_if(opt_percpu_arena != percpu_arena_disabled); + + size_t arena_sz = sizeof(arena); + err = mallctl("arenas.create", (void *)&arena, &arena_sz, NULL, 0); + assert_d_eq(0, err, "Arena creation failed"); + + err = mallctl("thread.arena", &old_arena, &arena_sz, &arena, arena_sz); + assert_d_eq(0, err, "changing arena failed"); + + stress_run(&test_arena_reset_main_fn, /* nruns */ 10); + + flush_tcache(); + + char buf[100]; + malloc_snprintf(buf, sizeof(buf), "arena.%u.reset", arena); + err = mallctl(buf, NULL, NULL, NULL, 0); + assert_d_eq(0, err, "Couldn't change arena"); + + do_allocs_frees(); + + err = mallctl("thread.arena", NULL, NULL, &old_arena, arena_sz); + assert_d_eq(0, err, "changing arena failed"); +} +TEST_END + +static void +test_fork_main_fn() { +#ifndef _WIN32 + pid_t pid = fork(); + if (pid == -1) { + test_fail("Fork failure!"); + } else if (pid == 0) { + /* Child */ + do_allocs_frees(); + _exit(0); + } else { + fork_wait_for_child_exit(pid); + do_allocs_frees(); + } +#endif +} + +TEST_BEGIN(test_fork) { +#ifdef _WIN32 + test_skip("No fork on windows"); +#endif + test_skip_if(should_skip()); + stress_run(&test_fork_main_fn, /* nruns */ 10); +} +TEST_END + +static void +test_races_main_fn() { + do_allocs_frees(); +} + +TEST_BEGIN(test_races) { + 
test_skip_if(should_skip()); + + stress_run(&test_races_main_fn, /* nruns */ 400); + + assert_zu_lt(0, atomic_load_zu(&push_failure_count, ATOMIC_RELAXED), + "Should have seen some push failures"); + assert_zu_lt(0, atomic_load_zu(&pop_attempt_results[0], ATOMIC_RELAXED), + "Should have seen some pop failures"); + assert_zu_lt(0, atomic_load_zu(&pop_attempt_results[1], ATOMIC_RELAXED), + "Should have seen some pop successes"); + assert_zu_lt(0, atomic_load_zu(&dalloc_zero_slab_count, ATOMIC_RELAXED), + "Expected some frees that didn't empty a slab"); + assert_zu_lt(0, atomic_load_zu(&dalloc_nonzero_slab_count, + ATOMIC_RELAXED), "expected some frees that emptied a slab"); + assert_zu_lt(0, atomic_load_zu(&dalloc_nonempty_list_count, + ATOMIC_RELAXED), "expected some frees that used the empty list"); +} +TEST_END + +int +main(void) { + return test_no_reentrancy( + test_arena_reset, + test_races, + test_fork); +} diff --git a/test/unit/bin_batching.sh b/test/unit/bin_batching.sh new file mode 100644 index 00000000..fef9bdc6 --- /dev/null +++ b/test/unit/bin_batching.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +# This value of max_batched_size effectively requires all bins to be batched; +# our page limits are fuzzy, but we bound slab item counts to 2**32, so we'd be +# at multi-gigabyte minimum page sizes. +# The reason for this sort of hacky approach is that we want to +# allocate/deallocate PAGE/2-sized objects (to trigger the "non-empty" -> +# "empty" and "non-empty"-> "full" transitions often, which have special +# handling). But the value of PAGE isn't easily available in test scripts. +export MALLOC_CONF="narenas:2,bin_shards:1-1000000000:3,max_batched_size:1000000000,remote_free_max_batch:1,remote_free_max:4" diff --git a/test/unit/fork.c b/test/unit/fork.c index 447eb191..1a4c575e 100644 --- a/test/unit/fork.c +++ b/test/unit/fork.c @@ -1,32 +1,5 @@ #include "test/jemalloc_test.h" - -#ifndef _WIN32 -#include -#endif - -#ifndef _WIN32 -static void -wait_for_child_exit(int pid) { - int status; - while (true) { - if (waitpid(pid, &status, 0) == -1) { - test_fail("Unexpected waitpid() failure."); - } - if (WIFSIGNALED(status)) { - test_fail("Unexpected child termination due to " - "signal %d", WTERMSIG(status)); - break; - } - if (WIFEXITED(status)) { - if (WEXITSTATUS(status) != 0) { - test_fail("Unexpected child exit value %d", - WEXITSTATUS(status)); - } - break; - } - } -} -#endif +#include "test/fork.h" TEST_BEGIN(test_fork) { #ifndef _WIN32 @@ -64,7 +37,7 @@ TEST_BEGIN(test_fork) { /* Child. */ _exit(0); } else { - wait_for_child_exit(pid); + fork_wait_for_child_exit(pid); } #else test_skip("fork(2) is irrelevant to Windows"); @@ -87,7 +60,7 @@ do_fork_thd(void *arg) { test_fail("Exec failed"); } else { /* Parent */ - wait_for_child_exit(pid); + fork_wait_for_child_exit(pid); } return NULL; } @@ -124,7 +97,7 @@ TEST_BEGIN(test_fork_multithreaded) { do_test_fork_multithreaded(); _exit(0); } else { - wait_for_child_exit(pid); + fork_wait_for_child_exit(pid); } } #else
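The change, in one sentence: when a flushing thread finds a bin's lock contended, it parks its {ptr, slab} pairs in a small, separately-locked "batcher" attached to that bin, and whichever thread next takes the bin lock drains them. batcher_push_begin()/batcher_push_end() (src/tcache.c) are the producer side, and arena_bin_flush_batch_impl() (arena_inlines_b.h) is the consumer side. Below is a minimal standalone sketch of that flow, using plain pthreads and malloc/free rather than jemalloc internals; every toy_*/remote_* name is invented for illustration, and the real code's extra conditions (home-binshard exclusion, remote_free_max_batch sizing, slab and stats bookkeeping) are omitted.

/*
 * bin_batching_sketch.c -- illustrative only, not part of the patch.
 * Build: cc bin_batching_sketch.c -o sketch -lpthread
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define REMOTE_BATCH_MAX 16 /* Plays the role of BIN_REMOTE_FREE_ELEMS_MAX. */

typedef struct {
	pthread_mutex_t mtx; /* The "batcher" mutex; lower lock-order rank than the bin lock. */
	size_t npending;
	void *pending[REMOTE_BATCH_MAX];
} remote_batch_t;

typedef struct {
	pthread_mutex_t lock; /* The "bin" lock. */
	remote_batch_t remote; /* Frees parked by threads that lost the trylock. */
} toy_bin_t;

/* Producer side: try to park ptr for whoever next holds the bin lock. */
static bool
remote_free_try_push(toy_bin_t *bin, void *ptr) {
	bool pushed = false;
	pthread_mutex_lock(&bin->remote.mtx);
	if (bin->remote.npending < REMOTE_BATCH_MAX) {
		bin->remote.pending[bin->remote.npending++] = ptr;
		pushed = true;
	}
	pthread_mutex_unlock(&bin->remote.mtx);
	return pushed;
}

/* Consumer side: drain parked frees; the caller holds the bin lock. */
static void
remote_free_drain(toy_bin_t *bin) {
	void *drained[REMOTE_BATCH_MAX];
	pthread_mutex_lock(&bin->remote.mtx);
	size_t n = bin->remote.npending;
	for (size_t i = 0; i < n; i++) {
		drained[i] = bin->remote.pending[i];
	}
	bin->remote.npending = 0;
	pthread_mutex_unlock(&bin->remote.mtx);
	/* Free outside the batcher mutex, as the patch does after batcher_pop_end(). */
	for (size_t i = 0; i < n; i++) {
		free(drained[i]); /* Stands in for arena_dalloc_bin_locked_step(). */
	}
}

/* Flush-side decision, loosely mirroring tcache_bin_flush_impl_small(). */
static void
toy_bin_free(toy_bin_t *bin, void *ptr) {
	bool locked = (pthread_mutex_trylock(&bin->lock) == 0);
	if (!locked && remote_free_try_push(bin, ptr)) {
		return; /* Parked without ever touching the contended bin lock. */
	}
	if (!locked) {
		/* Batch full: fall back to the pre-patch behavior and wait. */
		pthread_mutex_lock(&bin->lock);
	}
	free(ptr);
	remote_free_drain(bin); /* Opportunistically drain others' parked frees. */
	pthread_mutex_unlock(&bin->lock);
}

int
main(void) {
	toy_bin_t bin;
	pthread_mutex_init(&bin.lock, NULL);
	pthread_mutex_init(&bin.remote.mtx, NULL);
	bin.remote.npending = 0;
	for (int i = 0; i < 64; i++) {
		toy_bin_free(&bin, malloc(32));
	}
	/* Single-threaded, so nothing should be left parked; drain anyway. */
	pthread_mutex_lock(&bin.lock);
	remote_free_drain(&bin);
	pthread_mutex_unlock(&bin.lock);
	puts("ok");
	return 0;
}

Note that batching stays off by default (opt_bin_info_max_batched_size is 0); the new test script enables it via MALLOC_CONF, e.g. max_batched_size:1000000000,remote_free_max_batch:1,remote_free_max:4, which is also the easiest way to experiment with the feature.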