Revert PR #2608: Manually revert commits 70c94d..f9c0b5

commit e2da7477f8 (parent 9186700eb3)
Author: Shirui Cheng, 2025-07-15 15:44:14 -07:00
Committed by: Guangli Dai
30 changed files with 124 additions and 1364 deletions

View file

@@ -39,7 +39,8 @@ div_info_t arena_binind_div_info[SC_NBINS];
size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
uint32_t arena_bin_offsets[SC_NBINS];
static unsigned nbins_total;
/*
* a0 is used to handle huge requests before malloc init completes. After
@@ -674,17 +675,11 @@ arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, edata_t *slab) {
}
static void
arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin, unsigned binind) {
arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin) {
edata_t *slab;
malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
if (arena_bin_has_batch(binind)) {
bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin;
batcher_init(
&batched_bin->remote_frees, BIN_REMOTE_FREE_ELEMS_MAX);
}
if (bin->slabcur != NULL) {
slab = bin->slabcur;
bin->slabcur = NULL;
@@ -835,8 +830,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
/* Bins. */
for (unsigned i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
arena_bin_reset(
tsd, arena, arena_get_bin(arena, i, j), i);
arena_bin_reset(tsd, arena, arena_get_bin(arena, i, j));
}
}
pa_shard_reset(tsd_tsdn(tsd), &arena->pa_shard);
@@ -1103,19 +1097,8 @@ arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena, cache_bin_t *cache_bin,
unsigned binshard;
bin_t *bin = arena_bin_choose(tsdn, arena, binind, &binshard);
/*
* This has some fields that are conditionally initialized down batch
* flush pathways. This can trigger static analysis warnings deeper
* down in the stack. The accesses are guarded by the same checks as
* the initialization, but the analysis isn't able to track that across
* multiple stack frames.
*/
arena_bin_flush_batch_state_t batch_flush_state
JEMALLOC_CLANG_ANALYZER_SILENCE_INIT({0});
label_refill:
malloc_mutex_lock(tsdn, &bin->lock);
arena_bin_flush_batch_after_lock(
tsdn, arena, bin, binind, &batch_flush_state);
while (filled < nfill_min) {
/* Try batch-fill from slabcur first. */
@@ -1176,11 +1159,7 @@ label_refill:
cache_bin->tstats.nrequests = 0;
}
arena_bin_flush_batch_before_unlock(
tsdn, arena, bin, binind, &batch_flush_state);
malloc_mutex_unlock(tsdn, &bin->lock);
arena_bin_flush_batch_after_unlock(
tsdn, arena, bin, binind, &batch_flush_state);
if (alloc_and_retry) {
assert(fresh_slab == NULL);
@@ -1474,16 +1453,12 @@ arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) {
malloc_mutex_lock(tsdn, &bin->lock);
arena_dalloc_bin_locked_info_t info;
arena_dalloc_bin_locked_begin(&info, binind);
edata_t *dalloc_slabs[1];
unsigned dalloc_slabs_count = 0;
arena_dalloc_bin_locked_step(tsdn, arena, bin, &info, binind, edata,
ptr, dalloc_slabs, /* ndalloc_slabs */ 1, &dalloc_slabs_count,
/* dalloc_slabs_extra */ NULL);
bool ret = arena_dalloc_bin_locked_step(
tsdn, arena, bin, &info, binind, edata, ptr);
arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info);
malloc_mutex_unlock(tsdn, &bin->lock);
if (dalloc_slabs_count != 0) {
assert(dalloc_slabs[0] == edata);
if (ret) {
arena_slab_dalloc(tsdn, arena, edata);
}
}
@@ -1722,6 +1697,7 @@ arena_t *
arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) {
arena_t *arena;
base_t *base;
unsigned i;
if (ind == 0) {
base = b0get();
@@ -1734,13 +1710,14 @@ arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) {
}
size_t arena_size = ALIGNMENT_CEILING(sizeof(arena_t), CACHELINE)
+ sizeof(bin_with_batch_t) * bin_info_nbatched_bins
+ sizeof(bin_t) * bin_info_nunbatched_bins;
+ sizeof(bin_t) * nbins_total;
arena = (arena_t *)base_alloc(tsdn, base, arena_size, CACHELINE);
if (arena == NULL) {
goto label_error;
}
JEMALLOC_SUPPRESS_WARN_ON_USAGE(
assert((uintptr_t)&arena->all_bins[nbins_total - 1] + sizeof(bin_t)
<= (uintptr_t)arena + arena_size);)
atomic_store_u(&arena->nthreads[0], 0, ATOMIC_RELAXED);
atomic_store_u(&arena->nthreads[1], 0, ATOMIC_RELAXED);
arena->last_thd = NULL;
@@ -1779,13 +1756,11 @@ arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) {
/* Initialize bins. */
atomic_store_u(&arena->binshard_next, 0, ATOMIC_RELEASE);
for (unsigned i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
bin_t *bin = arena_get_bin(arena, i, j);
bool err = bin_init(bin, i);
if (err) {
goto label_error;
}
for (i = 0; i < nbins_total; i++) {
JEMALLOC_SUPPRESS_WARN_ON_USAGE(
bool err = bin_init(&arena->all_bins[i]);)
if (err) {
goto label_error;
}
}
@@ -1943,10 +1918,8 @@ arena_boot(sc_data_t *sc_data, base_t *base, bool hpa) {
uint32_t cur_offset = (uint32_t)offsetof(arena_t, all_bins);)
for (szind_t i = 0; i < SC_NBINS; i++) {
arena_bin_offsets[i] = cur_offset;
uint32_t bin_sz = (i < bin_info_nbatched_sizes
? sizeof(bin_with_batch_t)
: sizeof(bin_t));
cur_offset += (uint32_t)bin_infos[i].n_shards * bin_sz;
nbins_total += bin_infos[i].n_shards;
cur_offset += (uint32_t)(bin_infos[i].n_shards * sizeof(bin_t));
}
return pa_central_init(
&arena_pa_central_global, base, hpa, &hpa_hooks_default);
@@ -1996,21 +1969,17 @@ arena_prefork7(tsdn_t *tsdn, arena_t *arena) {
void
arena_prefork8(tsdn_t *tsdn, arena_t *arena) {
for (szind_t i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
bin_t *bin = arena_get_bin(arena, i, j);
bin_prefork(tsdn, bin, arena_bin_has_batch(i));
}
for (unsigned i = 0; i < nbins_total; i++) {
JEMALLOC_SUPPRESS_WARN_ON_USAGE(
bin_prefork(tsdn, &arena->all_bins[i]);)
}
}
void
arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) {
for (szind_t i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
bin_t *bin = arena_get_bin(arena, i, j);
bin_postfork_parent(tsdn, bin, arena_bin_has_batch(i));
}
for (unsigned i = 0; i < nbins_total; i++) {
JEMALLOC_SUPPRESS_WARN_ON_USAGE(
bin_postfork_parent(tsdn, &arena->all_bins[i]);)
}
malloc_mutex_postfork_parent(tsdn, &arena->large_mtx);
@@ -2047,11 +2016,9 @@ arena_postfork_child(tsdn_t *tsdn, arena_t *arena) {
}
}
for (szind_t i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
bin_t *bin = arena_get_bin(arena, i, j);
bin_postfork_child(tsdn, bin, arena_bin_has_batch(i));
}
for (unsigned i = 0; i < nbins_total; i++) {
JEMALLOC_SUPPRESS_WARN_ON_USAGE(
bin_postfork_child(tsdn, &arena->all_bins[i]);)
}
malloc_mutex_postfork_child(tsdn, &arena->large_mtx);
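
Both versions above pack every bin shard into one contiguous region after arena_t; what the revert changes is the stride, from a per-size-class choice back to a uniform sizeof(bin_t) plus a single nbins_total count. Below is a standalone toy model of the two offset computations; the toy_* names and the shard counts are stand-ins, not jemalloc internals.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_NBINS 4

typedef struct { long lock; } toy_bin_t;
typedef struct { toy_bin_t bin; long remote_free_data[8]; } toy_bin_with_batch_t;

static const unsigned n_shards[TOY_NBINS] = {1, 2, 1, 4};

int
main(void) {
	/* Reverted scheme: uniform stride, so offsets are a running sum and
	 * a single nbins_total suffices to iterate all_bins flatly. */
	uint32_t off = 0;
	unsigned nbins_total = 0;
	for (unsigned i = 0; i < TOY_NBINS; i++) {
		printf("flat layout: bin %u at offset %u\n", i, (unsigned)off);
		nbins_total += n_shards[i];
		off += (uint32_t)(n_shards[i] * sizeof(toy_bin_t));
	}
	printf("nbins_total = %u\n", nbins_total);

	/* Batched scheme being reverted: size classes under the batching
	 * cutoff use the larger stride, so each class needs its own size. */
	unsigned nbatched = 2; /* stand-in for bin_info_nbatched_sizes */
	off = 0;
	for (unsigned i = 0; i < TOY_NBINS; i++) {
		size_t sz = (i < nbatched) ? sizeof(toy_bin_with_batch_t)
		    : sizeof(toy_bin_t);
		printf("mixed layout: bin %u at offset %u\n", i,
		    (unsigned)off);
		off += (uint32_t)(n_shards[i] * sz);
	}
	return 0;
}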

View file

@@ -1,98 +0,0 @@
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/batcher.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/atomic.h"
void
batcher_init(batcher_t *batcher, size_t nelems_max) {
atomic_store_zu(&batcher->nelems, 0, ATOMIC_RELAXED);
batcher->nelems_max = nelems_max;
batcher->npushes = 0;
malloc_mutex_init(&batcher->mtx, "batcher", WITNESS_RANK_BATCHER,
malloc_mutex_rank_exclusive);
}
/*
* Returns an index (into some user-owned array) to use for pushing, or
* BATCHER_NO_IDX if no index is free.
*/
size_t
batcher_push_begin(tsdn_t *tsdn, batcher_t *batcher, size_t elems_to_push) {
assert(elems_to_push > 0);
size_t nelems_guess = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED);
if (nelems_guess + elems_to_push > batcher->nelems_max) {
return BATCHER_NO_IDX;
}
malloc_mutex_lock(tsdn, &batcher->mtx);
size_t nelems = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED);
if (nelems + elems_to_push > batcher->nelems_max) {
malloc_mutex_unlock(tsdn, &batcher->mtx);
return BATCHER_NO_IDX;
}
assert(elems_to_push <= batcher->nelems_max - nelems);
/*
* We update nelems at push time (instead of during pop) so that other
* racing accesses of the batcher can fail fast instead of trying to
* acquire a mutex only to discover that there's no space for them.
*/
atomic_store_zu(
&batcher->nelems, nelems + elems_to_push, ATOMIC_RELAXED);
batcher->npushes++;
return nelems;
}
size_t
batcher_pop_get_pushes(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_assert_owner(tsdn, &batcher->mtx);
size_t npushes = batcher->npushes;
batcher->npushes = 0;
return npushes;
}
void
batcher_push_end(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_assert_owner(tsdn, &batcher->mtx);
assert(atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED) > 0);
malloc_mutex_unlock(tsdn, &batcher->mtx);
}
size_t
batcher_pop_begin(tsdn_t *tsdn, batcher_t *batcher) {
size_t nelems_guess = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED);
assert(nelems_guess <= batcher->nelems_max);
if (nelems_guess == 0) {
return BATCHER_NO_IDX;
}
malloc_mutex_lock(tsdn, &batcher->mtx);
size_t nelems = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED);
assert(nelems <= batcher->nelems_max);
if (nelems == 0) {
malloc_mutex_unlock(tsdn, &batcher->mtx);
return BATCHER_NO_IDX;
}
atomic_store_zu(&batcher->nelems, 0, ATOMIC_RELAXED);
return nelems;
}
void
batcher_pop_end(tsdn_t *tsdn, batcher_t *batcher) {
assert(atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED) == 0);
malloc_mutex_unlock(tsdn, &batcher->mtx);
}
void
batcher_prefork(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_prefork(tsdn, &batcher->mtx);
}
void
batcher_postfork_parent(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_postfork_parent(tsdn, &batcher->mtx);
}
void
batcher_postfork_child(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_postfork_child(tsdn, &batcher->mtx);
}
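
The file deleted above implements a fixed-capacity mailbox: push_begin reserves slots (bumping nelems at push time so racing pushers fail fast), pop_begin claims everything at once, and the mutex is held between each begin/end pair. Below is a standalone toy model of that protocol, using a plain pthread mutex and omitting the relaxed-atomic fast-path checks and witness ranking; the toy_* names are stand-ins, not jemalloc internals.

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

#define TOY_CAP 8
#define TOY_NO_IDX ((size_t)-1)

typedef struct {
	pthread_mutex_t mtx;
	size_t nelems;     /* slots reserved so far */
	size_t npushes;    /* pushes since the last pop */
	int data[TOY_CAP]; /* the "user-owned array" the indices refer to */
} toy_batcher_t;

/* Reserve n slots; on success the mutex stays held until toy_push_end. */
static size_t
toy_push_begin(toy_batcher_t *b, size_t n) {
	pthread_mutex_lock(&b->mtx);
	if (b->nelems + n > TOY_CAP) {
		pthread_mutex_unlock(&b->mtx);
		return TOY_NO_IDX;
	}
	size_t idx = b->nelems;
	b->nelems += n; /* updated at push time, as in the deleted file */
	b->npushes++;
	return idx;
}

static void
toy_push_end(toy_batcher_t *b) {
	pthread_mutex_unlock(&b->mtx);
}

/* Claim every element; on success the mutex stays held until toy_pop_end. */
static size_t
toy_pop_begin(toy_batcher_t *b) {
	pthread_mutex_lock(&b->mtx);
	if (b->nelems == 0) {
		pthread_mutex_unlock(&b->mtx);
		return TOY_NO_IDX;
	}
	size_t n = b->nelems;
	b->nelems = 0;
	return n;
}

static void
toy_pop_end(toy_batcher_t *b) {
	pthread_mutex_unlock(&b->mtx);
}

int
main(void) {
	toy_batcher_t b = {PTHREAD_MUTEX_INITIALIZER, 0, 0, {0}};
	size_t idx = toy_push_begin(&b, 2);
	if (idx != TOY_NO_IDX) {
		b.data[idx] = 10;
		b.data[idx + 1] = 20;
		toy_push_end(&b);
	}
	size_t n = toy_pop_begin(&b);
	if (n != TOY_NO_IDX) {
		for (size_t i = 0; i < n; i++) {
			printf("popped %d\n", b.data[i]);
		}
		toy_pop_end(&b);
	}
	return 0;
}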

View file

@@ -6,14 +6,6 @@
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/witness.h"
#ifdef JEMALLOC_JET
unsigned bin_batching_test_ndalloc_slabs_max = (unsigned)-1;
void (*bin_batching_test_after_push_hook)(size_t push_idx);
void (*bin_batching_test_mid_pop_hook)(size_t nelems_to_pop);
void (*bin_batching_test_after_unlock_hook)(
unsigned slab_dalloc_count, bool list_empty);
#endif
bool
bin_update_shard_size(unsigned bin_shard_sizes[SC_NBINS], size_t start_size,
size_t end_size, size_t nshards) {
@@ -47,7 +39,7 @@ bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]) {
}
bool
bin_init(bin_t *bin, unsigned binind) {
bin_init(bin_t *bin) {
if (malloc_mutex_init(&bin->lock, "bin", WITNESS_RANK_BIN,
malloc_mutex_rank_exclusive)) {
return true;
@@ -58,52 +50,20 @@ bin_init(bin_t *bin, unsigned binind) {
if (config_stats) {
memset(&bin->stats, 0, sizeof(bin_stats_t));
}
if (arena_bin_has_batch(binind)) {
bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin;
batcher_init(
&batched_bin->remote_frees, opt_bin_info_remote_free_max);
}
return false;
}
void
bin_prefork(tsdn_t *tsdn, bin_t *bin, bool has_batch) {
bin_prefork(tsdn_t *tsdn, bin_t *bin) {
malloc_mutex_prefork(tsdn, &bin->lock);
if (has_batch) {
/*
* The batch mutex has lower rank than the bin mutex (as it must
* -- it's acquired later). But during forking, we go
* bin-at-a-time, so that we acquire mutex on bin 0, then on
* the bin 0 batcher, then on bin 1. This is a safe ordering
* (it's ordered by the index of arenas and bins within those
* arenas), but will trigger witness errors that would
* otherwise force another level of arena forking that breaks
* bin encapsulation (because the witness API doesn't "know"
* about arena or bin ordering -- it just sees that the batcher
* has a lower rank than the bin). So instead we exclude the
* batcher mutex from witness checking during fork (which is
* the only time we touch multiple bins at once) by passing
* TSDN_NULL.
*/
bin_with_batch_t *batched = (bin_with_batch_t *)bin;
batcher_prefork(TSDN_NULL, &batched->remote_frees);
}
}
void
bin_postfork_parent(tsdn_t *tsdn, bin_t *bin, bool has_batch) {
bin_postfork_parent(tsdn_t *tsdn, bin_t *bin) {
malloc_mutex_postfork_parent(tsdn, &bin->lock);
if (has_batch) {
bin_with_batch_t *batched = (bin_with_batch_t *)bin;
batcher_postfork_parent(TSDN_NULL, &batched->remote_frees);
}
}
void
bin_postfork_child(tsdn_t *tsdn, bin_t *bin, bool has_batch) {
bin_postfork_child(tsdn_t *tsdn, bin_t *bin) {
malloc_mutex_postfork_child(tsdn, &bin->lock);
if (has_batch) {
bin_with_batch_t *batched = (bin_with_batch_t *)bin;
batcher_postfork_child(TSDN_NULL, &batched->remote_frees);
}
}
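
The long comment deleted from bin_prefork records why witness checking is bypassed during fork: bins are visited one at a time, and each bin's batcher mutex is taken immediately after its bin mutex, which is a globally consistent order even though it interleaves the two ranks. A standalone toy sketch of that ordering, with pthread mutexes standing in for malloc_mutex_t and no witness machinery:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_NBINS 3

typedef struct {
	pthread_mutex_t lock;         /* "bin" mutex, higher rank */
	pthread_mutex_t remote_frees; /* "batcher" mutex, lower rank */
	bool has_batch;
} toy_bin_t;

static toy_bin_t bins[TOY_NBINS];

/* Fork-time order: bin 0, batcher 0, bin 1, batcher 1, ... Deadlock-free
 * because every locker follows the same bin-index order, even though the
 * batcher's rank is lower than the next bin's. */
static void
toy_prefork(void) {
	for (unsigned i = 0; i < TOY_NBINS; i++) {
		pthread_mutex_lock(&bins[i].lock);
		if (bins[i].has_batch) {
			pthread_mutex_lock(&bins[i].remote_frees);
		}
	}
}

static void
toy_postfork(void) {
	for (unsigned i = 0; i < TOY_NBINS; i++) {
		if (bins[i].has_batch) {
			pthread_mutex_unlock(&bins[i].remote_frees);
		}
		pthread_mutex_unlock(&bins[i].lock);
	}
}

int
main(void) {
	for (unsigned i = 0; i < TOY_NBINS; i++) {
		pthread_mutex_init(&bins[i].lock, NULL);
		pthread_mutex_init(&bins[i].remote_frees, NULL);
		bins[i].has_batch = (i < 2); /* pretend the small bins batch */
	}
	toy_prefork();
	toy_postfork();
	puts("fork-time lock order exercised");
	return 0;
}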

View file

@@ -3,26 +3,8 @@
#include "jemalloc/internal/bin_info.h"
/*
* We leave bin-batching disabled by default, with other settings chosen mostly
* empirically; across the test programs I looked at they provided the most bang
* for the buck. With other default settings, these choices for bin batching
* result in them consuming far less memory (even in the worst case) than the
* tcaches themselves, the arena, etc.
* Note that we always try to pop all bins on every arena cache bin lock
* operation, so the typical memory waste is far less than this (and only on
* hot bins, which tend to be large anyways).
*/
size_t opt_bin_info_max_batched_size = 0; /* 192 is a good default. */
size_t opt_bin_info_remote_free_max_batch = 4;
size_t opt_bin_info_remote_free_max = BIN_REMOTE_FREE_ELEMS_MAX;
bin_info_t bin_infos[SC_NBINS];
szind_t bin_info_nbatched_sizes;
unsigned bin_info_nbatched_bins;
unsigned bin_info_nunbatched_bins;
static void
bin_infos_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
bin_info_t infos[SC_NBINS]) {
@@ -38,12 +20,6 @@ bin_infos_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
bitmap_info_t bitmap_info = BITMAP_INFO_INITIALIZER(
bin_info->nregs);
bin_info->bitmap_info = bitmap_info;
if (bin_info->reg_size <= opt_bin_info_max_batched_size) {
bin_info_nbatched_sizes++;
bin_info_nbatched_bins += bin_info->n_shards;
} else {
bin_info_nunbatched_bins += bin_info->n_shards;
}
}
}

View file

@@ -134,9 +134,6 @@ CTL_PROTO(opt_utrace)
CTL_PROTO(opt_xmalloc)
CTL_PROTO(opt_experimental_infallible_new)
CTL_PROTO(opt_experimental_tcache_gc)
CTL_PROTO(opt_max_batched_size)
CTL_PROTO(opt_remote_free_max)
CTL_PROTO(opt_remote_free_max_batch)
CTL_PROTO(opt_tcache)
CTL_PROTO(opt_tcache_max)
CTL_PROTO(opt_tcache_nslots_small_min)
@@ -248,10 +245,6 @@ CTL_PROTO(stats_arenas_i_bins_j_nslabs)
CTL_PROTO(stats_arenas_i_bins_j_nreslabs)
CTL_PROTO(stats_arenas_i_bins_j_curslabs)
CTL_PROTO(stats_arenas_i_bins_j_nonfull_slabs)
CTL_PROTO(stats_arenas_i_bins_j_batch_pops)
CTL_PROTO(stats_arenas_i_bins_j_batch_failed_pushes)
CTL_PROTO(stats_arenas_i_bins_j_batch_pushes)
CTL_PROTO(stats_arenas_i_bins_j_batch_pushed_elems)
INDEX_PROTO(stats_arenas_i_bins_j)
CTL_PROTO(stats_arenas_i_lextents_j_nmalloc)
CTL_PROTO(stats_arenas_i_lextents_j_ndalloc)
@@ -501,9 +494,6 @@ static const ctl_named_node_t opt_node[] = {{NAME("abort"), CTL(opt_abort)},
{NAME("utrace"), CTL(opt_utrace)}, {NAME("xmalloc"), CTL(opt_xmalloc)},
{NAME("experimental_infallible_new"), CTL(opt_experimental_infallible_new)},
{NAME("experimental_tcache_gc"), CTL(opt_experimental_tcache_gc)},
{NAME("max_batched_size"), CTL(opt_max_batched_size)},
{NAME("remote_free_max"), CTL(opt_remote_free_max)},
{NAME("remote_free_max_batch"), CTL(opt_remote_free_max_batch)},
{NAME("tcache"), CTL(opt_tcache)},
{NAME("tcache_max"), CTL(opt_tcache_max)},
{NAME("tcache_nslots_small_min"), CTL(opt_tcache_nslots_small_min)},
@@ -673,11 +663,6 @@ static const ctl_named_node_t stats_arenas_i_bins_j_node[] = {
{NAME("nreslabs"), CTL(stats_arenas_i_bins_j_nreslabs)},
{NAME("curslabs"), CTL(stats_arenas_i_bins_j_curslabs)},
{NAME("nonfull_slabs"), CTL(stats_arenas_i_bins_j_nonfull_slabs)},
{NAME("batch_pops"), CTL(stats_arenas_i_bins_j_batch_pops)},
{NAME("batch_failed_pushes"),
CTL(stats_arenas_i_bins_j_batch_failed_pushes)},
{NAME("batch_pushes"), CTL(stats_arenas_i_bins_j_batch_pushes)},
{NAME("batch_pushed_elems"), CTL(stats_arenas_i_bins_j_batch_pushed_elems)},
{NAME("mutex"), CHILD(named, stats_arenas_i_bins_j_mutex)}};
static const ctl_named_node_t super_stats_arenas_i_bins_j_node[] = {
@@ -1219,14 +1204,6 @@ ctl_arena_stats_sdmerge(
assert(bstats->curslabs == 0);
assert(bstats->nonfull_slabs == 0);
}
merged->batch_pops += bstats->batch_pops;
merged->batch_failed_pushes +=
bstats->batch_failed_pushes;
merged->batch_pushes += bstats->batch_pushes;
merged->batch_pushed_elems +=
bstats->batch_pushed_elems;
malloc_mutex_prof_merge(&sdstats->bstats[i].mutex_data,
&astats->bstats[i].mutex_data);
}
@@ -2202,10 +2179,6 @@ CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool)
CTL_RO_NL_CGEN(config_enable_cxx, opt_experimental_infallible_new,
opt_experimental_infallible_new, bool)
CTL_RO_NL_GEN(opt_experimental_tcache_gc, opt_experimental_tcache_gc, bool)
CTL_RO_NL_GEN(opt_max_batched_size, opt_bin_info_max_batched_size, size_t)
CTL_RO_NL_GEN(opt_remote_free_max, opt_bin_info_remote_free_max, size_t)
CTL_RO_NL_GEN(
opt_remote_free_max_batch, opt_bin_info_remote_free_max_batch, size_t)
CTL_RO_NL_GEN(opt_tcache, opt_tcache, bool)
CTL_RO_NL_GEN(opt_tcache_max, opt_tcache_max, size_t)
CTL_RO_NL_GEN(
@@ -3982,16 +3955,6 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curslabs,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.curslabs, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nonfull_slabs,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nonfull_slabs, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pops,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pops, uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_failed_pushes,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_failed_pushes,
uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pushes,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pushes, uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pushed_elems,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pushed_elems,
uint64_t)
static const ctl_named_node_t *
stats_arenas_i_bins_j_index(
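
The ctl hunks above delete the batch_* leaves from the stats.arenas.<i>.bins.<j> namespace. For orientation, this is how such leaves are read through the public mallctl API; a sketch assuming an unprefixed, stats-enabled build, where after this revert the batch_pops probe fails (ENOENT) while curslabs still resolves.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	free(malloc(16)); /* ensure arena 0 and its bin stats exist */

	/* Stats are snapshotted; bump the epoch to refresh the snapshot. */
	uint64_t epoch = 1;
	size_t sz = sizeof(epoch);
	mallctl("epoch", &epoch, &sz, &epoch, sz);

	size_t curslabs;
	sz = sizeof(curslabs);
	if (mallctl("stats.arenas.0.bins.0.curslabs", &curslabs, &sz,
	    NULL, 0) == 0) {
		printf("arena 0, bin 0: curslabs = %zu\n", curslabs);
	}

	/* One of the leaves removed by this commit. */
	uint64_t batch_pops;
	sz = sizeof(batch_pops);
	int err = mallctl("stats.arenas.0.bins.0.batch_pops", &batch_pops,
	    &sz, NULL, 0);
	printf("batch_pops lookup: %d (nonzero after the revert)\n", err);
	return 0;
}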

View file

@@ -1391,20 +1391,6 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
} while (vlen_left > 0);
CONF_CONTINUE;
}
CONF_HANDLE_SIZE_T(opt_bin_info_max_batched_size,
"max_batched_size", 0, SIZE_T_MAX,
CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX,
/* clip */ true)
CONF_HANDLE_SIZE_T(opt_bin_info_remote_free_max_batch,
"remote_free_max_batch", 0,
BIN_REMOTE_FREE_ELEMS_MAX, CONF_DONT_CHECK_MIN,
CONF_CHECK_MAX,
/* clip */ true)
CONF_HANDLE_SIZE_T(opt_bin_info_remote_free_max,
"remote_free_max", 0, BIN_REMOTE_FREE_ELEMS_MAX,
CONF_DONT_CHECK_MIN, CONF_CHECK_MAX,
/* clip */ true)
if (CONF_MATCH("tcache_ncached_max")) {
bool err = tcache_bin_info_default_init(
v, vlen);
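
The deleted CONF_HANDLE_SIZE_T entries are what wired these names into jemalloc's normal configuration channels (the MALLOC_CONF environment variable, or the application-provided malloc_conf symbol shown here). A sketch of how they were set before the revert, assuming an unprefixed build; the values are purely illustrative, and after the revert these names are reported as unrecognized conf options.

#include <stdlib.h>

/* Read by jemalloc during its first-allocation bootstrap. */
const char *malloc_conf =
    "max_batched_size:192,remote_free_max:256,remote_free_max_batch:4";

int
main(void) {
	free(malloc(1)); /* any first allocation triggers conf parsing */
	return 0;
}

The same string can be supplied at run time instead, e.g. MALLOC_CONF="max_batched_size:192" ./app.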

View file

@@ -357,15 +357,6 @@ stats_arena_bins_print(
COL_HDR(row, nreslabs, NULL, right, 13, uint64)
COL_HDR(row, nreslabs_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, pops, NULL, right, 10, uint64)
COL_HDR(row, pops_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, failed_push, NULL, right, 13, uint64)
COL_HDR(row, failed_push_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, push, NULL, right, 7, uint64)
COL_HDR(row, push_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, push_elem, NULL, right, 12, uint64)
COL_HDR(row, push_elem_ps, "(#/sec)", right, 8, uint64)
/* Don't want to actually print the name. */
header_justify_spacer.str_val = " ";
col_justify_spacer.str_val = " ";
@@ -406,15 +397,13 @@ stats_arena_bins_print(
}
for (j = 0, in_gap = false; j < nbins; j++) {
uint64_t nslabs;
size_t reg_size, slab_size, curregs;
size_t curslabs;
size_t nonfull_slabs;
uint32_t nregs, nshards;
uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes;
uint64_t nreslabs;
uint64_t batch_pops, batch_failed_pushes, batch_pushes,
batch_pushed_elems;
uint64_t nslabs;
size_t reg_size, slab_size, curregs;
size_t curslabs;
size_t nonfull_slabs;
uint32_t nregs, nshards;
uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes;
uint64_t nreslabs;
prof_stats_t prof_live;
prof_stats_t prof_accum;
@@ -463,15 +452,6 @@ stats_arena_bins_print(
CTL_LEAF(stats_arenas_mib, 5, "nonfull_slabs", &nonfull_slabs,
size_t);
CTL_LEAF(
stats_arenas_mib, 5, "batch_pops", &batch_pops, uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_failed_pushes",
&batch_failed_pushes, uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_pushes", &batch_pushes,
uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_pushed_elems",
&batch_pushed_elems, uint64_t);
if (mutex) {
mutex_stats_read_arena_bin(stats_arenas_mib, 5,
col_mutex64, col_mutex32, uptime);
@@ -506,14 +486,6 @@ stats_arena_bins_print(
emitter, "curslabs", emitter_type_size, &curslabs);
emitter_json_kv(emitter, "nonfull_slabs", emitter_type_size,
&nonfull_slabs);
emitter_json_kv(
emitter, "batch_pops", emitter_type_uint64, &batch_pops);
emitter_json_kv(emitter, "batch_failed_pushes",
emitter_type_uint64, &batch_failed_pushes);
emitter_json_kv(emitter, "batch_pushes", emitter_type_uint64,
&batch_pushes);
emitter_json_kv(emitter, "batch_pushed_elems",
emitter_type_uint64, &batch_pushed_elems);
if (mutex) {
emitter_json_object_kv_begin(emitter, "mutex");
mutex_stats_emit(
@@ -573,19 +545,6 @@ stats_arena_bins_print(
col_nreslabs.uint64_val = nreslabs;
col_nreslabs_ps.uint64_val = rate_per_second(nreslabs, uptime);
col_pops.uint64_val = batch_pops;
col_pops_ps.uint64_val = rate_per_second(batch_pops, uptime);
col_failed_push.uint64_val = batch_failed_pushes;
col_failed_push_ps.uint64_val = rate_per_second(
batch_failed_pushes, uptime);
col_push.uint64_val = batch_pushes;
col_push_ps.uint64_val = rate_per_second(batch_pushes, uptime);
col_push_elem.uint64_val = batch_pushed_elems;
col_push_elem_ps.uint64_val = rate_per_second(
batch_pushed_elems, uptime);
/*
* Note that mutex columns were initialized above, if mutex ==
* true.
@@ -1677,9 +1636,6 @@ stats_general_print(emitter_t *emitter) {
OPT_WRITE_BOOL("xmalloc")
OPT_WRITE_BOOL("experimental_infallible_new")
OPT_WRITE_BOOL("experimental_tcache_gc")
OPT_WRITE_SIZE_T("max_batched_size")
OPT_WRITE_SIZE_T("remote_free_max")
OPT_WRITE_SIZE_T("remote_free_max_batch")
OPT_WRITE_BOOL("tcache")
OPT_WRITE_SIZE_T("tcache_max")
OPT_WRITE_UNSIGNED("tcache_nslots_small_min")
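
These stats.c hunks remove the batch columns from the table emitter and the batch_* keys from the per-bin JSON objects, so the change is directly visible in stats output. A minimal probe via the public malloc_stats_print API ("J" selects JSON output):

#include <jemalloc/jemalloc.h>

int
main(void) {
	/* Before the revert each bin object carried batch_pops,
	 * batch_failed_pushes, batch_pushes, and batch_pushed_elems;
	 * after it, those keys are gone. */
	malloc_stats_print(NULL, NULL, "J");
	return 0;
}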

View file

@@ -608,7 +608,7 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
}
arena_cache_bin_fill_small(tsdn, arena, cache_bin, binind,
/* nfill_min */
opt_experimental_tcache_gc ? ((nfill >> 1) + 1) : nfill,
opt_experimental_tcache_gc ? ((nfill >> 1) + 1) : nfill,
/* nfill_max */ nfill);
tcache_slow->bin_refilled[binind] = true;
tcache_nfill_small_burst_prepare(tcache_slow, binind);
@@ -680,8 +680,6 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
assert(binind < SC_NBINS);
arena_t *tcache_arena = tcache_slow->arena;
assert(tcache_arena != NULL);
unsigned tcache_binshard =
tsd_binshardsp_get(tsdn_tsd(tsdn))->binshard[binind];
/*
* Variable length array must have > 0 length; the last element is never
@@ -698,25 +696,12 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
unsigned dalloc_count = 0;
VARIABLE_ARRAY(edata_t *, dalloc_slabs, nflush + 1);
/*
* There's an edge case where we need to deallocate more slabs than we
* have elements of dalloc_slabs. This can happen if we end up deallocating
* items batched by another thread in addition to ones flushed from the
* cache. Since this is not very likely (most small object
* deallocations don't free up a whole slab), we don't want to burn the
* stack space to keep those excess slabs in an array. Instead we'll
* maintain an overflow list.
*/
edata_list_active_t dalloc_slabs_extra;
edata_list_active_init(&dalloc_slabs_extra);
/*
* We're about to grab a bunch of locks. If one of them happens to be
* the one guarding the arena-level stats counters we flush our
* thread-local ones to, we do so under one critical section.
*/
bool merged_stats = false;
/*
* We maintain the invariant that all edatas yet to be flushed are
* contained in the half-open range [flush_start, flush_end). We'll
@@ -741,7 +726,6 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
unsigned cur_binshard = edata_binshard_get(cur_edata);
bin_t *cur_bin = arena_get_bin(cur_arena, binind, cur_binshard);
assert(cur_binshard < bin_infos[binind].n_shards);
/*
* Start off the partition; item_edata[i] always matches itself
* of course.
@@ -788,150 +772,43 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
}
}
/*
* We never batch when flushing to our home-base bin shard,
* since it's likely that we'll have to acquire that lock anyway
* when flushing stats.
*
* A plausible check we could add to can_batch is
* '&& arena_is_auto(cur_arena)'. The motivation would be that
* we have a higher tolerance for dubious user assumptions
* around non-auto arenas (e.g. "if I deallocate every object I
* allocated, and then call tcache.flush, then the arena stats
* must reflect zero live allocations").
*
* This is dubious for a couple reasons:
* - We already don't provide perfect fidelity for stats
* counting (e.g. for profiled allocations, whose size can
* inflate in stats).
* - Hanging load-bearing guarantees around stats impedes
* scalability in general.
*
* There are some "complete" strategies we could do instead:
* - Add an arena.<i>.quiesce call to pop all bins for users who
* do want those stats accounted for.
* - Make batchability a user-controllable per-arena option.
* - Do a batch pop after every mutex acquisition for which we
* want to provide accurate stats. This gives perfectly
* accurate stats, but can cause weird performance effects
* (because doing stats collection can now result in slabs
* becoming empty, and therefore purging, large mutex
* acquisition, etc.).
* - Propagate the "why" behind a flush down to the level of the
* batcher, and include a batch pop attempt down full tcache
* flushing pathways. This is just a lot of plumbing and
* internal complexity.
*
* We don't do any of these right now, but the decision calculus
* and tradeoffs are subtle enough that the reasoning was worth
* leaving in this comment.
*/
bool bin_is_batched = arena_bin_has_batch(binind);
bool home_binshard = (cur_arena == tcache_arena
&& cur_binshard == tcache_binshard);
bool can_batch = (flush_start - prev_flush_start
<= opt_bin_info_remote_free_max_batch)
&& !home_binshard && bin_is_batched;
/* Actually do the flushing. */
malloc_mutex_lock(tsdn, &cur_bin->lock);
/*
* We try to avoid the batching pathway if we can, so we always
* at least *try* to lock.
* Flush stats first, if that was the right lock. Note that we
* don't actually have to flush stats into the current thread's
* binshard. Flushing into any binshard in the same arena is
* enough; we don't expose stats on per-binshard basis (just
* per-bin).
*/
bool locked = false;
bool batched = false;
bool batch_failed = false;
if (can_batch) {
locked = !malloc_mutex_trylock(tsdn, &cur_bin->lock);
if (config_stats && tcache_arena == cur_arena
&& !merged_stats) {
merged_stats = true;
cur_bin->stats.nflushes++;
cur_bin->stats.nrequests += cache_bin->tstats.nrequests;
cache_bin->tstats.nrequests = 0;
}
if (can_batch && !locked) {
bin_with_batch_t *batched_bin = (bin_with_batch_t *)
cur_bin;
size_t push_idx = batcher_push_begin(tsdn,
&batched_bin->remote_frees,
flush_start - prev_flush_start);
bin_batching_test_after_push(push_idx);
if (push_idx != BATCHER_NO_IDX) {
batched = true;
unsigned nbatched = flush_start
- prev_flush_start;
for (unsigned i = 0; i < nbatched; i++) {
unsigned src_ind = prev_flush_start + i;
batched_bin
->remote_free_data[push_idx + i]
.ptr = ptrs->ptr[src_ind];
batched_bin
->remote_free_data[push_idx + i]
.slab = item_edata[src_ind].edata;
}
batcher_push_end(
tsdn, &batched_bin->remote_frees);
} else {
batch_failed = true;
/* Next flush objects. */
/* Init only to avoid used-uninitialized warning. */
arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
for (unsigned i = prev_flush_start; i < flush_start; i++) {
void *ptr = ptrs->ptr[i];
edata_t *edata = item_edata[i].edata;
if (arena_dalloc_bin_locked_step(tsdn, cur_arena,
cur_bin, &dalloc_bin_info, binind, edata,
ptr)) {
dalloc_slabs[dalloc_count] = edata;
dalloc_count++;
}
}
if (!batched) {
if (!locked) {
malloc_mutex_lock(tsdn, &cur_bin->lock);
}
/*
* Unlike other stats (which only ever get flushed into
* a tcache's associated arena), batch_failed counts get
* accumulated into the bin where the push attempt
* failed.
*/
if (config_stats && batch_failed) {
cur_bin->stats.batch_failed_pushes++;
}
/*
* Flush stats first, if that was the right lock. Note
* that we don't actually have to flush stats into the
* current thread's binshard. Flushing into any binshard
* in the same arena is enough; we don't expose stats on
* per-binshard basis (just per-bin).
*/
if (config_stats && tcache_arena == cur_arena
&& !merged_stats) {
merged_stats = true;
cur_bin->stats.nflushes++;
cur_bin->stats.nrequests +=
cache_bin->tstats.nrequests;
cache_bin->tstats.nrequests = 0;
}
unsigned preallocated_slabs = nflush;
unsigned ndalloc_slabs =
arena_bin_batch_get_ndalloc_slabs(
preallocated_slabs);
arena_dalloc_bin_locked_finish(
tsdn, cur_arena, cur_bin, &dalloc_bin_info);
malloc_mutex_unlock(tsdn, &cur_bin->lock);
/* Next flush our own objects. */
/* Init only to avoid used-uninitialized warning. */
arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
for (unsigned i = prev_flush_start; i < flush_start;
i++) {
void *ptr = ptrs->ptr[i];
edata_t *edata = item_edata[i].edata;
arena_dalloc_bin_locked_step(tsdn, cur_arena,
cur_bin, &dalloc_bin_info, binind, edata,
ptr, dalloc_slabs, ndalloc_slabs,
&dalloc_count, &dalloc_slabs_extra);
}
/*
* Lastly, flush any batched objects (from other
* threads).
*/
if (bin_is_batched) {
arena_bin_flush_batch_impl(tsdn, cur_arena,
cur_bin, &dalloc_bin_info, binind,
dalloc_slabs, ndalloc_slabs, &dalloc_count,
&dalloc_slabs_extra);
}
arena_dalloc_bin_locked_finish(
tsdn, cur_arena, cur_bin, &dalloc_bin_info);
malloc_mutex_unlock(tsdn, &cur_bin->lock);
}
arena_decay_ticks(
tsdn, cur_arena, flush_start - prev_flush_start);
}
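
The comment deleted near the top of this file's diff explains the slab-deallocation overflow strategy: a stack-sized array covers the common case, and a linked list absorbs the rare excess (e.g. when remotely batched frees empty extra slabs). A standalone toy model of that fixed-array-plus-overflow-list pattern; the toy types are stand-ins for edata_t and edata_list_active_t.

#include <stddef.h>
#include <stdio.h>

typedef struct toy_slab_s toy_slab_t;
struct toy_slab_s {
	int id;
	toy_slab_t *next;
};

#define TOY_NDALLOC 4 /* stand-in for the preallocated dalloc_slabs size */

int
main(void) {
	toy_slab_t slabs[6];
	for (int i = 0; i < 6; i++) {
		slabs[i].id = i;
		slabs[i].next = NULL;
	}

	toy_slab_t *fixed[TOY_NDALLOC]; /* common case: no heap traffic */
	size_t count = 0;
	toy_slab_t *extra = NULL; /* rare overflow: intrusive list */

	for (int i = 0; i < 6; i++) {
		if (count < TOY_NDALLOC) {
			fixed[count++] = &slabs[i];
		} else {
			slabs[i].next = extra;
			extra = &slabs[i];
		}
	}

	/* Postprocess outside the (imagined) bin lock, mirroring the drain
	 * loops at the end of tcache_bin_flush_impl_small. */
	for (size_t i = 0; i < count; i++) {
		printf("dalloc slab %d\n", fixed[i]->id);
	}
	while (extra != NULL) {
		toy_slab_t *s = extra;
		extra = s->next;
		printf("dalloc overflow slab %d\n", s->id);
	}
	return 0;
}
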
@@ -941,18 +818,13 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
edata_t *slab = dalloc_slabs[i];
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
}
while (!edata_list_active_empty(&dalloc_slabs_extra)) {
edata_t *slab = edata_list_active_first(&dalloc_slabs_extra);
edata_list_active_remove(&dalloc_slabs_extra, slab);
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
}
if (config_stats && !merged_stats) {
/*
* The flush loop didn't happen to flush to this
* thread's arena, so the stats didn't get merged.
* Manually do so now.
*/
* The flush loop didn't happen to flush to this
* thread's arena, so the stats didn't get merged.
* Manually do so now.
*/
bin_t *bin = arena_bin_choose(tsdn, tcache_arena, binind, NULL);
malloc_mutex_lock(tsdn, &bin->lock);
bin->stats.nflushes++;