Pre-generate ncached_max for all bins for better tcache_max tuning experience.

This commit is contained in:
guangli-dai 2023-08-22 16:31:54 -07:00 committed by Qi Wang
parent 36becb1302
commit 6b197fdd46
12 changed files with 417 additions and 308 deletions

View file

@ -60,10 +60,10 @@ unsigned opt_lg_tcache_flush_large_div = 1;
/*
* Number of cache bins enabled, including both large and small. This value
* is only used to initialize tcache_nhbins in the per-thread tcache.
* is only used to initialize tcache_nbins in the per-thread tcache.
* Directly modifying it will not affect threads already launched.
*/
unsigned global_do_not_change_nhbins;
unsigned global_do_not_change_nbins;
/*
* Max size class to be cached (can be small or large). This value is only used
* to initialize tcache_max in the per-thread tcache. Directly modifying it
@ -129,6 +129,7 @@ tcache_gc_small(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
assert(szind < SC_NBINS);
cache_bin_t *cache_bin = &tcache->bins[szind];
assert(!tcache_bin_disabled(szind, cache_bin, tcache->tcache_slow));
cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin,
&cache_bin->bin_info);
cache_bin_sz_t low_water = cache_bin_low_water_get(cache_bin,
@ -155,7 +156,7 @@ tcache_gc_small(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
* Reduce fill count by 2X. Limit lg_fill_div such that
* the fill count is always at least 1.
*/
if ((cache_bin_info_ncached_max(&cache_bin->bin_info)
if ((cache_bin_info_ncached_max_get(cache_bin, &cache_bin->bin_info)
>> (tcache_slow->lg_fill_div[szind] + 1)) >= 1) {
tcache_slow->lg_fill_div[szind]++;
}
@ -167,6 +168,7 @@ tcache_gc_large(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
/* Like the small GC; flush 3/4 of untouched items. */
assert(szind >= SC_NBINS);
cache_bin_t *cache_bin = &tcache->bins[szind];
assert(!tcache_bin_disabled(szind, cache_bin, tcache->tcache_slow));
cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin,
&cache_bin->bin_info);
cache_bin_sz_t low_water = cache_bin_low_water_get(cache_bin,
@ -187,8 +189,12 @@ tcache_event(tsd_t *tsd) {
bool is_small = (szind < SC_NBINS);
cache_bin_t *cache_bin = &tcache->bins[szind];
tcache_bin_flush_stashed(tsd, tcache, cache_bin, szind, is_small);
if (tcache_bin_disabled(szind, cache_bin, tcache_slow)) {
goto label_done;
}
tcache_bin_flush_stashed(tsd, tcache, cache_bin, szind,
is_small);
cache_bin_sz_t low_water = cache_bin_low_water_get(cache_bin,
&cache_bin->bin_info);
if (low_water > 0) {
@ -210,8 +216,9 @@ tcache_event(tsd_t *tsd) {
}
cache_bin_low_water_set(cache_bin);
label_done:
tcache_slow->next_gc_bin++;
if (tcache_slow->next_gc_bin == tcache_nhbins_get(tcache)) {
if (tcache_slow->next_gc_bin == tcache_nbins_get(tcache_slow)) {
tcache_slow->next_gc_bin = 0;
}
}
@ -236,8 +243,9 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena,
void *ret;
assert(tcache_slow->arena != NULL);
unsigned nfill = cache_bin_info_ncached_max(&cache_bin->bin_info)
>> tcache_slow->lg_fill_div[binind];
assert(!tcache_bin_disabled(binind, cache_bin, tcache_slow));
unsigned nfill = cache_bin_info_ncached_max_get(cache_bin,
&cache_bin->bin_info) >> tcache_slow->lg_fill_div[binind];
arena_cache_bin_fill_small(tsdn, arena, cache_bin,
&cache_bin->bin_info, binind, nfill);
tcache_slow->bin_refilled[binind] = true;
@ -321,7 +329,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
if (small) {
assert(binind < SC_NBINS);
} else {
assert(binind < tcache_nhbins_get(tcache));
assert(binind < tcache_nbins_get(tcache_slow));
}
arena_t *tcache_arena = tcache_slow->arena;
assert(tcache_arena != NULL);
@ -508,6 +516,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
JEMALLOC_ALWAYS_INLINE void
tcache_bin_flush_bottom(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
szind_t binind, unsigned rem, bool small) {
assert(!tcache_bin_disabled(binind, cache_bin, tcache->tcache_slow));
tcache_bin_flush_stashed(tsd, tcache, cache_bin, binind, small);
cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin,
@ -551,6 +560,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
void
tcache_bin_flush_stashed(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
szind_t binind, bool is_small) {
assert(!tcache_bin_disabled(binind, cache_bin, tcache->tcache_slow));
cache_bin_info_t *info = &cache_bin->bin_info;
/*
* The two below are for assertion only. The content of original cached
@ -562,7 +572,8 @@ tcache_bin_flush_stashed(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
info);
cache_bin_sz_t nstashed = cache_bin_nstashed_get_local(cache_bin, info);
assert(orig_cached + nstashed <= cache_bin_info_ncached_max(info));
assert(orig_cached + nstashed <=
cache_bin_info_ncached_max_get(cache_bin, info));
if (nstashed == 0) {
return;
}
@ -637,33 +648,11 @@ tcache_arena_reassociate(tsdn_t *tsdn, tcache_slow_t *tcache_slow,
}
static void
tcache_max_and_nhbins_init(tcache_t *tcache) {
assert(tcache != NULL);
tcache_default_settings_init(tcache_slow_t *tcache_slow) {
assert(tcache_slow != NULL);
assert(global_do_not_change_tcache_maxclass != 0);
assert(global_do_not_change_nhbins != 0);
tcache->tcache_max = global_do_not_change_tcache_maxclass;
tcache->tcache_nhbins = global_do_not_change_nhbins;
assert(tcache->tcache_nhbins == sz_size2index(tcache->tcache_max) + 1);
}
bool
tsd_tcache_enabled_data_init(tsd_t *tsd) {
/* Called upon tsd initialization. */
tsd_tcache_enabled_set(tsd, opt_tcache);
/*
* tcache is not available yet, but we need to set up its tcache_max
* and tcache_nhbins in advance.
*/
tcache_t *tcache = tsd_tcachep_get(tsd);
tcache_max_and_nhbins_init(tcache);
tsd_slow_update(tsd);
if (opt_tcache) {
/* Trigger tcache init. */
tsd_tcache_data_init(tsd, NULL);
}
return false;
assert(global_do_not_change_nbins != 0);
tcache_slow->tcache_nbins = global_do_not_change_nbins;
}
static void
@ -679,19 +668,15 @@ tcache_init(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
/*
* We reserve cache bins for all small size classes, even if some may
* not get used (i.e. bins higher than tcache_nhbins). This allows
* not get used (i.e. bins higher than tcache_nbins). This allows
* the fast and common paths to access cache bin metadata safely w/o
* worrying about which ones are disabled.
*/
unsigned tcache_nhbins = tcache_nhbins_get(tcache);
unsigned n_reserved_bins = tcache_nhbins < SC_NBINS ? SC_NBINS
: tcache_nhbins;
memset(tcache->bins, 0, sizeof(cache_bin_t) * n_reserved_bins);
unsigned tcache_nbins = tcache_nbins_get(tcache_slow);
size_t cur_offset = 0;
cache_bin_preincrement(tcache_bin_info, tcache_nhbins, mem,
cache_bin_preincrement(tcache_bin_info, tcache_nbins, mem,
&cur_offset);
for (unsigned i = 0; i < tcache_nhbins; i++) {
for (unsigned i = 0; i < tcache_nbins; i++) {
if (i < SC_NBINS) {
tcache_slow->lg_fill_div[i] = 1;
tcache_slow->bin_refilled[i] = false;
@ -699,40 +684,40 @@ tcache_init(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
= tcache_gc_item_delay_compute(i);
}
cache_bin_t *cache_bin = &tcache->bins[i];
cache_bin_init(cache_bin, &tcache_bin_info[i], mem,
&cur_offset);
if (tcache_bin_info[i].ncached_max > 0) {
cache_bin_init(cache_bin, &tcache_bin_info[i], mem,
&cur_offset);
} else {
cache_bin_init_disabled(cache_bin,
tcache_bin_info[i].ncached_max);
}
}
/*
* For small size classes beyond tcache_max(i.e.
* tcache_nhbins< NBINS), their cache bins are initialized to a state
* to safely and efficiently fail all fastpath alloc / free, so that
* no additional check around tcache_nhbins is needed on fastpath.
* Initialize all disabled bins to a state that can safely and
* efficiently fail all fastpath alloc / free, so that no additional
* check around tcache_nbins is needed on fastpath. Yet we still
* store the ncached_max in the bin_info for future usage.
*/
for (unsigned i = tcache_nhbins; i < SC_NBINS; i++) {
/* Disabled small bins. */
for (unsigned i = tcache_nbins; i < TCACHE_NBINS_MAX; i++) {
cache_bin_t *cache_bin = &tcache->bins[i];
void *fake_stack = mem;
size_t fake_offset = 0;
cache_bin_init(cache_bin, &tcache_bin_info[i], fake_stack,
&fake_offset);
assert(tcache_small_bin_disabled(i, cache_bin));
cache_bin_init_disabled(cache_bin,
tcache_bin_info[i].ncached_max);
assert(tcache_bin_disabled(i, cache_bin, tcache->tcache_slow));
}
cache_bin_postincrement(mem, &cur_offset);
if (config_debug) {
/* Sanity check that the whole stack is used. */
size_t size, alignment;
cache_bin_info_compute_alloc(tcache_bin_info, tcache_nhbins,
cache_bin_info_compute_alloc(tcache_bin_info, tcache_nbins,
&size, &alignment);
assert(cur_offset == size);
}
}
static inline unsigned
tcache_ncached_max_compute(szind_t szind, unsigned current_nhbins) {
tcache_ncached_max_compute(szind_t szind) {
if (szind >= SC_NBINS) {
assert(szind < current_nhbins);
return opt_tcache_nslots_large;
}
unsigned slab_nregs = bin_infos[szind].nregs;
@ -788,32 +773,28 @@ tcache_ncached_max_compute(szind_t szind, unsigned current_nhbins) {
}
static void
tcache_bin_info_compute(cache_bin_info_t *tcache_bin_info,
unsigned tcache_nhbins) {
for (szind_t i = 0; i < tcache_nhbins; i++) {
unsigned ncached_max = tcache_ncached_max_compute(i,
tcache_nhbins);
tcache_bin_info_compute(cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX]) {
/*
* Compute the values for each bin, but for bins with indices larger
* than tcache_nbins, no items will be cached.
*/
for (szind_t i = 0; i < TCACHE_NBINS_MAX; i++) {
unsigned ncached_max = tcache_ncached_max_compute(i);
assert(ncached_max <= CACHE_BIN_NCACHED_MAX);
cache_bin_info_init(&tcache_bin_info[i], ncached_max);
}
for (szind_t i = tcache_nhbins; i < SC_NBINS; i++) {
/* Disabled small bins. */
cache_bin_info_init(&tcache_bin_info[i], 0);
}
}
/* Initialize auto tcache (embedded in TSD). */
bool
tsd_tcache_data_init(tsd_t *tsd, arena_t *arena) {
static bool
tsd_tcache_data_init_impl(tsd_t *tsd, arena_t *arena,
cache_bin_info_t *tcache_bin_info) {
tcache_slow_t *tcache_slow = tsd_tcache_slowp_get_unsafe(tsd);
tcache_t *tcache = tsd_tcachep_get_unsafe(tsd);
assert(cache_bin_still_zero_initialized(&tcache->bins[0]));
unsigned tcache_nhbins = tcache_nhbins_get(tcache);
unsigned tcache_nbins = tcache_nbins_get(tcache_slow);
size_t size, alignment;
/* Takes 146B stack space. */
cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX] = {0};
tcache_bin_info_compute(tcache_bin_info, tcache_nhbins);
cache_bin_info_compute_alloc(tcache_bin_info, tcache_nhbins,
cache_bin_info_compute_alloc(tcache_bin_info, tcache_nbins,
&size, &alignment);
void *mem;
@ -860,6 +841,23 @@ tsd_tcache_data_init(tsd_t *tsd, arena_t *arena) {
return false;
}
static bool
tsd_tcache_data_init_with_bin_settings(tsd_t *tsd, arena_t *arena,
cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX]) {
assert(tcache_bin_info != NULL);
return tsd_tcache_data_init_impl(tsd, arena, tcache_bin_info);
}
/* Initialize auto tcache (embedded in TSD). */
static bool
tsd_tcache_data_init(tsd_t *tsd, arena_t *arena) {
/* Takes 146B stack space. */
cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX] = {{0}};
tcache_bin_info_compute(tcache_bin_info);
return tsd_tcache_data_init_impl(tsd, arena, tcache_bin_info);
}
/* Created manual tcache for tcache.create mallctl. */
tcache_t *
tcache_create_explicit(tsd_t *tsd) {
@ -868,11 +866,11 @@ tcache_create_explicit(tsd_t *tsd) {
* the beginning of the whole allocation (for freeing). The makes sure
* the cache bins have the requested alignment.
*/
unsigned tcache_nhbins = global_do_not_change_nhbins;
unsigned tcache_nbins = global_do_not_change_nbins;
size_t tcache_size, alignment;
cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX] = {0};
tcache_bin_info_compute(tcache_bin_info, tcache_nhbins);
cache_bin_info_compute_alloc(tcache_bin_info, tcache_nhbins,
cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX] = {{0}};
tcache_bin_info_compute(tcache_bin_info);
cache_bin_info_compute_alloc(tcache_bin_info, tcache_nbins,
&tcache_size, &alignment);
size_t size = tcache_size + sizeof(tcache_t)
@ -889,7 +887,7 @@ tcache_create_explicit(tsd_t *tsd) {
tcache_t *tcache = (void *)((byte_t *)mem + tcache_size);
tcache_slow_t *tcache_slow =
(void *)((byte_t *)mem + tcache_size + sizeof(tcache_t));
tcache_max_and_nhbins_init(tcache);
tcache_default_settings_init(tcache_slow);
tcache_init(tsd, tcache_slow, tcache, mem, tcache_bin_info);
tcache_arena_associate(tsd_tsdn(tsd), tcache_slow, tcache,
@ -898,13 +896,83 @@ tcache_create_explicit(tsd_t *tsd) {
return tcache;
}
bool
tsd_tcache_enabled_data_init(tsd_t *tsd) {
/* Called upon tsd initialization. */
tsd_tcache_enabled_set(tsd, opt_tcache);
/*
* tcache is not available yet, but we need to set up its tcache_nbins
* in advance.
*/
tcache_default_settings_init(tsd_tcache_slowp_get(tsd));
tsd_slow_update(tsd);
if (opt_tcache) {
/* Trigger tcache init. */
tsd_tcache_data_init(tsd, NULL);
}
return false;
}
void
tcache_enabled_set(tsd_t *tsd, bool enabled) {
bool was_enabled = tsd_tcache_enabled_get(tsd);
if (!was_enabled && enabled) {
tsd_tcache_data_init(tsd, NULL);
} else if (was_enabled && !enabled) {
tcache_cleanup(tsd);
}
/* Commit the state last. Above calls check current state. */
tsd_tcache_enabled_set(tsd, enabled);
tsd_slow_update(tsd);
}
void
thread_tcache_max_set(tsd_t *tsd, size_t tcache_max) {
assert(tcache_max <= TCACHE_MAXCLASS_LIMIT);
assert(tcache_max == sz_s2u(tcache_max));
tcache_t *tcache = tsd_tcachep_get(tsd);
tcache_slow_t *tcache_slow = tcache->tcache_slow;
cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX] = {{0}};
assert(tcache != NULL && tcache_slow != NULL);
bool enabled = tcache_available(tsd);
arena_t *assigned_arena;
if (enabled) {
assigned_arena = tcache_slow->arena;
/* Carry over the bin settings during the reboot. */
tcache_bin_settings_backup(tcache, tcache_bin_info);
/* Shutdown and reboot the tcache for a clean slate. */
tcache_cleanup(tsd);
}
/*
* Still set tcache_nbins of the tcache even if the tcache is not
* available yet because the values are stored in tsd_t and are
* always available for changing.
*/
tcache_max_set(tcache_slow, tcache_max);
if (enabled) {
tsd_tcache_data_init_with_bin_settings(tsd, assigned_arena,
tcache_bin_info);
}
assert(tcache_nbins_get(tcache_slow) == sz_size2index(tcache_max) + 1);
}
static void
tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
tcache_slow_t *tcache_slow = tcache->tcache_slow;
assert(tcache_slow->arena != NULL);
for (unsigned i = 0; i < tcache_nhbins_get(tcache); i++) {
for (unsigned i = 0; i < tcache_nbins_get(tcache_slow); i++) {
cache_bin_t *cache_bin = &tcache->bins[i];
if (tcache_bin_disabled(i, cache_bin, tcache_slow)) {
continue;
}
if (i < SC_NBINS) {
tcache_bin_flush_small(tsd, tcache, cache_bin, i, 0);
} else {
@ -974,8 +1042,7 @@ tcache_cleanup(tsd_t *tsd) {
tcache_destroy(tsd, tcache, true);
/* Make sure all bins used are reinitialized to the clean state. */
memset(tcache->bins, 0, sizeof(cache_bin_t) *
tcache_nhbins_get(tcache));
memset(tcache->bins, 0, sizeof(cache_bin_t) * TCACHE_NBINS_MAX);
}
void
@ -983,8 +1050,11 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
cassert(config_stats);
/* Merge and reset tcache stats. */
for (unsigned i = 0; i < tcache_nhbins_get(tcache); i++) {
for (unsigned i = 0; i < tcache_nbins_get(tcache->tcache_slow); i++) {
cache_bin_t *cache_bin = &tcache->bins[i];
if (tcache_bin_disabled(i, cache_bin, tcache->tcache_slow)) {
continue;
}
if (i < SC_NBINS) {
bin_t *bin = arena_bin_choose(tsdn, arena, i, NULL);
malloc_mutex_lock(tsdn, &bin->lock);
@ -1110,7 +1180,7 @@ bool
tcache_boot(tsdn_t *tsdn, base_t *base) {
global_do_not_change_tcache_maxclass = sz_s2u(opt_tcache_max);
assert(global_do_not_change_tcache_maxclass <= TCACHE_MAXCLASS_LIMIT);
global_do_not_change_nhbins =
global_do_not_change_nbins =
sz_size2index(global_do_not_change_tcache_maxclass) + 1;
if (malloc_mutex_init(&tcaches_mtx, "tcaches", WITNESS_RANK_TCACHES,