Bin batching: add some stats.

This lets us easily see what fraction of flush load is being taken up by the
bins, and helps guide future optimization approaches (for example: should we
prefetch during cache bin fills? It depends on how many objects the average fill
pops out of the batch).
This commit is contained in:
David Goldblatt 2024-02-20 14:54:43 -08:00 committed by David Goldblatt
parent fc615739cb
commit f9c0b5f7f8
8 changed files with 114 additions and 0 deletions

View file

@@ -630,6 +630,8 @@ arena_bin_flush_batch_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
&batched_bin->remote_frees.mtx);
}
size_t npushes = batcher_pop_get_pushes(tsdn,
&batched_bin->remote_frees);
bin_remote_free_data_t remote_free_data[BIN_REMOTE_FREE_ELEMS_MAX];
for (size_t i = 0; i < nelems_to_pop; i++) {
remote_free_data[i] = batched_bin->remote_free_data[i];
@@ -642,6 +644,10 @@ arena_bin_flush_batch_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
dalloc_slabs, ndalloc_slabs, dalloc_count,
dalloc_slabs_extra);
}
bin->stats.batch_pops++;
bin->stats.batch_pushes += npushes;
bin->stats.batch_pushed_elems += nelems_to_pop;
}
typedef struct arena_bin_flush_batch_state_s arena_bin_flush_batch_state_t;

View file

@@ -16,6 +16,7 @@ struct batcher_s {
*/
atomic_zu_t nelems;
size_t nelems_max;
size_t npushes;
malloc_mutex_t mtx;
};
@@ -35,6 +36,7 @@ void batcher_push_end(tsdn_t *tsdn, batcher_t *batcher);
* If the former, must be followed by a call to batcher_pop_end.
*/
size_t batcher_pop_begin(tsdn_t *tsdn, batcher_t *batcher);
size_t batcher_pop_get_pushes(tsdn_t *tsdn, batcher_t *batcher);
void batcher_pop_end(tsdn_t *tsdn, batcher_t *batcher);
void batcher_prefork(tsdn_t *tsdn, batcher_t *batcher);

View file

@@ -138,6 +138,11 @@ bin_stats_merge(tsdn_t *tsdn, bin_stats_data_t *dst_bin_stats, bin_t *bin) {
stats->reslabs += bin->stats.reslabs;
stats->curslabs += bin->stats.curslabs;
stats->nonfull_slabs += bin->stats.nonfull_slabs;
stats->batch_failed_pushes += bin->stats.batch_failed_pushes;
stats->batch_pushes += bin->stats.batch_pushes;
stats->batch_pushed_elems += bin->stats.batch_pushed_elems;
malloc_mutex_unlock(tsdn, &bin->lock);
}

View file

@@ -48,6 +48,11 @@ struct bin_stats_s {
/* Current size of nonfull slabs heap in this bin. */
size_t nonfull_slabs;
uint64_t batch_pops;
uint64_t batch_failed_pushes;
uint64_t batch_pushes;
uint64_t batch_pushed_elems;
};
typedef struct bin_stats_data_s bin_stats_data_t;

View file

@@ -9,6 +9,7 @@ void
batcher_init(batcher_t *batcher, size_t nelems_max) {
	/*
	 * A freshly initialized batcher holds no elements and has recorded no
	 * pushes; only the capacity is caller-supplied.
	 */
	batcher->nelems_max = nelems_max;
	batcher->npushes = 0;
	atomic_store_zu(&batcher->nelems, 0, ATOMIC_RELAXED);
	malloc_mutex_init(&batcher->mtx, "batcher", WITNESS_RANK_BATCHER,
	    malloc_mutex_rank_exclusive);
}
@@ -37,9 +38,18 @@ size_t batcher_push_begin(tsdn_t *tsdn, batcher_t *batcher,
* acquire a mutex only to discover that there's no space for them.
*/
atomic_store_zu(&batcher->nelems, nelems + elems_to_push, ATOMIC_RELAXED);
batcher->npushes++;
return nelems;
}
size_t
batcher_pop_get_pushes(tsdn_t *tsdn, batcher_t *batcher) {
	/*
	 * Returns the number of pushes recorded since the last query and
	 * resets the counter.  The caller must already hold the batcher
	 * mutex.
	 */
	malloc_mutex_assert_owner(tsdn, &batcher->mtx);
	size_t result = batcher->npushes;
	batcher->npushes = 0;
	return result;
}
void
batcher_push_end(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_assert_owner(tsdn, &batcher->mtx);

View file

@@ -239,6 +239,10 @@ CTL_PROTO(stats_arenas_i_bins_j_nslabs)
CTL_PROTO(stats_arenas_i_bins_j_nreslabs)
CTL_PROTO(stats_arenas_i_bins_j_curslabs)
CTL_PROTO(stats_arenas_i_bins_j_nonfull_slabs)
CTL_PROTO(stats_arenas_i_bins_j_batch_pops)
CTL_PROTO(stats_arenas_i_bins_j_batch_failed_pushes)
CTL_PROTO(stats_arenas_i_bins_j_batch_pushes)
CTL_PROTO(stats_arenas_i_bins_j_batch_pushed_elems)
INDEX_PROTO(stats_arenas_i_bins_j)
CTL_PROTO(stats_arenas_i_lextents_j_nmalloc)
CTL_PROTO(stats_arenas_i_lextents_j_ndalloc)
@@ -694,6 +698,14 @@ static const ctl_named_node_t stats_arenas_i_bins_j_node[] = {
{NAME("nreslabs"), CTL(stats_arenas_i_bins_j_nreslabs)},
{NAME("curslabs"), CTL(stats_arenas_i_bins_j_curslabs)},
{NAME("nonfull_slabs"), CTL(stats_arenas_i_bins_j_nonfull_slabs)},
{NAME("batch_pops"),
CTL(stats_arenas_i_bins_j_batch_pops)},
{NAME("batch_failed_pushes"),
CTL(stats_arenas_i_bins_j_batch_failed_pushes)},
{NAME("batch_pushes"),
CTL(stats_arenas_i_bins_j_batch_pushes)},
{NAME("batch_pushed_elems"),
CTL(stats_arenas_i_bins_j_batch_pushed_elems)},
{NAME("mutex"), CHILD(named, stats_arenas_i_bins_j_mutex)}
};
@@ -1246,6 +1258,16 @@ MUTEX_PROF_ARENA_MUTEXES
assert(bstats->curslabs == 0);
assert(bstats->nonfull_slabs == 0);
}
merged->batch_pops
+= bstats->batch_pops;
merged->batch_failed_pushes
+= bstats->batch_failed_pushes;
merged->batch_pushes
+= bstats->batch_pushes;
merged->batch_pushed_elems
+= bstats->batch_pushed_elems;
malloc_mutex_prof_merge(&sdstats->bstats[i].mutex_data,
&astats->bstats[i].mutex_data);
}
@@ -3957,6 +3979,14 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curslabs,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.curslabs, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nonfull_slabs,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nonfull_slabs, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pops,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pops, uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_failed_pushes,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_failed_pushes, uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pushes,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pushes, uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pushed_elems,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pushed_elems, uint64_t)
static const ctl_named_node_t *
stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib,

View file

@@ -358,6 +358,15 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i,
COL_HDR(row, nreslabs, NULL, right, 13, uint64)
COL_HDR(row, nreslabs_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, pops, NULL, right, 10, uint64)
COL_HDR(row, pops_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, failed_push, NULL, right, 13, uint64)
COL_HDR(row, failed_push_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, push, NULL, right, 7, uint64)
COL_HDR(row, push_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, push_elem, NULL, right, 12, uint64)
COL_HDR(row, push_elem_ps, "(#/sec)", right, 8, uint64)
/* Don't want to actually print the name. */
header_justify_spacer.str_val = " ";
col_justify_spacer.str_val = " ";
@@ -405,6 +414,8 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i,
uint32_t nregs, nshards;
uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes;
uint64_t nreslabs;
uint64_t batch_pops, batch_failed_pushes, batch_pushes,
batch_pushed_elems;
prof_stats_t prof_live;
prof_stats_t prof_accum;
@@ -453,6 +464,15 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i,
CTL_LEAF(stats_arenas_mib, 5, "nonfull_slabs", &nonfull_slabs,
size_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_pops", &batch_pops,
uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_failed_pushes",
&batch_failed_pushes, uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_pushes",
&batch_pushes, uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_pushed_elems",
&batch_pushed_elems, uint64_t);
if (mutex) {
mutex_stats_read_arena_bin(stats_arenas_mib, 5,
col_mutex64, col_mutex32, uptime);
@@ -487,6 +507,14 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i,
&curslabs);
emitter_json_kv(emitter, "nonfull_slabs", emitter_type_size,
&nonfull_slabs);
emitter_json_kv(emitter, "batch_pops",
emitter_type_uint64, &batch_pops);
emitter_json_kv(emitter, "batch_failed_pushes",
emitter_type_uint64, &batch_failed_pushes);
emitter_json_kv(emitter, "batch_pushes",
emitter_type_uint64, &batch_pushes);
emitter_json_kv(emitter, "batch_pushed_elems",
emitter_type_uint64, &batch_pushed_elems);
if (mutex) {
emitter_json_object_kv_begin(emitter, "mutex");
mutex_stats_emit(emitter, NULL, col_mutex64,
@@ -545,6 +573,21 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i,
col_nreslabs.uint64_val = nreslabs;
col_nreslabs_ps.uint64_val = rate_per_second(nreslabs, uptime);
col_pops.uint64_val = batch_pops;
col_pops_ps.uint64_val
= rate_per_second(batch_pops, uptime);
col_failed_push.uint64_val = batch_failed_pushes;
col_failed_push_ps.uint64_val
= rate_per_second(batch_failed_pushes, uptime);
col_push.uint64_val = batch_pushes;
col_push_ps.uint64_val
= rate_per_second(batch_pushes, uptime);
col_push_elem.uint64_val = batch_pushed_elems;
col_push_elem_ps.uint64_val
= rate_per_second(batch_pushed_elems, uptime);
/*
* Note that mutex columns were initialized above, if mutex ==
* true.

View file

@@ -482,6 +482,7 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin
*/
bool locked = false;
bool batched = false;
bool batch_failed = false;
if (can_batch) {
locked = !malloc_mutex_trylock(tsdn, &cur_bin->lock);
}
@@ -508,12 +509,24 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin
}
batcher_push_end(tsdn,
&batched_bin->remote_frees);
} else {
batch_failed = true;
}
}
if (!batched) {
if (!locked) {
malloc_mutex_lock(tsdn, &cur_bin->lock);
}
/*
* Unlike other stats (which only ever get flushed into
* a tcache's associated arena), batch_failed counts get
* accumulated into the bin where the push attempt
* failed.
*/
if (config_stats && batch_failed) {
cur_bin->stats.batch_failed_pushes++;
}
/*
* Flush stats first, if that was the right lock. Note
* that we don't actually have to flush stats into the