Bin batching: add some stats.

This lets us easily see what fraction of flush load is being taken up by the
bins, and helps guide future optimization approaches (for example: should we
prefetch during cache bin fills? It depends on how many objects the average fill
pops out of the batch).
This commit is contained in:
David Goldblatt 2024-02-20 14:54:43 -08:00 committed by David Goldblatt
parent fc615739cb
commit f9c0b5f7f8
8 changed files with 114 additions and 0 deletions

View file

@@ -630,6 +630,8 @@ arena_bin_flush_batch_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
&batched_bin->remote_frees.mtx);
}
size_t npushes = batcher_pop_get_pushes(tsdn,
&batched_bin->remote_frees);
bin_remote_free_data_t remote_free_data[BIN_REMOTE_FREE_ELEMS_MAX];
for (size_t i = 0; i < nelems_to_pop; i++) {
remote_free_data[i] = batched_bin->remote_free_data[i];
@@ -642,6 +644,10 @@ arena_bin_flush_batch_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
dalloc_slabs, ndalloc_slabs, dalloc_count,
dalloc_slabs_extra);
}
bin->stats.batch_pops++;
bin->stats.batch_pushes += npushes;
bin->stats.batch_pushed_elems += nelems_to_pop;
}
typedef struct arena_bin_flush_batch_state_s arena_bin_flush_batch_state_t;

View file

@@ -16,6 +16,7 @@ struct batcher_s {
*/
atomic_zu_t nelems;
size_t nelems_max;
size_t npushes;
malloc_mutex_t mtx;
};
@@ -35,6 +36,7 @@ void batcher_push_end(tsdn_t *tsdn, batcher_t *batcher);
* If the former, must be followed by a call to batcher_pop_end.
*/
size_t batcher_pop_begin(tsdn_t *tsdn, batcher_t *batcher);
size_t batcher_pop_get_pushes(tsdn_t *tsdn, batcher_t *batcher);
void batcher_pop_end(tsdn_t *tsdn, batcher_t *batcher);
void batcher_prefork(tsdn_t *tsdn, batcher_t *batcher);

View file

@@ -138,6 +138,11 @@ bin_stats_merge(tsdn_t *tsdn, bin_stats_data_t *dst_bin_stats, bin_t *bin) {
stats->reslabs += bin->stats.reslabs;
stats->curslabs += bin->stats.curslabs;
stats->nonfull_slabs += bin->stats.nonfull_slabs;
stats->batch_failed_pushes += bin->stats.batch_failed_pushes;
stats->batch_pushes += bin->stats.batch_pushes;
stats->batch_pushed_elems += bin->stats.batch_pushed_elems;
malloc_mutex_unlock(tsdn, &bin->lock);
}

View file

@@ -48,6 +48,11 @@ struct bin_stats_s {
/* Current size of nonfull slabs heap in this bin. */
size_t nonfull_slabs;
uint64_t batch_pops;
uint64_t batch_failed_pushes;
uint64_t batch_pushes;
uint64_t batch_pushed_elems;
};
typedef struct bin_stats_data_s bin_stats_data_t;

View file

@@ -9,6 +9,7 @@ void
batcher_init(batcher_t *batcher, size_t nelems_max) {
	/*
	 * A freshly initialized batcher holds no elements and has recorded no
	 * pushes; only the capacity is caller-supplied.
	 */
	batcher->nelems_max = nelems_max;
	batcher->npushes = 0;
	atomic_store_zu(&batcher->nelems, 0, ATOMIC_RELAXED);
	malloc_mutex_init(&batcher->mtx, "batcher", WITNESS_RANK_BATCHER,
	    malloc_mutex_rank_exclusive);
}
@@ -37,9 +38,18 @@ size_t batcher_push_begin(tsdn_t *tsdn, batcher_t *batcher,
* acquire a mutex only to discover that there's no space for them.
*/
atomic_store_zu(&batcher->nelems, nelems + elems_to_push, ATOMIC_RELAXED);
batcher->npushes++;
return nelems;
}
size_t
batcher_pop_get_pushes(tsdn_t *tsdn, batcher_t *batcher) {
	/*
	 * Returns the number of pushes recorded since the last query and
	 * resets the counter.  The caller must already hold the batcher
	 * mutex.
	 */
	malloc_mutex_assert_owner(tsdn, &batcher->mtx);
	size_t result = batcher->npushes;
	batcher->npushes = 0;
	return result;
}
void
batcher_push_end(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_assert_owner(tsdn, &batcher->mtx);

View file

@@ -239,6 +239,10 @@ CTL_PROTO(stats_arenas_i_bins_j_nslabs)
CTL_PROTO(stats_arenas_i_bins_j_nreslabs)
CTL_PROTO(stats_arenas_i_bins_j_curslabs)
CTL_PROTO(stats_arenas_i_bins_j_nonfull_slabs)
CTL_PROTO(stats_arenas_i_bins_j_batch_pops)
CTL_PROTO(stats_arenas_i_bins_j_batch_failed_pushes)
CTL_PROTO(stats_arenas_i_bins_j_batch_pushes)
CTL_PROTO(stats_arenas_i_bins_j_batch_pushed_elems)
INDEX_PROTO(stats_arenas_i_bins_j)
CTL_PROTO(stats_arenas_i_lextents_j_nmalloc)
CTL_PROTO(stats_arenas_i_lextents_j_ndalloc)
@@ -694,6 +698,14 @@ static const ctl_named_node_t stats_arenas_i_bins_j_node[] = {
{NAME("nreslabs"), CTL(stats_arenas_i_bins_j_nreslabs)},
{NAME("curslabs"), CTL(stats_arenas_i_bins_j_curslabs)},
{NAME("nonfull_slabs"), CTL(stats_arenas_i_bins_j_nonfull_slabs)},
{NAME("batch_pops"),
CTL(stats_arenas_i_bins_j_batch_pops)},
{NAME("batch_failed_pushes"),
CTL(stats_arenas_i_bins_j_batch_failed_pushes)},
{NAME("batch_pushes"),
CTL(stats_arenas_i_bins_j_batch_pushes)},
{NAME("batch_pushed_elems"),
CTL(stats_arenas_i_bins_j_batch_pushed_elems)},
{NAME("mutex"), CHILD(named, stats_arenas_i_bins_j_mutex)}
};
@@ -1246,6 +1258,16 @@ MUTEX_PROF_ARENA_MUTEXES
assert(bstats->curslabs == 0);
assert(bstats->nonfull_slabs == 0);
}
merged->batch_pops
+= bstats->batch_pops;
merged->batch_failed_pushes
+= bstats->batch_failed_pushes;
merged->batch_pushes
+= bstats->batch_pushes;
merged->batch_pushed_elems
+= bstats->batch_pushed_elems;
malloc_mutex_prof_merge(&sdstats->bstats[i].mutex_data,
&astats->bstats[i].mutex_data);
}
@@ -3957,6 +3979,14 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curslabs,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.curslabs, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nonfull_slabs,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nonfull_slabs, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pops,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pops, uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_failed_pushes,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_failed_pushes, uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pushes,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pushes, uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pushed_elems,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pushed_elems, uint64_t)
static const ctl_named_node_t *
stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib,

View file

@@ -358,6 +358,15 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i,
COL_HDR(row, nreslabs, NULL, right, 13, uint64)
COL_HDR(row, nreslabs_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, pops, NULL, right, 10, uint64)
COL_HDR(row, pops_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, failed_push, NULL, right, 13, uint64)
COL_HDR(row, failed_push_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, push, NULL, right, 7, uint64)
COL_HDR(row, push_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, push_elem, NULL, right, 12, uint64)
COL_HDR(row, push_elem_ps, "(#/sec)", right, 8, uint64)
/* Don't want to actually print the name. */
header_justify_spacer.str_val = " ";
col_justify_spacer.str_val = " ";
@@ -405,6 +414,8 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i,
uint32_t nregs, nshards;
uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes;
uint64_t nreslabs;
uint64_t batch_pops, batch_failed_pushes, batch_pushes,
batch_pushed_elems;
prof_stats_t prof_live;
prof_stats_t prof_accum;
@@ -453,6 +464,15 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i,
CTL_LEAF(stats_arenas_mib, 5, "nonfull_slabs", &nonfull_slabs,
size_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_pops", &batch_pops,
uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_failed_pushes",
&batch_failed_pushes, uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_pushes",
&batch_pushes, uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_pushed_elems",
&batch_pushed_elems, uint64_t);
if (mutex) {
mutex_stats_read_arena_bin(stats_arenas_mib, 5,
col_mutex64, col_mutex32, uptime);
@@ -487,6 +507,14 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i,
&curslabs);
emitter_json_kv(emitter, "nonfull_slabs", emitter_type_size,
&nonfull_slabs);
emitter_json_kv(emitter, "batch_pops",
emitter_type_uint64, &batch_pops);
emitter_json_kv(emitter, "batch_failed_pushes",
emitter_type_uint64, &batch_failed_pushes);
emitter_json_kv(emitter, "batch_pushes",
emitter_type_uint64, &batch_pushes);
emitter_json_kv(emitter, "batch_pushed_elems",
emitter_type_uint64, &batch_pushed_elems);
if (mutex) {
emitter_json_object_kv_begin(emitter, "mutex");
mutex_stats_emit(emitter, NULL, col_mutex64,
@@ -545,6 +573,21 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i,
col_nreslabs.uint64_val = nreslabs;
col_nreslabs_ps.uint64_val = rate_per_second(nreslabs, uptime);
col_pops.uint64_val = batch_pops;
col_pops_ps.uint64_val
= rate_per_second(batch_pops, uptime);
col_failed_push.uint64_val = batch_failed_pushes;
col_failed_push_ps.uint64_val
= rate_per_second(batch_failed_pushes, uptime);
col_push.uint64_val = batch_pushes;
col_push_ps.uint64_val
= rate_per_second(batch_pushes, uptime);
col_push_elem.uint64_val = batch_pushed_elems;
col_push_elem_ps.uint64_val
= rate_per_second(batch_pushed_elems, uptime);
/*
* Note that mutex columns were initialized above, if mutex ==
* true.

View file

@@ -482,6 +482,7 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin
*/
bool locked = false;
bool batched = false;
bool batch_failed = false;
if (can_batch) {
locked = !malloc_mutex_trylock(tsdn, &cur_bin->lock);
}
@@ -508,12 +509,24 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin
}
batcher_push_end(tsdn,
&batched_bin->remote_frees);
} else {
batch_failed = true;
}
}
if (!batched) {
if (!locked) {
malloc_mutex_lock(tsdn, &cur_bin->lock);
}
/*
* Unlike other stats (which only ever get flushed into
* a tcache's associated arena), batch_failed counts get
* accumulated into the bin where the push attempt
* failed.
*/
if (config_stats && batch_failed) {
cur_bin->stats.batch_failed_pushes++;
}
/*
* Flush stats first, if that was the right lock. Note
* that we don't actually have to flush stats into the