Revert PR #2608: Manually revert commits 70c94d..f9c0b5

commit e2da7477f8 (parent 9186700eb3)
Author: Shirui Cheng, 2025-07-15 15:44:14 -07:00
Committed by: Guangli Dai
30 changed files with 124 additions and 1364 deletions

View file

@@ -39,7 +39,8 @@ div_info_t arena_binind_div_info[SC_NBINS];
size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
uint32_t arena_bin_offsets[SC_NBINS];
static unsigned nbins_total;
/*
* a0 is used to handle huge requests before malloc init completes. After
@@ -674,17 +675,11 @@ arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, edata_t *slab) {
}
static void
arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin, unsigned binind) {
arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin) {
edata_t *slab;
malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
if (arena_bin_has_batch(binind)) {
bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin;
batcher_init(
&batched_bin->remote_frees, BIN_REMOTE_FREE_ELEMS_MAX);
}
if (bin->slabcur != NULL) {
slab = bin->slabcur;
bin->slabcur = NULL;
@@ -835,8 +830,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
/* Bins. */
for (unsigned i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
arena_bin_reset(
tsd, arena, arena_get_bin(arena, i, j), i);
arena_bin_reset(tsd, arena, arena_get_bin(arena, i, j));
}
}
pa_shard_reset(tsd_tsdn(tsd), &arena->pa_shard);
@@ -1103,19 +1097,8 @@ arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena, cache_bin_t *cache_bin,
unsigned binshard;
bin_t *bin = arena_bin_choose(tsdn, arena, binind, &binshard);
/*
* This has some fields that are conditionally initialized down batch
* flush pathways. This can trigger static analysis warnings deeper
* down in the stack. The accesses are guarded by the same checks as
* the initialization, but the analysis isn't able to track that across
* multiple stack frames.
*/
arena_bin_flush_batch_state_t batch_flush_state
JEMALLOC_CLANG_ANALYZER_SILENCE_INIT({0});
label_refill:
malloc_mutex_lock(tsdn, &bin->lock);
arena_bin_flush_batch_after_lock(
tsdn, arena, bin, binind, &batch_flush_state);
while (filled < nfill_min) {
/* Try batch-fill from slabcur first. */
@@ -1176,11 +1159,7 @@ label_refill:
cache_bin->tstats.nrequests = 0;
}
arena_bin_flush_batch_before_unlock(
tsdn, arena, bin, binind, &batch_flush_state);
malloc_mutex_unlock(tsdn, &bin->lock);
arena_bin_flush_batch_after_unlock(
tsdn, arena, bin, binind, &batch_flush_state);
if (alloc_and_retry) {
assert(fresh_slab == NULL);
@@ -1474,16 +1453,12 @@ arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) {
malloc_mutex_lock(tsdn, &bin->lock);
arena_dalloc_bin_locked_info_t info;
arena_dalloc_bin_locked_begin(&info, binind);
edata_t *dalloc_slabs[1];
unsigned dalloc_slabs_count = 0;
arena_dalloc_bin_locked_step(tsdn, arena, bin, &info, binind, edata,
ptr, dalloc_slabs, /* ndalloc_slabs */ 1, &dalloc_slabs_count,
/* dalloc_slabs_extra */ NULL);
bool ret = arena_dalloc_bin_locked_step(
tsdn, arena, bin, &info, binind, edata, ptr);
arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info);
malloc_mutex_unlock(tsdn, &bin->lock);
if (dalloc_slabs_count != 0) {
assert(dalloc_slabs[0] == edata);
if (ret) {
arena_slab_dalloc(tsdn, arena, edata);
}
}
@@ -1722,6 +1697,7 @@ arena_t *
arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) {
arena_t *arena;
base_t *base;
unsigned i;
if (ind == 0) {
base = b0get();
@@ -1734,13 +1710,14 @@ arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) {
}
size_t arena_size = ALIGNMENT_CEILING(sizeof(arena_t), CACHELINE)
+ sizeof(bin_with_batch_t) * bin_info_nbatched_bins
+ sizeof(bin_t) * bin_info_nunbatched_bins;
+ sizeof(bin_t) * nbins_total;
arena = (arena_t *)base_alloc(tsdn, base, arena_size, CACHELINE);
if (arena == NULL) {
goto label_error;
}
JEMALLOC_SUPPRESS_WARN_ON_USAGE(
assert((uintptr_t)&arena->all_bins[nbins_total - 1] + sizeof(bin_t)
<= (uintptr_t)arena + arena_size);)
atomic_store_u(&arena->nthreads[0], 0, ATOMIC_RELAXED);
atomic_store_u(&arena->nthreads[1], 0, ATOMIC_RELAXED);
arena->last_thd = NULL;
@@ -1779,13 +1756,11 @@ arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) {
/* Initialize bins. */
atomic_store_u(&arena->binshard_next, 0, ATOMIC_RELEASE);
for (unsigned i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
bin_t *bin = arena_get_bin(arena, i, j);
bool err = bin_init(bin, i);
if (err) {
goto label_error;
}
for (i = 0; i < nbins_total; i++) {
JEMALLOC_SUPPRESS_WARN_ON_USAGE(
bool err = bin_init(&arena->all_bins[i]);)
if (err) {
goto label_error;
}
}
@@ -1943,10 +1918,8 @@ arena_boot(sc_data_t *sc_data, base_t *base, bool hpa) {
uint32_t cur_offset = (uint32_t)offsetof(arena_t, all_bins);)
for (szind_t i = 0; i < SC_NBINS; i++) {
arena_bin_offsets[i] = cur_offset;
uint32_t bin_sz = (i < bin_info_nbatched_sizes
? sizeof(bin_with_batch_t)
: sizeof(bin_t));
cur_offset += (uint32_t)bin_infos[i].n_shards * bin_sz;
nbins_total += bin_infos[i].n_shards;
cur_offset += (uint32_t)(bin_infos[i].n_shards * sizeof(bin_t));
}
return pa_central_init(
&arena_pa_central_global, base, hpa, &hpa_hooks_default);
@@ -1996,21 +1969,17 @@ arena_prefork7(tsdn_t *tsdn, arena_t *arena) {
void
arena_prefork8(tsdn_t *tsdn, arena_t *arena) {
for (szind_t i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
bin_t *bin = arena_get_bin(arena, i, j);
bin_prefork(tsdn, bin, arena_bin_has_batch(i));
}
for (unsigned i = 0; i < nbins_total; i++) {
JEMALLOC_SUPPRESS_WARN_ON_USAGE(
bin_prefork(tsdn, &arena->all_bins[i]);)
}
}
void
arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) {
for (szind_t i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
bin_t *bin = arena_get_bin(arena, i, j);
bin_postfork_parent(tsdn, bin, arena_bin_has_batch(i));
}
for (unsigned i = 0; i < nbins_total; i++) {
JEMALLOC_SUPPRESS_WARN_ON_USAGE(
bin_postfork_parent(tsdn, &arena->all_bins[i]);)
}
malloc_mutex_postfork_parent(tsdn, &arena->large_mtx);
@@ -2047,11 +2016,9 @@ arena_postfork_child(tsdn_t *tsdn, arena_t *arena) {
}
}
for (szind_t i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
bin_t *bin = arena_get_bin(arena, i, j);
bin_postfork_child(tsdn, bin, arena_bin_has_batch(i));
}
for (unsigned i = 0; i < nbins_total; i++) {
JEMALLOC_SUPPRESS_WARN_ON_USAGE(
bin_postfork_child(tsdn, &arena->all_bins[i]);)
}
malloc_mutex_postfork_child(tsdn, &arena->large_mtx);
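
Both versions above pack every bin shard into one contiguous region after arena_t; what the revert changes is the stride, from a per-size-class choice back to a uniform sizeof(bin_t) plus a single nbins_total count. Below is a standalone toy model of the two offset computations; the toy_* names and the shard counts are stand-ins, not jemalloc internals.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_NBINS 4

typedef struct { long lock; } toy_bin_t;
typedef struct { toy_bin_t bin; long remote_free_data[8]; } toy_bin_with_batch_t;

static const unsigned n_shards[TOY_NBINS] = {1, 2, 1, 4};

int
main(void) {
	/* Reverted scheme: uniform stride, so offsets are a running sum and
	 * a single nbins_total suffices to iterate all_bins flatly. */
	uint32_t off = 0;
	unsigned nbins_total = 0;
	for (unsigned i = 0; i < TOY_NBINS; i++) {
		printf("flat layout: bin %u at offset %u\n", i, (unsigned)off);
		nbins_total += n_shards[i];
		off += (uint32_t)(n_shards[i] * sizeof(toy_bin_t));
	}
	printf("nbins_total = %u\n", nbins_total);

	/* Batched scheme being reverted: size classes under the batching
	 * cutoff use the larger stride, so each class needs its own size. */
	unsigned nbatched = 2; /* stand-in for bin_info_nbatched_sizes */
	off = 0;
	for (unsigned i = 0; i < TOY_NBINS; i++) {
		size_t sz = (i < nbatched) ? sizeof(toy_bin_with_batch_t)
		    : sizeof(toy_bin_t);
		printf("mixed layout: bin %u at offset %u\n", i,
		    (unsigned)off);
		off += (uint32_t)(n_shards[i] * sz);
	}
	return 0;
}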

View file

@@ -1,98 +0,0 @@
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/batcher.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/atomic.h"
void
batcher_init(batcher_t *batcher, size_t nelems_max) {
atomic_store_zu(&batcher->nelems, 0, ATOMIC_RELAXED);
batcher->nelems_max = nelems_max;
batcher->npushes = 0;
malloc_mutex_init(&batcher->mtx, "batcher", WITNESS_RANK_BATCHER,
malloc_mutex_rank_exclusive);
}
/*
* Returns an index (into some user-owned array) to use for pushing, or
* BATCHER_NO_IDX if no index is free.
*/
size_t
batcher_push_begin(tsdn_t *tsdn, batcher_t *batcher, size_t elems_to_push) {
assert(elems_to_push > 0);
size_t nelems_guess = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED);
if (nelems_guess + elems_to_push > batcher->nelems_max) {
return BATCHER_NO_IDX;
}
malloc_mutex_lock(tsdn, &batcher->mtx);
size_t nelems = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED);
if (nelems + elems_to_push > batcher->nelems_max) {
malloc_mutex_unlock(tsdn, &batcher->mtx);
return BATCHER_NO_IDX;
}
assert(elems_to_push <= batcher->nelems_max - nelems);
/*
* We update nelems at push time (instead of during pop) so that other
* racing accesses of the batcher can fail fast instead of trying to
* acquire a mutex only to discover that there's no space for them.
*/
atomic_store_zu(
&batcher->nelems, nelems + elems_to_push, ATOMIC_RELAXED);
batcher->npushes++;
return nelems;
}
size_t
batcher_pop_get_pushes(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_assert_owner(tsdn, &batcher->mtx);
size_t npushes = batcher->npushes;
batcher->npushes = 0;
return npushes;
}
void
batcher_push_end(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_assert_owner(tsdn, &batcher->mtx);
assert(atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED) > 0);
malloc_mutex_unlock(tsdn, &batcher->mtx);
}
size_t
batcher_pop_begin(tsdn_t *tsdn, batcher_t *batcher) {
size_t nelems_guess = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED);
assert(nelems_guess <= batcher->nelems_max);
if (nelems_guess == 0) {
return BATCHER_NO_IDX;
}
malloc_mutex_lock(tsdn, &batcher->mtx);
size_t nelems = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED);
assert(nelems <= batcher->nelems_max);
if (nelems == 0) {
malloc_mutex_unlock(tsdn, &batcher->mtx);
return BATCHER_NO_IDX;
}
atomic_store_zu(&batcher->nelems, 0, ATOMIC_RELAXED);
return nelems;
}
void
batcher_pop_end(tsdn_t *tsdn, batcher_t *batcher) {
assert(atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED) == 0);
malloc_mutex_unlock(tsdn, &batcher->mtx);
}
void
batcher_prefork(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_prefork(tsdn, &batcher->mtx);
}
void
batcher_postfork_parent(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_postfork_parent(tsdn, &batcher->mtx);
}
void
batcher_postfork_child(tsdn_t *tsdn, batcher_t *batcher) {
malloc_mutex_postfork_child(tsdn, &batcher->mtx);
}
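
The file deleted above implements a fixed-capacity mailbox: push_begin reserves slots (bumping nelems at push time so racing pushers fail fast), pop_begin claims everything at once, and the mutex is held between each begin/end pair. Below is a standalone toy model of that protocol, using a plain pthread mutex and omitting the relaxed-atomic fast-path checks and witness ranking; the toy_* names are stand-ins, not jemalloc internals.

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

#define TOY_CAP 8
#define TOY_NO_IDX ((size_t)-1)

typedef struct {
	pthread_mutex_t mtx;
	size_t nelems;     /* slots reserved so far */
	size_t npushes;    /* pushes since the last pop */
	int data[TOY_CAP]; /* the "user-owned array" the indices refer to */
} toy_batcher_t;

/* Reserve n slots; on success the mutex stays held until toy_push_end. */
static size_t
toy_push_begin(toy_batcher_t *b, size_t n) {
	pthread_mutex_lock(&b->mtx);
	if (b->nelems + n > TOY_CAP) {
		pthread_mutex_unlock(&b->mtx);
		return TOY_NO_IDX;
	}
	size_t idx = b->nelems;
	b->nelems += n; /* updated at push time, as in the deleted file */
	b->npushes++;
	return idx;
}

static void
toy_push_end(toy_batcher_t *b) {
	pthread_mutex_unlock(&b->mtx);
}

/* Claim every element; on success the mutex stays held until toy_pop_end. */
static size_t
toy_pop_begin(toy_batcher_t *b) {
	pthread_mutex_lock(&b->mtx);
	if (b->nelems == 0) {
		pthread_mutex_unlock(&b->mtx);
		return TOY_NO_IDX;
	}
	size_t n = b->nelems;
	b->nelems = 0;
	return n;
}

static void
toy_pop_end(toy_batcher_t *b) {
	pthread_mutex_unlock(&b->mtx);
}

int
main(void) {
	toy_batcher_t b = {PTHREAD_MUTEX_INITIALIZER, 0, 0, {0}};
	size_t idx = toy_push_begin(&b, 2);
	if (idx != TOY_NO_IDX) {
		b.data[idx] = 10;
		b.data[idx + 1] = 20;
		toy_push_end(&b);
	}
	size_t n = toy_pop_begin(&b);
	if (n != TOY_NO_IDX) {
		for (size_t i = 0; i < n; i++) {
			printf("popped %d\n", b.data[i]);
		}
		toy_pop_end(&b);
	}
	return 0;
}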

View file

@@ -6,14 +6,6 @@
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/witness.h"
#ifdef JEMALLOC_JET
unsigned bin_batching_test_ndalloc_slabs_max = (unsigned)-1;
void (*bin_batching_test_after_push_hook)(size_t push_idx);
void (*bin_batching_test_mid_pop_hook)(size_t nelems_to_pop);
void (*bin_batching_test_after_unlock_hook)(
unsigned slab_dalloc_count, bool list_empty);
#endif
bool
bin_update_shard_size(unsigned bin_shard_sizes[SC_NBINS], size_t start_size,
size_t end_size, size_t nshards) {
@@ -47,7 +39,7 @@ bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]) {
}
bool
bin_init(bin_t *bin, unsigned binind) {
bin_init(bin_t *bin) {
if (malloc_mutex_init(&bin->lock, "bin", WITNESS_RANK_BIN,
malloc_mutex_rank_exclusive)) {
return true;
@@ -58,52 +50,20 @@ bin_init(bin_t *bin, unsigned binind) {
if (config_stats) {
memset(&bin->stats, 0, sizeof(bin_stats_t));
}
if (arena_bin_has_batch(binind)) {
bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin;
batcher_init(
&batched_bin->remote_frees, opt_bin_info_remote_free_max);
}
return false;
}
void
bin_prefork(tsdn_t *tsdn, bin_t *bin, bool has_batch) {
bin_prefork(tsdn_t *tsdn, bin_t *bin) {
malloc_mutex_prefork(tsdn, &bin->lock);
if (has_batch) {
/*
* The batch mutex has lower rank than the bin mutex (as it must
* -- it's acquired later). But during forking, we go
* bin-at-a-time, so that we acquire mutex on bin 0, then on
* the bin 0 batcher, then on bin 1. This is a safe ordering
* (it's ordered by the index of arenas and bins within those
* arenas), but will trigger witness errors that would
* otherwise force another level of arena forking that breaks
* bin encapsulation (because the witness API doesn't "know"
* about arena or bin ordering -- it just sees that the batcher
* has a lower rank than the bin). So instead we exclude the
* batcher mutex from witness checking during fork (which is
* the only time we touch multiple bins at once) by passing
* TSDN_NULL.
*/
bin_with_batch_t *batched = (bin_with_batch_t *)bin;
batcher_prefork(TSDN_NULL, &batched->remote_frees);
}
}
void
bin_postfork_parent(tsdn_t *tsdn, bin_t *bin, bool has_batch) {
bin_postfork_parent(tsdn_t *tsdn, bin_t *bin) {
malloc_mutex_postfork_parent(tsdn, &bin->lock);
if (has_batch) {
bin_with_batch_t *batched = (bin_with_batch_t *)bin;
batcher_postfork_parent(TSDN_NULL, &batched->remote_frees);
}
}
void
bin_postfork_child(tsdn_t *tsdn, bin_t *bin, bool has_batch) {
bin_postfork_child(tsdn_t *tsdn, bin_t *bin) {
malloc_mutex_postfork_child(tsdn, &bin->lock);
if (has_batch) {
bin_with_batch_t *batched = (bin_with_batch_t *)bin;
batcher_postfork_child(TSDN_NULL, &batched->remote_frees);
}
}
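
The long comment deleted from bin_prefork records why witness checking is bypassed during fork: bins are visited one at a time, and each bin's batcher mutex is taken immediately after its bin mutex, which is a globally consistent order even though it interleaves the two ranks. A standalone toy sketch of that ordering, with pthread mutexes standing in for malloc_mutex_t and no witness machinery:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_NBINS 3

typedef struct {
	pthread_mutex_t lock;         /* "bin" mutex, higher rank */
	pthread_mutex_t remote_frees; /* "batcher" mutex, lower rank */
	bool has_batch;
} toy_bin_t;

static toy_bin_t bins[TOY_NBINS];

/* Fork-time order: bin 0, batcher 0, bin 1, batcher 1, ... Deadlock-free
 * because every locker follows the same bin-index order, even though the
 * batcher's rank is lower than the next bin's. */
static void
toy_prefork(void) {
	for (unsigned i = 0; i < TOY_NBINS; i++) {
		pthread_mutex_lock(&bins[i].lock);
		if (bins[i].has_batch) {
			pthread_mutex_lock(&bins[i].remote_frees);
		}
	}
}

static void
toy_postfork(void) {
	for (unsigned i = 0; i < TOY_NBINS; i++) {
		if (bins[i].has_batch) {
			pthread_mutex_unlock(&bins[i].remote_frees);
		}
		pthread_mutex_unlock(&bins[i].lock);
	}
}

int
main(void) {
	for (unsigned i = 0; i < TOY_NBINS; i++) {
		pthread_mutex_init(&bins[i].lock, NULL);
		pthread_mutex_init(&bins[i].remote_frees, NULL);
		bins[i].has_batch = (i < 2); /* pretend the small bins batch */
	}
	toy_prefork();
	toy_postfork();
	puts("fork-time lock order exercised");
	return 0;
}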

View file

@@ -3,26 +3,8 @@
#include "jemalloc/internal/bin_info.h"
/*
* We leave bin-batching disabled by default, with other settings chosen mostly
* empirically; across the test programs I looked at they provided the most bang
* for the buck. With other default settings, these choices for bin batching
* result in them consuming far less memory (even in the worst case) than the
* tcaches themselves, the arena, etc.
* Note that we always try to pop all bins on every arena cache bin lock
* operation, so the typical memory waste is far less than this (and only on
* hot bins, which tend to be large anyways).
*/
size_t opt_bin_info_max_batched_size = 0; /* 192 is a good default. */
size_t opt_bin_info_remote_free_max_batch = 4;
size_t opt_bin_info_remote_free_max = BIN_REMOTE_FREE_ELEMS_MAX;
bin_info_t bin_infos[SC_NBINS];
szind_t bin_info_nbatched_sizes;
unsigned bin_info_nbatched_bins;
unsigned bin_info_nunbatched_bins;
static void
bin_infos_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
bin_info_t infos[SC_NBINS]) {
@@ -38,12 +20,6 @@ bin_infos_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
bitmap_info_t bitmap_info = BITMAP_INFO_INITIALIZER(
bin_info->nregs);
bin_info->bitmap_info = bitmap_info;
if (bin_info->reg_size <= opt_bin_info_max_batched_size) {
bin_info_nbatched_sizes++;
bin_info_nbatched_bins += bin_info->n_shards;
} else {
bin_info_nunbatched_bins += bin_info->n_shards;
}
}
}

View file

@@ -134,9 +134,6 @@ CTL_PROTO(opt_utrace)
CTL_PROTO(opt_xmalloc)
CTL_PROTO(opt_experimental_infallible_new)
CTL_PROTO(opt_experimental_tcache_gc)
CTL_PROTO(opt_max_batched_size)
CTL_PROTO(opt_remote_free_max)
CTL_PROTO(opt_remote_free_max_batch)
CTL_PROTO(opt_tcache)
CTL_PROTO(opt_tcache_max)
CTL_PROTO(opt_tcache_nslots_small_min)
@@ -248,10 +245,6 @@ CTL_PROTO(stats_arenas_i_bins_j_nslabs)
CTL_PROTO(stats_arenas_i_bins_j_nreslabs)
CTL_PROTO(stats_arenas_i_bins_j_curslabs)
CTL_PROTO(stats_arenas_i_bins_j_nonfull_slabs)
CTL_PROTO(stats_arenas_i_bins_j_batch_pops)
CTL_PROTO(stats_arenas_i_bins_j_batch_failed_pushes)
CTL_PROTO(stats_arenas_i_bins_j_batch_pushes)
CTL_PROTO(stats_arenas_i_bins_j_batch_pushed_elems)
INDEX_PROTO(stats_arenas_i_bins_j)
CTL_PROTO(stats_arenas_i_lextents_j_nmalloc)
CTL_PROTO(stats_arenas_i_lextents_j_ndalloc)
@@ -501,9 +494,6 @@ static const ctl_named_node_t opt_node[] = {{NAME("abort"), CTL(opt_abort)},
{NAME("utrace"), CTL(opt_utrace)}, {NAME("xmalloc"), CTL(opt_xmalloc)},
{NAME("experimental_infallible_new"), CTL(opt_experimental_infallible_new)},
{NAME("experimental_tcache_gc"), CTL(opt_experimental_tcache_gc)},
{NAME("max_batched_size"), CTL(opt_max_batched_size)},
{NAME("remote_free_max"), CTL(opt_remote_free_max)},
{NAME("remote_free_max_batch"), CTL(opt_remote_free_max_batch)},
{NAME("tcache"), CTL(opt_tcache)},
{NAME("tcache_max"), CTL(opt_tcache_max)},
{NAME("tcache_nslots_small_min"), CTL(opt_tcache_nslots_small_min)},
@@ -673,11 +663,6 @@ static const ctl_named_node_t stats_arenas_i_bins_j_node[] = {
{NAME("nreslabs"), CTL(stats_arenas_i_bins_j_nreslabs)},
{NAME("curslabs"), CTL(stats_arenas_i_bins_j_curslabs)},
{NAME("nonfull_slabs"), CTL(stats_arenas_i_bins_j_nonfull_slabs)},
{NAME("batch_pops"), CTL(stats_arenas_i_bins_j_batch_pops)},
{NAME("batch_failed_pushes"),
CTL(stats_arenas_i_bins_j_batch_failed_pushes)},
{NAME("batch_pushes"), CTL(stats_arenas_i_bins_j_batch_pushes)},
{NAME("batch_pushed_elems"), CTL(stats_arenas_i_bins_j_batch_pushed_elems)},
{NAME("mutex"), CHILD(named, stats_arenas_i_bins_j_mutex)}};
static const ctl_named_node_t super_stats_arenas_i_bins_j_node[] = {
@@ -1219,14 +1204,6 @@ ctl_arena_stats_sdmerge(
assert(bstats->curslabs == 0);
assert(bstats->nonfull_slabs == 0);
}
merged->batch_pops += bstats->batch_pops;
merged->batch_failed_pushes +=
bstats->batch_failed_pushes;
merged->batch_pushes += bstats->batch_pushes;
merged->batch_pushed_elems +=
bstats->batch_pushed_elems;
malloc_mutex_prof_merge(&sdstats->bstats[i].mutex_data,
&astats->bstats[i].mutex_data);
}
@@ -2202,10 +2179,6 @@ CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool)
CTL_RO_NL_CGEN(config_enable_cxx, opt_experimental_infallible_new,
opt_experimental_infallible_new, bool)
CTL_RO_NL_GEN(opt_experimental_tcache_gc, opt_experimental_tcache_gc, bool)
CTL_RO_NL_GEN(opt_max_batched_size, opt_bin_info_max_batched_size, size_t)
CTL_RO_NL_GEN(opt_remote_free_max, opt_bin_info_remote_free_max, size_t)
CTL_RO_NL_GEN(
opt_remote_free_max_batch, opt_bin_info_remote_free_max_batch, size_t)
CTL_RO_NL_GEN(opt_tcache, opt_tcache, bool)
CTL_RO_NL_GEN(opt_tcache_max, opt_tcache_max, size_t)
CTL_RO_NL_GEN(
@@ -3982,16 +3955,6 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curslabs,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.curslabs, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nonfull_slabs,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nonfull_slabs, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pops,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pops, uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_failed_pushes,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_failed_pushes,
uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pushes,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pushes, uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pushed_elems,
arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pushed_elems,
uint64_t)
static const ctl_named_node_t *
stats_arenas_i_bins_j_index(
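
The ctl hunks above delete the batch_* leaves from the stats.arenas.<i>.bins.<j> namespace. For orientation, this is how such leaves are read through the public mallctl API; a sketch assuming an unprefixed, stats-enabled build, where after this revert the batch_pops probe fails (ENOENT) while curslabs still resolves.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	free(malloc(16)); /* ensure arena 0 and its bin stats exist */

	/* Stats are snapshotted; bump the epoch to refresh the snapshot. */
	uint64_t epoch = 1;
	size_t sz = sizeof(epoch);
	mallctl("epoch", &epoch, &sz, &epoch, sz);

	size_t curslabs;
	sz = sizeof(curslabs);
	if (mallctl("stats.arenas.0.bins.0.curslabs", &curslabs, &sz,
	    NULL, 0) == 0) {
		printf("arena 0, bin 0: curslabs = %zu\n", curslabs);
	}

	/* One of the leaves removed by this commit. */
	uint64_t batch_pops;
	sz = sizeof(batch_pops);
	int err = mallctl("stats.arenas.0.bins.0.batch_pops", &batch_pops,
	    &sz, NULL, 0);
	printf("batch_pops lookup: %d (nonzero after the revert)\n", err);
	return 0;
}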

View file

@@ -1391,20 +1391,6 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
} while (vlen_left > 0);
CONF_CONTINUE;
}
CONF_HANDLE_SIZE_T(opt_bin_info_max_batched_size,
"max_batched_size", 0, SIZE_T_MAX,
CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX,
/* clip */ true)
CONF_HANDLE_SIZE_T(opt_bin_info_remote_free_max_batch,
"remote_free_max_batch", 0,
BIN_REMOTE_FREE_ELEMS_MAX, CONF_DONT_CHECK_MIN,
CONF_CHECK_MAX,
/* clip */ true)
CONF_HANDLE_SIZE_T(opt_bin_info_remote_free_max,
"remote_free_max", 0, BIN_REMOTE_FREE_ELEMS_MAX,
CONF_DONT_CHECK_MIN, CONF_CHECK_MAX,
/* clip */ true)
if (CONF_MATCH("tcache_ncached_max")) {
bool err = tcache_bin_info_default_init(
v, vlen);
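
The deleted CONF_HANDLE_SIZE_T entries are what wired these names into jemalloc's normal configuration channels (the MALLOC_CONF environment variable, or the application-provided malloc_conf symbol shown here). A sketch of how they were set before the revert, assuming an unprefixed build; the values are purely illustrative, and after the revert these names are reported as unrecognized conf options.

#include <stdlib.h>

/* Read by jemalloc during its first-allocation bootstrap. */
const char *malloc_conf =
    "max_batched_size:192,remote_free_max:256,remote_free_max_batch:4";

int
main(void) {
	free(malloc(1)); /* any first allocation triggers conf parsing */
	return 0;
}

The same string can be supplied at run time instead, e.g. MALLOC_CONF="max_batched_size:192" ./app.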

View file

@@ -357,15 +357,6 @@ stats_arena_bins_print(
COL_HDR(row, nreslabs, NULL, right, 13, uint64)
COL_HDR(row, nreslabs_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, pops, NULL, right, 10, uint64)
COL_HDR(row, pops_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, failed_push, NULL, right, 13, uint64)
COL_HDR(row, failed_push_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, push, NULL, right, 7, uint64)
COL_HDR(row, push_ps, "(#/sec)", right, 8, uint64)
COL_HDR(row, push_elem, NULL, right, 12, uint64)
COL_HDR(row, push_elem_ps, "(#/sec)", right, 8, uint64)
/* Don't want to actually print the name. */
header_justify_spacer.str_val = " ";
col_justify_spacer.str_val = " ";
@@ -406,15 +397,13 @@ stats_arena_bins_print(
}
for (j = 0, in_gap = false; j < nbins; j++) {
uint64_t nslabs;
size_t reg_size, slab_size, curregs;
size_t curslabs;
size_t nonfull_slabs;
uint32_t nregs, nshards;
uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes;
uint64_t nreslabs;
uint64_t batch_pops, batch_failed_pushes, batch_pushes,
batch_pushed_elems;
uint64_t nslabs;
size_t reg_size, slab_size, curregs;
size_t curslabs;
size_t nonfull_slabs;
uint32_t nregs, nshards;
uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes;
uint64_t nreslabs;
prof_stats_t prof_live;
prof_stats_t prof_accum;
@@ -463,15 +452,6 @@ stats_arena_bins_print(
CTL_LEAF(stats_arenas_mib, 5, "nonfull_slabs", &nonfull_slabs,
size_t);
CTL_LEAF(
stats_arenas_mib, 5, "batch_pops", &batch_pops, uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_failed_pushes",
&batch_failed_pushes, uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_pushes", &batch_pushes,
uint64_t);
CTL_LEAF(stats_arenas_mib, 5, "batch_pushed_elems",
&batch_pushed_elems, uint64_t);
if (mutex) {
mutex_stats_read_arena_bin(stats_arenas_mib, 5,
col_mutex64, col_mutex32, uptime);
@@ -506,14 +486,6 @@ stats_arena_bins_print(
emitter, "curslabs", emitter_type_size, &curslabs);
emitter_json_kv(emitter, "nonfull_slabs", emitter_type_size,
&nonfull_slabs);
emitter_json_kv(
emitter, "batch_pops", emitter_type_uint64, &batch_pops);
emitter_json_kv(emitter, "batch_failed_pushes",
emitter_type_uint64, &batch_failed_pushes);
emitter_json_kv(emitter, "batch_pushes", emitter_type_uint64,
&batch_pushes);
emitter_json_kv(emitter, "batch_pushed_elems",
emitter_type_uint64, &batch_pushed_elems);
if (mutex) {
emitter_json_object_kv_begin(emitter, "mutex");
mutex_stats_emit(
@@ -573,19 +545,6 @@ stats_arena_bins_print(
col_nreslabs.uint64_val = nreslabs;
col_nreslabs_ps.uint64_val = rate_per_second(nreslabs, uptime);
col_pops.uint64_val = batch_pops;
col_pops_ps.uint64_val = rate_per_second(batch_pops, uptime);
col_failed_push.uint64_val = batch_failed_pushes;
col_failed_push_ps.uint64_val = rate_per_second(
batch_failed_pushes, uptime);
col_push.uint64_val = batch_pushes;
col_push_ps.uint64_val = rate_per_second(batch_pushes, uptime);
col_push_elem.uint64_val = batch_pushed_elems;
col_push_elem_ps.uint64_val = rate_per_second(
batch_pushed_elems, uptime);
/*
* Note that mutex columns were initialized above, if mutex ==
* true.
@@ -1677,9 +1636,6 @@ stats_general_print(emitter_t *emitter) {
OPT_WRITE_BOOL("xmalloc")
OPT_WRITE_BOOL("experimental_infallible_new")
OPT_WRITE_BOOL("experimental_tcache_gc")
OPT_WRITE_SIZE_T("max_batched_size")
OPT_WRITE_SIZE_T("remote_free_max")
OPT_WRITE_SIZE_T("remote_free_max_batch")
OPT_WRITE_BOOL("tcache")
OPT_WRITE_SIZE_T("tcache_max")
OPT_WRITE_UNSIGNED("tcache_nslots_small_min")
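
These stats.c hunks remove the batch columns from the table emitter and the batch_* keys from the per-bin JSON objects, so the change is directly visible in stats output. A minimal probe via the public malloc_stats_print API ("J" selects JSON output):

#include <jemalloc/jemalloc.h>

int
main(void) {
	/* Before the revert each bin object carried batch_pops,
	 * batch_failed_pushes, batch_pushes, and batch_pushed_elems;
	 * after it, those keys are gone. */
	malloc_stats_print(NULL, NULL, "J");
	return 0;
}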

View file

@@ -608,7 +608,7 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
}
arena_cache_bin_fill_small(tsdn, arena, cache_bin, binind,
/* nfill_min */
opt_experimental_tcache_gc ? ((nfill >> 1) + 1) : nfill,
opt_experimental_tcache_gc ? ((nfill >> 1) + 1) : nfill,
/* nfill_max */ nfill);
tcache_slow->bin_refilled[binind] = true;
tcache_nfill_small_burst_prepare(tcache_slow, binind);
@@ -680,8 +680,6 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
assert(binind < SC_NBINS);
arena_t *tcache_arena = tcache_slow->arena;
assert(tcache_arena != NULL);
unsigned tcache_binshard =
tsd_binshardsp_get(tsdn_tsd(tsdn))->binshard[binind];
/*
* Variable length array must have > 0 length; the last element is never
@@ -698,25 +696,12 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
unsigned dalloc_count = 0;
VARIABLE_ARRAY(edata_t *, dalloc_slabs, nflush + 1);
/*
* There's an edge case where we need to deallocate more slabs than we
* have elements of dalloc_slabs. This can happen if we end up deallocating
* items batched by another thread in addition to ones flushed from the
* cache. Since this is not very likely (most small object
* deallocations don't free up a whole slab), we don't want to burn the
* stack space to keep those excess slabs in an array. Instead we'll
* maintain an overflow list.
*/
edata_list_active_t dalloc_slabs_extra;
edata_list_active_init(&dalloc_slabs_extra);
/*
* We're about to grab a bunch of locks. If one of them happens to be
* the one guarding the arena-level stats counters we flush our
* thread-local ones to, we do so under one critical section.
*/
bool merged_stats = false;
/*
* We maintain the invariant that all edatas yet to be flushed are
* contained in the half-open range [flush_start, flush_end). We'll
@@ -741,7 +726,6 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
unsigned cur_binshard = edata_binshard_get(cur_edata);
bin_t *cur_bin = arena_get_bin(cur_arena, binind, cur_binshard);
assert(cur_binshard < bin_infos[binind].n_shards);
/*
* Start off the partition; item_edata[i] always matches itself
* of course.
@@ -788,150 +772,43 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
}
}
/*
* We never batch when flushing to our home-base bin shard,
* since it's likely that we'll have to acquire that lock anyway
* when flushing stats.
*
* A plausible check we could add to can_batch is
* '&& arena_is_auto(cur_arena)'. The motivation would be that
* we have a higher tolerance for dubious user assumptions
* around non-auto arenas (e.g. "if I deallocate every object I
* allocated, and then call tcache.flush, then the arena stats
* must reflect zero live allocations").
*
* This is dubious for a couple reasons:
* - We already don't provide perfect fidelity for stats
* counting (e.g. for profiled allocations, whose size can
* inflate in stats).
* - Hanging load-bearing guarantees around stats impedes
* scalability in general.
*
* There are some "complete" strategies we could do instead:
* - Add an arena.<i>.quiesce call to pop all bins for users who
* do want those stats accounted for.
* - Make batchability a user-controllable per-arena option.
* - Do a batch pop after every mutex acquisition for which we
* want to provide accurate stats. This gives perfectly
* accurate stats, but can cause weird performance effects
* (because doing stats collection can now result in slabs
* becoming empty, and therefore purging, large mutex
* acquisition, etc.).
* - Propagate the "why" behind a flush down to the level of the
* batcher, and include a batch pop attempt down full tcache
* flushing pathways. This is just a lot of plumbing and
* internal complexity.
*
* We don't do any of these right now, but the decision calculus
* and tradeoffs are subtle enough that the reasoning was worth
* leaving in this comment.
*/
bool bin_is_batched = arena_bin_has_batch(binind);
bool home_binshard = (cur_arena == tcache_arena
&& cur_binshard == tcache_binshard);
bool can_batch = (flush_start - prev_flush_start
<= opt_bin_info_remote_free_max_batch)
&& !home_binshard && bin_is_batched;
/* Actually do the flushing. */
malloc_mutex_lock(tsdn, &cur_bin->lock);
/*
* We try to avoid the batching pathway if we can, so we always
* at least *try* to lock.
* Flush stats first, if that was the right lock. Note that we
* don't actually have to flush stats into the current thread's
* binshard. Flushing into any binshard in the same arena is
* enough; we don't expose stats on per-binshard basis (just
* per-bin).
*/
bool locked = false;
bool batched = false;
bool batch_failed = false;
if (can_batch) {
locked = !malloc_mutex_trylock(tsdn, &cur_bin->lock);
if (config_stats && tcache_arena == cur_arena
&& !merged_stats) {
merged_stats = true;
cur_bin->stats.nflushes++;
cur_bin->stats.nrequests += cache_bin->tstats.nrequests;
cache_bin->tstats.nrequests = 0;
}
if (can_batch && !locked) {
bin_with_batch_t *batched_bin = (bin_with_batch_t *)
cur_bin;
size_t push_idx = batcher_push_begin(tsdn,
&batched_bin->remote_frees,
flush_start - prev_flush_start);
bin_batching_test_after_push(push_idx);
if (push_idx != BATCHER_NO_IDX) {
batched = true;
unsigned nbatched = flush_start
- prev_flush_start;
for (unsigned i = 0; i < nbatched; i++) {
unsigned src_ind = prev_flush_start + i;
batched_bin
->remote_free_data[push_idx + i]
.ptr = ptrs->ptr[src_ind];
batched_bin
->remote_free_data[push_idx + i]
.slab = item_edata[src_ind].edata;
}
batcher_push_end(
tsdn, &batched_bin->remote_frees);
} else {
batch_failed = true;
/* Next flush objects. */
/* Init only to avoid used-uninitialized warning. */
arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
for (unsigned i = prev_flush_start; i < flush_start; i++) {
void *ptr = ptrs->ptr[i];
edata_t *edata = item_edata[i].edata;
if (arena_dalloc_bin_locked_step(tsdn, cur_arena,
cur_bin, &dalloc_bin_info, binind, edata,
ptr)) {
dalloc_slabs[dalloc_count] = edata;
dalloc_count++;
}
}
if (!batched) {
if (!locked) {
malloc_mutex_lock(tsdn, &cur_bin->lock);
}
/*
* Unlike other stats (which only ever get flushed into
* a tcache's associated arena), batch_failed counts get
* accumulated into the bin where the push attempt
* failed.
*/
if (config_stats && batch_failed) {
cur_bin->stats.batch_failed_pushes++;
}
/*
* Flush stats first, if that was the right lock. Note
* that we don't actually have to flush stats into the
* current thread's binshard. Flushing into any binshard
* in the same arena is enough; we don't expose stats on
* per-binshard basis (just per-bin).
*/
if (config_stats && tcache_arena == cur_arena
&& !merged_stats) {
merged_stats = true;
cur_bin->stats.nflushes++;
cur_bin->stats.nrequests +=
cache_bin->tstats.nrequests;
cache_bin->tstats.nrequests = 0;
}
unsigned preallocated_slabs = nflush;
unsigned ndalloc_slabs =
arena_bin_batch_get_ndalloc_slabs(
preallocated_slabs);
arena_dalloc_bin_locked_finish(
tsdn, cur_arena, cur_bin, &dalloc_bin_info);
malloc_mutex_unlock(tsdn, &cur_bin->lock);
/* Next flush our own objects. */
/* Init only to avoid used-uninitialized warning. */
arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
for (unsigned i = prev_flush_start; i < flush_start;
i++) {
void *ptr = ptrs->ptr[i];
edata_t *edata = item_edata[i].edata;
arena_dalloc_bin_locked_step(tsdn, cur_arena,
cur_bin, &dalloc_bin_info, binind, edata,
ptr, dalloc_slabs, ndalloc_slabs,
&dalloc_count, &dalloc_slabs_extra);
}
/*
* Lastly, flush any batched objects (from other
* threads).
*/
if (bin_is_batched) {
arena_bin_flush_batch_impl(tsdn, cur_arena,
cur_bin, &dalloc_bin_info, binind,
dalloc_slabs, ndalloc_slabs, &dalloc_count,
&dalloc_slabs_extra);
}
arena_dalloc_bin_locked_finish(
tsdn, cur_arena, cur_bin, &dalloc_bin_info);
malloc_mutex_unlock(tsdn, &cur_bin->lock);
}
arena_decay_ticks(
tsdn, cur_arena, flush_start - prev_flush_start);
}
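
The comment deleted near the top of this file's diff explains the slab-deallocation overflow strategy: a stack-sized array covers the common case, and a linked list absorbs the rare excess (e.g. when remotely batched frees empty extra slabs). A standalone toy model of that fixed-array-plus-overflow-list pattern; the toy types are stand-ins for edata_t and edata_list_active_t.

#include <stddef.h>
#include <stdio.h>

typedef struct toy_slab_s toy_slab_t;
struct toy_slab_s {
	int id;
	toy_slab_t *next;
};

#define TOY_NDALLOC 4 /* stand-in for the preallocated dalloc_slabs size */

int
main(void) {
	toy_slab_t slabs[6];
	for (int i = 0; i < 6; i++) {
		slabs[i].id = i;
		slabs[i].next = NULL;
	}

	toy_slab_t *fixed[TOY_NDALLOC]; /* common case: no heap traffic */
	size_t count = 0;
	toy_slab_t *extra = NULL; /* rare overflow: intrusive list */

	for (int i = 0; i < 6; i++) {
		if (count < TOY_NDALLOC) {
			fixed[count++] = &slabs[i];
		} else {
			slabs[i].next = extra;
			extra = &slabs[i];
		}
	}

	/* Postprocess outside the (imagined) bin lock, mirroring the drain
	 * loops at the end of tcache_bin_flush_impl_small. */
	for (size_t i = 0; i < count; i++) {
		printf("dalloc slab %d\n", fixed[i]->id);
	}
	while (extra != NULL) {
		toy_slab_t *s = extra;
		extra = s->next;
		printf("dalloc overflow slab %d\n", s->id);
	}
	return 0;
}
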
@@ -941,18 +818,13 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
edata_t *slab = dalloc_slabs[i];
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
}
while (!edata_list_active_empty(&dalloc_slabs_extra)) {
edata_t *slab = edata_list_active_first(&dalloc_slabs_extra);
edata_list_active_remove(&dalloc_slabs_extra, slab);
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
}
if (config_stats && !merged_stats) {
/*
* The flush loop didn't happen to flush to this
* thread's arena, so the stats didn't get merged.
* Manually do so now.
*/
* The flush loop didn't happen to flush to this
* thread's arena, so the stats didn't get merged.
* Manually do so now.
*/
bin_t *bin = arena_bin_choose(tsdn, tcache_arena, binind, NULL);
malloc_mutex_lock(tsdn, &bin->lock);
bin->stats.nflushes++;