move fill/flush pointer array out of tcache.c

Authored by Shirui Cheng on 2025-10-21 18:53:35 -07:00; committed by guangli-dai
parent c9046de297
commit c4367d7794
5 changed files with 404 additions and 390 deletions

View file

@@ -61,13 +61,13 @@ bool arena_decay_ms_set(
ssize_t arena_decay_ms_get(arena_t *arena, extent_state_t state);
void arena_decay(
tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all);
uint64_t arena_time_until_deferred(tsdn_t *tsdn, arena_t *arena);
void arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena);
void arena_reset(tsd_t *tsd, arena_t *arena);
void arena_destroy(tsd_t *tsd, arena_t *arena);
-void arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena,
-cache_bin_t *cache_bin, szind_t binind, const cache_bin_sz_t nfill_min,
-const cache_bin_sz_t nfill_max);
+cache_bin_sz_t arena_ptr_array_fill_small(tsdn_t *tsdn, arena_t *arena,
+szind_t binind, cache_bin_ptr_array_t *arr, const cache_bin_sz_t nfill_min,
+const cache_bin_sz_t nfill_max, cache_bin_stats_t merge_stats);
void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind,
bool zero, bool slab);
@@ -84,6 +84,9 @@ void arena_dalloc_bin_locked_handle_newly_empty(
void arena_dalloc_bin_locked_handle_newly_nonempty(
tsdn_t *tsdn, arena_t *arena, edata_t *slab, bin_t *bin);
void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
+void arena_ptr_array_flush(tsd_t *tsd, szind_t binind,
+cache_bin_ptr_array_t *arr, unsigned nflush, bool small,
+arena_t *stats_arena, cache_bin_stats_t merge_stats);
bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
size_t extra, bool zero, size_t *newsize);
void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
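The two declarations above are the whole new surface area: the arena layer now fills and flushes through a caller-owned cache_bin_ptr_array_t, with thread-local stats handed down as a by-value cache_bin_stats_t snapshot. A stand-alone toy model of that ownership split follows; every type and function in it is a local stub invented for illustration, and only the shape mirrors the declarations:

#include <stdio.h>

/* Local stand-ins for cache_bin_ptr_array_t and cache_bin_stats_t. */
typedef struct { unsigned n; void **ptr; } ptr_array_t;
typedef struct { unsigned long nrequests; } stats_t;

/*
 * Producer side: writes into the caller's array and merges the stats
 * snapshot it was handed; it never touches the caller's cache bin.
 */
static unsigned
fill_small(ptr_array_t *arr, unsigned nfill_min, unsigned nfill_max,
    stats_t merge_stats)
{
	static int pool[16];	/* pretend heap */
	unsigned filled = (nfill_min + nfill_max) / 2; /* any value in range */
	for (unsigned i = 0; i < filled; i++) {
		arr->ptr[i] = &pool[i];
	}
	printf("merged %lu requests into producer-side stats\n",
	    merge_stats.nrequests);
	return filled;
}

int
main(void)
{
	void *slots[8];
	ptr_array_t arr = { 8, slots };
	stats_t snapshot = { 123 };	/* caller snapshots its local stats */
	unsigned filled = fill_small(&arr, 1, 8, snapshot);
	/* The caller alone publishes the result back into its cache bin. */
	printf("filled %u slots\n", filled);
	return 0;
}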

View file

@@ -12,8 +12,7 @@
/* Maximum length of the arena name. */
#define ARENA_NAME_LEN 32
typedef struct arena_decay_s arena_decay_t;
typedef struct arena_s arena_t;
typedef enum {
percpu_arena_mode_names_base = 0, /* Used for options processing. */

View file

@@ -690,6 +690,10 @@ cache_bin_finish_fill(
nfilled * sizeof(void *));
}
bin->stack_head = empty_position - nfilled;
+/* Reset the bin stats, as they were merged during the fill. */
+if (config_stats) {
+bin->tstats.nrequests = 0;
+}
}
/*
@@ -711,6 +715,10 @@ cache_bin_finish_flush(
bin->stack_head + nflushed, bin->stack_head, rem * sizeof(void *));
bin->stack_head += nflushed;
cache_bin_low_water_adjust(bin);
+/* Reset the bin stats, as they were merged during the flush. */
+if (config_stats) {
+bin->tstats.nrequests = 0;
+}
}
static inline void
@@ -731,6 +739,10 @@ cache_bin_finish_flush_stashed(cache_bin_t *bin) {
/* Reset the bin local full position. */
bin->low_bits_full = (uint16_t)(uintptr_t)low_bound;
assert(cache_bin_nstashed_get_local(bin) == 0);
+/* Reset the bin stats, as they were merged during the flush. */
+if (config_stats) {
+bin->tstats.nrequests = 0;
+}
}
/*
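All three finish paths above now clear tstats.nrequests because the by-value snapshot passed down to the arena layer has already been merged there; resetting in exactly one place is what keeps requests from being counted twice. A self-contained sketch of that snapshot/merge/reset handshake, with illustrative stand-in names rather than jemalloc's:

#include <stdio.h>

typedef struct { unsigned long nrequests; } stats_t;

static unsigned long merged_total = 0; /* stands in for bin->stats */

/* Consumer layer: merges whatever snapshot it is handed. */
static void
flush_with_stats(stats_t snapshot)
{
	merged_total += snapshot.nrequests;
}

int
main(void)
{
	stats_t tstats = { 7 };   /* thread-local counter, like bin->tstats */

	flush_with_stats(tstats); /* hand down a by-value snapshot... */
	tstats.nrequests = 0;     /* ...then reset locally, exactly once,
				   * as the finish paths above now do */

	flush_with_stats(tstats); /* a later flush adds nothing extra */
	printf("merged_total = %lu\n", merged_total); /* prints 7, not 14 */
	return 0;
}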

View file

@@ -693,7 +693,7 @@ arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin) {
malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
}
for (slab = edata_list_active_first(&bin->slabs_full); slab != NULL;
slab = edata_list_active_first(&bin->slabs_full)) {
arena_bin_slabs_full_remove(arena, bin, slab);
malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
@@ -799,7 +799,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
malloc_mutex_lock(tsd_tsdn(tsd), &arena->large_mtx);
for (edata_t *edata = edata_list_active_first(&arena->large);
edata != NULL; edata = edata_list_active_first(&arena->large)) {
void *ptr = edata_base_get(edata);
size_t usize;
@@ -1052,18 +1052,13 @@ arena_bin_choose(
return arena_get_bin(arena, binind, binshard);
}
-void
-arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena, cache_bin_t *cache_bin,
-szind_t binind, const cache_bin_sz_t nfill_min,
-const cache_bin_sz_t nfill_max) {
-assert(cache_bin_ncached_get_local(cache_bin) == 0);
+cache_bin_sz_t
+arena_ptr_array_fill_small(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+cache_bin_ptr_array_t *arr, const cache_bin_sz_t nfill_min,
+const cache_bin_sz_t nfill_max, cache_bin_stats_t merge_stats) {
assert(nfill_min > 0 && nfill_min <= nfill_max);
-assert(nfill_max <= cache_bin_ncached_max_get(cache_bin));
const bin_info_t *bin_info = &bin_infos[binind];
-CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nfill_max);
-cache_bin_init_ptr_array_for_fill(cache_bin, &ptrs, nfill_max);
/*
* Bin-local resources are used first: 1) bin->slabcur, and 2) nonfull
* slabs. After both are exhausted, new slabs will be allocated through
@@ -1115,7 +1110,7 @@ label_refill:
}
arena_slab_reg_alloc_batch(
-slabcur, bin_info, cnt, &ptrs.ptr[filled]);
+slabcur, bin_info, cnt, &arr->ptr[filled]);
made_progress = true;
filled += cnt;
continue;
@@ -1153,10 +1148,9 @@ label_refill:
if (config_stats && !alloc_and_retry) {
bin->stats.nmalloc += filled;
-bin->stats.nrequests += cache_bin->tstats.nrequests;
+bin->stats.nrequests += merge_stats.nrequests;
bin->stats.curregs += filled;
bin->stats.nfills++;
-cache_bin->tstats.nrequests = 0;
}
malloc_mutex_unlock(tsdn, &bin->lock);
@@ -1184,8 +1178,8 @@ label_refill:
fresh_slab = NULL;
}
-cache_bin_finish_fill(cache_bin, &ptrs, filled);
arena_decay_tick(tsdn, arena);
+return filled;
}
size_t
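The refill loop whose diff ends here keeps the tiering described in its opening comment: bin->slabcur first, then the nonfull slabs, then fresh slabs allocated from the arena. A minimal model of that three-tier policy, with toy types; the real loop also handles lock drops, allocation failure, and stats:

#include <stdlib.h>

typedef struct { int nfree; } slab_t; /* toy slab: free-region count */

typedef struct {
	slab_t *slabcur;    /* tier 1: current slab */
	slab_t *nonfull[4]; /* tier 2: partially full slabs */
	int nnonfull;
} bin_t;

static slab_t *
alloc_fresh_slab(void) /* tier 3: stands in for allocating a new slab */
{
	slab_t *s = malloc(sizeof(*s));
	if (s == NULL) {
		abort();
	}
	s->nfree = 8;
	return s;
}

/* Take nfill regions, preferring bin-local resources before fresh slabs. */
static int
fill(bin_t *bin, int nfill)
{
	int filled = 0;
	while (filled < nfill) {
		if (bin->slabcur == NULL || bin->slabcur->nfree == 0) {
			bin->slabcur = (bin->nnonfull > 0)
			    ? bin->nonfull[--bin->nnonfull]
			    : alloc_fresh_slab();
		}
		/* Batch-take from the current slab, as reg_alloc_batch does. */
		int cnt = bin->slabcur->nfree;
		if (cnt > nfill - filled) {
			cnt = nfill - filled;
		}
		bin->slabcur->nfree -= cnt;
		filled += cnt;
	}
	return filled;
}

int
main(void)
{
	bin_t bin = { NULL, { NULL }, 0 };
	int ok = (fill(&bin, 5) == 5);
	free(bin.slabcur);
	return ok ? 0 : 1;
}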
@@ -1472,6 +1466,357 @@ arena_dalloc_small(tsdn_t *tsdn, void *ptr) {
arena_decay_tick(tsdn, arena);
}
static const void *
arena_ptr_array_flush_ptr_getter(void *arr_ctx, size_t ind) {
cache_bin_ptr_array_t *arr = (cache_bin_ptr_array_t *)arr_ctx;
return arr->ptr[ind];
}
static void
arena_ptr_array_flush_metadata_visitor(
void *szind_sum_ctx, emap_full_alloc_ctx_t *alloc_ctx) {
size_t *szind_sum = (size_t *)szind_sum_ctx;
*szind_sum -= alloc_ctx->szind;
util_prefetch_write_range(alloc_ctx->edata, sizeof(edata_t));
}
JEMALLOC_NOINLINE static void
arena_ptr_array_flush_size_check_fail(cache_bin_ptr_array_t *arr, szind_t szind,
size_t nptrs, emap_batch_lookup_result_t *edatas) {
bool found_mismatch = false;
for (size_t i = 0; i < nptrs; i++) {
szind_t true_szind = edata_szind_get(edatas[i].edata);
if (true_szind != szind) {
found_mismatch = true;
safety_check_fail_sized_dealloc(
/* current_dealloc */ false,
/* ptr */ arena_ptr_array_flush_ptr_getter(arr, i),
/* true_size */ sz_index2size(true_szind),
/* input_size */ sz_index2size(szind));
}
}
assert(found_mismatch);
}
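These helpers drive the batched sized-deallocation check: the lookup seeds an accumulator with binind * nflush, the visitor subtracts each pointer's true size index, and any nonzero remainder triggers the slow scan above to name the offending pointer. The arithmetic in miniature, with toy indices rather than real size classes:

#include <assert.h>
#include <stddef.h>

int
main(void)
{
	size_t binind = 3, nflush = 4;         /* pretend bin and batch */
	size_t true_szind[4] = { 3, 3, 5, 3 }; /* one lying pointer */

	size_t szind_sum = binind * nflush;    /* 12 */
	for (size_t i = 0; i < nflush; i++) {
		szind_sum -= true_szind[i];    /* 12 - 14 wraps, stays nonzero */
	}
	assert(szind_sum != 0);                /* mismatch detected */
	return 0;
}

Note that offsetting mismatches can cancel in the sum; the check is a cheap first line of defense rather than an exhaustive one, which is presumably why it sits behind config_opt_safety_checks.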
JEMALLOC_ALWAYS_INLINE void
arena_ptr_array_flush_impl_small(tsdn_t *tsdn, szind_t binind,
cache_bin_ptr_array_t *arr, emap_batch_lookup_result_t *item_edata,
cache_bin_sz_t nflush, arena_t *stats_arena,
cache_bin_stats_t **merge_stats) {
/*
* The slabs where we freed the last remaining object in the slab (and
* so need to free the slab itself).
*/
unsigned dalloc_count = 0;
VARIABLE_ARRAY(edata_t *, dalloc_slabs, nflush + 1);
/*
* We're about to grab a bunch of locks. If one of them happens to be
* the one guarding the arena-level stats counters we flush our
* thread-local ones to, we do so under one critical section.
*/
/*
* We maintain the invariant that all edatas yet to be flushed are
* contained in the half-open range [flush_start, flush_end). We'll
* repeatedly partition the array so that the unflushed items are at the
* end.
*/
unsigned flush_start = 0;
while (flush_start < nflush) {
/*
* After our partitioning step, all objects to flush will be in
* the half-open range [prev_flush_start, flush_start), and
* flush_start will be updated to correspond to the next loop
* iteration.
*/
unsigned prev_flush_start = flush_start;
edata_t *cur_edata = item_edata[flush_start].edata;
unsigned cur_arena_ind = edata_arena_ind_get(cur_edata);
arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);
unsigned cur_binshard = edata_binshard_get(cur_edata);
bin_t *cur_bin = arena_get_bin(cur_arena, binind, cur_binshard);
assert(cur_binshard < bin_infos[binind].n_shards);
/*
* Start off the partition; item_edata[i] always matches itself
* of course.
*/
flush_start++;
for (unsigned i = flush_start; i < nflush; i++) {
void *ptr = arr->ptr[i];
edata_t *edata = item_edata[i].edata;
assert(ptr != NULL && edata != NULL);
assert(
(uintptr_t)ptr >= (uintptr_t)edata_addr_get(edata));
assert(
(uintptr_t)ptr < (uintptr_t)edata_past_get(edata));
if (edata_arena_ind_get(edata) == cur_arena_ind
&& edata_binshard_get(edata) == cur_binshard) {
/* Swap the edatas. */
emap_batch_lookup_result_t temp_edata =
item_edata[flush_start];
item_edata[flush_start] = item_edata[i];
item_edata[i] = temp_edata;
/* Swap the pointers */
void *temp_ptr = arr->ptr[flush_start];
arr->ptr[flush_start] = arr->ptr[i];
arr->ptr[i] = temp_ptr;
flush_start++;
}
}
/* Make sure we implemented partitioning correctly. */
if (config_debug) {
for (unsigned i = prev_flush_start; i < flush_start;
i++) {
edata_t *edata = item_edata[i].edata;
unsigned arena_ind = edata_arena_ind_get(edata);
assert(arena_ind == cur_arena_ind);
unsigned binshard = edata_binshard_get(edata);
assert(binshard == cur_binshard);
}
for (unsigned i = flush_start; i < nflush; i++) {
edata_t *edata = item_edata[i].edata;
assert(
edata_arena_ind_get(edata) != cur_arena_ind
|| edata_binshard_get(edata)
!= cur_binshard);
}
}
/* Actually do the flushing. */
malloc_mutex_lock(tsdn, &cur_bin->lock);
/*
* Flush stats first, if that was the right lock. Note that we
* don't actually have to flush stats into the current thread's
* binshard. Flushing into any binshard in the same arena is
* enough; we don't expose stats on per-binshard basis (just
* per-bin).
*/
if (config_stats && stats_arena == cur_arena
&& *merge_stats != NULL) {
cur_bin->stats.nflushes++;
cur_bin->stats.nrequests += (*merge_stats)->nrequests;
*merge_stats = NULL;
}
/* Next flush objects. */
/* Init only to avoid used-uninitialized warning. */
arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
for (unsigned i = prev_flush_start; i < flush_start; i++) {
void *ptr = arr->ptr[i];
edata_t *edata = item_edata[i].edata;
if (arena_dalloc_bin_locked_step(tsdn, cur_arena,
cur_bin, &dalloc_bin_info, binind, edata,
ptr)) {
dalloc_slabs[dalloc_count] = edata;
dalloc_count++;
}
}
arena_dalloc_bin_locked_finish(
tsdn, cur_arena, cur_bin, &dalloc_bin_info);
malloc_mutex_unlock(tsdn, &cur_bin->lock);
arena_decay_ticks(
tsdn, cur_arena, flush_start - prev_flush_start);
}
/* Handle all deferred slab dalloc. */
for (unsigned i = 0; i < dalloc_count; i++) {
edata_t *slab = dalloc_slabs[i];
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
}
if (config_stats && *merge_stats != NULL) {
/*
* The flush loop didn't happen to flush to this
* thread's arena, so the stats didn't get merged.
* Manually do so now.
*/
bin_t *bin = arena_bin_choose(tsdn, stats_arena, binind, NULL);
malloc_mutex_lock(tsdn, &bin->lock);
bin->stats.nflushes++;
bin->stats.nrequests += (*merge_stats)->nrequests;
*merge_stats = NULL;
malloc_mutex_unlock(tsdn, &bin->lock);
}
}
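The loop above is an in-place, swap-forward partition: it grows a prefix of entries sharing the first unflushed entry's (arena, binshard) key, so each bin lock is taken once per group rather than once per pointer. The same move in miniature, keyed by a single int; this helper is hypothetical, not part of the commit:

#include <assert.h>

/* Group equal keys into [start, returned end) by swapping matches forward. */
static unsigned
partition_next_group(int *keys, unsigned start, unsigned n)
{
	int cur = keys[start];
	unsigned end = start + 1; /* keys[start] trivially matches itself */
	for (unsigned i = end; i < n; i++) {
		if (keys[i] == cur) {
			int tmp = keys[end];
			keys[end] = keys[i];
			keys[i] = tmp;
			end++;
		}
	}
	return end; /* next group's start; callers loop until it reaches n */
}

int
main(void)
{
	int keys[] = { 1, 2, 1, 3, 2, 1 };
	unsigned n = 6, start = 0;
	while (start < n) {
		unsigned end = partition_next_group(keys, start, n);
		/* lock(keys[start]); flush [start, end); unlock */
		for (unsigned i = start; i < end; i++) {
			assert(keys[i] == keys[start]);
		}
		start = end;
	}
	return 0;
}

Swapping the edata and pointer arrays in lockstep, as the real code does, is what keeps index i meaning the same object in both.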
JEMALLOC_ALWAYS_INLINE void
arena_ptr_array_flush_impl_large(tsdn_t *tsdn, szind_t binind,
cache_bin_ptr_array_t *arr, emap_batch_lookup_result_t *item_edata,
cache_bin_sz_t nflush, arena_t *stats_arena,
cache_bin_stats_t **merge_stats) {
/*
* We're about to grab a bunch of locks. If one of them happens to be
* the one guarding the arena-level stats counters we flush our
* thread-local ones to, we do so under one critical section.
*/
while (nflush > 0) {
/* Lock the arena, or bin, associated with the first object. */
edata_t *edata = item_edata[0].edata;
unsigned cur_arena_ind = edata_arena_ind_get(edata);
arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);
if (!arena_is_auto(cur_arena)) {
malloc_mutex_lock(tsdn, &cur_arena->large_mtx);
}
/*
* If we acquired the right lock and have some stats to flush,
* flush them.
*/
if (config_stats && stats_arena == cur_arena
&& *merge_stats != NULL) {
arena_stats_large_flush_nrequests_add(tsdn,
&stats_arena->stats, binind,
(*merge_stats)->nrequests);
*merge_stats = NULL;
}
/*
* Large allocations need special prep done. Afterwards, we can
* drop the large lock.
*/
for (unsigned i = 0; i < nflush; i++) {
void *ptr = arr->ptr[i];
edata = item_edata[i].edata;
assert(ptr != NULL && edata != NULL);
if (edata_arena_ind_get(edata) == cur_arena_ind) {
large_dalloc_prep_locked(tsdn, edata);
}
}
if (!arena_is_auto(cur_arena)) {
malloc_mutex_unlock(tsdn, &cur_arena->large_mtx);
}
/* Deallocate whatever we can. */
unsigned ndeferred = 0;
for (unsigned i = 0; i < nflush; i++) {
void *ptr = arr->ptr[i];
edata = item_edata[i].edata;
assert(ptr != NULL && edata != NULL);
if (edata_arena_ind_get(edata) != cur_arena_ind) {
/*
* The object was allocated either via a
* different arena, or a different bin in this
* arena. Either way, stash the object so that
* it can be handled in a future pass.
*/
arr->ptr[ndeferred] = ptr;
item_edata[ndeferred].edata = edata;
ndeferred++;
continue;
}
if (large_dalloc_safety_checks(
edata, ptr, sz_index2size(binind))) {
/* See the comment in isfree. */
continue;
}
large_dalloc_finish(tsdn, edata);
}
arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred);
nflush = ndeferred;
}
if (config_stats && *merge_stats != NULL) {
arena_stats_large_flush_nrequests_add(tsdn, &stats_arena->stats,
binind, (*merge_stats)->nrequests);
*merge_stats = NULL;
}
}
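The large path partitions differently: it handles everything owned by the first object's arena, compacts the leftovers to the front of the array, and loops until nothing is deferred, so it makes one locking pass per distinct arena. In miniature, with int keys standing in for arena indices (illustrative only):

#include <assert.h>

int
main(void)
{
	int arena_of[] = { 7, 9, 7, 8, 9 };
	unsigned nflush = 5, passes = 0;

	while (nflush > 0) {
		int cur = arena_of[0]; /* lock cur's arena here */
		unsigned ndeferred = 0;
		for (unsigned i = 0; i < nflush; i++) {
			if (arena_of[i] != cur) {
				/* Stash for a later pass. */
				arena_of[ndeferred++] = arena_of[i];
			}
			/* else: deallocate to cur's arena */
		}
		nflush = ndeferred; /* unlock, then repeat */
		passes++;
	}
	assert(passes == 3); /* one pass per distinct arena: 7, 9, 8 */
	return 0;
}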
JEMALLOC_ALWAYS_INLINE void
arena_ptr_array_flush_impl(tsd_t *tsd, szind_t binind,
cache_bin_ptr_array_t *arr, unsigned nflush, bool small,
arena_t *stats_arena, cache_bin_stats_t **merge_stats) {
/*
* A couple lookup calls take tsdn; declare it once for convenience
* instead of calling tsd_tsdn(tsd) all the time.
*/
tsdn_t *tsdn = tsd_tsdn(tsd);
/*
* Variable length array must have > 0 length; the last element is never
* touched (it's just included to satisfy the no-zero-length rule).
*/
VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1);
/*
* This gets compiled away when config_opt_safety_checks is false.
* Checks for sized deallocation bugs, failing early rather than
* corrupting metadata.
*/
size_t szind_sum = binind * nflush;
emap_edata_lookup_batch(tsd, &arena_emap_global, nflush,
&arena_ptr_array_flush_ptr_getter, (void *)arr,
&arena_ptr_array_flush_metadata_visitor, (void *)&szind_sum,
item_edata);
if (config_opt_safety_checks && unlikely(szind_sum != 0)) {
arena_ptr_array_flush_size_check_fail(
arr, binind, nflush, item_edata);
}
/*
* The small/large flush logic is very similar; you might conclude that
* it's a good opportunity to share code. We've tried this, and by and
* large found this to obscure more than it helps; there are so many
* fiddly bits around things like stats handling, precisely when and
* which mutexes are acquired, etc., that almost all code ends up being
* gated behind 'if (small) { ... } else { ... }'. Even though the
* '...' is morally equivalent, the code itself needs slight tweaks.
*/
if (small) {
return arena_ptr_array_flush_impl_small(tsdn, binind, arr,
item_edata, nflush, stats_arena, merge_stats);
} else {
return arena_ptr_array_flush_impl_large(tsdn, binind, arr,
item_edata, nflush, stats_arena, merge_stats);
}
}
/*
* In practice, pointers are flushed back to their original allocation arenas,
* so multiple arenas may be involved here. The input stats_arena simply
* indicates where the cache stats should be merged into.
*/
void
arena_ptr_array_flush(tsd_t *tsd, szind_t binind, cache_bin_ptr_array_t *arr,
unsigned nflush, bool small, arena_t *stats_arena,
cache_bin_stats_t merge_stats) {
assert(arr != NULL && arr->ptr != NULL);
/*
* The input cache bin stats represent a snapshot taken when the pointer
* array is set up, and will be merged into the next-level bin stats.
* The original bin stats will be reset by the caller itself.
* This separation ensures that each layer operates independently and
* does not modify another layer's data directly.
*/
cache_bin_stats_t *stats = &merge_stats;
unsigned nflush_batch, nflushed = 0;
cache_bin_ptr_array_t ptrs_batch;
do {
nflush_batch = nflush - nflushed;
if (nflush_batch > CACHE_BIN_NFLUSH_BATCH_MAX) {
nflush_batch = CACHE_BIN_NFLUSH_BATCH_MAX;
}
assert(nflush_batch <= CACHE_BIN_NFLUSH_BATCH_MAX);
(&ptrs_batch)->n = (cache_bin_sz_t)nflush_batch;
(&ptrs_batch)->ptr = arr->ptr + nflushed;
arena_ptr_array_flush_impl(tsd, binind, &ptrs_batch,
nflush_batch, small, stats_arena, &stats);
nflushed += nflush_batch;
} while (nflushed < nflush);
assert(nflush == nflushed);
assert((arr->ptr + nflush) == ((&ptrs_batch)->ptr + nflush_batch));
if (config_stats) {
assert(stats == NULL);
}
}
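arena_ptr_array_flush above walks the array in chunks of CACHE_BIN_NFLUSH_BATCH_MAX and threads merge_stats through as a pointer so it is consumed by exactly one batch; the final assert(stats == NULL) confirms that. The chunking arithmetic, using a hypothetical batch cap of 128 since the real constant's value is not shown in this diff:

#include <assert.h>

int
main(void)
{
	const unsigned batch_max = 128; /* illustrative value only */
	unsigned nflush = 300, nflushed = 0, nbatches = 0;

	while (nflushed < nflush) {
		unsigned nflush_batch = nflush - nflushed;
		if (nflush_batch > batch_max) {
			nflush_batch = batch_max;
		}
		/* flush_impl(ptr + nflushed, nflush_batch) */
		nflushed += nflush_batch;
		nbatches++;
	}
	assert(nflushed == 300 && nbatches == 3); /* 128 + 128 + 44 */
	return 0;
}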
bool
arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
size_t extra, bool zero, size_t *newsize) {
@@ -1890,7 +2235,8 @@ arena_init_huge(tsdn_t *tsdn, arena_t *a0) {
/* Make sure that b0 thp auto-switch won't happen concurrently here. */
malloc_mutex_lock(tsdn, &b0->mtx);
(&huge_arena_pac_thp)->thp_madvise = opt_huge_arena_pac_thp
-&& metadata_thp_enabled() && (opt_thp == thp_mode_do_nothing)
+&& metadata_thp_enabled()
+&& (opt_thp == thp_mode_do_nothing)
&& (init_system_thp_mode == system_thp_mode_madvise);
(&huge_arena_pac_thp)->auto_thp_switched =
b0->auto_thp_switched;

View file

@@ -601,15 +601,26 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
assert(tcache_slow->arena != NULL);
assert(!tcache_bin_disabled(binind, cache_bin, tcache_slow));
assert(cache_bin_ncached_get_local(cache_bin) == 0);
cache_bin_sz_t nfill = cache_bin_ncached_max_get(cache_bin)
>> tcache_nfill_small_lg_div_get(tcache_slow, binind);
if (nfill == 0) {
nfill = 1;
}
-arena_cache_bin_fill_small(tsdn, arena, cache_bin, binind,
-/* nfill_min */
-opt_experimental_tcache_gc ? ((nfill >> 1) + 1) : nfill,
-/* nfill_max */ nfill);
+cache_bin_sz_t nfill_min = opt_experimental_tcache_gc
+? ((nfill >> 1) + 1)
+: nfill;
+cache_bin_sz_t nfill_max = nfill;
+CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nfill_max);
+cache_bin_init_ptr_array_for_fill(cache_bin, &ptrs, nfill_max);
+cache_bin_sz_t filled = arena_ptr_array_fill_small(tsdn, arena, binind,
+&ptrs, /* nfill_min */ nfill_min, /* nfill_max */ nfill_max,
+cache_bin->tstats);
+cache_bin_finish_fill(cache_bin, &ptrs, filled);
+assert(filled >= nfill_min && filled <= nfill_max);
+assert(cache_bin_ncached_get_local(cache_bin) == filled);
tcache_slow->bin_refilled[binind] = true;
tcache_nfill_small_burst_prepare(tcache_slow, binind);
ret = cache_bin_alloc(cache_bin, tcache_success);
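The refill target here is nfill = ncached_max >> lg_div, clamped to at least 1; with opt_experimental_tcache_gc the minimum acceptable fill becomes (nfill >> 1) + 1, i.e. just over half. Worked out for a few cache sizes, where lg_div = 1 is an illustrative choice rather than a jemalloc default:

#include <stdio.h>

int
main(void)
{
	unsigned ncached_max_vals[] = { 1, 8, 256 };
	unsigned lg_div = 1; /* illustrative divisor */

	for (int i = 0; i < 3; i++) {
		unsigned nfill = ncached_max_vals[i] >> lg_div;
		if (nfill == 0) {
			nfill = 1; /* always request something */
		}
		unsigned nfill_min = (nfill >> 1) + 1;
		/* e.g. ncached_max=256: nfill=128, nfill_min=65 */
		printf("max=%u -> nfill=%u, nfill_min=%u\n",
		    ncached_max_vals[i], nfill, nfill_min);
	}
	return 0;
}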
@@ -617,363 +628,6 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
return ret;
}
static const void *
tcache_bin_flush_ptr_getter(void *arr_ctx, size_t ind) {
cache_bin_ptr_array_t *arr = (cache_bin_ptr_array_t *)arr_ctx;
return arr->ptr[ind];
}
static void
tcache_bin_flush_metadata_visitor(
void *szind_sum_ctx, emap_full_alloc_ctx_t *alloc_ctx) {
size_t *szind_sum = (size_t *)szind_sum_ctx;
*szind_sum -= alloc_ctx->szind;
util_prefetch_write_range(alloc_ctx->edata, sizeof(edata_t));
}
JEMALLOC_NOINLINE static void
tcache_bin_flush_size_check_fail(cache_bin_ptr_array_t *arr, szind_t szind,
size_t nptrs, emap_batch_lookup_result_t *edatas) {
bool found_mismatch = false;
for (size_t i = 0; i < nptrs; i++) {
szind_t true_szind = edata_szind_get(edatas[i].edata);
if (true_szind != szind) {
found_mismatch = true;
safety_check_fail_sized_dealloc(
/* current_dealloc */ false,
/* ptr */ tcache_bin_flush_ptr_getter(arr, i),
/* true_size */ sz_index2size(true_szind),
/* input_size */ sz_index2size(szind));
}
}
assert(found_mismatch);
}
static void
tcache_bin_flush_edatas_lookup(tsd_t *tsd, cache_bin_ptr_array_t *arr,
szind_t binind, size_t nflush, emap_batch_lookup_result_t *edatas) {
/*
* This gets compiled away when config_opt_safety_checks is false.
* Checks for sized deallocation bugs, failing early rather than
* corrupting metadata.
*/
size_t szind_sum = binind * nflush;
emap_edata_lookup_batch(tsd, &arena_emap_global, nflush,
&tcache_bin_flush_ptr_getter, (void *)arr,
&tcache_bin_flush_metadata_visitor, (void *)&szind_sum, edatas);
if (config_opt_safety_checks && unlikely(szind_sum != 0)) {
tcache_bin_flush_size_check_fail(arr, binind, nflush, edatas);
}
}
JEMALLOC_ALWAYS_INLINE void
tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache,
cache_bin_t *cache_bin, szind_t binind, cache_bin_ptr_array_t *ptrs,
unsigned nflush) {
tcache_slow_t *tcache_slow = tcache->tcache_slow;
/*
* A couple lookup calls take tsdn; declare it once for convenience
* instead of calling tsd_tsdn(tsd) all the time.
*/
tsdn_t *tsdn = tsd_tsdn(tsd);
assert(binind < SC_NBINS);
arena_t *tcache_arena = tcache_slow->arena;
assert(tcache_arena != NULL);
/*
* Variable length array must have > 0 length; the last element is never
* touched (it's just included to satisfy the no-zero-length rule).
*/
VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1);
tcache_bin_flush_edatas_lookup(tsd, ptrs, binind, nflush, item_edata);
/*
* The slabs where we freed the last remaining object in the slab (and
* so need to free the slab itself).
* Used only if small == true.
*/
unsigned dalloc_count = 0;
VARIABLE_ARRAY(edata_t *, dalloc_slabs, nflush + 1);
/*
* We're about to grab a bunch of locks. If one of them happens to be
* the one guarding the arena-level stats counters we flush our
* thread-local ones to, we do so under one critical section.
*/
bool merged_stats = false;
/*
* We maintain the invariant that all edatas yet to be flushed are
* contained in the half-open range [flush_start, flush_end). We'll
* repeatedly partition the array so that the unflushed items are at the
* end.
*/
unsigned flush_start = 0;
while (flush_start < nflush) {
/*
* After our partitioning step, all objects to flush will be in
* the half-open range [prev_flush_start, flush_start), and
* flush_start will be updated to correspond to the next loop
* iteration.
*/
unsigned prev_flush_start = flush_start;
edata_t *cur_edata = item_edata[flush_start].edata;
unsigned cur_arena_ind = edata_arena_ind_get(cur_edata);
arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);
unsigned cur_binshard = edata_binshard_get(cur_edata);
bin_t *cur_bin = arena_get_bin(cur_arena, binind, cur_binshard);
assert(cur_binshard < bin_infos[binind].n_shards);
/*
* Start off the partition; item_edata[i] always matches itself
* of course.
*/
flush_start++;
for (unsigned i = flush_start; i < nflush; i++) {
void *ptr = ptrs->ptr[i];
edata_t *edata = item_edata[i].edata;
assert(ptr != NULL && edata != NULL);
assert(
(uintptr_t)ptr >= (uintptr_t)edata_addr_get(edata));
assert(
(uintptr_t)ptr < (uintptr_t)edata_past_get(edata));
if (edata_arena_ind_get(edata) == cur_arena_ind
&& edata_binshard_get(edata) == cur_binshard) {
/* Swap the edatas. */
emap_batch_lookup_result_t temp_edata =
item_edata[flush_start];
item_edata[flush_start] = item_edata[i];
item_edata[i] = temp_edata;
/* Swap the pointers */
void *temp_ptr = ptrs->ptr[flush_start];
ptrs->ptr[flush_start] = ptrs->ptr[i];
ptrs->ptr[i] = temp_ptr;
flush_start++;
}
}
/* Make sure we implemented partitioning correctly. */
if (config_debug) {
for (unsigned i = prev_flush_start; i < flush_start;
i++) {
edata_t *edata = item_edata[i].edata;
unsigned arena_ind = edata_arena_ind_get(edata);
assert(arena_ind == cur_arena_ind);
unsigned binshard = edata_binshard_get(edata);
assert(binshard == cur_binshard);
}
for (unsigned i = flush_start; i < nflush; i++) {
edata_t *edata = item_edata[i].edata;
assert(
edata_arena_ind_get(edata) != cur_arena_ind
|| edata_binshard_get(edata)
!= cur_binshard);
}
}
/* Actually do the flushing. */
malloc_mutex_lock(tsdn, &cur_bin->lock);
/*
* Flush stats first, if that was the right lock. Note that we
* don't actually have to flush stats into the current thread's
* binshard. Flushing into any binshard in the same arena is
* enough; we don't expose stats on per-binshard basis (just
* per-bin).
*/
if (config_stats && tcache_arena == cur_arena
&& !merged_stats) {
merged_stats = true;
cur_bin->stats.nflushes++;
cur_bin->stats.nrequests += cache_bin->tstats.nrequests;
cache_bin->tstats.nrequests = 0;
}
/* Next flush objects. */
/* Init only to avoid used-uninitialized warning. */
arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
for (unsigned i = prev_flush_start; i < flush_start; i++) {
void *ptr = ptrs->ptr[i];
edata_t *edata = item_edata[i].edata;
if (arena_dalloc_bin_locked_step(tsdn, cur_arena,
cur_bin, &dalloc_bin_info, binind, edata,
ptr)) {
dalloc_slabs[dalloc_count] = edata;
dalloc_count++;
}
}
arena_dalloc_bin_locked_finish(
tsdn, cur_arena, cur_bin, &dalloc_bin_info);
malloc_mutex_unlock(tsdn, &cur_bin->lock);
arena_decay_ticks(
tsdn, cur_arena, flush_start - prev_flush_start);
}
/* Handle all deferred slab dalloc. */
for (unsigned i = 0; i < dalloc_count; i++) {
edata_t *slab = dalloc_slabs[i];
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
}
if (config_stats && !merged_stats) {
/*
* The flush loop didn't happen to flush to this
* thread's arena, so the stats didn't get merged.
* Manually do so now.
*/
bin_t *bin = arena_bin_choose(tsdn, tcache_arena, binind, NULL);
malloc_mutex_lock(tsdn, &bin->lock);
bin->stats.nflushes++;
bin->stats.nrequests += cache_bin->tstats.nrequests;
cache_bin->tstats.nrequests = 0;
malloc_mutex_unlock(tsdn, &bin->lock);
}
}
JEMALLOC_ALWAYS_INLINE void
tcache_bin_flush_impl_large(tsd_t *tsd, tcache_t *tcache,
cache_bin_t *cache_bin, szind_t binind, cache_bin_ptr_array_t *ptrs,
unsigned nflush) {
tcache_slow_t *tcache_slow = tcache->tcache_slow;
/*
* A couple lookup calls take tsdn; declare it once for convenience
* instead of calling tsd_tsdn(tsd) all the time.
*/
tsdn_t *tsdn = tsd_tsdn(tsd);
assert(binind < tcache_nbins_get(tcache_slow));
arena_t *tcache_arena = tcache_slow->arena;
assert(tcache_arena != NULL);
/*
* Variable length array must have > 0 length; the last element is never
* touched (it's just included to satisfy the no-zero-length rule).
*/
VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1);
tcache_bin_flush_edatas_lookup(tsd, ptrs, binind, nflush, item_edata);
/*
* We're about to grab a bunch of locks. If one of them happens to be
* the one guarding the arena-level stats counters we flush our
* thread-local ones to, we do so under one critical section.
*/
bool merged_stats = false;
while (nflush > 0) {
/* Lock the arena, or bin, associated with the first object. */
edata_t *edata = item_edata[0].edata;
unsigned cur_arena_ind = edata_arena_ind_get(edata);
arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);
if (!arena_is_auto(cur_arena)) {
malloc_mutex_lock(tsdn, &cur_arena->large_mtx);
}
/*
* If we acquired the right lock and have some stats to flush,
* flush them.
*/
if (config_stats && tcache_arena == cur_arena
&& !merged_stats) {
merged_stats = true;
arena_stats_large_flush_nrequests_add(tsdn,
&tcache_arena->stats, binind,
cache_bin->tstats.nrequests);
cache_bin->tstats.nrequests = 0;
}
/*
* Large allocations need special prep done. Afterwards, we can
* drop the large lock.
*/
for (unsigned i = 0; i < nflush; i++) {
void *ptr = ptrs->ptr[i];
edata = item_edata[i].edata;
assert(ptr != NULL && edata != NULL);
if (edata_arena_ind_get(edata) == cur_arena_ind) {
large_dalloc_prep_locked(tsdn, edata);
}
}
if (!arena_is_auto(cur_arena)) {
malloc_mutex_unlock(tsdn, &cur_arena->large_mtx);
}
/* Deallocate whatever we can. */
unsigned ndeferred = 0;
for (unsigned i = 0; i < nflush; i++) {
void *ptr = ptrs->ptr[i];
edata = item_edata[i].edata;
assert(ptr != NULL && edata != NULL);
if (edata_arena_ind_get(edata) != cur_arena_ind) {
/*
* The object was allocated either via a
* different arena, or a different bin in this
* arena. Either way, stash the object so that
* it can be handled in a future pass.
*/
ptrs->ptr[ndeferred] = ptr;
item_edata[ndeferred].edata = edata;
ndeferred++;
continue;
}
if (large_dalloc_safety_checks(
edata, ptr, sz_index2size(binind))) {
/* See the comment in isfree. */
continue;
}
large_dalloc_finish(tsdn, edata);
}
arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred);
nflush = ndeferred;
}
if (config_stats && !merged_stats) {
arena_stats_large_flush_nrequests_add(tsdn,
&tcache_arena->stats, binind, cache_bin->tstats.nrequests);
cache_bin->tstats.nrequests = 0;
}
}
JEMALLOC_ALWAYS_INLINE void
tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush, bool small) {
assert(ptrs != NULL && ptrs->ptr != NULL);
unsigned nflush_batch, nflushed = 0;
cache_bin_ptr_array_t ptrs_batch;
do {
nflush_batch = nflush - nflushed;
if (nflush_batch > CACHE_BIN_NFLUSH_BATCH_MAX) {
nflush_batch = CACHE_BIN_NFLUSH_BATCH_MAX;
}
assert(nflush_batch <= CACHE_BIN_NFLUSH_BATCH_MAX);
(&ptrs_batch)->n = (cache_bin_sz_t)nflush_batch;
(&ptrs_batch)->ptr = ptrs->ptr + nflushed;
/*
* The small/large flush logic is very similar; you might conclude that
* it's a good opportunity to share code. We've tried this, and by and
* large found this to obscure more than it helps; there are so many
* fiddly bits around things like stats handling, precisely when and
* which mutexes are acquired, etc., that almost all code ends up being
* gated behind 'if (small) { ... } else { ... }'. Even though the
* '...' is morally equivalent, the code itself needs slight tweaks.
*/
if (small) {
tcache_bin_flush_impl_small(tsd, tcache, cache_bin,
binind, &ptrs_batch, nflush_batch);
} else {
tcache_bin_flush_impl_large(tsd, tcache, cache_bin,
binind, &ptrs_batch, nflush_batch);
}
nflushed += nflush_batch;
} while (nflushed < nflush);
assert(nflush == nflushed);
assert((ptrs->ptr + nflush) == ((&ptrs_batch)->ptr + nflush_batch));
}
JEMALLOC_ALWAYS_INLINE void
tcache_bin_flush_bottom(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
szind_t binind, unsigned rem, bool small) {
@@ -1001,8 +655,8 @@ tcache_bin_flush_bottom(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nflush);
cache_bin_init_ptr_array_for_flush(cache_bin, &ptrs, nflush);
-tcache_bin_flush_impl(
-tsd, tcache, cache_bin, binind, &ptrs, nflush, small);
+arena_ptr_array_flush(tsd, binind, &ptrs, nflush, small,
+tcache->tcache_slow->arena, cache_bin->tstats);
cache_bin_finish_flush(cache_bin, &ptrs, nflush);
}
@@ -1054,8 +708,8 @@ tcache_bin_flush_stashed(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
cache_bin_init_ptr_array_for_stashed(
cache_bin, binind, &ptrs, nstashed);
san_check_stashed_ptrs(ptrs.ptr, nstashed, sz_index2size(binind));
-tcache_bin_flush_impl(
-tsd, tcache, cache_bin, binind, &ptrs, nstashed, is_small);
+arena_ptr_array_flush(tsd, binind, &ptrs, nstashed, is_small,
+tcache->tcache_slow->arena, cache_bin->tstats);
cache_bin_finish_flush_stashed(cache_bin);
assert(cache_bin_nstashed_get_local(cache_bin) == 0);