diff --git a/src/tcache.c b/src/tcache.c
index 2114ff95..8bec5d6c 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -312,20 +312,9 @@ tcache_bin_flush_edatas_lookup(tsd_t *tsd, cache_bin_ptr_array_t *arr,
 	}
 }
 
-JEMALLOC_ALWAYS_INLINE bool
-tcache_bin_flush_match(edata_t *edata, unsigned cur_arena_ind,
-    unsigned cur_binshard, bool small) {
-	if (small) {
-		return edata_arena_ind_get(edata) == cur_arena_ind
-		    && edata_binshard_get(edata) == cur_binshard;
-	} else {
-		return edata_arena_ind_get(edata) == cur_arena_ind;
-	}
-}
-
 JEMALLOC_ALWAYS_INLINE void
-tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
-    szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush, bool small) {
+tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
+    szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush) {
 	tcache_slow_t *tcache_slow = tcache->tcache_slow;
 	/*
 	 * A couple lookup calls take tsdn; declare it once for convenience
@@ -333,11 +322,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 	 */
 	tsdn_t *tsdn = tsd_tsdn(tsd);
 
-	if (small) {
-		assert(binind < SC_NBINS);
-	} else {
-		assert(binind < tcache_nbins_get(tcache_slow));
-	}
+	assert(binind < SC_NBINS);
 	arena_t *tcache_arena = tcache_slow->arena;
 	assert(tcache_arena != NULL);
 
@@ -368,34 +353,19 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 		unsigned cur_arena_ind = edata_arena_ind_get(edata);
 		arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);
 
+		unsigned cur_binshard = edata_binshard_get(edata);
+		bin_t *cur_bin = arena_get_bin(cur_arena, binind,
+		    cur_binshard);
+		assert(cur_binshard < bin_infos[binind].n_shards);
 		/*
-		 * These assignments are always overwritten when small is true,
-		 * and their values are always ignored when small is false, but
-		 * to avoid the technical UB when we pass them as parameters, we
-		 * need to intialize them.
+		 * If you're looking at profiles, you might think this
+		 * is a good place to prefetch the bin stats, which are
+		 * often a cache miss.  This turns out not to be
+		 * helpful on the workloads we've looked at, with moving
+		 * the bin stats next to the lock seeming to do better.
 		 */
-		unsigned cur_binshard = 0;
-		bin_t *cur_bin = NULL;
-		if (small) {
-			cur_binshard = edata_binshard_get(edata);
-			cur_bin = arena_get_bin(cur_arena, binind,
-			    cur_binshard);
-			assert(cur_binshard < bin_infos[binind].n_shards);
-			/*
-			 * If you're looking at profiles, you might think this
-			 * is a good place to prefetch the bin stats, which are
-			 * often a cache miss.  This turns out not to be
-			 * helpful on the workloads we've looked at, with moving
-			 * the bin stats next to the lock seeming to do better.
-			 */
-		}
 
-		if (small) {
-			malloc_mutex_lock(tsdn, &cur_bin->lock);
-		}
-		if (!small && !arena_is_auto(cur_arena)) {
-			malloc_mutex_lock(tsdn, &cur_arena->large_mtx);
-		}
+		malloc_mutex_lock(tsdn, &cur_bin->lock);
 
 		/*
 		 * If we acquired the right lock and have some stats to flush,
@@ -404,53 +374,23 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 		if (config_stats && tcache_arena == cur_arena
 		    && !merged_stats) {
 			merged_stats = true;
-			if (small) {
-				cur_bin->stats.nflushes++;
-				cur_bin->stats.nrequests +=
-				    cache_bin->tstats.nrequests;
-				cache_bin->tstats.nrequests = 0;
-			} else {
-				arena_stats_large_flush_nrequests_add(tsdn,
-				    &tcache_arena->stats, binind,
-				    cache_bin->tstats.nrequests);
-				cache_bin->tstats.nrequests = 0;
-			}
-		}
-
-		/*
-		 * Large allocations need special prep done.  Afterwards, we can
-		 * drop the large lock.
-		 */
-		if (!small) {
-			for (unsigned i = 0; i < nflush; i++) {
-				void *ptr = ptrs->ptr[i];
-				edata = item_edata[i].edata;
-				assert(ptr != NULL && edata != NULL);
-
-				if (tcache_bin_flush_match(edata, cur_arena_ind,
-				    cur_binshard, small)) {
-					large_dalloc_prep_locked(tsdn,
-					    edata);
-				}
-			}
-		}
-		if (!small && !arena_is_auto(cur_arena)) {
-			malloc_mutex_unlock(tsdn, &cur_arena->large_mtx);
+			cur_bin->stats.nflushes++;
+			cur_bin->stats.nrequests +=
+			    cache_bin->tstats.nrequests;
+			cache_bin->tstats.nrequests = 0;
 		}
 
 		/* Deallocate whatever we can. */
 		unsigned ndeferred = 0;
 		/* Init only to avoid used-uninitialized warning. */
 		arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
-		if (small) {
-			arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
-		}
+		arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
 		for (unsigned i = 0; i < nflush; i++) {
 			void *ptr = ptrs->ptr[i];
 			edata = item_edata[i].edata;
 			assert(ptr != NULL && edata != NULL);
-			if (!tcache_bin_flush_match(edata, cur_arena_ind,
-			    cur_binshard, small)) {
+			if (edata_arena_ind_get(edata) != cur_arena_ind
+			    || edata_binshard_get(edata) != cur_binshard) {
 				/*
 				 * The object was allocated either via a
 				 * different arena, or a different bin in this
@@ -462,34 +402,23 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 				ndeferred++;
 				continue;
 			}
-			if (small) {
-				if (arena_dalloc_bin_locked_step(tsdn,
-				    cur_arena, cur_bin, &dalloc_bin_info,
-				    binind, edata, ptr)) {
-					dalloc_slabs[dalloc_count] = edata;
-					dalloc_count++;
-				}
-			} else {
-				if (large_dalloc_safety_checks(edata, ptr,
-				    binind)) {
-					/* See the comment in isfree. */
-					continue;
-				}
-				large_dalloc_finish(tsdn, edata);
+			if (arena_dalloc_bin_locked_step(tsdn,
+			    cur_arena, cur_bin, &dalloc_bin_info,
+			    binind, edata, ptr)) {
+				dalloc_slabs[dalloc_count] = edata;
+				dalloc_count++;
 			}
 		}
 
-		if (small) {
-			arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
-			    &dalloc_bin_info);
-			malloc_mutex_unlock(tsdn, &cur_bin->lock);
-		}
+		arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
+		    &dalloc_bin_info);
+		malloc_mutex_unlock(tsdn, &cur_bin->lock);
+
 		arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred);
 		nflush = ndeferred;
 	}
 
 	/* Handle all deferred slab dalloc. */
-	assert(small || dalloc_count == 0);
 	for (unsigned i = 0; i < dalloc_count; i++) {
 		edata_t *slab = dalloc_slabs[i];
 		arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
@@ -497,7 +426,6 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 	}
 
 	if (config_stats && !merged_stats) {
-		if (small) {
 			/*
 			 * The flush loop didn't happen to flush to this
 			 * thread's arena, so the stats didn't get merged.
@@ -510,14 +438,132 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 			bin->stats.nrequests += cache_bin->tstats.nrequests;
 			cache_bin->tstats.nrequests = 0;
 			malloc_mutex_unlock(tsdn, &bin->lock);
-		} else {
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+tcache_bin_flush_impl_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
+    szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush) {
+	tcache_slow_t *tcache_slow = tcache->tcache_slow;
+	/*
+	 * A couple lookup calls take tsdn; declare it once for convenience
+	 * instead of calling tsd_tsdn(tsd) all the time.
+	 */
+	tsdn_t *tsdn = tsd_tsdn(tsd);
+
+	assert(binind < tcache_nbins_get(tcache_slow));
+	arena_t *tcache_arena = tcache_slow->arena;
+	assert(tcache_arena != NULL);
+
+	/*
+	 * Variable length array must have > 0 length; the last element is never
+	 * touched (it's just included to satisfy the no-zero-length rule).
+	 */
+	VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1);
+	tcache_bin_flush_edatas_lookup(tsd, ptrs, binind, nflush, item_edata);
+
+	/*
+	 * We're about to grab a bunch of locks.  If one of them happens to be
+	 * the one guarding the arena-level stats counters we flush our
+	 * thread-local ones to, we do so under one critical section.
+	 */
+	bool merged_stats = false;
+	while (nflush > 0) {
+		/* Lock the arena, or bin, associated with the first object. */
+		edata_t *edata = item_edata[0].edata;
+		unsigned cur_arena_ind = edata_arena_ind_get(edata);
+		arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);
+
+		if (!arena_is_auto(cur_arena)) {
+			malloc_mutex_lock(tsdn, &cur_arena->large_mtx);
+		}
+
+		/*
+		 * If we acquired the right lock and have some stats to flush,
+		 * flush them.
+		 */
+		if (config_stats && tcache_arena == cur_arena
+		    && !merged_stats) {
+			merged_stats = true;
 			arena_stats_large_flush_nrequests_add(tsdn,
 			    &tcache_arena->stats, binind,
 			    cache_bin->tstats.nrequests);
 			cache_bin->tstats.nrequests = 0;
 		}
+
+		/*
+		 * Large allocations need special prep done.  Afterwards, we can
+		 * drop the large lock.
+		 */
+		for (unsigned i = 0; i < nflush; i++) {
+			void *ptr = ptrs->ptr[i];
+			edata = item_edata[i].edata;
+			assert(ptr != NULL && edata != NULL);
+
+			if (edata_arena_ind_get(edata) == cur_arena_ind) {
+				large_dalloc_prep_locked(tsdn,
+				    edata);
+			}
+		}
+		if (!arena_is_auto(cur_arena)) {
+			malloc_mutex_unlock(tsdn, &cur_arena->large_mtx);
+		}
+
+		/* Deallocate whatever we can. */
+		unsigned ndeferred = 0;
+		for (unsigned i = 0; i < nflush; i++) {
+			void *ptr = ptrs->ptr[i];
+			edata = item_edata[i].edata;
+			assert(ptr != NULL && edata != NULL);
+			if (edata_arena_ind_get(edata) != cur_arena_ind) {
+				/*
+				 * The object was allocated either via a
+				 * different arena, or a different bin in this
+				 * arena.  Either way, stash the object so that
+				 * it can be handled in a future pass.
+				 */
+				ptrs->ptr[ndeferred] = ptr;
+				item_edata[ndeferred].edata = edata;
+				ndeferred++;
+				continue;
+			}
+			if (large_dalloc_safety_checks(edata, ptr, binind)) {
+				/* See the comment in isfree. */
+				continue;
+			}
+			large_dalloc_finish(tsdn, edata);
+		}
+		arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred);
+		nflush = ndeferred;
 	}
 
+	if (config_stats && !merged_stats) {
+		arena_stats_large_flush_nrequests_add(tsdn,
+		    &tcache_arena->stats, binind,
+		    cache_bin->tstats.nrequests);
+		cache_bin->tstats.nrequests = 0;
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
+    szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush, bool small) {
+	/*
+	 * The small/large flush logic is very similar; you might conclude that
+	 * it's a good opportunity to share code.  We've tried this, and by and
+	 * large found this to obscure more than it helps; there are so many
+	 * fiddly bits around things like stats handling, precisely when and
+	 * which mutexes are acquired, etc., that almost all code ends up being
+	 * gated behind 'if (small) { ... } else { ... }'.  Even though the
+	 * '...' is morally equivalent, the code itself needs slight tweaks.
+	 */
+	if (small) {
+		tcache_bin_flush_impl_small(tsd, tcache, cache_bin, binind,
+		    ptrs, nflush);
+	} else {
+		tcache_bin_flush_impl_large(tsd, tcache, cache_bin, binind,
+		    ptrs, nflush);
+	}
 }
 
 JEMALLOC_ALWAYS_INLINE void
@@ -556,13 +602,15 @@ tcache_bin_flush_bottom(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 void
 tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
     szind_t binind, unsigned rem) {
-	tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, true);
+	tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem,
+	    /* small */ true);
 }
 
 void
 tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
     szind_t binind, unsigned rem) {
-	tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, false);
+	tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem,
+	    /* small */ false);
 }
 
 /*