mirror of
https://github.com/jemalloc/jemalloc.git
synced 2026-04-14 14:41:42 +03:00
Tcache: Split up small/large handling.
The main bits of shared code are the edata filtering and the stats flushing logic, both of which are fairly simple to read and not so painful to duplicate. The shared code comes at the cost of guarding all the subtle logic with `if (small)`, which doesn't feel worth it.
This commit is contained in:
parent
c085530c71
commit
6e56848850
1 changed files with 152 additions and 104 deletions
256
src/tcache.c
256
src/tcache.c
|
|
@ -312,20 +312,9 @@ tcache_bin_flush_edatas_lookup(tsd_t *tsd, cache_bin_ptr_array_t *arr,
|
|||
}
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE bool
|
||||
tcache_bin_flush_match(edata_t *edata, unsigned cur_arena_ind,
|
||||
unsigned cur_binshard, bool small) {
|
||||
if (small) {
|
||||
return edata_arena_ind_get(edata) == cur_arena_ind
|
||||
&& edata_binshard_get(edata) == cur_binshard;
|
||||
} else {
|
||||
return edata_arena_ind_get(edata) == cur_arena_ind;
|
||||
}
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
||||
szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush, bool small) {
|
||||
tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
||||
szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush) {
|
||||
tcache_slow_t *tcache_slow = tcache->tcache_slow;
|
||||
/*
|
||||
* A couple lookup calls take tsdn; declare it once for convenience
|
||||
|
|
@ -333,11 +322,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
|||
*/
|
||||
tsdn_t *tsdn = tsd_tsdn(tsd);
|
||||
|
||||
if (small) {
|
||||
assert(binind < SC_NBINS);
|
||||
} else {
|
||||
assert(binind < tcache_nbins_get(tcache_slow));
|
||||
}
|
||||
assert(binind < SC_NBINS);
|
||||
arena_t *tcache_arena = tcache_slow->arena;
|
||||
assert(tcache_arena != NULL);
|
||||
|
||||
|
|
@ -368,34 +353,19 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
|||
unsigned cur_arena_ind = edata_arena_ind_get(edata);
|
||||
arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);
|
||||
|
||||
unsigned cur_binshard = edata_binshard_get(edata);
|
||||
bin_t *cur_bin = arena_get_bin(cur_arena, binind,
|
||||
cur_binshard);
|
||||
assert(cur_binshard < bin_infos[binind].n_shards);
|
||||
/*
|
||||
* These assignments are always overwritten when small is true,
|
||||
* and their values are always ignored when small is false, but
|
||||
* to avoid the technical UB when we pass them as parameters, we
|
||||
* need to initialize them.
|
||||
* If you're looking at profiles, you might think this
|
||||
* is a good place to prefetch the bin stats, which are
|
||||
* often a cache miss. This turns out not to be
|
||||
* helpful on the workloads we've looked at, with moving
|
||||
* the bin stats next to the lock seeming to do better.
|
||||
*/
|
||||
unsigned cur_binshard = 0;
|
||||
bin_t *cur_bin = NULL;
|
||||
if (small) {
|
||||
cur_binshard = edata_binshard_get(edata);
|
||||
cur_bin = arena_get_bin(cur_arena, binind,
|
||||
cur_binshard);
|
||||
assert(cur_binshard < bin_infos[binind].n_shards);
|
||||
/*
|
||||
* If you're looking at profiles, you might think this
|
||||
* is a good place to prefetch the bin stats, which are
|
||||
* often a cache miss. This turns out not to be
|
||||
* helpful on the workloads we've looked at, with moving
|
||||
* the bin stats next to the lock seeming to do better.
|
||||
*/
|
||||
}
|
||||
|
||||
if (small) {
|
||||
malloc_mutex_lock(tsdn, &cur_bin->lock);
|
||||
}
|
||||
if (!small && !arena_is_auto(cur_arena)) {
|
||||
malloc_mutex_lock(tsdn, &cur_arena->large_mtx);
|
||||
}
|
||||
malloc_mutex_lock(tsdn, &cur_bin->lock);
|
||||
|
||||
/*
|
||||
* If we acquired the right lock and have some stats to flush,
|
||||
|
|
@ -404,53 +374,23 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
|||
if (config_stats && tcache_arena == cur_arena
|
||||
&& !merged_stats) {
|
||||
merged_stats = true;
|
||||
if (small) {
|
||||
cur_bin->stats.nflushes++;
|
||||
cur_bin->stats.nrequests +=
|
||||
cache_bin->tstats.nrequests;
|
||||
cache_bin->tstats.nrequests = 0;
|
||||
} else {
|
||||
arena_stats_large_flush_nrequests_add(tsdn,
|
||||
&tcache_arena->stats, binind,
|
||||
cache_bin->tstats.nrequests);
|
||||
cache_bin->tstats.nrequests = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Large allocations need special prep done. Afterwards, we can
|
||||
* drop the large lock.
|
||||
*/
|
||||
if (!small) {
|
||||
for (unsigned i = 0; i < nflush; i++) {
|
||||
void *ptr = ptrs->ptr[i];
|
||||
edata = item_edata[i].edata;
|
||||
assert(ptr != NULL && edata != NULL);
|
||||
|
||||
if (tcache_bin_flush_match(edata, cur_arena_ind,
|
||||
cur_binshard, small)) {
|
||||
large_dalloc_prep_locked(tsdn,
|
||||
edata);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!small && !arena_is_auto(cur_arena)) {
|
||||
malloc_mutex_unlock(tsdn, &cur_arena->large_mtx);
|
||||
cur_bin->stats.nflushes++;
|
||||
cur_bin->stats.nrequests +=
|
||||
cache_bin->tstats.nrequests;
|
||||
cache_bin->tstats.nrequests = 0;
|
||||
}
|
||||
|
||||
/* Deallocate whatever we can. */
|
||||
unsigned ndeferred = 0;
|
||||
/* Init only to avoid used-uninitialized warning. */
|
||||
arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
|
||||
if (small) {
|
||||
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
|
||||
}
|
||||
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
|
||||
for (unsigned i = 0; i < nflush; i++) {
|
||||
void *ptr = ptrs->ptr[i];
|
||||
edata = item_edata[i].edata;
|
||||
assert(ptr != NULL && edata != NULL);
|
||||
if (!tcache_bin_flush_match(edata, cur_arena_ind,
|
||||
cur_binshard, small)) {
|
||||
if (edata_arena_ind_get(edata) != cur_arena_ind
|
||||
|| edata_binshard_get(edata) != cur_binshard) {
|
||||
/*
|
||||
* The object was allocated either via a
|
||||
* different arena, or a different bin in this
|
||||
|
|
@ -462,34 +402,23 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
|||
ndeferred++;
|
||||
continue;
|
||||
}
|
||||
if (small) {
|
||||
if (arena_dalloc_bin_locked_step(tsdn,
|
||||
cur_arena, cur_bin, &dalloc_bin_info,
|
||||
binind, edata, ptr)) {
|
||||
dalloc_slabs[dalloc_count] = edata;
|
||||
dalloc_count++;
|
||||
}
|
||||
} else {
|
||||
if (large_dalloc_safety_checks(edata, ptr,
|
||||
binind)) {
|
||||
/* See the comment in isfree. */
|
||||
continue;
|
||||
}
|
||||
large_dalloc_finish(tsdn, edata);
|
||||
if (arena_dalloc_bin_locked_step(tsdn,
|
||||
cur_arena, cur_bin, &dalloc_bin_info,
|
||||
binind, edata, ptr)) {
|
||||
dalloc_slabs[dalloc_count] = edata;
|
||||
dalloc_count++;
|
||||
}
|
||||
}
|
||||
|
||||
if (small) {
|
||||
arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
|
||||
&dalloc_bin_info);
|
||||
malloc_mutex_unlock(tsdn, &cur_bin->lock);
|
||||
}
|
||||
arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
|
||||
&dalloc_bin_info);
|
||||
malloc_mutex_unlock(tsdn, &cur_bin->lock);
|
||||
|
||||
arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred);
|
||||
nflush = ndeferred;
|
||||
}
|
||||
|
||||
/* Handle all deferred slab dalloc. */
|
||||
assert(small || dalloc_count == 0);
|
||||
for (unsigned i = 0; i < dalloc_count; i++) {
|
||||
edata_t *slab = dalloc_slabs[i];
|
||||
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
|
||||
|
|
@ -497,7 +426,6 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
|||
}
|
||||
|
||||
if (config_stats && !merged_stats) {
|
||||
if (small) {
|
||||
/*
|
||||
* The flush loop didn't happen to flush to this
|
||||
* thread's arena, so the stats didn't get merged.
|
||||
|
|
@ -510,14 +438,132 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
|||
bin->stats.nrequests += cache_bin->tstats.nrequests;
|
||||
cache_bin->tstats.nrequests = 0;
|
||||
malloc_mutex_unlock(tsdn, &bin->lock);
|
||||
} else {
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Large-object flush path: returns the first `nflush` pointers in `ptrs`
 * from `cache_bin` to their owning arenas.  Each pass of the outer loop
 * handles every object that belongs to the same arena as the first
 * still-pending object; objects from other arenas are stashed and
 * handled in a later pass.
 */
JEMALLOC_ALWAYS_INLINE void
tcache_bin_flush_impl_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
    szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush) {
	tcache_slow_t *tcache_slow = tcache->tcache_slow;
	/*
	 * A couple lookup calls take tsdn; declare it once for convenience
	 * instead of calling tsd_tsdn(tsd) all the time.
	 */
	tsdn_t *tsdn = tsd_tsdn(tsd);

	assert(binind < tcache_nbins_get(tcache_slow));
	arena_t *tcache_arena = tcache_slow->arena;
	assert(tcache_arena != NULL);

	/*
	 * Variable length array must have > 0 length; the last element is never
	 * touched (it's just included to satisfy the no-zero-length rule).
	 */
	VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1);
	tcache_bin_flush_edatas_lookup(tsd, ptrs, binind, nflush, item_edata);

	/*
	 * We're about to grab a bunch of locks.  If one of them happens to be
	 * the one guarding the arena-level stats counters we flush our
	 * thread-local ones to, we do so under one critical section.
	 */
	bool merged_stats = false;
	while (nflush > 0) {
		/* Lock the arena, or bin, associated with the first object. */
		edata_t *edata = item_edata[0].edata;
		unsigned cur_arena_ind = edata_arena_ind_get(edata);
		arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);

		/* Only non-auto arenas take the large_mtx here. */
		if (!arena_is_auto(cur_arena)) {
			malloc_mutex_lock(tsdn, &cur_arena->large_mtx);
		}

		/*
		 * If we acquired the right lock and have some stats to flush,
		 * flush them.
		 */
		if (config_stats && tcache_arena == cur_arena
		    && !merged_stats) {
			merged_stats = true;
			arena_stats_large_flush_nrequests_add(tsdn,
			    &tcache_arena->stats, binind,
			    cache_bin->tstats.nrequests);
			cache_bin->tstats.nrequests = 0;
		}

		/*
		 * Large allocations need special prep done.  Afterwards, we can
		 * drop the large lock.
		 */
		for (unsigned i = 0; i < nflush; i++) {
			void *ptr = ptrs->ptr[i];
			edata = item_edata[i].edata;
			assert(ptr != NULL && edata != NULL);

			if (edata_arena_ind_get(edata) == cur_arena_ind) {
				large_dalloc_prep_locked(tsdn,
				    edata);
			}
		}
		if (!arena_is_auto(cur_arena)) {
			malloc_mutex_unlock(tsdn, &cur_arena->large_mtx);
		}

		/* Deallocate whatever we can. */
		unsigned ndeferred = 0;
		for (unsigned i = 0; i < nflush; i++) {
			void *ptr = ptrs->ptr[i];
			edata = item_edata[i].edata;
			assert(ptr != NULL && edata != NULL);
			if (edata_arena_ind_get(edata) != cur_arena_ind) {
				/*
				 * The object was allocated either via a
				 * different arena, or a different bin in this
				 * arena.  Either way, stash the object so that
				 * it can be handled in a future pass.
				 */
				ptrs->ptr[ndeferred] = ptr;
				item_edata[ndeferred].edata = edata;
				ndeferred++;
				continue;
			}
			if (large_dalloc_safety_checks(edata, ptr, binind)) {
				/* See the comment in isfree. */
				continue;
			}
			large_dalloc_finish(tsdn, edata);
		}
		/* Count everything flushed this pass toward decay ticks. */
		arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred);
		/* Deferred objects become the work list for the next pass. */
		nflush = ndeferred;
	}

	/*
	 * The flush loop never hit this thread's own arena, so the
	 * thread-local request stats still need to be merged.
	 */
	if (config_stats && !merged_stats) {
		arena_stats_large_flush_nrequests_add(tsdn,
		    &tcache_arena->stats, binind,
		    cache_bin->tstats.nrequests);
		cache_bin->tstats.nrequests = 0;
	}
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
||||
szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush, bool small) {
|
||||
/*
|
||||
* The small/large flush logic is very similar; you might conclude that
|
||||
* it's a good opportunity to share code. We've tried this, and by and
|
||||
* large found this to obscure more than it helps; there are so many
|
||||
* fiddly bits around things like stats handling, precisely when and
|
||||
* which mutexes are acquired, etc., that almost all code ends up being
|
||||
* gated behind 'if (small) { ... } else { ... }'. Even though the
|
||||
* '...' is morally equivalent, the code itself needs slight tweaks.
|
||||
*/
|
||||
if (small) {
|
||||
tcache_bin_flush_impl_small(tsd, tcache, cache_bin, binind,
|
||||
ptrs, nflush);
|
||||
} else {
|
||||
tcache_bin_flush_impl_large(tsd, tcache, cache_bin, binind,
|
||||
ptrs, nflush);
|
||||
}
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
|
|
@ -556,13 +602,15 @@ tcache_bin_flush_bottom(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
|||
void
|
||||
tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
||||
szind_t binind, unsigned rem) {
|
||||
tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, true);
|
||||
tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem,
|
||||
/* small */ true);
|
||||
}
|
||||
|
||||
void
|
||||
tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
|
||||
szind_t binind, unsigned rem) {
|
||||
tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, false);
|
||||
tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem,
|
||||
/* small */ false);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue