From 47c9bcd402110be3f64517ad9366d1cfaa751d48 Mon Sep 17 00:00:00 2001
From: Shirui Cheng <sherrycheng@meta.com>
Date: Thu, 18 Jul 2024 17:33:07 -0700
Subject: [PATCH] Use a for-loop to fulfill flush requests that are larger than
 CACHE_BIN_NFLUSH_BATCH_MAX items

---
 include/jemalloc/internal/cache_bin.h |  8 +++++
 src/tcache.c                          | 47 ++++++++++++++++++---------
 2 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h
index a26c3671..a7a5e40e 100644
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@@ -600,6 +600,14 @@ cache_bin_nitems_get_remote(cache_bin_t *bin, cache_bin_sz_t *ncached,
 	 */
 }
 
+/*
+ * Limit how many items can be flushed in a batch (Which is the upper bound
+ * for the nflush parameter in tcache_bin_flush_impl()).
+ * This is to avoid stack overflow when we do batch edata look up, which
+ * reserves a nflush * sizeof(emap_batch_lookup_result_t) stack variable.
+ */
+#define CACHE_BIN_NFLUSH_BATCH_MAX (VARIABLE_ARRAY_SIZE_MAX >> LG_SIZEOF_PTR)
+
 /*
  * Filling and flushing are done in batch, on arrays of void *s.  For filling,
  * the arrays go forward, and can be accessed with ordinary array arithmetic.
diff --git a/src/tcache.c b/src/tcache.c
index 03ec5136..4144524d 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -712,22 +712,37 @@ tcache_bin_flush_impl_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin
 JEMALLOC_ALWAYS_INLINE void
 tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
     szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush, bool small) {
-	/*
-	 * The small/large flush logic is very similar; you might conclude that
-	 * it's a good opportunity to share code.  We've tried this, and by and
-	 * large found this to obscure more than it helps; there are so many
-	 * fiddly bits around things like stats handling, precisely when and
-	 * which mutexes are acquired, etc., that almost all code ends up being
-	 * gated behind 'if (small) { ... } else { ... }'.  Even though the
-	 * '...' is morally equivalent, the code itself needs slight tweaks.
-	 */
-	if (small) {
-		tcache_bin_flush_impl_small(tsd, tcache, cache_bin, binind,
-		    ptrs, nflush);
-	} else {
-		tcache_bin_flush_impl_large(tsd, tcache, cache_bin, binind,
-		    ptrs, nflush);
-	}
+	assert(ptrs != NULL && ptrs->ptr != NULL);
+	unsigned nflush_batch, nflushed = 0;
+	cache_bin_ptr_array_t ptrs_batch;
+	do {
+		nflush_batch = nflush - nflushed;
+		if (nflush_batch > CACHE_BIN_NFLUSH_BATCH_MAX) {
+			nflush_batch = CACHE_BIN_NFLUSH_BATCH_MAX;
+		}
+		assert(nflush_batch <= CACHE_BIN_NFLUSH_BATCH_MAX);
+		(&ptrs_batch)->n = (cache_bin_sz_t)nflush_batch;
+		(&ptrs_batch)->ptr = ptrs->ptr + nflushed;
+		/*
+		 * The small/large flush logic is very similar; you might conclude that
+		 * it's a good opportunity to share code.  We've tried this, and by and
+		 * large found this to obscure more than it helps; there are so many
+		 * fiddly bits around things like stats handling, precisely when and
+		 * which mutexes are acquired, etc., that almost all code ends up being
+		 * gated behind 'if (small) { ... } else { ... }'.  Even though the
+		 * '...' is morally equivalent, the code itself needs slight tweaks.
+		 */
+		if (small) {
+			tcache_bin_flush_impl_small(tsd, tcache, cache_bin, binind,
+			    &ptrs_batch, nflush_batch);
+		} else {
+			tcache_bin_flush_impl_large(tsd, tcache, cache_bin, binind,
+			    &ptrs_batch, nflush_batch);
+		}
+		nflushed += nflush_batch;
+	} while (nflushed < nflush);
+	assert(nflush == nflushed);
+	assert((ptrs->ptr + nflush) == ((&ptrs_batch)->ptr + nflush_batch));
 }
 
 JEMALLOC_ALWAYS_INLINE void