diff --git a/include/jemalloc/internal/sz.h b/include/jemalloc/internal/sz.h index 56c63123..7eef7bce 100644 --- a/include/jemalloc/internal/sz.h +++ b/include/jemalloc/internal/sz.h @@ -310,6 +310,17 @@ sz_size2index_usize_fastpath(size_t size, szind_t *ind, size_t *usize) { } } +JEMALLOC_ALWAYS_INLINE size_t +sz_s2u_size_class_compute(size_t size) { + size_t x = lg_floor((size<<1)-1); + size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) + ? LG_QUANTUM : x - SC_LG_NGROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (size + delta_mask) & ~delta_mask; + return usize; +} + JEMALLOC_ALWAYS_INLINE size_t sz_s2u_compute(size_t size) { if (unlikely(size > SC_LARGE_MAXCLASS)) { @@ -328,13 +339,7 @@ sz_s2u_compute(size_t size) { } #endif if (!sz_limit_usize_gap_enabled() || size <= SC_SMALL_MAXCLASS) { - size_t x = lg_floor((size<<1)-1); - size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) - ? LG_QUANTUM : x - SC_LG_NGROUP - 1; - size_t delta = ZU(1) << lg_delta; - size_t delta_mask = delta - 1; - size_t usize = (size + delta_mask) & ~delta_mask; - return usize; + return sz_s2u_size_class_compute(size); } else { size_t usize = ((size + PAGE - 1) >> LG_PAGE) << LG_PAGE; assert(usize - size < PAGE); diff --git a/src/pac.c b/src/pac.c index 57a0c953..28cf2c39 100644 --- a/src/pac.c +++ b/src/pac.c @@ -112,10 +112,24 @@ pac_may_have_muzzy(pac_t *pac) { return pac_decay_ms_get(pac, extent_state_muzzy) != 0; } +size_t pac_batched_alloc_retained_size(size_t size) { + if (size > SC_LARGE_MAXCLASS) { + return size; + } + size_t batched_size = sz_s2u_size_class_compute(size); + size_t capped_size = ((size + HUGEPAGE - 1) >> LG_HUGEPAGE) + << LG_HUGEPAGE; + if (batched_size >= capped_size) { + batched_size = capped_size; + } + return batched_size; +} + static edata_t * pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, size_t alignment, bool zero, bool guarded) { assert(!guarded || alignment <= PAGE); + size_t 
newly_mapped_size = 0; edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty, NULL, size, alignment, zero, guarded); @@ -124,14 +138,64 @@ pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_muzzy, NULL, size, alignment, zero, guarded); } + + /* + * We batched allocate a larger extent when limit_usize_gap is enabled + * because the reuse of extents in the dirty pool is worse without size + * classes for large allocs. For instance, when limit_usize_gap is not + * enabled, 1.1MB, 1.15MB, and 1.2MB allocs will all be ceiled to + * 1.25MB and can reuse the same buffer if they are alloc & dalloc + * sequentially. However, with limit_usize_gap enabled, they cannot + * reuse the same buffer and their sequential allocs & dallocs will + * result in three different extents. Thus, we cache extra mergeable + * extents in the dirty pool to improve the reuse. We skip this + * optimization if both maps_coalesce and opt_retain are disabled + * because VM is not cheap enough to be used aggressively and extents + * cannot be merged at will (only extents from the same VirtualAlloc + * can be merged). Note that it could still be risky to cache more + * extents when either maps_coalesce or opt_retain is enabled. Yet + * doing so is still beneficial in improving the reuse of extents + * with some limits. This choice should be reevaluated if + * pac_batched_alloc_retained_size is changed to be more aggressive. 
+ */ + if (sz_limit_usize_gap_enabled() && edata == NULL && + (maps_coalesce || opt_retain)) { + size_t batched_size = pac_batched_alloc_retained_size( + size); + edata = ecache_alloc_grow(tsdn, pac, ehooks, + &pac->ecache_retained, NULL, batched_size, + alignment, zero, guarded); + + if (edata != NULL && batched_size > size) { + edata_t *trail = extent_split_wrapper(tsdn, pac, + ehooks, edata, size, batched_size - size, + /* holding_core_locks */ false); + if (trail == NULL) { + ecache_dalloc(tsdn, pac, ehooks, + &pac->ecache_retained, edata); + edata = NULL; + } else { + ecache_dalloc(tsdn, pac, ehooks, + &pac->ecache_dirty, trail); + + } + } + + if (edata != NULL) { + newly_mapped_size = batched_size; + } + } + if (edata == NULL) { edata = ecache_alloc_grow(tsdn, pac, ehooks, &pac->ecache_retained, NULL, size, alignment, zero, guarded); - if (config_stats && edata != NULL) { - atomic_fetch_add_zu(&pac->stats->pac_mapped, size, - ATOMIC_RELAXED); - } + newly_mapped_size = (edata != NULL) ? size : 0; + } + + if (config_stats && newly_mapped_size != 0) { + atomic_fetch_add_zu(&pac->stats->pac_mapped, + newly_mapped_size, ATOMIC_RELAXED); } return edata; diff --git a/test/unit/arena_decay.c b/test/unit/arena_decay.c index 10d1a6b1..00a38326 100644 --- a/test/unit/arena_decay.c +++ b/test/unit/arena_decay.c @@ -410,7 +410,14 @@ TEST_BEGIN(test_decay_never) { /* Verify that each deallocation generates additional dirty pages. */ size_t pdirty_prev = get_arena_pdirty(arena_ind); size_t pmuzzy_prev = get_arena_pmuzzy(arena_ind); - expect_zu_eq(pdirty_prev, 0, "Unexpected dirty pages"); + /* + * With limit_usize_gap enabled, some more extents + * are cached in the dirty pool, making the assumption below + * not true. + */ + if (!sz_limit_usize_gap_enabled()) { + expect_zu_eq(pdirty_prev, 0, "Unexpected dirty pages"); + } expect_zu_eq(pmuzzy_prev, 0, "Unexpected muzzy pages"); for (unsigned i = 0; i < sizeof(sizes)/sizeof(size_t); i++) { dallocx(ptrs[i], flags);