Use SEC in PAC to reduce lock contention on the ecaches

Add a small extent cache (SEC) in front of the PAC ecaches. Allocs and dallocs of unguarded extents no larger than experimental_pac_sec_max_alloc (allocs must additionally be non-zeroed and need at most page alignment) are served from per-shard SEC bins without taking the ecache mutex; extents that overflow the SEC fall through to the backing ecaches, including ecache_pinned for pinned extents. The feature is gated behind experimental_pac_sec_nshards (default 0, i.e. disabled). To support independent HPA and PAC SEC instances, sec_alloc, sec_dalloc, and sec_fill now take an explicit shard argument, and HPA and PAC use separate TSD shard slots.
2026-06-04 11:14:20 +03:00 · 2026-05-19 00:11:15 -07:00 · 2026-05-19 00:11:15 -07:00 · 6b13adf375
commit 6b13adf375
parent 11b99d7a21
19 changed files with 680 additions and 59 deletions
--- a/src/conf.c
+++ b/src/conf.c
@ -953,6 +953,17 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
 			CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.max_bytes,
 			    "hpa_sec_max_bytes", SEC_OPTS_MAX_BYTES_DEFAULT, 0,
 			    CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
+			CONF_HANDLE_SIZE_T(opt_pac_sec_opts.nshards,
+			    "experimental_pac_sec_nshards", 0, 0,
+			    CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);
+			CONF_HANDLE_SIZE_T(opt_pac_sec_opts.max_alloc,
+			    "experimental_pac_sec_max_alloc", PAGE,
+			    USIZE_GROW_SLOW_THRESHOLD, CONF_CHECK_MIN,
+			    CONF_CHECK_MAX, true);
+			CONF_HANDLE_SIZE_T(opt_pac_sec_opts.max_bytes,
+			    "experimental_pac_sec_max_bytes",
+			    SEC_OPTS_MAX_BYTES_DEFAULT, 0,
+			    CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true);

 			if (CONF_MATCH("slab_sizes")) {
 				if (CONF_MATCH_VALUE("default")) {
--- a/src/ctl.c
+++ b/src/ctl.c
@ -115,6 +115,9 @@ CTL_PROTO(opt_hpa_dirty_mult)
 CTL_PROTO(opt_hpa_sec_nshards)
 CTL_PROTO(opt_hpa_sec_max_alloc)
 CTL_PROTO(opt_hpa_sec_max_bytes)
+CTL_PROTO(opt_experimental_pac_sec_nshards)
+CTL_PROTO(opt_experimental_pac_sec_max_alloc)
+CTL_PROTO(opt_experimental_pac_sec_max_bytes)
 CTL_PROTO(opt_huge_arena_pac_thp)
 CTL_PROTO(opt_metadata_thp)
 CTL_PROTO(opt_retain)
@ -352,6 +355,11 @@ CTL_PROTO(stats_arenas_i_hpa_sec_misses)
 CTL_PROTO(stats_arenas_i_hpa_sec_dalloc_flush)
 CTL_PROTO(stats_arenas_i_hpa_sec_dalloc_noflush)
 CTL_PROTO(stats_arenas_i_hpa_sec_overfills)
+CTL_PROTO(stats_arenas_i_pac_sec_bytes)
+CTL_PROTO(stats_arenas_i_pac_sec_hits)
+CTL_PROTO(stats_arenas_i_pac_sec_misses)
+CTL_PROTO(stats_arenas_i_pac_sec_dalloc_flush)
+CTL_PROTO(stats_arenas_i_pac_sec_dalloc_noflush)
 INDEX_PROTO(stats_arenas_i)
 CTL_PROTO(stats_allocated)
 CTL_PROTO(stats_active)
@ -495,6 +503,12 @@ static const ctl_named_node_t opt_node[] = {{NAME("abort"), CTL(opt_abort)},
    {NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)},
    {NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)},
    {NAME("hpa_sec_max_bytes"), CTL(opt_hpa_sec_max_bytes)},
+    {NAME("experimental_pac_sec_nshards"),
+        CTL(opt_experimental_pac_sec_nshards)},
+    {NAME("experimental_pac_sec_max_alloc"),
+        CTL(opt_experimental_pac_sec_max_alloc)},
+    {NAME("experimental_pac_sec_max_bytes"),
+        CTL(opt_experimental_pac_sec_max_bytes)},
    {NAME("huge_arena_pac_thp"), CTL(opt_huge_arena_pac_thp)},
    {NAME("metadata_thp"), CTL(opt_metadata_thp)},
    {NAME("retain"), CTL(opt_retain)}, {NAME("dss"), CTL(opt_dss)},
@ -859,6 +873,12 @@ static const ctl_named_node_t stats_arenas_i_node[] = {
        CTL(stats_arenas_i_hpa_sec_dalloc_noflush)},
    {NAME("hpa_sec_dalloc_flush"), CTL(stats_arenas_i_hpa_sec_dalloc_flush)},
    {NAME("hpa_sec_overfills"), CTL(stats_arenas_i_hpa_sec_overfills)},
+    {NAME("pac_sec_bytes"), CTL(stats_arenas_i_pac_sec_bytes)},
+    {NAME("pac_sec_hits"), CTL(stats_arenas_i_pac_sec_hits)},
+    {NAME("pac_sec_misses"), CTL(stats_arenas_i_pac_sec_misses)},
+    {NAME("pac_sec_dalloc_noflush"),
+        CTL(stats_arenas_i_pac_sec_dalloc_noflush)},
+    {NAME("pac_sec_dalloc_flush"), CTL(stats_arenas_i_pac_sec_dalloc_flush)},
    {NAME("small"), CHILD(named, stats_arenas_i_small)},
    {NAME("large"), CHILD(named, stats_arenas_i_large)},
    {NAME("bins"), CHILD(indexed, stats_arenas_i_bins)},
@ -1219,6 +1239,10 @@ ctl_arena_stats_sdmerge(
 		    &sdstats->astats.pa_shard_stats.pac_stats.abandoned_vm,
 		    &astats->astats.pa_shard_stats.pac_stats.abandoned_vm);

+		sec_stats_accum(
+		    &sdstats->astats.pa_shard_stats.pac_stats.pac_sec_stats,
+		    &astats->astats.pa_shard_stats.pac_stats.pac_sec_stats);
+
 		sdstats->astats.tcache_bytes += astats->astats.tcache_bytes;
 		sdstats->astats.tcache_stashed_bytes +=
 		    astats->astats.tcache_stashed_bytes;
@ -2208,6 +2232,12 @@ CTL_RO_NL_GEN(opt_hpa_slab_max_alloc, opt_hpa_opts.slab_max_alloc, size_t)
 CTL_RO_NL_GEN(opt_hpa_sec_nshards, opt_hpa_sec_opts.nshards, size_t)
 CTL_RO_NL_GEN(opt_hpa_sec_max_alloc, opt_hpa_sec_opts.max_alloc, size_t)
 CTL_RO_NL_GEN(opt_hpa_sec_max_bytes, opt_hpa_sec_opts.max_bytes, size_t)
+CTL_RO_NL_GEN(opt_experimental_pac_sec_nshards,
+    opt_pac_sec_opts.nshards, size_t)
+CTL_RO_NL_GEN(opt_experimental_pac_sec_max_alloc,
+    opt_pac_sec_opts.max_alloc, size_t)
+CTL_RO_NL_GEN(opt_experimental_pac_sec_max_bytes,
+    opt_pac_sec_opts.max_bytes, size_t)
 CTL_RO_NL_GEN(opt_huge_arena_pac_thp, opt_huge_arena_pac_thp, bool)
 CTL_RO_NL_GEN(
    opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp], const char *)
@ -3881,6 +3911,17 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_sec_dalloc_noflush,
 CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_sec_overfills,
    arenas_i(mib[2])->astats->hpastats.secstats.total.noverfills, size_t)

+CTL_RO_CGEN(config_stats, stats_arenas_i_pac_sec_bytes,
+    arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.pac_sec_stats.bytes, size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_pac_sec_hits,
+    arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.pac_sec_stats.total.nhits, size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_pac_sec_misses,
+    arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.pac_sec_stats.total.nmisses, size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_pac_sec_dalloc_flush,
+    arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.pac_sec_stats.total.ndalloc_flush, size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_pac_sec_dalloc_noflush,
+    arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.pac_sec_stats.total.ndalloc_noflush, size_t)
+
 CTL_RO_CGEN(config_stats, stats_arenas_i_small_allocated,
    arenas_i(mib[2])->astats->allocated_small, size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_small_nmalloc,
--- a/src/hpa.c
+++ b/src/hpa.c
@ -16,6 +16,18 @@ const char *const hpa_hugify_style_names[] = {"auto", "none", "eager", "lazy"};
 bool opt_experimental_hpa_start_huge_if_thp_always = true;
 bool opt_experimental_hpa_enforce_hugify = false;

+static inline uint8_t
+hpa_sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
+	if (sec->opts.nshards <= 1) {
+		return 0;
+	}
+	if (tsdn_null(tsdn)) {
+		return 0;
+	}
+	tsd_t *tsd = tsdn_tsd(tsdn);
+	return sec_shard_pick(tsd, sec, tsd_sec_shardp_get(tsd));
+}
+
 bool
 hpa_hugepage_size_exceeds_limit(void) {
 	return HUGEPAGE > HUGEPAGE_MAX_EXPECTED_SIZE;
@ -947,7 +959,8 @@ hpa_alloc(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, size_t alignment,
 	    && (size > shard->opts.slab_max_alloc)) {
 		return NULL;
 	}
-	edata_t *edata = sec_alloc(tsdn, &shard->sec, size);
+	edata_t *edata = sec_alloc(tsdn, &shard->sec, size,
+	    hpa_sec_shard_pick(tsdn, &shard->sec));
 	if (edata != NULL) {
 		return edata;
 	}
@ -968,7 +981,8 @@ hpa_alloc(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, size_t alignment,
 	}
 	if (nsuccess > 0) {
 		assert(sec_size_supported(&shard->sec, size));
-		sec_fill(tsdn, &shard->sec, size, &results, nsuccess);
+		sec_fill(tsdn, &shard->sec, size, &results, nsuccess,
+		    hpa_sec_shard_pick(tsdn, &shard->sec));
 		/* Unlikely rollback in case of overfill */
 		if (!edata_list_active_empty(&results)) {
 			hpa_dalloc_batch(
@ -1075,7 +1089,8 @@ hpa_dalloc(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata,
 	edata_list_active_init(&dalloc_list);
 	edata_list_active_append(&dalloc_list, edata);

-	sec_dalloc(tsdn, &shard->sec, &dalloc_list);
+	sec_dalloc(tsdn, &shard->sec, &dalloc_list,
+	    hpa_sec_shard_pick(tsdn, &shard->sec));
 	if (edata_list_active_empty(&dalloc_list)) {
 		/* sec consumed the pointer */
 		*deferred_work_generated = false;
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@ -181,6 +181,9 @@ size_t opt_calloc_madvise_threshold = CALLOC_MADVISE_THRESHOLD_DEFAULT;
 bool             opt_hpa = false;
 hpa_shard_opts_t opt_hpa_opts = HPA_SHARD_OPTS_DEFAULT;
 sec_opts_t       opt_hpa_sec_opts = SEC_OPTS_DEFAULT;
+sec_opts_t       opt_pac_sec_opts = {0,
+    (32 * 1024) > (PAGE * 2) ? (32 * 1024) : (PAGE * 2),
+    SEC_OPTS_MAX_BYTES_DEFAULT};

 /* False should be the common case.  Set to true to trigger initialization. */
 bool malloc_slow = true;
--- a/src/pa.c
+++ b/src/pa.c
@ -94,6 +94,7 @@ pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard) {

 void
 pa_shard_flush(tsdn_t *tsdn, pa_shard_t *shard) {
+	pac_sec_flush(tsdn, &shard->pac);
 	if (shard->ever_used_hpa) {
 		hpa_shard_flush(tsdn, &shard->hpa);
 	}
--- a/src/pa_extra.c
+++ b/src/pa_extra.c
@ -16,6 +16,7 @@ pa_shard_prefork0(tsdn_t *tsdn, pa_shard_t *shard) {

 void
 pa_shard_prefork2(tsdn_t *tsdn, pa_shard_t *shard) {
+	sec_prefork2(tsdn, &shard->pac.sec);
 	if (shard->ever_used_hpa) {
 		hpa_shard_prefork2(tsdn, &shard->hpa);
 	}
@ -53,6 +54,7 @@ pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard) {
 	ecache_postfork_parent(tsdn, &shard->pac.ecache_retained);
 	ecache_postfork_parent(tsdn, &shard->pac.ecache_pinned);
 	malloc_mutex_postfork_parent(tsdn, &shard->pac.grow_mtx);
+	sec_postfork_parent(tsdn, &shard->pac.sec);
 	malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_dirty.mtx);
 	malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_muzzy.mtx);
 	if (shard->ever_used_hpa) {
@ -68,6 +70,7 @@ pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard) {
 	ecache_postfork_child(tsdn, &shard->pac.ecache_retained);
 	ecache_postfork_child(tsdn, &shard->pac.ecache_pinned);
 	malloc_mutex_postfork_child(tsdn, &shard->pac.grow_mtx);
+	sec_postfork_child(tsdn, &shard->pac.sec);
 	malloc_mutex_postfork_child(tsdn, &shard->pac.decay_dirty.mtx);
 	malloc_mutex_postfork_child(tsdn, &shard->pac.decay_muzzy.mtx);
 	if (shard->ever_used_hpa) {
@ -179,6 +182,9 @@ pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
 	if (shard->ever_used_hpa) {
 		hpa_shard_stats_merge(tsdn, &shard->hpa, hpa_stats_out);
 	}
+
+	sec_stats_merge(tsdn, &shard->pac.sec,
+	    &pa_shard_stats_out->pac_stats.pac_sec_stats);
 }

 static void
@ -207,6 +213,9 @@ pa_shard_mtx_stats_read(tsdn_t *tsdn, pa_shard_t *shard,
 	pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data,
 	    &shard->pac.decay_muzzy.mtx, arena_prof_mutex_decay_muzzy);

+	sec_mutex_stats_read(tsdn, &shard->pac.sec,
+	    &mutex_prof_data[arena_prof_mutex_pac_sec]);
+
 	if (shard->ever_used_hpa) {
 		pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data,
 		    &shard->hpa.mtx, arena_prof_mutex_hpa_shard);
--- a/src/pac.c
+++ b/src/pac.c
@ -4,6 +4,18 @@
 #include "jemalloc/internal/pac.h"
 #include "jemalloc/internal/san.h"

+static inline uint8_t
+pac_sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
+	if (sec->opts.nshards <= 1) {
+		return 0;
+	}
+	if (tsdn_null(tsdn)) {
+		return 0;
+	}
+	tsd_t *tsd = tsdn_tsd(tsdn);
+	return sec_shard_pick(tsd, sec, tsd_pac_sec_shardp_get(tsd));
+}
+
 static inline void
 pac_decay_data_get(pac_t *pac, extent_state_t state, decay_t **r_decay,
    pac_decay_stats_t **r_decay_stats, ecache_t **r_ecache) {
@ -92,6 +104,17 @@ pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap,
 	pac->stats_mtx = stats_mtx;
 	atomic_store_zu(&pac->extent_sn_next, 0, ATOMIC_RELAXED);

+	if (sec_init(tsdn, &pac->sec, base, &opt_pac_sec_opts)) {
+		/* Fall back to no SEC on allocation failure. */
+		pac->sec.opts.nshards = 0;
+	}
+	if (!sec_is_used(&pac->sec) || dirty_decay_ms == 0) {
+		atomic_store_zu(&pac->sec_max_alloc, 0, ATOMIC_RELAXED);
+	} else {
+		atomic_store_zu(&pac->sec_max_alloc,
+		    pac->sec.opts.max_alloc, ATOMIC_RELAXED);
+	}
+
 	return false;
 }

@ -133,6 +156,15 @@ pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size,

 	edata_t *edata = NULL;

+	if (!guarded && !zero && alignment <= PAGE
+	    && size <= atomic_load_zu(&pac->sec_max_alloc, ATOMIC_RELAXED)) {
+		edata = sec_alloc(tsdn, &pac->sec, size,
+		    pac_sec_shard_pick(tsdn, &pac->sec));
+		if (edata != NULL) {
+			return edata;
+		}
+	}
+
 	/*
 	 * Guarded allocations need surrounding guard pages, which the pinned
 	 * pool does not maintain; skip ecache_pinned in that case.
@ -395,6 +427,32 @@ pac_dalloc(tsdn_t *tsdn, pac_t *pac, edata_t *edata,
 			san_unguard_pages_two_sided(
 			    tsdn, ehooks, edata, pac->emap);
 		}
+	} else if (edata_size_get(edata)
+	    <= atomic_load_zu(&pac->sec_max_alloc, ATOMIC_RELAXED)) {
+		edata_zeroed_set(edata, false);
+		edata_list_active_t dalloc_list;
+		edata_list_active_init(&dalloc_list);
+		edata_list_active_append(&dalloc_list, edata);
+		sec_dalloc(tsdn, &pac->sec, &dalloc_list,
+		    pac_sec_shard_pick(tsdn, &pac->sec));
+		if (edata_list_active_empty(&dalloc_list)) {
+			*deferred_work_generated = false;
+			return;
+		}
+		/* Flush overflow extents to their backing ecaches. */
+		bool any_deferred_work = false;
+		edata_t *flush_edata;
+		while ((flush_edata =
+		    edata_list_active_first(&dalloc_list)) != NULL) {
+			edata_list_active_remove(&dalloc_list,
+			    flush_edata);
+			if (!edata_pinned_get(flush_edata)) {
+				any_deferred_work = true;
+			}
+			pac_ecache_dalloc(tsdn, pac, ehooks, flush_edata);
+		}
+		*deferred_work_generated = any_deferred_work;
+		return;
 	}

 	bool pinned = edata_pinned_get(edata);
@ -717,6 +775,13 @@ pac_decay_ms_set(tsdn_t *tsdn, pac_t *pac, extent_state_t state,
 		return true;
 	}

+	bool update_pac_sec = state == extent_state_dirty
+	    && sec_is_used(&pac->sec);
+	if (update_pac_sec && decay_ms == 0) {
+		atomic_store_zu(&pac->sec_max_alloc, 0, ATOMIC_RELAXED);
+		pac_sec_flush(tsdn, pac);
+	}
+
 	malloc_mutex_lock(tsdn, &decay->mtx);
 	/*
 	 * Restart decay backlog from scratch, which may cause many dirty pages
@ -732,6 +797,11 @@ pac_decay_ms_set(tsdn_t *tsdn, pac_t *pac, extent_state_t state,
 	pac_maybe_decay_purge(tsdn, pac, decay, decay_stats, ecache, eagerness);
 	malloc_mutex_unlock(tsdn, &decay->mtx);

+	if (update_pac_sec && decay_ms != 0) {
+		atomic_store_zu(&pac->sec_max_alloc,
+		    pac->sec.opts.max_alloc, ATOMIC_RELAXED);
+	}
+
 	return false;
 }

@ -746,12 +816,11 @@ pac_decay_ms_get(pac_t *pac, extent_state_t state) {

 void
 pac_reset(tsdn_t *tsdn, pac_t *pac) {
+	pac_sec_flush(tsdn, pac);
 	/*
-	 * No-op for now; purging is still done at the arena-level.  It should
-	 * get moved in here, though.
+	 * Purging is still done at the arena-level.  It should get moved in
+	 * here, though.
 	 */
-	(void)tsdn;
-	(void)pac;
 }

 void
@ -816,3 +885,16 @@ pac_destroy(tsdn_t *tsdn, pac_t *pac) {
 		extent_destroy_wrapper(tsdn, pac, ehooks, edata);
 	}
 }
+
+void
+pac_sec_flush(tsdn_t *tsdn, pac_t *pac) {
+	ehooks_t *ehooks = pac_ehooks_get(pac);
+	edata_list_active_t to_flush;
+	edata_list_active_init(&to_flush);
+	sec_flush(tsdn, &pac->sec, &to_flush);
+	edata_t *edata;
+	while ((edata = edata_list_active_first(&to_flush)) != NULL) {
+		edata_list_active_remove(&to_flush, edata);
+		pac_ecache_dalloc(tsdn, pac, ehooks, edata);
+	}
+}
--- a/src/sec.c
+++ b/src/sec.c
@ -25,6 +25,8 @@ sec_bin_init(sec_bin_t *bin) {
 bool
 sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, const sec_opts_t *opts) {
 	sec->opts = *opts;
+	sec->bins = NULL;
+	sec->npsizes = 0;
 	if (opts->nshards == 0) {
 		return false;
 	}
@ -57,18 +59,16 @@ sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, const sec_opts_t *opts) {
 	return false;
 }

-static uint8_t
-sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
+uint8_t
+sec_shard_pick(tsd_t *tsd, sec_t *sec, uint8_t *idxp) {
 	/*
 	 * Eventually, we should implement affinity, tracking source shard using
 	 * the edata_t's newly freed up fields.  For now, just randomly
 	 * distribute across all shards.
+	 *
+	 * Callers must ensure sec->opts.nshards > 1.
 	 */
-	if (tsdn_null(tsdn)) {
-		return 0;
-	}
-	tsd_t   *tsd = tsdn_tsd(tsdn);
-	uint8_t *idxp = tsd_sec_shardp_get(tsd);
+	assert(sec->opts.nshards > 1);
 	if (*idxp == (uint8_t)-1) {
 		/*
 		 * First use; initialize using the trick from Daniel Lemire's
@ -143,10 +143,10 @@ sec_bin_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_bin_t *bin, size_t size) {

 static edata_t *
 sec_multishard_trylock_alloc(
-    tsdn_t *tsdn, sec_t *sec, size_t size, pszind_t pszind) {
+    tsdn_t *tsdn, sec_t *sec, size_t size, pszind_t pszind, uint8_t shard) {
 	assert(sec->opts.nshards > 0);

-	uint8_t    cur_shard = sec_shard_pick(tsdn, sec);
+	uint8_t    cur_shard = shard;
 	sec_bin_t *bin;
 	for (size_t i = 0; i < sec->opts.nshards; ++i) {
 		bin = sec_bin_pick(sec, cur_shard, pszind);
@ -170,7 +170,7 @@ sec_multishard_trylock_alloc(
 	 * declaring a miss.  That could recover more remote-shard hits under
 	 * contention, but it also changes the allocation latency policy.
 	 */
-	assert(cur_shard == sec_shard_pick(tsdn, sec));
+	assert(cur_shard == shard);
 	bin = sec_bin_pick(sec, cur_shard, pszind);
 	malloc_mutex_lock(tsdn, &bin->mtx);
 	edata_t *edata = sec_bin_alloc_locked(tsdn, sec, bin, size);
@ -184,7 +184,7 @@ sec_multishard_trylock_alloc(
 }

 edata_t *
-sec_alloc(tsdn_t *tsdn, sec_t *sec, size_t size) {
+sec_alloc(tsdn_t *tsdn, sec_t *sec, size_t size, uint8_t shard) {
 	if (!sec_size_supported(sec, size)) {
 		return NULL;
 	}
@ -208,7 +208,7 @@ sec_alloc(tsdn_t *tsdn, sec_t *sec, size_t size) {
 		    /* frequent_reuse */ 1);
 		return edata;
 	}
-	return sec_multishard_trylock_alloc(tsdn, sec, size, pszind);
+	return sec_multishard_trylock_alloc(tsdn, sec, size, pszind, shard);
 }

 static void
@ -248,11 +248,11 @@ sec_bin_dalloc_locked(tsdn_t *tsdn, sec_t *sec, sec_bin_t *bin, size_t size,

 static void
 sec_multishard_trylock_dalloc(tsdn_t *tsdn, sec_t *sec, size_t size,
-    pszind_t pszind, edata_list_active_t *dalloc_list) {
+    pszind_t pszind, edata_list_active_t *dalloc_list, uint8_t shard) {
 	assert(sec->opts.nshards > 0);

 	/* Try to dalloc in this threads bin first */
-	uint8_t cur_shard = sec_shard_pick(tsdn, sec);
+	uint8_t cur_shard = shard;
 	for (size_t i = 0; i < sec->opts.nshards; ++i) {
 		sec_bin_t *bin = sec_bin_pick(sec, cur_shard, pszind);
 		if (!malloc_mutex_trylock(tsdn, &bin->mtx)) {
@ -267,7 +267,7 @@ sec_multishard_trylock_dalloc(tsdn_t *tsdn, sec_t *sec, size_t size,
 		}
 	}
 	/* No bin had alloc or had the extent */
-	assert(cur_shard == sec_shard_pick(tsdn, sec));
+	assert(cur_shard == shard);
 	sec_bin_t *bin = sec_bin_pick(sec, cur_shard, pszind);
 	malloc_mutex_lock(tsdn, &bin->mtx);
 	sec_bin_dalloc_locked(tsdn, sec, bin, size, dalloc_list);
@ -275,13 +275,11 @@ sec_multishard_trylock_dalloc(tsdn_t *tsdn, sec_t *sec, size_t size,
 }

 void
-sec_dalloc(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *dalloc_list) {
-	if (!sec_is_used(sec)) {
-		return;
-	}
+sec_dalloc(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *dalloc_list,
+    uint8_t shard) {
 	edata_t *edata = edata_list_active_first(dalloc_list);
 	size_t   size = edata_size_get(edata);
-	if (size > sec->opts.max_alloc) {
+	if (!sec_size_supported(sec, size)) {
 		return;
 	}
 	pszind_t pszind = sz_psz2ind(size);
@ -298,20 +296,21 @@ sec_dalloc(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *dalloc_list) {
 		malloc_mutex_unlock(tsdn, &bin->mtx);
 		return;
 	}
-	sec_multishard_trylock_dalloc(tsdn, sec, size, pszind, dalloc_list);
+	sec_multishard_trylock_dalloc(
+	    tsdn, sec, size, pszind, dalloc_list, shard);
 }

 void
 sec_fill(tsdn_t *tsdn, sec_t *sec, size_t size, edata_list_active_t *result,
-    size_t nallocs) {
+    size_t nallocs, uint8_t shard) {
 	assert((size & PAGE_MASK) == 0);
-	assert(sec->opts.nshards != 0 && size <= sec->opts.max_alloc);
+	assert(sec_size_supported(sec, size));
 	assert(nallocs > 0);

 	pszind_t pszind = sz_psz2ind(size);
 	assert(pszind < sec->npsizes);

-	sec_bin_t *bin = sec_bin_pick(sec, sec_shard_pick(tsdn, sec), pszind);
+	sec_bin_t *bin = sec_bin_pick(sec, shard, pszind);
 	malloc_mutex_assert_not_owner(tsdn, &bin->mtx);
 	malloc_mutex_lock(tsdn, &bin->mtx);
 	size_t new_cached_bytes = nallocs * size;
--- a/src/stats.c
+++ b/src/stats.c
@ -835,6 +835,37 @@ stats_arena_hpa_shard_sec_print(emitter_t *emitter, unsigned i) {
 	    &sec_overfills);
 }

+static void
+stats_arena_pac_sec_print(emitter_t *emitter, unsigned i) {
+	size_t sec_bytes;
+	size_t sec_hits;
+	size_t sec_misses;
+	size_t sec_dalloc_flush;
+	size_t sec_dalloc_noflush;
+	CTL_M2_GET("stats.arenas.0.pac_sec_bytes", i, &sec_bytes, size_t);
+	emitter_kv(emitter, "pac_sec_bytes",
+	    "Bytes in PAC small extent cache",
+	    emitter_type_size, &sec_bytes);
+	CTL_M2_GET("stats.arenas.0.pac_sec_hits", i, &sec_hits, size_t);
+	emitter_kv(emitter, "pac_sec_hits",
+	    "Total hits in PAC small extent cache",
+	    emitter_type_size, &sec_hits);
+	CTL_M2_GET("stats.arenas.0.pac_sec_misses", i, &sec_misses, size_t);
+	emitter_kv(emitter, "pac_sec_misses",
+	    "Total misses in PAC small extent cache",
+	    emitter_type_size, &sec_misses);
+	CTL_M2_GET("stats.arenas.0.pac_sec_dalloc_noflush", i,
+	    &sec_dalloc_noflush, size_t);
+	emitter_kv(emitter, "pac_sec_dalloc_noflush",
+	    "Dalloc calls without flush in PAC small extent cache",
+	    emitter_type_size, &sec_dalloc_noflush);
+	CTL_M2_GET("stats.arenas.0.pac_sec_dalloc_flush", i, &sec_dalloc_flush,
+	    size_t);
+	emitter_kv(emitter, "pac_sec_dalloc_flush",
+	    "Dalloc calls with flush in PAC small extent cache",
+	    emitter_type_size, &sec_dalloc_flush);
+}
+
 static void
 stats_arena_hpa_shard_counters_print(
    emitter_t *emitter, unsigned i, uint64_t uptime) {
@ -1567,6 +1598,10 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
 	GET_AND_EMIT_MEM_STAT(extent_avail)
 #undef GET_AND_EMIT_MEM_STAT

+	if (opt_pac_sec_opts.nshards > 0) {
+		stats_arena_pac_sec_print(emitter, i);
+	}
+
 	if (mutex) {
 		stats_arena_mutexes_print(emitter, i, uptime);
 	}
@ -1761,6 +1796,9 @@ stats_general_print(emitter_t *emitter) {
 	OPT_WRITE_SIZE_T("hpa_sec_nshards")
 	OPT_WRITE_SIZE_T("hpa_sec_max_alloc")
 	OPT_WRITE_SIZE_T("hpa_sec_max_bytes")
+	OPT_WRITE_SIZE_T("experimental_pac_sec_nshards")
+	OPT_WRITE_SIZE_T("experimental_pac_sec_max_alloc")
+	OPT_WRITE_SIZE_T("experimental_pac_sec_max_bytes")
 	OPT_WRITE_BOOL("huge_arena_pac_thp")
 	OPT_WRITE_CHAR_P("metadata_thp")
 	OPT_WRITE_INT64("mutex_max_spin")