[SEC] Make SEC owned by hpa_shard, simplify the code, add stats, lock per bin

Slobodan Predolac 2025-10-30 16:05:04 -07:00 committed by guangli-dai
parent d930391cf3
commit b5da68dbc3
35 changed files with 1264 additions and 1257 deletions


@ -46,7 +46,7 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
bin_stats_data_t *bstats, arena_stats_large_t *lstats, pac_estats_t *estats,
hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
hpa_shard_stats_t *hpastats);
void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena);
edata_t *arena_extent_alloc_large(
tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero);


@ -51,7 +51,6 @@ typedef struct ctl_arena_stats_s {
arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
pac_estats_t estats[SC_NPSIZES];
hpa_shard_stats_t hpastats;
sec_stats_t secstats;
} ctl_arena_stats_t;
typedef struct ctl_stats_s {


@ -12,6 +12,7 @@
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/pai.h"
#include "jemalloc/internal/psset.h"
#include "jemalloc/internal/sec.h"
typedef struct hpa_shard_nonderived_stats_s hpa_shard_nonderived_stats_t;
struct hpa_shard_nonderived_stats_s {
@ -57,6 +58,7 @@ typedef struct hpa_shard_stats_s hpa_shard_stats_t;
struct hpa_shard_stats_s {
psset_stats_t psset_stats;
hpa_shard_nonderived_stats_t nonderived_stats;
sec_stats_t secstats;
};
typedef struct hpa_shard_s hpa_shard_t;
@ -69,14 +71,17 @@ struct hpa_shard_s {
/* The central allocator we get our hugepages from. */
hpa_central_t *central;
/* Protects most of this shard's state. */
malloc_mutex_t mtx;
/*
* Guards the shard's access to the central allocator (preventing
* multiple threads operating on this shard from accessing the central
* allocator).
*/
malloc_mutex_t grow_mtx;
/* The base metadata allocator. */
base_t *base;
@ -87,6 +92,9 @@ struct hpa_shard_s {
*/
edata_cache_fast_t ecf;
/* Small extent cache (not guarded by mtx) */
JEMALLOC_ALIGNED(CACHELINE) sec_t sec;
psset_t psset;
/*
@ -142,9 +150,9 @@ bool hpa_hugepage_size_exceeds_limit(void);
* just that it can function properly given the system it's running on.
*/
bool hpa_supported(void);
bool hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
base_t *base, edata_cache_t *edata_cache, unsigned ind,
const hpa_shard_opts_t *opts);
bool hpa_shard_init(tsdn_t *tsdn, hpa_shard_t *shard, hpa_central_t *central,
emap_t *emap, base_t *base, edata_cache_t *edata_cache, unsigned ind,
const hpa_shard_opts_t *opts, const sec_opts_t *sec_opts);
void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src);
void hpa_shard_stats_merge(
@ -157,6 +165,8 @@ void hpa_shard_stats_merge(
*/
void hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard);
/* Flush caches that the shard may be using. */
void hpa_shard_flush(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_set_deferral_allowed(
tsdn_t *tsdn, hpa_shard_t *shard, bool deferral_allowed);
@ -164,8 +174,9 @@ void hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard);
/*
* We share the fork ordering with the PA and arena prefork handling; that's why
* these are 3 and 4 rather than 0 and 1.
* these are 2, 3 and 4 rather than 0 and 1.
*/
void hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard);
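Since the SEC is now embedded in the shard (the cache-line-aligned sec_t member above, explicitly not guarded by mtx), flushing it becomes the shard's responsibility. A rough sketch of what hpa_shard_flush has to arrange, assuming it drains the SEC with sec_flush and returns the evicted extents through the shard's own deallocation path (hpa_dalloc_backing_sketch is a hypothetical placeholder, not a function from this commit):

static void
hpa_shard_flush_sketch(tsdn_t *tsdn, hpa_shard_t *shard) {
    edata_list_active_t to_flush;
    edata_list_active_init(&to_flush);
    /* The SEC has its own per-bin locks, so shard->mtx is not taken here. */
    sec_flush(tsdn, &shard->sec, &to_flush);
    edata_t *edata;
    while ((edata = edata_list_active_first(&to_flush)) != NULL) {
        edata_list_active_remove(&to_flush, edata);
        /* Placeholder for the shard's real deallocation path. */
        hpa_dalloc_backing_sketch(tsdn, shard, edata);
    }
}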


@ -96,12 +96,6 @@ struct pa_shard_s {
/* Allocates from a PAC. */
pac_t pac;
/*
* We place a small extent cache in front of the HPA, since we intend
* these configurations to use many fewer arenas, and therefore have a
* higher risk of hot locks.
*/
sec_t hpa_sec;
hpa_shard_t hpa_shard;
/* The source of edata_t objects. */
@ -166,6 +160,9 @@ void pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard);
*/
void pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard);
/* Flush any caches used by the shard. */
void pa_shard_flush(tsdn_t *tsdn, pa_shard_t *shard);
/* Gets an edata for the given allocation. */
edata_t *pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size,
size_t alignment, bool slab, szind_t szind, bool zero, bool guarded,
@ -233,8 +230,7 @@ void pa_shard_basic_stats_merge(
void pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out,
hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out,
size_t *resident);
hpa_shard_stats_t *hpa_stats_out, size_t *resident);
/*
* Reads the PA-owned mutex stats into the output stats array, at the


@ -13,15 +13,6 @@ struct pai_s {
edata_t *(*alloc)(tsdn_t *tsdn, pai_t *self, size_t size,
size_t alignment, bool zero, bool guarded, bool frequent_reuse,
bool *deferred_work_generated);
/*
* Returns the number of extents added to the list (which may be fewer
* than requested, in case of OOM). The list should already be
* initialized. The only alignment guarantee is page-alignment, and
* the results are not necessarily zeroed.
*/
size_t (*alloc_batch)(tsdn_t *tsdn, pai_t *self, size_t size,
size_t nallocs, edata_list_active_t *results, bool frequent_reuse,
bool *deferred_work_generated);
bool (*expand)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
size_t old_size, size_t new_size, bool zero,
bool *deferred_work_generated);
@ -29,9 +20,6 @@ struct pai_s {
size_t old_size, size_t new_size, bool *deferred_work_generated);
void (*dalloc)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
bool *deferred_work_generated);
/* This function empties out list as a side-effect of being called. */
void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self,
edata_list_active_t *list, bool *deferred_work_generated);
uint64_t (*time_until_deferred_work)(tsdn_t *tsdn, pai_t *self);
};
@ -47,14 +35,6 @@ pai_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
frequent_reuse, deferred_work_generated);
}
static inline size_t
pai_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
edata_list_active_t *results, bool frequent_reuse,
bool *deferred_work_generated) {
return self->alloc_batch(tsdn, self, size, nallocs, results,
frequent_reuse, deferred_work_generated);
}
static inline bool
pai_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
size_t new_size, bool zero, bool *deferred_work_generated) {
@ -75,26 +55,9 @@ pai_dalloc(
self->dalloc(tsdn, self, edata, deferred_work_generated);
}
static inline void
pai_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
bool *deferred_work_generated) {
self->dalloc_batch(tsdn, self, list, deferred_work_generated);
}
static inline uint64_t
pai_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
return self->time_until_deferred_work(tsdn, self);
}
/*
* An implementation of batch allocation that simply calls alloc once for
* each item in the list.
*/
size_t pai_alloc_batch_default(tsdn_t *tsdn, pai_t *self, size_t size,
size_t nallocs, edata_list_active_t *results, bool frequent_reuse,
bool *deferred_work_generated);
/* Ditto, for dalloc. */
void pai_dalloc_batch_default(tsdn_t *tsdn, pai_t *self,
edata_list_active_t *list, bool *deferred_work_generated);
#endif /* JEMALLOC_INTERNAL_PAI_H */


@ -17,91 +17,104 @@
* knowledge of the underlying PAI implementation).
*/
/*
* For now, this is just one field; eventually, we'll probably want to get more
* fine-grained data out (like per-size class statistics).
*/
typedef struct sec_bin_stats_s sec_bin_stats_t;
struct sec_bin_stats_s {
/* Number of alloc requests that did not find an extent in this bin. */
size_t nmisses;
/* Number of successful alloc requests. */
size_t nhits;
/* Number of dallocs that caused a flush. */
size_t ndalloc_flush;
/* Number of dallocs that did not cause a flush. */
size_t ndalloc_noflush;
/* Number of fills that hit max_bytes */
size_t noverfills;
};
typedef struct sec_stats_s sec_stats_t;
struct sec_stats_s {
/* Sum of bytes_cur across all shards. */
size_t bytes;
/* Totals of bin_stats. */
sec_bin_stats_t total;
};
static inline void
sec_bin_stats_init(sec_bin_stats_t *stats) {
stats->ndalloc_flush = 0;
stats->nmisses = 0;
stats->nhits = 0;
stats->ndalloc_noflush = 0;
stats->noverfills = 0;
}
static inline void
sec_bin_stats_accum(sec_bin_stats_t *dst, sec_bin_stats_t *src) {
dst->nmisses += src->nmisses;
dst->nhits += src->nhits;
dst->ndalloc_flush += src->ndalloc_flush;
dst->ndalloc_noflush += src->ndalloc_noflush;
dst->noverfills += src->noverfills;
}
static inline void
sec_stats_accum(sec_stats_t *dst, sec_stats_t *src) {
dst->bytes += src->bytes;
sec_bin_stats_accum(&dst->total, &src->total);
}
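Because the per-bin counters are rolled up into sec_stats_t, a stats consumer can derive simple summary metrics directly from the merged totals. For example, a hit-rate helper (illustrative only, not part of this commit) could be written as:

static inline double
sec_stats_hit_rate(const sec_stats_t *stats) {
    size_t requests = stats->total.nhits + stats->total.nmisses;
    return requests == 0 ? 0.0
        : (double)stats->total.nhits / (double)requests;
}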
/* A collection of free extents, all of the same size. */
typedef struct sec_bin_s sec_bin_t;
struct sec_bin_s {
/*
* When we fail to fulfill an allocation, we do a batch-alloc on the
* underlying allocator to fill extra items, as well. We drop the SEC
* lock while doing so, to allow operations on other bins to succeed.
* That introduces the possibility of other threads also trying to
* allocate out of this bin, failing, and also going to the backing
* allocator. To avoid a thundering herd problem in which lots of
* threads do batch allocs and overfill this bin as a result, we only
* allow one batch allocation at a time for a bin. This bool tracks
* whether or not some thread is already batch allocating.
*
* Eventually, the right answer may be a smarter sharding policy for the
* bins (e.g. a mutex per bin, which would also be more scalable
* generally; the batch-allocating thread could hold it while
* batch-allocating).
* Protects the data members of the bin.
*/
bool being_batch_filled;
malloc_mutex_t mtx;
/*
* Number of bytes in this particular bin (as opposed to the
* sec_shard_t's bytes_cur. This isn't user visible or reported in
* stats; rather, it allows us to quickly determine the change in the
* centralized counter when flushing.
* Number of bytes in this particular bin.
*/
size_t bytes_cur;
edata_list_active_t freelist;
};
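The per-bin mutex means an allocation only contends with other operations on the same size class; bins for other sizes stay untouched. A minimal sketch of the fast path this enables (this is not the committed sec_alloc; hit/miss accounting and any other bookkeeping are omitted, and the function name is made up for illustration):

static edata_t *
sec_bin_try_alloc_sketch(tsdn_t *tsdn, sec_bin_t *bin) {
    malloc_mutex_lock(tsdn, &bin->mtx);
    edata_t *edata = edata_list_active_first(&bin->freelist);
    if (edata != NULL) {
        edata_list_active_remove(&bin->freelist, edata);
        bin->bytes_cur -= edata_size_get(edata);
    }
    malloc_mutex_unlock(tsdn, &bin->mtx);
    return edata;  /* NULL means a miss; the caller falls back. */
}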
typedef struct sec_shard_s sec_shard_t;
struct sec_shard_s {
/*
* We don't keep per-bin mutexes, even though that would allow more
* sharding; this allows global cache-eviction, which in turn allows for
* better balancing across free lists.
*/
malloc_mutex_t mtx;
/*
* A SEC may need to be shut down (i.e. flushed of its contents and
* prevented from further caching). To avoid tricky synchronization
* issues, we just track enabled-status in each shard, guarded by a
* mutex. In practice, this is only ever checked during brief races,
* since the arena-level atomic boolean tracking HPA enabled-ness means
* that we won't go down these pathways very often after custom extent
* hooks are installed.
*/
bool enabled;
sec_bin_t *bins;
/* Number of bytes in all bins in the shard. */
size_t bytes_cur;
/* The next pszind to flush in the flush-some pathways. */
pszind_t to_flush_next;
sec_bin_stats_t stats;
};
typedef struct sec_s sec_t;
struct sec_s {
pai_t pai;
pai_t *fallback;
sec_opts_t opts;
sec_shard_t *shards;
pszind_t npsizes;
sec_opts_t opts;
sec_bin_t *bins;
pszind_t npsizes;
};
bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback,
const sec_opts_t *opts);
void sec_flush(tsdn_t *tsdn, sec_t *sec);
void sec_disable(tsdn_t *tsdn, sec_t *sec);
static inline bool
sec_is_used(sec_t *sec) {
return sec->opts.nshards != 0;
}
static inline bool
sec_size_supported(sec_t *sec, size_t size) {
return sec_is_used(sec) && size <= sec->opts.max_alloc;
}
/* If the sec does not have an extent available, it will return NULL. */
edata_t *sec_alloc(tsdn_t *tsdn, sec_t *sec, size_t size);
void sec_fill(tsdn_t *tsdn, sec_t *sec, size_t size,
edata_list_active_t *result, size_t nallocs);
/*
* Upon return, dalloc_list may be empty if the edata was consumed by the sec,
* or non-empty if there are extents that need to be flushed from the cache.
* Note that if a flush is needed, the extent(s) returned in the list for
* deallocation will almost certainly not include the one being dalloc-ed
* (that one is considered "hot" and preserved in the cache, while "colder"
* ones are returned).
*/
void sec_dalloc(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *dalloc_list);
bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, const sec_opts_t *opts);
/* Fills to_flush with extents that need to be deallocated */
void sec_flush(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *to_flush);
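A hedged sketch of how an owner is expected to use the deallocation side of this API, per the comment above: hand the dead extent to the SEC on a list, then deallocate whatever the SEC chose to evict (the function name below is illustrative, not from this commit):

static void
owner_dalloc_via_sec_sketch(tsdn_t *tsdn, sec_t *sec, edata_t *edata) {
    edata_list_active_t to_dalloc;
    edata_list_active_init(&to_dalloc);
    edata_list_active_append(&to_dalloc, edata);
    /* The SEC keeps the "hot" extent when it can; anything left on the
     * list afterwards is a colder extent evicted to respect max_bytes. */
    sec_dalloc(tsdn, sec, &to_dalloc);
    /* The caller must now deallocate whatever remains in to_dalloc
     * through its own backing path (omitted here). */
}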
/*
* Morally, these two stats methods probably ought to be a single one (and the


@ -12,46 +12,39 @@ typedef struct sec_opts_s sec_opts_t;
struct sec_opts_s {
/*
* We don't necessarily always use all the shards; requests are
* distributed across shards [0, nshards - 1).
* distributed across shards [0, nshards - 1). Once a thread picks a
* shard, it will always use that one. If this value is set to 0, the SEC
* is not used.
*/
size_t nshards;
/*
* We'll automatically refuse to cache any objects in this sec if
* they're larger than max_alloc bytes, instead forwarding such objects
* directly to the fallback.
* they're larger than max_alloc bytes.
*/
size_t max_alloc;
/*
* Exceeding this amount of cached extents in a shard causes us to start
* flushing bins in that shard until we fall below bytes_after_flush.
* Exceeding this many bytes of cached extents in a bin causes us to flush
* until the bin is 1/4 below max_bytes (i.e. down to 3/4 of max_bytes).
*/
size_t max_bytes;
/*
* The number of bytes (in all bins) we flush down to when we exceed
* bytes_cur. We want this to be less than bytes_cur, because
* otherwise we could get into situations where a shard undergoing
* net-deallocation keeps bytes_cur very near to max_bytes, so that
* most deallocations get immediately forwarded to the underlying PAI
* implementation, defeating the point of the SEC.
*/
size_t bytes_after_flush;
/*
* When we can't satisfy an allocation out of the SEC because there are
* no available ones cached, we allocate multiple of that size out of
* the fallback allocator. Eventually we might want to do something
* cleverer, but for now we just grab a fixed number.
* no available ones cached, the allocator will allocate a batch with
* batch_fill_extra extra extents of the same size.
*/
size_t batch_fill_extra;
};
#define SEC_OPTS_NSHARDS_DEFAULT 2
#define SEC_OPTS_BATCH_FILL_EXTRA_DEFAULT 3
#define SEC_OPTS_MAX_ALLOC_DEFAULT ((32 * 1024) < PAGE ? PAGE : (32 * 1024))
#define SEC_OPTS_MAX_BYTES_DEFAULT \
((256 * 1024) < (4 * SEC_OPTS_MAX_ALLOC_DEFAULT) \
? (4 * SEC_OPTS_MAX_ALLOC_DEFAULT) \
: (256 * 1024))
#define SEC_OPTS_DEFAULT \
{ \
/* nshards */ \
4, \
/* max_alloc */ \
(32 * 1024) < PAGE ? PAGE : (32 * 1024), \
/* max_bytes */ \
256 * 1024, \
/* bytes_after_flush */ \
128 * 1024, \
/* batch_fill_extra */ \
0 \
}
{SEC_OPTS_NSHARDS_DEFAULT, SEC_OPTS_MAX_ALLOC_DEFAULT, \
SEC_OPTS_MAX_BYTES_DEFAULT, SEC_OPTS_BATCH_FILL_EXTRA_DEFAULT}
#endif /* JEMALLOC_INTERNAL_SEC_OPTS_H */
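For concreteness, on a system with 4 KiB pages the default macros above resolve to nshards = 2, max_alloc = 32 KiB, max_bytes = 256 KiB (since 256 KiB >= 4 * 32 KiB), and batch_fill_extra = 3; with the flush rule described earlier, a bin exceeding 256 KiB would be trimmed to roughly 192 KiB. An explicit initializer equivalent to SEC_OPTS_DEFAULT under that page-size assumption (the variable name is just for illustration):

static const sec_opts_t sec_opts_default_example = {
    /* nshards */          2,
    /* max_alloc */        32 * 1024,
    /* max_bytes */        256 * 1024,
    /* batch_fill_extra */ 3
};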


@ -46,7 +46,7 @@ enum witness_rank_e {
WITNESS_RANK_DECAY = WITNESS_RANK_CORE,
WITNESS_RANK_TCACHE_QL,
WITNESS_RANK_SEC_SHARD,
WITNESS_RANK_SEC_BIN,
WITNESS_RANK_EXTENT_GROW,
WITNESS_RANK_HPA_SHARD_GROW = WITNESS_RANK_EXTENT_GROW,