[SEC] Make SEC owned by hpa_shard, simplify the code, add stats, lock per bin

Slobodan Predolac 2025-10-30 16:05:04 -07:00 committed by guangli-dai
parent d930391cf3
commit b5da68dbc3
35 changed files with 1264 additions and 1257 deletions


@ -46,7 +46,7 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
bin_stats_data_t *bstats, arena_stats_large_t *lstats, pac_estats_t *estats,
hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
hpa_shard_stats_t *hpastats);
void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena);
edata_t *arena_extent_alloc_large(
tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero);


@ -51,7 +51,6 @@ typedef struct ctl_arena_stats_s {
arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
pac_estats_t estats[SC_NPSIZES];
hpa_shard_stats_t hpastats;
sec_stats_t secstats;
} ctl_arena_stats_t;
typedef struct ctl_stats_s {


@ -12,6 +12,7 @@
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/pai.h"
#include "jemalloc/internal/psset.h"
#include "jemalloc/internal/sec.h"
typedef struct hpa_shard_nonderived_stats_s hpa_shard_nonderived_stats_t;
struct hpa_shard_nonderived_stats_s {
@ -57,6 +58,7 @@ typedef struct hpa_shard_stats_s hpa_shard_stats_t;
struct hpa_shard_stats_s {
psset_stats_t psset_stats;
hpa_shard_nonderived_stats_t nonderived_stats;
sec_stats_t secstats;
};
typedef struct hpa_shard_s hpa_shard_t;
@ -69,14 +71,17 @@ struct hpa_shard_s {
/* The central allocator we get our hugepages from. */
hpa_central_t *central;
/* Protects most of this shard's state. */
malloc_mutex_t mtx;
/*
* Guards the shard's access to the central allocator (preventing
* multiple threads operating on this shard from accessing the central
* allocator).
*/
malloc_mutex_t grow_mtx;
/* The base metadata allocator. */
base_t *base;
@ -87,6 +92,9 @@ struct hpa_shard_s {
*/
edata_cache_fast_t ecf;
/* Small extent cache (not guarded by mtx) */
JEMALLOC_ALIGNED(CACHELINE) sec_t sec;
psset_t psset;
/*
@ -142,9 +150,9 @@ bool hpa_hugepage_size_exceeds_limit(void);
* just that it can function properly given the system it's running on.
*/
bool hpa_supported(void);
bool hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
base_t *base, edata_cache_t *edata_cache, unsigned ind,
const hpa_shard_opts_t *opts);
bool hpa_shard_init(tsdn_t *tsdn, hpa_shard_t *shard, hpa_central_t *central,
emap_t *emap, base_t *base, edata_cache_t *edata_cache, unsigned ind,
const hpa_shard_opts_t *opts, const sec_opts_t *sec_opts);
void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src);
void hpa_shard_stats_merge(
@ -157,6 +165,8 @@ void hpa_shard_stats_merge(
*/
void hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard);
/* Flush caches that the shard may be using. */
void hpa_shard_flush(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_set_deferral_allowed(
tsdn_t *tsdn, hpa_shard_t *shard, bool deferral_allowed);
@ -164,8 +174,9 @@ void hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard);
/*
* We share the fork ordering with the PA and arena prefork handling; that's why
* these are 3 and 4 rather than 0 and 1.
* these are 2, 3 and 4 rather than 0 and 1.
*/
void hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard);
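Since the SEC is now embedded in the shard (the cache-line-aligned sec_t member above, explicitly not guarded by mtx), flushing it becomes the shard's responsibility. A rough sketch of what hpa_shard_flush has to arrange, assuming it drains the SEC with sec_flush and returns the evicted extents through the shard's own deallocation path (hpa_dalloc_backing_sketch is a hypothetical placeholder, not a function from this commit):

static void
hpa_shard_flush_sketch(tsdn_t *tsdn, hpa_shard_t *shard) {
    edata_list_active_t to_flush;
    edata_list_active_init(&to_flush);
    /* The SEC has its own per-bin locks, so shard->mtx is not taken here. */
    sec_flush(tsdn, &shard->sec, &to_flush);
    edata_t *edata;
    while ((edata = edata_list_active_first(&to_flush)) != NULL) {
        edata_list_active_remove(&to_flush, edata);
        /* Placeholder for the shard's real deallocation path. */
        hpa_dalloc_backing_sketch(tsdn, shard, edata);
    }
}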


@ -96,12 +96,6 @@ struct pa_shard_s {
/* Allocates from a PAC. */
pac_t pac;
/*
* We place a small extent cache in front of the HPA, since we intend
* these configurations to use many fewer arenas, and therefore have a
* higher risk of hot locks.
*/
sec_t hpa_sec;
hpa_shard_t hpa_shard;
/* The source of edata_t objects. */
@ -166,6 +160,9 @@ void pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard);
*/
void pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard);
/* Flush any caches used by the shard. */
void pa_shard_flush(tsdn_t *tsdn, pa_shard_t *shard);
/* Gets an edata for the given allocation. */
edata_t *pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size,
size_t alignment, bool slab, szind_t szind, bool zero, bool guarded,
@ -233,8 +230,7 @@ void pa_shard_basic_stats_merge(
void pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out,
hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out,
size_t *resident);
hpa_shard_stats_t *hpa_stats_out, size_t *resident);
/*
* Reads the PA-owned mutex stats into the output stats array, at the


@ -13,15 +13,6 @@ struct pai_s {
edata_t *(*alloc)(tsdn_t *tsdn, pai_t *self, size_t size,
size_t alignment, bool zero, bool guarded, bool frequent_reuse,
bool *deferred_work_generated);
/*
* Returns the number of extents added to the list (which may be fewer
* than requested, in case of OOM). The list should already be
* initialized. The only alignment guarantee is page-alignment, and
* the results are not necessarily zeroed.
*/
size_t (*alloc_batch)(tsdn_t *tsdn, pai_t *self, size_t size,
size_t nallocs, edata_list_active_t *results, bool frequent_reuse,
bool *deferred_work_generated);
bool (*expand)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
size_t old_size, size_t new_size, bool zero,
bool *deferred_work_generated);
@ -29,9 +20,6 @@ struct pai_s {
size_t old_size, size_t new_size, bool *deferred_work_generated);
void (*dalloc)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
bool *deferred_work_generated);
/* This function empties out list as a side-effect of being called. */
void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self,
edata_list_active_t *list, bool *deferred_work_generated);
uint64_t (*time_until_deferred_work)(tsdn_t *tsdn, pai_t *self);
};
@ -47,14 +35,6 @@ pai_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
frequent_reuse, deferred_work_generated);
}
static inline size_t
pai_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
edata_list_active_t *results, bool frequent_reuse,
bool *deferred_work_generated) {
return self->alloc_batch(tsdn, self, size, nallocs, results,
frequent_reuse, deferred_work_generated);
}
static inline bool
pai_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
size_t new_size, bool zero, bool *deferred_work_generated) {
@ -75,26 +55,9 @@ pai_dalloc(
self->dalloc(tsdn, self, edata, deferred_work_generated);
}
static inline void
pai_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
bool *deferred_work_generated) {
self->dalloc_batch(tsdn, self, list, deferred_work_generated);
}
static inline uint64_t
pai_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
return self->time_until_deferred_work(tsdn, self);
}
/*
* An implementation of batch allocation that simply calls alloc once for
* each item in the list.
*/
size_t pai_alloc_batch_default(tsdn_t *tsdn, pai_t *self, size_t size,
size_t nallocs, edata_list_active_t *results, bool frequent_reuse,
bool *deferred_work_generated);
/* Ditto, for dalloc. */
void pai_dalloc_batch_default(tsdn_t *tsdn, pai_t *self,
edata_list_active_t *list, bool *deferred_work_generated);
#endif /* JEMALLOC_INTERNAL_PAI_H */


@ -17,91 +17,104 @@
* knowledge of the underlying PAI implementation).
*/
/*
* For now, this is just one field; eventually, we'll probably want to get more
* fine-grained data out (like per-size class statistics).
*/
typedef struct sec_bin_stats_s sec_bin_stats_t;
struct sec_bin_stats_s {
/* Number of alloc requests that did not find an extent in this bin. */
size_t nmisses;
/* Number of successful alloc requests. */
size_t nhits;
/* Number of dallocs that caused a flush. */
size_t ndalloc_flush;
/* Number of dallocs that did not cause a flush. */
size_t ndalloc_noflush;
/* Number of fills that hit max_bytes */
size_t noverfills;
};
typedef struct sec_stats_s sec_stats_t;
struct sec_stats_s {
/* Sum of bytes_cur across all shards. */
size_t bytes;
/* Totals of bin_stats. */
sec_bin_stats_t total;
};
static inline void
sec_bin_stats_init(sec_bin_stats_t *stats) {
stats->ndalloc_flush = 0;
stats->nmisses = 0;
stats->nhits = 0;
stats->ndalloc_noflush = 0;
stats->noverfills = 0;
}
static inline void
sec_bin_stats_accum(sec_bin_stats_t *dst, sec_bin_stats_t *src) {
dst->nmisses += src->nmisses;
dst->nhits += src->nhits;
dst->ndalloc_flush += src->ndalloc_flush;
dst->ndalloc_noflush += src->ndalloc_noflush;
dst->noverfills += src->noverfills;
}
static inline void
sec_stats_accum(sec_stats_t *dst, sec_stats_t *src) {
dst->bytes += src->bytes;
sec_bin_stats_accum(&dst->total, &src->total);
}
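Because the per-bin counters are rolled up into sec_stats_t, a stats consumer can derive simple summary metrics directly from the merged totals. For example, a hit-rate helper (illustrative only, not part of this commit) could be written as:

static inline double
sec_stats_hit_rate(const sec_stats_t *stats) {
    size_t requests = stats->total.nhits + stats->total.nmisses;
    return requests == 0 ? 0.0
        : (double)stats->total.nhits / (double)requests;
}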
/* A collection of free extents, all of the same size. */
typedef struct sec_bin_s sec_bin_t;
struct sec_bin_s {
/*
* When we fail to fulfill an allocation, we do a batch-alloc on the
* underlying allocator to fill extra items, as well. We drop the SEC
* lock while doing so, to allow operations on other bins to succeed.
* That introduces the possibility of other threads also trying to
* allocate out of this bin, failing, and also going to the backing
* allocator. To avoid a thundering herd problem in which lots of
* threads do batch allocs and overfill this bin as a result, we only
* allow one batch allocation at a time for a bin. This bool tracks
* whether or not some thread is already batch allocating.
*
* Eventually, the right answer may be a smarter sharding policy for the
* bins (e.g. a mutex per bin, which would also be more scalable
* generally; the batch-allocating thread could hold it while
* batch-allocating).
* Protects the data members of the bin.
*/
bool being_batch_filled;
malloc_mutex_t mtx;
/*
* Number of bytes in this particular bin (as opposed to the
* sec_shard_t's bytes_cur. This isn't user visible or reported in
* stats; rather, it allows us to quickly determine the change in the
* centralized counter when flushing.
* Number of bytes in this particular bin.
*/
size_t bytes_cur;
edata_list_active_t freelist;
};
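The per-bin mutex means an allocation only contends with other operations on the same size class; bins for other sizes stay untouched. A minimal sketch of the fast path this enables (this is not the committed sec_alloc; hit/miss accounting and any other bookkeeping are omitted, and the function name is made up for illustration):

static edata_t *
sec_bin_try_alloc_sketch(tsdn_t *tsdn, sec_bin_t *bin) {
    malloc_mutex_lock(tsdn, &bin->mtx);
    edata_t *edata = edata_list_active_first(&bin->freelist);
    if (edata != NULL) {
        edata_list_active_remove(&bin->freelist, edata);
        bin->bytes_cur -= edata_size_get(edata);
    }
    malloc_mutex_unlock(tsdn, &bin->mtx);
    return edata;  /* NULL means a miss; the caller falls back. */
}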
typedef struct sec_shard_s sec_shard_t;
struct sec_shard_s {
/*
* We don't keep per-bin mutexes, even though that would allow more
* sharding; this allows global cache-eviction, which in turn allows for
* better balancing across free lists.
*/
malloc_mutex_t mtx;
/*
* A SEC may need to be shut down (i.e. flushed of its contents and
* prevented from further caching). To avoid tricky synchronization
* issues, we just track enabled-status in each shard, guarded by a
* mutex. In practice, this is only ever checked during brief races,
* since the arena-level atomic boolean tracking HPA enabled-ness means
* that we won't go down these pathways very often after custom extent
* hooks are installed.
*/
bool enabled;
sec_bin_t *bins;
/* Number of bytes in all bins in the shard. */
size_t bytes_cur;
/* The next pszind to flush in the flush-some pathways. */
pszind_t to_flush_next;
sec_bin_stats_t stats;
};
typedef struct sec_s sec_t;
struct sec_s {
pai_t pai;
pai_t *fallback;
sec_opts_t opts;
sec_shard_t *shards;
pszind_t npsizes;
sec_opts_t opts;
sec_bin_t *bins;
pszind_t npsizes;
};
bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback,
const sec_opts_t *opts);
void sec_flush(tsdn_t *tsdn, sec_t *sec);
void sec_disable(tsdn_t *tsdn, sec_t *sec);
static inline bool
sec_is_used(sec_t *sec) {
return sec->opts.nshards != 0;
}
static inline bool
sec_size_supported(sec_t *sec, size_t size) {
return sec_is_used(sec) && size <= sec->opts.max_alloc;
}
/* If the sec does not have an extent available, it will return NULL. */
edata_t *sec_alloc(tsdn_t *tsdn, sec_t *sec, size_t size);
void sec_fill(tsdn_t *tsdn, sec_t *sec, size_t size,
edata_list_active_t *result, size_t nallocs);
/*
* Upon return, dalloc_list may be empty if the edata was consumed by the sec,
* or non-empty if there are extents that need to be flushed from the cache.
* Note that if a flush is needed, the extent(s) returned in the list for
* deallocation will almost certainly not include the one being dalloc-ed
* (that one is considered "hot" and preserved in the cache, while "colder"
* ones are returned).
*/
void sec_dalloc(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *dalloc_list);
bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, const sec_opts_t *opts);
/* Fills to_flush with extents that need to be deallocated */
void sec_flush(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *to_flush);
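A hedged sketch of how an owner is expected to use the deallocation side of this API, per the comment above: hand the dead extent to the SEC on a list, then deallocate whatever the SEC chose to evict (the function name below is illustrative, not from this commit):

static void
owner_dalloc_via_sec_sketch(tsdn_t *tsdn, sec_t *sec, edata_t *edata) {
    edata_list_active_t to_dalloc;
    edata_list_active_init(&to_dalloc);
    edata_list_active_append(&to_dalloc, edata);
    /* The SEC keeps the "hot" extent when it can; anything left on the
     * list afterwards is a colder extent evicted to respect max_bytes. */
    sec_dalloc(tsdn, sec, &to_dalloc);
    /* The caller must now deallocate whatever remains in to_dalloc
     * through its own backing path (omitted here). */
}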
/*
* Morally, these two stats methods probably ought to be a single one (and the


@ -12,46 +12,39 @@ typedef struct sec_opts_s sec_opts_t;
struct sec_opts_s {
/*
* We don't necessarily always use all the shards; requests are
* distributed across shards [0, nshards - 1).
* distributed across shards [0, nshards - 1). Once a thread picks a
* shard, it will always use that one. If this value is set to 0, the SEC
* is not used.
*/
size_t nshards;
/*
* We'll automatically refuse to cache any objects in this sec if
* they're larger than max_alloc bytes, instead forwarding such objects
* directly to the fallback.
* they're larger than max_alloc bytes.
*/
size_t max_alloc;
/*
* Exceeding this amount of cached extents in a shard causes us to start
* flushing bins in that shard until we fall below bytes_after_flush.
* Exceeding this many bytes of cached extents in a bin causes us to flush
* until the bin is 1/4 below max_bytes (i.e. down to 3/4 of max_bytes).
*/
size_t max_bytes;
/*
* The number of bytes (in all bins) we flush down to when we exceed
* bytes_cur. We want this to be less than bytes_cur, because
* otherwise we could get into situations where a shard undergoing
* net-deallocation keeps bytes_cur very near to max_bytes, so that
* most deallocations get immediately forwarded to the underlying PAI
* implementation, defeating the point of the SEC.
*/
size_t bytes_after_flush;
/*
* When we can't satisfy an allocation out of the SEC because there are
* no available ones cached, we allocate multiple of that size out of
* the fallback allocator. Eventually we might want to do something
* cleverer, but for now we just grab a fixed number.
* no available ones cached, the allocator will allocate a batch with
* batch_fill_extra extra extents of the same size.
*/
size_t batch_fill_extra;
};
#define SEC_OPTS_NSHARDS_DEFAULT 2
#define SEC_OPTS_BATCH_FILL_EXTRA_DEFAULT 3
#define SEC_OPTS_MAX_ALLOC_DEFAULT ((32 * 1024) < PAGE ? PAGE : (32 * 1024))
#define SEC_OPTS_MAX_BYTES_DEFAULT \
((256 * 1024) < (4 * SEC_OPTS_MAX_ALLOC_DEFAULT) \
? (4 * SEC_OPTS_MAX_ALLOC_DEFAULT) \
: (256 * 1024))
#define SEC_OPTS_DEFAULT \
{ \
/* nshards */ \
4, \
/* max_alloc */ \
(32 * 1024) < PAGE ? PAGE : (32 * 1024), \
/* max_bytes */ \
256 * 1024, \
/* bytes_after_flush */ \
128 * 1024, \
/* batch_fill_extra */ \
0 \
}
{SEC_OPTS_NSHARDS_DEFAULT, SEC_OPTS_MAX_ALLOC_DEFAULT, \
SEC_OPTS_MAX_BYTES_DEFAULT, SEC_OPTS_BATCH_FILL_EXTRA_DEFAULT}
#endif /* JEMALLOC_INTERNAL_SEC_OPTS_H */
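For concreteness, on a system with 4 KiB pages the default macros above resolve to nshards = 2, max_alloc = 32 KiB, max_bytes = 256 KiB (since 256 KiB >= 4 * 32 KiB), and batch_fill_extra = 3; with the flush rule described earlier, a bin exceeding 256 KiB would be trimmed to roughly 192 KiB. An explicit initializer equivalent to SEC_OPTS_DEFAULT under that page-size assumption (the variable name is just for illustration):

static const sec_opts_t sec_opts_default_example = {
    /* nshards */          2,
    /* max_alloc */        32 * 1024,
    /* max_bytes */        256 * 1024,
    /* batch_fill_extra */ 3
};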


@ -46,7 +46,7 @@ enum witness_rank_e {
WITNESS_RANK_DECAY = WITNESS_RANK_CORE,
WITNESS_RANK_TCACHE_QL,
WITNESS_RANK_SEC_SHARD,
WITNESS_RANK_SEC_BIN,
WITNESS_RANK_EXTENT_GROW,
WITNESS_RANK_HPA_SHARD_GROW = WITNESS_RANK_EXTENT_GROW,