Mirror of https://github.com/jemalloc/jemalloc.git (synced 2026-05-16 01:46:23 +03:00)
[SEC] Make SEC owned by hpa_shard, simplify the code, add stats, lock per bin
parent d930391cf3
commit b5da68dbc3
35 changed files with 1264 additions and 1257 deletions
@@ -46,7 +46,7 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
    const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
    size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
    bin_stats_data_t *bstats, arena_stats_large_t *lstats, pac_estats_t *estats,
    hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
    hpa_shard_stats_t *hpastats);
void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena);
edata_t *arena_extent_alloc_large(
    tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero);

@@ -51,7 +51,6 @@ typedef struct ctl_arena_stats_s {
    arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
    pac_estats_t estats[SC_NPSIZES];
    hpa_shard_stats_t hpastats;
    sec_stats_t secstats;
} ctl_arena_stats_t;

typedef struct ctl_stats_s {

@@ -12,6 +12,7 @@
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/pai.h"
#include "jemalloc/internal/psset.h"
#include "jemalloc/internal/sec.h"

typedef struct hpa_shard_nonderived_stats_s hpa_shard_nonderived_stats_t;
struct hpa_shard_nonderived_stats_s {

@@ -57,6 +58,7 @@ typedef struct hpa_shard_stats_s hpa_shard_stats_t;
struct hpa_shard_stats_s {
    psset_stats_t psset_stats;
    hpa_shard_nonderived_stats_t nonderived_stats;
    sec_stats_t secstats;
};

typedef struct hpa_shard_s hpa_shard_t;

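With secstats embedded in hpa_shard_stats_t (and dropped from the arena and ctl paths above), the SEC counters now travel inside the HPA shard stats. A minimal sketch of how an accumulation pass could fold them in; only the hpa_shard_stats_accum and sec_stats_accum declarations appear in this diff, so the body below is an illustrative assumption rather than the committed code.

/* Illustrative only: fold the embedded SEC stats during an accumulation pass. */
static void
example_accum_sec_stats(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
    /* sec_stats_accum is declared in the sec.h hunk further down. */
    sec_stats_accum(&dst->secstats, &src->secstats);
}
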
@@ -69,14 +71,17 @@ struct hpa_shard_s {

    /* The central allocator we get our hugepages from. */
    hpa_central_t *central;

    /* Protects most of this shard's state. */
    malloc_mutex_t mtx;

    /*
     * Guards the shard's access to the central allocator (preventing
     * multiple threads operating on this shard from accessing the central
     * allocator).
     */
    malloc_mutex_t grow_mtx;

    /* The base metadata allocator. */
    base_t *base;

@@ -87,6 +92,9 @@ struct hpa_shard_s {
     */
    edata_cache_fast_t ecf;

    /* Small extent cache (not guarded by mtx) */
    JEMALLOC_ALIGNED(CACHELINE) sec_t sec;

    psset_t psset;

    /*

@@ -142,9 +150,9 @@ bool hpa_hugepage_size_exceeds_limit(void);
 * just that it can function properly given the system it's running on.
 */
bool hpa_supported(void);
bool hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
    base_t *base, edata_cache_t *edata_cache, unsigned ind,
    const hpa_shard_opts_t *opts);
bool hpa_shard_init(tsdn_t *tsdn, hpa_shard_t *shard, hpa_central_t *central,
    emap_t *emap, base_t *base, edata_cache_t *edata_cache, unsigned ind,
    const hpa_shard_opts_t *opts, const sec_opts_t *sec_opts);

void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src);
void hpa_shard_stats_merge(

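Because the shard now owns and initializes its own SEC, hpa_shard_init additionally takes a tsdn_t * and the SEC options, matching the new sec_init prototype later in this diff. A hedged sketch of a call site after this change; the surrounding variable names are placeholders, not code from the commit:

/* Illustrative call site only; all surrounding names are hypothetical. */
if (hpa_shard_init(tsdn, &shard, central, emap, base, &edata_cache,
    arena_ind, &hpa_opts, &sec_opts)) {
    return true; /* propagate initialization failure */
}
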
@@ -157,6 +165,8 @@ void hpa_shard_stats_merge(
 */
void hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard);
/* Flush caches that shard may be using */
void hpa_shard_flush(tsdn_t *tsdn, hpa_shard_t *shard);

void hpa_shard_set_deferral_allowed(
    tsdn_t *tsdn, hpa_shard_t *shard, bool deferral_allowed);

@@ -164,8 +174,9 @@ void hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard);

/*
 * We share the fork ordering with the PA and arena prefork handling; that's why
 * these are 3 and 4 rather than 0 and 1.
 * these are 2, 3 and 4 rather than 0 and 1.
 */
void hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard);

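The renumbered comment reflects that the shard now participates in three prefork stages rather than two, with hpa_shard_prefork2 as the new one. A small sketch of the ordering the comment implies, assuming the usual prefork/postfork pairing; which stage guards which mutex is not shown in this hunk:

/* Illustrative ordering only. */
hpa_shard_prefork2(tsdn, shard); /* new stage, per the renumbered comment */
hpa_shard_prefork3(tsdn, shard);
hpa_shard_prefork4(tsdn, shard);
/* ... fork() ... */
hpa_shard_postfork_parent(tsdn, shard); /* a child counterpart is assumed, as elsewhere in jemalloc */
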
@@ -96,12 +96,6 @@ struct pa_shard_s {
    /* Allocates from a PAC. */
    pac_t pac;

    /*
     * We place a small extent cache in front of the HPA, since we intend
     * these configurations to use many fewer arenas, and therefore have a
     * higher risk of hot locks.
     */
    sec_t hpa_sec;
    hpa_shard_t hpa_shard;

    /* The source of edata_t objects. */

@@ -166,6 +160,9 @@ void pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard);
 */
void pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard);

/* Flush any caches used by shard */
void pa_shard_flush(tsdn_t *tsdn, pa_shard_t *shard);

/* Gets an edata for the given allocation. */
edata_t *pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size,
    size_t alignment, bool slab, szind_t szind, bool zero, bool guarded,

@@ -233,8 +230,7 @@ void pa_shard_basic_stats_merge(

void pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
    pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out,
    hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out,
    size_t *resident);
    hpa_shard_stats_t *hpa_stats_out, size_t *resident);

/*
 * Reads the PA-owned mutex stats into the output stats array, at the

@@ -13,15 +13,6 @@ struct pai_s {
    edata_t *(*alloc)(tsdn_t *tsdn, pai_t *self, size_t size,
        size_t alignment, bool zero, bool guarded, bool frequent_reuse,
        bool *deferred_work_generated);
    /*
     * Returns the number of extents added to the list (which may be fewer
     * than requested, in case of OOM). The list should already be
     * initialized. The only alignment guarantee is page-alignment, and
     * the results are not necessarily zeroed.
     */
    size_t (*alloc_batch)(tsdn_t *tsdn, pai_t *self, size_t size,
        size_t nallocs, edata_list_active_t *results, bool frequent_reuse,
        bool *deferred_work_generated);
    bool (*expand)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
        size_t old_size, size_t new_size, bool zero,
        bool *deferred_work_generated);

@@ -29,9 +20,6 @@ struct pai_s {
        size_t old_size, size_t new_size, bool *deferred_work_generated);
    void (*dalloc)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
        bool *deferred_work_generated);
    /* This function empties out list as a side-effect of being called. */
    void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self,
        edata_list_active_t *list, bool *deferred_work_generated);
    uint64_t (*time_until_deferred_work)(tsdn_t *tsdn, pai_t *self);
};

@@ -47,14 +35,6 @@ pai_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
        frequent_reuse, deferred_work_generated);
}

static inline size_t
pai_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
    edata_list_active_t *results, bool frequent_reuse,
    bool *deferred_work_generated) {
    return self->alloc_batch(tsdn, self, size, nallocs, results,
        frequent_reuse, deferred_work_generated);
}

static inline bool
pai_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {

@@ -75,26 +55,9 @@ pai_dalloc(
    self->dalloc(tsdn, self, edata, deferred_work_generated);
}

static inline void
pai_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
    bool *deferred_work_generated) {
    self->dalloc_batch(tsdn, self, list, deferred_work_generated);
}

static inline uint64_t
pai_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
    return self->time_until_deferred_work(tsdn, self);
}

/*
 * An implementation of batch allocation that simply calls alloc once for
 * each item in the list.
 */
size_t pai_alloc_batch_default(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t nallocs, edata_list_active_t *results, bool frequent_reuse,
    bool *deferred_work_generated);
/* Ditto, for dalloc. */
void pai_dalloc_batch_default(tsdn_t *tsdn, pai_t *self,
    edata_list_active_t *list, bool *deferred_work_generated);

#endif /* JEMALLOC_INTERNAL_PAI_H */

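These pai.h hunks drop the batch hooks (alloc_batch and dalloc_batch), their inline wrappers, and the default implementations from the generic pai_t interface; the SEC, whose fallback pointer is removed below, was the consumer of that path. For reference, the behavior the removed comment describes ("simply calls alloc once for each item in the list") amounts to the following sketch, which illustrates those semantics rather than reproducing the removed source:

/* Sketch of the removed default-batch semantics: allocate one extent at a time,
 * stop early on failure, and report how many items were produced.  The removed
 * comment only guarantees page alignment, hence PAGE here. */
static size_t
example_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
    edata_list_active_t *results, bool frequent_reuse,
    bool *deferred_work_generated) {
    for (size_t i = 0; i < nallocs; i++) {
        edata_t *edata = pai_alloc(tsdn, self, size, PAGE, /* zero */ false,
            /* guarded */ false, frequent_reuse, deferred_work_generated);
        if (edata == NULL) {
            return i; /* fewer than requested, e.g. under OOM */
        }
        edata_list_active_append(results, edata);
    }
    return nallocs;
}
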
@@ -17,91 +17,104 @@
 * knowledge of the underlying PAI implementation).
 */

/*
 * For now, this is just one field; eventually, we'll probably want to get more
 * fine-grained data out (like per-size class statistics).
 */
typedef struct sec_bin_stats_s sec_bin_stats_t;
struct sec_bin_stats_s {
    /* Number of alloc requests that did not find extent in this bin */
    size_t nmisses;
    /* Number of successful alloc requests. */
    size_t nhits;
    /* Number of dallocs causing the flush */
    size_t ndalloc_flush;
    /* Number of dallocs not causing the flush */
    size_t ndalloc_noflush;
    /* Number of fills that hit max_bytes */
    size_t noverfills;
};
typedef struct sec_stats_s sec_stats_t;
struct sec_stats_s {
    /* Sum of bytes_cur across all shards. */
    size_t bytes;

    /* Totals of bin_stats. */
    sec_bin_stats_t total;
};

static inline void
sec_bin_stats_init(sec_bin_stats_t *stats) {
    stats->ndalloc_flush = 0;
    stats->nmisses = 0;
    stats->nhits = 0;
    stats->ndalloc_noflush = 0;
    stats->noverfills = 0;
}

static inline void
sec_bin_stats_accum(sec_bin_stats_t *dst, sec_bin_stats_t *src) {
    dst->nmisses += src->nmisses;
    dst->nhits += src->nhits;
    dst->ndalloc_flush += src->ndalloc_flush;
    dst->ndalloc_noflush += src->ndalloc_noflush;
    dst->noverfills += src->noverfills;
}

static inline void
sec_stats_accum(sec_stats_t *dst, sec_stats_t *src) {
    dst->bytes += src->bytes;
    sec_bin_stats_accum(&dst->total, &src->total);
}

/* A collections of free extents, all of the same size. */
typedef struct sec_bin_s sec_bin_t;
struct sec_bin_s {
    /*
     * When we fail to fulfill an allocation, we do a batch-alloc on the
     * underlying allocator to fill extra items, as well. We drop the SEC
     * lock while doing so, to allow operations on other bins to succeed.
     * That introduces the possibility of other threads also trying to
     * allocate out of this bin, failing, and also going to the backing
     * allocator. To avoid a thundering herd problem in which lots of
     * threads do batch allocs and overfill this bin as a result, we only
     * allow one batch allocation at a time for a bin. This bool tracks
     * whether or not some thread is already batch allocating.
     *
     * Eventually, the right answer may be a smarter sharding policy for the
     * bins (e.g. a mutex per bin, which would also be more scalable
     * generally; the batch-allocating thread could hold it while
     * batch-allocating).
     * Protects the data members of the bin.
     */
    bool being_batch_filled;
    malloc_mutex_t mtx;

    /*
     * Number of bytes in this particular bin (as opposed to the
     * sec_shard_t's bytes_cur. This isn't user visible or reported in
     * stats; rather, it allows us to quickly determine the change in the
     * centralized counter when flushing.
     * Number of bytes in this particular bin.
     */
    size_t bytes_cur;
    edata_list_active_t freelist;
};

typedef struct sec_shard_s sec_shard_t;
struct sec_shard_s {
    /*
     * We don't keep per-bin mutexes, even though that would allow more
     * sharding; this allows global cache-eviction, which in turn allows for
     * better balancing across free lists.
     */
    malloc_mutex_t mtx;
    /*
     * A SEC may need to be shut down (i.e. flushed of its contents and
     * prevented from further caching). To avoid tricky synchronization
     * issues, we just track enabled-status in each shard, guarded by a
     * mutex. In practice, this is only ever checked during brief races,
     * since the arena-level atomic boolean tracking HPA enabled-ness means
     * that we won't go down these pathways very often after custom extent
     * hooks are installed.
     */
    bool enabled;
    sec_bin_t *bins;
    /* Number of bytes in all bins in the shard. */
    size_t bytes_cur;
    /* The next pszind to flush in the flush-some pathways. */
    pszind_t to_flush_next;
    sec_bin_stats_t stats;
};

typedef struct sec_s sec_t;
struct sec_s {
    pai_t pai;
    pai_t *fallback;

    sec_opts_t opts;
    sec_shard_t *shards;
    pszind_t npsizes;
    sec_opts_t opts;
    sec_bin_t *bins;
    pszind_t npsizes;
};

bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback,
    const sec_opts_t *opts);
void sec_flush(tsdn_t *tsdn, sec_t *sec);
void sec_disable(tsdn_t *tsdn, sec_t *sec);
static inline bool
sec_is_used(sec_t *sec) {
    return sec->opts.nshards != 0;
}

static inline bool
sec_size_supported(sec_t *sec, size_t size) {
    return sec_is_used(sec) && size <= sec->opts.max_alloc;
}

/* If sec does not have extent available, it will return NULL. */
edata_t *sec_alloc(tsdn_t *tsdn, sec_t *sec, size_t size);
void sec_fill(tsdn_t *tsdn, sec_t *sec, size_t size,
    edata_list_active_t *result, size_t nallocs);

/*
 * Upon return dalloc_list may be empty if edata is consumed by sec or non-empty
 * if there are extents that need to be flushed from cache. Please note, that
 * if we need to flush, extent(s) returned in the list to be deallocated
 * will almost certainly not contain the one being dalloc-ed (that one will be
 * considered "hot" and preserved in the cache, while "colder" ones are
 * returned).
 */
void sec_dalloc(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *dalloc_list);

bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, const sec_opts_t *opts);

/* Fills to_flush with extents that need to be deallocated */
void sec_flush(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *to_flush);

/*
 * Morally, these two stats methods probably ought to be a single one (and the

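The reworked deallocation contract in this hunk is worth spelling out: sec_dalloc now takes a list rather than a single extent, may consume it entirely, and, when a flush is needed, hands back "colder" cached extents for the caller to free, while the extent just deallocated usually stays cached as "hot". A hedged sketch of how the owning HPA shard might drive this; backing_dalloc is a hypothetical stand-in for the shard's real path back to the page-slab allocator:

/* Illustrative owner-side flow for the new sec_dalloc contract. */
static void
example_sec_backed_dalloc(tsdn_t *tsdn, sec_t *sec, edata_t *edata) {
    edata_list_active_t dalloc_list;
    edata_list_active_init(&dalloc_list);
    edata_list_active_append(&dalloc_list, edata);

    sec_dalloc(tsdn, sec, &dalloc_list);

    /* Anything still on the list is colder cache contents to flush. */
    edata_t *victim;
    while ((victim = edata_list_active_first(&dalloc_list)) != NULL) {
        edata_list_active_remove(&dalloc_list, victim);
        backing_dalloc(tsdn, victim); /* hypothetical */
    }
}

The allocation side follows the same shape: sec_alloc returns NULL on a miss (per its comment above), and the owner can then go to its backing allocator and use sec_fill to stock the cache.
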
@@ -12,46 +12,39 @@ typedef struct sec_opts_s sec_opts_t;
struct sec_opts_s {
    /*
     * We don't necessarily always use all the shards; requests are
     * distributed across shards [0, nshards - 1).
     * distributed across shards [0, nshards - 1). Once thread picks a
     * shard it will always use that one. If this value is set to 0 sec is
     * not used.
     */
    size_t nshards;
    /*
     * We'll automatically refuse to cache any objects in this sec if
     * they're larger than max_alloc bytes, instead forwarding such objects
     * directly to the fallback.
     * they're larger than max_alloc bytes.
     */
    size_t max_alloc;
    /*
     * Exceeding this amount of cached extents in a shard causes us to start
     * flushing bins in that shard until we fall below bytes_after_flush.
     * Exceeding this amount of cached extents in a bin causes us to flush
     * until we are 1/4 below max_bytes.
     */
    size_t max_bytes;
    /*
     * The number of bytes (in all bins) we flush down to when we exceed
     * bytes_cur. We want this to be less than bytes_cur, because
     * otherwise we could get into situations where a shard undergoing
     * net-deallocation keeps bytes_cur very near to max_bytes, so that
     * most deallocations get immediately forwarded to the underlying PAI
     * implementation, defeating the point of the SEC.
     */
    size_t bytes_after_flush;
    /*
     * When we can't satisfy an allocation out of the SEC because there are
     * no available ones cached, we allocate multiple of that size out of
     * the fallback allocator. Eventually we might want to do something
     * cleverer, but for now we just grab a fixed number.
     * no available ones cached, allocator will allocate a batch with extra
     * batch_fill_extra extents of the same size.
     */
    size_t batch_fill_extra;
};

#define SEC_OPTS_NSHARDS_DEFAULT 2
#define SEC_OPTS_BATCH_FILL_EXTRA_DEFAULT 3
#define SEC_OPTS_MAX_ALLOC_DEFAULT ((32 * 1024) < PAGE ? PAGE : (32 * 1024))
#define SEC_OPTS_MAX_BYTES_DEFAULT \
    ((256 * 1024) < (4 * SEC_OPTS_MAX_ALLOC_DEFAULT) \
    ? (4 * SEC_OPTS_MAX_ALLOC_DEFAULT) \
    : (256 * 1024))

#define SEC_OPTS_DEFAULT \
{ \
    /* nshards */ \
    4, /* max_alloc */ \
    (32 * 1024) < PAGE ? PAGE : (32 * 1024), /* max_bytes */ \
    256 * 1024, /* bytes_after_flush */ \
    128 * 1024, /* batch_fill_extra */ \
    0 \
}
{SEC_OPTS_NSHARDS_DEFAULT, SEC_OPTS_MAX_ALLOC_DEFAULT, \
    SEC_OPTS_MAX_BYTES_DEFAULT, SEC_OPTS_BATCH_FILL_EXTRA_DEFAULT}

#endif /* JEMALLOC_INTERNAL_SEC_OPTS_H */

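The defaults move from bare numbers in the initializer to named macros, and bytes_after_flush disappears because flushing is now per bin and drains to one quarter below max_bytes. As a worked example on a system with 4 KiB pages: max_alloc defaults to 32 KiB (32 KiB >= PAGE), max_bytes to 256 KiB (256 KiB >= 4 * 32 KiB), so a bin exceeding 256 KiB would be flushed down to roughly 192 KiB. A small sketch of starting from the defaults and overriding a field before passing them to the new hpa_shard_init; the override value is arbitrary:

/* Illustrative only. */
sec_opts_t sec_opts = SEC_OPTS_DEFAULT;
/* {2 shards, max(32 KiB, PAGE) max_alloc, max(256 KiB, 4 * max_alloc) max_bytes,
 *  3 extra extents per batch fill} */
sec_opts.nshards = 1; /* arbitrary example override */
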
@@ -46,7 +46,7 @@ enum witness_rank_e {
    WITNESS_RANK_DECAY = WITNESS_RANK_CORE,
    WITNESS_RANK_TCACHE_QL,

    WITNESS_RANK_SEC_SHARD,
    WITNESS_RANK_SEC_BIN,

    WITNESS_RANK_EXTENT_GROW,
    WITNESS_RANK_HPA_SHARD_GROW = WITNESS_RANK_EXTENT_GROW,

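Swapping WITNESS_RANK_SEC_SHARD for WITNESS_RANK_SEC_BIN keeps the new per-bin lock in the same slot of the global lock order, after the tcache list lock and before the extent-grow ranks. A hedged sketch of how the new rank would be used when setting up a bin's mutex, assuming jemalloc's usual internal malloc_mutex_init(mutex, name, rank, lock_order) signature; the witness name string is a guess:

/* Illustrative only. */
if (malloc_mutex_init(&bin->mtx, "sec_bin", WITNESS_RANK_SEC_BIN,
    malloc_mutex_rank_exclusive)) {
    return true;
}
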