From 7c996861656f67dc74ab66f1bc6e758ed96c69b3 Mon Sep 17 00:00:00 2001
From: Shirui Cheng
Date: Thu, 22 Aug 2024 14:50:08 -0700
Subject: [PATCH] Better handle burst allocation on tcache_alloc_small_hard

---
 include/jemalloc/internal/cache_bin.h      |  10 ++
 include/jemalloc/internal/tcache_structs.h |   4 +-
 src/tcache.c                               | 105 ++++++++++++++++++---
 3 files changed, 104 insertions(+), 15 deletions(-)

diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h
index a7a5e40e..cb137af9 100644
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@@ -600,6 +600,16 @@ cache_bin_nitems_get_remote(cache_bin_t *bin, cache_bin_sz_t *ncached,
  */
 }
 
+/*
+ * For small bins, used to calculate how many items to fill at a time.
+ * The final nfill is calculated by (ncached_max >> (base - offset)).
+ */
+typedef struct cache_bin_fill_ctl_s cache_bin_fill_ctl_t;
+struct cache_bin_fill_ctl_s {
+	uint8_t base;
+	uint8_t offset;
+};
+
 /*
  * Limit how many items can be flushed in a batch (Which is the upper bound
  * for the nflush parameter in tcache_bin_flush_impl()).
diff --git a/include/jemalloc/internal/tcache_structs.h b/include/jemalloc/internal/tcache_structs.h
index 63e5db5d..e9a68152 100644
--- a/include/jemalloc/internal/tcache_structs.h
+++ b/include/jemalloc/internal/tcache_structs.h
@@ -39,8 +39,8 @@ struct tcache_slow_s {
 	szind_t next_gc_bin;
 	szind_t next_gc_bin_small;
 	szind_t next_gc_bin_large;
-	/* For small bins, fill (ncached_max >> lg_fill_div). */
-	uint8_t lg_fill_div[SC_NBINS];
+	/* For small bins, help determine how many items to fill at a time. */
+	cache_bin_fill_ctl_t bin_fill_ctl_do_not_access_directly[SC_NBINS];
 	/* For small bins, whether has been refilled since last GC. */
 	bool bin_refilled[SC_NBINS];
 	/*
diff --git a/src/tcache.c b/src/tcache.c
index 3d38700d..15da14da 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -121,6 +121,85 @@ tcache_gc_dalloc_postponed_event_wait(tsd_t *tsd) {
 	return TE_MIN_START_WAIT;
 }
 
+static inline void
+tcache_bin_fill_ctl_init(tcache_slow_t *tcache_slow, szind_t szind) {
+	assert(szind < SC_NBINS);
+	cache_bin_fill_ctl_t *ctl =
+	    &tcache_slow->bin_fill_ctl_do_not_access_directly[szind];
+	ctl->base = 1;
+	ctl->offset = 0;
+}
+
+static inline cache_bin_fill_ctl_t *
+tcache_bin_fill_ctl_get(tcache_slow_t *tcache_slow, szind_t szind) {
+	assert(szind < SC_NBINS);
+	cache_bin_fill_ctl_t *ctl =
+	    &tcache_slow->bin_fill_ctl_do_not_access_directly[szind];
+	assert(ctl->base > ctl->offset);
+	return ctl;
+}
+
+/*
+ * The number of items to be filled at a time for a given small bin is
+ * calculated by (ncached_max >> lg_fill_div).
+ * The actual ctl struct consists of two fields, i.e. base and offset,
+ * and the difference between the two (base - offset) is the final lg_fill_div.
+ * The base is adjusted during GC based on the traffic within a period of time,
+ * while the offset is updated in real time to handle the immediate traffic.
+ */
+static inline uint8_t
+tcache_nfill_small_lg_div_get(tcache_slow_t *tcache_slow, szind_t szind) {
+	cache_bin_fill_ctl_t *ctl = tcache_bin_fill_ctl_get(tcache_slow, szind);
+	return (ctl->base - (opt_experimental_tcache_gc ? ctl->offset : 0));
+}
+
+/*
+ * When we want to fill more items to respond to burst load,
+ * offset is increased so that (base - offset) is decreased,
+ * which in turn increases the number of items to be filled.
+ */
+static inline void
+tcache_nfill_small_burst_prepare(tcache_slow_t *tcache_slow, szind_t szind) {
+	cache_bin_fill_ctl_t *ctl = tcache_bin_fill_ctl_get(tcache_slow, szind);
+	if (ctl->offset + 1 < ctl->base) {
+		ctl->offset++;
+	}
+}
+
+static inline void
+tcache_nfill_small_burst_reset(tcache_slow_t *tcache_slow, szind_t szind) {
+	cache_bin_fill_ctl_t *ctl = tcache_bin_fill_ctl_get(tcache_slow, szind);
+	ctl->offset = 0;
+}
+
+/*
+ * limit == 0: indicating that the fill count should be increased,
+ * i.e. lg_div(base) should be decreased.
+ *
+ * limit != 0: limit is set to ncached_max, indicating that the fill
+ * count should be decreased, i.e. lg_div(base) should be increased.
+ */
+static inline void
+tcache_nfill_small_gc_update(tcache_slow_t *tcache_slow, szind_t szind,
+    cache_bin_sz_t limit) {
+	cache_bin_fill_ctl_t *ctl = tcache_bin_fill_ctl_get(tcache_slow, szind);
+	if (!limit && ctl->base > 1) {
+		/*
+		 * Increase fill count by 2X for small bins. Make sure
+		 * lg_fill_div stays greater than 0.
+		 */
+		ctl->base--;
+	} else if (limit && (limit >> ctl->base) > 1) {
+		/*
+		 * Reduce fill count by 2X. Limit lg_fill_div such that
+		 * the fill count is always at least 1.
+		 */
+		ctl->base++;
+	}
+	/* Reset the offset for the next GC period. */
+	ctl->offset = 0;
+}
+
 static uint8_t
 tcache_gc_item_delay_compute(szind_t szind) {
 	assert(szind < SC_NBINS);
@@ -298,21 +377,19 @@ tcache_gc_small(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
 	cache_bin_sz_t low_water = cache_bin_low_water_get(cache_bin);
 	if (low_water > 0) {
 		/*
-		 * Reduce fill count by 2X. Limit lg_fill_div such that
-		 * the fill count is always at least 1.
+		 * There are unused items within the GC period => reduce fill count.
+		 * limit field != 0 is borrowed to indicate that the fill count
+		 * should be reduced.
 		 */
-		if ((cache_bin_ncached_max_get(cache_bin) >>
-		    tcache_slow->lg_fill_div[szind]) > 1) {
-			tcache_slow->lg_fill_div[szind]++;
-		}
+		tcache_nfill_small_gc_update(tcache_slow, szind,
+		    /* limit */ cache_bin_ncached_max_get(cache_bin));
 	} else if (tcache_slow->bin_refilled[szind]) {
 		/*
-		 * Increase fill count by 2X for small bins. Make sure
-		 * lg_fill_div stays greater than 0.
+		 * There have been refills within the GC period => increase fill count.
+		 * limit field set to 0 is borrowed to indicate that the fill count
+		 * should be increased.
 		 */
-		if (tcache_slow->lg_fill_div[szind] > 1) {
-			tcache_slow->lg_fill_div[szind]--;
-		}
+		tcache_nfill_small_gc_update(tcache_slow, szind, /* limit */ 0);
 		tcache_slow->bin_refilled[szind] = false;
 	}
 	assert(!tcache_slow->bin_refilled[szind]);
@@ -526,7 +603,7 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena,
 	assert(tcache_slow->arena != NULL);
 	assert(!tcache_bin_disabled(binind, cache_bin, tcache_slow));
 	cache_bin_sz_t nfill = cache_bin_ncached_max_get(cache_bin)
-	    >> tcache_slow->lg_fill_div[binind];
+	    >> tcache_nfill_small_lg_div_get(tcache_slow, binind);
 	if (nfill == 0) {
 		nfill = 1;
 	}
@@ -534,6 +611,7 @@
 	    /* nfill_min */ opt_experimental_tcache_gc ?
 	    ((nfill >> 1) + 1) : nfill, /* nfill_max */ nfill);
 	tcache_slow->bin_refilled[binind] = true;
+	tcache_nfill_small_burst_prepare(tcache_slow, binind);
 	ret = cache_bin_alloc(cache_bin, tcache_success);
 
 	return ret;
@@ -1059,6 +1137,7 @@ tcache_bin_flush_bottom(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
 void
 tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
     szind_t binind, unsigned rem) {
+	tcache_nfill_small_burst_reset(tcache->tcache_slow, binind);
 	tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem,
 	    /* small */ true);
 }
@@ -1233,7 +1312,7 @@ tcache_init(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
 	    &cur_offset);
 	for (unsigned i = 0; i < tcache_nbins; i++) {
 		if (i < SC_NBINS) {
-			tcache_slow->lg_fill_div[i] = 1;
+			tcache_bin_fill_ctl_init(tcache_slow, i);
 			tcache_slow->bin_refilled[i] = false;
 			tcache_slow->bin_flush_delay_items[i] =
 			    tcache_gc_item_delay_compute(i);