diff --git a/include/jemalloc/internal/tcache_types.h b/include/jemalloc/internal/tcache_types.h
index 578a199e..27516387 100644
--- a/include/jemalloc/internal/tcache_types.h
+++ b/include/jemalloc/internal/tcache_types.h
@@ -23,5 +23,6 @@ typedef struct tcaches_s tcaches_t;
 #define TCACHE_MAXCLASS_LIMIT ((size_t)1 << TCACHE_LG_MAXCLASS_LIMIT)
 #define TCACHE_NBINS_MAX (SC_NBINS + SC_NGROUP * \
     (TCACHE_LG_MAXCLASS_LIMIT - SC_LG_LARGE_MINCLASS) + 1)
+#define TCACHE_GC_NEIGHBOR_LIMIT ((uintptr_t)1 << 21) /* 2M */
 
 #endif /* JEMALLOC_INTERNAL_TCACHE_TYPES_H */
diff --git a/src/tcache.c b/src/tcache.c
index c300ed7d..35f18077 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -134,10 +134,162 @@ tcache_gc_item_delay_compute(szind_t szind) {
 	return (uint8_t)item_delay;
 }
 
+static inline void *
+tcache_gc_small_heuristic_addr_get(tsd_t *tsd, tcache_slow_t *tcache_slow,
+    szind_t szind) {
+	assert(szind < SC_NBINS);
+	tsdn_t *tsdn = tsd_tsdn(tsd);
+	bin_t *bin = arena_bin_choose(tsdn, tcache_slow->arena, szind, NULL);
+	assert(bin != NULL);
+
+	malloc_mutex_lock(tsdn, &bin->lock);
+	edata_t *slab = (bin->slabcur == NULL) ?
+	    edata_heap_first(&bin->slabs_nonfull) : bin->slabcur;
+	assert(slab != NULL || edata_heap_empty(&bin->slabs_nonfull));
+	void *ret = (slab != NULL) ? edata_addr_get(slab) : NULL;
+	assert(ret != NULL || slab == NULL);
+	malloc_mutex_unlock(tsdn, &bin->lock);
+
+	return ret;
+}
+
+static inline bool
+tcache_gc_is_addr_remote(void *addr, uintptr_t min, uintptr_t max) {
+	assert(addr != NULL);
+	return ((uintptr_t)addr < min || (uintptr_t)addr >= max);
+}
+
+static inline cache_bin_sz_t
+tcache_gc_small_nremote_get(cache_bin_t *cache_bin, void *addr,
+    uintptr_t *addr_min, uintptr_t *addr_max, szind_t szind, size_t nflush) {
+	assert(addr != NULL && addr_min != NULL && addr_max != NULL);
+	/* The slab address range that the provided addr belongs to. */
+	uintptr_t slab_min = (uintptr_t)addr;
+	uintptr_t slab_max = slab_min + bin_infos[szind].slab_size;
+	/*
+	 * When growing retained virtual memory, it's increased exponentially,
+	 * starting from 2M, so that the total number of disjoint virtual
+	 * memory ranges retained by each shard is limited.
+	 */
+	uintptr_t neighbor_min = ((uintptr_t)addr > TCACHE_GC_NEIGHBOR_LIMIT) ?
+	    ((uintptr_t)addr - TCACHE_GC_NEIGHBOR_LIMIT) : 0;
+	uintptr_t neighbor_max = ((uintptr_t)addr < (UINTPTR_MAX -
+	    TCACHE_GC_NEIGHBOR_LIMIT)) ? ((uintptr_t)addr +
+	    TCACHE_GC_NEIGHBOR_LIMIT) : UINTPTR_MAX;
+
+	/* Scan the entire bin to count the number of remote pointers. */
+	void **head = cache_bin->stack_head;
+	cache_bin_sz_t n_remote_slab = 0, n_remote_neighbor = 0;
+	cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin);
+	for (void **cur = head; cur < head + ncached; cur++) {
+		n_remote_slab += (cache_bin_sz_t)tcache_gc_is_addr_remote(*cur,
+		    slab_min, slab_max);
+		n_remote_neighbor += (cache_bin_sz_t)tcache_gc_is_addr_remote(*cur,
+		    neighbor_min, neighbor_max);
+	}
+	/*
+	 * Note: since slab size is dynamic and can be larger than 2M, i.e.
+	 * TCACHE_GC_NEIGHBOR_LIMIT, there is no guarantee as to which of
+	 * n_remote_slab and n_remote_neighbor is greater.
+	 */
+	assert(n_remote_slab <= ncached && n_remote_neighbor <= ncached);
+	/*
+	 * We first consider keeping ptrs from the neighboring addr range,
+	 * since in most cases the range is greater than the slab range.
+	 * So if the number of non-neighbor ptrs is at least the intended
+	 * flush amount, we use it as the anchor for flushing.
+	 */
+	if (n_remote_neighbor >= nflush) {
+		*addr_min = neighbor_min;
+		*addr_max = neighbor_max;
+		return n_remote_neighbor;
+	}
+	/*
+	 * We then consider only keeping ptrs from the local slab, and in most
+	 * cases this is stricter, assuming that slab < 2M is the common case.
+	 */
+	*addr_min = slab_min;
+	*addr_max = slab_max;
+	return n_remote_slab;
+}
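
For intuition about the window arithmetic above: a minimal standalone sketch (plain C, not part of the patch; GC_NEIGHBOR_LIMIT and is_remote are stand-ins for TCACHE_GC_NEIGHBOR_LIMIT and tcache_gc_is_addr_remote) showing how the +/-2M neighbor range is clamped so it never wraps around the address space, and how remote ptrs are counted over a toy bin:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for TCACHE_GC_NEIGHBOR_LIMIT. */
#define GC_NEIGHBOR_LIMIT ((uintptr_t)1 << 21) /* 2M */

/* Stand-in for tcache_gc_is_addr_remote(): outside [min, max). */
static int
is_remote(uintptr_t ptr, uintptr_t min, uintptr_t max) {
	return ptr < min || ptr >= max;
}

int
main(void) {
	uintptr_t addr = (uintptr_t)1 << 20; /* 1M: closer to 0 than 2M. */
	/* Saturating arithmetic keeps the window from wrapping. */
	uintptr_t min = (addr > GC_NEIGHBOR_LIMIT) ?
	    addr - GC_NEIGHBOR_LIMIT : 0;
	uintptr_t max = (addr < UINTPTR_MAX - GC_NEIGHBOR_LIMIT) ?
	    addr + GC_NEIGHBOR_LIMIT : UINTPTR_MAX;
	assert(min == 0 && max == ((uintptr_t)3 << 20));
	/* Count remote ptrs the same way the bin scan does. */
	uintptr_t bin[] = { addr, addr + 4096, max + 1, min };
	size_t n_remote = 0;
	for (size_t i = 0; i < sizeof(bin) / sizeof(bin[0]); i++) {
		n_remote += (size_t)is_remote(bin[i], min, max);
	}
	printf("n_remote = %zu\n", n_remote); /* Prints 1: only max + 1. */
	return 0;
}
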
+
+/* Shuffle the ptrs in the bin to put the remote pointers at the bottom. */
+static inline void
+tcache_gc_small_bin_shuffle(cache_bin_t *cache_bin, cache_bin_sz_t nremote,
+    uintptr_t addr_min, uintptr_t addr_max) {
+	void **swap = NULL;
+	cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin);
+	cache_bin_sz_t ntop = ncached - nremote, cnt = 0;
+	assert(ntop > 0 && ntop < ncached);
+	/*
+	 * Scan the [head, head + ntop) part of the cache bin, during which
+	 * the non-remote ptrs are bubbled to the top of the bin.
+	 * After this, the [head, head + cnt) part of the bin contains only
+	 * non-remote ptrs, in the same relative order as before, while the
+	 * [head + cnt, head + ntop) part contains only remote ptrs.
+	 */
+	void **head = cache_bin->stack_head;
+	for (void **cur = head; cur < head + ntop; cur++) {
+		if (!tcache_gc_is_addr_remote(*cur, addr_min, addr_max)) {
+			/* Tracks the number of non-remote ptrs seen so far. */
+			cnt++;
+			/*
+			 * There is a remote ptr before the current non-remote
+			 * ptr; swap the current non-remote ptr with the remote
+			 * ptr, and increment the swap pointer so that it still
+			 * points to the top remote ptr in the bin.
+			 */
+			if (swap != NULL) {
+				assert(swap < cur);
+				assert(tcache_gc_is_addr_remote(*swap, addr_min, addr_max));
+				void *tmp = *cur;
+				*cur = *swap;
+				*swap = tmp;
+				swap++;
+				assert(swap <= cur);
+				assert(tcache_gc_is_addr_remote(*swap, addr_min, addr_max));
+			}
+			continue;
+		} else if (swap == NULL) {
+			/* swap always points to the top remote ptr in the bin. */
+			swap = cur;
+		}
+	}
+	/*
+	 * Scan the [head + ntop, head + ncached) part of the cache bin,
+	 * after which it should only contain remote ptrs.
+	 */
+	for (void **cur = head + ntop; cur < head + ncached; cur++) {
+		/* Early break if all non-remote ptrs have been moved. */
+		if (cnt == ntop) {
+			break;
+		}
+		if (!tcache_gc_is_addr_remote(*cur, addr_min, addr_max)) {
+			assert(tcache_gc_is_addr_remote(*(head + cnt), addr_min,
+			    addr_max));
+			void *tmp = *cur;
+			*cur = *(head + cnt);
+			*(head + cnt) = tmp;
+			cnt++;
+		}
+	}
+	assert(cnt == ntop);
+	/* Sanity check to make sure the shuffle is done correctly. */
+	for (void **cur = head; cur < head + ncached; cur++) {
+		assert(*cur != NULL);
+		assert(((cur < head + ntop) && !tcache_gc_is_addr_remote(
+		    *cur, addr_min, addr_max)) || ((cur >= head + ntop) &&
+		    tcache_gc_is_addr_remote(*cur, addr_min, addr_max)));
+	}
+}
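
The shuffle above is a stable two-pass partition: pass one compacts non-remote ptrs within the top ntop slots, pass two pulls the stragglers out of the bottom. A minimal sketch of the same algorithm on an int array (hypothetical names, not patch code; "remote" is modeled as odd values), runnable to check the ordering guarantees:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in for the address-range test: odd values are "remote". */
static int
remote(int x) {
	return x % 2 != 0;
}

/* Same two-pass partition as tcache_gc_small_bin_shuffle(). */
static void
shuffle(int *a, size_t n, size_t ntop) {
	int *swap = NULL;
	size_t cnt = 0;
	/* Pass 1: bubble non-remote values into a[0..cnt), keeping order. */
	for (int *cur = a; cur < a + ntop; cur++) {
		if (!remote(*cur)) {
			cnt++;
			if (swap != NULL) {
				int tmp = *cur;
				*cur = *swap;
				*swap = tmp;
				swap++;
			}
		} else if (swap == NULL) {
			swap = cur;
		}
	}
	/* Pass 2: pull remaining non-remote values out of the bottom. */
	for (int *cur = a + ntop; cur < a + n && cnt < ntop; cur++) {
		if (!remote(*cur)) {
			int tmp = *cur;
			*cur = a[cnt];
			a[cnt] = tmp;
			cnt++;
		}
	}
	assert(cnt == ntop);
}

int
main(void) {
	int a[] = { 1, 2, 3, 4, 5, 6 }; /* 3 "remote" (odd), 3 local. */
	shuffle(a, 6, 3);
	/* Evens keep their relative order on top; odds sink to the bottom. */
	assert(a[0] == 2 && a[1] == 4 && a[2] == 6);
	for (size_t i = 3; i < 6; i++) {
		assert(remote(a[i]));
	}
	printf("%d %d %d | %d %d %d\n", a[0], a[1], a[2], a[3], a[4], a[5]);
	return 0;
}
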
+
 static void
 tcache_gc_small(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
     szind_t szind) {
-	/* Aim to flush 3/4 of items below low-water. */
+	/*
+	 * Aim to flush 3/4 of items below low-water, with remote pointers
+	 * prioritized for flushing.
+	 */
 	assert(szind < SC_NBINS);
 
 	cache_bin_t *cache_bin = &tcache->bins[szind];
@@ -158,8 +310,6 @@ tcache_gc_small(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
 	tcache_slow->bin_flush_delay_items[szind]
 	    = tcache_gc_item_delay_compute(szind);
 
-	tcache_bin_flush_small(tsd, tcache, cache_bin, szind,
-	    (unsigned)(ncached - nflush));
 	/*
 	 * Reduce fill count by 2X.  Limit lg_fill_div such that
@@ -169,12 +319,70 @@ tcache_gc_small(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
 	    tcache_slow->lg_fill_div[szind]) > 1) {
 		tcache_slow->lg_fill_div[szind]++;
 	}
+
+	/*
+	 * When the new tcache gc is not enabled, or simply the entire bin
+	 * needs to be flushed, flush the bottom nflush items directly.
+	 */
+	if (!opt_experimental_tcache_gc || nflush == ncached) {
+		goto label_flush;
+	}
+
+	/* Query the arena binshard to get heuristic locality info. */
+	void *addr = tcache_gc_small_heuristic_addr_get(tsd, tcache_slow, szind);
+	if (addr == NULL) {
+		goto label_flush;
+	}
+
+	/*
+	 * Use the queried addr above to get the number of remote ptrs in the
+	 * bin, and the min/max of the local addr range.
+	 */
+	uintptr_t addr_min, addr_max;
+	cache_bin_sz_t nremote = tcache_gc_small_nremote_get(cache_bin, addr,
+	    &addr_min, &addr_max, szind, nflush);
+
+	/*
+	 * Update nflush to the larger of the intended flush count and the
+	 * number of remote ptrs.
+	 */
+	if (nremote > nflush) {
+		nflush = nremote;
+	}
+	/*
+	 * When entering the locality check, nflush should be less than
+	 * ncached; otherwise the entire bin should be flushed regardless.
+	 * The only case in which nflush gets updated to ncached after the
+	 * locality check is when all the items in the bin are remote, in
+	 * which case the entire bin should also be flushed.
+	 */
+	assert(nflush < ncached || nremote == ncached);
+	if (nremote == 0 || nremote == ncached) {
+		goto label_flush;
+	}
+
+	/*
+	 * Move the remote ptrs to the bottom of the bin for flushing.
+	 * As long as they are moved to the bottom, the order of these
+	 * nremote ptrs does not matter, since they are going to be flushed
+	 * anyway.  The rest of the ptrs are moved to the top of the bin,
+	 * and their relative order is maintained.
+	 */
+	tcache_gc_small_bin_shuffle(cache_bin, nremote, addr_min, addr_max);
+
+label_flush:
+	assert(nflush > 0 && nflush <= ncached);
+	tcache_bin_flush_small(tsd, tcache, cache_bin, szind,
+	    (unsigned)(ncached - nflush));
 }
 
 static void
 tcache_gc_large(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
     szind_t szind) {
-	/* Like the small GC; flush 3/4 of untouched items. */
+	/*
+	 * Like the small GC, flush 3/4 of untouched items.  However, simply
+	 * flush the bottom nflush items, without any locality check.
+	 */
 	assert(szind >= SC_NBINS);
 	cache_bin_t *cache_bin = &tcache->bins[szind];
 	assert(!tcache_bin_disabled(szind, cache_bin, tcache->tcache_slow));
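
Putting the branches around label_flush together, the flush decision reduces to a small pure function. The sketch below is a hypothetical distillation, not code from the patch (flush_count is an invented helper, and the opt_experimental_tcache_gc gate is folded out); it captures how nflush is finalized and when the shuffle runs:

#include <assert.h>
#include <stdio.h>

/*
 * Returns the final flush count; *shuffle is set when the remote ptrs
 * must first be moved to the bottom of the bin.
 */
static unsigned
flush_count(unsigned ncached, unsigned nflush, unsigned nremote,
    int *shuffle) {
	*shuffle = 0;
	if (nflush == ncached) {
		return nflush;		/* Entire bin flushed regardless. */
	}
	if (nremote > nflush) {
		nflush = nremote;	/* Flush at least all remote ptrs. */
	}
	if (nremote == 0 || nremote == ncached) {
		return nflush;		/* Nothing to reorder. */
	}
	*shuffle = 1;			/* Mixed bin: shuffle, then flush. */
	return nflush;
}

int
main(void) {
	int shuffle;
	/* Remote count exceeds the intended 3/4-below-low-water amount. */
	assert(flush_count(16, 4, 9, &shuffle) == 9 && shuffle == 1);
	/* All ptrs local: plain bottom-of-bin flush, no shuffle. */
	assert(flush_count(16, 4, 0, &shuffle) == 4 && shuffle == 0);
	/* All ptrs remote: the whole bin goes, no shuffle needed. */
	assert(flush_count(16, 4, 16, &shuffle) == 16 && shuffle == 0);
	printf("ok\n");
	return 0;
}
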