jemalloc/src/cache_bin.c
Qi Wang 72cfdce718 Allocate tcache stack from base allocator
When using metadata_thp, allocate tcache bin stacks from base0, which means they
will be placed on huge pages along with other metadata, instead of mixed with
other regular allocations.

In order to do so, modified the base allocator to support limited reuse: freed
tcached stacks (from thread termination) will be returned to base0 and made
available for reuse, but no merging will be attempted since they were bump
allocated out of base blocks. These reused base extents are managed using
separately allocated base edata_t -- they are cached in base->edata_avail when
the extent is all allocated.

One tricky part is, stats updating must be skipped for such reused extents
(since they were accounted for already, and there is no purging for base). This
requires tracking the "if is reused" state explicitly and bypass the stats
updates when allocating from them.
2023-09-18 12:18:32 -07:00

110 lines
3.6 KiB
C

#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"
#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/cache_bin.h"
#include "jemalloc/internal/safety_check.h"
void
cache_bin_info_init(cache_bin_info_t *info,
cache_bin_sz_t ncached_max) {
assert(ncached_max <= CACHE_BIN_NCACHED_MAX);
size_t stack_size = (size_t)ncached_max * sizeof(void *);
assert(stack_size < ((size_t)1 << (sizeof(cache_bin_sz_t) * 8)));
info->ncached_max = (cache_bin_sz_t)ncached_max;
}
bool
cache_bin_stack_use_thp(void) {
/*
* If metadata_thp is enabled, allocating tcache stack from the base
* allocator for efficiency gains. The downside, however, is that base
* allocator never purges freed memory, and may cache a fair amount of
* memory after many threads are terminated and not reused.
*/
return metadata_thp_enabled();
}
void
cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
size_t *size, size_t *alignment) {
/* For the total bin stack region (per tcache), reserve 2 more slots so
* that
* 1) the empty position can be safely read on the fast path before
* checking "is_empty"; and
* 2) the cur_ptr can go beyond the empty position by 1 step safely on
* the fast path (i.e. no overflow).
*/
*size = sizeof(void *) * 2;
for (szind_t i = 0; i < ninfos; i++) {
assert(infos[i].ncached_max > 0);
*size += infos[i].ncached_max * sizeof(void *);
}
/*
* When not using THP, align to at least PAGE, to minimize the # of TLBs
* needed by the smaller sizes; also helps if the larger sizes don't get
* used at all.
*/
*alignment = cache_bin_stack_use_thp() ? QUANTUM : PAGE;
}
void
cache_bin_preincrement(cache_bin_info_t *infos, szind_t ninfos, void *alloc,
size_t *cur_offset) {
if (config_debug) {
size_t computed_size;
size_t computed_alignment;
/* Pointer should be as aligned as we asked for. */
cache_bin_info_compute_alloc(infos, ninfos, &computed_size,
&computed_alignment);
assert(((uintptr_t)alloc & (computed_alignment - 1)) == 0);
}
*(uintptr_t *)((byte_t *)alloc + *cur_offset) =
cache_bin_preceding_junk;
*cur_offset += sizeof(void *);
}
void
cache_bin_postincrement(void *alloc, size_t *cur_offset) {
*(uintptr_t *)((byte_t *)alloc + *cur_offset) =
cache_bin_trailing_junk;
*cur_offset += sizeof(void *);
}
void
cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
size_t *cur_offset) {
/*
* The full_position points to the lowest available space. Allocations
* will access the slots toward higher addresses (for the benefit of
* adjacent prefetch).
*/
void *stack_cur = (void *)((byte_t *)alloc + *cur_offset);
void *full_position = stack_cur;
uint16_t bin_stack_size = info->ncached_max * sizeof(void *);
*cur_offset += bin_stack_size;
void *empty_position = (void *)((byte_t *)alloc + *cur_offset);
/* Init to the empty position. */
bin->stack_head = (void **)empty_position;
bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head;
bin->low_bits_full = (uint16_t)(uintptr_t)full_position;
bin->low_bits_empty = (uint16_t)(uintptr_t)empty_position;
cache_bin_info_init(&bin->bin_info, info->ncached_max);
cache_bin_sz_t free_spots = cache_bin_diff(bin,
bin->low_bits_full, (uint16_t)(uintptr_t)bin->stack_head);
assert(free_spots == bin_stack_size);
assert(cache_bin_ncached_get_local(bin, info) == 0);
assert(cache_bin_empty_position_get(bin) == empty_position);
assert(bin_stack_size > 0 || empty_position == full_position);
}
bool
cache_bin_still_zero_initialized(cache_bin_t *bin) {
return bin->stack_head == NULL;
}